diff --git a/docs/components/embedding-models.mdx b/docs/components/embedding-models.mdx index 0f030bdc..f1dab686 100644 --- a/docs/components/embedding-models.mdx +++ b/docs/components/embedding-models.mdx @@ -24,7 +24,7 @@ Once you have obtained the key, you can use it like this: ```python main.py import os -from embedchain import App +from embedchain import Pipeline as App os.environ['OPENAI_API_KEY'] = 'xxx' @@ -52,7 +52,7 @@ To use Azure OpenAI embedding model, you have to set some of the azure openai re ```python main.py import os -from embedchain import App +from embedchain import Pipeline as App os.environ["OPENAI_API_TYPE"] = "azure" os.environ["OPENAI_API_BASE"] = "https://xxx.openai.azure.com/" @@ -90,7 +90,7 @@ GPT4All supports generating high quality embeddings of arbitrary length document ```python main.py -from embedchain import App +from embedchain import Pipeline as App # load embedding model configuration from config.yaml file app = App.from_config(yaml_path="config.yaml") @@ -119,7 +119,7 @@ Hugging Face supports generating embeddings of arbitrary length documents of tex ```python main.py -from embedchain import App +from embedchain import Pipeline as App # load embedding model configuration from config.yaml file app = App.from_config(yaml_path="config.yaml") @@ -150,7 +150,7 @@ Embedchain supports Google's VertexAI embeddings model through a simple interfac ```python main.py -from embedchain import App +from embedchain import Pipeline as App # load embedding model configuration from config.yaml file app = App.from_config(yaml_path="config.yaml") diff --git a/docs/components/llms.mdx b/docs/components/llms.mdx index 49b5d7cd..031a90a4 100644 --- a/docs/components/llms.mdx +++ b/docs/components/llms.mdx @@ -26,7 +26,7 @@ Once you have obtained the key, you can use it like this: ```python import os -from embedchain import App +from embedchain import Pipeline as App os.environ['OPENAI_API_KEY'] = 'xxx' @@ -41,7 +41,7 @@ If you are looking to 
configure the different parameters of the LLM, you can do ```python main.py import os -from embedchain import App +from embedchain import Pipeline as App os.environ['OPENAI_API_KEY'] = 'xxx' @@ -71,7 +71,7 @@ To use Azure OpenAI model, you have to set some of the azure openai related envi ```python main.py import os -from embedchain import App +from embedchain import Pipeline as App os.environ["OPENAI_API_TYPE"] = "azure" os.environ["OPENAI_API_BASE"] = "https://xxx.openai.azure.com/" @@ -110,7 +110,7 @@ To use anthropic's model, please set the `ANTHROPIC_API_KEY` which you find on t ```python main.py import os -from embedchain import App +from embedchain import Pipeline as App os.environ["ANTHROPIC_API_KEY"] = "xxx" @@ -147,7 +147,7 @@ Once you have the API key, you are all set to use it with Embedchain. ```python main.py import os -from embedchain import App +from embedchain import Pipeline as App os.environ["COHERE_API_KEY"] = "xxx" @@ -180,7 +180,7 @@ GPT4all is a free-to-use, locally running, privacy-aware chatbot. 
No GPU or inte ```python main.py -from embedchain import App +from embedchain import Pipeline as App # load llm configuration from config.yaml file app = App.from_config(yaml_path="config.yaml") @@ -212,7 +212,7 @@ Once you have the key, load the app using the config yaml file: ```python main.py import os -from embedchain import App +from embedchain import Pipeline as App os.environ["JINACHAT_API_KEY"] = "xxx" # load llm configuration from config.yaml file @@ -248,7 +248,7 @@ Once you have the token, load the app using the config yaml file: ```python main.py import os -from embedchain import App +from embedchain import Pipeline as App os.environ["HUGGINGFACE_ACCESS_TOKEN"] = "xxx" @@ -278,7 +278,7 @@ Once you have the token, load the app using the config yaml file: ```python main.py import os -from embedchain import App +from embedchain import Pipeline as App os.environ["REPLICATE_API_TOKEN"] = "xxx" @@ -305,7 +305,7 @@ Setup Google Cloud Platform application credentials by following the instruction ```python main.py -from embedchain import App +from embedchain import Pipeline as App # load llm configuration from config.yaml file app = App.from_config(yaml_path="config.yaml") diff --git a/docs/components/vector-databases.mdx b/docs/components/vector-databases.mdx index 94e965d3..ccb5dc66 100644 --- a/docs/components/vector-databases.mdx +++ b/docs/components/vector-databases.mdx @@ -22,7 +22,7 @@ Utilizing a vector database alongside Embedchain is a seamless process. 
All you ```python main.py -from embedchain import App +from embedchain import Pipeline as App # load chroma configuration from yaml file app = App.from_config(yaml_path="config1.yaml") @@ -61,7 +61,7 @@ pip install --upgrade 'embedchain[elasticsearch]' ```python main.py -from embedchain import App +from embedchain import Pipeline as App # load elasticsearch configuration from yaml file app = App.from_config(yaml_path="config.yaml") @@ -89,7 +89,7 @@ pip install --upgrade 'embedchain[opensearch]' ```python main.py -from embedchain import App +from embedchain import Pipeline as App # load opensearch configuration from yaml file app = App.from_config(yaml_path="config.yaml") @@ -125,7 +125,7 @@ Set the Zilliz environment variables `ZILLIZ_CLOUD_URI` and `ZILLIZ_CLOUD_TOKEN` ```python main.py import os -from embedchain import App +from embedchain import Pipeline as App os.environ['ZILLIZ_CLOUD_URI'] = 'https://xxx.zillizcloud.com' os.environ['ZILLIZ_CLOUD_TOKEN'] = 'xxx' @@ -164,7 +164,7 @@ In order to use Pinecone as vector database, set the environment variables `PINE ```python main.py -from embedchain import App +from embedchain import Pipeline as App # load pinecone configuration from yaml file app = App.from_config(yaml_path="config.yaml") @@ -187,7 +187,7 @@ In order to use Qdrant as a vector database, set the environment variables `QDRA ```python main.py -from embedchain import App +from embedchain import Pipeline as App # load qdrant configuration from yaml file app = App.from_config(yaml_path="config.yaml") @@ -207,7 +207,7 @@ In order to use Weaviate as a vector database, set the environment variables `WE ```python main.py -from embedchain import App +from embedchain import Pipeline as App # load weaviate configuration from yaml file app = App.from_config(yaml_path="config.yaml") diff --git a/docs/data-sources/csv.mdx b/docs/data-sources/csv.mdx index b56453ec..add89bc7 100644 --- a/docs/data-sources/csv.mdx +++ b/docs/data-sources/csv.mdx @@ -5,7 +5,7 @@ 
title: '📊 CSV' To add any csv file, use the data_type as `csv`. `csv` allows remote urls and conventional file paths. Headers are included for each line, so if you have an `age` column, `18` will be added as `age: 18`. Eg: ```python -from embedchain import App +from embedchain import Pipeline as App app = App() app.add('https://people.sc.fsu.edu/~jburkardt/data/csv/airtravel.csv', data_type="csv") diff --git a/docs/data-sources/data-type-handling.mdx b/docs/data-sources/data-type-handling.mdx index bc2ed3dc..b6798e01 100644 --- a/docs/data-sources/data-type-handling.mdx +++ b/docs/data-sources/data-type-handling.mdx @@ -35,7 +35,7 @@ Default behavior is to create a persistent vector db in the directory **./db**. Create a local index: ```python -from embedchain import App +from embedchain import Pipeline as App naval_chat_bot = App() naval_chat_bot.add("https://www.youtube.com/watch?v=3qHkcs3kG44") @@ -45,7 +45,7 @@ naval_chat_bot.add("https://navalmanack.s3.amazonaws.com/Eric-Jorgenson_The-Alma You can reuse the local index with the same code, but without adding new documents: ```python -from embedchain import App +from embedchain import Pipeline as App naval_chat_bot = App() print(naval_chat_bot.query("What unique capacity does Naval argue humans possess when it comes to understanding explanations or concepts?")) diff --git a/docs/data-sources/docs-site.mdx b/docs/data-sources/docs-site.mdx index 206ef209..cb190536 100644 --- a/docs/data-sources/docs-site.mdx +++ b/docs/data-sources/docs-site.mdx @@ -5,7 +5,7 @@ title: '📚🌐 Code documentation' To add any code documentation website as a loader, use the data_type as `docs_site`. 
Eg: ```python -from embedchain import App +from embedchain import Pipeline as App app = App() app.add("https://docs.embedchain.ai/", data_type="docs_site") diff --git a/docs/data-sources/docx.mdx b/docs/data-sources/docx.mdx index cc459621..bee46730 100644 --- a/docs/data-sources/docx.mdx +++ b/docs/data-sources/docx.mdx @@ -7,7 +7,7 @@ title: '📄 Docx file' To add any doc/docx file, use the data_type as `docx`. `docx` allows remote urls and conventional file paths. Eg: ```python -from embedchain import App +from embedchain import Pipeline as App app = App() app.add('https://example.com/content/intro.docx', data_type="docx") diff --git a/docs/data-sources/mdx.mdx b/docs/data-sources/mdx.mdx index c59569e5..4e30235a 100644 --- a/docs/data-sources/mdx.mdx +++ b/docs/data-sources/mdx.mdx @@ -5,7 +5,7 @@ title: '📝 Mdx file' To add any `.mdx` file to your app, use the data_type (first argument to `.add()` method) as `mdx`. Note that this supports support mdx file present on machine, so this should be a file path. Eg: ```python -from embedchain import App +from embedchain import Pipeline as App app = App() app.add('path/to/file.mdx', data_type='mdx') diff --git a/docs/data-sources/notion.mdx b/docs/data-sources/notion.mdx index d6c616df..04e232c7 100644 --- a/docs/data-sources/notion.mdx +++ b/docs/data-sources/notion.mdx @@ -8,7 +8,7 @@ To load a notion page, use the data_type as `notion`. Since it is hard to automa The next argument must **end** with the `notion page id`. The id is a 32-character string. Eg: ```python -from embedchain import App +from embedchain import Pipeline as App app = App() diff --git a/docs/data-sources/pdf-file.mdx b/docs/data-sources/pdf-file.mdx index fe8b8884..637463a2 100644 --- a/docs/data-sources/pdf-file.mdx +++ b/docs/data-sources/pdf-file.mdx @@ -5,7 +5,7 @@ title: '📰 PDF file' To add any pdf file, use the data_type as `pdf_file`. 
Eg: ```python -from embedchain import App +from embedchain import Pipeline as App app = App() diff --git a/docs/data-sources/qna.mdx b/docs/data-sources/qna.mdx index f7a034d9..122d234c 100644 --- a/docs/data-sources/qna.mdx +++ b/docs/data-sources/qna.mdx @@ -5,7 +5,7 @@ title: '❓💬 Queston and answer pair' QnA pair is a local data type. To supply your own QnA pair, use the data_type as `qna_pair` and enter a tuple. Eg: ```python -from embedchain import App +from embedchain import Pipeline as App app = App() diff --git a/docs/data-sources/sitemap.mdx b/docs/data-sources/sitemap.mdx index 96b47ef1..e0c10058 100644 --- a/docs/data-sources/sitemap.mdx +++ b/docs/data-sources/sitemap.mdx @@ -5,7 +5,7 @@ title: '🗺️ Sitemap' Add all web pages from an xml-sitemap. Filters non-text files. Use the data_type as `sitemap`. Eg: ```python -from embedchain import App +from embedchain import Pipeline as App app = App() diff --git a/docs/data-sources/text.mdx b/docs/data-sources/text.mdx index 0fda6f57..78d85f7b 100644 --- a/docs/data-sources/text.mdx +++ b/docs/data-sources/text.mdx @@ -7,7 +7,7 @@ title: '📝 Text' Text is a local data type. To supply your own text, use the data_type as `text` and enter a string. The text is not processed, this can be very versatile. Eg: ```python -from embedchain import App +from embedchain import Pipeline as App app = App() diff --git a/docs/data-sources/web-page.mdx b/docs/data-sources/web-page.mdx index a30b836c..09144ccc 100644 --- a/docs/data-sources/web-page.mdx +++ b/docs/data-sources/web-page.mdx @@ -5,7 +5,7 @@ title: '🌐📄 Web page' To add any web page, use the data_type as `web_page`. 
Eg: ```python -from embedchain import App +from embedchain import Pipeline as App app = App() diff --git a/docs/data-sources/xml.mdx b/docs/data-sources/xml.mdx index afe9a412..e8f91844 100644 --- a/docs/data-sources/xml.mdx +++ b/docs/data-sources/xml.mdx @@ -7,7 +7,7 @@ title: '🧾 XML file' To add any xml file, use the data_type as `xml`. Eg: ```python -from embedchain import App +from embedchain import Pipeline as App app = App() diff --git a/docs/data-sources/youtube-video.mdx b/docs/data-sources/youtube-video.mdx index 62a769ce..5baf2f9a 100644 --- a/docs/data-sources/youtube-video.mdx +++ b/docs/data-sources/youtube-video.mdx @@ -6,7 +6,7 @@ title: '🎥📺 Youtube video' To add any youtube video to your app, use the data_type (first argument to `.add()` method) as `youtube_video`. Eg: ```python -from embedchain import App +from embedchain import Pipeline as App app = App() app.add('a_valid_youtube_url_here', data_type='youtube_video') diff --git a/docs/get-started/faq.mdx b/docs/get-started/faq.mdx index 7547670b..93da577a 100644 --- a/docs/get-started/faq.mdx +++ b/docs/get-started/faq.mdx @@ -9,7 +9,7 @@ description: 'Collections of all the frequently asked questions' ```python main.py import os -from embedchain import App +from embedchain import Pipeline as App os.environ['OPENAI_API_KEY'] = 'xxx' @@ -36,7 +36,7 @@ llm: ```python main.py import os -from embedchain import App +from embedchain import Pipeline as App os.environ['OPENAI_API_KEY'] = 'xxx' diff --git a/docs/get-started/introduction.mdx b/docs/get-started/introduction.mdx index 4a78c361..3c278efb 100644 --- a/docs/get-started/introduction.mdx +++ b/docs/get-started/introduction.mdx @@ -12,7 +12,7 @@ You can add data from different data sources using the `.add()` method. Then, si If you want to create a Naval Ravikant bot with a YouTube video, a book in PDF format, two blog posts, and a question and answer pair, all you need to do is add the respective links. 
Embedchain will take care of the rest, creating a bot for you. ```python -from embedchain import App +from embedchain import Pipeline as App naval_bot = App() # Add online data diff --git a/docs/get-started/quickstart.mdx b/docs/get-started/quickstart.mdx index c6344ebf..6ae4e9c6 100644 --- a/docs/get-started/quickstart.mdx +++ b/docs/get-started/quickstart.mdx @@ -16,22 +16,22 @@ Creating an app involves 3 steps: ```python -from embedchain import App +from embedchain import Pipeline as App app = App() ``` ```python # Add different data sources -elon_bot.add("https://en.wikipedia.org/wiki/Elon_Musk") -elon_bot.add("https://www.forbes.com/profile/elon-musk") +app.add("https://en.wikipedia.org/wiki/Elon_Musk") +app.add("https://www.forbes.com/profile/elon-musk") # You can also add local data sources such as pdf, csv files etc. -# elon_bot.add("/path/to/file.pdf") +# app.add("/path/to/file.pdf") ``` - + ```python -elon_bot.query("What is the net worth of Elon Musk today?") +app.query("What is the net worth of Elon Musk today?") # Answer: The net worth of Elon Musk today is $258.7 billion. ``` @@ -41,18 +41,18 @@ Putting it together, you can run your first app using the following code. Make s ```python import os -from embedchain import App +from embedchain import Pipeline as App os.environ["OPENAI_API_KEY"] = "xxx" -elon_bot = App() +app = App() # Add different data sources -elon_bot.add("https://en.wikipedia.org/wiki/Elon_Musk") -elon_bot.add("https://www.forbes.com/profile/elon-musk") +app.add("https://en.wikipedia.org/wiki/Elon_Musk") +app.add("https://www.forbes.com/profile/elon-musk") # You can also add local data sources such as pdf, csv files etc. -# elon_bot.add("/path/to/file.pdf") +# app.add("/path/to/file.pdf") -response = elon_bot.query("What is the net worth of Elon Musk today?") +response = app.query("What is the net worth of Elon Musk today?") print(response) # Answer: The net worth of Elon Musk today is $258.7 billion. 
``` diff --git a/docs/integration/langsmith.mdx b/docs/integration/langsmith.mdx index 5f12717b..fb9e2a9f 100644 --- a/docs/integration/langsmith.mdx +++ b/docs/integration/langsmith.mdx @@ -39,7 +39,7 @@ os.environ['LANGCHAIN_PROJECT] = ```python -from embedchain import App +from embedchain import Pipeline as App app = App() app.add("https://en.wikipedia.org/wiki/Elon_Musk") diff --git a/docs/mint.json b/docs/mint.json index 137a10cd..0aee9c78 100644 --- a/docs/mint.json +++ b/docs/mint.json @@ -71,10 +71,6 @@ "group": "Examples", "pages": ["examples/full_stack", "examples/api_server", "examples/discord_bot", "examples/slack_bot", "examples/telegram_bot", "examples/whatsapp_bot", "examples/poe_bot"] }, - { - "group": "Pipelines", - "pages": ["pipelines/quickstart"] - }, { "group": "Community", "pages": [ diff --git a/docs/pipelines/quickstart.mdx b/docs/pipelines/quickstart.mdx deleted file mode 100644 index 660ca8a6..00000000 --- a/docs/pipelines/quickstart.mdx +++ /dev/null @@ -1,44 +0,0 @@ ---- -title: '🚀 Pipelines' -description: '💡 Start building LLM powered data pipelines in 1 minute' ---- - -Embedchain lets you build data pipelines on your own data sources and deploy it in production in less than a minute. It can load, index, retrieve, and sync any unstructured data. - -Install embedchain python package: - -```bash -pip install embedchain -``` - -Creating a pipeline involves 3 steps: - - - -```python -from embedchain import Pipeline -p = Pipeline(name="Elon Musk") -``` - - - -```python -# Add different data sources -p.add("https://en.wikipedia.org/wiki/Elon_Musk") -p.add("https://www.forbes.com/profile/elon-musk") -# You can also add local data sources such as pdf, csv files etc. -# p.add("/path/to/file.pdf") -``` - - -```python -p.deploy() -``` - - - -That's it. Now, head to the [Embedchain platform](https://app.embedchain.ai) and your pipeline is available there. Make sure to set the `OPENAI_API_KEY` 🔑 environment variable in the code. 
- -After you deploy your pipeline to Embedchain platform, you can still add more data sources and update the pipeline multiple times. - -Here is a Google Colab notebook for you to get started: [![Open in Colab](https://camo.githubusercontent.com/84f0493939e0c4de4e6dbe113251b4bfb5353e57134ffd9fcab6b8714514d4d1/68747470733a2f2f636f6c61622e72657365617263682e676f6f676c652e636f6d2f6173736574732f636f6c61622d62616467652e737667)](https://colab.research.google.com/drive/1YVXaBO4yqlHZY4ho67GCJ6aD4CHNiScD?usp=sharing) diff --git a/embedchain/config/apps/app_config.py b/embedchain/config/apps/app_config.py index 2aee20e5..f0efaff9 100644 --- a/embedchain/config/apps/app_config.py +++ b/embedchain/config/apps/app_config.py @@ -15,7 +15,7 @@ class AppConfig(BaseAppConfig): self, log_level: str = "WARNING", id: Optional[str] = None, - collect_metrics: Optional[bool] = None, + collect_metrics: Optional[bool] = True, collection_name: Optional[str] = None, ): """ diff --git a/embedchain/config/pipeline_config.py b/embedchain/config/pipeline_config.py index 162a3549..e46456a1 100644 --- a/embedchain/config/pipeline_config.py +++ b/embedchain/config/pipeline_config.py @@ -16,7 +16,7 @@ class PipelineConfig(BaseAppConfig): log_level: str = "WARNING", id: Optional[str] = None, name: Optional[str] = None, - collect_metrics: Optional[bool] = False, + collect_metrics: Optional[bool] = True, ): """ Initializes a configuration class instance for an App. This is the simplest form of an embedchain app. 
diff --git a/embedchain/embedchain.py b/embedchain/embedchain.py index 106366df..34933203 100644 --- a/embedchain/embedchain.py +++ b/embedchain/embedchain.py @@ -1,18 +1,13 @@ import hashlib -import importlib.metadata import json import logging import os import sqlite3 -import threading -import uuid from pathlib import Path from typing import Any, Dict, List, Optional -import requests from dotenv import load_dotenv from langchain.docstore.document import Document -from tenacity import retry, stop_after_attempt, wait_fixed from embedchain.chunkers.base_chunker import BaseChunker from embedchain.config import AddConfig, BaseLlmConfig @@ -24,6 +19,7 @@ from embedchain.llm.base import BaseLlm from embedchain.loaders.base_loader import BaseLoader from embedchain.models.data_type import (DataType, DirectDataType, IndirectDataType, SpecialDataType) +from embedchain.telemetry.posthog import AnonymousTelemetry from embedchain.utils import detect_datatype from embedchain.vectordb.base import BaseVectorDB @@ -89,9 +85,8 @@ class EmbedChain(JSONSerializable): self.user_asks = [] # Send anonymous telemetry - self.s_id = self.config.id if self.config.id else str(uuid.uuid4()) - self.u_id = self._load_or_generate_user_id() - + self._telemetry_props = {"class": self.__class__.__name__} + self.telemetry = AnonymousTelemetry(enabled=self.config.collect_metrics) # Establish a connection to the SQLite database self.connection = sqlite3.connect(SQLITE_PATH) self.cursor = self.connection.cursor() @@ -111,12 +106,8 @@ class EmbedChain(JSONSerializable): """ ) self.connection.commit() - - # NOTE: Uncomment the next two lines when running tests to see if any test fires a telemetry event. 
- # if (self.config.collect_metrics): - # raise ConnectionRefusedError("Collection of metrics should not be allowed.") - thread_telemetry = threading.Thread(target=self._send_telemetry_event, args=("init",)) - thread_telemetry.start() + # Send anonymous telemetry + self.telemetry.capture(event_name="init", properties=self._telemetry_props) @property def collect_metrics(self): @@ -138,29 +129,6 @@ class EmbedChain(JSONSerializable): raise ValueError(f"Boolean value expected but got {type(value)}.") self.llm.online = value - def _load_or_generate_user_id(self) -> str: - """ - Loads the user id from the config file if it exists, otherwise generates a new - one and saves it to the config file. - - :return: user id - :rtype: str - """ - if not os.path.exists(CONFIG_DIR): - os.makedirs(CONFIG_DIR) - - if os.path.exists(CONFIG_FILE): - with open(CONFIG_FILE, "r") as f: - data = json.load(f) - if "user_id" in data: - return data["user_id"] - - u_id = str(uuid.uuid4()) - with open(CONFIG_FILE, "w") as f: - json.dump({"user_id": u_id}, f) - - return u_id - def add( self, source: Any, @@ -259,9 +227,14 @@ class EmbedChain(JSONSerializable): # it's quicker to check the variable twice than to count words when they won't be submitted. 
word_count = data_formatter.chunker.get_word_count(documents) - extra_metadata = {"data_type": data_type.value, "word_count": word_count, "chunks_count": new_chunks} - thread_telemetry = threading.Thread(target=self._send_telemetry_event, args=("add", extra_metadata)) - thread_telemetry.start() + # Send anonymous telemetry + event_properties = { + **self._telemetry_props, + "data_type": data_type.value, + "word_count": word_count, + "chunks_count": new_chunks, + } + self.telemetry.capture(event_name="add", properties=event_properties) return source_hash @@ -535,9 +508,7 @@ class EmbedChain(JSONSerializable): answer = self.llm.query(input_query=input_query, contexts=contexts, config=config, dry_run=dry_run) # Send anonymous telemetry - thread_telemetry = threading.Thread(target=self._send_telemetry_event, args=("query",)) - thread_telemetry.start() - + self.telemetry.capture(event_name="query", properties=self._telemetry_props) return answer def chat( @@ -569,10 +540,8 @@ class EmbedChain(JSONSerializable): """ contexts = self.retrieve_from_database(input_query=input_query, config=config, where=where) answer = self.llm.chat(input_query=input_query, contexts=contexts, config=config, dry_run=dry_run) - # Send anonymous telemetry - thread_telemetry = threading.Thread(target=self._send_telemetry_event, args=("chat",)) - thread_telemetry.start() + self.telemetry.capture(event_name="chat", properties=self._telemetry_props) return answer @@ -608,34 +577,8 @@ class EmbedChain(JSONSerializable): Resets the database. Deletes all embeddings irreversibly. `App` does not have to be reinitialized after using this method. 
""" - # Send anonymous telemetry - thread_telemetry = threading.Thread(target=self._send_telemetry_event, args=("reset",)) - thread_telemetry.start() - self.db.reset() self.cursor.execute("DELETE FROM data_sources WHERE pipeline_id = ?", (self.config.id,)) self.connection.commit() - - @retry(stop=stop_after_attempt(3), wait=wait_fixed(1)) - def _send_telemetry_event(self, method: str, extra_metadata: Optional[dict] = None): - """ - Send telemetry event to the embedchain server. This is anonymous. It can be toggled off in `AppConfig`. - """ - if not self.config.collect_metrics: - return - - with threading.Lock(): - url = "https://api.embedchain.ai/api/v1/telemetry/" - metadata = { - "s_id": self.s_id, - "version": importlib.metadata.version(__package__ or __name__), - "method": method, - "language": "py", - "u_id": self.u_id, - } - if extra_metadata: - metadata.update(extra_metadata) - - response = requests.post(url, json={"metadata": metadata}) - if response.status_code != 200: - logging.warning(f"Telemetry event failed with status code {response.status_code}") + # Send anonymous telemetry + self.telemetry.capture(event_name="reset", properties=self._telemetry_props) diff --git a/embedchain/pipeline.py b/embedchain/pipeline.py index 420eaf7f..6e9ba29e 100644 --- a/embedchain/pipeline.py +++ b/embedchain/pipeline.py @@ -18,6 +18,7 @@ from embedchain.factory import EmbedderFactory, LlmFactory, VectorDBFactory from embedchain.helper.json_serializable import register_deserializable from embedchain.llm.base import BaseLlm from embedchain.llm.openai import OpenAILlm +from embedchain.telemetry.posthog import AnonymousTelemetry from embedchain.vectordb.base import BaseVectorDB from embedchain.vectordb.chroma import ChromaDB @@ -109,8 +110,9 @@ class Pipeline(EmbedChain): self.llm = llm or OpenAILlm() self._init_db() - # setup user id and directory - self.u_id = self._load_or_generate_user_id() + # Send anonymous telemetry + self._telemetry_props = {"class": 
self.__class__.__name__} + self.telemetry = AnonymousTelemetry(enabled=self.config.collect_metrics) # Establish a connection to the SQLite database self.connection = sqlite3.connect(SQLITE_PATH) @@ -131,8 +133,10 @@ class Pipeline(EmbedChain): """ ) self.connection.commit() + # Send anonymous telemetry + self.telemetry.capture(event_name="init", properties=self._telemetry_props) - self.user_asks = [] # legacy defaults + self.user_asks = [] if self.auto_deploy: self.deploy() @@ -219,6 +223,9 @@ class Pipeline(EmbedChain): """ Search for similar documents related to the query in the vector database. """ + # Send anonymous telemetry + self.telemetry.capture(event_name="search", properties=self._telemetry_props) + # TODO: Search will call the endpoint rather than fetching the data from the db itself when deploy=True. if self.id is None: where = {"app_id": self.local_id} @@ -312,6 +319,9 @@ class Pipeline(EmbedChain): data_hash, data_type, data_value = result[1], result[2], result[3] self._process_and_upload_data(data_hash, data_type, data_value) + # Send anonymous telemetry + self.telemetry.capture(event_name="deploy", properties=self._telemetry_props) + @classmethod def from_config(cls, yaml_path: str, auto_deploy: bool = False): """ @@ -347,6 +357,11 @@ class Pipeline(EmbedChain): embedding_model = EmbedderFactory.create( embedding_model_provider, embedding_model_config_data.get("config", {}) ) + + # Send anonymous telemetry + event_properties = {"init_type": "yaml_config"} + AnonymousTelemetry().capture(event_name="init", properties=event_properties) + return cls( config=pipeline_config, llm=llm, diff --git a/embedchain/telemetry/__init__.py b/embedchain/telemetry/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/embedchain/telemetry/posthog.py b/embedchain/telemetry/posthog.py new file mode 100644 index 00000000..cd421989 --- /dev/null +++ b/embedchain/telemetry/posthog.py @@ -0,0 +1,67 @@ +import json +import logging +import os +import uuid 
+from pathlib import Path + +from posthog import Posthog + +import embedchain + +HOME_DIR = str(Path.home()) +CONFIG_DIR = os.path.join(HOME_DIR, ".embedchain") +CONFIG_FILE = os.path.join(CONFIG_DIR, "config.json") + +logger = logging.getLogger(__name__) + + +class AnonymousTelemetry: + def __init__(self, host="https://app.posthog.com", enabled=True): + self.project_api_key = "phc_XnMmNHzwxE7PVHX4mD2r8K6nfxVM48a2sq2U3N1p2lO" + self.host = host + self.posthog = Posthog(project_api_key=self.project_api_key, host=self.host) + self.user_id = self.get_user_id() + self.enabled = enabled + + # Check if telemetry tracking is disabled via environment variable + if "EC_TELEMETRY" in os.environ and os.environ["EC_TELEMETRY"].lower() not in [ + "1", + "true", + "yes", + ]: + self.enabled = False + + if not self.enabled: + self.posthog.disabled = True + + # Silence posthog logging + posthog_logger = logging.getLogger("posthog") + posthog_logger.disabled = True + + def get_user_id(self): + if not os.path.exists(CONFIG_DIR): + os.makedirs(CONFIG_DIR) + + if os.path.exists(CONFIG_FILE): + with open(CONFIG_FILE, "r") as f: + data = json.load(f) + if "user_id" in data: + return data["user_id"] + + user_id = str(uuid.uuid4()) + with open(CONFIG_FILE, "w") as f: + json.dump({"user_id": user_id}, f) + return user_id + + def capture(self, event_name, properties=None): + default_properties = { + "version": embedchain.__version__, + "language": "python", + "pid": os.getpid(), + } + properties.update(default_properties) + + try: + self.posthog.capture(self.user_id, event_name, properties) + except Exception: + logger.exception(f"Failed to send telemetry {event_name=}") diff --git a/embedchain/vectordb/chroma.py b/embedchain/vectordb/chroma.py index c8d2194d..90195459 100644 --- a/embedchain/vectordb/chroma.py +++ b/embedchain/vectordb/chroma.py @@ -38,7 +38,7 @@ class ChromaDB(BaseVectorDB): else: self.config = ChromaDbConfig() - self.settings = Settings() + self.settings = 
Settings(anonymized_telemetry=False) self.settings.allow_reset = self.config.allow_reset if hasattr(self.config, "allow_reset") else False if self.config.chroma_settings: for key, value in self.config.chroma_settings.items(): diff --git a/poetry.lock b/poetry.lock index 8b636f02..3edd2004 100644 --- a/poetry.lock +++ b/poetry.lock @@ -7196,4 +7196,4 @@ whatsapp = ["flask", "twilio"] [metadata] lock-version = "2.0" python-versions = ">=3.9,<3.13" -content-hash = "4021d63d76a9128c8d7342bc482dfe90fa9878fde1359c2f0730fa9188e503af" +content-hash = "fad388baca92d2669530c2d928920e9f7abbe0ad74786b34dc915603b4a08a17" diff --git a/pyproject.toml b/pyproject.toml index afb25ace..215f9660 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "embedchain" -version = "0.0.82" +version = "0.0.83" description = "Data platform for LLMs - Load, index, retrieve and sync any unstructured data" authors = [ "Taranjeet Singh ", @@ -93,13 +93,14 @@ python-dotenv = "^1.0.0" langchain = "^0.0.279" requests = "^2.31.0" openai = ">=0.28.0" -tiktoken = { version="^0.4.0", optional=true } -chromadb ="^0.4.8" -youtube-transcript-api = { version="^0.6.1", optional=true } -beautifulsoup4 = { version="^4.12.2", optional=true } -pypdf = { version="^3.11.0", optional=true } -pytube = { version="^15.0.0", optional=true } -duckduckgo-search = { version="^3.8.5", optional=true } +chromadb = "^0.4.8" +posthog = "^3.0.2" +tiktoken = { version = "^0.4.0", optional = true } +youtube-transcript-api = { version = "^0.6.1", optional = true } +beautifulsoup4 = { version = "^4.12.2", optional = true } +pypdf = { version = "^3.11.0", optional = true } +pytube = { version = "^15.0.0", optional = true } +duckduckgo-search = { version = "^3.8.5", optional = true } llama-hub = { version = "^0.0.29", optional = true } sentence-transformers = { version = "^2.2.2", optional = true } torch = { version = "2.0.0", optional = true } @@ -113,12 +114,12 @@ twilio = { version = "^8.5.0", 
optional = true } fastapi-poe = { version = "0.0.16", optional = true } discord = { version = "^2.3.2", optional = true } slack-sdk = { version = "3.21.3", optional = true } -cohere = { version = "^4.27", optional= true } -weaviate-client = { version = "^3.24.1", optional= true } -docx2txt = { version="^0.8", optional=true } +cohere = { version = "^4.27", optional = true } +weaviate-client = { version = "^3.24.1", optional = true } +docx2txt = { version = "^0.8", optional = true } pinecone-client = { version = "^2.2.4", optional = true } qdrant-client = { version = "1.6.3", optional = true } -unstructured = {extras = ["local-inference"], version = "^0.10.18", optional=true} +unstructured = {extras = ["local-inference"], version = "^0.10.18", optional = true} pillow = { version = "10.0.1", optional = true } torchvision = { version = ">=0.15.1, !=0.15.2", optional = true } ftfy = { version = "6.1.1", optional = true } @@ -127,7 +128,7 @@ huggingface_hub = { version = "^0.17.3", optional = true } pymilvus = { version = "2.3.1", optional = true } google-cloud-aiplatform = { version = "^1.26.1", optional = true } replicate = { version = "^0.15.4", optional = true } -jq = { version=">=1.6.0", optional = true} +jq = { version = ">=1.6.0", optional = true} [tool.poetry.group.dev.dependencies] black = "^23.3.0" diff --git a/tests/conftest.py b/tests/conftest.py index 2465fa72..962d6af5 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -14,3 +14,10 @@ def setup(): clean_db() yield clean_db() + + +@pytest.fixture(autouse=True) +def disable_telemetry(): + os.environ["EC_TELEMETRY"] = "false" + yield + del os.environ["EC_TELEMETRY"] diff --git a/tests/telemetry/test_posthog.py b/tests/telemetry/test_posthog.py new file mode 100644 index 00000000..c85af370 --- /dev/null +++ b/tests/telemetry/test_posthog.py @@ -0,0 +1,62 @@ +import os +import logging +from embedchain.telemetry.posthog import AnonymousTelemetry + + +class TestAnonymousTelemetry: + def test_init(self, 
mocker): + # Enable telemetry specifically for this test + os.environ["EC_TELEMETRY"] = "true" + mock_posthog = mocker.patch("embedchain.telemetry.posthog.Posthog") + telemetry = AnonymousTelemetry() + assert telemetry.project_api_key == "phc_XnMmNHzwxE7PVHX4mD2r8K6nfxVM48a2sq2U3N1p2lO" + assert telemetry.host == "https://app.posthog.com" + assert telemetry.enabled is True + assert telemetry.user_id + mock_posthog.assert_called_once_with(project_api_key=telemetry.project_api_key, host=telemetry.host) + + def test_init_with_disabled_telemetry(self, mocker, monkeypatch): + mocker.patch("embedchain.telemetry.posthog.Posthog") + telemetry = AnonymousTelemetry() + assert telemetry.enabled is False + assert telemetry.posthog.disabled is True + + def test_get_user_id(self, mocker, tmpdir): + mock_uuid = mocker.patch("embedchain.telemetry.posthog.uuid.uuid4") + mock_uuid.return_value = "unique_user_id" + config_file = tmpdir.join("config.json") + mocker.patch("embedchain.telemetry.posthog.CONFIG_FILE", str(config_file)) + telemetry = AnonymousTelemetry() + + user_id = telemetry.get_user_id() + assert user_id == "unique_user_id" + assert config_file.read() == '{"user_id": "unique_user_id"}' + + def test_capture(self, mocker): + # Enable telemetry specifically for this test + os.environ["EC_TELEMETRY"] = "true" + mock_posthog = mocker.patch("embedchain.telemetry.posthog.Posthog") + telemetry = AnonymousTelemetry() + event_name = "test_event" + properties = {"key": "value"} + telemetry.capture(event_name, properties) + + mock_posthog.assert_called_once_with( + project_api_key=telemetry.project_api_key, + host=telemetry.host, + ) + mock_posthog.return_value.capture.assert_called_once_with( + telemetry.user_id, + event_name, + properties, + ) + + def test_capture_with_exception(self, mocker, caplog): + mock_posthog = mocker.patch("embedchain.telemetry.posthog.Posthog") + mock_posthog.return_value.capture.side_effect = Exception("Test Exception") + telemetry = 
AnonymousTelemetry() + event_name = "test_event" + properties = {"key": "value"} + with caplog.at_level(logging.ERROR): + telemetry.capture(event_name, properties) + assert "Failed to send telemetry event" in caplog.text diff --git a/tests/vectordb/test_qdrant.py b/tests/vectordb/test_qdrant.py index 47b54504..b872ab1d 100644 --- a/tests/vectordb/test_qdrant.py +++ b/tests/vectordb/test_qdrant.py @@ -76,7 +76,7 @@ class TestQdrantDB(unittest.TestCase): qdrant_client_mock.return_value.upsert.assert_called_once_with( collection_name="embedchain-store-1526", points=Batch( - ids=["def", "ghi"], + ids=["abc", "def"], payloads=[ { "identifier": "123",