From 406c46e7f4ab4f7a5cedf3adbfa5ca812391306e Mon Sep 17 00:00:00 2001 From: Deven Patel Date: Wed, 29 Nov 2023 12:25:30 -0800 Subject: [PATCH] [Improvements] Add support for creating app from YAML string config (#980) --- docs/advanced/configuration.mdx | 141 +++++++++++++++++++---- docs/components/embedding-models.mdx | 10 +- docs/components/llms.mdx | 18 +-- docs/components/vector-databases.mdx | 21 ++-- docs/data-sources/data-type-handling.mdx | 12 ++ docs/data-sources/docs-site.mdx | 2 +- docs/data-sources/overview.mdx | 12 +- docs/data-sources/web-page.mdx | 2 +- docs/data-sources/youtube-video.mdx | 2 +- docs/get-started/examples.mdx | 9 +- docs/get-started/faq.mdx | 53 ++++++--- docs/get-started/introduction.mdx | 4 +- docs/get-started/quickstart.mdx | 110 +++++++++--------- embedchain/apps/app.py | 4 +- embedchain/llm/huggingface.py | 5 +- embedchain/pipeline.py | 77 +++++++++---- embedchain/store/assistants.py | 2 +- embedchain/utils.py | 2 +- examples/rest-api/main.py | 12 +- notebooks/anthropic.ipynb | 2 +- notebooks/azure-openai.ipynb | 2 +- notebooks/chromadb.ipynb | 2 +- notebooks/cohere.ipynb | 2 +- notebooks/elasticsearch.ipynb | 2 +- notebooks/gpt4all.ipynb | 2 +- notebooks/hugging_face_hub.ipynb | 2 +- notebooks/jina.ipynb | 2 +- notebooks/llama2.ipynb | 2 +- notebooks/openai.ipynb | 2 +- notebooks/opensearch.ipynb | 2 +- notebooks/pinecone.ipynb | 2 +- notebooks/vertex_ai.ipynb | 2 +- pyproject.toml | 2 +- tests/test_utils.py | 4 +- 34 files changed, 351 insertions(+), 179 deletions(-) diff --git a/docs/advanced/configuration.mdx b/docs/advanced/configuration.mdx index 44b4a9ec..2893e5b4 100644 --- a/docs/advanced/configuration.mdx +++ b/docs/advanced/configuration.mdx @@ -6,15 +6,16 @@ Embedchain is made to work out of the box. However, for advanced users we're als You can configure different components of your app (`llm`, `embedding model`, or `vector database`) through a simple yaml configuration that Embedchain offers. 
Here is a generic full-stack example of the yaml config: -```yaml + + +Embedchain applications are configurable using YAML file, JSON file or by directly passing the config dictionary. + + + +```yaml config.yaml app: config: - id: 'full-stack-app' - -chunker: - chunk_size: 100 - chunk_overlap: 20 - length_function: 'len' + name: 'full-stack-app' llm: provider: openai @@ -47,38 +48,138 @@ embedder: provider: openai config: model: 'text-embedding-ada-002' + +chunker: + chunk_size: 2000 + chunk_overlap: 100 + length_function: 'len' ``` +```json config.json +{ + "app": { + "config": { + "name": "full-stack-app" + } + }, + "llm": { + "provider": "openai", + "config": { + "model": "gpt-3.5-turbo", + "temperature": 0.5, + "max_tokens": 1000, + "top_p": 1, + "stream": false, + "template": "Use the following pieces of context to answer the query at the end.\nIf you don't know the answer, just say that you don't know, don't try to make up an answer.\n$context\n\nQuery: $query\n\nHelpful Answer:", + "system_prompt": "Act as William Shakespeare. Answer the following questions in the style of William Shakespeare." 
+ } + }, + "vectordb": { + "provider": "chroma", + "config": { + "collection_name": "full-stack-app", + "dir": "db", + "allow_reset": true + } + }, + "embedder": { + "provider": "openai", + "config": { + "model": "text-embedding-ada-002" + } + }, + "chunker": { + "chunk_size": 2000, + "chunk_overlap": 100, + "length_function": "len" + } +} +``` + +```python config.py +config = { + 'app': { + 'config': { + 'name': 'full-stack-app' + } + }, + 'llm': { + 'provider': 'openai', + 'config': { + 'model': 'gpt-3.5-turbo', + 'temperature': 0.5, + 'max_tokens': 1000, + 'top_p': 1, + 'stream': False, + 'template': ( + "Use the following pieces of context to answer the query at the end.\n" + "If you don't know the answer, just say that you don't know, don't try to make up an answer.\n" + "$context\n\nQuery: $query\n\nHelpful Answer:" + ), + 'system_prompt': ( + "Act as William Shakespeare. Answer the following questions in the style of William Shakespeare." + ) + } + }, + 'vectordb': { + 'provider': 'chroma', + 'config': { + 'collection_name': 'full-stack-app', + 'dir': 'db', + 'allow_reset': True + } + }, + 'embedder': { + 'provider': 'openai', + 'config': { + 'model': 'text-embedding-ada-002' + } + }, + 'chunker': { + 'chunk_size': 2000, + 'chunk_overlap': 100, + 'length_function': 'len' + } +} +``` + + Alright, let's dive into what each key means in the yaml config above: 1. `app` Section: - `config`: - - `id` (String): The ID or name of your full-stack application. -2. `chunker` Section: - - `chunk_size` (Integer): The size of each chunk of text that is sent to the language model. - - `chunk_overlap` (Integer): The amount of overlap between each chunk of text. - - `length_function` (String): The function used to calculate the length of each chunk of text. In this case, it's set to 'len'. You can also use any function import directly as a string here. -3. `llm` Section: + - `name` (String): The name of your full-stack application. 
+ - `id` (String): The id of your full-stack application. + Only use this to reload already created apps. We recommend users to not create their own ids. + - `collect_metrics` (Boolean): Indicates whether metrics should be collected for the app, defaults to `True` + - `log_level` (String): The log level for the app, defaults to `WARNING` +2. `llm` Section: - `provider` (String): The provider for the language model, which is set to 'openai'. You can find the full list of llm providers in [our docs](/components/llms). - - `model` (String): The specific model being used, 'gpt-3.5-turbo'. - `config`: + - `model` (String): The specific model being used, 'gpt-3.5-turbo'. - `temperature` (Float): Controls the randomness of the model's output. A higher value (closer to 1) makes the output more random. - `max_tokens` (Integer): Controls how many tokens are used in the response. - `top_p` (Float): Controls the diversity of word selection. A higher value (closer to 1) makes word selection more diverse. - `stream` (Boolean): Controls if the response is streamed back to the user (set to false). - `template` (String): A custom template for the prompt that the model uses to generate responses. - `system_prompt` (String): A system prompt for the model to follow when generating responses, in this case, it's set to the style of William Shakespeare. -4. `vectordb` Section: + - `stream` (Boolean): Controls if the response is streamed back to the user (set to false). + - `number_documents` (Integer): Number of documents to pull from the vectordb as context, defaults to 1 +3. `vectordb` Section: - `provider` (String): The provider for the vector database, set to 'chroma'. You can find the full list of vector database providers in [our docs](/components/vector-databases). - `config`: - - `collection_name` (String): The initial collection name for the database, set to 'full-stack-app'. - - `dir` (String): The directory for the database, set to 'db'. 
- - `allow_reset` (Boolean): Indicates whether resetting the database is allowed, set to true. -5. `embedder` Section: + - `collection_name` (String): The initial collection name for the vectordb, set to 'full-stack-app'. + - `dir` (String): The directory for the local database, set to 'db'. + - `allow_reset` (Boolean): Indicates whether resetting the vectordb is allowed, set to true. + We recommend you to checkout vectordb specific config [here](https://docs.embedchain.ai/components/vector-databases) +4. `embedder` Section: - `provider` (String): The provider for the embedder, set to 'openai'. You can find the full list of embedding model providers in [our docs](/components/embedding-models). - `config`: - `model` (String): The specific model used for text embedding, 'text-embedding-ada-002'. - +5. `chunker` Section: + - `chunk_size` (Integer): The size of each chunk of text that is sent to the language model. + - `chunk_overlap` (Integer): The amount of overlap between each chunk of text. + - `length_function` (String): The function used to calculate the length of each chunk of text. In this case, it's set to 'len'. You can also use any function import directly as a string here. 
If you have questions about the configuration above, please feel free to reach out to us using one of the following methods: \ No newline at end of file diff --git a/docs/components/embedding-models.mdx b/docs/components/embedding-models.mdx index 55c5384f..d15851d9 100644 --- a/docs/components/embedding-models.mdx +++ b/docs/components/embedding-models.mdx @@ -29,7 +29,7 @@ from embedchain import Pipeline as App os.environ['OPENAI_API_KEY'] = 'xxx' # load embedding model configuration from config.yaml file -app = App.from_config(yaml_path="config.yaml") +app = App.from_config(config_path="config.yaml") app.add("https://en.wikipedia.org/wiki/OpenAI") app.query("What is OpenAI?") @@ -59,7 +59,7 @@ os.environ["AZURE_OPENAI_ENDPOINT"] = "https://xxx.openai.azure.com/" os.environ["AZURE_OPENAI_API_KEY"] = "xxx" os.environ["OPENAI_API_VERSION"] = "xxx" -app = App.from_config(yaml_path="config.yaml") +app = App.from_config(config_path="config.yaml") ``` ```yaml config.yaml @@ -93,7 +93,7 @@ GPT4All supports generating high quality embeddings of arbitrary length document from embedchain import Pipeline as App # load embedding model configuration from config.yaml file -app = App.from_config(yaml_path="config.yaml") +app = App.from_config(config_path="config.yaml") ``` ```yaml config.yaml @@ -122,7 +122,7 @@ Hugging Face supports generating embeddings of arbitrary length documents of tex from embedchain import Pipeline as App # load embedding model configuration from config.yaml file -app = App.from_config(yaml_path="config.yaml") +app = App.from_config(config_path="config.yaml") ``` ```yaml config.yaml @@ -153,7 +153,7 @@ Embedchain supports Google's VertexAI embeddings model through a simple interfac from embedchain import Pipeline as App # load embedding model configuration from config.yaml file -app = App.from_config(yaml_path="config.yaml") +app = App.from_config(config_path="config.yaml") ``` ```yaml config.yaml diff --git a/docs/components/llms.mdx 
b/docs/components/llms.mdx index 86c84e8e..84a5dada 100644 --- a/docs/components/llms.mdx +++ b/docs/components/llms.mdx @@ -46,7 +46,7 @@ from embedchain import Pipeline as App os.environ['OPENAI_API_KEY'] = 'xxx' # load llm configuration from config.yaml file -app = App.from_config(yaml_path="config.yaml") +app = App.from_config(config_path="config.yaml") ``` ```yaml config.yaml @@ -78,7 +78,7 @@ os.environ["OPENAI_API_BASE"] = "https://xxx.openai.azure.com/" os.environ["OPENAI_API_KEY"] = "xxx" os.environ["OPENAI_API_VERSION"] = "xxx" -app = App.from_config(yaml_path="config.yaml") +app = App.from_config(config_path="config.yaml") ``` ```yaml config.yaml @@ -115,7 +115,7 @@ from embedchain import Pipeline as App os.environ["ANTHROPIC_API_KEY"] = "xxx" # load llm configuration from config.yaml file -app = App.from_config(yaml_path="config.yaml") +app = App.from_config(config_path="config.yaml") ``` ```yaml config.yaml @@ -152,7 +152,7 @@ from embedchain import Pipeline as App os.environ["COHERE_API_KEY"] = "xxx" # load llm configuration from config.yaml file -app = App.from_config(yaml_path="config.yaml") +app = App.from_config(config_path="config.yaml") ``` ```yaml config.yaml @@ -183,7 +183,7 @@ GPT4all is a free-to-use, locally running, privacy-aware chatbot. 
No GPU or inte from embedchain import Pipeline as App # load llm configuration from config.yaml file -app = App.from_config(yaml_path="config.yaml") +app = App.from_config(config_path="config.yaml") ``` ```yaml config.yaml @@ -216,7 +216,7 @@ from embedchain import Pipeline as App os.environ["JINACHAT_API_KEY"] = "xxx" # load llm configuration from config.yaml file -app = App.from_config(yaml_path="config.yaml") +app = App.from_config(config_path="config.yaml") ``` ```yaml config.yaml @@ -253,7 +253,7 @@ from embedchain import Pipeline as App os.environ["HUGGINGFACE_ACCESS_TOKEN"] = "xxx" # load llm configuration from config.yaml file -app = App.from_config(yaml_path="config.yaml") +app = App.from_config(config_path="config.yaml") ``` ```yaml config.yaml @@ -283,7 +283,7 @@ from embedchain import Pipeline as App os.environ["REPLICATE_API_TOKEN"] = "xxx" # load llm configuration from config.yaml file -app = App.from_config(yaml_path="config.yaml") +app = App.from_config(config_path="config.yaml") ``` ```yaml config.yaml @@ -308,7 +308,7 @@ Setup Google Cloud Platform application credentials by following the instruction from embedchain import Pipeline as App # load llm configuration from config.yaml file -app = App.from_config(yaml_path="config.yaml") +app = App.from_config(config_path="config.yaml") ``` ```yaml config.yaml diff --git a/docs/components/vector-databases.mdx b/docs/components/vector-databases.mdx index db198316..fe54c575 100644 --- a/docs/components/vector-databases.mdx +++ b/docs/components/vector-databases.mdx @@ -25,7 +25,7 @@ Utilizing a vector database alongside Embedchain is a seamless process. 
All you from embedchain import Pipeline as App # load chroma configuration from yaml file -app = App.from_config(yaml_path="config1.yaml") +app = App.from_config(config_path="config1.yaml") ``` ```yaml config1.yaml @@ -64,7 +64,7 @@ pip install --upgrade 'embedchain[elasticsearch]' from embedchain import Pipeline as App # load elasticsearch configuration from yaml file -app = App.from_config(yaml_path="config.yaml") +app = App.from_config(config_path="config.yaml") ``` ```yaml config.yaml @@ -73,8 +73,11 @@ vectordb: config: collection_name: 'es-index' es_url: http://localhost:9200 - allow_reset: true + http_auth: + - admin + - admin api_key: xxx + verify_certs: false ``` @@ -92,19 +95,19 @@ pip install --upgrade 'embedchain[opensearch]' from embedchain import Pipeline as App # load opensearch configuration from yaml file -app = App.from_config(yaml_path="config.yaml") +app = App.from_config(config_path="config.yaml") ``` ```yaml config.yaml vectordb: provider: opensearch config: + collection_name: 'my-app' opensearch_url: 'https://localhost:9200' http_auth: - admin - admin vector_dimension: 1536 - collection_name: 'my-app' use_ssl: false verify_certs: false ``` @@ -131,7 +134,7 @@ os.environ['ZILLIZ_CLOUD_URI'] = 'https://xxx.zillizcloud.com' os.environ['ZILLIZ_CLOUD_TOKEN'] = 'xxx' # load zilliz configuration from yaml file -app = App.from_config(yaml_path="config.yaml") +app = App.from_config(config_path="config.yaml") ``` ```yaml config.yaml @@ -167,7 +170,7 @@ In order to use Pinecone as vector database, set the environment variables `PINE from embedchain import Pipeline as App # load pinecone configuration from yaml file -app = App.from_config(yaml_path="config.yaml") +app = App.from_config(config_path="config.yaml") ``` ```yaml config.yaml @@ -190,7 +193,7 @@ In order to use Qdrant as a vector database, set the environment variables `QDRA from embedchain import Pipeline as App # load qdrant configuration from yaml file -app = 
App.from_config(yaml_path="config.yaml") +app = App.from_config(config_path="config.yaml") ``` ```yaml config.yaml @@ -210,7 +213,7 @@ In order to use Weaviate as a vector database, set the environment variables `WE from embedchain import Pipeline as App # load weaviate configuration from yaml file -app = App.from_config(yaml_path="config.yaml") +app = App.from_config(config_path="config.yaml") ``` ```yaml config.yaml diff --git a/docs/data-sources/data-type-handling.mdx b/docs/data-sources/data-type-handling.mdx index b6798e01..7f0fa917 100644 --- a/docs/data-sources/data-type-handling.mdx +++ b/docs/data-sources/data-type-handling.mdx @@ -50,3 +50,15 @@ from embedchain import Pipeline as App naval_chat_bot = App() print(naval_chat_bot.query("What unique capacity does Naval argue humans possess when it comes to understanding explanations or concepts?")) ``` + +## Resetting an app and vector database + +You can reset the app by simply calling the `reset` method. This will delete the vector database and all other app related files. + +```python +from embedchain import Pipeline as App + +app = App() +app.add("https://www.youtube.com/watch?v=3qHkcs3kG44") +app.reset() +``` diff --git a/docs/data-sources/docs-site.mdx b/docs/data-sources/docs-site.mdx index cb190536..0d3fcd9e 100644 --- a/docs/data-sources/docs-site.mdx +++ b/docs/data-sources/docs-site.mdx @@ -1,5 +1,5 @@ --- -title: 'πŸ“šπŸŒ Code documentation' +title: 'πŸ“š Code documentation' --- To add any code documentation website as a loader, use the data_type as `docs_site`. Eg: diff --git a/docs/data-sources/overview.mdx b/docs/data-sources/overview.mdx index 4cf04d50..35c23903 100644 --- a/docs/data-sources/overview.mdx +++ b/docs/data-sources/overview.mdx @@ -5,20 +5,20 @@ title: Overview Embedchain comes with built-in support for various data sources. We handle the complexity of loading unstructured data from these data sources, allowing you to easily customize your app through a user-friendly interface. 
- + - + - - + + - + - + diff --git a/docs/data-sources/web-page.mdx b/docs/data-sources/web-page.mdx index 09144ccc..63004cd1 100644 --- a/docs/data-sources/web-page.mdx +++ b/docs/data-sources/web-page.mdx @@ -1,5 +1,5 @@ --- -title: '🌐📄 Web page' +title: '🌐 Web page' --- To add any web page, use the data_type as `web_page`. Eg: diff --git a/docs/data-sources/youtube-video.mdx b/docs/data-sources/youtube-video.mdx index aed31e63..e5e70c80 100644 --- a/docs/data-sources/youtube-video.mdx +++ b/docs/data-sources/youtube-video.mdx @@ -1,5 +1,5 @@ --- -title: '📺 Youtube video' +title: '📺 Youtube' --- diff --git a/docs/get-started/examples.mdx b/docs/get-started/examples.mdx index 1556f06a..7c246674 100644 --- a/docs/get-started/examples.mdx +++ b/docs/get-started/examples.mdx @@ -1,8 +1,15 @@ --- title: 🔎 Examples -description: 'Collection of Google colab notebook and Replit links for users' --- +# Explore awesome apps + +Check out the remarkable work accomplished using [Embedchain](https://app.embedchain.ai/custom-gpts/). + +## Collection of Google colab notebook and Replit links for users + +Get started with Embedchain by trying out the examples below. You can run the examples in your browser using Google Colab or Replit. + diff --git a/docs/get-started/faq.mdx b/docs/get-started/faq.mdx index 15dddac1..80028e73 100644 --- a/docs/get-started/faq.mdx +++ b/docs/get-started/faq.mdx @@ -2,13 +2,36 @@ title: ❓ FAQs description: 'Collections of all the frequently asked questions' --- - -#### Does Embedchain support OpenAI's Assistant APIs? - + + Yes, it does. Please refer to the [OpenAI Assistant docs page](/get-started/openai-assistant). + + +Use the model provided on huggingface: `mistralai/Mistral-7B-v0.1` + +```python main.py +import os +from embedchain import Pipeline as App -#### How to use `gpt-4-turbo` model released on OpenAI DevDay? 
+os.environ["OPENAI_API_KEY"] = "sk-xxx" +os.environ["HUGGINGFACE_ACCESS_TOKEN"] = "hf_your_token" +app = App.from_config("huggingface.yaml") +``` +```yaml huggingface.yaml +llm: + provider: huggingface + config: + model: 'mistralai/Mistral-7B-v0.1' + temperature: 0.5 + max_tokens: 1000 + top_p: 0.5 + stream: false +``` + + + +Use the model `gpt-4-turbo` provided by OpenAI. ```python main.py @@ -18,7 +41,7 @@ from embedchain import Pipeline as App os.environ['OPENAI_API_KEY'] = 'xxx' # load llm configuration from gpt4_turbo.yaml file -app = App.from_config(yaml_path="gpt4_turbo.yaml") +app = App.from_config(config_path="gpt4_turbo.yaml") ``` ```yaml gpt4_turbo.yaml @@ -31,12 +54,9 @@ llm: top_p: 1 stream: false ``` - - - -#### How to use GPT-4 as the LLM model? - + + ```python main.py @@ -46,7 +66,7 @@ from embedchain import Pipeline as App os.environ['OPENAI_API_KEY'] = 'xxx' # load llm configuration from gpt4.yaml file -app = App.from_config(yaml_path="gpt4.yaml") +app = App.from_config(config_path="gpt4.yaml") ``` ```yaml gpt4.yaml @@ -61,9 +81,8 @@ llm: ``` - -#### I don't have OpenAI credits. How can I use some open source model? - + + ```python main.py @@ -73,7 +92,7 @@ from embedchain import Pipeline as App os.environ['OPENAI_API_KEY'] = 'xxx' # load llm configuration from opensource.yaml file -app = App.from_config(yaml_path="opensource.yaml") +app = App.from_config(config_path="opensource.yaml") ``` ```yaml opensource.yaml @@ -93,8 +112,10 @@ embedder: ``` -#### How to contact support? + + +#### Need more help? If docs aren't sufficient, please feel free to reach out to us using one of the following methods: diff --git a/docs/get-started/introduction.mdx b/docs/get-started/introduction.mdx index d5869e92..2c243fa8 100644 --- a/docs/get-started/introduction.mdx +++ b/docs/get-started/introduction.mdx @@ -105,7 +105,7 @@ app.deploy() # ✅ Data of type: web_page, value: https://www.forbes.com/profile/elon-musk added successfully. ``` -## 🚀 How it works? 
+## 🛠️ How it works? Embedchain abstracts out the following steps from you to easily create LLM powered apps: @@ -129,3 +129,5 @@ The process of loading the dataset and querying involves multiple steps, each wi - How should I find similar documents for a query? Which ranking model should I use? Embedchain takes care of all these nuances and provides a simple interface to create apps on any data. + +## [🚀 Get started](https://docs.embedchain.ai/get-started/quickstart) diff --git a/docs/get-started/quickstart.mdx b/docs/get-started/quickstart.mdx index 2179e0b2..a84ed72c 100644 --- a/docs/get-started/quickstart.mdx +++ b/docs/get-started/quickstart.mdx @@ -12,79 +12,73 @@ pip install embedchain ``` -Embedchain now supports OpenAI's latest `gpt-4-turbo` model. Checkout the [docs here](/get-started/faq#how-to-use-gpt-4-turbo-model-released-on-openai-devday) on how to use it. +Embedchain now supports OpenAI's latest `gpt-4-turbo` model. Check out the [FAQs](/get-started/faq#how-to-use-gpt-4-turbo-model-released-on-openai-devday). Creating an app involves 3 steps: -```python -from embedchain import Pipeline as App -app = App() -``` + ```python + from embedchain import Pipeline as App + app = App() + ``` + + Embedchain provides a wide range of options to customize your app. You can customize the model, data sources, and much more. + Explore the custom configurations [here](https://docs.embedchain.ai/advanced/configuration). + ```python + from embedchain import Pipeline as App + app = App.from_config(config_path="config.yaml") + ``` + -```python -# Add different data sources -app.add("https://en.wikipedia.org/wiki/Elon_Musk") -app.add("https://www.forbes.com/profile/elon-musk") -# You can also add local data sources such as pdf, csv files etc. 
-# app.add("/path/to/file.pdf") -``` + ```python + app.add("https://en.wikipedia.org/wiki/Elon_Musk") + app.add("https://www.forbes.com/profile/elon-musk") + # app.add("path/to/file/elon_musk.pdf") + ``` + + Embedchain supports adding data from many data sources including web pages, PDFs, databases, and more. + Explore the list of supported [data sources](https://docs.embedchain.ai/data-sources/overview). + - -```python -app.query("What is the net worth of Elon Musk today?") -# Answer: The net worth of Elon Musk today is $258.7 billion. -``` + + ```python + app.query("What is the net worth of Elon Musk today?") + # Answer: The net worth of Elon Musk today is $258.7 billion. + ``` + + Embedchain provides a wide range of features to interact with your app. You can chat with your app, ask questions, search through your data, and much more. + ```python + app.chat("How many companies does Elon Musk run? Name those") + # Answer: Elon Musk runs 3 companies: Tesla, SpaceX, and Neuralink. + app.chat("What is his net worth today?") + # Answer: The net worth of Elon Musk today is $258.7 billion. + ``` + To learn about other features, click [here](https://docs.embedchain.ai/get-started/introduction) + - -```python -app.deploy() -# πŸ”‘ Enter your Embedchain API key. You can find the API key at https://app.embedchain.ai/settings/keys/ -# ec-xxxxxx + + ```python + app.deploy() + # πŸ”‘ Enter your Embedchain API key. You can find the API key at https://app.embedchain.ai/settings/keys/ + # ec-xxxxxx -# πŸ› οΈ Creating pipeline on the platform... -# πŸŽ‰πŸŽ‰πŸŽ‰ Pipeline created successfully! View your pipeline: https://app.embedchain.ai/pipelines/xxxxx + # πŸ› οΈ Creating pipeline on the platform... + # πŸŽ‰πŸŽ‰πŸŽ‰ Pipeline created successfully! View your pipeline: https://app.embedchain.ai/pipelines/xxxxx -# πŸ› οΈ Adding data to your pipeline... -# βœ… Data of type: web_page, value: https://www.forbes.com/profile/elon-musk added successfully. 
-``` + # πŸ› οΈ Adding data to your pipeline... + # βœ… Data of type: web_page, value: https://www.forbes.com/profile/elon-musk added successfully. + ``` + + You can now share your app with others from our platform. + Access your app on our [platform](https://app.embedchain.ai/). + -Putting it together, you can run your first app using the following code. Make sure to set the `OPENAI_API_KEY` πŸ”‘ environment variable in the code. - -```python -import os -from embedchain import Pipeline as App - -os.environ["OPENAI_API_KEY"] = "xxx" -app = App() - -# Add different data sources -app.add("https://en.wikipedia.org/wiki/Elon_Musk") -app.add("https://www.forbes.com/profile/elon-musk") -# You can also add local data sources such as pdf, csv files etc. -# app.add("/path/to/file.pdf") - -response = app.query("What is the net worth of Elon Musk today?") -print(response) -# Answer: The net worth of Elon Musk today is $258.7 billion. - -app.deploy() -# πŸ”‘ Enter your Embedchain API key. You can find the API key at https://app.embedchain.ai/settings/keys/ -# ec-xxxxxx - -# πŸ› οΈ Creating pipeline on the platform... -# πŸŽ‰πŸŽ‰πŸŽ‰ Pipeline created successfully! View your pipeline: https://app.embedchain.ai/pipelines/xxxxx - -# πŸ› οΈ Adding data to your pipeline... -# βœ… Data of type: web_page, value: https://www.forbes.com/profile/elon-musk added successfully. -``` - -You can try it out yourself using the following Google Colab notebook: +Putting it together, you can run your first app using the following Google Colab. Make sure to set the `OPENAI_API_KEY` πŸ”‘ environment variable in the code. 
Open in Colab diff --git a/embedchain/apps/app.py b/embedchain/apps/app.py index 15ba4e1d..043dbfde 100644 --- a/embedchain/apps/app.py +++ b/embedchain/apps/app.py @@ -12,7 +12,7 @@ from embedchain.factory import EmbedderFactory, LlmFactory, VectorDBFactory from embedchain.helpers.json_serializable import register_deserializable from embedchain.llm.base import BaseLlm from embedchain.llm.openai import OpenAILlm -from embedchain.utils import validate_yaml_config +from embedchain.utils import validate_config from embedchain.vectordb.base import BaseVectorDB from embedchain.vectordb.chroma import ChromaDB @@ -134,7 +134,7 @@ class App(EmbedChain): config_data = yaml.safe_load(file) try: - validate_yaml_config(config_data) + validate_config(config_data) except Exception as e: raise Exception(f"❌ Error occurred while validating the YAML config. Error: {str(e)}") diff --git a/embedchain/llm/huggingface.py b/embedchain/llm/huggingface.py index 4da6d517..a4628c0d 100644 --- a/embedchain/llm/huggingface.py +++ b/embedchain/llm/huggingface.py @@ -1,4 +1,5 @@ import importlib +import logging import os from typing import Optional @@ -42,9 +43,11 @@ class HuggingFaceLlm(BaseLlm): else: raise ValueError("`top_p` must be > 0.0 and < 1.0") + model = config.model or "google/flan-t5-xxl" + logging.info(f"Using HuggingFaceHub with model {model}") llm = HuggingFaceHub( huggingfacehub_api_token=os.environ["HUGGINGFACE_ACCESS_TOKEN"], - repo_id=config.model or "google/flan-t5-xxl", + repo_id=model, model_kwargs=model_kwargs, ) diff --git a/embedchain/pipeline.py b/embedchain/pipeline.py index 8a922fe1..f54e7c7a 100644 --- a/embedchain/pipeline.py +++ b/embedchain/pipeline.py @@ -4,6 +4,7 @@ import logging import os import sqlite3 import uuid +from typing import Any, Dict, Optional import requests import yaml @@ -19,7 +20,7 @@ from embedchain.helpers.json_serializable import register_deserializable from embedchain.llm.base import BaseLlm from embedchain.llm.openai import OpenAILlm from 
embedchain.telemetry.posthog import AnonymousTelemetry -from embedchain.utils import validate_yaml_config +from embedchain.utils import validate_config from embedchain.vectordb.base import BaseVectorDB from embedchain.vectordb.chroma import ChromaDB @@ -43,7 +44,7 @@ class Pipeline(EmbedChain): db: BaseVectorDB = None, embedding_model: BaseEmbedder = None, llm: BaseLlm = None, - yaml_path: str = None, + config_data: dict = None, log_level=logging.WARN, auto_deploy: bool = False, chunker: ChunkerConfig = None, @@ -59,15 +60,15 @@ class Pipeline(EmbedChain): :type embedding_model: BaseEmbedder, optional :param llm: The LLM model used to calculate embeddings, defaults to None :type llm: BaseLlm, optional - :param yaml_path: Path to the YAML configuration file, defaults to None - :type yaml_path: str, optional + :param config_data: Config dictionary, defaults to None + :type config_data: dict, optional :param log_level: Log level to use, defaults to logging.WARN :type log_level: int, optional :param auto_deploy: Whether to deploy the pipeline automatically, defaults to False :type auto_deploy: bool, optional :raises Exception: If an error occurs while creating the pipeline """ - if id and yaml_path: + if id and config_data: raise Exception("Cannot provide both id and config. 
Please provide only one of them.") if id and name: @@ -79,8 +80,8 @@ class Pipeline(EmbedChain): logging.basicConfig(level=log_level, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s") self.logger = logging.getLogger(__name__) self.auto_deploy = auto_deploy - # Store the yaml config as an attribute to be able to send it - self.yaml_config = None + # Store the dict config as an attribute to be able to send it + self.config_data = config_data if (config_data and validate_config(config_data)) else None self.client = None # pipeline_id from the backend self.id = None @@ -92,11 +93,6 @@ class Pipeline(EmbedChain): self.name = self.config.name self.config.id = self.local_id = str(uuid.uuid4()) if self.config.id is None else self.config.id - if yaml_path: - with open(yaml_path, "r") as file: - config_data = yaml.safe_load(file) - self.yaml_config = config_data - if id is not None: # Init client first since user is trying to fetch the pipeline # details from the platform @@ -187,9 +183,9 @@ class Pipeline(EmbedChain): Create a pipeline on the platform. """ print("πŸ› οΈ Creating pipeline on the platform...") - # self.yaml_config is a dict. Pass it inside the key 'yaml_config' to the backend + # self.config_data is a dict. Pass it inside the key 'yaml_config' to the backend payload = { - "yaml_config": json.dumps(self.yaml_config), + "yaml_config": json.dumps(self.config_data), "name": self.name, "local_id": self.local_id, } @@ -346,24 +342,57 @@ class Pipeline(EmbedChain): self.telemetry.capture(event_name="deploy", properties=self._telemetry_props) @classmethod - def from_config(cls, yaml_path: str, auto_deploy: bool = False): + def from_config( + cls, + config_path: Optional[str] = None, + config: Optional[Dict[str, Any]] = None, + auto_deploy: bool = False, + yaml_path: Optional[str] = None, + ): """ - Instantiate a Pipeline object from a YAML configuration file. + Instantiate a Pipeline object from a configuration. 
- :param yaml_path: Path to the YAML configuration file. - :type yaml_path: str + :param config_path: Path to the YAML or JSON configuration file. + :type config_path: Optional[str] + :param config: A dictionary containing the configuration. + :type config: Optional[Dict[str, Any]] :param auto_deploy: Whether to deploy the pipeline automatically, defaults to False :type auto_deploy: bool, optional + :param yaml_path: (Deprecated) Path to the YAML configuration file. Use config_path instead. + :type yaml_path: Optional[str] :return: An instance of the Pipeline class. :rtype: Pipeline """ - with open(yaml_path, "r") as file: - config_data = yaml.safe_load(file) + # Backward compatibility for yaml_path + if yaml_path and not config_path: + config_path = yaml_path + + if config_path and config: + raise ValueError("Please provide only one of config_path or config.") + + config_data = None + + if config_path: + file_extension = os.path.splitext(config_path)[1] + with open(config_path, "r") as file: + if file_extension in [".yaml", ".yml"]: + config_data = yaml.safe_load(file) + elif file_extension == ".json": + config_data = json.load(file) + else: + raise ValueError("config_path must be a path to a YAML or JSON file.") + elif config and isinstance(config, dict): + config_data = config + else: + logging.error( + "Please provide either a config file path (YAML or JSON) or a config dictionary. Falling back to defaults because no config is provided.", # noqa: E501 + ) + config_data = {} try: - validate_yaml_config(config_data) + validate_config(config_data) except Exception as e: - raise Exception(f"❌ Error occurred while validating the YAML config. Error: {str(e)}") + raise Exception(f"Error occurred while validating the config. 
Error: {str(e)}") pipeline_config_data = config_data.get("app", {}).get("config", {}) db_config_data = config_data.get("vectordb", {}) @@ -388,7 +417,7 @@ class Pipeline(EmbedChain): ) # Send anonymous telemetry - event_properties = {"init_type": "yaml_config"} + event_properties = {"init_type": "config_data"} AnonymousTelemetry().capture(event_name="init", properties=event_properties) return cls( @@ -396,7 +425,7 @@ class Pipeline(EmbedChain): llm=llm, db=db, embedding_model=embedding_model, - yaml_path=yaml_path, + config_data=config_data, auto_deploy=auto_deploy, chunker=chunker_config_data, ) diff --git a/embedchain/store/assistants.py b/embedchain/store/assistants.py index 9803c083..87f6cb28 100644 --- a/embedchain/store/assistants.py +++ b/embedchain/store/assistants.py @@ -165,7 +165,7 @@ class AIAssistant: self.instructions = instructions self.assistant_id = assistant_id or str(uuid.uuid4()) self.thread_id = thread_id or str(uuid.uuid4()) - self.pipeline = Pipeline.from_config(yaml_path=yaml_path) if yaml_path else Pipeline() + self.pipeline = Pipeline.from_config(config_path=yaml_path) if yaml_path else Pipeline() self.pipeline.local_id = self.pipeline.config.id = self.thread_id if self.instructions: diff --git a/embedchain/utils.py b/embedchain/utils.py index e51bcebd..1f5309df 100644 --- a/embedchain/utils.py +++ b/embedchain/utils.py @@ -355,7 +355,7 @@ def is_valid_json_string(source: str): return False -def validate_yaml_config(config_data): +def validate_config(config_data): schema = Schema( { Optional("app"): { diff --git a/examples/rest-api/main.py b/examples/rest-api/main.py index a8a34991..2ed1ac50 100644 --- a/examples/rest-api/main.py +++ b/examples/rest-api/main.py @@ -108,7 +108,7 @@ async def get_datasources_associated_with_app_id(app_id: str, db: Session = Depe if db_app is None: raise HTTPException(detail=f"App with id {app_id} does not exist, please create it first.", status_code=400) - app = App.from_config(yaml_path=db_app.config) + app 
= App.from_config(config_path=db_app.config) response = app.get_data_sources() return {"results": response} @@ -147,7 +147,7 @@ async def add_datasource_to_an_app(body: SourceApp, app_id: str, db: Session = D if db_app is None: raise HTTPException(detail=f"App with id {app_id} does not exist, please create it first.", status_code=400) - app = App.from_config(yaml_path=db_app.config) + app = App.from_config(config_path=db_app.config) response = app.add(source=body.source, data_type=body.data_type) return DefaultResponse(response=response) @@ -185,7 +185,7 @@ async def query_an_app(body: QueryApp, app_id: str, db: Session = Depends(get_db if db_app is None: raise HTTPException(detail=f"App with id {app_id} does not exist, please create it first.", status_code=400) - app = App.from_config(yaml_path=db_app.config) + app = App.from_config(config_path=db_app.config) response = app.query(body.query) return DefaultResponse(response=response) @@ -227,7 +227,7 @@ async def query_an_app(body: QueryApp, app_id: str, db: Session = Depends(get_db # status_code=400 # ) -# app = App.from_config(yaml_path=db_app.config) +# app = App.from_config(config_path=db_app.config) # response = app.chat(body.message) # return DefaultResponse(response=response) @@ -264,7 +264,7 @@ async def deploy_app(body: DeployAppRequest, app_id: str, db: Session = Depends( if db_app is None: raise HTTPException(detail=f"App with id {app_id} does not exist, please create it first.", status_code=400) - app = App.from_config(yaml_path=db_app.config) + app = App.from_config(config_path=db_app.config) api_key = body.api_key # this will save the api key in the embedchain.db @@ -305,7 +305,7 @@ async def delete_app(app_id: str, db: Session = Depends(get_db)): if db_app is None: raise HTTPException(detail=f"App with id {app_id} does not exist, please create it first.", status_code=400) - app = App.from_config(yaml_path=db_app.config) + app = App.from_config(config_path=db_app.config) # reset app.db app.db.reset() 
diff --git a/notebooks/anthropic.ipynb b/notebooks/anthropic.ipynb index ad565aa1..fcf61a57 100644 --- a/notebooks/anthropic.ipynb +++ b/notebooks/anthropic.ipynb @@ -109,7 +109,7 @@ }, "outputs": [], "source": [ - "app = App.from_config(yaml_path=\"anthropic.yaml\")" + "app = App.from_config(config_path=\"anthropic.yaml\")" ] }, { diff --git a/notebooks/azure-openai.ipynb b/notebooks/azure-openai.ipynb index 1c77da1d..0a5c72ec 100644 --- a/notebooks/azure-openai.ipynb +++ b/notebooks/azure-openai.ipynb @@ -105,7 +105,7 @@ "metadata": {}, "outputs": [], "source": [ - "app = App.from_config(yaml_path=\"azure_openai.yaml\")" + "app = App.from_config(config_path=\"azure_openai.yaml\")" ] }, { diff --git a/notebooks/chromadb.ipynb b/notebooks/chromadb.ipynb index b2b80c14..d31feca6 100644 --- a/notebooks/chromadb.ipynb +++ b/notebooks/chromadb.ipynb @@ -105,7 +105,7 @@ }, "outputs": [], "source": [ - "app = App.from_config(yaml_path=\"chromadb.yaml\")" + "app = App.from_config(config_path=\"chromadb.yaml\")" ] }, { diff --git a/notebooks/cohere.ipynb b/notebooks/cohere.ipynb index 75295e5d..dc8c61fb 100644 --- a/notebooks/cohere.ipynb +++ b/notebooks/cohere.ipynb @@ -114,7 +114,7 @@ }, "outputs": [], "source": [ - "app = App.from_config(yaml_path=\"cohere.yaml\")" + "app = App.from_config(config_path=\"cohere.yaml\")" ] }, { diff --git a/notebooks/elasticsearch.ipynb b/notebooks/elasticsearch.ipynb index 9b0c9f0d..bdabb1ea 100644 --- a/notebooks/elasticsearch.ipynb +++ b/notebooks/elasticsearch.ipynb @@ -103,7 +103,7 @@ }, "outputs": [], "source": [ - "app = App.from_config(yaml_path=\"elasticsearch.yaml\")" + "app = App.from_config(config_path=\"elasticsearch.yaml\")" ] }, { diff --git a/notebooks/gpt4all.ipynb b/notebooks/gpt4all.ipynb index 3fee4617..4ed13409 100644 --- a/notebooks/gpt4all.ipynb +++ b/notebooks/gpt4all.ipynb @@ -114,7 +114,7 @@ }, "outputs": [], "source": [ - "app = App.from_config(yaml_path=\"gpt4all.yaml\")" + "app = 
App.from_config(config_path=\"gpt4all.yaml\")" ] }, { diff --git a/notebooks/hugging_face_hub.ipynb b/notebooks/hugging_face_hub.ipynb index 23d72881..aac964a9 100644 --- a/notebooks/hugging_face_hub.ipynb +++ b/notebooks/hugging_face_hub.ipynb @@ -114,7 +114,7 @@ }, "outputs": [], "source": [ - "app = App.from_config(yaml_path=\"huggingface.yaml\")" + "app = App.from_config(config_path=\"huggingface.yaml\")" ] }, { diff --git a/notebooks/jina.ipynb b/notebooks/jina.ipynb index 70e7589c..b7c3b619 100644 --- a/notebooks/jina.ipynb +++ b/notebooks/jina.ipynb @@ -114,7 +114,7 @@ }, "outputs": [], "source": [ - "app = App.from_config(yaml_path=\"jina.yaml\")" + "app = App.from_config(config_path=\"jina.yaml\")" ] }, { diff --git a/notebooks/llama2.ipynb b/notebooks/llama2.ipynb index d7dd11ab..fd1903a8 100644 --- a/notebooks/llama2.ipynb +++ b/notebooks/llama2.ipynb @@ -109,7 +109,7 @@ }, "outputs": [], "source": [ - "app = App.from_config(yaml_path=\"llama2.yaml\")" + "app = App.from_config(config_path=\"llama2.yaml\")" ] }, { diff --git a/notebooks/openai.ipynb b/notebooks/openai.ipynb index b3a410f9..125c4ebd 100644 --- a/notebooks/openai.ipynb +++ b/notebooks/openai.ipynb @@ -115,7 +115,7 @@ }, "outputs": [], "source": [ - "app = App.from_config(yaml_path=\"openai.yaml\")" + "app = App.from_config(config_path=\"openai.yaml\")" ] }, { diff --git a/notebooks/opensearch.ipynb b/notebooks/opensearch.ipynb index 42d2343b..be45992b 100644 --- a/notebooks/opensearch.ipynb +++ b/notebooks/opensearch.ipynb @@ -107,7 +107,7 @@ }, "outputs": [], "source": [ - "app = App.from_config(yaml_path=\"opensearch.yaml\")" + "app = App.from_config(config_path=\"opensearch.yaml\")" ] }, { diff --git a/notebooks/pinecone.ipynb b/notebooks/pinecone.ipynb index b335d6a3..d9a63c17 100644 --- a/notebooks/pinecone.ipynb +++ b/notebooks/pinecone.ipynb @@ -104,7 +104,7 @@ }, "outputs": [], "source": [ - "app = App.from_config(yaml_path=\"pinecone.yaml\")" + "app = 
App.from_config(config_path=\"pinecone.yaml\")" ] }, { diff --git a/notebooks/vertex_ai.ipynb b/notebooks/vertex_ai.ipynb index d36c9751..f4170115 100644 --- a/notebooks/vertex_ai.ipynb +++ b/notebooks/vertex_ai.ipynb @@ -117,7 +117,7 @@ }, "outputs": [], "source": [ - "app = App.from_config(yaml_path=\"vertexai.yaml\")" + "app = App.from_config(config_path=\"vertexai.yaml\")" ] }, { diff --git a/pyproject.toml b/pyproject.toml index b6b49a10..b477c55d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "embedchain" -version = "0.1.22" +version = "0.1.23" description = "Data platform for LLMs - Load, index, retrieve and sync any unstructured data" authors = [ "Taranjeet Singh ", diff --git a/tests/test_utils.py b/tests/test_utils.py index 3ca05548..891bb453 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,6 +1,6 @@ import yaml -from embedchain.utils import validate_yaml_config +from embedchain.utils import validate_config CONFIG_YAMLS = [ "configs/anthropic.yaml", @@ -30,7 +30,7 @@ def test_all_config_yamls(): assert config is not None try: - validate_yaml_config(config) + validate_config(config) except Exception as e: print(f"Error in {config_yaml}: {e}") raise e