From 28d41e939750e936a7280c07ead53094d8211239 Mon Sep 17 00:00:00 2001 From: Taranjeet Singh Date: Sat, 30 Sep 2023 11:13:44 -0700 Subject: [PATCH] fix: improve add data section (#740) --- docs/advanced/data_types.mdx | 160 ------------------ docs/data-sources/csv.mdx | 14 ++ docs/data-sources/data-type-handling.mdx | 54 ++++++ docs/data-sources/docs-site.mdx | 11 ++ docs/data-sources/docx.mdx | 12 ++ .../how-to-add-data.mdx} | 2 +- docs/data-sources/mdx.mdx | 12 ++ docs/data-sources/notion.mdx | 15 ++ docs/data-sources/pdf-file.mdx | 14 ++ docs/data-sources/qna.mdx | 11 ++ docs/data-sources/request-new-format.mdx | 6 + docs/data-sources/sitemap.mdx | 11 ++ docs/data-sources/text.mdx | 13 ++ docs/data-sources/web-page.mdx | 11 ++ docs/data-sources/youtube-video.mdx | 12 ++ docs/mint.json | 28 ++- 16 files changed, 224 insertions(+), 162 deletions(-) delete mode 100644 docs/advanced/data_types.mdx create mode 100644 docs/data-sources/csv.mdx create mode 100644 docs/data-sources/data-type-handling.mdx create mode 100644 docs/data-sources/docs-site.mdx create mode 100644 docs/data-sources/docx.mdx rename docs/{advanced/adding_data.mdx => data-sources/how-to-add-data.mdx} (96%) create mode 100644 docs/data-sources/mdx.mdx create mode 100644 docs/data-sources/notion.mdx create mode 100644 docs/data-sources/pdf-file.mdx create mode 100644 docs/data-sources/qna.mdx create mode 100644 docs/data-sources/request-new-format.mdx create mode 100644 docs/data-sources/sitemap.mdx create mode 100644 docs/data-sources/text.mdx create mode 100644 docs/data-sources/web-page.mdx create mode 100644 docs/data-sources/youtube-video.mdx diff --git a/docs/advanced/data_types.mdx b/docs/advanced/data_types.mdx deleted file mode 100644 index 1d0e130c..00000000 --- a/docs/advanced/data_types.mdx +++ /dev/null @@ -1,160 +0,0 @@ ---- -title: '📋 Supported data formats' ---- - -## Automatic data type detection -The add method automatically tries to detect the data_type, based on your input for 
the source argument. So `app.add('https://www.youtube.com/watch?v=dQw4w9WgXcQ')` is enough to embed a YouTube video. - -This detection is implemented for all formats. It is based on factors such as whether it's a URL, a local file, the source data type, etc. - -### Debugging automatic detection - - -Set `log_level=DEBUG` (in [AppConfig](http://localhost:3000/advanced/query_configuration#appconfig)) and make sure it's working as intended. - -Otherwise, you will not know when, for instance, an invalid filepath is interpreted as raw text instead. - -### Forcing a data type - -To omit any issues with the data type detection, you can **force** a data_type by adding it as a `add` method argument. -The examples below show you the keyword to force the respective `data_type`. - -Forcing can also be used for edge cases, such as interpreting a sitemap as a web_page, for reading its raw text instead of following links. - -## Remote Data Types - - -**Use local files in remote data types** - -Some data_types are meant for remote content and only work with URLs. -You can pass local files by formatting the path using the `file:` [URI scheme](https://en.wikipedia.org/wiki/File_URI_scheme), e.g. `file:///info.pdf`. - - -### Youtube video - -To add any youtube video to your app, use the data_type (first argument to `.add()` method) as `youtube_video`. Eg: - -```python -app.add('a_valid_youtube_url_here', data_type='youtube_video') -``` - -### PDF file - -To add any pdf file, use the data_type as `pdf_file`. Eg: - -```python -app.add('a_valid_url_where_pdf_file_can_be_accessed', data_type='pdf_file') -``` - -Note that we do not support password protected pdfs. - -### Web page - -To add any web page, use the data_type as `web_page`. Eg: - -```python -app.add('a_valid_web_page_url', data_type='web_page') -``` - -### Sitemap - -Add all web pages from an xml-sitemap. Filters non-text files. Use the data_type as `sitemap`. 
Eg: - -```python -app.add('https://example.com/sitemap.xml', data_type='sitemap') -``` - -### Doc file - -To add any doc/docx file, use the data_type as `docx`. `docx` allows remote urls and conventional file paths. Eg: - -```python -app.add('https://example.com/content/intro.docx', data_type="docx") -app.add('content/intro.docx', data_type="docx") -``` - -### CSV file - -To add any csv file, use the data_type as `csv`. `csv` allows remote urls and conventional file paths. Headers are included for each line, so if you have an `age` column, `18` will be added as `age: 18`. Eg: - -```python -app.add('https://example.com/content/sheet.csv', data_type="csv") -app.add('content/sheet.csv', data_type="csv") -``` - -Note: There is a size limit allowed for csv file beyond which it can throw error. This limit is set by the LLMs. Please consider chunking large csv files into smaller csv files. - -### Code documentation website loader - -To add any code documentation website as a loader, use the data_type as `docs_site`. Eg: - -```python -app.add("https://docs.embedchain.ai/", data_type="docs_site") -``` - -### Notion -To use notion you must install the extra dependencies with `pip install --upgrade embedchain[notion]`. - -To load a notion page, use the data_type as `notion`. Since it is hard to automatically detect, forcing this is advised. -The next argument must **end** with the `notion page id`. The id is a 32-character string. Eg: - -```python -app.add("cfbc134ca6464fc980d0391613959196", "notion") -app.add("my-page-cfbc134ca6464fc980d0391613959196", "notion") -app.add("https://www.notion.so/my-page-cfbc134ca6464fc980d0391613959196", "notion") -``` - -### Mdx file - -To add any mdx file to your app, use the data_type (first argument to `.add()` method) as `mdx`. Note that this supports support mdx file present on machine, so this should be a file path. 
Eg: - -```python -app.add('path/to/file.mdx', data_type='mdx') -``` - -## Local Data Types - -### Text - -To supply your own text, use the data_type as `text` and enter a string. The text is not processed, this can be very versatile. Eg: - -```python -app.add('Seek wealth, not money or status. Wealth is having assets that earn while you sleep. Money is how we transfer time and wealth. Status is your place in the social hierarchy.', data_type='text') -``` - -Note: This is not used in the examples because in most cases you will supply a whole paragraph or file, which did not fit. - -### QnA pair - -To supply your own QnA pair, use the data_type as `qna_pair` and enter a tuple. Eg: - -```python -app.add(("Question", "Answer"), data_type="qna_pair") -``` - -## Reusing a vector database - -Default behavior is to create a persistent vector DB in the directory **./db**. You can split your application into two Python scripts: one to create a local vector DB and the other to reuse this local persistent vector DB. This is useful when you want to index hundreds of documents and separately implement a chat interface. - -Create a local index: - -```python -from embedchain import App - -naval_chat_bot = App() -naval_chat_bot.add("https://www.youtube.com/watch?v=3qHkcs3kG44") -naval_chat_bot.add("https://navalmanack.s3.amazonaws.com/Eric-Jorgenson_The-Almanack-of-Naval-Ravikant_Final.pdf") -``` - -You can reuse the local index with the same code, but without adding new documents: - -```python -from embedchain import App - -naval_chat_bot = App() -print(naval_chat_bot.query("What unique capacity does Naval argue humans possess when it comes to understanding explanations or concepts?")) -``` - -## More formats (coming soon!) - -- If you want to add any other format, please create an [issue](https://github.com/embedchain/embedchain/issues) and we will add it to the list of supported formats. 
diff --git a/docs/data-sources/csv.mdx b/docs/data-sources/csv.mdx new file mode 100644 index 00000000..47111db2 --- /dev/null +++ b/docs/data-sources/csv.mdx @@ -0,0 +1,14 @@ +--- +title: 'CSV' +--- + +### CSV file + +To add any csv file, use the data_type as `csv`. `csv` allows remote urls and conventional file paths. Headers are included for each line, so if you have an `age` column, `18` will be added as `age: 18`. Eg: + +```python +app.add('https://example.com/content/sheet.csv', data_type="csv") +app.add('content/sheet.csv', data_type="csv") +``` + +Note: There is a size limit for csv files, beyond which an error can be thrown. This limit is set by the LLMs. Please consider chunking large csv files into smaller csv files. \ No newline at end of file diff --git a/docs/data-sources/data-type-handling.mdx b/docs/data-sources/data-type-handling.mdx new file mode 100644 index 00000000..8861586d --- /dev/null +++ b/docs/data-sources/data-type-handling.mdx @@ -0,0 +1,54 @@ +--- +title: 'Data Type Handling' +--- + +## Automatic data type detection +The add method automatically tries to detect the data_type, based on your input for the source argument. So `app.add('https://www.youtube.com/watch?v=dQw4w9WgXcQ')` is enough to embed a YouTube video. + +This detection is implemented for all formats. It is based on factors such as whether it's a URL, a local file, the source data type, etc. + +### Debugging automatic detection + + +Set `log_level=DEBUG` (in [AppConfig](/advanced/query_configuration#appconfig)) and make sure it's working as intended. + +Otherwise, you will not know when, for instance, an invalid filepath is interpreted as raw text instead. + +### Forcing a data type + +To avoid any issues with the data type detection, you can **force** a data_type by passing it as an argument to the `add` method. +The pages under Supported Data Sources show the keyword to force the respective `data_type`. 
+ +Forcing can also be used for edge cases, such as interpreting a sitemap as a web_page, for reading its raw text instead of following links. + +## Remote Data Types + + +**Use local files in remote data types** + +Some data_types are meant for remote content and only work with URLs. +You can pass local files by formatting the path using the `file:` [URI scheme](https://en.wikipedia.org/wiki/File_URI_scheme), e.g. `file:///info.pdf`. + + +## Reusing a vector database + +Default behavior is to create a persistent vector DB in the directory **./db**. You can split your application into two Python scripts: one to create a local vector DB and the other to reuse this local persistent vector DB. This is useful when you want to index hundreds of documents and separately implement a chat interface. + +Create a local index: + +```python +from embedchain import App + +naval_chat_bot = App() +naval_chat_bot.add("https://www.youtube.com/watch?v=3qHkcs3kG44") +naval_chat_bot.add("https://navalmanack.s3.amazonaws.com/Eric-Jorgenson_The-Almanack-of-Naval-Ravikant_Final.pdf") +``` + +You can reuse the local index with the same code, but without adding new documents: + +```python +from embedchain import App + +naval_chat_bot = App() +print(naval_chat_bot.query("What unique capacity does Naval argue humans possess when it comes to understanding explanations or concepts?")) +``` diff --git a/docs/data-sources/docs-site.mdx b/docs/data-sources/docs-site.mdx new file mode 100644 index 00000000..d3e23141 --- /dev/null +++ b/docs/data-sources/docs-site.mdx @@ -0,0 +1,11 @@ +--- +title: 'Code Documentation' +--- + +### Code documentation + +To add any code documentation website as a loader, use the data_type as `docs_site`. 
Eg: + +```python +app.add("https://docs.embedchain.ai/", data_type="docs_site") +``` \ No newline at end of file diff --git a/docs/data-sources/docx.mdx b/docs/data-sources/docx.mdx new file mode 100644 index 00000000..c5eeb6b0 --- /dev/null +++ b/docs/data-sources/docx.mdx @@ -0,0 +1,12 @@ +--- +title: 'Docx File' +--- + +### Docx file + +To add any doc/docx file, use the data_type as `docx`. `docx` allows remote urls and conventional file paths. Eg: + +```python +app.add('https://example.com/content/intro.docx', data_type="docx") +app.add('content/intro.docx', data_type="docx") +``` \ No newline at end of file diff --git a/docs/advanced/adding_data.mdx b/docs/data-sources/how-to-add-data.mdx similarity index 96% rename from docs/advanced/adding_data.mdx rename to docs/data-sources/how-to-add-data.mdx index 98b30f2b..699c2d15 100644 --- a/docs/advanced/adding_data.mdx +++ b/docs/data-sources/how-to-add-data.mdx @@ -1,5 +1,5 @@ --- -title: '➕ Adding Data' +title: 'How to add data' --- ## Add Dataset diff --git a/docs/data-sources/mdx.mdx b/docs/data-sources/mdx.mdx new file mode 100644 index 00000000..02681c68 --- /dev/null +++ b/docs/data-sources/mdx.mdx @@ -0,0 +1,12 @@ +--- +title: 'Mdx' +--- + + +### Mdx file + +To add any mdx file to your app, use the data_type (second argument to the `.add()` method) as `mdx`. Note that this supports mdx files present on your machine, so the source should be a file path. Eg: + +```python +app.add('path/to/file.mdx', data_type='mdx') +``` \ No newline at end of file diff --git a/docs/data-sources/notion.mdx b/docs/data-sources/notion.mdx new file mode 100644 index 00000000..9530dbf8 --- /dev/null +++ b/docs/data-sources/notion.mdx @@ -0,0 +1,15 @@ +--- +title: 'Notion' +--- + +### Notion +To use notion you must install the extra dependencies with `pip install --upgrade embedchain[notion]`. + +To load a notion page, use the data_type as `notion`. Since it is hard to automatically detect, forcing this is advised. 
+The next argument must **end** with the `notion page id`. The id is a 32-character string. Eg: + +```python +app.add("cfbc134ca6464fc980d0391613959196", "notion") +app.add("my-page-cfbc134ca6464fc980d0391613959196", "notion") +app.add("https://www.notion.so/my-page-cfbc134ca6464fc980d0391613959196", "notion") +``` \ No newline at end of file diff --git a/docs/data-sources/pdf-file.mdx b/docs/data-sources/pdf-file.mdx new file mode 100644 index 00000000..028c3d56 --- /dev/null +++ b/docs/data-sources/pdf-file.mdx @@ -0,0 +1,14 @@ +--- +title: 'PDF File' +--- + + +### PDF File + +To add any pdf file, use the data_type as `pdf_file`. Eg: + +```python +app.add('a_valid_url_where_pdf_file_can_be_accessed', data_type='pdf_file') +``` + +Note that we do not support password protected pdfs. \ No newline at end of file diff --git a/docs/data-sources/qna.mdx b/docs/data-sources/qna.mdx new file mode 100644 index 00000000..87f118a5 --- /dev/null +++ b/docs/data-sources/qna.mdx @@ -0,0 +1,11 @@ +--- +title: 'QnA Pair' +--- + +### QnA pair + +QnA pair is a local data type. To supply your own QnA pair, use the data_type as `qna_pair` and enter a tuple. Eg: + +```python +app.add(("Question", "Answer"), data_type="qna_pair") +``` \ No newline at end of file diff --git a/docs/data-sources/request-new-format.mdx b/docs/data-sources/request-new-format.mdx new file mode 100644 index 00000000..a2812a32 --- /dev/null +++ b/docs/data-sources/request-new-format.mdx @@ -0,0 +1,6 @@ +--- +title: 'Request New Format' +--- + + +- If you want to add any other format, please create an [issue](https://github.com/embedchain/embedchain/issues) and we will add it to the list of supported formats. diff --git a/docs/data-sources/sitemap.mdx b/docs/data-sources/sitemap.mdx new file mode 100644 index 00000000..d7ce7338 --- /dev/null +++ b/docs/data-sources/sitemap.mdx @@ -0,0 +1,11 @@ +--- +title: 'Sitemap' +--- + +### Sitemap + +Add all web pages from an xml-sitemap. Filters non-text files. 
Use the data_type as `sitemap`. Eg: + +```python +app.add('https://example.com/sitemap.xml', data_type='sitemap') +``` \ No newline at end of file diff --git a/docs/data-sources/text.mdx b/docs/data-sources/text.mdx new file mode 100644 index 00000000..1ce81d2e --- /dev/null +++ b/docs/data-sources/text.mdx @@ -0,0 +1,13 @@ +--- +title: 'Text' +--- + +### Text + +Text is a local data type. To supply your own text, use the data_type as `text` and enter a string. The text is not processed, so this can be very versatile. Eg: + +```python +app.add('Seek wealth, not money or status. Wealth is having assets that earn while you sleep. Money is how we transfer time and wealth. Status is your place in the social hierarchy.', data_type='text') +``` + +Note: This is not used in the examples because in most cases you will supply a whole paragraph or file, which would not fit. \ No newline at end of file diff --git a/docs/data-sources/web-page.mdx b/docs/data-sources/web-page.mdx new file mode 100644 index 00000000..e788bcd9 --- /dev/null +++ b/docs/data-sources/web-page.mdx @@ -0,0 +1,11 @@ +--- +title: 'Web page' +--- + +### Web page + +To add any web page, use the data_type as `web_page`. Eg: + +```python +app.add('a_valid_web_page_url', data_type='web_page') +``` \ No newline at end of file diff --git a/docs/data-sources/youtube-video.mdx b/docs/data-sources/youtube-video.mdx new file mode 100644 index 00000000..82dea222 --- /dev/null +++ b/docs/data-sources/youtube-video.mdx @@ -0,0 +1,12 @@ +--- +title: 'Youtube Video' +--- + + +### Youtube video + +To add any youtube video to your app, use the data_type (second argument to the `.add()` method) as `youtube_video`. 
Eg: + +```python +app.add('a_valid_youtube_url_here', data_type='youtube_video') +``` diff --git a/docs/mint.json b/docs/mint.json index 79316dc6..4511fcc9 100644 --- a/docs/mint.json +++ b/docs/mint.json @@ -34,9 +34,34 @@ "group": "Get started", "pages": ["get-start/quickstart", "get-start/introduction", "get-start/faq"] }, + { "group": "Advanced", - "pages": ["advanced/app_types", "advanced/interface_types", "advanced/adding_data", "advanced/data_types", "advanced/query_configuration", "advanced/configuration", "advanced/testing", "advanced/vector_database", "advanced/showcase"] + "pages": ["advanced/app_types", "advanced/interface_types", "advanced/query_configuration", "advanced/configuration", "advanced/testing", "advanced/vector_database", "advanced/showcase"] + }, + { + "group": "Data Sources", + "pages": [ + "data-sources/how-to-add-data", + "data-sources/data-type-handling", + { + "group": "Supported Data Sources", + "pages": [ + "data-sources/csv", + "data-sources/docs-site", + "data-sources/docx", + "data-sources/mdx", + "data-sources/notion", + "data-sources/pdf-file", + "data-sources/qna", + "data-sources/sitemap", + "data-sources/text", + "data-sources/web-page", + "data-sources/youtube-video" + ] + }, + "data-sources/request-new-format" + ] }, { "group": "Examples", @@ -52,6 +77,7 @@ } ], + "footerSocials": { "twitter": "https://twitter.com/embedchain", "github": "https://github.com/embedchain/embedchain",