diff --git a/docs/advanced/data_types.mdx b/docs/advanced/data_types.mdx
index b2ad8eb5..bc59a8a0 100644
--- a/docs/advanced/data_types.mdx
+++ b/docs/advanced/data_types.mdx
@@ -54,6 +54,18 @@ To add any code documentation website as a loader, use the data_type as `docs_si
 app.add("docs_site", "https://docs.embedchain.ai/")
 ```
 
+### Notion
+To use notion you must install the extra dependencies with `pip install embedchain[community]`. The loader reads your notion integration token from the `NOTION_INTEGRATION_TOKEN` environment variable.
+
+To load a notion page, use the data_type as `notion`.
+The next argument must **end** with the `notion page id`. The id is a 32-character string. Eg:
+
+```python
+app.add("notion", "cfbc134ca6464fc980d0391613959196")
+app.add("notion", "my-page-cfbc134ca6464fc980d0391613959196")
+app.add("notion", "https://www.notion.so/my-page-cfbc134ca6464fc980d0391613959196")
+```
+
 ### Text
 
 To supply your own text, use the data_type as `text` and enter a string. The text is not processed, this can be very versatile. Eg:
diff --git a/docs/advanced/query_configuration.mdx b/docs/advanced/query_configuration.mdx
index 76a9c264..fe389a06 100644
--- a/docs/advanced/query_configuration.mdx
+++ b/docs/advanced/query_configuration.mdx
@@ -46,6 +46,7 @@ Default values of chunker config parameters for different `data_type`:
 |pdf_file|1000|0|len|
 |youtube_video|2000|0|len|
 |docs_site|500|50|len|
+|notion|300|0|len|
 
 ### LoaderConfig
 
diff --git a/embedchain/chunkers/notion.py b/embedchain/chunkers/notion.py
new file mode 100644
index 00000000..3ea8012d
--- /dev/null
+++ b/embedchain/chunkers/notion.py
@@ -0,0 +1,20 @@
+from typing import Optional
+
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+
+from embedchain.chunkers.base_chunker import BaseChunker
+from embedchain.config.AddConfig import ChunkerConfig
+
+
+class NotionChunker(BaseChunker):
+    """Chunker for notion."""
+
+    def __init__(self, config: Optional[ChunkerConfig] = None):
+        if config is None:
+            config = ChunkerConfig(chunk_size=300, chunk_overlap=0, length_function=len)
+        text_splitter = RecursiveCharacterTextSplitter(
+            chunk_size=config.chunk_size,
+            chunk_overlap=config.chunk_overlap,
+            length_function=config.length_function,
+        )
+        super().__init__(text_splitter)
diff --git a/embedchain/data_formatter/data_formatter.py b/embedchain/data_formatter/data_formatter.py
index b25bb985..f478ca55 100644
--- a/embedchain/data_formatter/data_formatter.py
+++ b/embedchain/data_formatter/data_formatter.py
@@ -1,5 +1,6 @@
 from embedchain.chunkers.docs_site import DocsSiteChunker
 from embedchain.chunkers.docx_file import DocxFileChunker
+from embedchain.chunkers.notion import NotionChunker
 from embedchain.chunkers.pdf_file import PdfFileChunker
 from embedchain.chunkers.qna_pair import QnaPairChunker
 from embedchain.chunkers.text import TextChunker
@@ -10,6 +11,7 @@ from embedchain.loaders.docs_site_loader import DocsSiteLoader
 from embedchain.loaders.docx_file import DocxFileLoader
 from embedchain.loaders.local_qna_pair import LocalQnaPairLoader
 from embedchain.loaders.local_text import LocalTextLoader
+from embedchain.loaders.notion import NotionLoader
 from embedchain.loaders.pdf_file import PdfFileLoader
 from embedchain.loaders.sitemap import SitemapLoader
 from embedchain.loaders.web_page import WebPageLoader
@@ -44,6 +46,7 @@ class DataFormatter:
             "docx": DocxFileLoader(),
             "sitemap": SitemapLoader(),
             "docs_site": DocsSiteLoader(),
+            "notion": NotionLoader(),
         }
         if data_type in loaders:
             return loaders[data_type]
@@ -67,6 +70,7 @@ class DataFormatter:
             "docx": DocxFileChunker,
             "sitemap": WebPageChunker,
             "docs_site": DocsSiteChunker,
+            "notion": NotionChunker,
         }
         if data_type in chunker_classes:
             chunker_class = chunker_classes[data_type]
diff --git a/embedchain/loaders/notion.py b/embedchain/loaders/notion.py
new file mode 100644
index 00000000..31fb210d
--- /dev/null
+++ b/embedchain/loaders/notion.py
@@ -0,0 +1,53 @@
+import logging
+import os
+
+from embedchain.loaders.base_loader import BaseLoader
+from embedchain.utils import clean_string
+
+
+class NotionLoader(BaseLoader):
+    """Loader that fetches a Notion page through the Notion API."""
+
+    def load_data(self, source):
+        """Load the content of a single Notion page.
+
+        :param source: String ending with the 32-character Notion page id
+            (a bare id, a page slug or a full page URL).
+        :return: A one-element list holding the cleaned page text under
+            ``content`` and a ``notion-<page id>`` url under ``meta_data``.
+        :raises ImportError: If the optional ``llama-index`` dependency is
+            not installed.
+        """
+        # Import lazily: DataFormatter imports this module unconditionally,
+        # so a module-level import would make llama-index a hard dependency.
+        try:
+            from llama_index import download_loader
+        except ImportError:
+            raise ImportError(
+                "Notion requires extra dependencies. Install with `pip install embedchain[community]`"
+            ) from None
+
+        NotionPageReader = download_loader("NotionPageReader")
+
+        # Reformat the id into the dashed 8-4-4-4-12 form to match notion expectation
+        page_id = source[-32:]
+        formatted_id = f"{page_id[:8]}-{page_id[8:12]}-{page_id[12:16]}-{page_id[16:20]}-{page_id[20:]}"
+        logging.debug(f"Extracted notion page id as: {formatted_id}")
+
+        # Get page through the notion api; the token is read from the environment
+        integration_token = os.getenv("NOTION_INTEGRATION_TOKEN")
+        reader = NotionPageReader(integration_token=integration_token)
+        documents = reader.load_data(page_ids=[formatted_id])
+
+        # Extract text
+        raw_text = documents[0].text
+
+        # Clean text
+        text = clean_string(raw_text)
+
+        return [
+            {
+                "content": text,
+                "meta_data": {"url": f"notion-{formatted_id}"},
+            }
+        ]
diff --git a/pyproject.toml b/pyproject.toml
index 618b4357..85211804 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -90,6 +90,7 @@ youtube-transcript-api = "^0.6.1"
 beautifulsoup4 = "^4.12.2"
 pypdf = "^3.11.0"
 pytube = "^15.0.0"
+llama-index = { version = "^0.7.21", optional = true }
 
 
 
@@ -105,7 +106,7 @@ isort = "^5.12.0"
 
 [tool.poetry.extras]
 streamlit = ["streamlit"]
-
+community = ["llama-index"]
 
 
 [tool.poetry.group.docs.dependencies]
diff --git a/setup.py b/setup.py
index 0fc9a175..137888b6 100644
--- a/setup.py
+++ b/setup.py
@@ -37,5 +37,5 @@ setuptools.setup(
         "replicate==0.9.0",
         "duckduckgo-search==3.8.4",
     ],
-    extras_require={"dev": ["black", "ruff", "isort", "pytest"]},
+    extras_require={"dev": ["black", "ruff", "isort", "pytest"], "community": ["llama-index==0.7.21"]},
 )