feat: notion loader (#405)

2023-08-09 09:45:22 +02:00
parent eeac84e2d9
commit ce6eb39009
7 changed files with 81 additions and 2 deletions
--- a/embedchain/chunkers/notion.py
+++ b/embedchain/chunkers/notion.py
@@ -0,0 +1,20 @@
+from typing import Optional
+
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+
+from embedchain.chunkers.base_chunker import BaseChunker
+from embedchain.config.AddConfig import ChunkerConfig
+
+
+class NotionChunker(BaseChunker):
+    """Chunker for Notion text; with no config it uses chunk_size=300, chunk_overlap=0 and len."""
+
+    def __init__(self, config: Optional[ChunkerConfig] = None):
+        if config is None:
+            config = ChunkerConfig(chunk_size=300, chunk_overlap=0, length_function=len)
+        text_splitter = RecursiveCharacterTextSplitter(
+            chunk_size=config.chunk_size,
+            chunk_overlap=config.chunk_overlap,
+            length_function=config.length_function,
+        )
+        super().__init__(text_splitter)
--- a/embedchain/data_formatter/data_formatter.py
+++ b/embedchain/data_formatter/data_formatter.py
@@ -1,5 +1,6 @@
 from embedchain.chunkers.docs_site import DocsSiteChunker
 from embedchain.chunkers.docx_file import DocxFileChunker
+from embedchain.chunkers.notion import NotionChunker
 from embedchain.chunkers.pdf_file import PdfFileChunker
 from embedchain.chunkers.qna_pair import QnaPairChunker
 from embedchain.chunkers.text import TextChunker
@@ -10,6 +11,7 @@ from embedchain.loaders.docs_site_loader import DocsSiteLoader
 from embedchain.loaders.docx_file import DocxFileLoader
 from embedchain.loaders.local_qna_pair import LocalQnaPairLoader
 from embedchain.loaders.local_text import LocalTextLoader
+from embedchain.loaders.notion import NotionLoader
 from embedchain.loaders.pdf_file import PdfFileLoader
 from embedchain.loaders.sitemap import SitemapLoader
 from embedchain.loaders.web_page import WebPageLoader
@@ -44,6 +46,7 @@ class DataFormatter:
            "docx": DocxFileLoader(),
            "sitemap": SitemapLoader(),
            "docs_site": DocsSiteLoader(),
+            "notion": NotionLoader(),
        }
        if data_type in loaders:
            return loaders[data_type]
@@ -67,6 +70,7 @@ class DataFormatter:
            "docx": DocxFileChunker,
            "sitemap": WebPageChunker,
            "docs_site": DocsSiteChunker,
+            "notion": NotionChunker,
        }
        if data_type in chunker_classes:
            chunker_class = chunker_classes[data_type]
--- a/embedchain/loaders/notion.py
+++ b/embedchain/loaders/notion.py
@@ -0,0 +1,41 @@
+import logging
+import os
+
+try:
+    from llama_index import download_loader
+except ImportError:
+    raise ImportError("Notion requires extra dependencies. Install with `pip install embedchain[notion]`") from None
+
+
+from embedchain.loaders.base_loader import BaseLoader
+from embedchain.utils import clean_string
+
+
+class NotionLoader(BaseLoader):
+    def load_data(self, source):
+        """Load one Notion page and return its cleaned text as a single-item list of dicts."""
+
+        NotionPageReader = download_loader("NotionPageReader")
+
+        # Last 32 chars of source are taken as the page id, hyphenated 8-4-4-4-12. NOTE(review): `id` shadows the builtin.
+        id = source[-32:]
+        formatted_id = f"{id[:8]}-{id[8:12]}-{id[12:16]}-{id[16:20]}-{id[20:]}"
+        logging.debug(f"Extracted notion page id as: {formatted_id}")
+
+        # Fetch the page with the token from the NOTION_INTEGRATION_TOKEN env var (None when it is unset)
+        integration_token = os.getenv("NOTION_INTEGRATION_TOKEN")
+        reader = NotionPageReader(integration_token=integration_token)
+        documents = reader.load_data(page_ids=[formatted_id])
+
+        # Only the first returned document is used
+        raw_text = documents[0].text
+
+        # Pass the raw text through clean_string before returning it
+        text = clean_string(raw_text)
+
+        return [
+            {
+                "content": text,
+                "meta_data": {"url": f"notion-{formatted_id}"},
+            }
+        ]