feat: notion loader (#405)
@@ -54,6 +54,18 @@ To add any code documentation website as a loader, use the data_type as `docs_site`
 app.add("docs_site", "https://docs.embedchain.ai/")
 ```
 
+### Notion
+
+To use Notion, you must install the extra dependencies with `pip install embedchain[notion]`.
+
+To load a Notion page, use the data_type as `notion`.
+
+The next argument must **end** with the `notion page id`. The id is a 32-character string. Eg:
+
+```python
+app.add("notion", "cfbc134ca6464fc980d0391613959196")
+app.add("notion", "my-page-cfbc134ca6464fc980d0391613959196")
+app.add("notion", "https://www.notion.so/my-page-cfbc134ca6464fc980d0391613959196")
+```
+
 ### Text
 
 To supply your own text, use the data_type as `text` and enter a string. The text is not processed, this can be very versatile. Eg:
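Worth noting alongside the docs addition: the new loader reads its credentials from the `NOTION_INTEGRATION_TOKEN` environment variable (see `embedchain/loaders/notion.py` below). A minimal end-to-end sketch, assuming the llama-index extra is installed, a valid integration token, and the usual embedchain `App` setup (e.g. `OPENAI_API_KEY` exported); the token and query strings here are placeholders:

```python
import os

from embedchain import App

# The Notion loader reads the integration token from this environment variable.
os.environ["NOTION_INTEGRATION_TOKEN"] = "secret_placeholder_token"

app = App()  # standard embedchain app; assumes OPENAI_API_KEY is already set

# Any of the accepted forms works; only the trailing 32-character page id is used.
app.add("notion", "https://www.notion.so/my-page-cfbc134ca6464fc980d0391613959196")

print(app.query("What is this page about?"))
```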
@@ -46,6 +46,7 @@ Default values of chunker config parameters for different `data_type`:
 |pdf_file|1000|0|len|
 |youtube_video|2000|0|len|
 |docs_site|500|50|len|
+|notion|300|0|len|
 
 ### LoaderConfig
embedchain/chunkers/notion.py (new file, 20 lines)
@@ -0,0 +1,20 @@
+from typing import Optional
+
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+
+from embedchain.chunkers.base_chunker import BaseChunker
+from embedchain.config.AddConfig import ChunkerConfig
+
+
+class NotionChunker(BaseChunker):
+    """Chunker for notion."""
+
+    def __init__(self, config: Optional[ChunkerConfig] = None):
+        if config is None:
+            config = ChunkerConfig(chunk_size=300, chunk_overlap=0, length_function=len)
+        text_splitter = RecursiveCharacterTextSplitter(
+            chunk_size=config.chunk_size,
+            chunk_overlap=config.chunk_overlap,
+            length_function=config.length_function,
+        )
+        super().__init__(text_splitter)
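The chunker above hard-codes the same defaults that the new docs table row `|notion|300|0|len|` documents. A small sketch of constructing it with those defaults or with an explicit `ChunkerConfig` (the override values are illustrative, not taken from the commit):

```python
from embedchain.chunkers.notion import NotionChunker
from embedchain.config.AddConfig import ChunkerConfig

# Defaults baked into NotionChunker: chunk_size=300, chunk_overlap=0, length_function=len
default_chunker = NotionChunker()

# Explicit config, e.g. larger chunks with some overlap (illustrative values)
custom_chunker = NotionChunker(
    config=ChunkerConfig(chunk_size=500, chunk_overlap=50, length_function=len)
)
```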
@@ -1,5 +1,6 @@
 from embedchain.chunkers.docs_site import DocsSiteChunker
 from embedchain.chunkers.docx_file import DocxFileChunker
+from embedchain.chunkers.notion import NotionChunker
 from embedchain.chunkers.pdf_file import PdfFileChunker
 from embedchain.chunkers.qna_pair import QnaPairChunker
 from embedchain.chunkers.text import TextChunker
@@ -10,6 +11,7 @@ from embedchain.loaders.docs_site_loader import DocsSiteLoader
 from embedchain.loaders.docx_file import DocxFileLoader
 from embedchain.loaders.local_qna_pair import LocalQnaPairLoader
 from embedchain.loaders.local_text import LocalTextLoader
+from embedchain.loaders.notion import NotionLoader
 from embedchain.loaders.pdf_file import PdfFileLoader
 from embedchain.loaders.sitemap import SitemapLoader
 from embedchain.loaders.web_page import WebPageLoader
@@ -44,6 +46,7 @@ class DataFormatter:
             "docx": DocxFileLoader(),
             "sitemap": SitemapLoader(),
             "docs_site": DocsSiteLoader(),
+            "notion": NotionLoader(),
         }
         if data_type in loaders:
             return loaders[data_type]
@@ -67,6 +70,7 @@ class DataFormatter:
             "docx": DocxFileChunker,
             "sitemap": WebPageChunker,
             "docs_site": DocsSiteChunker,
+            "notion": NotionChunker,
         }
         if data_type in chunker_classes:
             chunker_class = chunker_classes[data_type]
embedchain/loaders/notion.py (new file, 41 lines)
@@ -0,0 +1,41 @@
+import logging
+import os
+
+try:
+    from llama_index import download_loader
+except ImportError:
+    raise ImportError("Notion requires extra dependencies. Install with `pip install embedchain[notion]`") from None
+
+
+from embedchain.loaders.base_loader import BaseLoader
+from embedchain.utils import clean_string
+
+
+class NotionLoader(BaseLoader):
+    def load_data(self, source):
+        """Load data from a Notion page."""
+
+        NotionPageReader = download_loader("NotionPageReader")
+
+        # Reformat id to match notion expectation
+        id = source[-32:]
+        formatted_id = f"{id[:8]}-{id[8:12]}-{id[12:16]}-{id[16:20]}-{id[20:]}"
+        logging.debug(f"Extracted notion page id as: {formatted_id}")
+
+        # Get page through the notion api
+        integration_token = os.getenv("NOTION_INTEGRATION_TOKEN")
+        reader = NotionPageReader(integration_token=integration_token)
+        documents = reader.load_data(page_ids=[formatted_id])
+
+        # Extract text
+        raw_text = documents[0].text
+
+        # Clean text
+        text = clean_string(raw_text)
+
+        return [
+            {
+                "content": text,
+                "meta_data": {"url": f"notion-{formatted_id}"},
+            }
+        ]
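The loader can also be exercised directly, without going through `App`. A usage sketch, assuming llama-index is installed and the (placeholder) token is valid; the printed URL follows the `notion-<uuid>` format built above:

```python
import os

from embedchain.loaders.notion import NotionLoader

os.environ["NOTION_INTEGRATION_TOKEN"] = "secret_placeholder_token"

loader = NotionLoader()
docs = loader.load_data("https://www.notion.so/my-page-cfbc134ca6464fc980d0391613959196")

# load_data returns a single-element list of dicts
print(docs[0]["meta_data"]["url"])  # notion-cfbc134c-a646-4fc9-80d0-391613959196
print(docs[0]["content"][:200])     # cleaned page text
```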
pyproject.toml
@@ -90,6 +90,7 @@ youtube-transcript-api = "^0.6.1"
 beautifulsoup4 = "^4.12.2"
 pypdf = "^3.11.0"
 pytube = "^15.0.0"
+llama-index = { version = "^0.7.21", optional = true }
@@ -105,7 +106,7 @@ isort = "^5.12.0"
 
 [tool.poetry.extras]
 streamlit = ["streamlit"]
+community = ["llama-index"]
 
 [tool.poetry.group.docs.dependencies]
setup.py (2 changes)
@@ -37,5 +37,5 @@ setuptools.setup(
         "replicate==0.9.0",
         "duckduckgo-search==3.8.4",
     ],
-    extras_require={"dev": ["black", "ruff", "isort", "pytest"]},
+    extras_require={"dev": ["black", "ruff", "isort", "pytest"], "community": ["llama-index==0.7.21"]},
 )