feat: notion loader (#405)
This commit is contained in:
20
embedchain/chunkers/notion.py
Normal file
20
embedchain/chunkers/notion.py
Normal file
@@ -0,0 +1,20 @@
|
||||
from typing import Optional
|
||||
|
||||
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
||||
|
||||
from embedchain.chunkers.base_chunker import BaseChunker
|
||||
from embedchain.config.AddConfig import ChunkerConfig
|
||||
|
||||
|
||||
class NotionChunker(BaseChunker):
    """Chunker used for content loaded from Notion pages."""

    def __init__(self, config: Optional[ChunkerConfig] = None):
        """Build the recursive character splitter and pass it to the base chunker.

        Args:
            config: Chunking settings. When omitted, a ChunkerConfig with
                chunk_size=300, chunk_overlap=0 and ``len`` as the length
                function is used.
        """
        # Fall back to the Notion defaults only when no config was supplied.
        settings = (
            config
            if config is not None
            else ChunkerConfig(chunk_size=300, chunk_overlap=0, length_function=len)
        )
        splitter_options = {
            "chunk_size": settings.chunk_size,
            "chunk_overlap": settings.chunk_overlap,
            "length_function": settings.length_function,
        }
        super().__init__(RecursiveCharacterTextSplitter(**splitter_options))
|
||||
@@ -1,5 +1,6 @@
|
||||
from embedchain.chunkers.docs_site import DocsSiteChunker
|
||||
from embedchain.chunkers.docx_file import DocxFileChunker
|
||||
from embedchain.chunkers.notion import NotionChunker
|
||||
from embedchain.chunkers.pdf_file import PdfFileChunker
|
||||
from embedchain.chunkers.qna_pair import QnaPairChunker
|
||||
from embedchain.chunkers.text import TextChunker
|
||||
@@ -10,6 +11,7 @@ from embedchain.loaders.docs_site_loader import DocsSiteLoader
|
||||
from embedchain.loaders.docx_file import DocxFileLoader
|
||||
from embedchain.loaders.local_qna_pair import LocalQnaPairLoader
|
||||
from embedchain.loaders.local_text import LocalTextLoader
|
||||
from embedchain.loaders.notion import NotionLoader
|
||||
from embedchain.loaders.pdf_file import PdfFileLoader
|
||||
from embedchain.loaders.sitemap import SitemapLoader
|
||||
from embedchain.loaders.web_page import WebPageLoader
|
||||
@@ -44,6 +46,7 @@ class DataFormatter:
|
||||
"docx": DocxFileLoader(),
|
||||
"sitemap": SitemapLoader(),
|
||||
"docs_site": DocsSiteLoader(),
|
||||
"notion": NotionLoader(),
|
||||
}
|
||||
if data_type in loaders:
|
||||
return loaders[data_type]
|
||||
@@ -67,6 +70,7 @@ class DataFormatter:
|
||||
"docx": DocxFileChunker,
|
||||
"sitemap": WebPageChunker,
|
||||
"docs_site": DocsSiteChunker,
|
||||
"notion": NotionChunker,
|
||||
}
|
||||
if data_type in chunker_classes:
|
||||
chunker_class = chunker_classes[data_type]
|
||||
|
||||
41
embedchain/loaders/notion.py
Normal file
41
embedchain/loaders/notion.py
Normal file
@@ -0,0 +1,41 @@
|
||||
import logging
|
||||
import os
|
||||
|
||||
try:
|
||||
from llama_index import download_loader
|
||||
except ImportError:
|
||||
raise ImportError("Notion requires extra dependencies. Install with `pip install embedchain[notion]`") from None
|
||||
|
||||
|
||||
from embedchain.loaders.base_loader import BaseLoader
|
||||
from embedchain.utils import clean_string
|
||||
|
||||
|
||||
class NotionLoader(BaseLoader):
    """Loader that fetches the text of a single Notion page via the Notion API."""

    def load_data(self, source):
        """Load data from a Notion page.

        Args:
            source: String whose last 32 characters are taken as the page id,
                e.g. a Notion page URL.
                NOTE(review): assumes the id is the trailing, undashed
                32-character form; a trailing query string or slash would
                yield a wrong id -- confirm against callers.

        Returns:
            A one-element list containing a dict with the cleaned page text
            under "content" and, under "meta_data", a "url" entry of the form
            ``notion-<formatted id>``.
        """
        NotionPageReader = download_loader("NotionPageReader")

        # Reformat the id into the dashed 8-4-4-4-12 layout passed to the reader.
        page_id = source[-32:]
        formatted_id = f"{page_id[:8]}-{page_id[8:12]}-{page_id[12:16]}-{page_id[16:20]}-{page_id[20:]}"
        logging.debug("Extracted notion page id as: %s", formatted_id)

        # Get the page through the Notion API; the token is read from the
        # NOTION_INTEGRATION_TOKEN environment variable.
        integration_token = os.getenv("NOTION_INTEGRATION_TOKEN")
        reader = NotionPageReader(integration_token=integration_token)
        documents = reader.load_data(page_ids=[formatted_id])

        # Only one page id was requested, so only the first document is used.
        raw_text = documents[0].text

        # Clean text
        text = clean_string(raw_text)

        return [
            {
                "content": text,
                "meta_data": {"url": f"notion-{formatted_id}"},
            }
        ]
|
||||
Reference in New Issue
Block a user