From 122313d8a514351840f126797f04d7293fd61e80 Mon Sep 17 00:00:00 2001
From: Sidharth Mohanty
Date: Wed, 15 Nov 2023 11:22:15 +0530
Subject: [PATCH] [New] Substack loader (#949)

---
 docs/data-sources/substack.mdx              | 16 ++++
 docs/mint.json                              |  4 +-
 embedchain/chunkers/substack.py             | 22 ++++++
 embedchain/data_formatter/data_formatter.py |  2 +
 embedchain/loaders/substack.py              | 85 +++++++++++++++++++++
 embedchain/models/data_type.py              |  2 +
 6 files changed, 130 insertions(+), 1 deletion(-)
 create mode 100644 docs/data-sources/substack.mdx
 create mode 100644 embedchain/chunkers/substack.py
 create mode 100644 embedchain/loaders/substack.py

diff --git a/docs/data-sources/substack.mdx b/docs/data-sources/substack.mdx
new file mode 100644
index 00000000..2a5f9d8a
--- /dev/null
+++ b/docs/data-sources/substack.mdx
@@ -0,0 +1,16 @@
+---
+title: "📝 Substack"
+---
+
+To add any Substack publication to your app, pass its sitemap.xml URL as the source and set the `data_type` to `substack`.
+
+```python
+from embedchain import Pipeline as App
+
+app = App()
+
+# source: the sitemap.xml URL of the Substack publication
+app.add('https://www.lennysnewsletter.com/sitemap.xml', data_type='substack')
+app.query("Who is Brian Chesky?")
+# Answer: Brian Chesky is the co-founder and CEO of Airbnb.
+```
diff --git a/docs/mint.json b/docs/mint.json
index 8d53dc92..256a5e80 100644
--- a/docs/mint.json
+++ b/docs/mint.json
@@ -87,7 +87,9 @@
         "data-sources/text",
         "data-sources/web-page",
         "data-sources/openapi",
-        "data-sources/youtube-video"
+        "data-sources/youtube-video",
+        "data-sources/discourse",
+        "data-sources/substack"
       ]
     },
     "data-sources/data-type-handling"
diff --git a/embedchain/chunkers/substack.py b/embedchain/chunkers/substack.py
new file mode 100644
index 00000000..85f04fa6
--- /dev/null
+++ b/embedchain/chunkers/substack.py
@@ -0,0 +1,22 @@
+from typing import Optional
+
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+
+from embedchain.chunkers.base_chunker import BaseChunker
+from embedchain.config.add_config import ChunkerConfig
+from embedchain.helper.json_serializable import register_deserializable
+
+
+@register_deserializable
+class SubstackChunker(BaseChunker):
+    """Chunker for Substack."""
+
+    def __init__(self, config: Optional[ChunkerConfig] = None):
+        if config is None:
+            config = ChunkerConfig(chunk_size=1000, chunk_overlap=0, length_function=len)
+        text_splitter = RecursiveCharacterTextSplitter(
+            chunk_size=config.chunk_size,
+            chunk_overlap=config.chunk_overlap,
+            length_function=config.length_function,
+        )
+        super().__init__(text_splitter)
diff --git a/embedchain/data_formatter/data_formatter.py b/embedchain/data_formatter/data_formatter.py
index 49bbd650..2e702283 100644
--- a/embedchain/data_formatter/data_formatter.py
+++ b/embedchain/data_formatter/data_formatter.py
@@ -63,6 +63,7 @@ class DataFormatter(JSONSerializable):
             DataType.OPENAPI: "embedchain.loaders.openapi.OpenAPILoader",
             DataType.GMAIL: "embedchain.loaders.gmail.GmailLoader",
             DataType.NOTION: "embedchain.loaders.notion.NotionLoader",
+            DataType.SUBSTACK: "embedchain.loaders.substack.SubstackLoader",
         }

         custom_loaders = set(
@@ -112,6 +113,7 @@ class DataFormatter(JSONSerializable):
             DataType.MYSQL: "embedchain.chunkers.mysql.MySQLChunker",
             DataType.SLACK: "embedchain.chunkers.slack.SlackChunker",
             DataType.DISCOURSE: "embedchain.chunkers.discourse.DiscourseChunker",
+            DataType.SUBSTACK: "embedchain.chunkers.substack.SubstackChunker",
         }

         if data_type in chunker_classes:
diff --git a/embedchain/loaders/substack.py b/embedchain/loaders/substack.py
new file mode 100644
index 00000000..30e5878d
--- /dev/null
+++ b/embedchain/loaders/substack.py
@@ -0,0 +1,85 @@
+import time
+import hashlib
+import logging
+import requests
+
+from embedchain.helper.json_serializable import register_deserializable
+from embedchain.loaders.base_loader import BaseLoader
+from embedchain.utils import is_readable
+
+
+@register_deserializable
+class SubstackLoader(BaseLoader):
+    """
+    This loader takes a sitemap URL as input, retrieves all the
+    post URLs listed in it, and then loads and serializes the
+    content of each post page.
+    """
+
+    def load_data(self, url: str):
+        try:
+            from bs4 import BeautifulSoup
+            from bs4.builder import ParserRejectedMarkup
+        except ImportError:
+            raise ImportError(
+                'Substack requires extra dependencies. Install with `pip install --upgrade "embedchain[dataloaders]"`'
+            ) from None
+
+        output = []
+        response = requests.get(url)
+        response.raise_for_status()
+
+        soup = BeautifulSoup(response.text, "xml")
+        links = [link.text for link in soup.find_all("loc") if link.parent.name == "url" and "/p/" in link.text]
+        if len(links) == 0:
+            links = [link.text for link in soup.find_all("loc") if "/p/" in link.text]
+
+        doc_id = hashlib.sha256((" ".join(links) + url).encode()).hexdigest()
+
+        def serialize_response(soup: BeautifulSoup):
+            data = {}
+
+            h1_els = soup.find_all("h1")
+            if h1_els is not None and len(h1_els) > 1:
+                data["title"] = h1_els[1].text
+
+            description_el = soup.find("meta", {"name": "description"})
+            if description_el is not None:
+                data["description"] = description_el["content"]
+
+            content_el = soup.find("div", {"class": "available-content"})
+            if content_el is not None:
+                data["content"] = content_el.text
+
+            like_btn = soup.find("div", {"class": "like-button-container"})
+            if like_btn is not None:
+                no_of_likes_div = like_btn.find("div", {"class": "label"})
+                if no_of_likes_div is not None:
+                    data["no_of_likes"] = no_of_likes_div.text
+
+            return data
+
+        def load_link(link: str):
+            try:
+                each_load_data = requests.get(link)
+                each_load_data.raise_for_status()
+
+                soup = BeautifulSoup(each_load_data.text, "html.parser")
+                data = serialize_response(soup)
+                data = str(data)
+                if is_readable(data):
+                    return data
+                else:
+                    logging.warning(f"Page is not readable (too many invalid characters): {link}")
+            except ParserRejectedMarkup as e:
+                logging.error(f"Failed to parse {link}: {e}")
+            return None
+
+        for link in links:
+            data = load_link(link)
+            if data:
+                output.append({"content": data, "meta_data": {"url": link}})
+            # TODO: allow users to configure this
+            time.sleep(0.4)  # added to avoid rate limiting
+
+        return {"doc_id": doc_id, "data": output}
diff --git a/embedchain/models/data_type.py b/embedchain/models/data_type.py
index d973d766..a46034fc 100644
--- a/embedchain/models/data_type.py
+++ b/embedchain/models/data_type.py
@@ -33,6 +33,7 @@ class IndirectDataType(Enum):
     MYSQL = "mysql"
     SLACK = "slack"
     DISCOURSE = "discourse"
+    SUBSTACK = "substack"


 class SpecialDataType(Enum):
@@ -65,3 +66,4 @@ class DataType(Enum):
     MYSQL = IndirectDataType.MYSQL.value
     SLACK = IndirectDataType.SLACK.value
     DISCOURSE = IndirectDataType.DISCOURSE.value
+    SUBSTACK = IndirectDataType.SUBSTACK.value
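
For reference, a minimal usage sketch of the loader added in this patch, called directly rather than through `app.add()`. It assumes embedchain is installed with the `dataloaders` extra (for BeautifulSoup), and the sitemap URL is only an example:

```python
# Minimal sketch: exercising SubstackLoader directly to inspect its output shape.
from embedchain.loaders.substack import SubstackLoader

loader = SubstackLoader()
# Any Substack publication's sitemap.xml works here; this URL is illustrative.
result = loader.load_data("https://www.lennysnewsletter.com/sitemap.xml")

print(result["doc_id"])  # hash derived from the sitemap URL and the collected post URLs
for entry in result["data"]:
    # each entry is {"content": <serialized post data>, "meta_data": {"url": <post url>}}
    print(entry["meta_data"]["url"])
```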