diff --git a/docs/data-sources/overview.mdx b/docs/data-sources/overview.mdx index 8bff8bce..8488ce40 100644 --- a/docs/data-sources/overview.mdx +++ b/docs/data-sources/overview.mdx @@ -27,6 +27,7 @@ Embedchain comes with built-in support for various data sources. We handle the c + diff --git a/docs/data-sources/substack.mdx b/docs/data-sources/substack.mdx index 2a5f9d8a..41aba375 100644 --- a/docs/data-sources/substack.mdx +++ b/docs/data-sources/substack.mdx @@ -2,15 +2,15 @@ title: "📝 Substack" --- -To add any Substack data sources to your app, just add the sitemap.xml of that url as the source and set the data_type to `substack`. +To add any Substack data source to your app, just add the publication's root URL as the source and set the data_type to `substack`. ```python from embedchain import Pipeline as App app = App() -# source: for any substack just add the sitemap.xml url -app.add('https://www.lennysnewsletter.com/sitemap.xml', data_type='substack') +# source: for any substack just add the root URL +app.add('https://www.lennysnewsletter.com', data_type='substack') app.query("Who is Brian Chesky?") # Answer: Brian Chesky is the co-founder and CEO of Airbnb. ``` diff --git a/embedchain/loaders/substack.py b/embedchain/loaders/substack.py index 350ab3c2..a5fd0b6f 100644 --- a/embedchain/loaders/substack.py +++ b/embedchain/loaders/substack.py @@ -3,7 +3,7 @@ import logging import time import requests - +from xml.etree import ElementTree from embedchain.helpers.json_serializable import register_deserializable from embedchain.loaders.base_loader import BaseLoader from embedchain.utils import is_readable @@ -24,9 +24,29 @@ class SubstackLoader(BaseLoader): 'Substack requires extra dependencies. 
Install with `pip install --upgrade "embedchain[dataloaders]"`' ) from None + if not url.endswith("sitemap.xml"): + url = url + "/sitemap.xml" + output = [] response = requests.get(url) - response.raise_for_status() + + try: + response.raise_for_status() + except requests.exceptions.HTTPError as e: + raise ValueError( + f""" + Failed to load {url}: {e}. Please use the root substack URL. For example, https://example.substack.com + """ + ) + + try: + ElementTree.fromstring(response.content) + except ElementTree.ParseError: + raise ValueError( + f""" + Failed to parse {url}. Please use the root substack URL. For example, https://example.substack.com + """ + ) soup = BeautifulSoup(response.text, "xml") links = [link.text for link in soup.find_all("loc") if link.parent.name == "url" and "/p/" in link.text]