diff --git a/docs/data-sources/overview.mdx b/docs/data-sources/overview.mdx
index 8bff8bce..8488ce40 100644
--- a/docs/data-sources/overview.mdx
+++ b/docs/data-sources/overview.mdx
@@ -27,6 +27,7 @@ Embedchain comes with built-in support for various data sources. We handle the c
+
diff --git a/docs/data-sources/substack.mdx b/docs/data-sources/substack.mdx
index 2a5f9d8a..41aba375 100644
--- a/docs/data-sources/substack.mdx
+++ b/docs/data-sources/substack.mdx
@@ -2,15 +2,15 @@
title: "📝 Substack"
---
-To add any Substack data sources to your app, just add the sitemap.xml of that url as the source and set the data_type to `substack`.
+To add a Substack data source to your app, pass the publication's root URL as the source and set the `data_type` to `substack`.
```python
from embedchain import Pipeline as App
app = App()
-# source: for any substack just add the sitemap.xml url
-app.add('https://www.lennysnewsletter.com/sitemap.xml', data_type='substack')
+# source: for any Substack, just pass the root URL
+app.add('https://www.lennysnewsletter.com', data_type='substack')
app.query("Who is Brian Chesky?")
# Answer: Brian Chesky is the co-founder and CEO of Airbnb.
```
diff --git a/embedchain/loaders/substack.py b/embedchain/loaders/substack.py
index 350ab3c2..a5fd0b6f 100644
--- a/embedchain/loaders/substack.py
+++ b/embedchain/loaders/substack.py
@@ -3,7 +3,7 @@ import logging
import time
import requests
-
+from xml.etree import ElementTree
from embedchain.helpers.json_serializable import register_deserializable
from embedchain.loaders.base_loader import BaseLoader
from embedchain.utils import is_readable
@@ -24,9 +24,29 @@ class SubstackLoader(BaseLoader):
'Substack requires extra dependencies. Install with `pip install --upgrade "embedchain[dataloaders]"`'
) from None
+ if not url.endswith("sitemap.xml"):
+ url = url + "/sitemap.xml"
+
output = []
response = requests.get(url)
- response.raise_for_status()
+
+ try:
+ response.raise_for_status()
+ except requests.exceptions.HTTPError as e:
+ raise ValueError(
+ f"""
+ Failed to load {url}: {e}. Please use the root substack URL. For example, https://example.substack.com
+ """
+ )
+
+ try:
+ ElementTree.fromstring(response.content)
+ except ElementTree.ParseError:
+ raise ValueError(
+ f"""
+ Failed to parse {url}. Please use the root substack URL. For example, https://example.substack.com
+ """
+ )
soup = BeautifulSoup(response.text, "xml")
links = [link.text for link in soup.find_all("loc") if link.parent.name == "url" and "/p/" in link.text]