[Bugfix] fix sitemap loader (#1000)
Co-authored-by: Deven Patel <deven298@yahoo.com>
This commit is contained in:
@@ -1,9 +1,10 @@
|
||||
import hashlib
|
||||
import logging
|
||||
import time
|
||||
import requests
|
||||
from xml.etree import ElementTree
|
||||
|
||||
import requests
|
||||
|
||||
from embedchain.helpers.json_serializable import register_deserializable
|
||||
from embedchain.loaders.base_loader import BaseLoader
|
||||
from embedchain.utils import is_readable
|
||||
|
||||
@@ -28,7 +28,8 @@ class RSSFeedLoader(BaseLoader):
|
||||
@staticmethod
|
||||
def get_rss_content(url: str):
|
||||
try:
|
||||
from langchain.document_loaders import RSSFeedLoader as LangchainRSSFeedLoader
|
||||
from langchain.document_loaders import \
|
||||
RSSFeedLoader as LangchainRSSFeedLoader
|
||||
except ImportError:
|
||||
raise ImportError(
|
||||
"""RSSFeedLoader file requires extra dependencies.
|
||||
|
||||
@@ -37,9 +37,11 @@ class SitemapLoader(BaseLoader):
|
||||
if urlparse(sitemap_url).scheme in ["http", "https"]:
|
||||
response = requests.get(sitemap_url)
|
||||
response.raise_for_status()
|
||||
soup = BeautifulSoup(response.text, "xml")
|
||||
else:
|
||||
with open(sitemap_url, "r") as file:
|
||||
soup = BeautifulSoup(file, "xml")
|
||||
|
||||
links = [link.text for link in soup.find_all("loc") if link.parent.name == "url"]
|
||||
if len(links) == 0:
|
||||
links = [link.text for link in soup.find_all("loc")]
|
||||
|
||||
@@ -1,9 +1,10 @@
|
||||
import hashlib
|
||||
import logging
|
||||
import time
|
||||
from xml.etree import ElementTree
|
||||
|
||||
import requests
|
||||
from xml.etree import ElementTree
|
||||
|
||||
from embedchain.helpers.json_serializable import register_deserializable
|
||||
from embedchain.loaders.base_loader import BaseLoader
|
||||
from embedchain.utils import is_readable
|
||||
|
||||
Reference in New Issue
Block a user