[Bugfix] fix sitemap loader (#1000)

Co-authored-by: Deven Patel <deven298@yahoo.com>
This commit is contained in:
Deven Patel
2023-12-07 16:32:41 -08:00
committed by GitHub
parent 111749a95d
commit ff4a333be7
6 changed files with 11 additions and 5 deletions

View File

@@ -1,9 +1,10 @@
import hashlib
import logging
import time
import requests
from xml.etree import ElementTree
import requests
from embedchain.helpers.json_serializable import register_deserializable
from embedchain.loaders.base_loader import BaseLoader
from embedchain.utils import is_readable

View File

@@ -28,7 +28,8 @@ class RSSFeedLoader(BaseLoader):
@staticmethod
def get_rss_content(url: str):
try:
from langchain.document_loaders import RSSFeedLoader as LangchainRSSFeedLoader
from langchain.document_loaders import \
RSSFeedLoader as LangchainRSSFeedLoader
except ImportError:
raise ImportError(
"""RSSFeedLoader file requires extra dependencies.

View File

@@ -37,9 +37,11 @@ class SitemapLoader(BaseLoader):
if urlparse(sitemap_url).scheme in ["http", "https"]:
response = requests.get(sitemap_url)
response.raise_for_status()
soup = BeautifulSoup(response.text, "xml")
else:
with open(sitemap_url, "r") as file:
soup = BeautifulSoup(file, "xml")
links = [link.text for link in soup.find_all("loc") if link.parent.name == "url"]
if len(links) == 0:
links = [link.text for link in soup.find_all("loc")]

View File

@@ -1,9 +1,10 @@
import hashlib
import logging
import time
from xml.etree import ElementTree
import requests
from xml.etree import ElementTree
from embedchain.helpers.json_serializable import register_deserializable
from embedchain.loaders.base_loader import BaseLoader
from embedchain.utils import is_readable

View File

@@ -196,7 +196,8 @@ def detect_datatype(source: Any) -> DataType:
formatted_source = format_source(str(source), 30)
if url:
from langchain.document_loaders.youtube import ALLOWED_NETLOCK as YOUTUBE_ALLOWED_NETLOCS
from langchain.document_loaders.youtube import \
ALLOWED_NETLOCK as YOUTUBE_ALLOWED_NETLOCS
if url.netloc in YOUTUBE_ALLOWED_NETLOCS:
logging.debug(f"Source of `{formatted_source}` detected as `youtube_video`.")