From ff4a333be7ee9f41d3a0cee1e3f56c425209d15e Mon Sep 17 00:00:00 2001
From: Deven Patel
Date: Thu, 7 Dec 2023 16:32:41 -0800
Subject: [PATCH] [Bugfix] fix sitemap loader (#1000)

Co-authored-by: Deven Patel
---
 embedchain/loaders/beehiiv.py  | 3 ++-
 embedchain/loaders/rss_feed.py | 3 ++-
 embedchain/loaders/sitemap.py  | 2 ++
 embedchain/loaders/substack.py | 3 ++-
 embedchain/utils.py            | 3 ++-
 pyproject.toml                 | 2 +-
 6 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/embedchain/loaders/beehiiv.py b/embedchain/loaders/beehiiv.py
index 66ccfa24..f9cc920f 100644
--- a/embedchain/loaders/beehiiv.py
+++ b/embedchain/loaders/beehiiv.py
@@ -1,9 +1,10 @@
 import hashlib
 import logging
 import time
-import requests
 from xml.etree import ElementTree
 
+import requests
+
 from embedchain.helpers.json_serializable import register_deserializable
 from embedchain.loaders.base_loader import BaseLoader
 from embedchain.utils import is_readable
diff --git a/embedchain/loaders/rss_feed.py b/embedchain/loaders/rss_feed.py
index 033602ea..ba44e3c9 100644
--- a/embedchain/loaders/rss_feed.py
+++ b/embedchain/loaders/rss_feed.py
@@ -28,7 +28,8 @@ class RSSFeedLoader(BaseLoader):
     @staticmethod
     def get_rss_content(url: str):
         try:
-            from langchain.document_loaders import RSSFeedLoader as LangchainRSSFeedLoader
+            from langchain.document_loaders import \
+                RSSFeedLoader as LangchainRSSFeedLoader
         except ImportError:
             raise ImportError(
                 """RSSFeedLoader file requires extra dependencies.
diff --git a/embedchain/loaders/sitemap.py b/embedchain/loaders/sitemap.py
index 77fc90f2..bb6e33e8 100644
--- a/embedchain/loaders/sitemap.py
+++ b/embedchain/loaders/sitemap.py
@@ -37,9 +37,11 @@ class SitemapLoader(BaseLoader):
         if urlparse(sitemap_url).scheme in ["http", "https"]:
             response = requests.get(sitemap_url)
             response.raise_for_status()
+            soup = BeautifulSoup(response.text, "xml")
         else:
             with open(sitemap_url, "r") as file:
                 soup = BeautifulSoup(file, "xml")
+
         links = [link.text for link in soup.find_all("loc") if link.parent.name == "url"]
         if len(links) == 0:
             links = [link.text for link in soup.find_all("loc")]
diff --git a/embedchain/loaders/substack.py b/embedchain/loaders/substack.py
index a5fd0b6f..40278a6b 100644
--- a/embedchain/loaders/substack.py
+++ b/embedchain/loaders/substack.py
@@ -1,9 +1,10 @@
 import hashlib
 import logging
 import time
+from xml.etree import ElementTree
 
 import requests
-from xml.etree import ElementTree
+
 from embedchain.helpers.json_serializable import register_deserializable
 from embedchain.loaders.base_loader import BaseLoader
 from embedchain.utils import is_readable
diff --git a/embedchain/utils.py b/embedchain/utils.py
index 8daacda5..ba6efd21 100644
--- a/embedchain/utils.py
+++ b/embedchain/utils.py
@@ -196,7 +196,8 @@ def detect_datatype(source: Any) -> DataType:
     formatted_source = format_source(str(source), 30)
 
     if url:
-        from langchain.document_loaders.youtube import ALLOWED_NETLOCK as YOUTUBE_ALLOWED_NETLOCS
+        from langchain.document_loaders.youtube import \
+            ALLOWED_NETLOCK as YOUTUBE_ALLOWED_NETLOCS
 
         if url.netloc in YOUTUBE_ALLOWED_NETLOCS:
             logging.debug(f"Source of `{formatted_source}` detected as `youtube_video`.")
diff --git a/pyproject.toml b/pyproject.toml
index b2dfa5bb..f550b907 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "embedchain"
-version = "0.1.30"
+version = "0.1.31"
 description = "Data platform for LLMs - Load, index, retrieve and sync any unstructured data"
 authors = [
     "Taranjeet Singh ",