[Bugfix] fix sitemap loader (#1000)
Co-authored-by: Deven Patel <deven298@yahoo.com>
This commit is contained in:
@@ -1,9 +1,10 @@
|
|||||||
import hashlib
|
import hashlib
|
||||||
import logging
|
import logging
|
||||||
import time
|
import time
|
||||||
import requests
|
|
||||||
from xml.etree import ElementTree
|
from xml.etree import ElementTree
|
||||||
|
|
||||||
|
import requests
|
||||||
|
|
||||||
from embedchain.helpers.json_serializable import register_deserializable
|
from embedchain.helpers.json_serializable import register_deserializable
|
||||||
from embedchain.loaders.base_loader import BaseLoader
|
from embedchain.loaders.base_loader import BaseLoader
|
||||||
from embedchain.utils import is_readable
|
from embedchain.utils import is_readable
|
||||||
|
|||||||
@@ -28,7 +28,8 @@ class RSSFeedLoader(BaseLoader):
|
|||||||
@staticmethod
|
@staticmethod
|
||||||
def get_rss_content(url: str):
|
def get_rss_content(url: str):
|
||||||
try:
|
try:
|
||||||
from langchain.document_loaders import RSSFeedLoader as LangchainRSSFeedLoader
|
from langchain.document_loaders import \
|
||||||
|
RSSFeedLoader as LangchainRSSFeedLoader
|
||||||
except ImportError:
|
except ImportError:
|
||||||
raise ImportError(
|
raise ImportError(
|
||||||
"""RSSFeedLoader file requires extra dependencies.
|
"""RSSFeedLoader file requires extra dependencies.
|
||||||
|
|||||||
@@ -37,9 +37,11 @@ class SitemapLoader(BaseLoader):
|
|||||||
if urlparse(sitemap_url).scheme in ["http", "https"]:
|
if urlparse(sitemap_url).scheme in ["http", "https"]:
|
||||||
response = requests.get(sitemap_url)
|
response = requests.get(sitemap_url)
|
||||||
response.raise_for_status()
|
response.raise_for_status()
|
||||||
|
soup = BeautifulSoup(response.text, "xml")
|
||||||
else:
|
else:
|
||||||
with open(sitemap_url, "r") as file:
|
with open(sitemap_url, "r") as file:
|
||||||
soup = BeautifulSoup(file, "xml")
|
soup = BeautifulSoup(file, "xml")
|
||||||
|
|
||||||
links = [link.text for link in soup.find_all("loc") if link.parent.name == "url"]
|
links = [link.text for link in soup.find_all("loc") if link.parent.name == "url"]
|
||||||
if len(links) == 0:
|
if len(links) == 0:
|
||||||
links = [link.text for link in soup.find_all("loc")]
|
links = [link.text for link in soup.find_all("loc")]
|
||||||
|
|||||||
@@ -1,9 +1,10 @@
|
|||||||
import hashlib
|
import hashlib
|
||||||
import logging
|
import logging
|
||||||
import time
|
import time
|
||||||
|
from xml.etree import ElementTree
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
from xml.etree import ElementTree
|
|
||||||
from embedchain.helpers.json_serializable import register_deserializable
|
from embedchain.helpers.json_serializable import register_deserializable
|
||||||
from embedchain.loaders.base_loader import BaseLoader
|
from embedchain.loaders.base_loader import BaseLoader
|
||||||
from embedchain.utils import is_readable
|
from embedchain.utils import is_readable
|
||||||
|
|||||||
@@ -196,7 +196,8 @@ def detect_datatype(source: Any) -> DataType:
|
|||||||
formatted_source = format_source(str(source), 30)
|
formatted_source = format_source(str(source), 30)
|
||||||
|
|
||||||
if url:
|
if url:
|
||||||
from langchain.document_loaders.youtube import ALLOWED_NETLOCK as YOUTUBE_ALLOWED_NETLOCS
|
from langchain.document_loaders.youtube import \
|
||||||
|
ALLOWED_NETLOCK as YOUTUBE_ALLOWED_NETLOCS
|
||||||
|
|
||||||
if url.netloc in YOUTUBE_ALLOWED_NETLOCS:
|
if url.netloc in YOUTUBE_ALLOWED_NETLOCS:
|
||||||
logging.debug(f"Source of `{formatted_source}` detected as `youtube_video`.")
|
logging.debug(f"Source of `{formatted_source}` detected as `youtube_video`.")
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
[tool.poetry]
|
[tool.poetry]
|
||||||
name = "embedchain"
|
name = "embedchain"
|
||||||
version = "0.1.30"
|
version = "0.1.31"
|
||||||
description = "Data platform for LLMs - Load, index, retrieve and sync any unstructured data"
|
description = "Data platform for LLMs - Load, index, retrieve and sync any unstructured data"
|
||||||
authors = [
|
authors = [
|
||||||
"Taranjeet Singh <taranjeet@embedchain.ai>",
|
"Taranjeet Singh <taranjeet@embedchain.ai>",
|
||||||
|
|||||||
Reference in New Issue
Block a user