From 1d4e00ccef6969d8fb0dd9c3f17565c2712c9006 Mon Sep 17 00:00:00 2001 From: atkinsh <55955434+atkinsh@users.noreply.github.com> Date: Tue, 5 Dec 2023 22:04:20 -0500 Subject: [PATCH] local file path support for sitemap loader (#992) --- embedchain/loaders/sitemap.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/embedchain/loaders/sitemap.py b/embedchain/loaders/sitemap.py index b95f854c..1cb7e583 100644 --- a/embedchain/loaders/sitemap.py +++ b/embedchain/loaders/sitemap.py @@ -4,6 +4,7 @@ import logging import requests from tqdm import tqdm +from urllib.parse import urlparse try: from bs4 import BeautifulSoup @@ -29,10 +30,16 @@ class SitemapLoader(BaseLoader): def load_data(self, sitemap_url): output = [] web_page_loader = WebPageLoader() - response = requests.get(sitemap_url) - response.raise_for_status() - soup = BeautifulSoup(response.text, "xml") + if urlparse(sitemap_url).scheme not in ["file", "http", "https"]: + raise ValueError("Not a valid URL.") + + if urlparse(sitemap_url).scheme in ["http", "https"]: + response = requests.get(sitemap_url) + response.raise_for_status() + else: + with open(sitemap_url, "r") as file: + soup = BeautifulSoup(file, "xml") links = [link.text for link in soup.find_all("loc") if link.parent.name == "url"] if len(links) == 0: links = [link.text for link in soup.find_all("loc")]