Add local file path support to SitemapLoader (#954)

This commit is contained in:
Prikshit
2023-12-23 10:56:33 +05:30
committed by GitHub
parent 9fe80c5cca
commit 106a338371

View File

@@ -1,6 +1,7 @@
 import concurrent.futures
 import hashlib
 import logging
+import os
 from urllib.parse import urlparse

 import requests
@@ -22,31 +23,34 @@ from embedchain.loaders.web_page import WebPageLoader
 @register_deserializable
 class SitemapLoader(BaseLoader):
     """
-    This method takes a sitemap URL as input and retrieves
+    This method takes a sitemap URL or local file path as input and retrieves
     all the URLs to use the WebPageLoader to load content
     of each page.
     """

-    def load_data(self, sitemap_url):
+    def load_data(self, sitemap_source):
         output = []
         web_page_loader = WebPageLoader()
-        if urlparse(sitemap_url).scheme not in ["file", "http", "https"]:
-            raise ValueError("Not a valid URL.")
-
-        if urlparse(sitemap_url).scheme in ["http", "https"]:
-            response = requests.get(sitemap_url)
-            response.raise_for_status()
-            soup = BeautifulSoup(response.text, "xml")
-        else:
-            with open(sitemap_url, "r") as file:
+        if urlparse(sitemap_source).scheme in ("http", "https"):
+            try:
+                response = requests.get(sitemap_source)
+                response.raise_for_status()
+                soup = BeautifulSoup(response.text, "xml")
+            except requests.RequestException as e:
+                logging.error(f"Error fetching sitemap from URL: {e}")
+                return
+        elif os.path.isfile(sitemap_source):
+            with open(sitemap_source, "r") as file:
                 soup = BeautifulSoup(file, "xml")
+        else:
+            raise ValueError("Invalid sitemap source. Please provide a valid URL or local file path.")

         links = [link.text for link in soup.find_all("loc") if link.parent.name == "url"]
         if len(links) == 0:
             links = [link.text for link in soup.find_all("loc")]
-        doc_id = hashlib.sha256((" ".join(links) + sitemap_url).encode()).hexdigest()
+        doc_id = hashlib.sha256((" ".join(links) + sitemap_source).encode()).hexdigest()

         def load_web_page(link):
             try: