From c9fbc2e7d6fb38ece5169724bcc6574d70c6b878 Mon Sep 17 00:00:00 2001 From: Deven Patel Date: Tue, 5 Dec 2023 17:56:24 -0800 Subject: [PATCH] fix sitemap loader (#986) Co-authored-by: Deven Patel --- embedchain/loaders/sitemap.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/embedchain/loaders/sitemap.py b/embedchain/loaders/sitemap.py index 06e7e239..b95f854c 100644 --- a/embedchain/loaders/sitemap.py +++ b/embedchain/loaders/sitemap.py @@ -16,7 +16,6 @@ except ImportError: from embedchain.helpers.json_serializable import register_deserializable from embedchain.loaders.base_loader import BaseLoader from embedchain.loaders.web_page import WebPageLoader -from embedchain.utils import is_readable @register_deserializable @@ -40,19 +39,16 @@ class SitemapLoader(BaseLoader): doc_id = hashlib.sha256((" ".join(links) + sitemap_url).encode()).hexdigest() - def load_link(link): + def load_web_page(link): try: - each_load_data = web_page_loader.load_data(link) - if is_readable(each_load_data.get("data")[0].get("content")): - return each_load_data.get("data") - else: - logging.warning(f"Page is not readable (too many invalid characters): {link}") + loader_data = web_page_loader.load_data(link) + return loader_data.get("data") except ParserRejectedMarkup as e: logging.error(f"Failed to parse {link}: {e}") return None with concurrent.futures.ThreadPoolExecutor() as executor: - future_to_link = {executor.submit(load_link, link): link for link in links} + future_to_link = {executor.submit(load_web_page, link): link for link in links} for future in tqdm(concurrent.futures.as_completed(future_to_link), total=len(links), desc="Loading pages"): link = future_to_link[future] try: