fix sitemap loader (#986)

Co-authored-by: Deven Patel <deven298@yahoo.com>
This commit is contained in:
Deven Patel
2023-12-05 17:56:24 -08:00
committed by GitHub
parent fa34788df6
commit c9fbc2e7d6

View File

@@ -16,7 +16,6 @@ except ImportError:
from embedchain.helpers.json_serializable import register_deserializable
from embedchain.loaders.base_loader import BaseLoader
from embedchain.loaders.web_page import WebPageLoader
from embedchain.utils import is_readable
@register_deserializable
@@ -40,19 +39,16 @@ class SitemapLoader(BaseLoader):
doc_id = hashlib.sha256((" ".join(links) + sitemap_url).encode()).hexdigest()
def load_link(link):
def load_web_page(link):
try:
each_load_data = web_page_loader.load_data(link)
if is_readable(each_load_data.get("data")[0].get("content")):
return each_load_data.get("data")
else:
logging.warning(f"Page is not readable (too many invalid characters): {link}")
loader_data = web_page_loader.load_data(link)
return loader_data.get("data")
except ParserRejectedMarkup as e:
logging.error(f"Failed to parse {link}: {e}")
return None
with concurrent.futures.ThreadPoolExecutor() as executor:
future_to_link = {executor.submit(load_link, link): link for link in links}
future_to_link = {executor.submit(load_web_page, link): link for link in links}
for future in tqdm(concurrent.futures.as_completed(future_to_link), total=len(links), desc="Loading pages"):
link = future_to_link[future]
try: