fix sitemap loader (#986)
Co-authored-by: Deven Patel <deven298@yahoo.com>
This commit is contained in:
@@ -16,7 +16,6 @@ except ImportError:
|
||||
from embedchain.helpers.json_serializable import register_deserializable
|
||||
from embedchain.loaders.base_loader import BaseLoader
|
||||
from embedchain.loaders.web_page import WebPageLoader
|
||||
from embedchain.utils import is_readable
|
||||
|
||||
|
||||
@register_deserializable
|
||||
@@ -40,19 +39,16 @@ class SitemapLoader(BaseLoader):
|
||||
|
||||
doc_id = hashlib.sha256((" ".join(links) + sitemap_url).encode()).hexdigest()
|
||||
|
||||
def load_link(link):
|
||||
def load_web_page(link):
|
||||
try:
|
||||
each_load_data = web_page_loader.load_data(link)
|
||||
if is_readable(each_load_data.get("data")[0].get("content")):
|
||||
return each_load_data.get("data")
|
||||
else:
|
||||
logging.warning(f"Page is not readable (too many invalid characters): {link}")
|
||||
loader_data = web_page_loader.load_data(link)
|
||||
return loader_data.get("data")
|
||||
except ParserRejectedMarkup as e:
|
||||
logging.error(f"Failed to parse {link}: {e}")
|
||||
return None
|
||||
|
||||
with concurrent.futures.ThreadPoolExecutor() as executor:
|
||||
future_to_link = {executor.submit(load_link, link): link for link in links}
|
||||
future_to_link = {executor.submit(load_web_page, link): link for link in links}
|
||||
for future in tqdm(concurrent.futures.as_completed(future_to_link), total=len(links), desc="Loading pages"):
|
||||
link = future_to_link[future]
|
||||
try:
|
||||
|
||||
Reference in New Issue
Block a user