fix sitemap loader (#986)
Co-authored-by: Deven Patel <deven298@yahoo.com>
@@ -16,7 +16,6 @@ except ImportError:
 from embedchain.helpers.json_serializable import register_deserializable
 from embedchain.loaders.base_loader import BaseLoader
 from embedchain.loaders.web_page import WebPageLoader
-from embedchain.utils import is_readable
 
 
 @register_deserializable
@@ -40,19 +39,16 @@ class SitemapLoader(BaseLoader):
 
         doc_id = hashlib.sha256((" ".join(links) + sitemap_url).encode()).hexdigest()
 
-        def load_link(link):
+        def load_web_page(link):
             try:
-                each_load_data = web_page_loader.load_data(link)
-                if is_readable(each_load_data.get("data")[0].get("content")):
-                    return each_load_data.get("data")
-                else:
-                    logging.warning(f"Page is not readable (too many invalid characters): {link}")
+                loader_data = web_page_loader.load_data(link)
+                return loader_data.get("data")
             except ParserRejectedMarkup as e:
                 logging.error(f"Failed to parse {link}: {e}")
             return None
 
         with concurrent.futures.ThreadPoolExecutor() as executor:
-            future_to_link = {executor.submit(load_link, link): link for link in links}
+            future_to_link = {executor.submit(load_web_page, link): link for link in links}
             for future in tqdm(concurrent.futures.as_completed(future_to_link), total=len(links), desc="Loading pages"):
                 link = future_to_link[future]
                 try:
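
For readers following the change: after this patch the per-link helper simply returns whatever WebPageLoader.load_data() yields, dropping the is_readable() filter, while the links are still fanned out over a thread pool. Below is a minimal, self-contained sketch of that pattern. The fake_load_data() stub and the example URLs are stand-ins invented for this illustration; the real loader calls WebPageLoader.load_data() and catches bs4's ParserRejectedMarkup rather than a bare Exception.

    # Minimal, runnable sketch of the thread-pool fan-out used by the sitemap loader.
    import concurrent.futures
    import logging

    from tqdm import tqdm


    def fake_load_data(link):
        # Mimics the shape returned by WebPageLoader.load_data(): a dict whose
        # "data" key holds a list of documents with a "content" field.
        return {"data": [{"content": f"<scraped text of {link}>"}]}


    def load_web_page(link):
        try:
            loader_data = fake_load_data(link)
            return loader_data.get("data")
        except Exception as e:  # the real code catches ParserRejectedMarkup here
            logging.error(f"Failed to parse {link}: {e}")
        return None


    links = ["https://example.com/a", "https://example.com/b", "https://example.com/c"]

    with concurrent.futures.ThreadPoolExecutor() as executor:
        future_to_link = {executor.submit(load_web_page, link): link for link in links}
        for future in tqdm(concurrent.futures.as_completed(future_to_link), total=len(links), desc="Loading pages"):
            link = future_to_link[future]
            try:
                data = future.result()
                if data:
                    print(f"{link}: {len(data)} document(s)")
            except Exception as e:
                logging.error(f"Error loading {link}: {e}")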