From addf1c0666b46f8d37e8742931912599701c5d2d Mon Sep 17 00:00:00 2001 From: cachho Date: Sat, 15 Jul 2023 21:21:25 +0200 Subject: [PATCH] feat: exclude by class, id in web_page data type and add logging (#273) --- embedchain/loaders/web_page.py | 66 ++++++++++++++++++++++++---------- 1 file changed, 47 insertions(+), 19 deletions(-) diff --git a/embedchain/loaders/web_page.py b/embedchain/loaders/web_page.py index 10de41c3..f6a0e66d 100644 --- a/embedchain/loaders/web_page.py +++ b/embedchain/loaders/web_page.py @@ -1,3 +1,5 @@ +import logging + import requests from bs4 import BeautifulSoup @@ -10,31 +12,57 @@ class WebPageLoader: response = requests.get(url) data = response.content soup = BeautifulSoup(data, "html.parser") - for tag in soup( - [ - "nav", - "aside", - "form", - "header", - "noscript", - "svg", - "canvas", - "footer", - "script", - "style", - ] - ): - tag.string = " " - output = [] + original_size = len(str(soup.get_text())) + + tags_to_exclude = [ + "nav", + "aside", + "form", + "header", + "noscript", + "svg", + "canvas", + "footer", + "script", + "style", + ] + for tag in soup(tags_to_exclude): + tag.decompose() + + ids_to_exclude = ["sidebar", "main-navigation", "menu-main-menu"] + for id in ids_to_exclude: + tags = soup.find_all(id=id) + for tag in tags: + tag.decompose() + + classes_to_exclude = [ + "elementor-location-header", + "navbar-header", + "nav", + "header-sidebar-wrapper", + "blog-sidebar-wrapper", + "related-posts", + ] + for class_name in classes_to_exclude: + tags = soup.find_all(class_=class_name) + for tag in tags: + tag.decompose() + content = soup.get_text() content = clean_string(content) + + cleaned_size = len(content) + logging.info( + f"[{url}] Cleaned page size: {cleaned_size} characters, down from {original_size} (shrunk: {original_size-cleaned_size} chars, {round((1-(cleaned_size/original_size)) * 100, 2)}%)" # noqa:E501 + ) + meta_data = { "url": url, } - output.append( + + return [ { "content": content, "meta_data": meta_data, } - ) - return output + ]