[Loaders] Improve web page and sitemap loader usability (#961)

This commit is contained in:
Deshraj Yadav
2023-11-16 16:01:43 -08:00
committed by GitHub
parent 28460f725c
commit e0b73e6a5a
3 changed files with 14 additions and 7 deletions

View File

@@ -3,6 +3,7 @@ import hashlib
import logging import logging
import requests import requests
from tqdm import tqdm
try: try:
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
@@ -52,7 +53,7 @@ class SitemapLoader(BaseLoader):
with concurrent.futures.ThreadPoolExecutor() as executor: with concurrent.futures.ThreadPoolExecutor() as executor:
future_to_link = {executor.submit(load_link, link): link for link in links} future_to_link = {executor.submit(load_link, link): link for link in links}
for future in concurrent.futures.as_completed(future_to_link): for future in tqdm(concurrent.futures.as_completed(future_to_link), total=len(links)):
link = future_to_link[future] link = future_to_link[future]
try: try:
data = future.result() data = future.result()

View File

@@ -17,15 +17,17 @@ from embedchain.utils import clean_string
@register_deserializable @register_deserializable
class WebPageLoader(BaseLoader): class WebPageLoader(BaseLoader):
# Shared session for all instances
_session = requests.Session()
def load_data(self, url): def load_data(self, url):
"""Load data from a web page.""" """Load data from a web page using a shared requests session."""
response = requests.get(url) response = self._session.get(url, timeout=30)
response.raise_for_status()
data = response.content data = response.content
content = self._get_clean_content(data, url) content = self._get_clean_content(data, url)
meta_data = { meta_data = {"url": url}
"url": url,
}
doc_id = hashlib.sha256((content + url).encode()).hexdigest() doc_id = hashlib.sha256((content + url).encode()).hexdigest()
return { return {
@@ -86,3 +88,7 @@ class WebPageLoader(BaseLoader):
) )
return content return content
@classmethod
def close_session(cls):
cls._session.close()

View File

@@ -27,7 +27,7 @@ def test_load_data(web_page_loader):
</body> </body>
</html> </html>
""" """
with patch("embedchain.loaders.web_page.requests.get", return_value=mock_response): with patch("embedchain.loaders.web_page.WebPageLoader._session.get", return_value=mock_response):
result = web_page_loader.load_data(page_url) result = web_page_loader.load_data(page_url)
content = web_page_loader._get_clean_content(mock_response.content, page_url) content = web_page_loader._get_clean_content(mock_response.content, page_url)