[Loaders] Improve web page and sitemap loader usability (#961)

This commit is contained in:
Deshraj Yadav
2023-11-16 16:01:43 -08:00
committed by GitHub
parent 28460f725c
commit e0b73e6a5a
3 changed files with 14 additions and 7 deletions

View File

@@ -3,6 +3,7 @@ import hashlib
import logging
import requests
from tqdm import tqdm
try:
from bs4 import BeautifulSoup
@@ -52,7 +53,7 @@ class SitemapLoader(BaseLoader):
with concurrent.futures.ThreadPoolExecutor() as executor:
future_to_link = {executor.submit(load_link, link): link for link in links}
for future in concurrent.futures.as_completed(future_to_link):
for future in tqdm(concurrent.futures.as_completed(future_to_link), total=len(links)):
link = future_to_link[future]
try:
data = future.result()

View File

@@ -17,15 +17,17 @@ from embedchain.utils import clean_string
@register_deserializable
class WebPageLoader(BaseLoader):
# Shared session for all instances
_session = requests.Session()
def load_data(self, url):
"""Load data from a web page."""
response = requests.get(url)
"""Load data from a web page using a shared requests session."""
response = self._session.get(url, timeout=30)
response.raise_for_status()
data = response.content
content = self._get_clean_content(data, url)
meta_data = {
"url": url,
}
meta_data = {"url": url}
doc_id = hashlib.sha256((content + url).encode()).hexdigest()
return {
@@ -86,3 +88,7 @@ class WebPageLoader(BaseLoader):
)
return content
@classmethod
def close_session(cls):
    """Close the class-level ``_session`` shared by all loader instances.

    Delegates to the session object's own ``close()`` method and returns
    nothing.
    """
    shared_session = cls._session
    shared_session.close()