[Loaders] Improve web page and sitemap loader usability (#961)
This commit is contained in:
@@ -3,6 +3,7 @@ import hashlib
|
||||
import logging
|
||||
|
||||
import requests
|
||||
from tqdm import tqdm
|
||||
|
||||
try:
|
||||
from bs4 import BeautifulSoup
|
||||
@@ -52,7 +53,7 @@ class SitemapLoader(BaseLoader):
|
||||
|
||||
with concurrent.futures.ThreadPoolExecutor() as executor:
|
||||
future_to_link = {executor.submit(load_link, link): link for link in links}
|
||||
for future in concurrent.futures.as_completed(future_to_link):
|
||||
for future in tqdm(concurrent.futures.as_completed(future_to_link), total=len(links)):
|
||||
link = future_to_link[future]
|
||||
try:
|
||||
data = future.result()
|
||||
|
||||
@@ -17,15 +17,17 @@ from embedchain.utils import clean_string
|
||||
|
||||
@register_deserializable
|
||||
class WebPageLoader(BaseLoader):
|
||||
# Shared session for all instances
|
||||
_session = requests.Session()
|
||||
|
||||
def load_data(self, url):
|
||||
"""Load data from a web page."""
|
||||
response = requests.get(url)
|
||||
"""Load data from a web page using a shared requests session."""
|
||||
response = self._session.get(url, timeout=30)
|
||||
response.raise_for_status()
|
||||
data = response.content
|
||||
content = self._get_clean_content(data, url)
|
||||
|
||||
meta_data = {
|
||||
"url": url,
|
||||
}
|
||||
meta_data = {"url": url}
|
||||
|
||||
doc_id = hashlib.sha256((content + url).encode()).hexdigest()
|
||||
return {
|
||||
@@ -86,3 +88,7 @@ class WebPageLoader(BaseLoader):
|
||||
)
|
||||
|
||||
return content
|
||||
|
||||
@classmethod
|
||||
def close_session(cls):
|
||||
cls._session.close()
|
||||
|
||||
Reference in New Issue
Block a user