[Loaders] Improve web page and sitemap loader usability (#961)
This commit is contained in:
@@ -3,6 +3,7 @@ import hashlib
|
|||||||
import logging
|
import logging
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
|
from tqdm import tqdm
|
||||||
|
|
||||||
try:
|
try:
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
@@ -52,7 +53,7 @@ class SitemapLoader(BaseLoader):
|
|||||||
|
|
||||||
with concurrent.futures.ThreadPoolExecutor() as executor:
|
with concurrent.futures.ThreadPoolExecutor() as executor:
|
||||||
future_to_link = {executor.submit(load_link, link): link for link in links}
|
future_to_link = {executor.submit(load_link, link): link for link in links}
|
||||||
for future in concurrent.futures.as_completed(future_to_link):
|
for future in tqdm(concurrent.futures.as_completed(future_to_link), total=len(links)):
|
||||||
link = future_to_link[future]
|
link = future_to_link[future]
|
||||||
try:
|
try:
|
||||||
data = future.result()
|
data = future.result()
|
||||||
|
|||||||
@@ -17,15 +17,17 @@ from embedchain.utils import clean_string
|
|||||||
|
|
||||||
@register_deserializable
|
@register_deserializable
|
||||||
class WebPageLoader(BaseLoader):
|
class WebPageLoader(BaseLoader):
|
||||||
|
# Shared session for all instances
|
||||||
|
_session = requests.Session()
|
||||||
|
|
||||||
def load_data(self, url):
|
def load_data(self, url):
|
||||||
"""Load data from a web page."""
|
"""Load data from a web page using a shared requests session."""
|
||||||
response = requests.get(url)
|
response = self._session.get(url, timeout=30)
|
||||||
|
response.raise_for_status()
|
||||||
data = response.content
|
data = response.content
|
||||||
content = self._get_clean_content(data, url)
|
content = self._get_clean_content(data, url)
|
||||||
|
|
||||||
meta_data = {
|
meta_data = {"url": url}
|
||||||
"url": url,
|
|
||||||
}
|
|
||||||
|
|
||||||
doc_id = hashlib.sha256((content + url).encode()).hexdigest()
|
doc_id = hashlib.sha256((content + url).encode()).hexdigest()
|
||||||
return {
|
return {
|
||||||
@@ -86,3 +88,7 @@ class WebPageLoader(BaseLoader):
|
|||||||
)
|
)
|
||||||
|
|
||||||
return content
|
return content
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def close_session(cls):
|
||||||
|
cls._session.close()
|
||||||
|
|||||||
@@ -27,7 +27,7 @@ def test_load_data(web_page_loader):
|
|||||||
</body>
|
</body>
|
||||||
</html>
|
</html>
|
||||||
"""
|
"""
|
||||||
with patch("embedchain.loaders.web_page.requests.get", return_value=mock_response):
|
with patch("embedchain.loaders.web_page.WebPageLoader._session.get", return_value=mock_response):
|
||||||
result = web_page_loader.load_data(page_url)
|
result = web_page_loader.load_data(page_url)
|
||||||
|
|
||||||
content = web_page_loader._get_clean_content(mock_response.content, page_url)
|
content = web_page_loader._get_clean_content(mock_response.content, page_url)
|
||||||
|
|||||||
Reference in New Issue
Block a user