diff --git a/embedchain/loaders/pdf_file.py b/embedchain/loaders/pdf_file.py index fb97f324..77107764 100644 --- a/embedchain/loaders/pdf_file.py +++ b/embedchain/loaders/pdf_file.py @@ -15,7 +15,10 @@ from embedchain.utils.misc import clean_string class PdfFileLoader(BaseLoader): def load_data(self, url): """Load data from a PDF file.""" - loader = PyPDFLoader(url) + headers = { + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36", # noqa:E501 + } + loader = PyPDFLoader(url, headers=headers) data = [] all_content = [] pages = loader.load_and_split() diff --git a/embedchain/loaders/sitemap.py b/embedchain/loaders/sitemap.py index fc6e7b4e..4e4da7e1 100644 --- a/embedchain/loaders/sitemap.py +++ b/embedchain/loaders/sitemap.py @@ -31,10 +31,13 @@ class SitemapLoader(BaseLoader): def load_data(self, sitemap_source): output = [] web_page_loader = WebPageLoader() + headers = { + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36", # noqa:E501 + } if urlparse(sitemap_source).scheme in ("http", "https"): try: - response = requests.get(sitemap_source) + response = requests.get(sitemap_source, headers=headers) response.raise_for_status() soup = BeautifulSoup(response.text, "xml") except requests.RequestException as e: