From 8f28264aecfec1f793fc3c56d75853d43797aa02 Mon Sep 17 00:00:00 2001
From: Taranjeet Singh
Date: Fri, 26 Jan 2024 23:59:09 -0800
Subject: [PATCH] feat: add UA header for pdf and sitemap (#1222)

---
 embedchain/loaders/pdf_file.py | 5 ++++-
 embedchain/loaders/sitemap.py  | 5 ++++-
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/embedchain/loaders/pdf_file.py b/embedchain/loaders/pdf_file.py
index fb97f324..77107764 100644
--- a/embedchain/loaders/pdf_file.py
+++ b/embedchain/loaders/pdf_file.py
@@ -15,7 +15,10 @@ from embedchain.utils.misc import clean_string
 class PdfFileLoader(BaseLoader):
     def load_data(self, url):
         """Load data from a PDF file."""
-        loader = PyPDFLoader(url)
+        headers = {
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36",  # noqa:E501
+        }
+        loader = PyPDFLoader(url, headers=headers)
         data = []
         all_content = []
         pages = loader.load_and_split()
diff --git a/embedchain/loaders/sitemap.py b/embedchain/loaders/sitemap.py
index fc6e7b4e..4e4da7e1 100644
--- a/embedchain/loaders/sitemap.py
+++ b/embedchain/loaders/sitemap.py
@@ -31,10 +31,13 @@ class SitemapLoader(BaseLoader):
     def load_data(self, sitemap_source):
         output = []
         web_page_loader = WebPageLoader()
+        headers = {
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36",  # noqa:E501
+        }
 
         if urlparse(sitemap_source).scheme in ("http", "https"):
             try:
-                response = requests.get(sitemap_source)
+                response = requests.get(sitemap_source, headers=headers)
                 response.raise_for_status()
                 soup = BeautifulSoup(response.text, "xml")
             except requests.RequestException as e: