feat: add UA header for pdf and sitemap (#1222)

This commit is contained in:
Taranjeet Singh
2024-01-26 23:59:09 -08:00
committed by GitHub
parent ec4fb11aa5
commit 8f28264aec
2 changed files with 8 additions and 2 deletions

View File

@@ -15,7 +15,10 @@ from embedchain.utils.misc import clean_string
 class PdfFileLoader(BaseLoader):
     def load_data(self, url):
         """Load data from a PDF file."""
-        loader = PyPDFLoader(url)
+        headers = {
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36", # noqa:E501
+        }
+        loader = PyPDFLoader(url, headers=headers)
         data = []
         all_content = []
         pages = loader.load_and_split()

View File

@@ -31,10 +31,13 @@ class SitemapLoader(BaseLoader):
     def load_data(self, sitemap_source):
         output = []
         web_page_loader = WebPageLoader()
+        headers = {
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36", # noqa:E501
+        }
         if urlparse(sitemap_source).scheme in ("http", "https"):
             try:
-                response = requests.get(sitemap_source)
+                response = requests.get(sitemap_source, headers=headers)
                 response.raise_for_status()
                 soup = BeautifulSoup(response.text, "xml")
             except requests.RequestException as e: