feat: add User-Agent (UA) header for the PDF and sitemap loaders (#1222)
This commit is contained in:
@@ -15,7 +15,10 @@ from embedchain.utils.misc import clean_string
|
|||||||
class PdfFileLoader(BaseLoader):
|
class PdfFileLoader(BaseLoader):
|
||||||
def load_data(self, url):
|
def load_data(self, url):
|
||||||
"""Load data from a PDF file."""
|
"""Load data from a PDF file."""
|
||||||
loader = PyPDFLoader(url)
|
headers = {
|
||||||
|
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36", # noqa:E501
|
||||||
|
}
|
||||||
|
loader = PyPDFLoader(url, headers=headers)
|
||||||
data = []
|
data = []
|
||||||
all_content = []
|
all_content = []
|
||||||
pages = loader.load_and_split()
|
pages = loader.load_and_split()
|
||||||
|
|||||||
@@ -31,10 +31,13 @@ class SitemapLoader(BaseLoader):
|
|||||||
def load_data(self, sitemap_source):
|
def load_data(self, sitemap_source):
|
||||||
output = []
|
output = []
|
||||||
web_page_loader = WebPageLoader()
|
web_page_loader = WebPageLoader()
|
||||||
|
headers = {
|
||||||
|
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36", # noqa:E501
|
||||||
|
}
|
||||||
|
|
||||||
if urlparse(sitemap_source).scheme in ("http", "https"):
|
if urlparse(sitemap_source).scheme in ("http", "https"):
|
||||||
try:
|
try:
|
||||||
response = requests.get(sitemap_source)
|
response = requests.get(sitemap_source, headers=headers)
|
||||||
response.raise_for_status()
|
response.raise_for_status()
|
||||||
soup = BeautifulSoup(response.text, "xml")
|
soup = BeautifulSoup(response.text, "xml")
|
||||||
except requests.RequestException as e:
|
except requests.RequestException as e:
|
||||||
|
|||||||
Reference in New Issue
Block a user