Speed up docs site loader (#1266)
This commit is contained in:
@@ -24,6 +24,9 @@ class DocsSiteLoader(BaseLoader):
|
|||||||
self.visited_links = set()
|
self.visited_links = set()
|
||||||
|
|
||||||
def _get_child_links_recursive(self, url):
|
def _get_child_links_recursive(self, url):
|
||||||
|
if url in self.visited_links:
|
||||||
|
return
|
||||||
|
|
||||||
parsed_url = urlparse(url)
|
parsed_url = urlparse(url)
|
||||||
base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
|
base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
|
||||||
current_path = parsed_url.path
|
current_path = parsed_url.path
|
||||||
@@ -34,16 +37,15 @@ class DocsSiteLoader(BaseLoader):
|
|||||||
return
|
return
|
||||||
|
|
||||||
soup = BeautifulSoup(response.text, "html.parser")
|
soup = BeautifulSoup(response.text, "html.parser")
|
||||||
all_links = [link.get("href") for link in soup.find_all("a")]
|
all_links = (link.get("href") for link in soup.find_all("a", href=True))
|
||||||
|
|
||||||
child_links = [link for link in all_links if link and link.startswith(current_path) and link != current_path]
|
child_links = (link for link in all_links if link.startswith(current_path) and link != current_path)
|
||||||
|
|
||||||
absolute_paths = [urljoin(base_url, link) for link in child_links]
|
absolute_paths = set(urljoin(base_url, link) for link in child_links)
|
||||||
|
|
||||||
for link in absolute_paths:
|
self.visited_links.update(absolute_paths)
|
||||||
if link not in self.visited_links:
|
|
||||||
self.visited_links.add(link)
|
[self._get_child_links_recursive(link) for link in absolute_paths if link not in self.visited_links]
|
||||||
self._get_child_links_recursive(link)
|
|
||||||
|
|
||||||
def _get_all_urls(self, url):
|
def _get_all_urls(self, url):
|
||||||
self.visited_links = set()
|
self.visited_links = set()
|
||||||
|
|||||||
Reference in New Issue
Block a user