Adding fetching data functionality for reference links in the web page (#1806)

2024-10-15 07:26:35 -04:00
parent 721d765921
commit 20c3aee636
9 changed files with 86 additions and 8 deletions
--- a/embedchain/embedchain/loaders/base_loader.py
+++ b/embedchain/embedchain/loaders/base_loader.py
@@ -1,3 +1,5 @@
+from typing import Any, Optional
+
 from embedchain.helpers.json_serializable import JSONSerializable


@@ -5,7 +7,7 @@ class BaseLoader(JSONSerializable):
    def __init__(self):
        pass

-    def load_data(self, url):
+    def load_data(self, url, **kwargs: Optional[dict[str, Any]]):
        """
        Implemented by child classes
        """
--- a/embedchain/embedchain/loaders/web_page.py
+++ b/embedchain/embedchain/loaders/web_page.py
@@ -1,5 +1,6 @@
 import hashlib
 import logging
+from typing import Any, Optional

 import requests

@@ -22,14 +23,29 @@ class WebPageLoader(BaseLoader):
    # Shared session for all instances
    _session = requests.Session()

-    def load_data(self, url):
+    def load_data(self, url, **kwargs: Optional[dict[str, Any]]):
        """Load data from a web page using a shared requests' session."""
+        all_references = False
+        for key, value in kwargs.items():
+            if key == "all_references":
+                all_references = kwargs["all_references"]
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36",  # noqa:E501
        }
        response = self._session.get(url, headers=headers, timeout=30)
        response.raise_for_status()
        data = response.content
+        reference_links = self.fetch_reference_links(response)
+        if all_references:
+            for i in reference_links:
+                try:
+                    response = self._session.get(i, headers=headers, timeout=30)
+                    response.raise_for_status()
+                    data += response.content
+                except Exception as e:
+                    logging.error(f"Failed to add URL {url}: {e}")
+                    continue
+
        content = self._get_clean_content(data, url)

        metadata = {"url": url}
@@ -98,3 +114,13 @@ class WebPageLoader(BaseLoader):
    @classmethod
    def close_session(cls):
        cls._session.close()
+
+    def fetch_reference_links(self, response):
+        if response.status_code == 200:
+            soup = BeautifulSoup(response.content, "html.parser")
+            a_tags = soup.find_all("a", href=True)
+            reference_links = [a["href"] for a in a_tags if a["href"].startswith("http")]
+            return reference_links
+        else:
+            print(f"Failed to retrieve the page. Status code: {response.status_code}")
+            return []