Feature: Add support for loading docs website (#293)

This commit is contained in:
Deshraj Yadav
2023-07-16 22:22:52 -07:00
committed by GitHub
parent d5e40e1853
commit a548863a09
10 changed files with 173 additions and 86 deletions

View File

@@ -12,8 +12,8 @@ TEXT_SPLITTER_CHUNK_PARAMS = {
}
class CodeDocsPageChunker(BaseChunker):
"""Chunker for code docs page."""
class DocsSiteChunker(BaseChunker):
"""Chunker for code docs site."""
def __init__(self, config: Optional[ChunkerConfig] = None):
if config is None:

View File

@@ -28,7 +28,7 @@ DEFAULT_PROMPT_WITH_HISTORY = """
Helpful Answer:
""" # noqa:E501
CODE_DOCS_PAGE_DEFAULT_PROMPT = """
DOCS_SITE_DEFAULT_PROMPT = """
Use the following pieces of context to answer the query at the end.
If you don't know the answer, just say that you don't know, don't try to make up an answer. Wherever possible, give complete code snippet. Dont make up any code snippet on your own.
@@ -41,7 +41,7 @@ CODE_DOCS_PAGE_DEFAULT_PROMPT = """
DEFAULT_PROMPT_TEMPLATE = Template(DEFAULT_PROMPT)
DEFAULT_PROMPT_WITH_HISTORY_TEMPLATE = Template(DEFAULT_PROMPT_WITH_HISTORY)
CODE_DOCS_PAGE_PROMPT_TEMPLATE = Template(CODE_DOCS_PAGE_DEFAULT_PROMPT)
DOCS_SITE_PROMPT_TEMPLATE = Template(DOCS_SITE_DEFAULT_PROMPT)
query_re = re.compile(r"\$\{*query\}*")
context_re = re.compile(r"\$\{*context\}*")
history_re = re.compile(r"\$\{*history\}*")

View File

@@ -1,4 +1,4 @@
from embedchain.chunkers.code_docs_page import CodeDocsPageChunker
from embedchain.chunkers.docs_site import DocsSiteChunker
from embedchain.chunkers.docx_file import DocxFileChunker
from embedchain.chunkers.pdf_file import PdfFileChunker
from embedchain.chunkers.qna_pair import QnaPairChunker
@@ -6,7 +6,7 @@ from embedchain.chunkers.text import TextChunker
from embedchain.chunkers.web_page import WebPageChunker
from embedchain.chunkers.youtube_video import YoutubeVideoChunker
from embedchain.config import AddConfig
from embedchain.loaders.code_docs_page import CodeDocsPageLoader
from embedchain.loaders.docs_site_loader import DocsSiteLoader
from embedchain.loaders.docx_file import DocxFileLoader
from embedchain.loaders.local_qna_pair import LocalQnaPairLoader
from embedchain.loaders.local_text import LocalTextLoader
@@ -43,7 +43,7 @@ class DataFormatter:
"text": LocalTextLoader(),
"docx": DocxFileLoader(),
"sitemap": SitemapLoader(),
"code_docs_page": CodeDocsPageLoader(),
"docs_site": DocsSiteLoader(),
}
if data_type in loaders:
return loaders[data_type]
@@ -66,7 +66,7 @@ class DataFormatter:
"text": TextChunker(config),
"docx": DocxFileChunker(config),
"sitemap": WebPageChunker(config),
"code_docs_page": CodeDocsPageChunker(config),
"docs_site": DocsSiteChunker(config),
}
if data_type in chunkers:
return chunkers[data_type]

View File

@@ -9,7 +9,7 @@ from langchain.docstore.document import Document
from langchain.memory import ConversationBufferMemory
from embedchain.config import AddConfig, ChatConfig, InitConfig, QueryConfig
from embedchain.config.QueryConfig import CODE_DOCS_PAGE_PROMPT_TEMPLATE, DEFAULT_PROMPT, DEFAULT_PROMPT_WITH_HISTORY
from embedchain.config.QueryConfig import DOCS_SITE_PROMPT_TEMPLATE, DEFAULT_PROMPT, DEFAULT_PROMPT_WITH_HISTORY
from embedchain.data_formatter import DataFormatter
gpt4all_model = None
@@ -35,7 +35,7 @@ class EmbedChain:
self.db_client = self.config.db.client
self.collection = self.config.db.collection
self.user_asks = []
self.is_code_docs_instance = False
self.is_docs_site_instance = False
self.online = False
def add(self, data_type, url, metadata=None, config: AddConfig = None):
@@ -56,8 +56,8 @@ class EmbedChain:
data_formatter = DataFormatter(data_type, config)
self.user_asks.append([data_type, url, metadata])
self.load_and_embed(data_formatter.loader, data_formatter.chunker, url, metadata)
if data_type in ("code_docs_page",):
self.is_code_docs_instance = True
if data_type in ("docs_site",):
self.is_docs_site_instance = True
def add_local(self, data_type, content, metadata=None, config: AddConfig = None):
"""
@@ -201,6 +201,7 @@ class EmbedChain:
def access_search_and_get_results(self, input_query):
from langchain.tools import DuckDuckGoSearchRun
search = DuckDuckGoSearchRun()
logging.info(f"Access search to get answers for {input_query}")
return search.run(input_query)
@@ -218,8 +219,8 @@ class EmbedChain:
"""
if config is None:
config = QueryConfig()
if self.is_code_docs_instance:
config.template = CODE_DOCS_PAGE_PROMPT_TEMPLATE
if self.is_docs_site_instance:
config.template = DOCS_SITE_PROMPT_TEMPLATE
config.number_documents = 5
k = {}
if self.online:
@@ -257,8 +258,8 @@ class EmbedChain:
"""
if config is None:
config = ChatConfig()
if self.is_code_docs_instance:
config.template = CODE_DOCS_PAGE_PROMPT_TEMPLATE
if self.is_docs_site_instance:
config.template = DOCS_SITE_PROMPT_TEMPLATE
config.number_documents = 5
k = {}
if self.online:

View File

@@ -1,64 +0,0 @@
import requests
from bs4 import BeautifulSoup
from embedchain.utils import clean_string
class CodeDocsPageLoader:
    """Load and clean the main textual content of a single code-docs page."""

    def load_data(self, url):
        """Fetch *url* and return its cleaned main content.

        Returns a single-element list of dicts shaped as
        ``{"content": <text>, "meta_data": {"url": url}}``.
        """
        page = requests.get(url).content
        parsed = BeautifulSoup(page, "html.parser")

        # Try progressively more generic selectors until one matches the
        # page's main content region.
        selectors = [
            "article.bd-article",
            'article[role="main"]',
            "div.md-content",
            'div[role="main"]',
            "div.container",
            "div.section",
            "article",
            "main",
        ]
        for selector in selectors:
            match = parsed.select_one(selector)
            if match is not None:
                markup = match.prettify()
                break
        else:
            # No selector matched; fall back to the page's raw text.
            markup = parsed.get_text()

        # Re-parse the extracted region so boilerplate tags can be stripped.
        region = BeautifulSoup(markup, "html.parser")
        noisy_tags = [
            "nav",
            "aside",
            "form",
            "header",
            "noscript",
            "svg",
            "canvas",
            "footer",
            "script",
            "style",
        ]
        for noisy in region(noisy_tags):
            # Blank the tag's text in place rather than removing the tag.
            noisy.string = " "

        # Notebook-style output cells carry no documentation value; drop them.
        for css_class in ("cell_output", "output_wrapper", "output"):
            for cell in region.find_all("div", {"class": css_class}):
                cell.decompose()

        text = clean_string(region.get_text())
        return [
            {
                "content": text,
                "meta_data": {"url": url},
            }
        ]

View File

@@ -0,0 +1,98 @@
import logging
from urllib.parse import urljoin, urlparse
import requests
from bs4 import BeautifulSoup
class DocsSiteLoader:
    """Crawl a documentation website and return cleaned text for every page."""

    # Seconds to wait for any single HTTP request before giving up; without
    # this, one stalled server would hang the whole crawl indefinitely.
    REQUEST_TIMEOUT = 30

    def __init__(self):
        # URLs discovered during the current crawl; reset by _get_all_urls().
        self.visited_links = set()

    def _get_child_links_recursive(self, url):
        """Discover every link nested under *url*'s path.

        Uses an explicit stack instead of function recursion so that large
        or deep documentation sites cannot overflow Python's recursion
        limit. Discovered links accumulate in ``self.visited_links``; the
        starting *url* itself is not added (matching the original
        recursive behavior).
        """
        to_visit = [url]
        while to_visit:
            current = to_visit.pop()
            parsed_url = urlparse(current)
            base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
            current_path = parsed_url.path

            response = requests.get(current, timeout=self.REQUEST_TIMEOUT)
            if response.status_code != 200:
                # Best-effort crawl: log and skip unreachable pages.
                logging.info("Failed to fetch the website: %s", response.status_code)
                continue

            soup = BeautifulSoup(response.text, "html.parser")
            all_links = [link.get("href") for link in soup.find_all("a")]
            # Keep only links that descend strictly below the current path.
            child_links = [
                link for link in all_links
                if link and link.startswith(current_path) and link != current_path
            ]
            for absolute in (urljoin(base_url, link) for link in child_links):
                if absolute not in self.visited_links:
                    self.visited_links.add(absolute)
                    to_visit.append(absolute)

    def _get_all_urls(self, url):
        """Return all crawled URLs that share *url*'s host."""
        self.visited_links = set()
        self._get_child_links_recursive(url)
        # Filter out any off-host links that slipped into the visited set.
        urls = [link for link in self.visited_links if urlparse(link).netloc == urlparse(url).netloc]
        return urls

    def _load_data_from_url(self, url):
        """Fetch one page and return its cleaned content as a list of dicts.

        Returns an empty list when the page cannot be fetched.
        """
        response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        if response.status_code != 200:
            logging.info("Failed to fetch the website: %s", response.status_code)
            return []

        soup = BeautifulSoup(response.content, "html.parser")
        # Progressively more generic selectors for the main content region.
        selectors = [
            "article.bd-article",
            'article[role="main"]',
            "div.md-content",
            'div[role="main"]',
            "div.container",
            "div.section",
            "article",
            "main",
        ]
        output = []
        for selector in selectors:
            element = soup.select_one(selector)
            if element:
                content = element.prettify()
                break
        else:
            # No selector matched; fall back to the page's raw text.
            content = soup.get_text()

        # Re-parse the extracted region and remove boilerplate tags outright.
        soup = BeautifulSoup(content, "html.parser")
        ignored_tags = [
            "nav",
            "aside",
            "form",
            "header",
            "noscript",
            "svg",
            "canvas",
            "footer",
            "script",
            "style",
        ]
        for tag in soup(ignored_tags):
            tag.decompose()

        content = " ".join(soup.stripped_strings)
        output.append(
            {
                "content": content,
                "meta_data": {"url": url},
            }
        )
        return output

    def load_data(self, url):
        """Crawl *url* and return cleaned content for every discovered page."""
        all_urls = self._get_all_urls(url)
        output = []
        for u in all_urls:
            output.extend(self._load_data_from_url(u))
        return output

View File

@@ -1 +0,0 @@
__version__ = "0.0.23"