feat: add SitemapLoader (#68)

This commit is contained in:
aaishikdutta
2023-07-12 12:13:30 +05:30
committed by GitHub
parent 6fbf45498a
commit d2e8f796ca
6 changed files with 37 additions and 1 deletions

View File

@@ -294,6 +294,13 @@ To supply your own QnA pair, use the data_type as `qna_pair` and enter a tuple.
```python
app.add_local('qna_pair', ("Question", "Answer"))
```
### Sitemap
To add an XML sitemap containing a list of all URLs, use the data_type as `sitemap` and enter the sitemap URL. Eg:
```python
app.add('sitemap', 'a_valid_sitemap_url/sitemap.xml')
```
### Reusing a Vector DB

View File

@@ -62,6 +62,7 @@ class InitConfig(BaseConfig):
Sets database to default (`ChromaDb`).
"""
from embedchain.vectordb.chroma_db import ChromaDB
self.db = ChromaDB(ef=self.ef, host=self.host, port=self.port)
def _setup_logging(self, debug_level):

View File

@@ -9,6 +9,7 @@ from embedchain.loaders.docx_file import DocxFileLoader
from embedchain.loaders.local_qna_pair import LocalQnaPairLoader
from embedchain.loaders.local_text import LocalTextLoader
from embedchain.loaders.pdf_file import PdfFileLoader
from embedchain.loaders.sitemap import SitemapLoader
from embedchain.loaders.web_page import WebPageLoader
from embedchain.loaders.youtube_video import YoutubeVideoLoader
@@ -39,6 +40,7 @@ class DataFormatter:
"qna_pair": LocalQnaPairLoader(),
"text": LocalTextLoader(),
"docx": DocxFileLoader(),
"sitemap": SitemapLoader(),
}
if data_type in loaders:
return loaders[data_type]
@@ -60,6 +62,7 @@ class DataFormatter:
"qna_pair": QnaPairChunker(config),
"text": TextChunker(config),
"docx": DocxFileChunker(config),
"sitemap": WebPageChunker(config),
}
if data_type in chunkers:
return chunkers[data_type]

View File

@@ -0,0 +1,24 @@
import requests
from bs4 import BeautifulSoup
from embedchain.loaders.web_page import WebPageLoader
class SitemapLoader:
    """Loads the content of every page listed in an XML sitemap."""

    def load_data(self, sitemap_url):
        """
        Fetch the sitemap at `sitemap_url`, extract every `<loc>` URL,
        and load each page's content with WebPageLoader.

        :param sitemap_url: URL of an XML sitemap (e.g. `.../sitemap.xml`).
        :return: flat list of the data entries loaded from each page.
        :raises requests.HTTPError: if fetching the sitemap fails.
        """
        output = []
        web_page_loader = WebPageLoader()
        # A timeout prevents hanging forever on an unresponsive server.
        response = requests.get(sitemap_url, timeout=30)
        response.raise_for_status()
        # Parse the raw bytes so the XML parser can honor the document's
        # own encoding declaration (response.text guesses the encoding).
        soup = BeautifulSoup(response.content, "xml")
        links = [link.text for link in soup.find_all("loc")]
        for link in links:
            # Flatten with extend instead of taking only element [0], so
            # no entries are dropped if a page's load returns several.
            output.extend(web_page_loader.load_data(link))
        return output

View File

@@ -1,5 +1,5 @@
import os
import logging
import os
import chromadb
from chromadb.utils import embedding_functions

View File

@@ -29,6 +29,7 @@ setuptools.setup(
"beautifulsoup4",
"pypdf",
"pytube",
"lxml",
"gpt4all",
"sentence_transformers",
"docx2txt",