feat: add SitemapLoader (#68)

2023-07-12 12:13:30 +05:30
parent 6fbf45498a
commit d2e8f796ca
6 changed files with 37 additions and 1 deletions
--- a/README.md
+++ b/README.md
@@ -294,6 +294,13 @@ To supply your own QnA pair, use the data_type as `qna_pair` and enter a tuple.
 ```python
 app.add_local('qna_pair', ("Question", "Answer"))
 ```
 ### Sitemap
 To add an XML sitemap containing a list of all URLs, use the data_type `sitemap` and pass the sitemap URL. E.g.:
 ```python
 app.add('sitemap', 'a_valid_sitemap_url/sitemap.xml')
 ```
 ### Reusing a Vector DB
--- a/embedchain/config/InitConfig.py
+++ b/embedchain/config/InitConfig.py
@@ -62,6 +62,7 @@ class InitConfig(BaseConfig):
        Sets database to default (`ChromaDb`).
        """
        from embedchain.vectordb.chroma_db import ChromaDB
        self.db = ChromaDB(ef=self.ef, host=self.host, port=self.port)
    def _setup_logging(self, debug_level):
--- a/embedchain/data_formatter/data_formatter.py
+++ b/embedchain/data_formatter/data_formatter.py
@@ -9,6 +9,7 @@ from embedchain.loaders.docx_file import DocxFileLoader
 from embedchain.loaders.local_qna_pair import LocalQnaPairLoader
 from embedchain.loaders.local_text import LocalTextLoader
 from embedchain.loaders.pdf_file import PdfFileLoader
 from embedchain.loaders.sitemap import SitemapLoader
 from embedchain.loaders.web_page import WebPageLoader
 from embedchain.loaders.youtube_video import YoutubeVideoLoader
@@ -39,6 +40,7 @@ class DataFormatter:
            "qna_pair": LocalQnaPairLoader(),
            "text": LocalTextLoader(),
            "docx": DocxFileLoader(),
            "sitemap": SitemapLoader(),
        }
        if data_type in loaders:
            return loaders[data_type]
@@ -60,6 +62,7 @@ class DataFormatter:
            "qna_pair": QnaPairChunker(config),
            "text": TextChunker(config),
            "docx": DocxFileChunker(config),
            "sitemap": WebPageChunker(config),
        }
        if data_type in chunkers:
            return chunkers[data_type]
--- a/embedchain/loaders/sitemap.py
+++ b/embedchain/loaders/sitemap.py
@@ -0,0 +1,24 @@
 import requests
 from bs4 import BeautifulSoup
 from embedchain.loaders.web_page import WebPageLoader
 class SitemapLoader:
    """Load the content of every page listed in an XML sitemap."""

    # Seconds to wait on the sitemap request before giving up. Without an
    # explicit timeout `requests.get` may block indefinitely when the
    # remote host stops responding.
    REQUEST_TIMEOUT = 30

    def load_data(self, sitemap_url):
        """
        Fetch the sitemap at `sitemap_url`, collect the URL found in each
        `<loc>` element and load every page with the WebPageLoader.

        :param sitemap_url: URL of the XML sitemap to download.
        :return: A list holding the first record that the WebPageLoader
        returned for each page. Pages for which the loader returned
        nothing are skipped.
        :raises requests.exceptions.RequestException: If the sitemap
        request times out, fails, or answers with an HTTP error status.
        """
        response = requests.get(sitemap_url, timeout=self.REQUEST_TIMEOUT)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, "xml")
        # The text of a <loc> element is frequently padded with whitespace
        # or newlines, so normalise it before using it as a URL.
        links = [loc.text.strip() for loc in soup.find_all("loc")]

        web_page_loader = WebPageLoader()
        output = []
        for link in links:
            if not link:
                # An empty <loc> element carries no URL to load.
                continue
            page_data = web_page_loader.load_data(link)
            # NOTE(review): assumes the loader returns a sequence whose first
            # item is the page record -- confirm against WebPageLoader.
            # Guard the index so a page that yields nothing does not abort
            # the whole sitemap with an IndexError.
            if page_data:
                output.append(page_data[0])
        return output
--- a/embedchain/vectordb/chroma_db.py
+++ b/embedchain/vectordb/chroma_db.py
@@ -1,5 +1,5 @@
 import os
 import logging
 import os
 import chromadb
 from chromadb.utils import embedding_functions
--- a/setup.py
+++ b/setup.py
@@ -29,6 +29,7 @@ setuptools.setup(
        "beautifulsoup4",
        "pypdf",
        "pytube",
        "lxml",
        "gpt4all",
        "sentence_transformers",
        "docx2txt",