From d2e8f796ca60d41e09e0e69415d1ec5e056b884e Mon Sep 17 00:00:00 2001
From: aaishikdutta <107566376+aaishikdutta@users.noreply.github.com>
Date: Wed, 12 Jul 2023 12:13:30 +0530
Subject: [PATCH] feat: add SitemapLoader (#68)

---
 README.md                                   |  7 ++++
 embedchain/config/InitConfig.py             |  1 +
 embedchain/data_formatter/data_formatter.py |  3 ++
 embedchain/loaders/sitemap.py               | 38 +++++++++++++++++++++
 embedchain/vectordb/chroma_db.py            |  2 +-
 setup.py                                    |  1 +
 6 files changed, 51 insertions(+), 1 deletion(-)
 create mode 100644 embedchain/loaders/sitemap.py

diff --git a/README.md b/README.md
index e72a92b8..be88d23a 100644
--- a/README.md
+++ b/README.md
@@ -294,6 +294,13 @@ To supply your own QnA pair, use the data_type as `qna_pair` and enter a tuple.
 ```python
 app.add_local('qna_pair', ("Question", "Answer"))
 ```
+### Sitemap
+
+To add an XML sitemap containing a list of all URLs, use the data_type as `sitemap` and enter the sitemap URL. E.g.:
+
+```python
+app.add('sitemap', 'a_valid_sitemap_url/sitemap.xml')
+```
 
 ### Reusing a Vector DB
 
diff --git a/embedchain/config/InitConfig.py b/embedchain/config/InitConfig.py
index fb4daf2e..923ab879 100644
--- a/embedchain/config/InitConfig.py
+++ b/embedchain/config/InitConfig.py
@@ -62,6 +62,7 @@ class InitConfig(BaseConfig):
         Sets database to default (`ChromaDb`).
         """
         from embedchain.vectordb.chroma_db import ChromaDB
+
         self.db = ChromaDB(ef=self.ef, host=self.host, port=self.port)
 
     def _setup_logging(self, debug_level):
diff --git a/embedchain/data_formatter/data_formatter.py b/embedchain/data_formatter/data_formatter.py
index bff1b4f2..6e6e18f7 100644
--- a/embedchain/data_formatter/data_formatter.py
+++ b/embedchain/data_formatter/data_formatter.py
@@ -9,6 +9,7 @@ from embedchain.loaders.docx_file import DocxFileLoader
 from embedchain.loaders.local_qna_pair import LocalQnaPairLoader
 from embedchain.loaders.local_text import LocalTextLoader
 from embedchain.loaders.pdf_file import PdfFileLoader
+from embedchain.loaders.sitemap import SitemapLoader
 from embedchain.loaders.web_page import WebPageLoader
 from embedchain.loaders.youtube_video import YoutubeVideoLoader
 
@@ -39,6 +40,7 @@ class DataFormatter:
             "qna_pair": LocalQnaPairLoader(),
             "text": LocalTextLoader(),
             "docx": DocxFileLoader(),
+            "sitemap": SitemapLoader(),
         }
         if data_type in loaders:
             return loaders[data_type]
@@ -60,6 +62,7 @@ class DataFormatter:
             "qna_pair": QnaPairChunker(config),
             "text": TextChunker(config),
             "docx": DocxFileChunker(config),
+            "sitemap": WebPageChunker(config),
         }
         if data_type in chunkers:
             return chunkers[data_type]
diff --git a/embedchain/loaders/sitemap.py b/embedchain/loaders/sitemap.py
new file mode 100644
index 00000000..e38a2be8
--- /dev/null
+++ b/embedchain/loaders/sitemap.py
@@ -0,0 +1,38 @@
+import requests
+from bs4 import BeautifulSoup
+
+from embedchain.loaders.web_page import WebPageLoader
+
+
+class SitemapLoader:
+    """Load the content of every page listed in an XML sitemap."""
+
+    def load_data(self, sitemap_url, timeout=30):
+        """
+        Fetch the sitemap at `sitemap_url`, read the URL in each
+        <loc> element and load every page with the WebPageLoader.
+
+        :param sitemap_url: URL of the XML sitemap.
+        :param timeout: Seconds to wait for the sitemap response.
+        :return: List holding the first item the WebPageLoader
+            returned for each page.
+        :raises requests.RequestException: If fetching the sitemap fails.
+        """
+        # Without a timeout, requests can block forever on a stalled server.
+        response = requests.get(sitemap_url, timeout=timeout)
+        response.raise_for_status()
+
+        soup = BeautifulSoup(response.text, "xml")
+        # Pretty-printed sitemaps wrap URLs in whitespace; drop it and
+        # ignore empty <loc> elements.
+        links = [loc.text.strip() for loc in soup.find_all("loc")]
+
+        output = []
+        web_page_loader = WebPageLoader()
+        for link in links:
+            if not link:
+                continue
+            # Slice instead of indexing [0] so that an empty result for
+            # one page does not raise IndexError for the whole sitemap.
+            output.extend(web_page_loader.load_data(link)[:1])
+        return output
diff --git a/embedchain/vectordb/chroma_db.py b/embedchain/vectordb/chroma_db.py
index 2142f9d3..72408104 100644
--- a/embedchain/vectordb/chroma_db.py
+++ b/embedchain/vectordb/chroma_db.py
@@ -1,5 +1,5 @@
-import os
 import logging
+import os
 
 import chromadb
 from chromadb.utils import embedding_functions
diff --git a/setup.py b/setup.py
index 6469e334..c9dee211 100644
--- a/setup.py
+++ b/setup.py
@@ -29,6 +29,7 @@ setuptools.setup(
         "beautifulsoup4",
         "pypdf",
         "pytube",
+        "lxml",
         "gpt4all",
         "sentence_transformers",
         "docx2txt",