feat: add SitemapLoader (#68)

This commit is contained in:
aaishikdutta
2023-07-12 12:13:30 +05:30
committed by GitHub
parent 6fbf45498a
commit d2e8f796ca
6 changed files with 37 additions and 1 deletions

View File

@@ -294,6 +294,13 @@ To supply your own QnA pair, use the data_type as `qna_pair` and enter a tuple.
```python
app.add_local('qna_pair', ("Question", "Answer"))
```
### Sitemap
To add an XML sitemap containing a list of all URLs, use the data_type as `sitemap` and enter the sitemap URL. Eg:
```python
app.add('sitemap', 'a_valid_sitemap_url/sitemap.xml')
```
### Reusing a Vector DB

View File

@@ -62,6 +62,7 @@ class InitConfig(BaseConfig):
Sets database to default (`ChromaDb`).
"""
from embedchain.vectordb.chroma_db import ChromaDB
self.db = ChromaDB(ef=self.ef, host=self.host, port=self.port)
def _setup_logging(self, debug_level):

View File

@@ -9,6 +9,7 @@ from embedchain.loaders.docx_file import DocxFileLoader
from embedchain.loaders.local_qna_pair import LocalQnaPairLoader
from embedchain.loaders.local_text import LocalTextLoader
from embedchain.loaders.pdf_file import PdfFileLoader
from embedchain.loaders.sitemap import SitemapLoader
from embedchain.loaders.web_page import WebPageLoader
from embedchain.loaders.youtube_video import YoutubeVideoLoader
@@ -39,6 +40,7 @@ class DataFormatter:
"qna_pair": LocalQnaPairLoader(),
"text": LocalTextLoader(),
"docx": DocxFileLoader(),
"sitemap": SitemapLoader(),
}
if data_type in loaders:
return loaders[data_type]
@@ -60,6 +62,7 @@ class DataFormatter:
"qna_pair": QnaPairChunker(config),
"text": TextChunker(config),
"docx": DocxFileChunker(config),
"sitemap": WebPageChunker(config),
}
if data_type in chunkers:
return chunkers[data_type]

View File

@@ -0,0 +1,24 @@
import requests
from bs4 import BeautifulSoup
from embedchain.loaders.web_page import WebPageLoader
class SitemapLoader:
    """Loads the content of every page listed in an XML sitemap."""

    def load_data(self, sitemap_url):
        """
        Fetch the sitemap at `sitemap_url`, extract every `<loc>` URL,
        and load each page's content with WebPageLoader.

        :param sitemap_url: URL of an XML sitemap (e.g. `.../sitemap.xml`).
        :return: flat list of the data entries loaded from each page.
        :raises requests.HTTPError: if fetching the sitemap fails.
        """
        output = []
        web_page_loader = WebPageLoader()
        # A timeout prevents hanging forever on an unresponsive server.
        response = requests.get(sitemap_url, timeout=30)
        response.raise_for_status()
        # Parse the raw bytes so the XML parser can honor the document's
        # own encoding declaration (response.text guesses the encoding).
        soup = BeautifulSoup(response.content, "xml")
        links = [link.text for link in soup.find_all("loc")]
        for link in links:
            # Flatten with extend instead of taking only element [0], so
            # no entries are dropped if a page's load returns several.
            output.extend(web_page_loader.load_data(link))
        return output

View File

@@ -1,5 +1,5 @@
import os
import logging
import os
import chromadb
from chromadb.utils import embedding_functions

View File

@@ -29,6 +29,7 @@ setuptools.setup(
"beautifulsoup4",
"pypdf",
"pytube",
"lxml",
"gpt4all",
"sentence_transformers",
"docx2txt",