feat: add SitemapLoader (#68)

This commit is contained in:
aaishikdutta
2023-07-12 12:13:30 +05:30
committed by GitHub
parent 6fbf45498a
commit d2e8f796ca
6 changed files with 37 additions and 1 deletions

View File

@@ -294,6 +294,13 @@ To supply your own QnA pair, use the data_type as `qna_pair` and enter a tuple.
```python ```python
app.add_local('qna_pair', ("Question", "Answer")) app.add_local('qna_pair', ("Question", "Answer"))
``` ```
### Sitemap
To add an XML sitemap containing a list of all URLs, use the data_type `sitemap` and enter the sitemap URL. E.g.:
```python
app.add('sitemap', 'a_valid_sitemap_url/sitemap.xml')
```
### Reusing a Vector DB ### Reusing a Vector DB

View File

@@ -62,6 +62,7 @@ class InitConfig(BaseConfig):
Sets database to default (`ChromaDb`). Sets database to default (`ChromaDb`).
""" """
from embedchain.vectordb.chroma_db import ChromaDB from embedchain.vectordb.chroma_db import ChromaDB
self.db = ChromaDB(ef=self.ef, host=self.host, port=self.port) self.db = ChromaDB(ef=self.ef, host=self.host, port=self.port)
def _setup_logging(self, debug_level): def _setup_logging(self, debug_level):

View File

@@ -9,6 +9,7 @@ from embedchain.loaders.docx_file import DocxFileLoader
from embedchain.loaders.local_qna_pair import LocalQnaPairLoader from embedchain.loaders.local_qna_pair import LocalQnaPairLoader
from embedchain.loaders.local_text import LocalTextLoader from embedchain.loaders.local_text import LocalTextLoader
from embedchain.loaders.pdf_file import PdfFileLoader from embedchain.loaders.pdf_file import PdfFileLoader
from embedchain.loaders.sitemap import SitemapLoader
from embedchain.loaders.web_page import WebPageLoader from embedchain.loaders.web_page import WebPageLoader
from embedchain.loaders.youtube_video import YoutubeVideoLoader from embedchain.loaders.youtube_video import YoutubeVideoLoader
@@ -39,6 +40,7 @@ class DataFormatter:
"qna_pair": LocalQnaPairLoader(), "qna_pair": LocalQnaPairLoader(),
"text": LocalTextLoader(), "text": LocalTextLoader(),
"docx": DocxFileLoader(), "docx": DocxFileLoader(),
"sitemap": SitemapLoader(),
} }
if data_type in loaders: if data_type in loaders:
return loaders[data_type] return loaders[data_type]
@@ -60,6 +62,7 @@ class DataFormatter:
"qna_pair": QnaPairChunker(config), "qna_pair": QnaPairChunker(config),
"text": TextChunker(config), "text": TextChunker(config),
"docx": DocxFileChunker(config), "docx": DocxFileChunker(config),
"sitemap": WebPageChunker(config),
} }
if data_type in chunkers: if data_type in chunkers:
return chunkers[data_type] return chunkers[data_type]

View File

@@ -0,0 +1,24 @@
import requests
from bs4 import BeautifulSoup
from embedchain.loaders.web_page import WebPageLoader
class SitemapLoader:
    """Load the content of every page listed in an XML sitemap."""

    def load_data(self, sitemap_url, timeout=30):
        """
        Fetch a sitemap and load each page it lists with WebPageLoader.

        :param sitemap_url: URL of the XML sitemap to download.
        :param timeout: Seconds to wait on the sitemap server before giving
            up; forwarded to `requests.get`.
        :return: A list with the first entry WebPageLoader returned for each
            listed URL, in the order the URLs appear in the sitemap.
        :raises requests.exceptions.RequestException: If the sitemap cannot
            be downloaded or the server answers with an error status.
        """
        # Without a timeout, requests waits indefinitely on a stalled server.
        response = requests.get(sitemap_url, timeout=timeout)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, "xml")
        # Pretty-printed sitemaps wrap <loc> values in newlines/spaces, so
        # normalise the text before treating it as a URL.
        links = [loc.text.strip() for loc in soup.find_all("loc")]

        web_page_loader = WebPageLoader()
        output = []
        for link in links:
            if not link:
                # An empty <loc> element carries no URL to load.
                continue
            page_data = web_page_loader.load_data(link)
            # Only the first entry per page is kept; an empty result is
            # skipped because indexing it would raise IndexError.
            if page_data:
                output.append(page_data[0])
        return output

View File

@@ -1,5 +1,5 @@
import os
import logging import logging
import os
import chromadb import chromadb
from chromadb.utils import embedding_functions from chromadb.utils import embedding_functions

View File

@@ -29,6 +29,7 @@ setuptools.setup(
"beautifulsoup4", "beautifulsoup4",
"pypdf", "pypdf",
"pytube", "pytube",
"lxml",
"gpt4all", "gpt4all",
"sentence_transformers", "sentence_transformers",
"docx2txt", "docx2txt",