feat: add SitemapLoader (#68)
This commit is contained in:
@@ -294,6 +294,13 @@ To supply your own QnA pair, use the data_type as `qna_pair` and enter a tuple.
|
|||||||
```python
|
```python
|
||||||
app.add_local('qna_pair', ("Question", "Answer"))
|
app.add_local('qna_pair', ("Question", "Answer"))
|
||||||
```
|
```
|
||||||
|
### Sitemap
|
||||||
|
|
||||||
|
To add an XML sitemap containing a list of all the URLs of a site, use the data_type as `sitemap` and enter the sitemap URL. E.g.:
|
||||||
|
|
||||||
|
```python
|
||||||
|
app.add('sitemap', 'a_valid_sitemap_url/sitemap.xml')
|
||||||
|
```
|
||||||
|
|
||||||
### Reusing a Vector DB
|
### Reusing a Vector DB
|
||||||
|
|
||||||
|
|||||||
@@ -62,6 +62,7 @@ class InitConfig(BaseConfig):
|
|||||||
Sets database to default (`ChromaDb`).
|
Sets database to default (`ChromaDb`).
|
||||||
"""
|
"""
|
||||||
from embedchain.vectordb.chroma_db import ChromaDB
|
from embedchain.vectordb.chroma_db import ChromaDB
|
||||||
|
|
||||||
self.db = ChromaDB(ef=self.ef, host=self.host, port=self.port)
|
self.db = ChromaDB(ef=self.ef, host=self.host, port=self.port)
|
||||||
|
|
||||||
def _setup_logging(self, debug_level):
|
def _setup_logging(self, debug_level):
|
||||||
|
|||||||
@@ -9,6 +9,7 @@ from embedchain.loaders.docx_file import DocxFileLoader
|
|||||||
from embedchain.loaders.local_qna_pair import LocalQnaPairLoader
|
from embedchain.loaders.local_qna_pair import LocalQnaPairLoader
|
||||||
from embedchain.loaders.local_text import LocalTextLoader
|
from embedchain.loaders.local_text import LocalTextLoader
|
||||||
from embedchain.loaders.pdf_file import PdfFileLoader
|
from embedchain.loaders.pdf_file import PdfFileLoader
|
||||||
|
from embedchain.loaders.sitemap import SitemapLoader
|
||||||
from embedchain.loaders.web_page import WebPageLoader
|
from embedchain.loaders.web_page import WebPageLoader
|
||||||
from embedchain.loaders.youtube_video import YoutubeVideoLoader
|
from embedchain.loaders.youtube_video import YoutubeVideoLoader
|
||||||
|
|
||||||
@@ -39,6 +40,7 @@ class DataFormatter:
|
|||||||
"qna_pair": LocalQnaPairLoader(),
|
"qna_pair": LocalQnaPairLoader(),
|
||||||
"text": LocalTextLoader(),
|
"text": LocalTextLoader(),
|
||||||
"docx": DocxFileLoader(),
|
"docx": DocxFileLoader(),
|
||||||
|
"sitemap": SitemapLoader(),
|
||||||
}
|
}
|
||||||
if data_type in loaders:
|
if data_type in loaders:
|
||||||
return loaders[data_type]
|
return loaders[data_type]
|
||||||
@@ -60,6 +62,7 @@ class DataFormatter:
|
|||||||
"qna_pair": QnaPairChunker(config),
|
"qna_pair": QnaPairChunker(config),
|
||||||
"text": TextChunker(config),
|
"text": TextChunker(config),
|
||||||
"docx": DocxFileChunker(config),
|
"docx": DocxFileChunker(config),
|
||||||
|
"sitemap": WebPageChunker(config),
|
||||||
}
|
}
|
||||||
if data_type in chunkers:
|
if data_type in chunkers:
|
||||||
return chunkers[data_type]
|
return chunkers[data_type]
|
||||||
|
|||||||
24
embedchain/loaders/sitemap.py
Normal file
24
embedchain/loaders/sitemap.py
Normal file
@@ -0,0 +1,24 @@
|
|||||||
|
import requests
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
|
from embedchain.loaders.web_page import WebPageLoader
|
||||||
|
|
||||||
|
|
||||||
|
class SitemapLoader:
    """Loader that expands an XML sitemap into the content of the pages it lists."""

    def load_data(self, sitemap_url):
        """
        Fetch the sitemap at `sitemap_url` and load every URL it lists.

        The sitemap is downloaded with `requests`, parsed as XML, and the
        text of each `<loc>` element is handed to `WebPageLoader.load_data`.

        :param sitemap_url: URL of the XML sitemap to fetch.
        :return: A list holding, for each listed URL, the first item
            returned by `WebPageLoader.load_data` for that URL.
        :raises requests.HTTPError: If the sitemap request returns an
            error status (via `response.raise_for_status()`).
        """
        web_page_loader = WebPageLoader()
        response = requests.get(sitemap_url)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, "xml")
        # Pretty-printed sitemaps frequently put whitespace/newlines around
        # the URL inside <loc>; strip it so the page loader receives a clean
        # URL, and drop <loc> elements that turn out to be empty.
        links = [loc.text.strip() for loc in soup.find_all("loc")]
        links = [link for link in links if link]

        output = []
        for link in links:
            # Only the first item of each page's result is kept, as before.
            output.append(web_page_loader.load_data(link)[0])
        return output
|
||||||
@@ -1,5 +1,5 @@
|
|||||||
import os
|
|
||||||
import logging
|
import logging
|
||||||
|
import os
|
||||||
|
|
||||||
import chromadb
|
import chromadb
|
||||||
from chromadb.utils import embedding_functions
|
from chromadb.utils import embedding_functions
|
||||||
|
|||||||
Reference in New Issue
Block a user