feat: add SitemapLoader (#68)
This commit is contained in:
@@ -62,6 +62,7 @@ class InitConfig(BaseConfig):
|
||||
Sets database to default (`ChromaDb`).
|
||||
"""
|
||||
from embedchain.vectordb.chroma_db import ChromaDB
|
||||
|
||||
self.db = ChromaDB(ef=self.ef, host=self.host, port=self.port)
|
||||
|
||||
def _setup_logging(self, debug_level):
|
||||
|
||||
@@ -9,6 +9,7 @@ from embedchain.loaders.docx_file import DocxFileLoader
|
||||
from embedchain.loaders.local_qna_pair import LocalQnaPairLoader
|
||||
from embedchain.loaders.local_text import LocalTextLoader
|
||||
from embedchain.loaders.pdf_file import PdfFileLoader
|
||||
from embedchain.loaders.sitemap import SitemapLoader
|
||||
from embedchain.loaders.web_page import WebPageLoader
|
||||
from embedchain.loaders.youtube_video import YoutubeVideoLoader
|
||||
|
||||
@@ -39,6 +40,7 @@ class DataFormatter:
|
||||
"qna_pair": LocalQnaPairLoader(),
|
||||
"text": LocalTextLoader(),
|
||||
"docx": DocxFileLoader(),
|
||||
"sitemap": SitemapLoader(),
|
||||
}
|
||||
if data_type in loaders:
|
||||
return loaders[data_type]
|
||||
@@ -60,6 +62,7 @@ class DataFormatter:
|
||||
"qna_pair": QnaPairChunker(config),
|
||||
"text": TextChunker(config),
|
||||
"docx": DocxFileChunker(config),
|
||||
"sitemap": WebPageChunker(config),
|
||||
}
|
||||
if data_type in chunkers:
|
||||
return chunkers[data_type]
|
||||
|
||||
24
embedchain/loaders/sitemap.py
Normal file
@@ -0,0 +1,24 @@
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from embedchain.loaders.web_page import WebPageLoader
|
||||
|
||||
|
||||
class SitemapLoader:
    def load_data(self, sitemap_url):
        """
        Load the content of every page listed in a sitemap.

        Fetches the XML document at ``sitemap_url``, collects each
        ``<loc>`` entry, and loads every referenced page through
        WebPageLoader. For each page, only the first item of the
        loader's result is kept.
        """
        page_loader = WebPageLoader()

        # Fetch the sitemap itself; fail fast on an HTTP error status.
        resp = requests.get(sitemap_url)
        resp.raise_for_status()

        # The sitemap is XML; every <loc> element holds one page URL.
        parsed = BeautifulSoup(resp.text, "xml")
        page_urls = [loc.text for loc in parsed.find_all("loc")]

        # Delegate per-page loading; keep the first result for each page.
        return [page_loader.load_data(url)[0] for url in page_urls]
|
||||
@@ -1,5 +1,5 @@
|
||||
import os
|
||||
import logging
|
||||
import os
|
||||
|
||||
import chromadb
|
||||
from chromadb.utils import embedding_functions
|
||||
|
||||
Reference in New Issue
Block a user