Feature: Add support for loading docs website (#293)
embedchain/chunkers/code_docs_page.py → embedchain/chunkers/docs_site.py
@@ -12,8 +12,8 @@ TEXT_SPLITTER_CHUNK_PARAMS = {
 }
 
 
-class CodeDocsPageChunker(BaseChunker):
-    """Chunker for code docs page."""
+class DocsSiteChunker(BaseChunker):
+    """Chunker for code docs site."""
 
     def __init__(self, config: Optional[ChunkerConfig] = None):
        if config is None:
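
The renamed chunker keeps the same constructor contract: pass a ChunkerConfig, or fall back to the TEXT_SPLITTER_CHUNK_PARAMS defaults referenced above. A minimal construction sketch; the ChunkerConfig keyword names are assumptions, not confirmed by this diff:

    from embedchain.chunkers.docs_site import DocsSiteChunker
    from embedchain.config import ChunkerConfig

    chunker = DocsSiteChunker()  # falls back to the module-level defaults
    # Sizing chunks explicitly (keyword names assumed):
    custom = DocsSiteChunker(ChunkerConfig(chunk_size=500, chunk_overlap=50))
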
embedchain/config/QueryConfig.py
@@ -28,7 +28,7 @@ DEFAULT_PROMPT_WITH_HISTORY = """
 Helpful Answer:
 """ # noqa:E501
 
-CODE_DOCS_PAGE_DEFAULT_PROMPT = """
+DOCS_SITE_DEFAULT_PROMPT = """
 Use the following pieces of context to answer the query at the end.
 If you don't know the answer, just say that you don't know, don't try to make up an answer. Wherever possible, give complete code snippet. Dont make up any code snippet on your own.
 
@@ -41,7 +41,7 @@ CODE_DOCS_PAGE_DEFAULT_PROMPT = """
 
 DEFAULT_PROMPT_TEMPLATE = Template(DEFAULT_PROMPT)
 DEFAULT_PROMPT_WITH_HISTORY_TEMPLATE = Template(DEFAULT_PROMPT_WITH_HISTORY)
-CODE_DOCS_PAGE_PROMPT_TEMPLATE = Template(CODE_DOCS_PAGE_DEFAULT_PROMPT)
+DOCS_SITE_PROMPT_TEMPLATE = Template(DOCS_SITE_DEFAULT_PROMPT)
 query_re = re.compile(r"\$\{*query\}*")
 context_re = re.compile(r"\$\{*context\}*")
 history_re = re.compile(r"\$\{*history\}*")
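
The three regexes accept both placeholder spellings that string.Template understands, e.g. $query and ${query}. A self-contained sketch of checking a custom template against them; the validation step is illustrative, only the regexes come from this diff:

    import re
    from string import Template

    query_re = re.compile(r"\$\{*query\}*")      # matches "$query" and "${query}"
    context_re = re.compile(r"\$\{*context\}*")  # matches "$context" and "${context}"

    custom = Template("Context: $context\nQuery: ${query}\nHelpful Answer:")
    # Illustrative validation: require both placeholders before using the template.
    assert query_re.search(custom.template) and context_re.search(custom.template)
    print(custom.substitute(context="docs text", query="How do I install?"))
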
embedchain/data_formatter/data_formatter.py
@@ -1,4 +1,4 @@
-from embedchain.chunkers.code_docs_page import CodeDocsPageChunker
+from embedchain.chunkers.docs_site import DocsSiteChunker
 from embedchain.chunkers.docx_file import DocxFileChunker
 from embedchain.chunkers.pdf_file import PdfFileChunker
 from embedchain.chunkers.qna_pair import QnaPairChunker
@@ -6,7 +6,7 @@ from embedchain.chunkers.text import TextChunker
 from embedchain.chunkers.web_page import WebPageChunker
 from embedchain.chunkers.youtube_video import YoutubeVideoChunker
 from embedchain.config import AddConfig
-from embedchain.loaders.code_docs_page import CodeDocsPageLoader
+from embedchain.loaders.docs_site_loader import DocsSiteLoader
 from embedchain.loaders.docx_file import DocxFileLoader
 from embedchain.loaders.local_qna_pair import LocalQnaPairLoader
 from embedchain.loaders.local_text import LocalTextLoader
@@ -43,7 +43,7 @@ class DataFormatter:
             "text": LocalTextLoader(),
             "docx": DocxFileLoader(),
             "sitemap": SitemapLoader(),
-            "code_docs_page": CodeDocsPageLoader(),
+            "docs_site": DocsSiteLoader(),
         }
         if data_type in loaders:
             return loaders[data_type]
@@ -66,7 +66,7 @@ class DataFormatter:
             "text": TextChunker(config),
             "docx": DocxFileChunker(config),
             "sitemap": WebPageChunker(config),
-            "code_docs_page": CodeDocsPageChunker(config),
+            "docs_site": DocsSiteChunker(config),
         }
         if data_type in chunkers:
             return chunkers[data_type]
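
Both registries are keyed by the same data_type string, so registering "docs_site" in each table is all that is needed for the new source to flow through add(). A sketch of the resulting wiring; it assumes only what the add() call site below shows, namely that DataFormatter exposes .loader and .chunker:

    from embedchain.config import AddConfig
    from embedchain.data_formatter import DataFormatter

    formatter = DataFormatter("docs_site", AddConfig())
    docs = formatter.loader.load_data("https://docs.example.com/")  # illustrative URL
    chunker = formatter.chunker  # the paired DocsSiteChunker, ready for load_and_embed
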
embedchain/embedchain.py
@@ -9,7 +9,7 @@ from langchain.docstore.document import Document
 from langchain.memory import ConversationBufferMemory
 
 from embedchain.config import AddConfig, ChatConfig, InitConfig, QueryConfig
-from embedchain.config.QueryConfig import CODE_DOCS_PAGE_PROMPT_TEMPLATE, DEFAULT_PROMPT, DEFAULT_PROMPT_WITH_HISTORY
+from embedchain.config.QueryConfig import DOCS_SITE_PROMPT_TEMPLATE, DEFAULT_PROMPT, DEFAULT_PROMPT_WITH_HISTORY
 from embedchain.data_formatter import DataFormatter
 
 gpt4all_model = None
@@ -35,7 +35,7 @@ class EmbedChain:
         self.db_client = self.config.db.client
         self.collection = self.config.db.collection
         self.user_asks = []
-        self.is_code_docs_instance = False
+        self.is_docs_site_instance = False
         self.online = False
 
     def add(self, data_type, url, metadata=None, config: AddConfig = None):
@@ -56,8 +56,8 @@ class EmbedChain:
         data_formatter = DataFormatter(data_type, config)
         self.user_asks.append([data_type, url, metadata])
         self.load_and_embed(data_formatter.loader, data_formatter.chunker, url, metadata)
-        if data_type in ("code_docs_page",):
-            self.is_code_docs_instance = True
+        if data_type in ("docs_site",):
+            self.is_docs_site_instance = True
 
     def add_local(self, data_type, content, metadata=None, config: AddConfig = None):
         """
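
End to end, the new data type plugs into add() like any other source, and adding it flips is_docs_site_instance so later query() and chat() calls pick up the docs-site prompt. A usage sketch, assuming the library's usual App wrapper; the URL is illustrative:

    from embedchain import App

    app = App()
    app.add("docs_site", "https://docs.example.com/")  # crawls and embeds the whole site
    print(app.query("How do I get started?"))          # answered with DOCS_SITE_PROMPT_TEMPLATE
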
@@ -201,6 +201,7 @@ class EmbedChain:
 
     def access_search_and_get_results(self, input_query):
         from langchain.tools import DuckDuckGoSearchRun
 
         search = DuckDuckGoSearchRun()
+        logging.info(f"Access search to get answers for {input_query}")
         return search.run(input_query)
@@ -218,8 +219,8 @@ class EmbedChain:
         """
         if config is None:
             config = QueryConfig()
-        if self.is_code_docs_instance:
-            config.template = CODE_DOCS_PAGE_PROMPT_TEMPLATE
+        if self.is_docs_site_instance:
+            config.template = DOCS_SITE_PROMPT_TEMPLATE
             config.number_documents = 5
         k = {}
         if self.online:
@@ -257,8 +258,8 @@ class EmbedChain:
         """
         if config is None:
             config = ChatConfig()
-        if self.is_code_docs_instance:
-            config.template = CODE_DOCS_PAGE_PROMPT_TEMPLATE
+        if self.is_docs_site_instance:
+            config.template = DOCS_SITE_PROMPT_TEMPLATE
             config.number_documents = 5
         k = {}
         if self.online:
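
Note that query() and chat() apply the docs-site template after the config-is-None check, so it clobbers even a caller-supplied template whenever a docs site has been added. A sketch of opting back out, reusing the app from the sketch above; QueryConfig's template keyword is an assumption here:

    from string import Template
    from embedchain.config import QueryConfig

    config = QueryConfig(template=Template("Context: $context\nQuery: $query\nAnswer:"))
    # With is_docs_site_instance set, query() would replace config.template;
    # clearing the flag keeps the custom template in effect (illustrative workaround).
    app.is_docs_site_instance = False
    print(app.query("How do I configure chunking?", config))
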
embedchain/loaders/code_docs_page.py (deleted)
@@ -1,64 +0,0 @@
-import requests
-from bs4 import BeautifulSoup
-
-from embedchain.utils import clean_string
-
-
-class CodeDocsPageLoader:
-    def load_data(self, url):
-        """Load data from a web page."""
-        response = requests.get(url)
-        data = response.content
-        soup = BeautifulSoup(data, "html.parser")
-        selectors = [
-            "article.bd-article",
-            'article[role="main"]',
-            "div.md-content",
-            'div[role="main"]',
-            "div.container",
-            "div.section",
-            "article",
-            "main",
-        ]
-        content = None
-        for selector in selectors:
-            element = soup.select_one(selector)
-            if element is not None:
-                content = element.prettify()
-                break
-        if not content:
-            content = soup.get_text()
-        soup = BeautifulSoup(content, "html.parser")
-        for tag in soup(
-            [
-                "nav",
-                "aside",
-                "form",
-                "header",
-                "noscript",
-                "svg",
-                "canvas",
-                "footer",
-                "script",
-                "style",
-            ]
-        ):
-            tag.string = " "
-        for div in soup.find_all("div", {"class": "cell_output"}):
-            div.decompose()
-        for div in soup.find_all("div", {"class": "output_wrapper"}):
-            div.decompose()
-        for div in soup.find_all("div", {"class": "output"}):
-            div.decompose()
-        content = clean_string(soup.get_text())
-        output = []
-        meta_data = {
-            "url": url,
-        }
-        output.append(
-            {
-                "content": content,
-                "meta_data": meta_data,
-            }
-        )
-        return output
embedchain/loaders/docs_site_loader.py (new file)
@@ -0,0 +1,98 @@
+import logging
+from urllib.parse import urljoin, urlparse
+
+import requests
+from bs4 import BeautifulSoup
+
+
+class DocsSiteLoader:
+    def __init__(self):
+        self.visited_links = set()
+
+    def _get_child_links_recursive(self, url):
+        parsed_url = urlparse(url)
+        base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
+        current_path = parsed_url.path
+
+        response = requests.get(url)
+        if response.status_code != 200:
+            logging.info(f"Failed to fetch the website: {response.status_code}")
+            return
+
+        soup = BeautifulSoup(response.text, "html.parser")
+        all_links = [link.get("href") for link in soup.find_all("a")]
+
+        child_links = [link for link in all_links if link and link.startswith(current_path) and link != current_path]
+
+        absolute_paths = [urljoin(base_url, link) for link in child_links]
+
+        for link in absolute_paths:
+            if link not in self.visited_links:
+                self.visited_links.add(link)
+                self._get_child_links_recursive(link)
+
+    def _get_all_urls(self, url):
+        self.visited_links = set()
+        self._get_child_links_recursive(url)
+        urls = [link for link in self.visited_links if urlparse(link).netloc == urlparse(url).netloc]
+        return urls
+
+    def _load_data_from_url(self, url):
+        response = requests.get(url)
+        if response.status_code != 200:
+            logging.info(f"Failed to fetch the website: {response.status_code}")
+            return []
+
+        soup = BeautifulSoup(response.content, "html.parser")
+        selectors = [
+            "article.bd-article",
+            'article[role="main"]',
+            "div.md-content",
+            'div[role="main"]',
+            "div.container",
+            "div.section",
+            "article",
+            "main",
+        ]
+
+        output = []
+        for selector in selectors:
+            element = soup.select_one(selector)
+            if element:
+                content = element.prettify()
+                break
+        else:
+            content = soup.get_text()
+
+        soup = BeautifulSoup(content, "html.parser")
+        ignored_tags = [
+            "nav",
+            "aside",
+            "form",
+            "header",
+            "noscript",
+            "svg",
+            "canvas",
+            "footer",
+            "script",
+            "style",
+        ]
+        for tag in soup(ignored_tags):
+            tag.decompose()
+
+        content = " ".join(soup.stripped_strings)
+        output.append(
+            {
+                "content": content,
+                "meta_data": {"url": url},
+            }
+        )
+
+        return output
+
+    def load_data(self, url):
+        all_urls = self._get_all_urls(url)
+        output = []
+        for u in all_urls:
+            output.extend(self._load_data_from_url(u))
+        return output
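
The crawl in _get_child_links_recursive only follows hrefs that start with the seed URL's path, and _get_all_urls additionally keeps links on the seed's host, so a seed like https://docs.example.com/guide/ stays within that subtree. A quick standalone exercise of the loader; the URL is illustrative:

    from embedchain.loaders.docs_site_loader import DocsSiteLoader

    loader = DocsSiteLoader()
    docs = loader.load_data("https://docs.example.com/")  # recursive crawl from the seed
    for doc in docs[:3]:
        print(doc["meta_data"]["url"], "->", len(doc["content"]), "chars")
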
@@ -1 +0,0 @@
-__version__ = "0.0.23"