Feature: Add support for loading docs website (#293)

This commit is contained in:
Deshraj Yadav
2023-07-16 22:22:52 -07:00
committed by GitHub
parent d5e40e1853
commit a548863a09
10 changed files with 173 additions and 86 deletions

View File

@@ -12,8 +12,8 @@ TEXT_SPLITTER_CHUNK_PARAMS = {
}
class CodeDocsPageChunker(BaseChunker):
"""Chunker for code docs page."""
class DocsSiteChunker(BaseChunker):
"""Chunker for code docs site."""
def __init__(self, config: Optional[ChunkerConfig] = None):
if config is None:

View File

@@ -28,7 +28,7 @@ DEFAULT_PROMPT_WITH_HISTORY = """
Helpful Answer:
""" # noqa:E501
CODE_DOCS_PAGE_DEFAULT_PROMPT = """
DOCS_SITE_DEFAULT_PROMPT = """
Use the following pieces of context to answer the query at the end.
If you don't know the answer, just say that you don't know, don't try to make up an answer. Wherever possible, give complete code snippet. Dont make up any code snippet on your own.
@@ -41,7 +41,7 @@ CODE_DOCS_PAGE_DEFAULT_PROMPT = """
DEFAULT_PROMPT_TEMPLATE = Template(DEFAULT_PROMPT)
DEFAULT_PROMPT_WITH_HISTORY_TEMPLATE = Template(DEFAULT_PROMPT_WITH_HISTORY)
CODE_DOCS_PAGE_PROMPT_TEMPLATE = Template(CODE_DOCS_PAGE_DEFAULT_PROMPT)
DOCS_SITE_PROMPT_TEMPLATE = Template(DOCS_SITE_DEFAULT_PROMPT)
query_re = re.compile(r"\$\{*query\}*")
context_re = re.compile(r"\$\{*context\}*")
history_re = re.compile(r"\$\{*history\}*")

View File

@@ -1,4 +1,4 @@
from embedchain.chunkers.code_docs_page import CodeDocsPageChunker
from embedchain.chunkers.docs_site import DocsSiteChunker
from embedchain.chunkers.docx_file import DocxFileChunker
from embedchain.chunkers.pdf_file import PdfFileChunker
from embedchain.chunkers.qna_pair import QnaPairChunker
@@ -6,7 +6,7 @@ from embedchain.chunkers.text import TextChunker
from embedchain.chunkers.web_page import WebPageChunker
from embedchain.chunkers.youtube_video import YoutubeVideoChunker
from embedchain.config import AddConfig
from embedchain.loaders.code_docs_page import CodeDocsPageLoader
from embedchain.loaders.docs_site_loader import DocsSiteLoader
from embedchain.loaders.docx_file import DocxFileLoader
from embedchain.loaders.local_qna_pair import LocalQnaPairLoader
from embedchain.loaders.local_text import LocalTextLoader
@@ -43,7 +43,7 @@ class DataFormatter:
"text": LocalTextLoader(),
"docx": DocxFileLoader(),
"sitemap": SitemapLoader(),
"code_docs_page": CodeDocsPageLoader(),
"docs_site": DocsSiteLoader(),
}
if data_type in loaders:
return loaders[data_type]
@@ -66,7 +66,7 @@ class DataFormatter:
"text": TextChunker(config),
"docx": DocxFileChunker(config),
"sitemap": WebPageChunker(config),
"code_docs_page": CodeDocsPageChunker(config),
"docs_site": DocsSiteChunker(config),
}
if data_type in chunkers:
return chunkers[data_type]

View File

@@ -9,7 +9,7 @@ from langchain.docstore.document import Document
from langchain.memory import ConversationBufferMemory
from embedchain.config import AddConfig, ChatConfig, InitConfig, QueryConfig
from embedchain.config.QueryConfig import CODE_DOCS_PAGE_PROMPT_TEMPLATE, DEFAULT_PROMPT, DEFAULT_PROMPT_WITH_HISTORY
from embedchain.config.QueryConfig import DOCS_SITE_PROMPT_TEMPLATE, DEFAULT_PROMPT, DEFAULT_PROMPT_WITH_HISTORY
from embedchain.data_formatter import DataFormatter
gpt4all_model = None
@@ -35,7 +35,7 @@ class EmbedChain:
self.db_client = self.config.db.client
self.collection = self.config.db.collection
self.user_asks = []
self.is_code_docs_instance = False
self.is_docs_site_instance = False
self.online = False
def add(self, data_type, url, metadata=None, config: AddConfig = None):
@@ -56,8 +56,8 @@ class EmbedChain:
data_formatter = DataFormatter(data_type, config)
self.user_asks.append([data_type, url, metadata])
self.load_and_embed(data_formatter.loader, data_formatter.chunker, url, metadata)
if data_type in ("code_docs_page",):
self.is_code_docs_instance = True
if data_type in ("docs_site",):
self.is_docs_site_instance = True
def add_local(self, data_type, content, metadata=None, config: AddConfig = None):
"""
@@ -201,6 +201,7 @@ class EmbedChain:
def access_search_and_get_results(self, input_query):
from langchain.tools import DuckDuckGoSearchRun
search = DuckDuckGoSearchRun()
logging.info(f"Access search to get answers for {input_query}")
return search.run(input_query)
@@ -218,8 +219,8 @@ class EmbedChain:
"""
if config is None:
config = QueryConfig()
if self.is_code_docs_instance:
config.template = CODE_DOCS_PAGE_PROMPT_TEMPLATE
if self.is_docs_site_instance:
config.template = DOCS_SITE_PROMPT_TEMPLATE
config.number_documents = 5
k = {}
if self.online:
@@ -257,8 +258,8 @@ class EmbedChain:
"""
if config is None:
config = ChatConfig()
if self.is_code_docs_instance:
config.template = CODE_DOCS_PAGE_PROMPT_TEMPLATE
if self.is_docs_site_instance:
config.template = DOCS_SITE_PROMPT_TEMPLATE
config.number_documents = 5
k = {}
if self.online:

View File

@@ -1,64 +0,0 @@
import requests
from bs4 import BeautifulSoup
from embedchain.utils import clean_string
class CodeDocsPageLoader:
    """Load and clean the main textual content of a single code-docs page."""

    def load_data(self, url):
        """Fetch *url* and return its cleaned main content.

        Returns a single-element list of dicts shaped as
        ``{"content": <text>, "meta_data": {"url": url}}``.
        """
        page = requests.get(url).content
        parsed = BeautifulSoup(page, "html.parser")

        # Try progressively more generic selectors until one matches the
        # page's main content region.
        selectors = [
            "article.bd-article",
            'article[role="main"]',
            "div.md-content",
            'div[role="main"]',
            "div.container",
            "div.section",
            "article",
            "main",
        ]
        for selector in selectors:
            match = parsed.select_one(selector)
            if match is not None:
                markup = match.prettify()
                break
        else:
            # No selector matched; fall back to the page's raw text.
            markup = parsed.get_text()

        # Re-parse the extracted region so boilerplate tags can be stripped.
        region = BeautifulSoup(markup, "html.parser")
        noisy_tags = [
            "nav",
            "aside",
            "form",
            "header",
            "noscript",
            "svg",
            "canvas",
            "footer",
            "script",
            "style",
        ]
        for noisy in region(noisy_tags):
            # Blank the tag's text in place rather than removing the tag.
            noisy.string = " "

        # Notebook-style output cells carry no documentation value; drop them.
        for css_class in ("cell_output", "output_wrapper", "output"):
            for cell in region.find_all("div", {"class": css_class}):
                cell.decompose()

        text = clean_string(region.get_text())
        return [
            {
                "content": text,
                "meta_data": {"url": url},
            }
        ]

View File

@@ -0,0 +1,98 @@
import logging
from urllib.parse import urljoin, urlparse
import requests
from bs4 import BeautifulSoup
class DocsSiteLoader:
    """Crawl a documentation website and return cleaned text for every page."""

    # Seconds to wait for any single HTTP request before giving up; without
    # this, one stalled server would hang the whole crawl indefinitely.
    REQUEST_TIMEOUT = 30

    def __init__(self):
        # URLs discovered during the current crawl; reset by _get_all_urls().
        self.visited_links = set()

    def _get_child_links_recursive(self, url):
        """Discover every link nested under *url*'s path.

        Uses an explicit stack instead of function recursion so that large
        or deep documentation sites cannot overflow Python's recursion
        limit. Discovered links accumulate in ``self.visited_links``; the
        starting *url* itself is not added (matching the original
        recursive behavior).
        """
        to_visit = [url]
        while to_visit:
            current = to_visit.pop()
            parsed_url = urlparse(current)
            base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
            current_path = parsed_url.path

            response = requests.get(current, timeout=self.REQUEST_TIMEOUT)
            if response.status_code != 200:
                # Best-effort crawl: log and skip unreachable pages.
                logging.info("Failed to fetch the website: %s", response.status_code)
                continue

            soup = BeautifulSoup(response.text, "html.parser")
            all_links = [link.get("href") for link in soup.find_all("a")]
            # Keep only links that descend strictly below the current path.
            child_links = [
                link for link in all_links
                if link and link.startswith(current_path) and link != current_path
            ]
            for absolute in (urljoin(base_url, link) for link in child_links):
                if absolute not in self.visited_links:
                    self.visited_links.add(absolute)
                    to_visit.append(absolute)

    def _get_all_urls(self, url):
        """Return all crawled URLs that share *url*'s host."""
        self.visited_links = set()
        self._get_child_links_recursive(url)
        # Filter out any off-host links that slipped into the visited set.
        urls = [link for link in self.visited_links if urlparse(link).netloc == urlparse(url).netloc]
        return urls

    def _load_data_from_url(self, url):
        """Fetch one page and return its cleaned content as a list of dicts.

        Returns an empty list when the page cannot be fetched.
        """
        response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        if response.status_code != 200:
            logging.info("Failed to fetch the website: %s", response.status_code)
            return []

        soup = BeautifulSoup(response.content, "html.parser")
        # Progressively more generic selectors for the main content region.
        selectors = [
            "article.bd-article",
            'article[role="main"]',
            "div.md-content",
            'div[role="main"]',
            "div.container",
            "div.section",
            "article",
            "main",
        ]
        output = []
        for selector in selectors:
            element = soup.select_one(selector)
            if element:
                content = element.prettify()
                break
        else:
            # No selector matched; fall back to the page's raw text.
            content = soup.get_text()

        # Re-parse the extracted region and remove boilerplate tags outright.
        soup = BeautifulSoup(content, "html.parser")
        ignored_tags = [
            "nav",
            "aside",
            "form",
            "header",
            "noscript",
            "svg",
            "canvas",
            "footer",
            "script",
            "style",
        ]
        for tag in soup(ignored_tags):
            tag.decompose()

        content = " ".join(soup.stripped_strings)
        output.append(
            {
                "content": content,
                "meta_data": {"url": url},
            }
        )
        return output

    def load_data(self, url):
        """Crawl *url* and return cleaned content for every discovered page."""
        all_urls = self._get_all_urls(url)
        output = []
        for u in all_urls:
            output.extend(self._load_data_from_url(u))
        return output

View File

@@ -1 +0,0 @@
__version__ = "0.0.23"