diff --git a/docs/advanced/data_types.mdx b/docs/advanced/data_types.mdx index 123a41d8..7e37efd1 100644 --- a/docs/advanced/data_types.mdx +++ b/docs/advanced/data_types.mdx @@ -38,6 +38,14 @@ To add any doc/docx file, use the data_type as `docx`. Eg: app.add('docx', 'a_local_docx_file_path') ``` +### Code documentation website loader + +To add any code documentation website as a loader, use the data_type as `docs_site`. Eg: + +```python +app.add("docs_site", "https://docs.embedchain.ai/") +``` + ### Text To supply your own text, use the data_type as `text` and enter a string. The text is not processed, this can be very versatile. Eg: diff --git a/embedchain/chunkers/code_docs_page.py b/embedchain/chunkers/docs_site.py similarity index 87% rename from embedchain/chunkers/code_docs_page.py rename to embedchain/chunkers/docs_site.py index a3470cfa..18e2a42b 100644 --- a/embedchain/chunkers/code_docs_page.py +++ b/embedchain/chunkers/docs_site.py @@ -12,8 +12,8 @@ TEXT_SPLITTER_CHUNK_PARAMS = { } -class CodeDocsPageChunker(BaseChunker): - """Chunker for code docs page.""" +class DocsSiteChunker(BaseChunker): + """Chunker for code docs site.""" def __init__(self, config: Optional[ChunkerConfig] = None): if config is None: diff --git a/embedchain/config/QueryConfig.py b/embedchain/config/QueryConfig.py index 39285a21..8c7a5ba4 100644 --- a/embedchain/config/QueryConfig.py +++ b/embedchain/config/QueryConfig.py @@ -28,7 +28,7 @@ DEFAULT_PROMPT_WITH_HISTORY = """ Helpful Answer: """ # noqa:E501 -CODE_DOCS_PAGE_DEFAULT_PROMPT = """ +DOCS_SITE_DEFAULT_PROMPT = """ Use the following pieces of context to answer the query at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer. Wherever possible, give complete code snippet. Don't make up any code snippet on your own. 
@@ -41,7 +41,7 @@ CODE_DOCS_PAGE_DEFAULT_PROMPT = """ DEFAULT_PROMPT_TEMPLATE = Template(DEFAULT_PROMPT) DEFAULT_PROMPT_WITH_HISTORY_TEMPLATE = Template(DEFAULT_PROMPT_WITH_HISTORY) -CODE_DOCS_PAGE_PROMPT_TEMPLATE = Template(CODE_DOCS_PAGE_DEFAULT_PROMPT) +DOCS_SITE_PROMPT_TEMPLATE = Template(DOCS_SITE_DEFAULT_PROMPT) query_re = re.compile(r"\$\{*query\}*") context_re = re.compile(r"\$\{*context\}*") history_re = re.compile(r"\$\{*history\}*") diff --git a/embedchain/data_formatter/data_formatter.py b/embedchain/data_formatter/data_formatter.py index 10c8c68b..3e941fea 100644 --- a/embedchain/data_formatter/data_formatter.py +++ b/embedchain/data_formatter/data_formatter.py @@ -1,4 +1,4 @@ -from embedchain.chunkers.code_docs_page import CodeDocsPageChunker +from embedchain.chunkers.docs_site import DocsSiteChunker from embedchain.chunkers.docx_file import DocxFileChunker from embedchain.chunkers.pdf_file import PdfFileChunker from embedchain.chunkers.qna_pair import QnaPairChunker @@ -6,7 +6,7 @@ from embedchain.chunkers.text import TextChunker from embedchain.chunkers.web_page import WebPageChunker from embedchain.chunkers.youtube_video import YoutubeVideoChunker from embedchain.config import AddConfig -from embedchain.loaders.code_docs_page import CodeDocsPageLoader +from embedchain.loaders.docs_site_loader import DocsSiteLoader from embedchain.loaders.docx_file import DocxFileLoader from embedchain.loaders.local_qna_pair import LocalQnaPairLoader from embedchain.loaders.local_text import LocalTextLoader @@ -43,7 +43,7 @@ class DataFormatter: "text": LocalTextLoader(), "docx": DocxFileLoader(), "sitemap": SitemapLoader(), - "code_docs_page": CodeDocsPageLoader(), + "docs_site": DocsSiteLoader(), } if data_type in loaders: return loaders[data_type] @@ -66,7 +66,7 @@ class DataFormatter: "text": TextChunker(config), "docx": DocxFileChunker(config), "sitemap": WebPageChunker(config), - "code_docs_page": CodeDocsPageChunker(config), + "docs_site": 
DocsSiteChunker(config), } if data_type in chunkers: return chunkers[data_type] diff --git a/embedchain/embedchain.py b/embedchain/embedchain.py index 216b9d41..cc88f887 100644 --- a/embedchain/embedchain.py +++ b/embedchain/embedchain.py @@ -9,7 +9,7 @@ from langchain.docstore.document import Document from langchain.memory import ConversationBufferMemory from embedchain.config import AddConfig, ChatConfig, InitConfig, QueryConfig -from embedchain.config.QueryConfig import CODE_DOCS_PAGE_PROMPT_TEMPLATE, DEFAULT_PROMPT, DEFAULT_PROMPT_WITH_HISTORY +from embedchain.config.QueryConfig import DOCS_SITE_PROMPT_TEMPLATE, DEFAULT_PROMPT, DEFAULT_PROMPT_WITH_HISTORY from embedchain.data_formatter import DataFormatter gpt4all_model = None @@ -35,7 +35,7 @@ class EmbedChain: self.db_client = self.config.db.client self.collection = self.config.db.collection self.user_asks = [] - self.is_code_docs_instance = False + self.is_docs_site_instance = False self.online = False def add(self, data_type, url, metadata=None, config: AddConfig = None): @@ -56,8 +56,8 @@ class EmbedChain: data_formatter = DataFormatter(data_type, config) self.user_asks.append([data_type, url, metadata]) self.load_and_embed(data_formatter.loader, data_formatter.chunker, url, metadata) - if data_type in ("code_docs_page",): - self.is_code_docs_instance = True + if data_type in ("docs_site",): + self.is_docs_site_instance = True def add_local(self, data_type, content, metadata=None, config: AddConfig = None): """ @@ -201,6 +201,7 @@ class EmbedChain: def access_search_and_get_results(self, input_query): from langchain.tools import DuckDuckGoSearchRun + search = DuckDuckGoSearchRun() logging.info(f"Access search to get answers for {input_query}") return search.run(input_query) @@ -218,8 +219,8 @@ class EmbedChain: """ if config is None: config = QueryConfig() - if self.is_code_docs_instance: - config.template = CODE_DOCS_PAGE_PROMPT_TEMPLATE + if self.is_docs_site_instance: + config.template = 
DOCS_SITE_PROMPT_TEMPLATE config.number_documents = 5 k = {} if self.online: @@ -257,8 +258,8 @@ class EmbedChain: """ if config is None: config = ChatConfig() - if self.is_code_docs_instance: - config.template = CODE_DOCS_PAGE_PROMPT_TEMPLATE + if self.is_docs_site_instance: + config.template = DOCS_SITE_PROMPT_TEMPLATE config.number_documents = 5 k = {} if self.online: diff --git a/embedchain/loaders/code_docs_page.py b/embedchain/loaders/code_docs_page.py deleted file mode 100644 index 5d4e1720..00000000 --- a/embedchain/loaders/code_docs_page.py +++ /dev/null @@ -1,64 +0,0 @@ -import requests -from bs4 import BeautifulSoup - -from embedchain.utils import clean_string - - -class CodeDocsPageLoader: - def load_data(self, url): - """Load data from a web page.""" - response = requests.get(url) - data = response.content - soup = BeautifulSoup(data, "html.parser") - selectors = [ - "article.bd-article", - 'article[role="main"]', - "div.md-content", - 'div[role="main"]', - "div.container", - "div.section", - "article", - "main", - ] - content = None - for selector in selectors: - element = soup.select_one(selector) - if element is not None: - content = element.prettify() - break - if not content: - content = soup.get_text() - soup = BeautifulSoup(content, "html.parser") - for tag in soup( - [ - "nav", - "aside", - "form", - "header", - "noscript", - "svg", - "canvas", - "footer", - "script", - "style", - ] - ): - tag.string = " " - for div in soup.find_all("div", {"class": "cell_output"}): - div.decompose() - for div in soup.find_all("div", {"class": "output_wrapper"}): - div.decompose() - for div in soup.find_all("div", {"class": "output"}): - div.decompose() - content = clean_string(soup.get_text()) - output = [] - meta_data = { - "url": url, - } - output.append( - { - "content": content, - "meta_data": meta_data, - } - ) - return output diff --git a/embedchain/loaders/docs_site_loader.py b/embedchain/loaders/docs_site_loader.py new file mode 100644 index 
00000000..404ba8b1 --- /dev/null +++ b/embedchain/loaders/docs_site_loader.py @@ -0,0 +1,98 @@ +import logging +from urllib.parse import urljoin, urlparse + +import requests +from bs4 import BeautifulSoup + + +class DocsSiteLoader: + def __init__(self): + self.visited_links = set() + + def _get_child_links_recursive(self, url): + parsed_url = urlparse(url) + base_url = f"{parsed_url.scheme}://{parsed_url.netloc}" + current_path = parsed_url.path + + response = requests.get(url) + if response.status_code != 200: + logging.info(f"Failed to fetch the website: {response.status_code}") + return + + soup = BeautifulSoup(response.text, "html.parser") + all_links = [link.get("href") for link in soup.find_all("a")] + + child_links = [link for link in all_links if link and link.startswith(current_path) and link != current_path] + + absolute_paths = [urljoin(base_url, link) for link in child_links] + + for link in absolute_paths: + if link not in self.visited_links: + self.visited_links.add(link) + self._get_child_links_recursive(link) + + def _get_all_urls(self, url): + self.visited_links = set() + self._get_child_links_recursive(url) + urls = [link for link in self.visited_links if urlparse(link).netloc == urlparse(url).netloc] + return urls + + def _load_data_from_url(self, url): + response = requests.get(url) + if response.status_code != 200: + logging.info(f"Failed to fetch the website: {response.status_code}") + return [] + + soup = BeautifulSoup(response.content, "html.parser") + selectors = [ + "article.bd-article", + 'article[role="main"]', + "div.md-content", + 'div[role="main"]', + "div.container", + "div.section", + "article", + "main", + ] + + output = [] + for selector in selectors: + element = soup.select_one(selector) + if element: + content = element.prettify() + break + else: + content = soup.get_text() + + soup = BeautifulSoup(content, "html.parser") + ignored_tags = [ + "nav", + "aside", + "form", + "header", + "noscript", + "svg", + "canvas", + "footer", 
+ "script", + "style", + ] + for tag in soup(ignored_tags): + tag.decompose() + + content = " ".join(soup.stripped_strings) + output.append( + { + "content": content, + "meta_data": {"url": url}, + } + ) + + return output + + def load_data(self, url): + all_urls = self._get_all_urls(url) + output = [] + for u in all_urls: + output.extend(self._load_data_from_url(u)) + return output diff --git a/embedchain/version.py b/embedchain/version.py deleted file mode 100644 index 40b07ef1..00000000 --- a/embedchain/version.py +++ /dev/null @@ -1 +0,0 @@ -__version__ = "0.0.23" diff --git a/notebooks/embedchain-docs-site-example.ipynb b/notebooks/embedchain-docs-site-example.ipynb new file mode 100644 index 00000000..bde7d5d9 --- /dev/null +++ b/notebooks/embedchain-docs-site-example.ipynb @@ -0,0 +1,49 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "id": "9743e7ee", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "embedchain not found\r\n" + ] + } + ], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e3b55735", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/setup.py b/setup.py index bdaae363..961a02ac 100644 --- a/setup.py +++ b/setup.py @@ -1,15 +1,11 @@ -import importlib.metadata - import setuptools -version = importlib.metadata.version(__package__ or __name__) - with open("README.md", "r", encoding="utf-8") as fh: long_description = fh.read() setuptools.setup( name="embedchain", - version=version, + version="0.0.23", 
author="Taranjeet Singh", author_email="reachtotj@gmail.com", description="embedchain is a framework to easily create LLM powered bots over any dataset", # noqa:E501