feat: Add new data type: code_docs_loader (#274)

Taranjeet Singh
2023-07-15 09:02:11 +05:30
committed by GitHub
parent cd0c7bc971
commit 86e4146126
7 changed files with 133 additions and 6 deletions

embedchain/chunkers/base_chunker.py

@@ -25,7 +25,7 @@ class BaseChunker:
            meta_data = data["meta_data"]
            url = meta_data["url"]
-            chunks = self.text_splitter.split_text(content)
+            chunks = self.get_chunks(content)
            for chunk in chunks:
                chunk_id = hashlib.sha256((chunk + url).encode()).hexdigest()
@@ -39,3 +39,11 @@ class BaseChunker:
            "ids": ids,
            "metadatas": metadatas,
        }

    def get_chunks(self, content):
        """
        Returns chunks using text splitter instance.
        Override in child class if custom logic.
        """
        return self.text_splitter.split_text(content)
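get_chunks() is the new extension point: a subclass can swap in its own splitting logic without touching the id/metadata bookkeeping above. A minimal illustrative sketch (the LineChunker name and the None splitter are assumptions, not part of the commit):

from embedchain.chunkers.base_chunker import BaseChunker


class LineChunker(BaseChunker):
    """Hypothetical subclass: chunk on non-empty lines instead of a text splitter."""

    def __init__(self):
        # No text splitter is needed because get_chunks() is overridden below.
        super().__init__(None)

    def get_chunks(self, content):
        # Custom logic replacing the default split_text() call.
        return [line for line in content.splitlines() if line.strip()]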

embedchain/chunkers/code_docs_page.py

@@ -0,0 +1,22 @@
from typing import Optional

from langchain.text_splitter import RecursiveCharacterTextSplitter

from embedchain.chunkers.base_chunker import BaseChunker
from embedchain.config.AddConfig import ChunkerConfig

TEXT_SPLITTER_CHUNK_PARAMS = {
    "chunk_size": 500,
    "chunk_overlap": 50,
    "length_function": len,
}


class CodeDocsPageChunker(BaseChunker):
    """Chunker for code docs page."""

    def __init__(self, config: Optional[ChunkerConfig] = None):
        if config is None:
            config = TEXT_SPLITTER_CHUNK_PARAMS
        text_splitter = RecursiveCharacterTextSplitter(**config)
        super().__init__(text_splitter)
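A quick way to exercise the defaults (the sample text is a placeholder; get_chunks() is the hook added to BaseChunker above):

from embedchain.chunkers.code_docs_page import CodeDocsPageChunker

chunker = CodeDocsPageChunker()  # no config, so TEXT_SPLITTER_CHUNK_PARAMS is used
# RecursiveCharacterTextSplitter yields ~500-character chunks with 50 characters of overlap.
chunks = chunker.get_chunks("Some documentation text about the add() API. " * 100)
print(len(chunks), len(chunks[0]))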

embedchain/config/QueryConfig.py

@@ -17,7 +17,7 @@ DEFAULT_PROMPT = """
DEFAULT_PROMPT_WITH_HISTORY = """
Use the following pieces of context to answer the query at the end.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
I will provide you with our conversation history.
$context
@@ -28,8 +28,20 @@ DEFAULT_PROMPT_WITH_HISTORY = """
Helpful Answer:
""" # noqa:E501
CODE_DOCS_PAGE_DEFAULT_PROMPT = """
Use the following pieces of context to answer the query at the end.
If you don't know the answer, just say that you don't know, don't try to make up an answer. Wherever possible, give complete code snippet. Dont make up any code snippet on your own.
$context
Query: $query
Helpful Answer:
""" # noqa:E501
DEFAULT_PROMPT_TEMPLATE = Template(DEFAULT_PROMPT)
DEFAULT_PROMPT_WITH_HISTORY_TEMPLATE = Template(DEFAULT_PROMPT_WITH_HISTORY)
CODE_DOCS_PAGE_PROMPT_TEMPLATE = Template(CODE_DOCS_PAGE_DEFAULT_PROMPT)
query_re = re.compile(r"\$\{*query\}*")
context_re = re.compile(r"\$\{*context\}*")
history_re = re.compile(r"\$\{*history\}*")
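CODE_DOCS_PAGE_PROMPT_TEMPLATE is a plain string.Template with $context and $query placeholders (and no $history), so rendering a prompt is a single substitute() call. A sketch with placeholder values:

from embedchain.config.QueryConfig import CODE_DOCS_PAGE_PROMPT_TEMPLATE

prompt = CODE_DOCS_PAGE_PROMPT_TEMPLATE.substitute(
    context="app.add('web_page', url) indexes a single page.",  # placeholder context
    query="How do I add a web page?",  # placeholder query
)
print(prompt)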

embedchain/data_formatter/data_formatter.py

@@ -1,3 +1,4 @@
from embedchain.chunkers.code_docs_page import CodeDocsPageChunker
from embedchain.chunkers.docx_file import DocxFileChunker
from embedchain.chunkers.pdf_file import PdfFileChunker
from embedchain.chunkers.qna_pair import QnaPairChunker
@@ -5,6 +6,7 @@ from embedchain.chunkers.text import TextChunker
from embedchain.chunkers.web_page import WebPageChunker
from embedchain.chunkers.youtube_video import YoutubeVideoChunker
from embedchain.config import AddConfig
from embedchain.loaders.code_docs_page import CodeDocsPageLoader
from embedchain.loaders.docx_file import DocxFileLoader
from embedchain.loaders.local_qna_pair import LocalQnaPairLoader
from embedchain.loaders.local_text import LocalTextLoader
@@ -41,6 +43,7 @@ class DataFormatter:
"text": LocalTextLoader(),
"docx": DocxFileLoader(),
"sitemap": SitemapLoader(),
"code_docs_page": CodeDocsPageLoader(),
}
if data_type in loaders:
return loaders[data_type]
@@ -63,6 +66,7 @@ class DataFormatter:
"text": TextChunker(config),
"docx": DocxFileChunker(config),
"sitemap": WebPageChunker(config),
"code_docs_page": CodeDocsPageChunker(config)
}
if data_type in chunkers:
return chunkers[data_type]
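With both registrations in place, the "code_docs_page" key resolves to the new loader/chunker pair. A small sanity-check sketch, assuming DataFormatter is constructed as DataFormatter(data_type, config) and exposes .loader/.chunker the way EmbedChain.add() uses them below:

from embedchain.config import AddConfig
from embedchain.data_formatter import DataFormatter

formatter = DataFormatter("code_docs_page", AddConfig())
print(type(formatter.loader).__name__)   # expected: CodeDocsPageLoader
print(type(formatter.chunker).__name__)  # expected: CodeDocsPageChunker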

embedchain/embedchain.py

@@ -9,7 +9,7 @@ from langchain.docstore.document import Document
from langchain.memory import ConversationBufferMemory
from embedchain.config import AddConfig, ChatConfig, InitConfig, QueryConfig
-from embedchain.config.QueryConfig import DEFAULT_PROMPT
+from embedchain.config.QueryConfig import DEFAULT_PROMPT, CODE_DOCS_PAGE_PROMPT_TEMPLATE
from embedchain.data_formatter import DataFormatter
gpt4all_model = None
@@ -35,6 +35,7 @@ class EmbedChain:
        self.db_client = self.config.db.client
        self.collection = self.config.db.collection
        self.user_asks = []
        self.is_code_docs_instance = False

    def add(self, data_type, url, metadata=None, config: AddConfig = None):
        """
@@ -56,6 +57,8 @@
        self.load_and_embed(
            data_formatter.loader, data_formatter.chunker, url, metadata
        )
        if data_type in ("code_docs_page", ):
            self.is_code_docs_instance = True

    def add_local(self, data_type, content, metadata=None, config: AddConfig = None):
        """
@@ -211,6 +214,9 @@ class EmbedChain:
"""
if config is None:
config = QueryConfig()
if self.is_code_docs_instance:
config.template = CODE_DOCS_PAGE_PROMPT_TEMPLATE
config.number_documents = 5
contexts = self.retrieve_from_database(input_query, config)
prompt = self.generate_prompt(input_query, contexts, config)
logging.info(f"Prompt: {prompt}")
@@ -244,7 +250,9 @@ class EmbedChain:
"""
if config is None:
config = ChatConfig()
if self.is_code_docs_instance:
config.template = CODE_DOCS_PAGE_PROMPT_TEMPLATE
config.number_documents = 5
contexts = self.retrieve_from_database(input_query, config)
global memory
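Taken together, adding a code_docs_page source changes how later query()/chat() calls are prompted. A usage sketch, assuming the package's App entry point wraps EmbedChain (it typically needs OPENAI_API_KEY set); the URL is a placeholder:

from embedchain import App  # assumed top-level wrapper around EmbedChain

app = App()
# Adding a code_docs_page source sets is_code_docs_instance, so query()/chat()
# switch to CODE_DOCS_PAGE_PROMPT_TEMPLATE and retrieve 5 documents.
app.add("code_docs_page", "https://example.com/docs/getting-started")  # placeholder URL
print(app.query("How do I configure the text splitter?"))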

embedchain/loaders/code_docs_page.py

@@ -0,0 +1,63 @@
import requests
from bs4 import BeautifulSoup
from embedchain.utils import clean_string
class CodeDocsPageLoader:
    def load_data(self, url):
        """Load data from a web page."""
        response = requests.get(url)
        data = response.content
        soup = BeautifulSoup(data, "html.parser")
        selectors = [
            'article.bd-article',
            'article[role="main"]',
            'div.md-content',
            'div[role="main"]',
            'div.container',
            'div.section',
            'article',
            'main',
        ]
        content = None
        for selector in selectors:
            element = soup.select_one(selector)
            if element is not None:
                content = element.prettify()
                break
        if not content:
            content = soup.get_text()
        soup = BeautifulSoup(content, "html.parser")
        for tag in soup(
            [
                "nav",
                "aside",
                "form",
                "header",
                "noscript",
                "svg",
                "canvas",
                "footer",
                "script",
                "style",
            ]
        ):
            tag.string = " "
        for div in soup.find_all("div", {'class': 'cell_output'}):
            div.decompose()
        for div in soup.find_all("div", {'class': 'output_wrapper'}):
            div.decompose()
        for div in soup.find_all("div", {'class': 'output'}):
            div.decompose()
        content = clean_string(soup.get_text())
        output = []
        meta_data = {
            "url": url,
        }
        output.append(
            {
                "content": content,
                "meta_data": meta_data,
            }
        )
        return output
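The loader can also be exercised on its own; it returns a one-element list holding the cleaned page text and the source URL. A sketch with a placeholder URL:

from embedchain.loaders.code_docs_page import CodeDocsPageLoader

docs = CodeDocsPageLoader().load_data("https://example.com/docs/quickstart")  # placeholder URL
print(docs[0]["meta_data"]["url"])
print(docs[0]["content"][:200])  # first 200 characters of the cleaned text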