[Improvements] Improve the default prompt and data loader util functions (#1272)

This commit is contained in:
Deshraj Yadav
2024-02-18 14:06:32 -08:00
committed by GitHub
parent 9a11683003
commit 6c12bc9044
19 changed files with 79 additions and 62 deletions

View File

@@ -7,40 +7,60 @@ from embedchain.config.base_config import BaseConfig
from embedchain.helpers.json_serializable import register_deserializable from embedchain.helpers.json_serializable import register_deserializable
DEFAULT_PROMPT = """ DEFAULT_PROMPT = """
Use the following pieces of context to answer the query at the end. You are a Q&A expert system. Your responses must always be rooted in the context provided for each query. Here are some guidelines to follow:
If you don't know the answer, just say that you don't know, don't try to make up an answer.
$context 1. Refrain from explicitly mentioning the context provided in your response.
2. The context should silently guide your answers without being directly acknowledged.
3. Do not use phrases such as 'According to the context provided', 'Based on the context, ...' etc.
Query: $query Context information:
----------------------
$context
----------------------
Helpful Answer: Query: $query
Answer:
""" # noqa:E501 """ # noqa:E501
DEFAULT_PROMPT_WITH_HISTORY = """ DEFAULT_PROMPT_WITH_HISTORY = """
Use the following pieces of context to answer the query at the end. You are a Q&A expert system. Your responses must always be rooted in the context provided for each query. You are also provided with the conversation history with the user. Make sure to use relevant context from conversation history as needed.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
I will provide you with our conversation history.
$context Here are some guidelines to follow:
History: 1. Refrain from explicitly mentioning the context provided in your response.
$history 2. The context should silently guide your answers without being directly acknowledged.
3. Do not use phrases such as 'According to the context provided', 'Based on the context, ...' etc.
Query: $query Context information:
----------------------
$context
----------------------
Helpful Answer: Conversation history:
----------------------
$history
----------------------
Query: $query
Answer:
""" # noqa:E501 """ # noqa:E501
DOCS_SITE_DEFAULT_PROMPT = """ DOCS_SITE_DEFAULT_PROMPT = """
Use the following pieces of context to answer the query at the end. You are an expert AI assistant for developer support product. Your responses must always be rooted in the context provided for each query. Wherever possible, give complete code snippet. Dont make up any code snippet on your own.
If you don't know the answer, just say that you don't know, don't try to make up an answer. Wherever possible, give complete code snippet. Dont make up any code snippet on your own.
$context Here are some guidelines to follow:
Query: $query 1. Refrain from explicitly mentioning the context provided in your response.
2. The context should silently guide your answers without being directly acknowledged.
3. Do not use phrases such as 'According to the context provided', 'Based on the context, ...' etc.
Helpful Answer: Context information:
----------------------
$context
----------------------
Query: $query
Answer:
""" # noqa:E501 """ # noqa:E501
DEFAULT_PROMPT_TEMPLATE = Template(DEFAULT_PROMPT) DEFAULT_PROMPT_TEMPLATE = Template(DEFAULT_PROMPT)

View File

@@ -131,7 +131,7 @@ class DiscordLoader(BaseLoader):
client = DiscordClient(intents=intents) client = DiscordClient(intents=intents)
client.run(self.token) client.run(self.token)
meta_data = { metadata = {
"url": channel_id, "url": channel_id,
} }
@@ -144,7 +144,7 @@ class DiscordLoader(BaseLoader):
"data": [ "data": [
{ {
"content": messages, "content": messages,
"meta_data": meta_data, "meta_data": metadata,
} }
], ],
} }

View File

@@ -39,7 +39,7 @@ class DiscourseLoader(BaseLoader):
return return
response_data = response.json() response_data = response.json()
post_contents = clean_string(response_data.get("raw")) post_contents = clean_string(response_data.get("raw"))
meta_data = { metadata = {
"url": post_url, "url": post_url,
"created_at": response_data.get("created_at", ""), "created_at": response_data.get("created_at", ""),
"username": response_data.get("username", ""), "username": response_data.get("username", ""),
@@ -48,7 +48,7 @@ class DiscourseLoader(BaseLoader):
} }
data = { data = {
"content": post_contents, "content": post_contents,
"meta_data": meta_data, "meta_data": metadata,
} }
return data return data

View File

@@ -18,9 +18,9 @@ class DocxFileLoader(BaseLoader):
output = [] output = []
data = loader.load() data = loader.load()
content = data[0].page_content content = data[0].page_content
meta_data = data[0].metadata metadata = data[0].metadata
meta_data["url"] = "local" metadata["url"] = "local"
output.append({"content": content, "meta_data": meta_data}) output.append({"content": content, "meta_data": metadata})
doc_id = hashlib.sha256((content + url).encode()).hexdigest() doc_id = hashlib.sha256((content + url).encode()).hexdigest()
return { return {
"doc_id": doc_id, "doc_id": doc_id,

View File

@@ -11,14 +11,14 @@ class LocalQnaPairLoader(BaseLoader):
question, answer = content question, answer = content
content = f"Q: {question}\nA: {answer}" content = f"Q: {question}\nA: {answer}"
url = "local" url = "local"
meta_data = {"url": url, "question": question} metadata = {"url": url, "question": question}
doc_id = hashlib.sha256((content + url).encode()).hexdigest() doc_id = hashlib.sha256((content + url).encode()).hexdigest()
return { return {
"doc_id": doc_id, "doc_id": doc_id,
"data": [ "data": [
{ {
"content": content, "content": content,
"meta_data": meta_data, "meta_data": metadata,
} }
], ],
} }

View File

@@ -9,7 +9,7 @@ class LocalTextLoader(BaseLoader):
def load_data(self, content): def load_data(self, content):
"""Load data from a local text file.""" """Load data from a local text file."""
url = "local" url = "local"
meta_data = { metadata = {
"url": url, "url": url,
} }
doc_id = hashlib.sha256((content + url).encode()).hexdigest() doc_id = hashlib.sha256((content + url).encode()).hexdigest()
@@ -18,7 +18,7 @@ class LocalTextLoader(BaseLoader):
"data": [ "data": [
{ {
"content": content, "content": content,
"meta_data": meta_data, "meta_data": metadata,
} }
], ],
} }

View File

@@ -10,7 +10,7 @@ class MdxLoader(BaseLoader):
"""Load data from a mdx file.""" """Load data from a mdx file."""
with open(url, "r", encoding="utf-8") as infile: with open(url, "r", encoding="utf-8") as infile:
content = infile.read() content = infile.read()
meta_data = { metadata = {
"url": url, "url": url,
} }
doc_id = hashlib.sha256((content + url).encode()).hexdigest() doc_id = hashlib.sha256((content + url).encode()).hexdigest()
@@ -19,7 +19,7 @@ class MdxLoader(BaseLoader):
"data": [ "data": [
{ {
"content": content, "content": content,
"meta_data": meta_data, "meta_data": metadata,
} }
], ],
} }

View File

@@ -35,8 +35,8 @@ class OpenAPILoader(BaseLoader):
yaml_data = yaml.load(file, Loader=yaml.SafeLoader) yaml_data = yaml.load(file, Loader=yaml.SafeLoader)
for i, (key, value) in enumerate(yaml_data.items()): for i, (key, value) in enumerate(yaml_data.items()):
string_data = f"{key}: {value}" string_data = f"{key}: {value}"
meta_data = {"url": file_path, "row": i + 1} metadata = {"url": file_path, "row": i + 1}
data.append({"content": string_data, "meta_data": meta_data}) data.append({"content": string_data, "meta_data": metadata})
data_content.append(string_data) data_content.append(string_data)
doc_id = hashlib.sha256((content + ", ".join(data_content)).encode()).hexdigest() doc_id = hashlib.sha256((content + ", ".join(data_content)).encode()).hexdigest()
return {"doc_id": doc_id, "data": data} return {"doc_id": doc_id, "data": data}

View File

@@ -27,12 +27,12 @@ class PdfFileLoader(BaseLoader):
for page in pages: for page in pages:
content = page.page_content content = page.page_content
content = clean_string(content) content = clean_string(content)
meta_data = page.metadata metadata = page.metadata
meta_data["url"] = url metadata["url"] = url
data.append( data.append(
{ {
"content": content, "content": content,
"meta_data": meta_data, "meta_data": metadata,
} }
) )
all_content.append(content) all_content.append(content)

View File

@@ -41,12 +41,12 @@ class RSSFeedLoader(BaseLoader):
data = loader.load() data = loader.load()
for entry in data: for entry in data:
meta_data = RSSFeedLoader.serialize_metadata(entry.metadata) metadata = RSSFeedLoader.serialize_metadata(entry.metadata)
meta_data.update({"url": url}) metadata.update({"url": url})
output.append( output.append(
{ {
"content": entry.page_content, "content": entry.page_content,
"meta_data": meta_data, "meta_data": metadata,
} }
) )

View File

@@ -88,16 +88,16 @@ class SlackLoader(BaseLoader):
content = clean_string(text) content = clean_string(text)
message_meta_data_keys = ["iid", "team", "ts", "type", "user", "username"] message_meta_data_keys = ["iid", "team", "ts", "type", "user", "username"]
meta_data = {} metadata = {}
for key in message.keys(): for key in message.keys():
if key in message_meta_data_keys: if key in message_meta_data_keys:
meta_data[key] = message.get(key) metadata[key] = message.get(key)
meta_data.update({"url": url}) metadata.update({"url": url})
data.append( data.append(
{ {
"content": content, "content": content,
"meta_data": meta_data, "meta_data": metadata,
} }
) )
data_content.append(content) data_content.append(content)

View File

@@ -17,14 +17,14 @@ class TextFileLoader(BaseLoader):
doc_id = hashlib.sha256((content + url).encode()).hexdigest() doc_id = hashlib.sha256((content + url).encode()).hexdigest()
meta_data = {"url": url, "file_size": os.path.getsize(url), "file_type": url.split(".")[-1]} metadata = {"url": url, "file_size": os.path.getsize(url), "file_type": url.split(".")[-1]}
return { return {
"doc_id": doc_id, "doc_id": doc_id,
"data": [ "data": [
{ {
"content": content, "content": content,
"meta_data": meta_data, "meta_data": metadata,
} }
], ],
} }

View File

@@ -26,12 +26,12 @@ class UnstructuredLoader(BaseLoader):
for page in pages: for page in pages:
content = page.page_content content = page.page_content
content = clean_string(content) content = clean_string(content)
meta_data = page.metadata metadata = page.metadata
meta_data["url"] = url metadata["url"] = url
data.append( data.append(
{ {
"content": content, "content": content,
"meta_data": meta_data, "meta_data": metadata,
} }
) )
all_content.append(content) all_content.append(content)

View File

@@ -30,7 +30,7 @@ class WebPageLoader(BaseLoader):
data = response.content data = response.content
content = self._get_clean_content(data, url) content = self._get_clean_content(data, url)
meta_data = {"url": url} metadata = {"url": url}
doc_id = hashlib.sha256((content + url).encode()).hexdigest() doc_id = hashlib.sha256((content + url).encode()).hexdigest()
return { return {
@@ -38,7 +38,7 @@ class WebPageLoader(BaseLoader):
"data": [ "data": [
{ {
"content": content, "content": content,
"meta_data": meta_data, "meta_data": metadata,
} }
], ],
} }

View File

@@ -19,10 +19,10 @@ class XmlLoader(BaseLoader):
data = loader.load() data = loader.load()
content = data[0].page_content content = data[0].page_content
content = clean_string(content) content = clean_string(content)
meta_data = data[0].metadata metadata = data[0].metadata
meta_data["url"] = meta_data["source"] metadata["url"] = metadata["source"]
del meta_data["source"] del metadata["source"]
output = [{"content": content, "meta_data": meta_data}] output = [{"content": content, "meta_data": metadata}]
doc_id = hashlib.sha256((content + xml_url).encode()).hexdigest() doc_id = hashlib.sha256((content + xml_url).encode()).hexdigest()
return { return {
"doc_id": doc_id, "doc_id": doc_id,

View File

@@ -22,13 +22,13 @@ class YoutubeVideoLoader(BaseLoader):
raise ValueError(f"No data found for url: {url}") raise ValueError(f"No data found for url: {url}")
content = doc[0].page_content content = doc[0].page_content
content = clean_string(content) content = clean_string(content)
meta_data = doc[0].metadata metadata = doc[0].metadata
meta_data["url"] = url metadata["url"] = url
output.append( output.append(
{ {
"content": content, "content": content,
"meta_data": meta_data, "meta_data": metadata,
} }
) )
doc_id = hashlib.sha256((content + url).encode()).hexdigest() doc_id = hashlib.sha256((content + url).encode()).hexdigest()

View File

@@ -79,9 +79,6 @@ def clean_string(text):
cleaned_text (str): The cleaned text after all the cleaning operations cleaned_text (str): The cleaned text after all the cleaning operations
have been performed. have been performed.
""" """
# Replacement of newline characters:
text = text.replace("\n", " ")
# Stripping and reducing multiple spaces to single: # Stripping and reducing multiple spaces to single:
cleaned_text = re.sub(r"\s+", " ", text.strip()) cleaned_text = re.sub(r"\s+", " ", text.strip())

View File

@@ -1,6 +1,6 @@
[tool.poetry] [tool.poetry]
name = "embedchain" name = "embedchain"
version = "0.1.80" version = "0.1.81"
description = "Simplest open source retrieval(RAG) framework" description = "Simplest open source retrieval(RAG) framework"
authors = [ authors = [
"Taranjeet Singh <taranjeet@embedchain.ai>", "Taranjeet Singh <taranjeet@embedchain.ai>",

View File

@@ -64,7 +64,7 @@ class TestApp(unittest.TestCase):
self.assertEqual(len(app.llm.history), 1) self.assertEqual(len(app.llm.history), 1)
history = app.llm.history history = app.llm.history
dry_run = app.chat("Test query 2", dry_run=True) dry_run = app.chat("Test query 2", dry_run=True)
self.assertIn("History:", dry_run) self.assertIn("Conversation history:", dry_run)
self.assertEqual(history, app.llm.history) self.assertEqual(history, app.llm.history)
self.assertEqual(len(app.llm.history), 1) self.assertEqual(len(app.llm.history), 1)