diff --git a/embedchain/config/llm/base.py b/embedchain/config/llm/base.py index 3619b924..3059aef0 100644 --- a/embedchain/config/llm/base.py +++ b/embedchain/config/llm/base.py @@ -7,40 +7,60 @@ from embedchain.config.base_config import BaseConfig from embedchain.helpers.json_serializable import register_deserializable DEFAULT_PROMPT = """ - Use the following pieces of context to answer the query at the end. - If you don't know the answer, just say that you don't know, don't try to make up an answer. +You are a Q&A expert system. Your responses must always be rooted in the context provided for each query. Here are some guidelines to follow: - $context +1. Refrain from explicitly mentioning the context provided in your response. +2. The context should silently guide your answers without being directly acknowledged. +3. Do not use phrases such as 'According to the context provided', 'Based on the context, ...' etc. - Query: $query +Context information: +---------------------- +$context +---------------------- - Helpful Answer: +Query: $query +Answer: """ # noqa:E501 DEFAULT_PROMPT_WITH_HISTORY = """ - Use the following pieces of context to answer the query at the end. - If you don't know the answer, just say that you don't know, don't try to make up an answer. - I will provide you with our conversation history. +You are a Q&A expert system. Your responses must always be rooted in the context provided for each query. You are also provided with the conversation history with the user. Make sure to use relevant context from conversation history as needed. - $context +Here are some guidelines to follow: - History: - $history +1. Refrain from explicitly mentioning the context provided in your response. +2. The context should silently guide your answers without being directly acknowledged. +3. Do not use phrases such as 'According to the context provided', 'Based on the context, ...' etc. - Query: $query +Context information: +---------------------- +$context +---------------------- - Helpful Answer: +Conversation history: +---------------------- +$history +---------------------- + +Query: $query +Answer: """ # noqa:E501 DOCS_SITE_DEFAULT_PROMPT = """ - Use the following pieces of context to answer the query at the end. - If you don't know the answer, just say that you don't know, don't try to make up an answer. Wherever possible, give complete code snippet. Dont make up any code snippet on your own. +You are an expert AI assistant for developer support product. Your responses must always be rooted in the context provided for each query. Wherever possible, give complete code snippet. Dont make up any code snippet on your own. - $context +Here are some guidelines to follow: - Query: $query +1. Refrain from explicitly mentioning the context provided in your response. +2. The context should silently guide your answers without being directly acknowledged. +3. Do not use phrases such as 'According to the context provided', 'Based on the context, ...' etc. - Helpful Answer: +Context information: +---------------------- +$context +---------------------- + +Query: $query +Answer: """ # noqa:E501 DEFAULT_PROMPT_TEMPLATE = Template(DEFAULT_PROMPT) diff --git a/embedchain/loaders/discord.py b/embedchain/loaders/discord.py index 9e8bbff4..7db210ad 100644 --- a/embedchain/loaders/discord.py +++ b/embedchain/loaders/discord.py @@ -131,7 +131,7 @@ class DiscordLoader(BaseLoader): client = DiscordClient(intents=intents) client.run(self.token) - meta_data = { + metadata = { "url": channel_id, } @@ -144,7 +144,7 @@ class DiscordLoader(BaseLoader): "data": [ { "content": messages, - "meta_data": meta_data, + "meta_data": metadata, } ], } diff --git a/embedchain/loaders/discourse.py b/embedchain/loaders/discourse.py index b16e4d4c..1d36efa7 100644 --- a/embedchain/loaders/discourse.py +++ b/embedchain/loaders/discourse.py @@ -39,7 +39,7 @@ class DiscourseLoader(BaseLoader): return response_data = response.json() post_contents = clean_string(response_data.get("raw")) - meta_data = { + metadata = { "url": post_url, "created_at": response_data.get("created_at", ""), "username": response_data.get("username", ""), @@ -48,7 +48,7 @@ class DiscourseLoader(BaseLoader): } data = { "content": post_contents, - "meta_data": meta_data, + "meta_data": metadata, } return data diff --git a/embedchain/loaders/docx_file.py b/embedchain/loaders/docx_file.py index 319819ff..6abc7af6 100644 --- a/embedchain/loaders/docx_file.py +++ b/embedchain/loaders/docx_file.py @@ -18,9 +18,9 @@ class DocxFileLoader(BaseLoader): output = [] data = loader.load() content = data[0].page_content - meta_data = data[0].metadata - meta_data["url"] = "local" - output.append({"content": content, "meta_data": meta_data}) + metadata = data[0].metadata + metadata["url"] = "local" + output.append({"content": content, "meta_data": metadata}) doc_id = hashlib.sha256((content + url).encode()).hexdigest() return { "doc_id": doc_id, diff --git a/embedchain/loaders/local_qna_pair.py b/embedchain/loaders/local_qna_pair.py index 1158d4a9..c93adfda 100644 --- a/embedchain/loaders/local_qna_pair.py +++ b/embedchain/loaders/local_qna_pair.py @@ -11,14 +11,14 @@ class LocalQnaPairLoader(BaseLoader): question, answer = content content = f"Q: {question}\nA: {answer}" url = "local" - meta_data = {"url": url, "question": question} + metadata = {"url": url, "question": question} doc_id = hashlib.sha256((content + url).encode()).hexdigest() return { "doc_id": doc_id, "data": [ { "content": content, - "meta_data": meta_data, + "meta_data": metadata, } ], } diff --git a/embedchain/loaders/local_text.py b/embedchain/loaders/local_text.py index e03ee12b..98a98cd6 100644 --- a/embedchain/loaders/local_text.py +++ b/embedchain/loaders/local_text.py @@ -9,7 +9,7 @@ class LocalTextLoader(BaseLoader): def load_data(self, content): """Load data from a local text file.""" url = "local" - meta_data = { + metadata = { "url": url, } doc_id = hashlib.sha256((content + url).encode()).hexdigest() @@ -18,7 +18,7 @@ class LocalTextLoader(BaseLoader): "data": [ { "content": content, - "meta_data": meta_data, + "meta_data": metadata, } ], } diff --git a/embedchain/loaders/mdx.py b/embedchain/loaders/mdx.py index 45b112f1..42b9b7fe 100644 --- a/embedchain/loaders/mdx.py +++ b/embedchain/loaders/mdx.py @@ -10,7 +10,7 @@ class MdxLoader(BaseLoader): """Load data from a mdx file.""" with open(url, "r", encoding="utf-8") as infile: content = infile.read() - meta_data = { + metadata = { "url": url, } doc_id = hashlib.sha256((content + url).encode()).hexdigest() @@ -19,7 +19,7 @@ class MdxLoader(BaseLoader): "data": [ { "content": content, - "meta_data": meta_data, + "meta_data": metadata, } ], } diff --git a/embedchain/loaders/openapi.py b/embedchain/loaders/openapi.py index 0f2164f9..18983b9a 100644 --- a/embedchain/loaders/openapi.py +++ b/embedchain/loaders/openapi.py @@ -35,8 +35,8 @@ class OpenAPILoader(BaseLoader): yaml_data = yaml.load(file, Loader=yaml.SafeLoader) for i, (key, value) in enumerate(yaml_data.items()): string_data = f"{key}: {value}" - meta_data = {"url": file_path, "row": i + 1} - data.append({"content": string_data, "meta_data": meta_data}) + metadata = {"url": file_path, "row": i + 1} + data.append({"content": string_data, "meta_data": metadata}) data_content.append(string_data) doc_id = hashlib.sha256((content + ", ".join(data_content)).encode()).hexdigest() return {"doc_id": doc_id, "data": data} diff --git a/embedchain/loaders/pdf_file.py b/embedchain/loaders/pdf_file.py index edd56c85..4f702bc0 100644 --- a/embedchain/loaders/pdf_file.py +++ b/embedchain/loaders/pdf_file.py @@ -27,12 +27,12 @@ class PdfFileLoader(BaseLoader): for page in pages: content = page.page_content content = clean_string(content) - meta_data = page.metadata - meta_data["url"] = url + metadata = page.metadata + metadata["url"] = url data.append( { "content": content, - "meta_data": meta_data, + "meta_data": metadata, } ) all_content.append(content) diff --git a/embedchain/loaders/rss_feed.py b/embedchain/loaders/rss_feed.py index 767921f1..eea7eb86 100644 --- a/embedchain/loaders/rss_feed.py +++ b/embedchain/loaders/rss_feed.py @@ -41,12 +41,12 @@ class RSSFeedLoader(BaseLoader): data = loader.load() for entry in data: - meta_data = RSSFeedLoader.serialize_metadata(entry.metadata) - meta_data.update({"url": url}) + metadata = RSSFeedLoader.serialize_metadata(entry.metadata) + metadata.update({"url": url}) output.append( { "content": entry.page_content, - "meta_data": meta_data, + "meta_data": metadata, } ) diff --git a/embedchain/loaders/slack.py b/embedchain/loaders/slack.py index 75f18738..f1caa1ec 100644 --- a/embedchain/loaders/slack.py +++ b/embedchain/loaders/slack.py @@ -88,16 +88,16 @@ class SlackLoader(BaseLoader): content = clean_string(text) message_meta_data_keys = ["iid", "team", "ts", "type", "user", "username"] - meta_data = {} + metadata = {} for key in message.keys(): if key in message_meta_data_keys: - meta_data[key] = message.get(key) - meta_data.update({"url": url}) + metadata[key] = message.get(key) + metadata.update({"url": url}) data.append( { "content": content, - "meta_data": meta_data, + "meta_data": metadata, } ) data_content.append(content) diff --git a/embedchain/loaders/text_file.py b/embedchain/loaders/text_file.py index 1c9e02db..bc7fb4b0 100644 --- a/embedchain/loaders/text_file.py +++ b/embedchain/loaders/text_file.py @@ -17,14 +17,14 @@ class TextFileLoader(BaseLoader): doc_id = hashlib.sha256((content + url).encode()).hexdigest() - meta_data = {"url": url, "file_size": os.path.getsize(url), "file_type": url.split(".")[-1]} + metadata = {"url": url, "file_size": os.path.getsize(url), "file_type": url.split(".")[-1]} return { "doc_id": doc_id, "data": [ { "content": content, - "meta_data": meta_data, + "meta_data": metadata, } ], } diff --git a/embedchain/loaders/unstructured_file.py b/embedchain/loaders/unstructured_file.py index 80099105..a1545c0e 100644 --- a/embedchain/loaders/unstructured_file.py +++ b/embedchain/loaders/unstructured_file.py @@ -26,12 +26,12 @@ class UnstructuredLoader(BaseLoader): for page in pages: content = page.page_content content = clean_string(content) - meta_data = page.metadata - meta_data["url"] = url + metadata = page.metadata + metadata["url"] = url data.append( { "content": content, - "meta_data": meta_data, + "meta_data": metadata, } ) all_content.append(content) diff --git a/embedchain/loaders/web_page.py b/embedchain/loaders/web_page.py index 71ad4e14..d68bb8a8 100644 --- a/embedchain/loaders/web_page.py +++ b/embedchain/loaders/web_page.py @@ -30,7 +30,7 @@ class WebPageLoader(BaseLoader): data = response.content content = self._get_clean_content(data, url) - meta_data = {"url": url} + metadata = {"url": url} doc_id = hashlib.sha256((content + url).encode()).hexdigest() return { @@ -38,7 +38,7 @@ class WebPageLoader(BaseLoader): "data": [ { "content": content, - "meta_data": meta_data, + "meta_data": metadata, } ], } diff --git a/embedchain/loaders/xml.py b/embedchain/loaders/xml.py index 01bbd21a..080032e6 100644 --- a/embedchain/loaders/xml.py +++ b/embedchain/loaders/xml.py @@ -19,10 +19,10 @@ class XmlLoader(BaseLoader): data = loader.load() content = data[0].page_content content = clean_string(content) - meta_data = data[0].metadata - meta_data["url"] = meta_data["source"] - del meta_data["source"] - output = [{"content": content, "meta_data": meta_data}] + metadata = data[0].metadata + metadata["url"] = metadata["source"] + del metadata["source"] + output = [{"content": content, "meta_data": metadata}] doc_id = hashlib.sha256((content + xml_url).encode()).hexdigest() return { "doc_id": doc_id, diff --git a/embedchain/loaders/youtube_video.py b/embedchain/loaders/youtube_video.py index 1e8ba35e..764a7d2a 100644 --- a/embedchain/loaders/youtube_video.py +++ b/embedchain/loaders/youtube_video.py @@ -22,13 +22,13 @@ class YoutubeVideoLoader(BaseLoader): raise ValueError(f"No data found for url: {url}") content = doc[0].page_content content = clean_string(content) - meta_data = doc[0].metadata - meta_data["url"] = url + metadata = doc[0].metadata + metadata["url"] = url output.append( { "content": content, - "meta_data": meta_data, + "meta_data": metadata, } ) doc_id = hashlib.sha256((content + url).encode()).hexdigest() diff --git a/embedchain/utils/misc.py b/embedchain/utils/misc.py index 1c2a12a1..503af903 100644 --- a/embedchain/utils/misc.py +++ b/embedchain/utils/misc.py @@ -79,9 +79,6 @@ def clean_string(text): cleaned_text (str): The cleaned text after all the cleaning operations have been performed. """ - # Replacement of newline characters: - text = text.replace("\n", " ") - # Stripping and reducing multiple spaces to single: cleaned_text = re.sub(r"\s+", " ", text.strip()) diff --git a/pyproject.toml b/pyproject.toml index c7f41bcf..ac2de144 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "embedchain" -version = "0.1.80" +version = "0.1.81" description = "Simplest open source retrieval(RAG) framework" authors = [ "Taranjeet Singh ", diff --git a/tests/llm/test_chat.py b/tests/llm/test_chat.py index 852b0b77..991a0adf 100644 --- a/tests/llm/test_chat.py +++ b/tests/llm/test_chat.py @@ -64,7 +64,7 @@ class TestApp(unittest.TestCase): self.assertEqual(len(app.llm.history), 1) history = app.llm.history dry_run = app.chat("Test query 2", dry_run=True) - self.assertIn("History:", dry_run) + self.assertIn("Conversation history:", dry_run) self.assertEqual(history, app.llm.history) self.assertEqual(len(app.llm.history), 1)