From e60f1680a0f06f582bda9a9b08004fcb47afc468 Mon Sep 17 00:00:00 2001 From: cachho Date: Fri, 23 Jun 2023 14:24:43 +0200 Subject: [PATCH] chore: removed markdown to plaintext function --- embedchain/loaders_local/qna_pair.py | 4 --- embedchain/utils.py | 41 ---------------------------- 2 files changed, 45 deletions(-) diff --git a/embedchain/loaders_local/qna_pair.py b/embedchain/loaders_local/qna_pair.py index 374fc1c4..de3479cc 100644 --- a/embedchain/loaders_local/qna_pair.py +++ b/embedchain/loaders_local/qna_pair.py @@ -1,11 +1,7 @@ -from embedchain.utils import markdown_to_plaintext - - class QnaPairLoader: def load_data(self, content): question, answer = content - answer = markdown_to_plaintext(answer) content = f"Q: {question}\nA: {answer}" meta_data = { "url": "local", diff --git a/embedchain/utils.py b/embedchain/utils.py index 6bf8dccd..902b551d 100644 --- a/embedchain/utils.py +++ b/embedchain/utils.py @@ -8,44 +8,3 @@ def clean_string(text): cleaned_text = cleaned_text.replace('#', ' ') cleaned_text = re.sub(r'([^\w\s])\1*', r'\1', cleaned_text) return cleaned_text - -def markdown_to_plaintext(markdown_string): - # Lines surrounded by empty lines are considered paragraph text - markdown_string = markdown_string.strip().replace("\n\n", "\n") - - # Headers - markdown_string = markdown_string.replace("# ", "") - markdown_string = markdown_string.replace("## ", "") - markdown_string = markdown_string.replace("### ", "") - - # Bold text - markdown_string = markdown_string.replace("**", "") - markdown_string = markdown_string.replace("__", "") - - # Italicized text - markdown_string = markdown_string.replace("*", "") - markdown_string = markdown_string.replace("_", "") - - # Ordered lists - markdown_string = markdown_string.replace("1. ", "") - markdown_string = markdown_string.replace("2. ", "") - markdown_string = markdown_string.replace("3. ", "") - # And so on for other numbers - - # Unordered lists - markdown_string = markdown_string.replace("- ", "") - markdown_string = markdown_string.replace("* ", "") - markdown_string = markdown_string.replace("+ ", "") - - # Links and images - while ("[" in markdown_string and "]" in markdown_string and - "(" in markdown_string and ")" in markdown_string): - start_link = markdown_string.find("[") - end_link = markdown_string.find("]") - start_paren = markdown_string.find("(") - end_paren = markdown_string.find(")") - - if start_link < start_paren and end_link < end_paren: - markdown_string = markdown_string[:start_link] + markdown_string[start_paren+1:end_paren] + markdown_string[end_paren+1:] - - return markdown_string \ No newline at end of file