Files
t6_mem0/embedchain/utils.py
2023-06-23 19:53:57 +05:30

51 lines
1.9 KiB
Python

import re
def clean_string(text: str) -> str:
    """Return *text* flattened into a compact, single-line string.

    The following clean-up steps are applied, in order:

    1. Every backslash is removed.
    2. Every ``#`` is replaced by a space.
    3. Runs of the same non-word, non-whitespace character (``!!!``,
       ``...``) are reduced to a single occurrence.
    4. Every run of whitespace (including newlines) is collapsed to one
       space, and leading/trailing whitespace is stripped.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string; it contains no newlines, no consecutive spaces
        and no leading or trailing whitespace.
    """
    cleaned_text = text.replace("\\", "")
    cleaned_text = cleaned_text.replace("#", " ")
    # The back-reference only collapses *identical* neighbours, so mixed
    # punctuation such as "?!" is left untouched.
    cleaned_text = re.sub(r"([^\w\s])\1*", r"\1", cleaned_text)
    # Whitespace is normalised last: the substitutions above can introduce
    # new leading, trailing or doubled spaces (e.g. "# Title" -> "  Title").
    return re.sub(r"\s+", " ", cleaned_text).strip()
# ATX header marker ("#" through "######") at the start of a line.
_MD_HEADER_RE = re.compile(r"^[ \t]*#{1,6}[ \t]+", re.MULTILINE)
# List marker at the start of a line: "-", "*", "+" or "<digits>.".
# The indentation is captured so nested items keep their offset.
_MD_LIST_MARKER_RE = re.compile(r"^([ \t]*)(?:[-*+]|\d+\.)[ \t]+", re.MULTILINE)
# Inline link or image: optional "!", then "[text]" immediately followed by
# "(target)". Group 1 is the target.
_MD_LINK_RE = re.compile(r"!?\[[^\[\]]*\]\(([^()]*)\)")
# Translation table that deletes the emphasis characters "*" and "_".
_MD_EMPHASIS_TABLE = str.maketrans("", "", "*_")


def markdown_to_plaintext(markdown_string: str) -> str:
    """Strip common Markdown markup from *markdown_string*.

    Transformations, in order:

    * Surrounding whitespace is stripped and runs of blank lines are
      collapsed to a single newline.
    * Header markers (``#`` .. ``######``) at the start of a line are removed.
    * List markers (``-``, ``*``, ``+`` and ``<number>.``) at the start of a
      line are removed; the indentation in front of them is kept.
    * Inline links and images (``[text](target)`` / ``![alt](target)``) are
      replaced by their *target*; the link text is discarded.
    * All remaining ``*`` and ``_`` characters (bold/italic markers) are
      removed everywhere except inside link targets, so URLs are not altered.

    Brackets and parentheses that do not form a ``[...](...)`` pair are left
    untouched.

    Args:
        markdown_string: Markdown source text.

    Returns:
        The text with the markup described above removed.
    """
    text = re.sub(r"\n{2,}", "\n", markdown_string.strip())
    text = _MD_HEADER_RE.sub("", text)
    # List markers go before emphasis stripping so a leading "* " bullet is
    # recognised as a bullet rather than a stray asterisk.
    text = _MD_LIST_MARKER_RE.sub(r"\1", text)

    # One left-to-right pass over the links: always terminates, and only
    # the text *between* links has its emphasis characters removed.
    pieces = []
    cursor = 0
    for match in _MD_LINK_RE.finditer(text):
        pieces.append(text[cursor:match.start()].translate(_MD_EMPHASIS_TABLE))
        pieces.append(match.group(1))
        cursor = match.end()
    pieces.append(text[cursor:].translate(_MD_EMPHASIS_TABLE))
    return "".join(pieces)