diff --git a/embedchain/utils.py b/embedchain/utils.py index 902b551d..b6337147 100644 --- a/embedchain/utils.py +++ b/embedchain/utils.py @@ -2,9 +2,32 @@ import re def clean_string(text): + """ + This function takes in a string and performs a series of text cleaning operations. + + Args: + text (str): The text to be cleaned. This is expected to be a string. + + Returns: + cleaned_text (str): The cleaned text after all the cleaning operations have been performed. + """ + # Replacement of newline characters: text = text.replace('\n', ' ') + + # Stripping and reducing multiple spaces to single: cleaned_text = re.sub(r'\s+', ' ', text.strip()) + + # Removing backslashes: cleaned_text = cleaned_text.replace('\\', '') + + # Replacing hash characters: cleaned_text = cleaned_text.replace('#', ' ') + + # Eliminating consecutive non-alphanumeric characters: + # This regex identifies consecutive non-alphanumeric characters (i.e., not a word character [a-zA-Z0-9_] and not a whitespace) in the string + # and replaces each group of such characters with a single occurrence of that character. + # For example, "!!! hello !!!" would become "! hello !". cleaned_text = re.sub(r'([^\w\s])\1*', r'\1', cleaned_text) + return cleaned_text +