Files
t6_mem0/embedchain/utils.py
2023-07-11 10:20:05 +05:30

36 lines
1.1 KiB
Python

import re
def clean_string(text):
"""
This function takes in a string and performs a series of text cleaning operations.
Args:
text (str): The text to be cleaned. This is expected to be a string.
Returns:
cleaned_text (str): The cleaned text after all the cleaning operations
have been performed.
"""
# Replacement of newline characters:
text = text.replace("\n", " ")
# Stripping and reducing multiple spaces to single:
cleaned_text = re.sub(r"\s+", " ", text.strip())
# Removing backslashes:
cleaned_text = cleaned_text.replace("\\", "")
# Replacing hash characters:
cleaned_text = cleaned_text.replace("#", " ")
# Eliminating consecutive non-alphanumeric characters:
# This regex identifies consecutive non-alphanumeric characters (i.e., not
# a word character [a-zA-Z0-9_] and not a whitespace) in the string
# and replaces each group of such characters with a single occurrence of
# that character.
# For example, "!!! hello !!!" would become "! hello !".
cleaned_text = re.sub(r"([^\w\s])\1*", r"\1", cleaned_text)
return cleaned_text