68 lines
2.1 KiB
Python
68 lines
2.1 KiB
Python
import logging
|
|
import re
|
|
import string
|
|
|
|
|
|
def clean_string(text):
    """
    Normalize a piece of text through a fixed sequence of cleaning steps.

    Steps, in order:
      1. Newlines become spaces, surrounding whitespace is stripped and every
         run of whitespace is collapsed to a single space.
      2. Backslashes are dropped and each hash character ("#") is turned into
         a space.
      3. Every run of one repeated punctuation character (neither a word
         character nor whitespace) is reduced to a single occurrence, e.g.
         "!!! hello !!!" becomes "! hello !".

    Note that step 2 runs after whitespace has been collapsed, so spaces
    produced from "#" are not merged with their neighbours.

    Args:
        text (str): The text to be cleaned. This is expected to be a string.

    Returns:
        cleaned_text (str): The text after all cleaning steps were applied.
    """
    # Step 1: whitespace normalization.
    flattened = text.replace("\n", " ").strip()
    collapsed = re.sub(r"\s+", " ", flattened)

    # Step 2: a single translation pass removes backslashes and maps "#" to " ".
    char_map = {ord("\\"): None, ord("#"): " "}
    translated = collapsed.translate(char_map)

    # Step 3: squeeze repeated identical punctuation down to one character.
    return re.sub(r"([^\w\s])\1*", r"\1", translated)
|
|
|
|
|
|
def is_readable(s):
    """
    Heuristic to determine if a string is "readable", i.e. made up almost
    entirely of characters found in ``string.printable``.

    :param s: string
    :return: True if the string is more than 95% printable. An empty string
        has no characters to measure and is reported as not readable.
    """
    if not s:
        # Guard against dividing by a zero length below.
        return False

    printable_count = sum(c in string.printable for c in s)
    return printable_count / len(s) > 0.95  # 95% of characters are printable
|
|
|
|
|
|
def use_pysqlite3():
    """
    Swap std-lib sqlite3 with pysqlite3.

    On Linux, make sure the ``pysqlite3`` module can be imported, installing
    the ``pysqlite3-binary`` distribution with pip only when the import fails,
    and then register it in ``sys.modules`` under the name ``sqlite3``.
    On every other platform this function does nothing.

    :raises subprocess.CalledProcessError: if the pip installation fails.
    :raises ImportError: if pysqlite3 still cannot be imported after installing.
    """
    import platform

    if platform.system() != "Linux":
        # According to the Chroma team, this patch only works on Linux
        return

    import importlib
    import subprocess
    import sys

    try:
        __import__("pysqlite3")
    except ImportError:
        # Only shell out to pip when the module is actually missing, so that
        # repeated calls (or offline environments with the package already
        # installed) do not spawn a pip process every time.
        subprocess.check_call([sys.executable, "-m", "pip", "install", "pysqlite3-binary"])
        # The package was installed while this interpreter is running; make
        # the import system notice the new files before retrying.
        importlib.invalidate_caches()
        __import__("pysqlite3")

    sys.modules["sqlite3"] = sys.modules.pop("pysqlite3")
    # Don't be surprised if this doesn't log as you expect, because the logger is instantiated after the import
    logging.info("Swapped std-lib sqlite3 with pysqlite3")
|