Files
t6_mem0/embedchain/utils.py
2023-08-11 01:53:42 +05:30

92 lines
3.0 KiB
Python

import logging
import re
import string
def clean_string(text):
    """
    Normalize raw text by running it through a fixed sequence of cleaning steps.

    Args:
        text (str): The text to be cleaned. This is expected to be a string.

    Returns:
        cleaned_text (str): The text after every cleaning step has been applied.
    """
    # Turn newlines into spaces, trim both ends, then squeeze every run of
    # whitespace down to a single space.
    squeezed = re.sub(r"\s+", " ", text.replace("\n", " ").strip())

    # Drop backslashes and turn each hash into a space in one pass.
    # Note that this runs after the whitespace squeeze, so a "#" that sits
    # next to a space leaves adjacent spaces in the result.
    substituted = squeezed.translate({ord("\\"): None, ord("#"): " "})

    # Collapse each run of one repeated punctuation character (anything that
    # is neither a word character [a-zA-Z0-9_] nor whitespace) to a single
    # occurrence, e.g. "!!! hello !!!" -> "! hello !".
    return re.sub(r"([^\w\s])\1*", r"\1", substituted)
def is_readable(s):
    """
    Heuristic to decide whether a string is "readable", judged only by the
    share of its characters that appear in ``string.printable``.

    An empty input is logged as a warning and reported as unreadable.

    :param s: string
    :return: True if more than 95% of the characters are printable.
    """
    total = len(s)
    if total == 0:
        logging.warning("Empty string processed as unreadable")
        return False

    printable_count = sum(1 for ch in s if ch in string.printable)
    # Strictly greater than: exactly 95% printable is still "unreadable".
    return printable_count / total > 0.95
def use_pysqlite3():
    """
    Swap std-lib sqlite3 with pysqlite3.

    Only acts on Linux when the std-lib SQLite is older than 3.35.0;
    otherwise it returns without doing anything. When it does act, it makes
    sure ``pysqlite3`` is importable (installing ``pysqlite3-binary`` with pip
    only if the import fails), then registers that module under the name
    ``sqlite3`` in ``sys.modules`` so later ``import sqlite3`` statements
    receive it.

    This is a best-effort patch: every failure is caught and reported on
    stdout, and the function never raises.
    """
    # All of these are std-lib imports. Keeping them outside the try block
    # guarantees `datetime` is bound when the error handler below needs it.
    import datetime
    import importlib
    import platform
    import sqlite3
    import subprocess
    import sys

    # According to the Chroma team, this patch only works on Linux
    if platform.system() != "Linux" or sqlite3.sqlite_version_info >= (3, 35, 0):
        return

    def _timestamp():
        # Mimics the default `logging` timestamp (millisecond precision).
        return datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S,%f")[:-3]

    try:
        try:
            # Skip the pip subprocess entirely when the package is already there.
            importlib.import_module("pysqlite3")
        except ImportError:
            subprocess.check_call(
                [sys.executable, "-m", "pip", "install", "pysqlite3-binary", "--quiet", "--disable-pip-version-check"]
            )
            # The package was installed while this interpreter was running, so
            # the import system's cached directory listings may be stale.
            importlib.invalidate_caches()
            importlib.import_module("pysqlite3")

        sys.modules["sqlite3"] = sys.modules.pop("pysqlite3")

        # Let the user know what happened. The local name `sqlite3` is still
        # bound to the std-lib module, so this reports the original version.
        print(
            f"{_timestamp()} [embedchain] [INFO]",
            "Swapped std-lib sqlite3 with pysqlite3 for ChromaDb compatibility.",
            f"Your original version was {sqlite3.sqlite_version}.",
        )
    except Exception as e:
        # Escape all exceptions: a failed swap must not break the caller.
        print(
            f"{_timestamp()} [embedchain] [ERROR]",
            "Failed to swap std-lib sqlite3 with pysqlite3 for ChromaDb compatibility.",
            "Error:",
            e,
        )