[Refactor] Improve logging package wide (#1315)

This commit is contained in:
Deshraj Yadav
2024-03-13 17:13:30 -07:00
committed by GitHub
parent ef69c91b60
commit 3616eaadb4
54 changed files with 263 additions and 231 deletions

View File

@@ -11,6 +11,8 @@ from tqdm import tqdm
from embedchain.models.data_type import DataType
logger = logging.getLogger(__name__)
def parse_content(content, type):
implemented = ["html.parser", "lxml", "lxml-xml", "xml", "html5lib"]
@@ -61,7 +63,7 @@ def parse_content(content, type):
cleaned_size = len(content)
if original_size != 0:
logging.info(
logger.info(
f"Cleaned page size: {cleaned_size} characters, down from {original_size} (shrunk: {original_size-cleaned_size} chars, {round((1-(cleaned_size/original_size)) * 100, 2)}%)" # noqa:E501
)
@@ -208,31 +210,31 @@ def detect_datatype(source: Any) -> DataType:
}
if url.netloc in YOUTUBE_ALLOWED_NETLOCKS:
logging.debug(f"Source of `{formatted_source}` detected as `youtube_video`.")
logger.debug(f"Source of `{formatted_source}` detected as `youtube_video`.")
return DataType.YOUTUBE_VIDEO
if url.netloc in {"notion.so", "notion.site"}:
logging.debug(f"Source of `{formatted_source}` detected as `notion`.")
logger.debug(f"Source of `{formatted_source}` detected as `notion`.")
return DataType.NOTION
if url.path.endswith(".pdf"):
logging.debug(f"Source of `{formatted_source}` detected as `pdf_file`.")
logger.debug(f"Source of `{formatted_source}` detected as `pdf_file`.")
return DataType.PDF_FILE
if url.path.endswith(".xml"):
logging.debug(f"Source of `{formatted_source}` detected as `sitemap`.")
logger.debug(f"Source of `{formatted_source}` detected as `sitemap`.")
return DataType.SITEMAP
if url.path.endswith(".csv"):
logging.debug(f"Source of `{formatted_source}` detected as `csv`.")
logger.debug(f"Source of `{formatted_source}` detected as `csv`.")
return DataType.CSV
if url.path.endswith(".mdx") or url.path.endswith(".md"):
logging.debug(f"Source of `{formatted_source}` detected as `mdx`.")
logger.debug(f"Source of `{formatted_source}` detected as `mdx`.")
return DataType.MDX
if url.path.endswith(".docx"):
logging.debug(f"Source of `{formatted_source}` detected as `docx`.")
logger.debug(f"Source of `{formatted_source}` detected as `docx`.")
return DataType.DOCX
if url.path.endswith(".yaml"):
@@ -242,14 +244,14 @@ def detect_datatype(source: Any) -> DataType:
try:
yaml_content = yaml.safe_load(response.text)
except yaml.YAMLError as exc:
logging.error(f"Error parsing YAML: {exc}")
logger.error(f"Error parsing YAML: {exc}")
raise TypeError(f"Not a valid data type. Error loading YAML: {exc}")
if is_openapi_yaml(yaml_content):
logging.debug(f"Source of `{formatted_source}` detected as `openapi`.")
logger.debug(f"Source of `{formatted_source}` detected as `openapi`.")
return DataType.OPENAPI
else:
logging.error(
logger.error(
f"Source of `{formatted_source}` does not contain all the required \
fields of OpenAPI yaml. Check 'https://spec.openapis.org/oas/v3.1.0'"
)
@@ -258,35 +260,35 @@ def detect_datatype(source: Any) -> DataType:
make sure you have all the required fields in YAML config data"
)
except requests.exceptions.RequestException as e:
logging.error(f"Error fetching URL {formatted_source}: {e}")
logger.error(f"Error fetching URL {formatted_source}: {e}")
if url.path.endswith(".json"):
logging.debug(f"Source of `{formatted_source}` detected as `json_file`.")
logger.debug(f"Source of `{formatted_source}` detected as `json_file`.")
return DataType.JSON
if "docs" in url.netloc or ("docs" in url.path and url.scheme != "file"):
# `docs_site` detection via path is not accepted for local filesystem URIs,
# because that would mean all paths that contain `docs` are now doc sites, which is too aggressive.
logging.debug(f"Source of `{formatted_source}` detected as `docs_site`.")
logger.debug(f"Source of `{formatted_source}` detected as `docs_site`.")
return DataType.DOCS_SITE
if "github.com" in url.netloc:
logging.debug(f"Source of `{formatted_source}` detected as `github`.")
logger.debug(f"Source of `{formatted_source}` detected as `github`.")
return DataType.GITHUB
if is_google_drive_folder(url.netloc + url.path):
logging.debug(f"Source of `{formatted_source}` detected as `google drive folder`.")
logger.debug(f"Source of `{formatted_source}` detected as `google drive folder`.")
return DataType.GOOGLE_DRIVE_FOLDER
# If none of the above conditions are met, it's a general web page
logging.debug(f"Source of `{formatted_source}` detected as `web_page`.")
logger.debug(f"Source of `{formatted_source}` detected as `web_page`.")
return DataType.WEB_PAGE
elif not isinstance(source, str):
# For datatypes where source is not a string.
if isinstance(source, tuple) and len(source) == 2 and isinstance(source[0], str) and isinstance(source[1], str):
logging.debug(f"Source of `{formatted_source}` detected as `qna_pair`.")
logger.debug(f"Source of `{formatted_source}` detected as `qna_pair`.")
return DataType.QNA_PAIR
# Raise an error if it isn't a string and also not a valid non-string type (one of the previous).
@@ -300,37 +302,37 @@ def detect_datatype(source: Any) -> DataType:
# Note: checking for string is not necessary anymore.
if source.endswith(".docx"):
logging.debug(f"Source of `{formatted_source}` detected as `docx`.")
logger.debug(f"Source of `{formatted_source}` detected as `docx`.")
return DataType.DOCX
if source.endswith(".csv"):
logging.debug(f"Source of `{formatted_source}` detected as `csv`.")
logger.debug(f"Source of `{formatted_source}` detected as `csv`.")
return DataType.CSV
if source.endswith(".xml"):
logging.debug(f"Source of `{formatted_source}` detected as `xml`.")
logger.debug(f"Source of `{formatted_source}` detected as `xml`.")
return DataType.XML
if source.endswith(".mdx") or source.endswith(".md"):
logging.debug(f"Source of `{formatted_source}` detected as `mdx`.")
logger.debug(f"Source of `{formatted_source}` detected as `mdx`.")
return DataType.MDX
if source.endswith(".txt"):
logging.debug(f"Source of `{formatted_source}` detected as `text`.")
logger.debug(f"Source of `{formatted_source}` detected as `text`.")
return DataType.TEXT_FILE
if source.endswith(".pdf"):
logging.debug(f"Source of `{formatted_source}` detected as `pdf_file`.")
logger.debug(f"Source of `{formatted_source}` detected as `pdf_file`.")
return DataType.PDF_FILE
if source.endswith(".yaml"):
with open(source, "r") as file:
yaml_content = yaml.safe_load(file)
if is_openapi_yaml(yaml_content):
logging.debug(f"Source of `{formatted_source}` detected as `openapi`.")
logger.debug(f"Source of `{formatted_source}` detected as `openapi`.")
return DataType.OPENAPI
else:
logging.error(
logger.error(
f"Source of `{formatted_source}` does not contain all the required \
fields of OpenAPI yaml. Check 'https://spec.openapis.org/oas/v3.1.0'"
)
@@ -340,11 +342,11 @@ def detect_datatype(source: Any) -> DataType:
)
if source.endswith(".json"):
logging.debug(f"Source of `{formatted_source}` detected as `json`.")
logger.debug(f"Source of `{formatted_source}` detected as `json`.")
return DataType.JSON
if os.path.exists(source) and is_readable(open(source).read()):
logging.debug(f"Source of `{formatted_source}` detected as `text_file`.")
logger.debug(f"Source of `{formatted_source}` detected as `text_file`.")
return DataType.TEXT_FILE
# If the source is a valid file that is not detectable as a type, an error is raised.
@@ -360,11 +362,11 @@ def detect_datatype(source: Any) -> DataType:
# check if the source is valid json string
if is_valid_json_string(source):
logging.debug(f"Source of `{formatted_source}` detected as `json`.")
logger.debug(f"Source of `{formatted_source}` detected as `json`.")
return DataType.JSON
# Use text as final fallback.
logging.debug(f"Source of `{formatted_source}` detected as `text`.")
logger.debug(f"Source of `{formatted_source}` detected as `text`.")
return DataType.TEXT