[Refactor] Improve logging package wide (#1315)

Author: Deshraj Yadav
Date: 2024-03-13 17:13:30 -07:00 (committed by GitHub)
parent ef69c91b60
commit 3616eaadb4
54 changed files with 263 additions and 231 deletions
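
The change is the same across all 54 files: each module gains a module-level logger bound to its own dotted name, and direct calls on the root logging module are rewritten against it. A minimal sketch of the before/after pattern (a hypothetical module, not taken verbatim from the diff):

import logging

logger = logging.getLogger(__name__)  # e.g. resolves to "embedchain.loaders.beehiiv"

def load(link: str) -> None:
    # Before: logging.warning(...) went through the root logger, so every
    # record looked the same regardless of which loader produced it.
    # After: each record carries the module's dotted name, so applications
    # can attach levels, handlers, and filters per module or per package.
    logger.warning(f"Page is not readable: {link}")

Because __name__ follows the package hierarchy, configuring the parent "embedchain" logger affects every loader at once (see the consumer-side sketch at the end of the diff).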

View File

@@ -9,6 +9,8 @@ from embedchain.helpers.json_serializable import register_deserializable
from embedchain.loaders.base_loader import BaseLoader
from embedchain.utils.misc import is_readable
+logger = logging.getLogger(__name__)
@register_deserializable
class BeehiivLoader(BaseLoader):
@@ -90,9 +92,9 @@ class BeehiivLoader(BaseLoader):
if is_readable(data):
return data
else:
logging.warning(f"Page is not readable (too many invalid characters): {link}")
logger.warning(f"Page is not readable (too many invalid characters): {link}")
except ParserRejectedMarkup as e:
logging.error(f"Failed to parse {link}: {e}")
logger.error(f"Failed to parse {link}: {e}")
return None
for link in links:
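
Note what the refactor deliberately leaves untouched, here and in every hunk below: messages are still built with eager f-strings, so the interpolation runs even when the record is filtered out by level. A deferred %-style call (a hypothetical alternative, not what this commit does) would postpone formatting until a handler actually emits the record:

logger.error("Failed to parse %s: %s", link, e)  # args formatted lazily by logging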

View File

@@ -10,6 +10,8 @@ from embedchain.loaders.base_loader import BaseLoader
from embedchain.loaders.text_file import TextFileLoader
from embedchain.utils.misc import detect_datatype
+logger = logging.getLogger(__name__)
@register_deserializable
class DirectoryLoader(BaseLoader):
@@ -27,12 +29,12 @@ class DirectoryLoader(BaseLoader):
if not directory_path.is_dir():
raise ValueError(f"Invalid path: {path}")
logging.info(f"Loading data from directory: {path}")
logger.info(f"Loading data from directory: {path}")
data_list = self._process_directory(directory_path)
doc_id = hashlib.sha256((str(data_list) + str(directory_path)).encode()).hexdigest()
for error in self.errors:
-logging.warning(error)
+logger.warning(error)
return {"doc_id": doc_id, "data": data_list}
@@ -46,7 +48,7 @@ class DirectoryLoader(BaseLoader):
loader = self._predict_loader(file_path)
data_list.extend(loader.load_data(str(file_path))["data"])
elif file_path.is_dir():
logging.info(f"Loading data from directory: {file_path}")
logger.info(f"Loading data from directory: {file_path}")
return data_list
def _predict_loader(self, file_path: Path) -> BaseLoader:

View File

@@ -5,6 +5,8 @@ import os
from embedchain.helpers.json_serializable import register_deserializable
from embedchain.loaders.base_loader import BaseLoader
+logger = logging.getLogger(__name__)
@register_deserializable
class DiscordLoader(BaseLoader):
@@ -102,7 +104,7 @@ class DiscordLoader(BaseLoader):
class DiscordClient(discord.Client):
async def on_ready(self) -> None:
logging.info("Logged on as {0}!".format(self.user))
logger.info("Logged on as {0}!".format(self.user))
try:
channel = self.get_channel(int(channel_id))
if not isinstance(channel, discord.TextChannel):
@@ -121,7 +123,7 @@ class DiscordLoader(BaseLoader):
messages.append(DiscordLoader._format_message(thread_message))
except Exception as e:
-logging.error(e)
+logger.error(e)
await self.close()
finally:
await self.close()

View File

@@ -8,6 +8,8 @@ import requests
from embedchain.loaders.base_loader import BaseLoader
from embedchain.utils.misc import clean_string
+logger = logging.getLogger(__name__)
class DiscourseLoader(BaseLoader):
def __init__(self, config: Optional[dict[str, Any]] = None):
@@ -35,7 +37,7 @@ class DiscourseLoader(BaseLoader):
try:
response.raise_for_status()
except Exception as e:
logging.error(f"Failed to load post {post_id}: {e}")
logger.error(f"Failed to load post {post_id}: {e}")
return
response_data = response.json()
post_contents = clean_string(response_data.get("raw"))
@@ -56,7 +58,7 @@ class DiscourseLoader(BaseLoader):
self._check_query(query)
data = []
data_contents = []
logging.info(f"Searching data on discourse url: {self.domain}, for query: {query}")
logger.info(f"Searching data on discourse url: {self.domain}, for query: {query}")
search_url = f"{self.domain}search.json?q={query}"
response = requests.get(search_url)
try:

View File

@@ -15,6 +15,8 @@ except ImportError:
from embedchain.helpers.json_serializable import register_deserializable
from embedchain.loaders.base_loader import BaseLoader
+logger = logging.getLogger(__name__)
@register_deserializable
class DocsSiteLoader(BaseLoader):
@@ -28,7 +30,7 @@ class DocsSiteLoader(BaseLoader):
response = requests.get(url)
if response.status_code != 200:
logging.info(f"Failed to fetch the website: {response.status_code}")
logger.info(f"Failed to fetch the website: {response.status_code}")
return
soup = BeautifulSoup(response.text, "html.parser")
@@ -53,7 +55,7 @@ class DocsSiteLoader(BaseLoader):
def _load_data_from_url(url: str) -> list:
response = requests.get(url)
if response.status_code != 200:
logging.info(f"Failed to fetch the website: {response.status_code}")
logger.info(f"Failed to fetch the website: {response.status_code}")
return []
soup = BeautifulSoup(response.content, "html.parser")

View File

@@ -22,6 +22,8 @@ except ImportError:
from embedchain.loaders.base_loader import BaseLoader
from embedchain.utils.misc import clean_string
+logger = logging.getLogger(__name__)
class GmailReader:
SCOPES = ["https://www.googleapis.com/auth/gmail.readonly"]
@@ -114,7 +116,7 @@ class GmailLoader(BaseLoader):
def load_data(self, query: str):
reader = GmailReader(query=query)
emails = reader.load_emails()
logging.info(f"Gmail Loader: {len(emails)} emails found for query '{query}'")
logger.info(f"Gmail Loader: {len(emails)} emails found for query '{query}'")
data = []
for email in emails:

View File

@@ -5,6 +5,8 @@ from typing import Any, Optional
from embedchain.loaders.base_loader import BaseLoader
from embedchain.utils.misc import clean_string
+logger = logging.getLogger(__name__)
class MySQLLoader(BaseLoader):
def __init__(self, config: Optional[dict[str, Any]]):
@@ -32,7 +34,7 @@ class MySQLLoader(BaseLoader):
self.connection = sqlconnector.connection.MySQLConnection(**config)
self.cursor = self.connection.cursor()
except (sqlconnector.Error, IOError) as err:
logging.info(f"Connection failed: {err}")
logger.info(f"Connection failed: {err}")
raise ValueError(
f"Unable to connect with the given config: {config}.",
"Please provide the correct configuration to load data from you MySQL DB. \

View File

@@ -9,6 +9,8 @@ from embedchain.helpers.json_serializable import register_deserializable
from embedchain.loaders.base_loader import BaseLoader
from embedchain.utils.misc import clean_string
+logger = logging.getLogger(__name__)
class NotionDocument:
"""
@@ -98,7 +100,7 @@ class NotionLoader(BaseLoader):
id = source[-32:]
formatted_id = f"{id[:8]}-{id[8:12]}-{id[12:16]}-{id[16:20]}-{id[20:]}"
logging.debug(f"Extracted notion page id as: {formatted_id}")
logger.debug(f"Extracted notion page id as: {formatted_id}")
integration_token = os.getenv("NOTION_INTEGRATION_TOKEN")
reader = NotionPageLoader(integration_token=integration_token)

View File

@@ -4,6 +4,8 @@ from typing import Any, Optional
from embedchain.loaders.base_loader import BaseLoader
+logger = logging.getLogger(__name__)
class PostgresLoader(BaseLoader):
def __init__(self, config: Optional[dict[str, Any]] = None):
@@ -32,7 +34,7 @@ class PostgresLoader(BaseLoader):
conn_params.append(f"{key}={value}")
config_info = " ".join(conn_params)
logging.info(f"Connecting to postrgres sql: {config_info}")
logger.info(f"Connecting to postrgres sql: {config_info}")
self.connection = psycopg.connect(conninfo=config_info)
self.cursor = self.connection.cursor()

View File

@@ -19,6 +19,8 @@ from embedchain.helpers.json_serializable import register_deserializable
from embedchain.loaders.base_loader import BaseLoader
from embedchain.loaders.web_page import WebPageLoader
+logger = logging.getLogger(__name__)
@register_deserializable
class SitemapLoader(BaseLoader):
@@ -41,7 +43,7 @@ class SitemapLoader(BaseLoader):
response.raise_for_status()
soup = BeautifulSoup(response.text, "xml")
except requests.RequestException as e:
logging.error(f"Error fetching sitemap from URL: {e}")
logger.error(f"Error fetching sitemap from URL: {e}")
return
elif os.path.isfile(sitemap_source):
with open(sitemap_source, "r") as file:
@@ -60,7 +62,7 @@ class SitemapLoader(BaseLoader):
loader_data = web_page_loader.load_data(link)
return loader_data.get("data")
except ParserRejectedMarkup as e:
logging.error(f"Failed to parse {link}: {e}")
logger.error(f"Failed to parse {link}: {e}")
return None
with concurrent.futures.ThreadPoolExecutor() as executor:
@@ -72,6 +74,6 @@ class SitemapLoader(BaseLoader):
if data:
output.extend(data)
except Exception as e:
logging.error(f"Error loading page {link}: {e}")
logger.error(f"Error loading page {link}: {e}")
return {"doc_id": doc_id, "data": output}

View File

@@ -11,6 +11,8 @@ from embedchain.utils.misc import clean_string
SLACK_API_BASE_URL = "https://www.slack.com/api/"
+logger = logging.getLogger(__name__)
class SlackLoader(BaseLoader):
def __init__(self, config: Optional[dict[str, Any]] = None):
@@ -38,7 +40,7 @@ class SlackLoader(BaseLoader):
"SLACK_USER_TOKEN environment variables not provided. Check `https://docs.embedchain.ai/data-sources/slack` to learn more." # noqa:E501
)
logging.info(f"Creating Slack Loader with config: {config}")
logger.info(f"Creating Slack Loader with config: {config}")
# get slack client config params
slack_bot_token = os.getenv("SLACK_USER_TOKEN")
ssl_cert = ssl.create_default_context(cafile=certifi.where())
@@ -54,7 +56,7 @@ class SlackLoader(BaseLoader):
headers=headers,
team_id=team_id,
)
logging.info("Slack Loader setup successful!")
logger.info("Slack Loader setup successful!")
@staticmethod
def _check_query(query):
@@ -69,7 +71,7 @@ class SlackLoader(BaseLoader):
data = []
data_content = []
logging.info(f"Searching slack conversations for query: {query}")
logger.info(f"Searching slack conversations for query: {query}")
results = self.client.search_messages(
query=query,
sort="timestamp",
@@ -79,7 +81,7 @@ class SlackLoader(BaseLoader):
messages = results.get("messages")
num_message = len(messages)
logging.info(f"Found {num_message} messages for query: {query}")
logger.info(f"Found {num_message} messages for query: {query}")
matches = messages.get("matches", [])
for message in matches:
@@ -107,7 +109,7 @@ class SlackLoader(BaseLoader):
"data": data,
}
except Exception as e:
logging.warning(f"Error in loading slack data: {e}")
logger.warning(f"Error in loading slack data: {e}")
raise ValueError(
f"Error in loading slack data: {e}. Check `https://docs.embedchain.ai/data-sources/slack` to learn more." # noqa:E501
) from e

View File

@@ -9,6 +9,8 @@ from embedchain.helpers.json_serializable import register_deserializable
from embedchain.loaders.base_loader import BaseLoader
from embedchain.utils.misc import is_readable
+logger = logging.getLogger(__name__)
@register_deserializable
class SubstackLoader(BaseLoader):
@@ -90,9 +92,9 @@ class SubstackLoader(BaseLoader):
if is_readable(data):
return data
else:
logging.warning(f"Page is not readable (too many invalid characters): {link}")
logger.warning(f"Page is not readable (too many invalid characters): {link}")
except ParserRejectedMarkup as e:
logging.error(f"Failed to parse {link}: {e}")
logger.error(f"Failed to parse {link}: {e}")
return None
for link in links:

View File

@@ -14,6 +14,8 @@ from embedchain.helpers.json_serializable import register_deserializable
from embedchain.loaders.base_loader import BaseLoader
from embedchain.utils.misc import clean_string
+logger = logging.getLogger(__name__)
@register_deserializable
class WebPageLoader(BaseLoader):
@@ -87,7 +89,7 @@ class WebPageLoader(BaseLoader):
cleaned_size = len(content)
if original_size != 0:
-logging.info(
+logger.info(
f"[{url}] Cleaned page size: {cleaned_size} characters, down from {original_size} (shrunk: {original_size-cleaned_size} chars, {round((1-(cleaned_size/original_size)) * 100, 2)}%)" # noqa:E501
)

View File

@@ -7,6 +7,8 @@ from tqdm import tqdm
from embedchain.loaders.base_loader import BaseLoader
from embedchain.loaders.youtube_video import YoutubeVideoLoader
+logger = logging.getLogger(__name__)
class YoutubeChannelLoader(BaseLoader):
"""Loader for youtube channel."""
@@ -36,7 +38,7 @@ class YoutubeChannelLoader(BaseLoader):
videos = [entry["url"] for entry in info_dict["entries"]]
return videos
except Exception:
logging.error(f"Failed to fetch youtube videos for channel: {channel_name}")
logger.error(f"Failed to fetch youtube videos for channel: {channel_name}")
return []
def _load_yt_video(video_link):
@@ -45,12 +47,12 @@ class YoutubeChannelLoader(BaseLoader):
if each_load_data:
return each_load_data.get("data")
except Exception as e:
logging.error(f"Failed to load youtube video {video_link}: {e}")
logger.error(f"Failed to load youtube video {video_link}: {e}")
return None
def _add_youtube_channel():
video_links = _get_yt_video_links()
logging.info("Loading videos from youtube channel...")
logger.info("Loading videos from youtube channel...")
with concurrent.futures.ThreadPoolExecutor() as executor:
# Submitting all tasks and storing the future object with the video link
future_to_video = {
@@ -67,7 +69,7 @@ class YoutubeChannelLoader(BaseLoader):
data.extend(results)
data_urls.extend([result.get("meta_data").get("url") for result in results])
except Exception as e:
logging.error(f"Failed to process youtube video {video}: {e}")
logger.error(f"Failed to process youtube video {video}: {e}")
_add_youtube_channel()
doc_id = hashlib.sha256((youtube_url + ", ".join(data_urls)).encode()).hexdigest()
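
The payoff of the rename shows up on the consumer side. Since every logger = logging.getLogger(__name__) above resolves to a dotted name under the embedchain package, an application embedding the library can tune verbosity per package or per module instead of fighting the root logger. A sketch of hypothetical consumer code (module names assumed from the import paths in the hunks), not part of this commit:

import logging

logging.basicConfig(level=logging.INFO)  # the application's own root handler

# Quiet the whole package, then re-enable detail for one noisy loader.
logging.getLogger("embedchain").setLevel(logging.WARNING)
logging.getLogger("embedchain.loaders.slack").setLevel(logging.DEBUG)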