From 38426a7af127f0d6492b0789ceaf856956172ceb Mon Sep 17 00:00:00 2001 From: Sidharth Mohanty Date: Wed, 29 Nov 2023 23:37:05 +0530 Subject: [PATCH] Discord loader (#976) --- docs/data-sources/discord.mdx | 28 ++++ docs/data-sources/overview.mdx | 1 + docs/mint.json | 3 +- embedchain/data_formatter/data_formatter.py | 2 + embedchain/loaders/discord.py | 150 ++++++++++++++++++++ embedchain/models/data_type.py | 2 + 6 files changed, 185 insertions(+), 1 deletion(-) create mode 100644 docs/data-sources/discord.mdx create mode 100644 embedchain/loaders/discord.py diff --git a/docs/data-sources/discord.mdx b/docs/data-sources/discord.mdx new file mode 100644 index 00000000..ce9562de --- /dev/null +++ b/docs/data-sources/discord.mdx @@ -0,0 +1,28 @@ +--- +title: "💬 Discord" +--- + +To add any Discord channel messages to your app, just add the `channel_id` as the source and set the `data_type` to `discord`. + + + This loader requires a Discord bot token with read messages access. + To obtain the token, follow the instructions provided in this tutorial: + How to Get a Discord Bot Token?. + + +```python +import os +from embedchain import Pipeline as App + +# add your discord "BOT" token +os.environ["DISCORD_TOKEN"] = "xxx" + +app = App() + +app.add("1177296711023075338", data_type="discord") + +response = app.query("What is Joe saying about Elon Musk?") + +print(response) +# Answer: Joe is saying "Elon Musk is a genius". +``` diff --git a/docs/data-sources/overview.mdx b/docs/data-sources/overview.mdx index 08b2a572..4cf04d50 100644 --- a/docs/data-sources/overview.mdx +++ b/docs/data-sources/overview.mdx @@ -24,6 +24,7 @@ Embedchain comes with built-in support for various data sources. We handle the c +
diff --git a/docs/mint.json b/docs/mint.json
index 256a5e80..6639fde1 100644
--- a/docs/mint.json
+++ b/docs/mint.json
@@ -89,7 +89,8 @@
             "data-sources/openapi",
             "data-sources/youtube-video",
             "data-sources/discourse",
-            "data-sources/substack"
+            "data-sources/substack",
+            "data-sources/discord"
           ]
         },
         "data-sources/data-type-handling"
diff --git a/embedchain/data_formatter/data_formatter.py b/embedchain/data_formatter/data_formatter.py
index 368e4325..ce6d8307 100644
--- a/embedchain/data_formatter/data_formatter.py
+++ b/embedchain/data_formatter/data_formatter.py
@@ -66,6 +66,7 @@ class DataFormatter(JSONSerializable):
             DataType.SUBSTACK: "embedchain.loaders.substack.SubstackLoader",
             DataType.GITHUB: "embedchain.loaders.github.GithubLoader",
             DataType.YOUTUBE_CHANNEL: "embedchain.loaders.youtube_channel.YoutubeChannelLoader",
+            DataType.DISCORD: "embedchain.loaders.discord.DiscordLoader",
         }
 
         custom_loaders = set(
@@ -118,6 +119,7 @@ class DataFormatter(JSONSerializable):
             DataType.SUBSTACK: "embedchain.chunkers.substack.SubstackChunker",
             DataType.GITHUB: "embedchain.chunkers.common_chunker.CommonChunker",
             DataType.YOUTUBE_CHANNEL: "embedchain.chunkers.common_chunker.CommonChunker",
+            DataType.DISCORD: "embedchain.chunkers.common_chunker.CommonChunker",
         }
 
         if data_type in chunker_classes:
diff --git a/embedchain/loaders/discord.py b/embedchain/loaders/discord.py
new file mode 100644
index 00000000..464f7a65
--- /dev/null
+++ b/embedchain/loaders/discord.py
@@ -0,0 +1,150 @@
+import hashlib
+import logging
+import os
+
+from embedchain.helpers.json_serializable import register_deserializable
+from embedchain.loaders.base_loader import BaseLoader
+
+
+@register_deserializable
+class DiscordLoader(BaseLoader):
+    """
+    Load the messages of a Discord text channel, given its channel ID.
+    """
+
+    def __init__(self):
+        token = os.environ.get("DISCORD_TOKEN")
+        if not token:
+            raise ValueError("DISCORD_TOKEN is not set")
+        self.token = token
+
+    @staticmethod
+    def _format_message(message):
+        """Flatten a discord.py message (author, attachments, embeds) into a plain dict."""
+        return {
+            "message_id": message.id,
+            "content": message.content,
+            "author": {
+                "id": message.author.id,
+                "name": message.author.name,
+                "discriminator": message.author.discriminator,
+            },
+            "created_at": message.created_at.isoformat(),
+            "attachments": [
+                {
+                    "id": attachment.id,
+                    "filename": attachment.filename,
+                    "size": attachment.size,
+                    "url": attachment.url,
+                    "proxy_url": attachment.proxy_url,
+                    "height": attachment.height,
+                    "width": attachment.width,
+                }
+                for attachment in message.attachments
+            ],
+            "embeds": [
+                {
+                    "title": embed.title,
+                    "type": embed.type,
+                    "description": embed.description,
+                    "url": embed.url,
+                    # Embed timestamps are optional; calling isoformat() on None would raise.
+                    "timestamp": embed.timestamp.isoformat() if embed.timestamp else None,
+                    "color": embed.color,
+                    "footer": {
+                        "text": embed.footer.text,
+                        "icon_url": embed.footer.icon_url,
+                        "proxy_icon_url": embed.footer.proxy_icon_url,
+                    },
+                    "image": {
+                        "url": embed.image.url,
+                        "proxy_url": embed.image.proxy_url,
+                        "height": embed.image.height,
+                        "width": embed.image.width,
+                    },
+                    "thumbnail": {
+                        "url": embed.thumbnail.url,
+                        "proxy_url": embed.thumbnail.proxy_url,
+                        "height": embed.thumbnail.height,
+                        "width": embed.thumbnail.width,
+                    },
+                    "video": {
+                        "url": embed.video.url,
+                        "height": embed.video.height,
+                        "width": embed.video.width,
+                    },
+                    "provider": {
+                        "name": embed.provider.name,
+                        "url": embed.provider.url,
+                    },
+                    "author": {
+                        "name": embed.author.name,
+                        "url": embed.author.url,
+                        "icon_url": embed.author.icon_url,
+                        "proxy_icon_url": embed.author.proxy_icon_url,
+                    },
+                    "fields": [
+                        {
+                            "name": field.name,
+                            "value": field.value,
+                            "inline": field.inline,
+                        }
+                        for field in embed.fields
+                    ],
+                }
+                for embed in message.embeds
+            ],
+        }
+
+    def load_data(self, channel_id: str):
+        """Return one document holding the stringified message history of text channel `channel_id`."""
+        import discord
+
+        messages = []
+
+        class DiscordClient(discord.Client):
+            async def on_ready(self) -> None:
+                logging.info("Logged on as %s!", self.user)
+                try:
+                    channel = self.get_channel(int(channel_id))
+                    if not isinstance(channel, discord.TextChannel):
+                        raise ValueError(
+                            f"Channel {channel_id} is not a text channel. Only text channels are supported for now."
+                        )
+                    # Index threads by ID; a message whose ID matches one is followed by that thread's history.
+                    threads = {thread.id: thread for thread in channel.threads}
+
+                    async for message in channel.history(limit=None):
+                        messages.append(DiscordLoader._format_message(message))
+                        if message.id in threads:
+                            async for thread_message in threads[message.id].history(limit=None):
+                                messages.append(DiscordLoader._format_message(thread_message))
+
+                except Exception as e:
+                    logging.error(e)
+                finally:
+                    await self.close()
+
+        intents = discord.Intents.default()
+        intents.message_content = True
+        client = DiscordClient(intents=intents)
+        # run() blocks until the client is closed, which on_ready() always does in its finally block.
+        client.run(self.token)
+
+        meta_data = {
+            "url": channel_id,
+        }
+
+        messages = str(messages)
+
+        doc_id = hashlib.sha256((messages + channel_id).encode()).hexdigest()
+
+        return {
+            "doc_id": doc_id,
+            "data": [
+                {
+                    "content": messages,
+                    "meta_data": meta_data,
+                }
+            ],
+        }
diff --git a/embedchain/models/data_type.py b/embedchain/models/data_type.py
index c54ee249..647d8ac5 100644
--- a/embedchain/models/data_type.py
+++ b/embedchain/models/data_type.py
@@ -36,6 +36,7 @@ class IndirectDataType(Enum):
     SUBSTACK = "substack"
     GITHUB = "github"
     YOUTUBE_CHANNEL = "youtube_channel"
+    DISCORD = "discord"
 
 
 class SpecialDataType(Enum):
@@ -71,3 +72,4 @@ class DataType(Enum):
     SUBSTACK = IndirectDataType.SUBSTACK.value
     GITHUB = IndirectDataType.GITHUB.value
     YOUTUBE_CHANNEL = IndirectDataType.YOUTUBE_CHANNEL.value
+    DISCORD = IndirectDataType.DISCORD.value