diff --git a/docs/data-sources/discord.mdx b/docs/data-sources/discord.mdx
new file mode 100644
index 00000000..ce9562de
--- /dev/null
+++ b/docs/data-sources/discord.mdx
@@ -0,0 +1,28 @@
+---
+title: "💬 Discord"
+---
+
+To add the messages of a Discord channel to your app, pass its `channel_id` as the source and set `data_type` to `discord`.
+
+
+ This loader requires a Discord bot token with permission to read messages.
+ To obtain the token, follow the instructions provided in this tutorial:
+ "How to Get a Discord Bot Token?"
+
+
+```python
+import os
+from embedchain import Pipeline as App
+
+# Set your Discord "BOT" token; the loader reads it from the DISCORD_TOKEN environment variable
+os.environ["DISCORD_TOKEN"] = "xxx"
+
+app = App()
+
+app.add("1177296711023075338", data_type="discord")  # the source is the Discord channel ID
+
+response = app.query("What is Joe saying about Elon Musk?")
+
+print(response)
+# Example output: Joe is saying "Elon Musk is a genius".
+```
diff --git a/docs/data-sources/overview.mdx b/docs/data-sources/overview.mdx
index 08b2a572..4cf04d50 100644
--- a/docs/data-sources/overview.mdx
+++ b/docs/data-sources/overview.mdx
@@ -24,6 +24,7 @@ Embedchain comes with built-in support for various data sources. We handle the c
+
diff --git a/docs/mint.json b/docs/mint.json
index 256a5e80..6639fde1 100644
--- a/docs/mint.json
+++ b/docs/mint.json
@@ -89,7 +89,8 @@
"data-sources/openapi",
"data-sources/youtube-video",
"data-sources/discourse",
- "data-sources/substack"
+ "data-sources/substack",
+ "data-sources/discord"
]
},
"data-sources/data-type-handling"
diff --git a/embedchain/data_formatter/data_formatter.py b/embedchain/data_formatter/data_formatter.py
index 368e4325..ce6d8307 100644
--- a/embedchain/data_formatter/data_formatter.py
+++ b/embedchain/data_formatter/data_formatter.py
@@ -66,6 +66,7 @@ class DataFormatter(JSONSerializable):
DataType.SUBSTACK: "embedchain.loaders.substack.SubstackLoader",
DataType.GITHUB: "embedchain.loaders.github.GithubLoader",
DataType.YOUTUBE_CHANNEL: "embedchain.loaders.youtube_channel.YoutubeChannelLoader",
+ DataType.DISCORD: "embedchain.loaders.discord.DiscordLoader",
}
custom_loaders = set(
@@ -118,6 +119,7 @@ class DataFormatter(JSONSerializable):
DataType.SUBSTACK: "embedchain.chunkers.substack.SubstackChunker",
DataType.GITHUB: "embedchain.chunkers.common_chunker.CommonChunker",
DataType.YOUTUBE_CHANNEL: "embedchain.chunkers.common_chunker.CommonChunker",
+ DataType.DISCORD: "embedchain.chunkers.common_chunker.CommonChunker",
}
if data_type in chunker_classes:
diff --git a/embedchain/loaders/discord.py b/embedchain/loaders/discord.py
new file mode 100644
index 00000000..464f7a65
--- /dev/null
+++ b/embedchain/loaders/discord.py
@@ -0,0 +1,150 @@
+import logging
+import os
+import hashlib
+
+from embedchain.helpers.json_serializable import register_deserializable
+from embedchain.loaders.base_loader import BaseLoader
+
+
+@register_deserializable
+class DiscordLoader(BaseLoader):
+ """
+ Load data from a Discord Channel ID.
+ """
+
+ def __init__(self):
+ if not os.environ.get("DISCORD_TOKEN"):
+ raise ValueError("DISCORD_TOKEN is not set")
+
+ self.token = os.environ.get("DISCORD_TOKEN")
+
+ @staticmethod
+ def _format_message(message):
+ return {
+ "message_id": message.id,
+ "content": message.content,
+ "author": {
+ "id": message.author.id,
+ "name": message.author.name,
+ "discriminator": message.author.discriminator,
+ },
+ "created_at": message.created_at.isoformat(),
+ "attachments": [
+ {
+ "id": attachment.id,
+ "filename": attachment.filename,
+ "size": attachment.size,
+ "url": attachment.url,
+ "proxy_url": attachment.proxy_url,
+ "height": attachment.height,
+ "width": attachment.width,
+ }
+ for attachment in message.attachments
+ ],
+ "embeds": [
+ {
+ "title": embed.title,
+ "type": embed.type,
+ "description": embed.description,
+ "url": embed.url,
+ "timestamp": embed.timestamp.isoformat(),
+ "color": embed.color,
+ "footer": {
+ "text": embed.footer.text,
+ "icon_url": embed.footer.icon_url,
+ "proxy_icon_url": embed.footer.proxy_icon_url,
+ },
+ "image": {
+ "url": embed.image.url,
+ "proxy_url": embed.image.proxy_url,
+ "height": embed.image.height,
+ "width": embed.image.width,
+ },
+ "thumbnail": {
+ "url": embed.thumbnail.url,
+ "proxy_url": embed.thumbnail.proxy_url,
+ "height": embed.thumbnail.height,
+ "width": embed.thumbnail.width,
+ },
+ "video": {
+ "url": embed.video.url,
+ "height": embed.video.height,
+ "width": embed.video.width,
+ },
+ "provider": {
+ "name": embed.provider.name,
+ "url": embed.provider.url,
+ },
+ "author": {
+ "name": embed.author.name,
+ "url": embed.author.url,
+ "icon_url": embed.author.icon_url,
+ "proxy_icon_url": embed.author.proxy_icon_url,
+ },
+ "fields": [
+ {
+ "name": field.name,
+ "value": field.value,
+ "inline": field.inline,
+ }
+ for field in embed.fields
+ ],
+ }
+ for embed in message.embeds
+ ],
+ }
+
+ def load_data(self, channel_id: str):
+ """Load data from a Discord Channel ID."""
+ import discord
+
+ messages = []
+
+ class DiscordClient(discord.Client):
+ async def on_ready(self) -> None:
+ logging.info("Logged on as {0}!".format(self.user))
+ try:
+ channel = self.get_channel(int(channel_id))
+ if not isinstance(channel, discord.TextChannel):
+ raise ValueError(
+ f"Channel {channel_id} is not a text channel. " "Only text channels are supported for now."
+ )
+ threads = {}
+
+ for thread in channel.threads:
+ threads[thread.id] = thread
+
+ async for message in channel.history(limit=None):
+ messages.append(DiscordLoader._format_message(message))
+ if message.id in threads:
+ async for thread_message in threads[message.id].history(limit=None):
+ messages.append(DiscordLoader._format_message(thread_message))
+
+ except Exception as e:
+ logging.error(e)
+ await self.close()
+ finally:
+ await self.close()
+
+ intents = discord.Intents.default()
+ intents.message_content = True
+ client = DiscordClient(intents=intents)
+ client.run(self.token)
+
+ meta_data = {
+ "url": channel_id,
+ }
+
+ messages = str(messages)
+
+ doc_id = hashlib.sha256((messages + channel_id).encode()).hexdigest()
+
+ return {
+ "doc_id": doc_id,
+ "data": [
+ {
+ "content": messages,
+ "meta_data": meta_data,
+ }
+ ],
+ }
diff --git a/embedchain/models/data_type.py b/embedchain/models/data_type.py
index c54ee249..647d8ac5 100644
--- a/embedchain/models/data_type.py
+++ b/embedchain/models/data_type.py
@@ -36,6 +36,7 @@ class IndirectDataType(Enum):
SUBSTACK = "substack"
GITHUB = "github"
YOUTUBE_CHANNEL = "youtube_channel"
+ DISCORD = "discord"
class SpecialDataType(Enum):
@@ -71,3 +72,4 @@ class DataType(Enum):
SUBSTACK = IndirectDataType.SUBSTACK.value
GITHUB = IndirectDataType.GITHUB.value
YOUTUBE_CHANNEL = IndirectDataType.YOUTUBE_CHANNEL.value
+ DISCORD = IndirectDataType.DISCORD.value