diff --git a/docs/components/data-sources/dropbox.mdx b/docs/components/data-sources/dropbox.mdx
new file mode 100644
index 00000000..dcfb0ac6
--- /dev/null
+++ b/docs/components/data-sources/dropbox.mdx
@@ -0,0 +1,27 @@
+---
+title: '💾 Dropbox'
+---
+
+To load files from your Dropbox account, set the `data_type` parameter to `dropbox` and pass the path of the folder you want to load, starting from the root directory of your Dropbox account. The folder is downloaded recursively, including its sub-folders.
+
+For Dropbox access, an **access token** is required. Obtain this token by visiting [Dropbox Developer Apps](https://www.dropbox.com/developers/apps). There, create a new app and generate an access token for it.
+
+Ensure your app has the following settings activated:
+
+- In the Permissions section, enable `files.content.read` and `files.metadata.read`.
+
+```python
+import os
+from embedchain import Pipeline as App
+
+os.environ["DROPBOX_ACCESS_TOKEN"] = "sl.xxx"
+os.environ["OPENAI_API_KEY"] = "sk-xxx"
+
+app = App()
+
+# Any folder path, starting from the root of your Dropbox account; use "" for the root folder itself.
+app.add("/test", data_type="dropbox")
+
+print(app.query("Which two celebrities are mentioned here?"))
+# The two celebrities mentioned in the given context are Elon Musk and Jeff Bezos.
+```
diff --git a/docs/components/data-sources/overview.mdx b/docs/components/data-sources/overview.mdx
index 385ce7e9..3c0072ee 100644
--- a/docs/components/data-sources/overview.mdx
+++ b/docs/components/data-sources/overview.mdx
@@ -31,6 +31,7 @@ Embedchain comes with built-in support for various data sources. We handle the c
+
diff --git a/docs/mint.json b/docs/mint.json
index b34b3214..ca9ce9ad 100644
--- a/docs/mint.json
+++ b/docs/mint.json
@@ -131,7 +131,8 @@
"components/data-sources/substack",
"components/data-sources/discord",
"components/data-sources/beehiiv",
- "components/data-sources/directory"
+ "components/data-sources/directory",
+ "components/data-sources/dropbox"
]
},
"components/data-sources/data-type-handling"
diff --git a/embedchain/data_formatter/data_formatter.py b/embedchain/data_formatter/data_formatter.py
index 3f9a2edd..36f5c719 100644
--- a/embedchain/data_formatter/data_formatter.py
+++ b/embedchain/data_formatter/data_formatter.py
@@ -76,6 +76,7 @@ class DataFormatter(JSONSerializable):
DataType.BEEHIIV: "embedchain.loaders.beehiiv.BeehiivLoader",
DataType.DIRECTORY: "embedchain.loaders.directory_loader.DirectoryLoader",
DataType.SLACK: "embedchain.loaders.slack.SlackLoader",
+ DataType.DROPBOX: "embedchain.loaders.dropbox.DropboxLoader",
DataType.TEXT_FILE: "embedchain.loaders.text_file.TextFileLoader",
}
@@ -121,6 +122,7 @@ class DataFormatter(JSONSerializable):
DataType.BEEHIIV: "embedchain.chunkers.beehiiv.BeehiivChunker",
DataType.DIRECTORY: "embedchain.chunkers.common_chunker.CommonChunker",
DataType.SLACK: "embedchain.chunkers.common_chunker.CommonChunker",
+ DataType.DROPBOX: "embedchain.chunkers.common_chunker.CommonChunker",
DataType.TEXT_FILE: "embedchain.chunkers.common_chunker.CommonChunker",
}
diff --git a/embedchain/loaders/dropbox.py b/embedchain/loaders/dropbox.py
new file mode 100644
index 00000000..c4b01f14
--- /dev/null
+++ b/embedchain/loaders/dropbox.py
@@ -0,0 +1,116 @@
+import hashlib
+import os
+from typing import List
+
+from embedchain.helpers.json_serializable import register_deserializable
+from embedchain.loaders.base_loader import BaseLoader
+from embedchain.loaders.directory_loader import DirectoryLoader
+
+try:
+    from dropbox.files import FileMetadata
+except ImportError:
+    raise ImportError(
+        'Dropbox requires extra dependencies. Install with `pip install --upgrade "embedchain[dropbox]"`'
+    ) from None
+
+
+@register_deserializable
+class DropboxLoader(BaseLoader):
+    """Load the contents of a Dropbox folder by mirroring it to local disk.
+
+    The folder is downloaded into a working directory, read with
+    ``DirectoryLoader`` and the working directory is deleted afterwards.
+    """
+
+    def __init__(self):
+        """Create a Dropbox client from ``DROPBOX_ACCESS_TOKEN`` and validate the token.
+
+        Raises:
+            ValueError: If the environment variable is unset or empty, or if
+                Dropbox rejects the token.
+            ImportError: If the ``dropbox`` package is not installed.
+        """
+        access_token = os.environ.get("DROPBOX_ACCESS_TOKEN")
+        if not access_token:
+            raise ValueError("Please set the `DROPBOX_ACCESS_TOKEN` environment variable.")
+        try:
+            from dropbox import Dropbox, exceptions
+        except ImportError:
+            raise ImportError(
+                'Dropbox requires extra dependencies. Install with `pip install --upgrade "embedchain[dropbox]"`'
+            ) from None
+
+        try:
+            dbx = Dropbox(access_token)
+            # Authenticated round trip so a bad token fails here, not on first use.
+            dbx.users_get_current_account()
+        except exceptions.AuthError as ex:
+            raise ValueError("Invalid Dropbox access token. Please verify your token and try again.") from ex
+        self.dbx = dbx
+
+    def _download_folder(self, path: str, local_root: str) -> List[FileMetadata]:
+        """Recursively download the Dropbox folder ``path`` into ``local_root``.
+
+        Args:
+            path: Dropbox folder path; entry names are appended as ``{path}/{name}``.
+            local_root: Local directory that receives the files. It must already exist.
+
+        Returns:
+            The entries listed directly under ``path``.
+        """
+        # NOTE(review): only the first listing page is consumed; if Dropbox reports
+        # `has_more`, the remaining entries are skipped. Confirm and paginate if needed.
+        entries = self.dbx.files_list_folder(path).entries
+        for entry in entries:
+            local_path = os.path.join(local_root, entry.name)
+            remote_path = f"{path}/{entry.name}"
+            if isinstance(entry, FileMetadata):
+                self.dbx.files_download_to_file(local_path, remote_path)
+            else:
+                # Anything that is not a file is treated as a sub-folder.
+                os.makedirs(local_path, exist_ok=True)
+                self._download_folder(remote_path, local_path)
+        return entries
+
+    def _generate_dir_id_from_all_paths(self, path: str) -> str:
+        """Return a SHA-256 hex digest of the paths listed directly under ``path``."""
+        entries = self.dbx.files_list_folder(path).entries
+        paths = [f"{path}/{entry.name}" for entry in entries]
+        return hashlib.sha256("".join(paths).encode()).hexdigest()
+
+    def load_data(self, path: str):
+        """Load data from a Dropbox folder path, preserving the folder structure.
+
+        Args:
+            path: Dropbox folder path, starting from the account root.
+
+        Returns:
+            A dict with ``doc_id`` (SHA-256 hex digest of ``path``) and ``data``
+            (the ``"data"`` value produced by ``DirectoryLoader``).
+        """
+        # The working directory name is derived from the folder listing, so the
+        # same listing always maps to the same local directory name.
+        root_dir = f"dropbox_{self._generate_dir_id_from_all_paths(path)}"
+        os.makedirs(root_dir, exist_ok=True)
+
+        try:
+            self._download_folder(path, root_dir)
+            data = DirectoryLoader().load_data(root_dir)["data"]
+        finally:
+            # Remove the local mirror even when downloading or parsing fails.
+            self._clean_directory(root_dir)
+
+        return {
+            "doc_id": hashlib.sha256(path.encode()).hexdigest(),
+            "data": data,
+        }
+
+    def _clean_directory(self, dir_path):
+        """Recursively delete a directory and its contents."""
+        for item in os.listdir(dir_path):
+            item_path = os.path.join(dir_path, item)
+            if os.path.isdir(item_path):
+                self._clean_directory(item_path)
+            else:
+                os.remove(item_path)
+        os.rmdir(dir_path)
diff --git a/embedchain/models/data_type.py b/embedchain/models/data_type.py
index 3ee68c84..fb9da2cd 100644
--- a/embedchain/models/data_type.py
+++ b/embedchain/models/data_type.py
@@ -37,6 +37,7 @@ class IndirectDataType(Enum):
BEEHIIV = "beehiiv"
DIRECTORY = "directory"
SLACK = "slack"
+ DROPBOX = "dropbox"
TEXT_FILE = "text_file"
@@ -74,4 +75,5 @@ class DataType(Enum):
BEEHIIV = IndirectDataType.BEEHIIV.value
DIRECTORY = IndirectDataType.DIRECTORY.value
SLACK = IndirectDataType.SLACK.value
+ DROPBOX = IndirectDataType.DROPBOX.value
TEXT_FILE = IndirectDataType.TEXT_FILE.value
diff --git a/poetry.lock b/poetry.lock
index 89197d7c..5202837f 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 1.6.1 and should not be changed by hand.
+# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand.
[[package]]
name = "aiofiles"
@@ -1269,6 +1269,23 @@ files = [
{file = "docx2txt-0.8.tar.gz", hash = "sha256:2c06d98d7cfe2d3947e5760a57d924e3ff07745b379c8737723922e7009236e5"},
]
+[[package]]
+name = "dropbox"
+version = "11.36.2"
+description = "Official Dropbox API Client"
+optional = true
+python-versions = "*"
+files = [
+ {file = "dropbox-11.36.2-py2-none-any.whl", hash = "sha256:afbfce2589b777ade1deaa2c186f3650c41e41cea0f1fac497a75112a171f8e2"},
+ {file = "dropbox-11.36.2-py3-none-any.whl", hash = "sha256:a21e4d2bcbeb1d8067ff87969aea48792c9a8266182491153feff2be9c1b9c8f"},
+ {file = "dropbox-11.36.2.tar.gz", hash = "sha256:d48d3d16d486c78b11c14a1c4a28a2611fbf5a0d0a358b861bfd9482e603c500"},
+]
+
+[package.dependencies]
+requests = ">=2.16.2"
+six = ">=1.12.0"
+stone = ">=2"
+
[[package]]
name = "duckduckgo-search"
version = "3.9.3"
@@ -4165,10 +4182,10 @@ files = [
[package.dependencies]
numpy = [
+ {version = ">=1.21.0", markers = "python_version == \"3.9\" and platform_system == \"Darwin\" and platform_machine == \"arm64\""},
{version = ">=1.21.4", markers = "python_version >= \"3.10\" and platform_system == \"Darwin\" and python_version < \"3.11\""},
{version = ">=1.21.2", markers = "platform_system != \"Darwin\" and python_version >= \"3.10\" and python_version < \"3.11\""},
{version = ">=1.19.3", markers = "platform_system == \"Linux\" and platform_machine == \"aarch64\" and python_version >= \"3.8\" and python_version < \"3.10\" or python_version > \"3.9\" and python_version < \"3.10\" or python_version >= \"3.9\" and platform_system != \"Darwin\" and python_version < \"3.10\" or python_version >= \"3.9\" and platform_machine != \"arm64\" and python_version < \"3.10\""},
- {version = ">=1.21.0", markers = "python_version == \"3.9\" and platform_system == \"Darwin\" and platform_machine == \"arm64\""},
{version = ">=1.23.5", markers = "python_version >= \"3.11\""},
]
@@ -4577,6 +4594,17 @@ files = [
dev = ["pre-commit", "tox"]
testing = ["pytest", "pytest-benchmark"]
+[[package]]
+name = "ply"
+version = "3.11"
+description = "Python Lex & Yacc"
+optional = true
+python-versions = "*"
+files = [
+ {file = "ply-3.11-py2.py3-none-any.whl", hash = "sha256:096f9b8350b65ebd2fd1346b12452efe5b9607f7482813ffca50c22722a807ce"},
+ {file = "ply-3.11.tar.gz", hash = "sha256:00c7c1aaa88358b9c765b6d3000c6eec0ba42abca5351b095321aef446081da3"},
+]
+
[[package]]
name = "portalocker"
version = "2.8.2"
@@ -6625,6 +6653,22 @@ typing-extensions = {version = ">=3.10.0", markers = "python_version < \"3.10\""
[package.extras]
full = ["httpx (>=0.22.0)", "itsdangerous", "jinja2", "python-multipart", "pyyaml"]
+[[package]]
+name = "stone"
+version = "3.3.1"
+description = "Stone is an interface description language (IDL) for APIs."
+optional = true
+python-versions = "*"
+files = [
+ {file = "stone-3.3.1-py2-none-any.whl", hash = "sha256:cd2f7f9056fc39b16c8fd46a26971dc5ccd30b5c2c246566cd2c0dd27ff96609"},
+ {file = "stone-3.3.1-py3-none-any.whl", hash = "sha256:e15866fad249c11a963cce3bdbed37758f2e88c8ff4898616bc0caeb1e216047"},
+ {file = "stone-3.3.1.tar.gz", hash = "sha256:4ef0397512f609757975f7ec09b35639d72ba7e3e17ce4ddf399578346b4cb50"},
+]
+
+[package.dependencies]
+ply = ">=3.4"
+six = ">=1.12.0"
+
[[package]]
name = "sympy"
version = "1.12"
@@ -8115,6 +8159,7 @@ cohere = ["cohere"]
community = ["llama-hub"]
dataloaders = ["docx2txt", "duckduckgo-search", "pytube", "sentence-transformers", "unstructured", "youtube-transcript-api"]
discord = ["discord"]
+dropbox = ["dropbox"]
elasticsearch = ["elasticsearch"]
github = ["PyGithub", "gitpython"]
gmail = ["llama-hub", "requests"]
@@ -8144,4 +8189,4 @@ youtube = ["youtube-transcript-api", "yt_dlp"]
[metadata]
lock-version = "2.0"
python-versions = ">=3.9,<3.12"
-content-hash = "bc763595ae5e903a5a819a2d3f31f045fac52555f72e44ead9df0e5e191955aa"
+content-hash = "335c42c91a2b5e4a1c3d8a7c39dee8665fd1eee0410e1bc6cb6cb1d6f6722445"
diff --git a/pyproject.toml b/pyproject.toml
index 1f58426e..0d75e21a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -147,6 +147,7 @@ newspaper3k = { version = "^0.2.8", optional = true }
listparser = { version = "^0.19", optional = true }
google-generativeai = { version = "^0.3.0", optional = true }
modal = { version = "^0.56.4329", optional = true }
+dropbox = { version = "^11.36.2", optional = true }
[tool.poetry.group.dev.dependencies]
black = "^23.3.0"
@@ -214,6 +215,7 @@ rss_feed = [
]
google = ["google-generativeai"]
modal = ["modal"]
+dropbox = ["dropbox"]
[tool.poetry.group.docs.dependencies]
diff --git a/tests/loaders/test_dropbox.py b/tests/loaders/test_dropbox.py
new file mode 100644
index 00000000..c7e81673
--- /dev/null
+++ b/tests/loaders/test_dropbox.py
@@ -0,0 +1,103 @@
+import hashlib
+import os
+from unittest.mock import MagicMock
+
+import pytest
+from dropbox.files import FileMetadata
+
+from embedchain.loaders.dropbox import DropboxLoader
+
+
+def _file_entry(name):
+    """Build a stand-in for a Dropbox file entry with the given name.
+
+    ``MagicMock(name=...)`` only labels the mock itself, so the ``name``
+    attribute has to be assigned after construction.
+    """
+    entry = MagicMock(spec=FileMetadata)
+    entry.name = name
+    return entry
+
+
+@pytest.fixture
+def setup_dropbox_loader(mocker, monkeypatch):
+    """Return a ``DropboxLoader`` wired to a mocked Dropbox client."""
+    mock_dbx = mocker.MagicMock()
+    mocker.patch("dropbox.Dropbox", return_value=mock_dbx)
+    # monkeypatch restores the previous value (or absence) after the test.
+    monkeypatch.setenv("DROPBOX_ACCESS_TOKEN", "test_token")
+
+    return DropboxLoader(), mock_dbx
+
+
+def test_initialization(setup_dropbox_loader):
+    """Test initialization of DropboxLoader."""
+    loader, mock_dbx = setup_dropbox_loader
+    assert loader.dbx is mock_dbx
+    mock_dbx.users_get_current_account.assert_called_once_with()
+
+
+def test_initialization_without_token(monkeypatch):
+    """Test that a missing access token is rejected."""
+    monkeypatch.delenv("DROPBOX_ACCESS_TOKEN", raising=False)
+    with pytest.raises(ValueError):
+        DropboxLoader()
+
+
+def test_download_folder(setup_dropbox_loader):
+    """Test downloading a folder."""
+    loader, mock_dbx = setup_dropbox_loader
+    entry = _file_entry("file.txt")
+    mock_dbx.files_list_folder.return_value.entries = [entry]
+
+    entries = loader._download_folder("path/to/folder", "local_root")
+
+    assert entries == [entry]
+    mock_dbx.files_download_to_file.assert_called_once_with(
+        os.path.join("local_root", "file.txt"), "path/to/folder/file.txt"
+    )
+
+
+def test_generate_dir_id_from_all_paths(setup_dropbox_loader):
+    """Test directory ID generation."""
+    loader, mock_dbx = setup_dropbox_loader
+    mock_dbx.files_list_folder.return_value.entries = [_file_entry("file.txt")]
+
+    dir_id = loader._generate_dir_id_from_all_paths("path/to/folder")
+
+    assert dir_id == hashlib.sha256(b"path/to/folder/file.txt").hexdigest()
+
+
+def test_clean_directory(setup_dropbox_loader, tmp_path):
+    """Test cleaning up a directory."""
+    loader, _ = setup_dropbox_loader
+    root = tmp_path / "root"
+    (root / "nested").mkdir(parents=True)
+    (root / "file1").write_text("a")
+    (root / "nested" / "file2").write_text("b")
+
+    loader._clean_directory(str(root))
+
+    assert not root.exists()
+
+
+def test_load_data(mocker, monkeypatch, setup_dropbox_loader, tmp_path):
+    """Test loading a folder end to end with mocked Dropbox and DirectoryLoader."""
+    loader, mock_dbx = setup_dropbox_loader
+    # The loader creates its working directory relative to the CWD; keep it inside tmp_path.
+    monkeypatch.chdir(tmp_path)
+
+    mock_dbx.files_list_folder.return_value = MagicMock(entries=[_file_entry("file.txt")])
+    mocker.patch(
+        "embedchain.loaders.directory_loader.DirectoryLoader.load_data", return_value={"data": "test_data"}
+    )
+    mocker.patch.object(loader, "_generate_dir_id_from_all_paths", return_value="test")
+
+    result = loader.load_data("path/to/folder")
+
+    assert result == {"doc_id": hashlib.sha256(b"path/to/folder").hexdigest(), "data": "test_data"}
+    mock_dbx.files_list_folder.assert_called_once_with("path/to/folder")
+    mock_dbx.files_download_to_file.assert_called_once_with(
+        os.path.join("dropbox_test", "file.txt"), "path/to/folder/file.txt"
+    )
+    assert not (tmp_path / "dropbox_test").exists()