[Feature] Add Dropbox loader (#1073)

Co-authored-by: Deshraj Yadav <deshrajdry@gmail.com>
This commit is contained in:
Sidharth Mohanty
2023-12-29 23:22:25 +05:30
committed by GitHub
parent a544b4d3ff
commit 404e73af77
9 changed files with 251 additions and 4 deletions

View File

@@ -0,0 +1,27 @@
---
title: '💾 Dropbox'
---
To load folders or files from your Dropbox account, configure the `data_type` parameter as `dropbox` and specify the path to the desired file or folder, starting from the root directory of your Dropbox account.
For Dropbox access, an **access token** is required. Obtain this token by visiting [Dropbox Developer Apps](https://www.dropbox.com/developers/apps). There, create a new app and generate an access token for it.
Ensure your app has the following settings activated:
- In the Permissions section, enable `files.content.read` and `files.metadata.read`.
```python
import os
from embedchain import Pipeline as App
os.environ["DROPBOX_ACCESS_TOKEN"] = "sl.xxx"
os.environ["OPENAI_API_KEY"] = "sk-xxx"
app = App()
# Any path relative to the root of your Dropbox account; pass "" to load the root folder.
app.add("/test", data_type="dropbox")
print(app.query("Which two celebrities are mentioned here?"))
# The two celebrities mentioned in the given context are Elon Musk and Jeff Bezos.
```

View File

@@ -31,6 +31,7 @@ Embedchain comes with built-in support for various data sources. We handle the c
<Card title="📝 Substack" href="/components/data-sources/substack"></Card>
<Card title="🐝 Beehiiv" href="/components/data-sources/beehiiv"></Card>
<Card title="📁 Directory" href="/components/data-sources/directory"></Card>
<Card title="💾 Dropbox" href="/components/data-sources/dropbox"></Card>
</CardGroup>
<br/ >

View File

@@ -131,7 +131,8 @@
"components/data-sources/substack",
"components/data-sources/discord",
"components/data-sources/beehiiv",
"components/data-sources/directory"
"components/data-sources/directory",
"components/data-sources/dropbox"
]
},
"components/data-sources/data-type-handling"

View File

@@ -76,6 +76,7 @@ class DataFormatter(JSONSerializable):
DataType.BEEHIIV: "embedchain.loaders.beehiiv.BeehiivLoader",
DataType.DIRECTORY: "embedchain.loaders.directory_loader.DirectoryLoader",
DataType.SLACK: "embedchain.loaders.slack.SlackLoader",
DataType.DROPBOX: "embedchain.loaders.dropbox.DropboxLoader",
DataType.TEXT_FILE: "embedchain.loaders.text_file.TextFileLoader",
}
@@ -121,6 +122,7 @@ class DataFormatter(JSONSerializable):
DataType.BEEHIIV: "embedchain.chunkers.beehiiv.BeehiivChunker",
DataType.DIRECTORY: "embedchain.chunkers.common_chunker.CommonChunker",
DataType.SLACK: "embedchain.chunkers.common_chunker.CommonChunker",
DataType.DROPBOX: "embedchain.chunkers.common_chunker.CommonChunker",
DataType.TEXT_FILE: "embedchain.chunkers.common_chunker.CommonChunker",
}

View File

@@ -0,0 +1,82 @@
import hashlib
import os
from typing import List
from dropbox.files import FileMetadata
from embedchain.helpers.json_serializable import register_deserializable
from embedchain.loaders.base_loader import BaseLoader
from embedchain.loaders.directory_loader import DirectoryLoader
@register_deserializable
class DropboxLoader(BaseLoader):
    """Load the contents of a Dropbox folder by mirroring it locally.

    The folder is downloaded into a temporary ``dropbox_<id>`` directory in the
    current working directory, parsed with ``DirectoryLoader``, and the
    temporary directory is removed again afterwards.
    """

    def __init__(self):
        """Create a Dropbox client from the ``DROPBOX_ACCESS_TOKEN`` env variable.

        Raises:
            ValueError: If ``DROPBOX_ACCESS_TOKEN`` is unset or empty, or if
                Dropbox rejects the token with an ``AuthError``.
            ImportError: If the optional ``dropbox`` package cannot be imported.
        """
        access_token = os.environ.get("DROPBOX_ACCESS_TOKEN")
        if not access_token:
            raise ValueError("Please set the `DROPBOX_ACCESS_TOKEN` environment variable.")

        try:
            from dropbox import Dropbox, exceptions
        except ImportError as exc:
            raise ImportError(
                'Dropbox requires extra dependencies. Install with `pip install --upgrade "embedchain[dropbox]"`'
            ) from exc

        try:
            dbx = Dropbox(access_token)
            # Cheap authenticated call used only to validate the token up front.
            dbx.users_get_current_account()
        except exceptions.AuthError as ex:
            raise ValueError("Invalid Dropbox access token. Please verify your token and try again.") from ex
        self.dbx = dbx

    def _download_folder(self, path: str, local_root: str) -> List[FileMetadata]:
        """Download a folder from Dropbox and save it preserving the directory structure.

        Args:
            path: Dropbox path of the folder to download.
            local_root: Existing local directory that receives the folder's contents.

        Returns:
            The entries listed directly under ``path`` (not those of sub-folders).
        """
        # NOTE(review): only the first page returned by `files_list_folder` is
        # read; very large folders may require `files_list_folder_continue` —
        # confirm against the Dropbox SDK documentation.
        entries = self.dbx.files_list_folder(path).entries
        for entry in entries:
            remote_path = f"{path}/{entry.name}"
            local_path = os.path.join(local_root, entry.name)
            if isinstance(entry, FileMetadata):
                self.dbx.files_download_to_file(local_path, remote_path)
            else:
                # Anything that is not a file is treated as a sub-folder and mirrored recursively.
                os.makedirs(local_path, exist_ok=True)
                self._download_folder(remote_path, local_path)
        return entries

    def _generate_dir_id_from_all_paths(self, path: str) -> str:
        """Return a SHA-256 hex digest derived from the entries directly under ``path``.

        Only the immediate children of ``path`` contribute to the digest;
        nested entries are not listed here.
        """
        entries = self.dbx.files_list_folder(path).entries
        paths = [f"{path}/{entry.name}" for entry in entries]
        return hashlib.sha256("".join(paths).encode()).hexdigest()

    def load_data(self, path: str):
        """Load data from a Dropbox folder path, preserving the folder structure.

        Args:
            path: Dropbox path of the folder to load.

        Returns:
            A dict with ``doc_id`` (SHA-256 hex digest of ``path``) and ``data``
            (the ``"data"`` value produced by ``DirectoryLoader.load_data``).
        """
        root_dir = f"dropbox_{self._generate_dir_id_from_all_paths(path)}"
        os.makedirs(root_dir, exist_ok=True)
        try:
            self._download_folder(path, root_dir)
            data = DirectoryLoader().load_data(root_dir)["data"]
        finally:
            # Always remove the local mirror, even when download or parsing fails.
            self._clean_directory(root_dir)

        return {
            "doc_id": hashlib.sha256(path.encode()).hexdigest(),
            "data": data,
        }

    def _clean_directory(self, dir_path):
        """Recursively delete a directory and its contents."""
        for item in os.listdir(dir_path):
            item_path = os.path.join(dir_path, item)
            if os.path.isdir(item_path):
                self._clean_directory(item_path)
            else:
                os.remove(item_path)
        # The directory is empty at this point, so it can be removed itself.
        os.rmdir(dir_path)

View File

@@ -37,6 +37,7 @@ class IndirectDataType(Enum):
BEEHIIV = "beehiiv"
DIRECTORY = "directory"
SLACK = "slack"
DROPBOX = "dropbox"
TEXT_FILE = "text_file"
@@ -74,4 +75,5 @@ class DataType(Enum):
BEEHIIV = IndirectDataType.BEEHIIV.value
DIRECTORY = IndirectDataType.DIRECTORY.value
SLACK = IndirectDataType.SLACK.value
DROPBOX = IndirectDataType.DROPBOX.value
TEXT_FILE = IndirectDataType.TEXT_FILE.value

51
poetry.lock generated
View File

@@ -1,4 +1,4 @@
# This file is automatically @generated by Poetry 1.6.1 and should not be changed by hand.
# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand.
[[package]]
name = "aiofiles"
@@ -1269,6 +1269,23 @@ files = [
{file = "docx2txt-0.8.tar.gz", hash = "sha256:2c06d98d7cfe2d3947e5760a57d924e3ff07745b379c8737723922e7009236e5"},
]
[[package]]
name = "dropbox"
version = "11.36.2"
description = "Official Dropbox API Client"
optional = true
python-versions = "*"
files = [
{file = "dropbox-11.36.2-py2-none-any.whl", hash = "sha256:afbfce2589b777ade1deaa2c186f3650c41e41cea0f1fac497a75112a171f8e2"},
{file = "dropbox-11.36.2-py3-none-any.whl", hash = "sha256:a21e4d2bcbeb1d8067ff87969aea48792c9a8266182491153feff2be9c1b9c8f"},
{file = "dropbox-11.36.2.tar.gz", hash = "sha256:d48d3d16d486c78b11c14a1c4a28a2611fbf5a0d0a358b861bfd9482e603c500"},
]
[package.dependencies]
requests = ">=2.16.2"
six = ">=1.12.0"
stone = ">=2"
[[package]]
name = "duckduckgo-search"
version = "3.9.3"
@@ -4165,10 +4182,10 @@ files = [
[package.dependencies]
numpy = [
{version = ">=1.21.0", markers = "python_version == \"3.9\" and platform_system == \"Darwin\" and platform_machine == \"arm64\""},
{version = ">=1.21.4", markers = "python_version >= \"3.10\" and platform_system == \"Darwin\" and python_version < \"3.11\""},
{version = ">=1.21.2", markers = "platform_system != \"Darwin\" and python_version >= \"3.10\" and python_version < \"3.11\""},
{version = ">=1.19.3", markers = "platform_system == \"Linux\" and platform_machine == \"aarch64\" and python_version >= \"3.8\" and python_version < \"3.10\" or python_version > \"3.9\" and python_version < \"3.10\" or python_version >= \"3.9\" and platform_system != \"Darwin\" and python_version < \"3.10\" or python_version >= \"3.9\" and platform_machine != \"arm64\" and python_version < \"3.10\""},
{version = ">=1.21.0", markers = "python_version == \"3.9\" and platform_system == \"Darwin\" and platform_machine == \"arm64\""},
{version = ">=1.23.5", markers = "python_version >= \"3.11\""},
]
@@ -4577,6 +4594,17 @@ files = [
dev = ["pre-commit", "tox"]
testing = ["pytest", "pytest-benchmark"]
[[package]]
name = "ply"
version = "3.11"
description = "Python Lex & Yacc"
optional = true
python-versions = "*"
files = [
{file = "ply-3.11-py2.py3-none-any.whl", hash = "sha256:096f9b8350b65ebd2fd1346b12452efe5b9607f7482813ffca50c22722a807ce"},
{file = "ply-3.11.tar.gz", hash = "sha256:00c7c1aaa88358b9c765b6d3000c6eec0ba42abca5351b095321aef446081da3"},
]
[[package]]
name = "portalocker"
version = "2.8.2"
@@ -6625,6 +6653,22 @@ typing-extensions = {version = ">=3.10.0", markers = "python_version < \"3.10\""
[package.extras]
full = ["httpx (>=0.22.0)", "itsdangerous", "jinja2", "python-multipart", "pyyaml"]
[[package]]
name = "stone"
version = "3.3.1"
description = "Stone is an interface description language (IDL) for APIs."
optional = true
python-versions = "*"
files = [
{file = "stone-3.3.1-py2-none-any.whl", hash = "sha256:cd2f7f9056fc39b16c8fd46a26971dc5ccd30b5c2c246566cd2c0dd27ff96609"},
{file = "stone-3.3.1-py3-none-any.whl", hash = "sha256:e15866fad249c11a963cce3bdbed37758f2e88c8ff4898616bc0caeb1e216047"},
{file = "stone-3.3.1.tar.gz", hash = "sha256:4ef0397512f609757975f7ec09b35639d72ba7e3e17ce4ddf399578346b4cb50"},
]
[package.dependencies]
ply = ">=3.4"
six = ">=1.12.0"
[[package]]
name = "sympy"
version = "1.12"
@@ -8115,6 +8159,7 @@ cohere = ["cohere"]
community = ["llama-hub"]
dataloaders = ["docx2txt", "duckduckgo-search", "pytube", "sentence-transformers", "unstructured", "youtube-transcript-api"]
discord = ["discord"]
dropbox = ["dropbox"]
elasticsearch = ["elasticsearch"]
github = ["PyGithub", "gitpython"]
gmail = ["llama-hub", "requests"]
@@ -8144,4 +8189,4 @@ youtube = ["youtube-transcript-api", "yt_dlp"]
[metadata]
lock-version = "2.0"
python-versions = ">=3.9,<3.12"
content-hash = "bc763595ae5e903a5a819a2d3f31f045fac52555f72e44ead9df0e5e191955aa"
content-hash = "335c42c91a2b5e4a1c3d8a7c39dee8665fd1eee0410e1bc6cb6cb1d6f6722445"

View File

@@ -147,6 +147,7 @@ newspaper3k = { version = "^0.2.8", optional = true }
listparser = { version = "^0.19", optional = true }
google-generativeai = { version = "^0.3.0", optional = true }
modal = { version = "^0.56.4329", optional = true }
dropbox = { version = "^11.36.2", optional = true }
[tool.poetry.group.dev.dependencies]
black = "^23.3.0"
@@ -214,6 +215,7 @@ rss_feed = [
]
google = ["google-generativeai"]
modal = ["modal"]
dropbox = ["dropbox"]
[tool.poetry.group.docs.dependencies]

View File

@@ -0,0 +1,85 @@
import os
from unittest.mock import MagicMock
import pytest
from dropbox.files import FileMetadata
from embedchain.loaders.dropbox import DropboxLoader
@pytest.fixture
def setup_dropbox_loader(mocker):
    """Provide a ``DropboxLoader`` backed by a mocked Dropbox client.

    Returns a ``(loader, mock_dbx)`` tuple, where ``mock_dbx`` is the object
    returned by the patched ``dropbox.Dropbox`` constructor.
    """
    mock_dropbox = mocker.patch("dropbox.Dropbox")
    mock_dbx = mocker.MagicMock()
    mock_dropbox.return_value = mock_dbx

    # patch.dict restores the previous environment on teardown, including when
    # DropboxLoader() raises or the variable already had a value.
    mocker.patch.dict(os.environ, {"DROPBOX_ACCESS_TOKEN": "test_token"})

    loader = DropboxLoader()
    return loader, mock_dbx
def test_initialization(setup_dropbox_loader):
    """The loader stores the Dropbox client and validates the token on construction."""
    loader, mock_dbx = setup_dropbox_loader
    assert loader.dbx is mock_dbx
    mock_dbx.users_get_current_account.assert_called_once_with()
def test_download_folder(setup_dropbox_loader, mocker):
    """A folder containing a single file downloads that file and returns the listing."""
    loader, mock_dbx = setup_dropbox_loader
    mocker.patch("os.makedirs")
    mocker.patch("os.path.join", return_value="mock/path")

    mock_file_metadata = mocker.MagicMock(spec=FileMetadata)
    # The `name=` constructor argument names the mock itself, so the attribute is set explicitly.
    mock_file_metadata.name = "file.txt"
    mock_dbx.files_list_folder.return_value.entries = [mock_file_metadata]

    entries = loader._download_folder("path/to/folder", "local_root")

    assert entries == [mock_file_metadata]
    mock_dbx.files_list_folder.assert_called_once_with("path/to/folder")
    mock_dbx.files_download_to_file.assert_called_once_with("mock/path", "path/to/folder/file.txt")
def test_generate_dir_id_from_all_paths(setup_dropbox_loader, mocker):
    """The directory ID is the SHA-256 hex digest of the listed entry paths."""
    import hashlib

    loader, mock_dbx = setup_dropbox_loader
    mock_file_metadata = mocker.MagicMock(spec=FileMetadata)
    # The `name=` constructor argument names the mock itself, so the attribute is set explicitly.
    mock_file_metadata.name = "file.txt"
    mock_dbx.files_list_folder.return_value.entries = [mock_file_metadata]

    dir_id = loader._generate_dir_id_from_all_paths("path/to/folder")

    assert dir_id == hashlib.sha256(b"path/to/folder/file.txt").hexdigest()
    assert len(dir_id) == 64
def test_clean_directory(setup_dropbox_loader, mocker):
    """Cleaning a flat directory removes each file and then the directory itself."""
    loader, _ = setup_dropbox_loader
    mocker.patch("os.listdir", return_value=["file1", "file2"])
    # Force the "plain file" branch instead of depending on the real filesystem.
    mocker.patch("os.path.isdir", return_value=False)
    mock_remove = mocker.patch("os.remove")
    mock_rmdir = mocker.patch("os.rmdir")

    loader._clean_directory("path/to/folder")

    assert mock_remove.call_args_list == [
        mocker.call(os.path.join("path/to/folder", "file1")),
        mocker.call(os.path.join("path/to/folder", "file2")),
    ]
    mock_rmdir.assert_called_once_with("path/to/folder")
def test_load_data(mocker, setup_dropbox_loader, tmp_path, monkeypatch):
    """load_data mirrors the folder, returns the parsed data and removes the local mirror."""
    loader = setup_dropbox_loader[0]
    # The loader creates its temporary directory relative to the CWD, so run
    # inside tmp_path to keep the working tree clean.
    monkeypatch.chdir(tmp_path)

    mock_file_metadata = MagicMock(spec=FileMetadata)
    # The `name=` constructor argument names the mock itself, so the attribute is set explicitly.
    mock_file_metadata.name = "file.txt"
    mocker.patch.object(loader.dbx, "files_list_folder", return_value=MagicMock(entries=[mock_file_metadata]))
    mocker.patch.object(loader.dbx, "files_download_to_file")

    # Mock DirectoryLoader
    mock_data = {"data": "test_data"}
    mocker.patch("embedchain.loaders.directory_loader.DirectoryLoader.load_data", return_value=mock_data)

    mocker.patch.object(loader, "_generate_dir_id_from_all_paths", return_value="test")

    result = loader.load_data("path/to/folder")

    assert result == {"doc_id": mocker.ANY, "data": "test_data"}
    loader.dbx.files_list_folder.assert_called_once_with("path/to/folder")
    loader.dbx.files_download_to_file.assert_called_once_with(
        os.path.join("dropbox_test", "file.txt"), "path/to/folder/file.txt"
    )
    # The temporary mirror must be cleaned up after loading.
    assert not (tmp_path / "dropbox_test").exists()