[Feature] Add Dropbox loader (#1073)
Co-authored-by: Deshraj Yadav <deshrajdry@gmail.com>
This commit is contained in:
27
docs/components/data-sources/dropbox.mdx
Normal file
27
docs/components/data-sources/dropbox.mdx
Normal file
@@ -0,0 +1,27 @@
|
||||
---
|
||||
title: '💾 Dropbox'
|
||||
---
|
||||
|
||||
To load folders or files from your Dropbox account, configure the `data_type` parameter as `dropbox` and specify the path to the desired file or folder, starting from the root directory of your Dropbox account.
|
||||
|
||||
For Dropbox access, an **access token** is required. Obtain this token by visiting [Dropbox Developer Apps](https://www.dropbox.com/developers/apps). There, create a new app and generate an access token for it.
|
||||
|
||||
Ensure your app has the following settings activated:
|
||||
|
||||
- In the Permissions section, enable `files.content.read` and `files.metadata.read`.
|
||||
|
||||
```python
|
||||
import os
|
||||
from embedchain import Pipeline as App
|
||||
|
||||
os.environ["DROPBOX_ACCESS_TOKEN"] = "sl.xxx"
|
||||
os.environ["OPENAI_API_KEY"] = "sk-xxx"
|
||||
|
||||
app = App()
|
||||
|
||||
# Any path relative to the root of your Dropbox account; use "" for the root folder itself.
|
||||
app.add("/test", data_type="dropbox")
|
||||
|
||||
print(app.query("Which two celebrities are mentioned here?"))
|
||||
# The two celebrities mentioned in the given context are Elon Musk and Jeff Bezos.
|
||||
```
|
||||
@@ -31,6 +31,7 @@ Embedchain comes with built-in support for various data sources. We handle the c
|
||||
<Card title="📝 Substack" href="/components/data-sources/substack"></Card>
|
||||
<Card title="🐝 Beehiiv" href="/components/data-sources/beehiiv"></Card>
|
||||
<Card title="📁 Directory" href="/components/data-sources/directory"></Card>
|
||||
<Card title="💾 Dropbox" href="/components/data-sources/dropbox"></Card>
|
||||
</CardGroup>
|
||||
|
||||
<br/ >
|
||||
|
||||
@@ -131,7 +131,8 @@
|
||||
"components/data-sources/substack",
|
||||
"components/data-sources/discord",
|
||||
"components/data-sources/beehiiv",
|
||||
"components/data-sources/directory"
|
||||
"components/data-sources/directory",
|
||||
"components/data-sources/dropbox"
|
||||
]
|
||||
},
|
||||
"components/data-sources/data-type-handling"
|
||||
|
||||
@@ -76,6 +76,7 @@ class DataFormatter(JSONSerializable):
|
||||
DataType.BEEHIIV: "embedchain.loaders.beehiiv.BeehiivLoader",
|
||||
DataType.DIRECTORY: "embedchain.loaders.directory_loader.DirectoryLoader",
|
||||
DataType.SLACK: "embedchain.loaders.slack.SlackLoader",
|
||||
DataType.DROPBOX: "embedchain.loaders.dropbox.DropboxLoader",
|
||||
DataType.TEXT_FILE: "embedchain.loaders.text_file.TextFileLoader",
|
||||
}
|
||||
|
||||
@@ -121,6 +122,7 @@ class DataFormatter(JSONSerializable):
|
||||
DataType.BEEHIIV: "embedchain.chunkers.beehiiv.BeehiivChunker",
|
||||
DataType.DIRECTORY: "embedchain.chunkers.common_chunker.CommonChunker",
|
||||
DataType.SLACK: "embedchain.chunkers.common_chunker.CommonChunker",
|
||||
DataType.DROPBOX: "embedchain.chunkers.common_chunker.CommonChunker",
|
||||
DataType.TEXT_FILE: "embedchain.chunkers.common_chunker.CommonChunker",
|
||||
}
|
||||
|
||||
|
||||
82
embedchain/loaders/dropbox.py
Normal file
82
embedchain/loaders/dropbox.py
Normal file
@@ -0,0 +1,82 @@
|
||||
import hashlib
|
||||
import os
|
||||
from typing import List
|
||||
|
||||
from dropbox.files import FileMetadata
|
||||
|
||||
from embedchain.helpers.json_serializable import register_deserializable
|
||||
from embedchain.loaders.base_loader import BaseLoader
|
||||
from embedchain.loaders.directory_loader import DirectoryLoader
|
||||
|
||||
|
||||
@register_deserializable
class DropboxLoader(BaseLoader):
    """Loader that mirrors a Dropbox folder to a local scratch directory and ingests it.

    The folder is downloaded with the Dropbox SDK, handed to ``DirectoryLoader`` for
    parsing, and the local copy is deleted afterwards. Authentication relies on the
    ``DROPBOX_ACCESS_TOKEN`` environment variable.
    """

    def __init__(self):
        """Create a Dropbox client and verify the access token.

        Raises:
            ValueError: If ``DROPBOX_ACCESS_TOKEN`` is unset or empty, or if Dropbox
                rejects the token.
            ImportError: If the optional ``dropbox`` package is not installed.
        """
        access_token = os.environ.get("DROPBOX_ACCESS_TOKEN")
        if not access_token:
            raise ValueError("Please set the `DROPBOX_ACCESS_TOKEN` environment variable.")
        try:
            from dropbox import Dropbox, exceptions
        except ImportError:
            raise ImportError(
                'Dropbox requires extra dependencies. Install with `pip install --upgrade "embedchain[dropbox]"`'
            ) from None

        try:
            dbx = Dropbox(access_token)
            # Make one authenticated call up front so a bad token fails here
            # instead of on the first `load_data` call.
            dbx.users_get_current_account()
            self.dbx = dbx
        except exceptions.AuthError as ex:
            raise ValueError("Invalid Dropbox access token. Please verify your token and try again.") from ex

    def _download_folder(self, path: str, local_root: str) -> List[FileMetadata]:
        """Download a folder from Dropbox and save it preserving the directory structure.

        Args:
            path: Dropbox path of the folder to download ("" for the account root).
            local_root: Existing local directory that receives the folder's contents.

        Returns:
            The direct entries of ``path`` as returned by ``files_list_folder``.
        """
        # NOTE(review): only the first page of results is read; `has_more` is not
        # consulted, so very large folders may be truncated - confirm against the
        # Dropbox `files_list_folder_continue` documentation.
        entries = self.dbx.files_list_folder(path).entries
        for entry in entries:
            local_path = os.path.join(local_root, entry.name)
            remote_path = f"{path}/{entry.name}"
            if isinstance(entry, FileMetadata):
                self.dbx.files_download_to_file(local_path, remote_path)
            else:
                # Anything that is not a file is treated as a sub-folder and recursed into.
                os.makedirs(local_path, exist_ok=True)
                self._download_folder(remote_path, local_path)
        return entries

    def _generate_dir_id_from_all_paths(self, path: str) -> str:
        """Generate a unique ID for a directory based on all of its paths.

        Only the direct children of ``path`` contribute to the ID.

        Args:
            path: Dropbox path of the folder.

        Returns:
            Hex-encoded SHA-256 digest of the concatenated child paths.
        """
        entries = self.dbx.files_list_folder(path).entries
        paths = [f"{path}/{entry.name}" for entry in entries]
        return hashlib.sha256("".join(paths).encode()).hexdigest()

    def load_data(self, path: str):
        """Load data from a Dropbox path, preserving the folder structure.

        The folder is mirrored into a scratch directory named ``dropbox_<id>`` (relative
        to the current working directory), parsed with ``DirectoryLoader`` and removed
        again, whether or not loading succeeds.

        Args:
            path: Dropbox path of the folder to load ("" for the account root).

        Returns:
            A dict with ``doc_id`` (SHA-256 hex digest of ``path``) and ``data`` (the
            ``"data"`` value produced by ``DirectoryLoader.load_data``).
        """
        root_dir = f"dropbox_{self._generate_dir_id_from_all_paths(path)}"
        os.makedirs(root_dir, exist_ok=True)

        try:
            self._download_folder(path, root_dir)
            dir_loader = DirectoryLoader()
            data = dir_loader.load_data(root_dir)["data"]
        finally:
            # Always remove the scratch copy, even if the download or parsing failed,
            # so partially downloaded files are not left behind.
            self._clean_directory(root_dir)

        return {
            "doc_id": hashlib.sha256(path.encode()).hexdigest(),
            "data": data,
        }

    def _clean_directory(self, dir_path):
        """Recursively delete a directory and its contents.

        Args:
            dir_path: Local directory to remove.
        """
        for item in os.listdir(dir_path):
            item_path = os.path.join(dir_path, item)
            if os.path.isdir(item_path):
                self._clean_directory(item_path)
            else:
                os.remove(item_path)
        os.rmdir(dir_path)
|
||||
@@ -37,6 +37,7 @@ class IndirectDataType(Enum):
|
||||
BEEHIIV = "beehiiv"
|
||||
DIRECTORY = "directory"
|
||||
SLACK = "slack"
|
||||
DROPBOX = "dropbox"
|
||||
TEXT_FILE = "text_file"
|
||||
|
||||
|
||||
@@ -74,4 +75,5 @@ class DataType(Enum):
|
||||
BEEHIIV = IndirectDataType.BEEHIIV.value
|
||||
DIRECTORY = IndirectDataType.DIRECTORY.value
|
||||
SLACK = IndirectDataType.SLACK.value
|
||||
DROPBOX = IndirectDataType.DROPBOX.value
|
||||
TEXT_FILE = IndirectDataType.TEXT_FILE.value
|
||||
|
||||
51
poetry.lock
generated
51
poetry.lock
generated
@@ -1,4 +1,4 @@
|
||||
# This file is automatically @generated by Poetry 1.6.1 and should not be changed by hand.
|
||||
# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand.
|
||||
|
||||
[[package]]
|
||||
name = "aiofiles"
|
||||
@@ -1269,6 +1269,23 @@ files = [
|
||||
{file = "docx2txt-0.8.tar.gz", hash = "sha256:2c06d98d7cfe2d3947e5760a57d924e3ff07745b379c8737723922e7009236e5"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "dropbox"
|
||||
version = "11.36.2"
|
||||
description = "Official Dropbox API Client"
|
||||
optional = true
|
||||
python-versions = "*"
|
||||
files = [
|
||||
{file = "dropbox-11.36.2-py2-none-any.whl", hash = "sha256:afbfce2589b777ade1deaa2c186f3650c41e41cea0f1fac497a75112a171f8e2"},
|
||||
{file = "dropbox-11.36.2-py3-none-any.whl", hash = "sha256:a21e4d2bcbeb1d8067ff87969aea48792c9a8266182491153feff2be9c1b9c8f"},
|
||||
{file = "dropbox-11.36.2.tar.gz", hash = "sha256:d48d3d16d486c78b11c14a1c4a28a2611fbf5a0d0a358b861bfd9482e603c500"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
requests = ">=2.16.2"
|
||||
six = ">=1.12.0"
|
||||
stone = ">=2"
|
||||
|
||||
[[package]]
|
||||
name = "duckduckgo-search"
|
||||
version = "3.9.3"
|
||||
@@ -4165,10 +4182,10 @@ files = [
|
||||
|
||||
[package.dependencies]
|
||||
numpy = [
|
||||
{version = ">=1.21.0", markers = "python_version == \"3.9\" and platform_system == \"Darwin\" and platform_machine == \"arm64\""},
|
||||
{version = ">=1.21.4", markers = "python_version >= \"3.10\" and platform_system == \"Darwin\" and python_version < \"3.11\""},
|
||||
{version = ">=1.21.2", markers = "platform_system != \"Darwin\" and python_version >= \"3.10\" and python_version < \"3.11\""},
|
||||
{version = ">=1.19.3", markers = "platform_system == \"Linux\" and platform_machine == \"aarch64\" and python_version >= \"3.8\" and python_version < \"3.10\" or python_version > \"3.9\" and python_version < \"3.10\" or python_version >= \"3.9\" and platform_system != \"Darwin\" and python_version < \"3.10\" or python_version >= \"3.9\" and platform_machine != \"arm64\" and python_version < \"3.10\""},
|
||||
{version = ">=1.21.0", markers = "python_version == \"3.9\" and platform_system == \"Darwin\" and platform_machine == \"arm64\""},
|
||||
{version = ">=1.23.5", markers = "python_version >= \"3.11\""},
|
||||
]
|
||||
|
||||
@@ -4577,6 +4594,17 @@ files = [
|
||||
dev = ["pre-commit", "tox"]
|
||||
testing = ["pytest", "pytest-benchmark"]
|
||||
|
||||
[[package]]
|
||||
name = "ply"
|
||||
version = "3.11"
|
||||
description = "Python Lex & Yacc"
|
||||
optional = true
|
||||
python-versions = "*"
|
||||
files = [
|
||||
{file = "ply-3.11-py2.py3-none-any.whl", hash = "sha256:096f9b8350b65ebd2fd1346b12452efe5b9607f7482813ffca50c22722a807ce"},
|
||||
{file = "ply-3.11.tar.gz", hash = "sha256:00c7c1aaa88358b9c765b6d3000c6eec0ba42abca5351b095321aef446081da3"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "portalocker"
|
||||
version = "2.8.2"
|
||||
@@ -6625,6 +6653,22 @@ typing-extensions = {version = ">=3.10.0", markers = "python_version < \"3.10\""
|
||||
[package.extras]
|
||||
full = ["httpx (>=0.22.0)", "itsdangerous", "jinja2", "python-multipart", "pyyaml"]
|
||||
|
||||
[[package]]
|
||||
name = "stone"
|
||||
version = "3.3.1"
|
||||
description = "Stone is an interface description language (IDL) for APIs."
|
||||
optional = true
|
||||
python-versions = "*"
|
||||
files = [
|
||||
{file = "stone-3.3.1-py2-none-any.whl", hash = "sha256:cd2f7f9056fc39b16c8fd46a26971dc5ccd30b5c2c246566cd2c0dd27ff96609"},
|
||||
{file = "stone-3.3.1-py3-none-any.whl", hash = "sha256:e15866fad249c11a963cce3bdbed37758f2e88c8ff4898616bc0caeb1e216047"},
|
||||
{file = "stone-3.3.1.tar.gz", hash = "sha256:4ef0397512f609757975f7ec09b35639d72ba7e3e17ce4ddf399578346b4cb50"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
ply = ">=3.4"
|
||||
six = ">=1.12.0"
|
||||
|
||||
[[package]]
|
||||
name = "sympy"
|
||||
version = "1.12"
|
||||
@@ -8115,6 +8159,7 @@ cohere = ["cohere"]
|
||||
community = ["llama-hub"]
|
||||
dataloaders = ["docx2txt", "duckduckgo-search", "pytube", "sentence-transformers", "unstructured", "youtube-transcript-api"]
|
||||
discord = ["discord"]
|
||||
dropbox = ["dropbox"]
|
||||
elasticsearch = ["elasticsearch"]
|
||||
github = ["PyGithub", "gitpython"]
|
||||
gmail = ["llama-hub", "requests"]
|
||||
@@ -8144,4 +8189,4 @@ youtube = ["youtube-transcript-api", "yt_dlp"]
|
||||
[metadata]
|
||||
lock-version = "2.0"
|
||||
python-versions = ">=3.9,<3.12"
|
||||
content-hash = "bc763595ae5e903a5a819a2d3f31f045fac52555f72e44ead9df0e5e191955aa"
|
||||
content-hash = "335c42c91a2b5e4a1c3d8a7c39dee8665fd1eee0410e1bc6cb6cb1d6f6722445"
|
||||
|
||||
@@ -147,6 +147,7 @@ newspaper3k = { version = "^0.2.8", optional = true }
|
||||
listparser = { version = "^0.19", optional = true }
|
||||
google-generativeai = { version = "^0.3.0", optional = true }
|
||||
modal = { version = "^0.56.4329", optional = true }
|
||||
dropbox = { version = "^11.36.2", optional = true }
|
||||
|
||||
[tool.poetry.group.dev.dependencies]
|
||||
black = "^23.3.0"
|
||||
@@ -214,6 +215,7 @@ rss_feed = [
|
||||
]
|
||||
google = ["google-generativeai"]
|
||||
modal = ["modal"]
|
||||
dropbox = ["dropbox"]
|
||||
|
||||
[tool.poetry.group.docs.dependencies]
|
||||
|
||||
|
||||
85
tests/loaders/test_dropbox.py
Normal file
85
tests/loaders/test_dropbox.py
Normal file
@@ -0,0 +1,85 @@
|
||||
import os
|
||||
from unittest.mock import MagicMock
|
||||
|
||||
import pytest
|
||||
from dropbox.files import FileMetadata
|
||||
|
||||
from embedchain.loaders.dropbox import DropboxLoader
|
||||
|
||||
|
||||
@pytest.fixture
def setup_dropbox_loader(mocker):
    """Provide a ``DropboxLoader`` backed by a mocked Dropbox client.

    Returns:
        A ``(loader, mock_dbx)`` tuple, where ``mock_dbx`` is the mock the loader
        stores as its ``dbx`` client.
    """
    mock_dropbox = mocker.patch("dropbox.Dropbox")
    mock_dbx = mocker.MagicMock()
    mock_dropbox.return_value = mock_dbx

    # patch.dict restores the previous environment at teardown, so a token that was
    # already set is not deleted and nothing leaks if DropboxLoader() raises.
    mocker.patch.dict(os.environ, {"DROPBOX_ACCESS_TOKEN": "test_token"})

    return DropboxLoader(), mock_dbx
|
||||
|
||||
|
||||
def test_initialization(setup_dropbox_loader):
    """Test initialization of DropboxLoader."""
    loader, mock_dbx = setup_dropbox_loader

    # The loader must keep the client it built and must have validated the token.
    assert loader.dbx is mock_dbx
    mock_dbx.users_get_current_account.assert_called_once_with()
|
||||
|
||||
|
||||
def test_download_folder(setup_dropbox_loader, mocker):
    """Test downloading a folder."""
    loader, mock_dbx = setup_dropbox_loader
    mocker.patch("os.makedirs")
    mocker.patch("os.path.join", return_value="mock/path")

    # `name` must be assigned after construction: as a constructor argument it
    # would label the mock instead of setting the attribute.
    mock_file_metadata = mocker.MagicMock(spec=FileMetadata)
    mock_file_metadata.name = "file.txt"
    mock_dbx.files_list_folder.return_value.entries = [mock_file_metadata]

    entries = loader._download_folder("path/to/folder", "local_root")

    assert entries == [mock_file_metadata]
    mock_dbx.files_list_folder.assert_called_once_with("path/to/folder")
    mock_dbx.files_download_to_file.assert_called_once_with("mock/path", "path/to/folder/file.txt")
|
||||
|
||||
|
||||
def test_generate_dir_id_from_all_paths(setup_dropbox_loader, mocker):
    """Test directory ID generation."""
    import hashlib

    loader, mock_dbx = setup_dropbox_loader
    # MagicMock(name=...) only labels the mock; set `.name` explicitly so the
    # hashed path is deterministic.
    mock_file_metadata = mocker.MagicMock(spec=FileMetadata)
    mock_file_metadata.name = "file.txt"
    mock_dbx.files_list_folder.return_value.entries = [mock_file_metadata]

    dir_id = loader._generate_dir_id_from_all_paths("path/to/folder")

    assert dir_id == hashlib.sha256(b"path/to/folder/file.txt").hexdigest()
    mock_dbx.files_list_folder.assert_called_once_with("path/to/folder")
|
||||
|
||||
|
||||
def test_clean_directory(setup_dropbox_loader, mocker):
    """Test cleaning up a directory."""
    loader, _ = setup_dropbox_loader
    mocker.patch("os.listdir", return_value=["file1", "file2"])
    mock_remove = mocker.patch("os.remove")
    mock_rmdir = mocker.patch("os.rmdir")

    loader._clean_directory("path/to/folder")

    # Every listed file is removed, then the (now empty) directory itself.
    assert mock_remove.call_args_list == [
        mocker.call(os.path.join("path/to/folder", "file1")),
        mocker.call(os.path.join("path/to/folder", "file2")),
    ]
    mock_rmdir.assert_called_once_with("path/to/folder")
|
||||
|
||||
|
||||
def test_load_data(mocker, setup_dropbox_loader, tmp_path):
    """Test loading a folder end to end with the Dropbox client and DirectoryLoader mocked."""
    import hashlib

    loader = setup_dropbox_loader[0]

    # MagicMock(name=...) only labels the mock; set `.name` explicitly.
    mock_file_metadata = MagicMock(spec=FileMetadata)
    mock_file_metadata.name = "file.txt"
    mocker.patch.object(loader.dbx, "files_list_folder", return_value=MagicMock(entries=[mock_file_metadata]))
    mocker.patch.object(loader.dbx, "files_download_to_file")

    # Mock DirectoryLoader
    mock_data = {"data": "test_data"}
    mocker.patch("embedchain.loaders.directory_loader.DirectoryLoader.load_data", return_value=mock_data)
    mocker.patch.object(loader, "_generate_dir_id_from_all_paths", return_value="test_id")

    # load_data creates its scratch directory relative to the CWD, so run it
    # inside tmp_path to keep the working tree clean.
    previous_cwd = os.getcwd()
    os.chdir(tmp_path)
    try:
        result = loader.load_data("path/to/folder")
    finally:
        os.chdir(previous_cwd)

    assert result == {"doc_id": hashlib.sha256(b"path/to/folder").hexdigest(), "data": "test_data"}
    loader.dbx.files_list_folder.assert_called_once_with("path/to/folder")
    loader.dbx.files_download_to_file.assert_called_once_with(
        os.path.join("dropbox_test_id", "file.txt"), "path/to/folder/file.txt"
    )
    # The scratch directory must be removed once loading is done.
    assert not (tmp_path / "dropbox_test_id").exists()
|
||||
Reference in New Issue
Block a user