From 404e73af7729c9e381f02a28e87e38307287b53a Mon Sep 17 00:00:00 2001 From: Sidharth Mohanty Date: Fri, 29 Dec 2023 23:22:25 +0530 Subject: [PATCH] [Feature] Add Dropbox loader (#1073) Co-authored-by: Deshraj Yadav --- docs/components/data-sources/dropbox.mdx | 27 +++++++ docs/components/data-sources/overview.mdx | 1 + docs/mint.json | 3 +- embedchain/data_formatter/data_formatter.py | 2 + embedchain/loaders/dropbox.py | 82 ++++++++++++++++++++ embedchain/models/data_type.py | 2 + poetry.lock | 51 ++++++++++++- pyproject.toml | 2 + tests/loaders/test_dropbox.py | 85 +++++++++++++++++++++ 9 files changed, 251 insertions(+), 4 deletions(-) create mode 100644 docs/components/data-sources/dropbox.mdx create mode 100644 embedchain/loaders/dropbox.py create mode 100644 tests/loaders/test_dropbox.py diff --git a/docs/components/data-sources/dropbox.mdx b/docs/components/data-sources/dropbox.mdx new file mode 100644 index 00000000..dcfb0ac6 --- /dev/null +++ b/docs/components/data-sources/dropbox.mdx @@ -0,0 +1,27 @@ +--- +title: '💾 Dropbox' +--- + +To load folders or files from your Dropbox account, configure the `data_type` parameter as `dropbox` and specify the path to the desired file or folder, starting from the root directory of your Dropbox account. + +For Dropbox access, an **access token** is required. Obtain this token by visiting [Dropbox Developer Apps](https://www.dropbox.com/developers/apps). There, create a new app and generate an access token for it. + +Ensure your app has the following settings activated: + +- In the Permissions section, enable `files.content.read` and `files.metadata.read`. 
+ +```python +import os +from embedchain import Pipeline as App + +os.environ["DROPBOX_ACCESS_TOKEN"] = "sl.xxx" +os.environ["OPENAI_API_KEY"] = "sk-xxx" + +app = App() + +# any path from the root of your dropbox account, you can leave it "" for the root folder +app.add("/test", data_type="dropbox") + +print(app.query("Which two celebrities are mentioned here?")) +# The two celebrities mentioned in the given context are Elon Musk and Jeff Bezos. +``` diff --git a/docs/components/data-sources/overview.mdx b/docs/components/data-sources/overview.mdx index 385ce7e9..3c0072ee 100644 --- a/docs/components/data-sources/overview.mdx +++ b/docs/components/data-sources/overview.mdx @@ -31,6 +31,7 @@ Embedchain comes with built-in support for various data sources. We handle the c +
diff --git a/docs/mint.json b/docs/mint.json
index b34b3214..ca9ce9ad 100644
--- a/docs/mint.json
+++ b/docs/mint.json
@@ -131,7 +131,8 @@
                 "components/data-sources/substack",
                 "components/data-sources/discord",
                 "components/data-sources/beehiiv",
-                "components/data-sources/directory"
+                "components/data-sources/directory",
+                "components/data-sources/dropbox"
               ]
             },
             "components/data-sources/data-type-handling"
diff --git a/embedchain/data_formatter/data_formatter.py b/embedchain/data_formatter/data_formatter.py
index 3f9a2edd..36f5c719 100644
--- a/embedchain/data_formatter/data_formatter.py
+++ b/embedchain/data_formatter/data_formatter.py
@@ -76,6 +76,7 @@ class DataFormatter(JSONSerializable):
             DataType.BEEHIIV: "embedchain.loaders.beehiiv.BeehiivLoader",
             DataType.DIRECTORY: "embedchain.loaders.directory_loader.DirectoryLoader",
             DataType.SLACK: "embedchain.loaders.slack.SlackLoader",
+            DataType.DROPBOX: "embedchain.loaders.dropbox.DropboxLoader",
             DataType.TEXT_FILE: "embedchain.loaders.text_file.TextFileLoader",
         }
 
@@ -121,6 +122,7 @@ class DataFormatter(JSONSerializable):
             DataType.BEEHIIV: "embedchain.chunkers.beehiiv.BeehiivChunker",
             DataType.DIRECTORY: "embedchain.chunkers.common_chunker.CommonChunker",
             DataType.SLACK: "embedchain.chunkers.common_chunker.CommonChunker",
+            DataType.DROPBOX: "embedchain.chunkers.common_chunker.CommonChunker",
             DataType.TEXT_FILE: "embedchain.chunkers.common_chunker.CommonChunker",
         }
 
diff --git a/embedchain/loaders/dropbox.py b/embedchain/loaders/dropbox.py
new file mode 100644
index 00000000..c4b01f14
--- /dev/null
+++ b/embedchain/loaders/dropbox.py
@@ -0,0 +1,122 @@
+import hashlib
+import os
+from typing import List
+
+from embedchain.helpers.json_serializable import register_deserializable
+from embedchain.loaders.base_loader import BaseLoader
+from embedchain.loaders.directory_loader import DirectoryLoader
+
+try:
+    from dropbox.files import FileMetadata, Metadata
+except ImportError:
+    raise ImportError(
+        'Dropbox requires extra dependencies. Install with `pip install --upgrade "embedchain[dropbox]"`'
+    ) from None
+
+
+@register_deserializable
+class DropboxLoader(BaseLoader):
+    """Load a Dropbox folder by mirroring it locally and handing it to `DirectoryLoader`."""
+
+    def __init__(self):
+        """Create an authenticated Dropbox client from `DROPBOX_ACCESS_TOKEN`.
+
+        Raises:
+            ValueError: If the token is unset, empty, or rejected by Dropbox.
+            ImportError: If the optional `dropbox` package is not installed.
+        """
+        access_token = os.environ.get("DROPBOX_ACCESS_TOKEN")
+        if not access_token:
+            raise ValueError("Please set the `DROPBOX_ACCESS_TOKEN` environment variable.")
+        try:
+            from dropbox import Dropbox, exceptions
+        except ImportError:
+            raise ImportError(
+                'Dropbox requires extra dependencies. Install with `pip install --upgrade "embedchain[dropbox]"`'
+            ) from None
+
+        try:
+            dbx = Dropbox(access_token)
+            # Cheap authenticated request, issued only to validate the token up front.
+            dbx.users_get_current_account()
+        except exceptions.AuthError as ex:
+            raise ValueError("Invalid Dropbox access token. Please verify your token and try again.") from ex
+        self.dbx = dbx
+
+    def _list_folder(self, path: str) -> List[Metadata]:
+        """Return every entry directly inside `path`, following pagination cursors.
+
+        `files_list_folder` only returns the first page of a large folder; the
+        remaining pages have to be fetched with `files_list_folder_continue`.
+        """
+        page = self.dbx.files_list_folder(path)
+        entries = list(page.entries)
+        # `has_more` is a plain bool in the SDK; the identity check keeps a loosely
+        # configured stub (whose attributes are all truthy) from paging forever.
+        while page.has_more is True:
+            page = self.dbx.files_list_folder_continue(page.cursor)
+            entries.extend(page.entries)
+        return entries
+
+    def _download_folder(self, path: str, local_root: str) -> List[Metadata]:
+        """Recursively download the Dropbox folder `path` into the existing directory `local_root`.
+
+        Args:
+            path: Dropbox path of the folder ("" for the account root).
+            local_root: Local directory that receives the files; it must already exist.
+
+        Returns:
+            The entries found directly inside `path` (files and sub-folders).
+        """
+        entries = self._list_folder(path)
+        for entry in entries:
+            remote_path = f"{path}/{entry.name}"
+            local_path = os.path.join(local_root, entry.name)
+            if isinstance(entry, FileMetadata):
+                self.dbx.files_download_to_file(local_path, remote_path)
+            else:
+                os.makedirs(local_path, exist_ok=True)
+                self._download_folder(remote_path, local_path)
+        return entries
+
+    def _generate_dir_id_from_all_paths(self, path: str) -> str:
+        """Return a SHA-256 hex digest derived from the paths of the entries directly inside `path`."""
+        paths = [f"{path}/{entry.name}" for entry in self._list_folder(path)]
+        return hashlib.sha256("".join(paths).encode()).hexdigest()
+
+    def load_data(self, path: str):
+        """Download the Dropbox folder at `path` and load its files through `DirectoryLoader`.
+
+        Args:
+            path: Dropbox folder path, starting from the account root ("" or "/" for the root itself).
+
+        Returns:
+            A dict with `doc_id` (SHA-256 hex digest of the normalised path) and `data`
+            (the `data` value returned by `DirectoryLoader` for the downloaded copy).
+        """
+        # Dropbox addresses the root folder as "" rather than "/"; dropping trailing
+        # slashes also avoids "//" in the child paths built while downloading.
+        path = path.rstrip("/")
+        root_dir = f"dropbox_{self._generate_dir_id_from_all_paths(path)}"
+        os.makedirs(root_dir, exist_ok=True)
+        try:
+            self._download_folder(path, root_dir)
+            data = DirectoryLoader().load_data(root_dir)["data"]
+        finally:
+            # Always remove the local mirror, even when a download or the loader fails.
+            self._clean_directory(root_dir)
+
+        return {
+            "doc_id": hashlib.sha256(path.encode()).hexdigest(),
+            "data": data,
+        }
+
+    def _clean_directory(self, dir_path):
+        """Recursively delete a directory and its contents."""
+        for item in os.listdir(dir_path):
+            item_path = os.path.join(dir_path, item)
+            if os.path.isdir(item_path):
+                self._clean_directory(item_path)
+            else:
+                os.remove(item_path)
+        os.rmdir(dir_path)
diff --git a/embedchain/models/data_type.py b/embedchain/models/data_type.py
index 3ee68c84..fb9da2cd 100644
--- a/embedchain/models/data_type.py
+++ b/embedchain/models/data_type.py
@@ -37,6 +37,7 @@ class IndirectDataType(Enum):
     BEEHIIV = "beehiiv"
     DIRECTORY = "directory"
     SLACK = "slack"
+    DROPBOX = "dropbox"
     TEXT_FILE = "text_file"
 
 
@@ -74,4 +75,5 @@ class DataType(Enum):
     BEEHIIV = IndirectDataType.BEEHIIV.value
     DIRECTORY = IndirectDataType.DIRECTORY.value
     SLACK = IndirectDataType.SLACK.value
+    DROPBOX = IndirectDataType.DROPBOX.value
     TEXT_FILE = IndirectDataType.TEXT_FILE.value
diff --git a/poetry.lock b/poetry.lock
index 89197d7c..5202837f 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 1.6.1 and should not be changed by hand.
+# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand.
[[package]] name = "aiofiles" @@ -1269,6 +1269,23 @@ files = [ {file = "docx2txt-0.8.tar.gz", hash = "sha256:2c06d98d7cfe2d3947e5760a57d924e3ff07745b379c8737723922e7009236e5"}, ] +[[package]] +name = "dropbox" +version = "11.36.2" +description = "Official Dropbox API Client" +optional = true +python-versions = "*" +files = [ + {file = "dropbox-11.36.2-py2-none-any.whl", hash = "sha256:afbfce2589b777ade1deaa2c186f3650c41e41cea0f1fac497a75112a171f8e2"}, + {file = "dropbox-11.36.2-py3-none-any.whl", hash = "sha256:a21e4d2bcbeb1d8067ff87969aea48792c9a8266182491153feff2be9c1b9c8f"}, + {file = "dropbox-11.36.2.tar.gz", hash = "sha256:d48d3d16d486c78b11c14a1c4a28a2611fbf5a0d0a358b861bfd9482e603c500"}, +] + +[package.dependencies] +requests = ">=2.16.2" +six = ">=1.12.0" +stone = ">=2" + [[package]] name = "duckduckgo-search" version = "3.9.3" @@ -4165,10 +4182,10 @@ files = [ [package.dependencies] numpy = [ + {version = ">=1.21.0", markers = "python_version == \"3.9\" and platform_system == \"Darwin\" and platform_machine == \"arm64\""}, {version = ">=1.21.4", markers = "python_version >= \"3.10\" and platform_system == \"Darwin\" and python_version < \"3.11\""}, {version = ">=1.21.2", markers = "platform_system != \"Darwin\" and python_version >= \"3.10\" and python_version < \"3.11\""}, {version = ">=1.19.3", markers = "platform_system == \"Linux\" and platform_machine == \"aarch64\" and python_version >= \"3.8\" and python_version < \"3.10\" or python_version > \"3.9\" and python_version < \"3.10\" or python_version >= \"3.9\" and platform_system != \"Darwin\" and python_version < \"3.10\" or python_version >= \"3.9\" and platform_machine != \"arm64\" and python_version < \"3.10\""}, - {version = ">=1.21.0", markers = "python_version == \"3.9\" and platform_system == \"Darwin\" and platform_machine == \"arm64\""}, {version = ">=1.23.5", markers = "python_version >= \"3.11\""}, ] @@ -4577,6 +4594,17 @@ files = [ dev = ["pre-commit", "tox"] testing = ["pytest", 
"pytest-benchmark"] +[[package]] +name = "ply" +version = "3.11" +description = "Python Lex & Yacc" +optional = true +python-versions = "*" +files = [ + {file = "ply-3.11-py2.py3-none-any.whl", hash = "sha256:096f9b8350b65ebd2fd1346b12452efe5b9607f7482813ffca50c22722a807ce"}, + {file = "ply-3.11.tar.gz", hash = "sha256:00c7c1aaa88358b9c765b6d3000c6eec0ba42abca5351b095321aef446081da3"}, +] + [[package]] name = "portalocker" version = "2.8.2" @@ -6625,6 +6653,22 @@ typing-extensions = {version = ">=3.10.0", markers = "python_version < \"3.10\"" [package.extras] full = ["httpx (>=0.22.0)", "itsdangerous", "jinja2", "python-multipart", "pyyaml"] +[[package]] +name = "stone" +version = "3.3.1" +description = "Stone is an interface description language (IDL) for APIs." +optional = true +python-versions = "*" +files = [ + {file = "stone-3.3.1-py2-none-any.whl", hash = "sha256:cd2f7f9056fc39b16c8fd46a26971dc5ccd30b5c2c246566cd2c0dd27ff96609"}, + {file = "stone-3.3.1-py3-none-any.whl", hash = "sha256:e15866fad249c11a963cce3bdbed37758f2e88c8ff4898616bc0caeb1e216047"}, + {file = "stone-3.3.1.tar.gz", hash = "sha256:4ef0397512f609757975f7ec09b35639d72ba7e3e17ce4ddf399578346b4cb50"}, +] + +[package.dependencies] +ply = ">=3.4" +six = ">=1.12.0" + [[package]] name = "sympy" version = "1.12" @@ -8115,6 +8159,7 @@ cohere = ["cohere"] community = ["llama-hub"] dataloaders = ["docx2txt", "duckduckgo-search", "pytube", "sentence-transformers", "unstructured", "youtube-transcript-api"] discord = ["discord"] +dropbox = ["dropbox"] elasticsearch = ["elasticsearch"] github = ["PyGithub", "gitpython"] gmail = ["llama-hub", "requests"] @@ -8144,4 +8189,4 @@ youtube = ["youtube-transcript-api", "yt_dlp"] [metadata] lock-version = "2.0" python-versions = ">=3.9,<3.12" -content-hash = "bc763595ae5e903a5a819a2d3f31f045fac52555f72e44ead9df0e5e191955aa" +content-hash = "335c42c91a2b5e4a1c3d8a7c39dee8665fd1eee0410e1bc6cb6cb1d6f6722445" diff --git a/pyproject.toml b/pyproject.toml index 
1f58426e..0d75e21a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -147,6 +147,7 @@ newspaper3k = { version = "^0.2.8", optional = true }
 listparser = { version = "^0.19", optional = true }
 google-generativeai = { version = "^0.3.0", optional = true }
 modal = { version = "^0.56.4329", optional = true }
+dropbox = { version = "^11.36.2", optional = true }
 
 [tool.poetry.group.dev.dependencies]
 black = "^23.3.0"
@@ -214,6 +215,7 @@ rss_feed = [
 ]
 google = ["google-generativeai"]
 modal = ["modal"]
+dropbox = ["dropbox"]
 
 [tool.poetry.group.docs.dependencies]
 
diff --git a/tests/loaders/test_dropbox.py b/tests/loaders/test_dropbox.py
new file mode 100644
index 00000000..c7e81673
--- /dev/null
+++ b/tests/loaders/test_dropbox.py
@@ -0,0 +1,104 @@
+import hashlib
+import os
+from unittest.mock import MagicMock
+
+import pytest
+from dropbox.files import FileMetadata
+
+from embedchain.loaders.dropbox import DropboxLoader
+
+
+def _file_entry(name):
+    """Build a stand-in for a Dropbox file entry whose `.name` attribute is `name`."""
+    # `MagicMock(name=...)` only labels the mock itself, so the attribute is assigned explicitly.
+    entry = MagicMock(spec=FileMetadata)
+    entry.name = name
+    return entry
+
+
+def _listing(*entries):
+    """Build a single-page `files_list_folder` result containing `entries`."""
+    return MagicMock(entries=list(entries), has_more=False)
+
+
+@pytest.fixture
+def setup_dropbox_loader(mocker, monkeypatch):
+    """Provide a `DropboxLoader` wired to a mocked Dropbox client, plus that client."""
+    mock_dbx = mocker.MagicMock()
+    mocker.patch("dropbox.Dropbox", return_value=mock_dbx)
+    # monkeypatch restores any pre-existing token once the test finishes.
+    monkeypatch.setenv("DROPBOX_ACCESS_TOKEN", "test_token")
+
+    return DropboxLoader(), mock_dbx
+
+
+def test_initialization(setup_dropbox_loader):
+    """The loader keeps the client and validates the token once."""
+    loader, mock_dbx = setup_dropbox_loader
+    assert loader.dbx is mock_dbx
+    mock_dbx.users_get_current_account.assert_called_once_with()
+
+
+def test_initialization_requires_token(monkeypatch):
+    """A missing access token is reported as a ValueError."""
+    monkeypatch.delenv("DROPBOX_ACCESS_TOKEN", raising=False)
+    with pytest.raises(ValueError):
+        DropboxLoader()
+
+
+def test_download_folder(setup_dropbox_loader):
+    """Files are downloaded to the matching local path."""
+    loader, mock_dbx = setup_dropbox_loader
+    entry = _file_entry("file.txt")
+    mock_dbx.files_list_folder.return_value = _listing(entry)
+
+    entries = loader._download_folder("path/to/folder", "local_root")
+
+    assert list(entries) == [entry]
+    mock_dbx.files_download_to_file.assert_called_once_with(
+        os.path.join("local_root", "file.txt"), "path/to/folder/file.txt"
+    )
+
+
+def test_generate_dir_id_from_all_paths(setup_dropbox_loader):
+    """The directory ID is the SHA-256 digest of the listed entry paths."""
+    loader, mock_dbx = setup_dropbox_loader
+    mock_dbx.files_list_folder.return_value = _listing(_file_entry("file.txt"))
+
+    dir_id = loader._generate_dir_id_from_all_paths("path/to/folder")
+
+    assert dir_id == hashlib.sha256(b"path/to/folder/file.txt").hexdigest()
+
+
+def test_clean_directory(setup_dropbox_loader, tmp_path):
+    """A nested directory tree is removed completely."""
+    loader, _ = setup_dropbox_loader
+    root = tmp_path / "mirror"
+    (root / "nested").mkdir(parents=True)
+    (root / "nested" / "file.txt").write_text("dummy content")
+    (root / "top.txt").write_text("dummy content")
+
+    loader._clean_directory(str(root))
+
+    assert not root.exists()
+
+
+def test_load_data(mocker, monkeypatch, setup_dropbox_loader, tmp_path):
+    """Data comes from DirectoryLoader and the temporary mirror is removed afterwards."""
+    loader, mock_dbx = setup_dropbox_loader
+    mock_dbx.files_list_folder.return_value = _listing(_file_entry("file.txt"))
+    mocker.patch(
+        "embedchain.loaders.directory_loader.DirectoryLoader.load_data", return_value={"data": "test_data"}
+    )
+    mocker.patch.object(loader, "_generate_dir_id_from_all_paths", return_value="test")
+    # The mirror directory is created relative to the CWD, so keep it inside tmp_path.
+    monkeypatch.chdir(tmp_path)
+
+    result = loader.load_data("path/to/folder")
+
+    assert result == {"doc_id": hashlib.sha256(b"path/to/folder").hexdigest(), "data": "test_data"}
+    mock_dbx.files_list_folder.assert_called_once_with("path/to/folder")
+    mock_dbx.files_download_to_file.assert_called_once_with(
+        os.path.join("dropbox_test", "file.txt"), "path/to/folder/file.txt"
+    )
+    assert not (tmp_path / "dropbox_test").exists()