Files
t6_mem0/embedchain/loaders/google_drive.py
2024-01-05 14:01:56 +05:30

56 lines
1.9 KiB
Python

import hashlib
import re
try:
from googleapiclient.errors import HttpError
except ImportError:
raise ImportError(
"Google Drive requires extra dependencies. Install with `pip install embedchain[googledrive]`"
) from None
from langchain.document_loaders import GoogleDriveLoader as Loader
from langchain.document_loaders import UnstructuredFileIOLoader
from embedchain.helpers.json_serializable import register_deserializable
from embedchain.loaders.base_loader import BaseLoader
@register_deserializable
class GoogleDriveLoader(BaseLoader):
    """Loader that reads the documents of a Google Drive folder."""

    @staticmethod
    def _get_drive_id_from_url(url: str) -> str:
        """Extract the folder ID from a Google Drive folder URL.

        Accepted forms:
            https://drive.google.com/drive/folders/<id>
            https://drive.google.com/drive/u/<n>/folders/<id>
        each optionally followed by a query string (e.g. ``?usp=sharing``).

        Args:
            url: The Google Drive folder URL.

        Returns:
            The folder ID captured from the URL.

        Raises:
            ValueError: If ``url`` is not a Google Drive folder URL.
        """
        # The account selector ("u/<n>/") and the query string are optional so
        # that plain folder links and share links are accepted as well.
        regex = (
            r"^https:\/\/drive\.google\.com\/drive\/(?:u\/\d+\/)?folders\/"
            r"([a-zA-Z0-9_-]+)(?:\?\S*)?$"
        )
        match = re.match(regex, url)
        if match:
            # Use the captured group rather than splitting on "/", so that a
            # query string or trailing newline never leaks into the ID.
            return match.group(1)
        raise ValueError(
            f"The url provided {url} does not match a google drive folder url. Example drive url: "
            f"https://drive.google.com/drive/u/0/folders/xxxx"
        )

    def load_data(self, url: str) -> dict:
        """Load data from a Google drive folder.

        The folder is loaded recursively, using ``UnstructuredFileIOLoader``
        as the per-file loader.

        Args:
            url: The Google Drive folder URL.

        Returns:
            A dict with two keys: ``"doc_id"``, the SHA-256 hex digest of all
            document contents joined by spaces followed by ``url``; and
            ``"data"``, a list of ``{"content": ..., "meta_data": ...}`` dicts,
            one per loaded document.

        Raises:
            ValueError: If ``url`` is not a Google Drive folder URL.
            FileNotFoundError: If the Drive API call fails with ``HttpError``.
        """
        folder_id: str = self._get_drive_id_from_url(url)

        # Only the loader calls are guarded; the post-processing below is plain
        # Python and cannot raise HttpError.
        try:
            loader = Loader(
                folder_id=folder_id,
                recursive=True,
                file_loader_cls=UnstructuredFileIOLoader,
            )
            docs = loader.load()
        except HttpError as err:
            # Chain the original error so the API failure details stay visible.
            raise FileNotFoundError(
                "Unable to locate folder or files, check provided drive URL and try again"
            ) from err

        data = []
        all_content = []
        for doc in docs:
            all_content.append(doc.page_content)
            # renames source to url for later use.
            doc.metadata["url"] = doc.metadata.pop("source")
            data.append({"content": doc.page_content, "meta_data": doc.metadata})

        # The ID depends on both the folder contents and the URL given.
        doc_id = hashlib.sha256((" ".join(all_content) + url).encode()).hexdigest()
        return {"doc_id": doc_id, "data": data}