[Features] Add Github and Youtube Channel loaders (#957)

Co-authored-by: Deven Patel <deven298@yahoo.com>
Co-authored-by: Deshraj Yadav <deshrajdry@gmail.com>
This commit is contained in:
Deven Patel
2023-11-15 19:17:42 -08:00
committed by GitHub
parent 3fa7db8420
commit 07fb6bee54
9 changed files with 303 additions and 5 deletions

View File

@@ -1,5 +1,8 @@
import hashlib
from langchain.text_splitter import RecursiveCharacterTextSplitter
from embedchain.config.add_config import ChunkerConfig
from embedchain.helper.json_serializable import JSONSerializable
from embedchain.models.data_type import DataType
@@ -7,7 +10,15 @@ from embedchain.models.data_type import DataType
class BaseChunker(JSONSerializable):
def __init__(self, text_splitter):
    """Initialize the chunker.

    Args:
        text_splitter: Splitter stored on ``self.text_splitter``. When it is
            ``None`` a ``RecursiveCharacterTextSplitter`` is built from a
            default ``ChunkerConfig`` (chunk_size=1000, chunk_overlap=0,
            length_function=len), so the class can be instantiated without
            a dedicated chunker subclass.
    """
    if text_splitter is None:
        # No splitter supplied: fall back to the default configuration.
        config = ChunkerConfig(chunk_size=1000, chunk_overlap=0, length_function=len)
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=config.chunk_size,
            chunk_overlap=config.chunk_overlap,
            length_function=config.length_function,
        )
    # Single assignment instead of an unconditional store that the branch
    # below it immediately overwrote.
    self.text_splitter = text_splitter
    self.data_type = None
def create_chunks(self, loader, src, app_id=None):

View File

@@ -64,6 +64,8 @@ class DataFormatter(JSONSerializable):
DataType.GMAIL: "embedchain.loaders.gmail.GmailLoader",
DataType.NOTION: "embedchain.loaders.notion.NotionLoader",
DataType.SUBSTACK: "embedchain.loaders.substack.SubstackLoader",
DataType.GITHUB: "embedchain.loaders.github.GithubLoader",
DataType.YOUTUBE_CHANNEL: "embedchain.loaders.youtube_channel.YoutubeChannelLoader",
}
custom_loaders = set(
@@ -114,6 +116,8 @@ class DataFormatter(JSONSerializable):
DataType.SLACK: "embedchain.chunkers.slack.SlackChunker",
DataType.DISCOURSE: "embedchain.chunkers.discourse.DiscourseChunker",
DataType.SUBSTACK: "embedchain.chunkers.substack.SubstackChunker",
DataType.GITHUB: "embedchain.chunkers.base_chunker.BaseChunker",
DataType.YOUTUBE_CHANNEL: "embedchain.chunkers.base_chunker.BaseChunker",
}
if data_type in chunker_classes:

View File

@@ -0,0 +1,81 @@
import concurrent.futures
import hashlib
import logging
import os
from embedchain.loaders.base_loader import BaseLoader
from embedchain.loaders.json import JSONLoader
from embedchain.loaders.mdx import MdxLoader
from embedchain.loaders.unstructured_file import UnstructuredLoader
from embedchain.utils import detect_datatype
class GithubLoader(BaseLoader):
    """Loader that clones a git repository locally and loads the files in its working tree."""

    def load_data(self, repo_url):
        """Load data from a git repo.

        The repository is cloned into ``/tmp/<sha256 of repo_url>`` (or, when
        that path already exists, updated in place). Every file of the working
        tree is then loaded: ``mdx`` files with ``MdxLoader``, ``json`` files
        with ``JSONLoader`` and everything else with ``UnstructuredLoader``.

        Args:
            repo_url: URL of the repository, handed to ``Repo.clone_from``.

        Returns:
            A dict with ``"doc_id"`` (sha256 of ``repo_url`` followed by the
            urls of all loaded entries) and ``"data"`` (the list of entries
            returned by the per-file loaders).

        Raises:
            ValueError: If GitPython is not installed.
        """
        try:
            from git import Repo
        except ImportError as e:
            raise ValueError(
                "GithubLoader requires extra dependencies. Install with `pip install --upgrade 'embedchain[git]'`"
            ) from e

        mdx_loader = MdxLoader()
        json_loader = JSONLoader()
        unstructured_loader = UnstructuredLoader()
        data = []
        data_urls = []

        def _fetch_or_clone_repo(repo_url: str, local_path: str):
            """Ensure an up-to-date checkout of `repo_url` exists at `local_path`."""
            if os.path.exists(local_path):
                logging.info("Repository already exists. Fetching updates...")
                repo = Repo(local_path)
                # A plain fetch only moves the remote-tracking refs; the files
                # read below come from the working tree, so pull to update it.
                repo.remotes.origin.pull()
                logging.info("Fetch completed.")
            else:
                logging.info("Cloning repository...")
                Repo.clone_from(repo_url, local_path)
                logging.info("Clone completed.")

        def _load_file(file_path: str):
            """Load one file with the loader matching its detected data type."""
            try:
                data_type = detect_datatype(file_path).value
            except Exception:
                # Detection failure is not fatal: treat the file as unstructured.
                data_type = "unstructured"

            if data_type == "mdx":
                loaded = mdx_loader.load_data(file_path)
            elif data_type == "json":
                loaded = json_loader.load_data(file_path)
            else:
                loaded = unstructured_loader.load_data(file_path)
            return loaded.get("data", [])

        def _iter_repo_files(repo_path: str):
            """Yield working-tree file paths in sorted order, skipping git metadata."""
            for root, dirs, files in os.walk(repo_path):
                # Prune in place so os.walk never descends into `.git`, whose
                # object/pack files are not repository content.
                dirs[:] = sorted(d for d in dirs if d != ".git")
                for filename in sorted(files):
                    # Submodules/worktrees store `.git` as a plain file.
                    if filename != ".git":
                        yield os.path.join(root, filename)

        def _add_repo_files(repo_path: str):
            """Load all repository files concurrently and accumulate the results."""
            with concurrent.futures.ThreadPoolExecutor() as executor:
                future_to_file = {
                    executor.submit(_load_file, file_path): file_path for file_path in _iter_repo_files(repo_path)
                }
                # Consume in submission order rather than completion order so
                # that `data` and the derived doc_id are stable across runs.
                for future, file in future_to_file.items():
                    try:
                        results = future.result()
                        if results:
                            # Extract urls first so a malformed entry cannot
                            # leave `data` and `data_urls` out of sync.
                            urls = [result.get("meta_data").get("url") for result in results]
                            data.extend(results)
                            data_urls.extend(urls)
                    except Exception as e:
                        logging.error(f"Failed to process {file}: {e}")

        source_hash = hashlib.sha256(repo_url.encode()).hexdigest()
        repo_path = f"/tmp/{source_hash}"
        _fetch_or_clone_repo(repo_url=repo_url, local_path=repo_path)
        _add_repo_files(repo_path)
        doc_id = hashlib.sha256((repo_url + ", ".join(data_urls)).encode()).hexdigest()
        return {
            "doc_id": doc_id,
            "data": data,
        }

View File

@@ -57,8 +57,8 @@ class SitemapLoader(BaseLoader):
try:
data = future.result()
if data:
output.append(data)
output.extend(data)
except Exception as e:
logging.error(f"Error loading page {link}: {e}")
return {"doc_id": doc_id, "data": [data[0] for data in output if data]}
return {"doc_id": doc_id, "data": output}

View File

@@ -0,0 +1,70 @@
import concurrent.futures
import hashlib
import logging
from embedchain.loaders.base_loader import BaseLoader
from embedchain.loaders.youtube_video import YoutubeVideoLoader
class YoutubeChannelLoader(BaseLoader):
    """Loader for youtube channel."""

    def load_data(self, channel_name):
        """Load the videos listed on a channel's ``/videos`` page.

        The video links are read with ``yt_dlp`` from
        ``https://www.youtube.com/<channel_name>/videos`` and each link is then
        loaded with ``YoutubeVideoLoader``.

        Args:
            channel_name: Path segment identifying the channel; it is inserted
                verbatim into the URL above.

        Returns:
            A dict with ``"doc_id"`` (sha256 of the channel URL followed by the
            urls of all loaded entries) and ``"data"`` (the list of entries
            returned by the video loader). Videos that fail to load are logged
            and skipped.

        Raises:
            ValueError: If ``yt_dlp`` is not installed.
        """
        try:
            import yt_dlp
        except ImportError as e:
            raise ValueError(
                "YoutubeLoader requires extra dependencies. Install with `pip install --upgrade 'embedchain[youtube_channel]'`"  # noqa: E501
            ) from e

        data = []
        data_urls = []
        youtube_url = f"https://www.youtube.com/{channel_name}/videos"
        youtube_video_loader = YoutubeVideoLoader()

        def _get_yt_video_links():
            """Return the channel's video links; always a list, empty on any failure."""
            ydl_opts = {
                "quiet": True,
                "extract_flat": True,
            }
            try:
                with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                    info_dict = ydl.extract_info(youtube_url, download=False)
                    entries = (info_dict or {}).get("entries") or []
                    # Skip empty entries and entries without a url instead of
                    # discarding the whole listing because of one bad item.
                    return [entry["url"] for entry in entries if entry and entry.get("url")]
            except Exception as e:
                logging.error(f"Failed to fetch youtube videos for channel: {channel_name}: {e}")
            # Reached only after a logged failure; callers iterate the result,
            # so never hand back None.
            return []

        def _load_yt_video(video_link):
            """Load a single video; returns its data entries or None on failure."""
            try:
                each_load_data = youtube_video_loader.load_data(video_link)
                if each_load_data:
                    return each_load_data.get("data")
            except Exception as e:
                logging.error(f"Failed to load youtube video {video_link}: {e}")
            return None

        def _add_youtube_channel():
            """Load all channel videos concurrently and accumulate the results."""
            video_links = _get_yt_video_links()
            with concurrent.futures.ThreadPoolExecutor() as executor:
                future_to_video = {
                    executor.submit(_load_yt_video, video_link): video_link for video_link in video_links
                }
                # Consume in submission order rather than completion order so
                # that `data` and the derived doc_id are stable across runs.
                for future, video in future_to_video.items():
                    try:
                        results = future.result()
                        if results:
                            # Extract urls first so a malformed entry cannot
                            # leave `data` and `data_urls` out of sync.
                            urls = [result.get("meta_data").get("url") for result in results]
                            data.extend(results)
                            data_urls.extend(urls)
                    except Exception as e:
                        logging.error(f"Failed to process youtube video {video}: {e}")

        _add_youtube_channel()
        doc_id = hashlib.sha256((youtube_url + ", ".join(data_urls)).encode()).hexdigest()
        return {
            "doc_id": doc_id,
            "data": data,
        }

View File

@@ -34,6 +34,8 @@ class IndirectDataType(Enum):
SLACK = "slack"
DISCOURSE = "discourse"
SUBSTACK = "substack"
GITHUB = "github"
YOUTUBE_CHANNEL = "youtube_channel"
class SpecialDataType(Enum):
@@ -67,3 +69,5 @@ class DataType(Enum):
SLACK = IndirectDataType.SLACK.value
DISCOURSE = IndirectDataType.DISCOURSE.value
SUBSTACK = IndirectDataType.SUBSTACK.value
GITHUB = IndirectDataType.GITHUB.value
YOUTUBE_CHANNEL = IndirectDataType.YOUTUBE_CHANNEL.value

View File

@@ -255,6 +255,10 @@ def detect_datatype(source: Any) -> DataType:
logging.debug(f"Source of `{formatted_source}` detected as `docs_site`.")
return DataType.DOCS_SITE
if "github.com" in url.netloc:
logging.debug(f"Source of `{formatted_source}` detected as `github`.")
return DataType.GITHUB
# If none of the above conditions are met, it's a general web page
logging.debug(f"Source of `{formatted_source}` detected as `web_page`.")
return DataType.WEB_PAGE

119
poetry.lock generated
View File

@@ -1691,6 +1691,37 @@ files = [
[package.dependencies]
wcwidth = ">=0.2.5"
[[package]]
name = "gitdb"
version = "4.0.11"
description = "Git Object Database"
optional = true
python-versions = ">=3.7"
files = [
{file = "gitdb-4.0.11-py3-none-any.whl", hash = "sha256:81a3407ddd2ee8df444cbacea00e2d038e40150acfa3001696fe0dcf1d3adfa4"},
{file = "gitdb-4.0.11.tar.gz", hash = "sha256:bf5421126136d6d0af55bc1e7c1af1c397a34f5b7bd79e776cd3e89785c2b04b"},
]
[package.dependencies]
smmap = ">=3.0.1,<6"
[[package]]
name = "gitpython"
version = "3.1.40"
description = "GitPython is a Python library used to interact with Git repositories"
optional = true
python-versions = ">=3.7"
files = [
{file = "GitPython-3.1.40-py3-none-any.whl", hash = "sha256:cf14627d5a8049ffbf49915732e5eddbe8134c3bdb9d476e6182b676fc573f8a"},
{file = "GitPython-3.1.40.tar.gz", hash = "sha256:22b126e9ffb671fdd0c129796343a02bf67bf2994b35449ffc9321aa755e18a4"},
]
[package.dependencies]
gitdb = ">=4.0.1,<5"
[package.extras]
test = ["black", "coverage[toml]", "ddt (>=1.1.1,!=1.4.3)", "mock", "mypy", "pre-commit", "pytest", "pytest-cov", "pytest-instafail", "pytest-subtests", "pytest-sugar"]
[[package]]
name = "google-api-core"
version = "2.12.0"
@@ -3380,6 +3411,17 @@ files = [
{file = "multidict-6.0.4.tar.gz", hash = "sha256:3666906492efb76453c0e7b97f2cf459b0682e7402c0489a95484965dbc1da49"},
]
[[package]]
name = "mutagen"
version = "1.47.0"
description = "read and write audio tags for many formats"
optional = true
python-versions = ">=3.7"
files = [
{file = "mutagen-1.47.0-py3-none-any.whl", hash = "sha256:edd96f50c5907a9539d8e5bba7245f62c9f520aef333d13392a79a4f70aca719"},
{file = "mutagen-1.47.0.tar.gz", hash = "sha256:719fadef0a978c31b4cf3c956261b3c58b6948b32023078a2117b1de09f0fc99"},
]
[[package]]
name = "mypy-extensions"
version = "1.0.0"
@@ -4639,6 +4681,47 @@ files = [
{file = "pycparser-2.21.tar.gz", hash = "sha256:e644fdec12f7872f86c58ff790da456218b10f863970249516d60a5eaca77206"},
]
[[package]]
name = "pycryptodomex"
version = "3.19.0"
description = "Cryptographic library for Python"
optional = true
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*"
files = [
{file = "pycryptodomex-3.19.0-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:ff64fd720def623bf64d8776f8d0deada1cc1bf1ec3c1f9d6f5bb5bd098d034f"},
{file = "pycryptodomex-3.19.0-cp27-cp27m-manylinux2010_i686.whl", hash = "sha256:61056a1fd3254f6f863de94c233b30dd33bc02f8c935b2000269705f1eeeffa4"},
{file = "pycryptodomex-3.19.0-cp27-cp27m-manylinux2010_x86_64.whl", hash = "sha256:258c4233a3fe5a6341780306a36c6fb072ef38ce676a6d41eec3e591347919e8"},
{file = "pycryptodomex-3.19.0-cp27-cp27m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6e45bb4635b3c4e0a00ca9df75ef6295838c85c2ac44ad882410cb631ed1eeaa"},
{file = "pycryptodomex-3.19.0-cp27-cp27m-musllinux_1_1_aarch64.whl", hash = "sha256:a12144d785518f6491ad334c75ccdc6ad52ea49230b4237f319dbb7cef26f464"},
{file = "pycryptodomex-3.19.0-cp27-cp27m-win32.whl", hash = "sha256:1789d89f61f70a4cd5483d4dfa8df7032efab1118f8b9894faae03c967707865"},
{file = "pycryptodomex-3.19.0-cp27-cp27m-win_amd64.whl", hash = "sha256:eb2fc0ec241bf5e5ef56c8fbec4a2634d631e4c4f616a59b567947a0f35ad83c"},
{file = "pycryptodomex-3.19.0-cp27-cp27mu-manylinux2010_i686.whl", hash = "sha256:c9a68a2f7bd091ccea54ad3be3e9d65eded813e6d79fdf4cc3604e26cdd6384f"},
{file = "pycryptodomex-3.19.0-cp27-cp27mu-manylinux2010_x86_64.whl", hash = "sha256:8df69e41f7e7015a90b94d1096ec3d8e0182e73449487306709ec27379fff761"},
{file = "pycryptodomex-3.19.0-cp27-cp27mu-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:917033016ecc23c8933205585a0ab73e20020fdf671b7cd1be788a5c4039840b"},
{file = "pycryptodomex-3.19.0-cp27-cp27mu-musllinux_1_1_aarch64.whl", hash = "sha256:e8e5ecbd4da4157889fce8ba49da74764dd86c891410bfd6b24969fa46edda51"},
{file = "pycryptodomex-3.19.0-cp35-abi3-macosx_10_9_universal2.whl", hash = "sha256:a77b79852175064c822b047fee7cf5a1f434f06ad075cc9986aa1c19a0c53eb0"},
{file = "pycryptodomex-3.19.0-cp35-abi3-macosx_10_9_x86_64.whl", hash = "sha256:5b883e1439ab63af976656446fb4839d566bb096f15fc3c06b5a99cde4927188"},
{file = "pycryptodomex-3.19.0-cp35-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a3866d68e2fc345162b1b9b83ef80686acfe5cec0d134337f3b03950a0a8bf56"},
{file = "pycryptodomex-3.19.0-cp35-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c74eb1f73f788facece7979ce91594dc177e1a9b5d5e3e64697dd58299e5cb4d"},
{file = "pycryptodomex-3.19.0-cp35-abi3-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7cb51096a6a8d400724104db8a7e4f2206041a1f23e58924aa3d8d96bcb48338"},
{file = "pycryptodomex-3.19.0-cp35-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:a588a1cb7781da9d5e1c84affd98c32aff9c89771eac8eaa659d2760666f7139"},
{file = "pycryptodomex-3.19.0-cp35-abi3-musllinux_1_1_i686.whl", hash = "sha256:d4dd3b381ff5a5907a3eb98f5f6d32c64d319a840278ceea1dcfcc65063856f3"},
{file = "pycryptodomex-3.19.0-cp35-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:263de9a96d2fcbc9f5bd3a279f14ea0d5f072adb68ebd324987576ec25da084d"},
{file = "pycryptodomex-3.19.0-cp35-abi3-win32.whl", hash = "sha256:67c8eb79ab33d0fbcb56842992298ddb56eb6505a72369c20f60bc1d2b6fb002"},
{file = "pycryptodomex-3.19.0-cp35-abi3-win_amd64.whl", hash = "sha256:09c9401dc06fb3d94cb1ec23b4ea067a25d1f4c6b7b118ff5631d0b5daaab3cc"},
{file = "pycryptodomex-3.19.0-pp27-pypy_73-manylinux2010_x86_64.whl", hash = "sha256:edbe083c299835de7e02c8aa0885cb904a75087d35e7bab75ebe5ed336e8c3e2"},
{file = "pycryptodomex-3.19.0-pp27-pypy_73-win32.whl", hash = "sha256:136b284e9246b4ccf4f752d435c80f2c44fc2321c198505de1d43a95a3453b3c"},
{file = "pycryptodomex-3.19.0-pp310-pypy310_pp73-macosx_10_9_x86_64.whl", hash = "sha256:5d73e9fa3fe830e7b6b42afc49d8329b07a049a47d12e0ef9225f2fd220f19b2"},
{file = "pycryptodomex-3.19.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0b2f1982c5bc311f0aab8c293524b861b485d76f7c9ab2c3ac9a25b6f7655975"},
{file = "pycryptodomex-3.19.0-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:bfb040b5dda1dff1e197d2ef71927bd6b8bfcb9793bc4dfe0bb6df1e691eaacb"},
{file = "pycryptodomex-3.19.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:800a2b05cfb83654df80266692f7092eeefe2a314fa7901dcefab255934faeec"},
{file = "pycryptodomex-3.19.0-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:c01678aee8ac0c1a461cbc38ad496f953f9efcb1fa19f5637cbeba7544792a53"},
{file = "pycryptodomex-3.19.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2126bc54beccbede6eade00e647106b4f4c21e5201d2b0a73e9e816a01c50905"},
{file = "pycryptodomex-3.19.0-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b801216c48c0886742abf286a9a6b117e248ca144d8ceec1f931ce2dd0c9cb40"},
{file = "pycryptodomex-3.19.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:50cb18d4dd87571006fd2447ccec85e6cec0136632a550aa29226ba075c80644"},
{file = "pycryptodomex-3.19.0.tar.gz", hash = "sha256:af83a554b3f077564229865c45af0791be008ac6469ef0098152139e6bd4b5b6"},
]
[[package]]
name = "pydantic"
version = "2.4.2"
@@ -6014,6 +6097,17 @@ files = [
optional = ["SQLAlchemy (>=1.4,<3)", "aiodns (>1.0)", "aiohttp (>=3.7.3,<4)", "boto3 (<=2)", "websocket-client (>=1,<2)", "websockets (>=10,<11)"]
testing = ["Flask (>=1,<2)", "Flask-Sockets (>=0.2,<1)", "Jinja2 (==3.0.3)", "Werkzeug (<2)", "black (==22.8.0)", "boto3 (<=2)", "click (==8.0.4)", "databases (>=0.5)", "flake8 (>=5,<6)", "itsdangerous (==1.1.0)", "moto (>=3,<4)", "psutil (>=5,<6)", "pytest (>=6.2.5,<7)", "pytest-asyncio (<1)", "pytest-cov (>=2,<3)"]
[[package]]
name = "smmap"
version = "5.0.1"
description = "A pure Python implementation of a sliding window memory map manager"
optional = true
python-versions = ">=3.7"
files = [
{file = "smmap-5.0.1-py3-none-any.whl", hash = "sha256:e6d8668fa5f93e706934a62d7b4db19c8d9eb8cf2adbb75ef1b675aa332b69da"},
{file = "smmap-5.0.1.tar.gz", hash = "sha256:dceeb6c0028fdb6734471eb07c0cd2aae706ccaecab45965ee83f11c8d3b1f62"},
]
[[package]]
name = "sniffio"
version = "1.3.0"
@@ -7508,6 +7602,27 @@ files = [
[package.dependencies]
requests = "*"
[[package]]
name = "yt-dlp"
version = "2023.11.16"
description = "A youtube-dl fork with additional features and patches"
optional = true
python-versions = ">=3.7"
files = [
{file = "yt-dlp-2023.11.16.tar.gz", hash = "sha256:f0ccdaf12e08b15902601a4671c7ab12906d7b11de3ae75fa6506811c24ec5da"},
{file = "yt_dlp-2023.11.16-py2.py3-none-any.whl", hash = "sha256:0322ba85aa4afdb75f8641ed550e5958964daff034aeb477abb15031fd9a51ed"},
]
[package.dependencies]
brotli = {version = "*", markers = "implementation_name == \"cpython\""}
brotlicffi = {version = "*", markers = "implementation_name != \"cpython\""}
certifi = "*"
mutagen = "*"
pycryptodomex = "*"
requests = ">=2.31.0,<3"
urllib3 = ">=1.26.17,<3"
websockets = "*"
[[package]]
name = "zipp"
version = "3.17.0"
@@ -7529,6 +7644,7 @@ community = ["llama-hub"]
dataloaders = ["beautifulsoup4", "docx2txt", "duckduckgo-search", "pypdf", "pytube", "sentence-transformers", "unstructured"]
discord = ["discord"]
elasticsearch = ["elasticsearch"]
git = ["gitpython"]
gmail = ["llama-hub", "requests"]
huggingface-hub = ["huggingface_hub"]
images = ["ftfy", "pillow", "regex", "torch", "torchvision"]
@@ -7547,8 +7663,9 @@ streamlit = []
vertexai = ["google-cloud-aiplatform"]
weaviate = ["weaviate-client"]
whatsapp = ["flask", "twilio"]
youtube-channel = ["yt_dlp"]
[metadata]
lock-version = "2.0"
python-versions = ">=3.9,<3.12"
content-hash = "fe9ebe5f637303885981d10ace60b955635c7ca7586605546837e59206bfefd7"
content-hash = "a7282080c7a4379bdc6f33dfe9cae7eb20764aae0176137ba5c7af7cdcc58ede"

View File

@@ -1,6 +1,6 @@
[tool.poetry]
name = "embedchain"
version = "0.1.13"
version = "0.1.14"
description = "Data platform for LLMs - Load, index, retrieve and sync any unstructured data"
authors = [
"Taranjeet Singh <taranjeet@embedchain.ai>",
@@ -134,6 +134,8 @@ psycopg = { version = "^3.1.12", optional = true }
psycopg-binary = { version = "^3.1.12", optional = true }
psycopg-pool = { version = "^3.1.8", optional = true }
mysql-connector-python = { version = "^8.1.0", optional = true }
gitpython = { version = "^3.1.38", optional = true }
yt_dlp = { version = "^2023.11.14", optional = true }
[tool.poetry.group.dev.dependencies]
black = "^23.3.0"
@@ -190,6 +192,11 @@ gmail = [
json = ["llama-hub"]
postgres = ["psycopg", "psycopg-binary", "psycopg-pool"]
mysql = ["mysql-connector-python"]
git = ["gitpython"]
youtube_channel = [
"yt_dlp",
"youtube-transcripts-api",
]
[tool.poetry.group.docs.dependencies]