From 78301ee63d4927016764407bbeeb8a0ee28c1cb7 Mon Sep 17 00:00:00 2001 From: Esparon1 <136267691+Esparon1@users.noreply.github.com> Date: Mon, 6 May 2024 18:31:04 +0100 Subject: [PATCH] Add feature to extract timestamps from youtube videos (#1345) --- embedchain/loaders/youtube_video.py | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/embedchain/loaders/youtube_video.py b/embedchain/loaders/youtube_video.py index 764a7d2a..ff0c4d90 100644 --- a/embedchain/loaders/youtube_video.py +++ b/embedchain/loaders/youtube_video.py @@ -1,5 +1,12 @@ import hashlib - +import json +import logging +try: + from youtube_transcript_api import YouTubeTranscriptApi +except ImportError: + raise ImportError( + 'YouTube video requires extra dependencies. Install with `pip install youtube-transcript-api "`' + ) try: from langchain_community.document_loaders import YoutubeLoader except ImportError: @@ -25,6 +32,18 @@ class YoutubeVideoLoader(BaseLoader): metadata = doc[0].metadata metadata["url"] = url + + video_id = url.split("v=")[1].split('&')[0] + try: + # Fetching transcript data + transcript = YouTubeTranscriptApi.get_transcript(video_id,languages=['en']) + # convert transcript to json to avoid unicode symbols + metadata["transcript"] = json.dumps(transcript, ensure_ascii=True) + except Exception as e: + logging.exception(f"Failed to fetch transcript for video {url}") + metadata["transcript"] = "Unavailable" + + output.append( { "content": content,