Add feature to extract timestamps from youtube videos (#1345)
This commit is contained in:
@@ -1,5 +1,12 @@
|
||||
import hashlib
|
||||
|
||||
import json
|
||||
import logging
|
||||
# Optional dependency guard: the transcript feature needs the third-party
# `youtube-transcript-api` package. Fail at import time with an actionable
# installation hint instead of a bare "No module named ..." error.
try:
    from youtube_transcript_api import YouTubeTranscriptApi
except ImportError as exc:
    # The hint previously ended with a stray double quote inside the
    # backticks, which produced a broken shell command when copy-pasted.
    raise ImportError(
        "YouTube video requires extra dependencies. "
        "Install with `pip install youtube-transcript-api`"
    ) from exc
|
||||
try:
|
||||
from langchain_community.document_loaders import YoutubeLoader
|
||||
except ImportError:
|
||||
@@ -25,6 +32,18 @@ class YoutubeVideoLoader(BaseLoader):
|
||||
metadata = doc[0].metadata
|
||||
metadata["url"] = url
|
||||
|
||||
|
||||
video_id = url.split("v=")[1].split('&')[0]
|
||||
try:
|
||||
# Fetching transcript data
|
||||
transcript = YouTubeTranscriptApi.get_transcript(video_id,languages=['en'])
|
||||
# Convert the transcript to JSON to avoid Unicode symbols
|
||||
metadata["transcript"] = json.dumps(transcript, ensure_ascii=True)
|
||||
except Exception as e:
|
||||
logging.exception(f"Failed to fetch transcript for video {url}")
|
||||
metadata["transcript"] = "Unavailable"
|
||||
|
||||
|
||||
output.append(
|
||||
{
|
||||
"content": content,
|
||||
|
||||
Reference in New Issue
Block a user