Add feature to extract timestamps from youtube videos (#1345)
This commit is contained in:
@@ -1,5 +1,12 @@
|
|||||||
import hashlib
|
import hashlib
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
try:
|
||||||
|
from youtube_transcript_api import YouTubeTranscriptApi
|
||||||
|
except ImportError:
|
||||||
|
raise ImportError(
|
||||||
|
'YouTube video requires extra dependencies. Install with `pip install youtube-transcript-api "`'
|
||||||
|
)
|
||||||
try:
|
try:
|
||||||
from langchain_community.document_loaders import YoutubeLoader
|
from langchain_community.document_loaders import YoutubeLoader
|
||||||
except ImportError:
|
except ImportError:
|
||||||
@@ -25,6 +32,18 @@ class YoutubeVideoLoader(BaseLoader):
|
|||||||
metadata = doc[0].metadata
|
metadata = doc[0].metadata
|
||||||
metadata["url"] = url
|
metadata["url"] = url
|
||||||
|
|
||||||
|
|
||||||
|
video_id = url.split("v=")[1].split('&')[0]
|
||||||
|
try:
|
||||||
|
# Fetching transcript data
|
||||||
|
transcript = YouTubeTranscriptApi.get_transcript(video_id,languages=['en'])
|
||||||
|
# convert transcript to JSON to avoid Unicode symbols
|
||||||
|
metadata["transcript"] = json.dumps(transcript, ensure_ascii=True)
|
||||||
|
except Exception as e:
|
||||||
|
logging.exception(f"Failed to fetch transcript for video {url}")
|
||||||
|
metadata["transcript"] = "Unavailable"
|
||||||
|
|
||||||
|
|
||||||
output.append(
|
output.append(
|
||||||
{
|
{
|
||||||
"content": content,
|
"content": content,
|
||||||
|
|||||||
Reference in New Issue
Block a user