Added language detection for non-English YouTube videos (#1362)

This commit is contained in:
Ananto Joyoadikusumo
2024-06-14 13:02:37 +07:00
committed by GitHub
parent 439b425c61
commit 4800e0344c
3 changed files with 19 additions and 18 deletions

View File

@@ -69,7 +69,8 @@ class OpenAILlm(BaseLlm):
messages: list[BaseMessage], messages: list[BaseMessage],
) -> str: ) -> str:
from langchain.output_parsers.openai_tools import JsonOutputToolsParser from langchain.output_parsers.openai_tools import JsonOutputToolsParser
from langchain_core.utils.function_calling import convert_to_openai_tool from langchain_core.utils.function_calling import \
convert_to_openai_tool
openai_tools = [convert_to_openai_tool(tools)] openai_tools = [convert_to_openai_tool(tools)]
chat = chat.bind(tools=openai_tools).pipe(JsonOutputToolsParser()) chat = chat.bind(tools=openai_tools).pipe(JsonOutputToolsParser())

View File

@@ -8,6 +8,7 @@ except ImportError:
raise ImportError('YouTube video requires extra dependencies. Install with `pip install youtube-transcript-api "`') raise ImportError('YouTube video requires extra dependencies. Install with `pip install youtube-transcript-api "`')
try: try:
from langchain_community.document_loaders import YoutubeLoader from langchain_community.document_loaders import YoutubeLoader
from langchain_community.document_loaders.youtube import _parse_video_id
except ImportError: except ImportError:
raise ImportError( raise ImportError(
'YouTube video requires extra dependencies. Install with `pip install --upgrade "embedchain[dataloaders]"`' 'YouTube video requires extra dependencies. Install with `pip install --upgrade "embedchain[dataloaders]"`'
@@ -21,7 +22,20 @@ from embedchain.utils.misc import clean_string
class YoutubeVideoLoader(BaseLoader): class YoutubeVideoLoader(BaseLoader):
def load_data(self, url): def load_data(self, url):
"""Load data from a Youtube video.""" """Load data from a Youtube video."""
loader = YoutubeLoader.from_youtube_url(url, add_video_info=True) video_id = _parse_video_id(url)
languages = ["en"]
try:
# Fetching transcript data
languages = [transcript.language_code for transcript in YouTubeTranscriptApi.list_transcripts(video_id)]
transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=languages)
# convert transcript to JSON to avoid unicode symbols
transcript = json.dumps(transcript, ensure_ascii=True)
except Exception:
logging.exception(f"Failed to fetch transcript for video {url}")
transcript = "Unavailable"
loader = YoutubeLoader.from_youtube_url(url, add_video_info=True, language=languages)
doc = loader.load() doc = loader.load()
output = [] output = []
if not len(doc): if not len(doc):
@@ -30,16 +44,7 @@ class YoutubeVideoLoader(BaseLoader):
content = clean_string(content) content = clean_string(content)
metadata = doc[0].metadata metadata = doc[0].metadata
metadata["url"] = url metadata["url"] = url
metadata["transcript"] = transcript
video_id = url.split("v=")[1].split("&")[0]
try:
# Fetching transcript data
transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=["en"])
# convert transcript to JSON to avoid unicode symbols
metadata["transcript"] = json.dumps(transcript, ensure_ascii=True)
except Exception:
logging.exception(f"Failed to fetch transcript for video {url}")
metadata["transcript"] = "Unavailable"
output.append( output.append(
{ {

View File

@@ -1,5 +1,4 @@
import hashlib import hashlib
import json
from unittest.mock import MagicMock, Mock, patch from unittest.mock import MagicMock, Mock, patch
import pytest import pytest
@@ -37,11 +36,7 @@ def test_load_data(youtube_video_loader):
expected_data = [ expected_data = [
{ {
"content": "This is a YouTube video content.", "content": "This is a YouTube video content.",
"meta_data": { "meta_data": {"url": video_url, "title": "Test Video", "transcript": "Unavailable"},
"url": video_url,
"title": "Test Video",
"transcript": json.dumps(mock_transcript, ensure_ascii=True),
},
} }
] ]