Added language detection for non-English YouTube videos (#1362)

This commit is contained in:
Ananto Joyoadikusumo
2024-06-14 13:02:37 +07:00
committed by GitHub
parent 439b425c61
commit 4800e0344c
3 changed files with 19 additions and 18 deletions

View File

@@ -69,7 +69,8 @@ class OpenAILlm(BaseLlm):
messages: list[BaseMessage],
) -> str:
from langchain.output_parsers.openai_tools import JsonOutputToolsParser
from langchain_core.utils.function_calling import convert_to_openai_tool
from langchain_core.utils.function_calling import \
convert_to_openai_tool
openai_tools = [convert_to_openai_tool(tools)]
chat = chat.bind(tools=openai_tools).pipe(JsonOutputToolsParser())

View File

@@ -8,6 +8,7 @@ except ImportError:
raise ImportError('YouTube video requires extra dependencies. Install with `pip install youtube-transcript-api "`')
try:
from langchain_community.document_loaders import YoutubeLoader
from langchain_community.document_loaders.youtube import _parse_video_id
except ImportError:
raise ImportError(
'YouTube video requires extra dependencies. Install with `pip install --upgrade "embedchain[dataloaders]"`'
@@ -21,7 +22,20 @@ from embedchain.utils.misc import clean_string
class YoutubeVideoLoader(BaseLoader):
def load_data(self, url):
"""Load data from a Youtube video."""
loader = YoutubeLoader.from_youtube_url(url, add_video_info=True)
video_id = _parse_video_id(url)
languages = ["en"]
try:
# Fetching transcript data
languages = [transcript.language_code for transcript in YouTubeTranscriptApi.list_transcripts(video_id)]
transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=languages)
# convert transcript to JSON so non-ASCII characters are escaped
transcript = json.dumps(transcript, ensure_ascii=True)
except Exception:
logging.exception(f"Failed to fetch transcript for video {url}")
transcript = "Unavailable"
loader = YoutubeLoader.from_youtube_url(url, add_video_info=True, language=languages)
doc = loader.load()
output = []
if not len(doc):
@@ -30,16 +44,7 @@ class YoutubeVideoLoader(BaseLoader):
content = clean_string(content)
metadata = doc[0].metadata
metadata["url"] = url
video_id = url.split("v=")[1].split("&")[0]
try:
# Fetching transcript data
transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=["en"])
# convert transcript to JSON so non-ASCII characters are escaped
metadata["transcript"] = json.dumps(transcript, ensure_ascii=True)
except Exception:
logging.exception(f"Failed to fetch transcript for video {url}")
metadata["transcript"] = "Unavailable"
metadata["transcript"] = transcript
output.append(
{

View File

@@ -1,5 +1,4 @@
import hashlib
import json
from unittest.mock import MagicMock, Mock, patch
import pytest
@@ -37,11 +36,7 @@ def test_load_data(youtube_video_loader):
expected_data = [
{
"content": "This is a YouTube video content.",
"meta_data": {
"url": video_url,
"title": "Test Video",
"transcript": json.dumps(mock_transcript, ensure_ascii=True),
},
"meta_data": {"url": video_url, "title": "Test Video", "transcript": "Unavailable"},
}
]