Added language detection for non-english youtube videos (#1362)
This commit is contained in:
committed by
GitHub
parent
439b425c61
commit
4800e0344c
@@ -69,7 +69,8 @@ class OpenAILlm(BaseLlm):
|
|||||||
messages: list[BaseMessage],
|
messages: list[BaseMessage],
|
||||||
) -> str:
|
) -> str:
|
||||||
from langchain.output_parsers.openai_tools import JsonOutputToolsParser
|
from langchain.output_parsers.openai_tools import JsonOutputToolsParser
|
||||||
from langchain_core.utils.function_calling import convert_to_openai_tool
|
from langchain_core.utils.function_calling import \
|
||||||
|
convert_to_openai_tool
|
||||||
|
|
||||||
openai_tools = [convert_to_openai_tool(tools)]
|
openai_tools = [convert_to_openai_tool(tools)]
|
||||||
chat = chat.bind(tools=openai_tools).pipe(JsonOutputToolsParser())
|
chat = chat.bind(tools=openai_tools).pipe(JsonOutputToolsParser())
|
||||||
|
|||||||
@@ -8,6 +8,7 @@ except ImportError:
|
|||||||
raise ImportError('YouTube video requires extra dependencies. Install with `pip install youtube-transcript-api "`')
|
raise ImportError('YouTube video requires extra dependencies. Install with `pip install youtube-transcript-api "`')
|
||||||
try:
|
try:
|
||||||
from langchain_community.document_loaders import YoutubeLoader
|
from langchain_community.document_loaders import YoutubeLoader
|
||||||
|
from langchain_community.document_loaders.youtube import _parse_video_id
|
||||||
except ImportError:
|
except ImportError:
|
||||||
raise ImportError(
|
raise ImportError(
|
||||||
'YouTube video requires extra dependencies. Install with `pip install --upgrade "embedchain[dataloaders]"`'
|
'YouTube video requires extra dependencies. Install with `pip install --upgrade "embedchain[dataloaders]"`'
|
||||||
@@ -21,7 +22,20 @@ from embedchain.utils.misc import clean_string
|
|||||||
class YoutubeVideoLoader(BaseLoader):
|
class YoutubeVideoLoader(BaseLoader):
|
||||||
def load_data(self, url):
|
def load_data(self, url):
|
||||||
"""Load data from a Youtube video."""
|
"""Load data from a Youtube video."""
|
||||||
loader = YoutubeLoader.from_youtube_url(url, add_video_info=True)
|
video_id = _parse_video_id(url)
|
||||||
|
|
||||||
|
languages = ["en"]
|
||||||
|
try:
|
||||||
|
# Fetching transcript data
|
||||||
|
languages = [transcript.language_code for transcript in YouTubeTranscriptApi.list_transcripts(video_id)]
|
||||||
|
transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=languages)
|
||||||
|
# convert transcript to json to avoid unicode symboles
|
||||||
|
transcript = json.dumps(transcript, ensure_ascii=True)
|
||||||
|
except Exception:
|
||||||
|
logging.exception(f"Failed to fetch transcript for video {url}")
|
||||||
|
transcript = "Unavailable"
|
||||||
|
|
||||||
|
loader = YoutubeLoader.from_youtube_url(url, add_video_info=True, language=languages)
|
||||||
doc = loader.load()
|
doc = loader.load()
|
||||||
output = []
|
output = []
|
||||||
if not len(doc):
|
if not len(doc):
|
||||||
@@ -30,16 +44,7 @@ class YoutubeVideoLoader(BaseLoader):
|
|||||||
content = clean_string(content)
|
content = clean_string(content)
|
||||||
metadata = doc[0].metadata
|
metadata = doc[0].metadata
|
||||||
metadata["url"] = url
|
metadata["url"] = url
|
||||||
|
metadata["transcript"] = transcript
|
||||||
video_id = url.split("v=")[1].split("&")[0]
|
|
||||||
try:
|
|
||||||
# Fetching transcript data
|
|
||||||
transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=["en"])
|
|
||||||
# convert transcript to json to avoid unicode symboles
|
|
||||||
metadata["transcript"] = json.dumps(transcript, ensure_ascii=True)
|
|
||||||
except Exception:
|
|
||||||
logging.exception(f"Failed to fetch transcript for video {url}")
|
|
||||||
metadata["transcript"] = "Unavailable"
|
|
||||||
|
|
||||||
output.append(
|
output.append(
|
||||||
{
|
{
|
||||||
|
|||||||
@@ -1,5 +1,4 @@
|
|||||||
import hashlib
|
import hashlib
|
||||||
import json
|
|
||||||
from unittest.mock import MagicMock, Mock, patch
|
from unittest.mock import MagicMock, Mock, patch
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
@@ -37,11 +36,7 @@ def test_load_data(youtube_video_loader):
|
|||||||
expected_data = [
|
expected_data = [
|
||||||
{
|
{
|
||||||
"content": "This is a YouTube video content.",
|
"content": "This is a YouTube video content.",
|
||||||
"meta_data": {
|
"meta_data": {"url": video_url, "title": "Test Video", "transcript": "Unavailable"},
|
||||||
"url": video_url,
|
|
||||||
"title": "Test Video",
|
|
||||||
"transcript": json.dumps(mock_transcript, ensure_ascii=True),
|
|
||||||
},
|
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user