Support for Audio Files (#1416)

2024-06-12 22:55:58 +05:30
parent 1bddd46ed2
commit 08b67b4a78
10 changed files with 211 additions and 1 deletions
--- a/2
+++ b/2
@@ -11,7 +11,7 @@ install:
 install_all:
 	poetry install --all-extras
-	poetry run pip install pinecone-text pinecone-client langchain-anthropic "unstructured[local-inference, all-docs]" ollama
+	poetry run pip install pinecone-text pinecone-client langchain-anthropic "unstructured[local-inference, all-docs]" ollama deepgram-sdk==3.2.7 
 install_es:
 	poetry install --extras elasticsearch
--- a/docs/components/data-sources/audio.mdx
+++ b/docs/components/data-sources/audio.mdx
@@ -0,0 +1,25 @@
 ---
 title: "🎤 Audio"
 ---
 To use an audio file as a data source, set `data_type` to `audio` and pass in the path of the audio file (local or hosted).
 We use [Deepgram](https://developers.deepgram.com/docs/introduction) to transcribe the audio to text, and then use the generated text as the data source.
 You will need a Deepgram API key, which is available [here](https://console.deepgram.com/signup?jump=keys), to use this feature.
 ### Without customization
 ```python
 import os
 from embedchain import App
 os.environ["DEEPGRAM_API_KEY"] = "153xxx"
 app = App()
 app.add("introduction.wav", data_type="audio")
 response = app.query("What is my name and how old am I?")
 print(response)
 # Answer: Your name is Dave and you are 21 years old.
 ```
--- a/docs/components/data-sources/overview.mdx
+++ b/docs/components/data-sources/overview.mdx
@@ -9,6 +9,7 @@ Embedchain comes with built-in support for various data sources. We handle the c
  <Card title="CSV file" href="/components/data-sources/csv"></Card>
  <Card title="JSON file" href="/components/data-sources/json"></Card>
  <Card title="Text" href="/components/data-sources/text"></Card>
  <Card title="Text File" href="/components/data-sources/text-file"></Card>
  <Card title="Directory" href="/components/data-sources/directory"></Card>
  <Card title="Web page" href="/components/data-sources/web-page"></Card>
  <Card title="Youtube Channel" href="/components/data-sources/youtube-channel"></Card>
@@ -33,6 +34,7 @@ Embedchain comes with built-in support for various data sources. We handle the c
  <Card title="Beehiiv" href="/components/data-sources/beehiiv"></Card>
  <Card title="Dropbox" href="/components/data-sources/dropbox"></Card>
  <Card title="Image" href="/components/data-sources/image"></Card>
  <Card title="Audio" href="/components/data-sources/audio"></Card>
  <Card title="Custom" href="/components/data-sources/custom"></Card>
 </CardGroup>
--- a/embedchain/chunkers/audio.py
+++ b/embedchain/chunkers/audio.py
@@ -0,0 +1,22 @@
 from typing import Optional
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from embedchain.chunkers.base_chunker import BaseChunker
 from embedchain.config.add_config import ChunkerConfig
 from embedchain.helpers.json_serializable import register_deserializable
@register_deserializable
 class AudioChunker(BaseChunker):
     """Chunker for the audio data type.
     Builds a ``RecursiveCharacterTextSplitter`` from the supplied config and
     hands it to ``BaseChunker``.
     """
     def __init__(self, config: Optional[ChunkerConfig] = None):
         """Create the chunker.
         :param config: Chunking settings. When ``None``, a default config with
             ``chunk_size=1000``, ``chunk_overlap=0`` and ``length_function=len``
             is used.
         """
         # Only build the default config when the caller did not supply one.
         settings = (
             config
             if config is not None
             else ChunkerConfig(chunk_size=1000, chunk_overlap=0, length_function=len)
         )
         splitter_kwargs = {
             "chunk_size": settings.chunk_size,
             "chunk_overlap": settings.chunk_overlap,
             "length_function": settings.length_function,
         }
         super().__init__(RecursiveCharacterTextSplitter(**splitter_kwargs))
--- a/embedchain/data_formatter/data_formatter.py
+++ b/embedchain/data_formatter/data_formatter.py
@@ -81,6 +81,7 @@ class DataFormatter(JSONSerializable):
            DataType.DROPBOX: "embedchain.loaders.dropbox.DropboxLoader",
            DataType.TEXT_FILE: "embedchain.loaders.text_file.TextFileLoader",
            DataType.EXCEL_FILE: "embedchain.loaders.excel_file.ExcelFileLoader",
            DataType.AUDIO: "embedchain.loaders.audio.AudioLoader",
        }
        if data_type == DataType.CUSTOM or loader is not None:
@@ -129,6 +130,7 @@ class DataFormatter(JSONSerializable):
            DataType.DROPBOX: "embedchain.chunkers.common_chunker.CommonChunker",
            DataType.TEXT_FILE: "embedchain.chunkers.common_chunker.CommonChunker",
            DataType.EXCEL_FILE: "embedchain.chunkers.excel_file.ExcelFileChunker",
            DataType.AUDIO: "embedchain.chunkers.audio.AudioChunker",
        }
        if chunker is not None:
--- a/embedchain/loaders/audio.py
+++ b/embedchain/loaders/audio.py
@@ -0,0 +1,51 @@
 import os
 import hashlib
 import validators
 from embedchain.helpers.json_serializable import register_deserializable
 from embedchain.loaders.base_loader import BaseLoader
 try:
    from deepgram import DeepgramClient, PrerecordedOptions
 except ImportError:
    raise ImportError(
        "Audio file requires extra dependencies. Install with `pip install deepgram-sdk==3.2.7`"
    ) from None
@register_deserializable
 class AudioLoader(BaseLoader):
     """Loader that transcribes an audio source with Deepgram and returns the transcript."""
     def __init__(self):
         """Create the Deepgram client from the ``DEEPGRAM_API_KEY`` environment variable.
         :raises ValueError: if ``DEEPGRAM_API_KEY`` is unset or empty.
         """
         api_key = os.environ.get("DEEPGRAM_API_KEY")
         if not api_key:
             raise ValueError("DEEPGRAM_API_KEY is not set")
         self.client = DeepgramClient(api_key)
     def load_data(self, url: str):
         """Load data from an audio file or URL.
         :param url: Either a URL (as judged by ``validators.url``) or a path that
             is opened locally in binary mode.
         :return: A dict with a ``doc_id`` (SHA-256 of transcript + ``url``) and a
             single-entry ``data`` list holding the transcript and its metadata.
         """
         transcription_options = PrerecordedOptions(model="nova-2", smart_format=True)
         is_remote = validators.url(url)
         if not is_remote:
             # Anything that is not a valid URL is treated as a local file path.
             with open(url, "rb") as audio_file:
                 payload = {"buffer": audio_file}
                 response = self.client.listen.prerecorded.v("1").transcribe_file(payload, transcription_options)
         else:
             payload = {"url": url}
             response = self.client.listen.prerecorded.v("1").transcribe_url(payload, transcription_options)
         # Only the first alternative of the first channel is used.
         first_alternative = response["results"]["channels"][0]["alternatives"][0]
         transcript = first_alternative["transcript"]
         digest = hashlib.sha256((transcript + url).encode())
         entry = {"content": transcript, "meta_data": {"url": url}}
         return {"doc_id": digest.hexdigest(), "data": [entry]}
--- a/embedchain/models/data_type.py
+++ b/embedchain/models/data_type.py
@@ -41,6 +41,7 @@ class IndirectDataType(Enum):
    DROPBOX = "dropbox"
    TEXT_FILE = "text_file"
    EXCEL_FILE = "excel_file"
    AUDIO = "audio"
 class SpecialDataType(Enum):
@@ -81,3 +82,4 @@ class DataType(Enum):
    DROPBOX = IndirectDataType.DROPBOX.value
    TEXT_FILE = IndirectDataType.TEXT_FILE.value
    EXCEL_FILE = IndirectDataType.EXCEL_FILE.value
    AUDIO = IndirectDataType.AUDIO.value
--- a/embedchain/utils/misc.py
+++ b/embedchain/utils/misc.py
@@ -237,6 +237,12 @@ def detect_datatype(source: Any) -> DataType:
            logger.debug(f"Source of `{formatted_source}` detected as `docx`.")
            return DataType.DOCX
        if url.path.endswith(
            (".mp3", ".mp4", ".mp2", ".aac", ".wav", ".flac", ".pcm", ".m4a", ".ogg", ".opus", ".webm")
        ):
            logger.debug(f"Source of `{formatted_source}` detected as `audio`.")
            return DataType.AUDIO
        if url.path.endswith(".yaml"):
            try:
                response = requests.get(source)
--- a/tests/chunkers/test_chunkers.py
+++ b/tests/chunkers/test_chunkers.py
@@ -19,6 +19,7 @@ from embedchain.chunkers.text import TextChunker
 from embedchain.chunkers.web_page import WebPageChunker
 from embedchain.chunkers.xml import XmlChunker
 from embedchain.chunkers.youtube_video import YoutubeVideoChunker
 from embedchain.chunkers.audio import AudioChunker
 from embedchain.config.add_config import ChunkerConfig
 chunker_config = ChunkerConfig(chunk_size=500, chunk_overlap=0, length_function=len)
@@ -45,6 +46,7 @@ chunker_common_config = {
    CommonChunker: {"chunk_size": 2000, "chunk_overlap": 0, "length_function": len},
    GoogleDriveChunker: {"chunk_size": 1000, "chunk_overlap": 0, "length_function": len},
    ExcelFileChunker: {"chunk_size": 1000, "chunk_overlap": 0, "length_function": len},
    AudioChunker: {"chunk_size": 1000, "chunk_overlap": 0, "length_function": len},
 }
--- a/tests/loaders/test_audio.py
+++ b/tests/loaders/test_audio.py
@@ -0,0 +1,98 @@
 import os
 import sys
 import hashlib
 import pytest
 from unittest.mock import mock_open, patch
 if sys.version_info > (3, 10):  # as `match` statement was introduced in python 3.10
    from deepgram import PrerecordedOptions
    from embedchain.loaders.audio import AudioLoader
@pytest.fixture
 def setup_audio_loader(mocker):
     """Provide an ``AudioLoader`` backed by a mocked Deepgram client.
     :param mocker: The pytest-mock fixture used to install the patches.
     :return: A ``(loader, mock_client)`` tuple, where ``mock_client`` is the
         ``MagicMock`` the loader uses as its Deepgram client.
     """
     # Patch the name where the loader looks it up: ``embedchain.loaders.audio``
     # imported ``DeepgramClient`` directly, so patching ``deepgram.DeepgramClient``
     # would leave the loader constructing a real client.
     mock_client_cls = mocker.patch("embedchain.loaders.audio.DeepgramClient")
     mock_client = mocker.MagicMock()
     mock_client_cls.return_value = mock_client
     # ``patch.dict`` restores the previous environment on teardown, so a
     # DEEPGRAM_API_KEY that was already set is not lost after the test.
     mocker.patch.dict(os.environ, {"DEEPGRAM_API_KEY": "test_key"})
     loader = AudioLoader()
     return loader, mock_client
@pytest.mark.skipif(
    sys.version_info < (3, 10), reason="Test skipped for Python 3.9 or lower"
 )  # as `match` statement was introduced in python 3.10
 def test_initialization(setup_audio_loader):
     """The fixture should hand back a constructed AudioLoader."""
     audio_loader = setup_audio_loader[0]
     assert audio_loader is not None
@pytest.mark.skipif(
    sys.version_info < (3, 10), reason="Test skipped for Python 3.9 or lower"
 )  # as `match` statement was introduced in python 3.10
 def test_load_data_from_url(setup_audio_loader):
     """A URL source goes through ``transcribe_url`` and the transcript becomes the content."""
     loader, mock_client = setup_audio_loader
     audio_url = "https://example.com/audio.mp3"
     transcript = "This is a test audio transcript."
     # Attribute access on a MagicMock returns the same child mock every time,
     # so this alias is the object the loader will call.
     prerecorded_v = mock_client.listen.prerecorded.v
     prerecorded_v.return_value.transcribe_url.return_value = {
         "results": {"channels": [{"alternatives": [{"transcript": transcript}]}]}
     }
     result = loader.load_data(audio_url)
     assert result == {
         "doc_id": hashlib.sha256((transcript + audio_url).encode()).hexdigest(),
         "data": [{"content": transcript, "meta_data": {"url": audio_url}}],
     }
     prerecorded_v.assert_called_once_with("1")
     prerecorded_v.return_value.transcribe_url.assert_called_once_with(
         {"url": audio_url}, PrerecordedOptions(model="nova-2", smart_format=True)
     )
@pytest.mark.skipif(
    sys.version_info < (3, 10), reason="Test skipped for Python 3.9 or lower"
 )  # as `match` statement was introduced in python 3.10
 def test_load_data_from_file(setup_audio_loader):
     """A local path is opened and sent through ``transcribe_file``; the transcript becomes the content."""
     loader, mock_client = setup_audio_loader
     local_path = "local_audio.mp3"
     transcript = "This is a test audio transcript."
     prerecorded_v = mock_client.listen.prerecorded.v
     prerecorded_v.return_value.transcribe_file.return_value = {
         "results": {"channels": [{"alternatives": [{"transcript": transcript}]}]}
     }
     # Replace ``open`` so no real file is needed; the handle it returns is
     # what the loader is expected to pass as the buffer.
     with patch("builtins.open", mock_open(read_data=b"some data")) as opened:
         result = loader.load_data(local_path)
     assert result == {
         "doc_id": hashlib.sha256((transcript + local_path).encode()).hexdigest(),
         "data": [{"content": transcript, "meta_data": {"url": local_path}}],
     }
     prerecorded_v.assert_called_once_with("1")
     prerecorded_v.return_value.transcribe_file.assert_called_once_with(
         {"buffer": opened.return_value}, PrerecordedOptions(model="nova-2", smart_format=True)
     )