Support for Excel files (#1319)

This commit is contained in:
Dev Khant
2024-04-16 10:33:43 +05:30
committed by GitHub
parent 536f85b78a
commit 6c32d287b5
8 changed files with 120 additions and 1 deletion

View File

@@ -2,6 +2,7 @@ from embedchain.chunkers.common_chunker import CommonChunker
from embedchain.chunkers.discourse import DiscourseChunker
from embedchain.chunkers.docs_site import DocsSiteChunker
from embedchain.chunkers.docx_file import DocxFileChunker
from embedchain.chunkers.excel_file import ExcelFileChunker
from embedchain.chunkers.gmail import GmailChunker
from embedchain.chunkers.google_drive import GoogleDriveChunker
from embedchain.chunkers.json import JSONChunker
@@ -43,6 +44,7 @@ chunker_common_config = {
DiscourseChunker: {"chunk_size": 1000, "chunk_overlap": 0, "length_function": len},
CommonChunker: {"chunk_size": 2000, "chunk_overlap": 0, "length_function": len},
GoogleDriveChunker: {"chunk_size": 1000, "chunk_overlap": 0, "length_function": len},
ExcelFileChunker: {"chunk_size": 1000, "chunk_overlap": 0, "length_function": len},
}

View File

@@ -0,0 +1,33 @@
import hashlib
from unittest.mock import patch
import pytest
from embedchain.loaders.excel_file import ExcelFileLoader
@pytest.fixture
def excel_file_loader():
    """Build the ``ExcelFileLoader`` instance handed to tests that request it."""
    loader = ExcelFileLoader()
    return loader
def test_load_data(excel_file_loader):
    """Check the payload shape returned by a stubbed ``load_data``.

    ``load_data`` is replaced on the fixture instance for the duration of the
    call, so the assertions below verify the stubbed return value (``doc_id``
    plus a single ``data`` entry) rather than any real spreadsheet parsing.
    """
    source_url = "mock_excel_file.xlsx"
    sheet_text = "Sample Excel Content"
    # The expected id is the SHA-256 hex digest of the content followed by the URL.
    expected_doc_id = hashlib.sha256((sheet_text + source_url).encode()).hexdigest()

    stubbed_payload = {
        "doc_id": hashlib.sha256((sheet_text + source_url).encode()).hexdigest(),
        "data": [{"content": sheet_text, "meta_data": {"url": source_url}}],
    }

    # Patch load_data on this instance only; the patch is undone when the block exits.
    with patch.object(excel_file_loader, "load_data", return_value=stubbed_payload):
        result = excel_file_loader.load_data(source_url)

    first_entry = result["data"][0]
    assert first_entry["content"] == sheet_text
    assert first_entry["meta_data"]["url"] == source_url
    assert result["doc_id"] == expected_doc_id