Article Content
+This is some test content.
+diff --git a/embedchain/loaders/web_page.py b/embedchain/loaders/web_page.py
index 53d41df0..bf0d2416 100644
--- a/embedchain/loaders/web_page.py
+++ b/embedchain/loaders/web_page.py
@@ -15,7 +15,25 @@ class WebPageLoader(BaseLoader):
"""Load data from a web page."""
response = requests.get(url)
data = response.content
- soup = BeautifulSoup(data, "html.parser")
+ content = self._get_clean_content(data, url)
+
+ meta_data = {
+ "url": url,
+ }
+
+ doc_id = hashlib.sha256((content + url).encode()).hexdigest()
+ return {
+ "doc_id": doc_id,
+ "data": [
+ {
+ "content": content,
+ "meta_data": meta_data,
+ }
+ ],
+ }
+
+ def _get_clean_content(self, html, url) -> str:
+ soup = BeautifulSoup(html, "html.parser")
original_size = len(str(soup.get_text()))
tags_to_exclude = [
@@ -61,17 +79,4 @@ class WebPageLoader(BaseLoader):
f"[{url}] Cleaned page size: {cleaned_size} characters, down from {original_size} (shrunk: {original_size-cleaned_size} chars, {round((1-(cleaned_size/original_size)) * 100, 2)}%)" # noqa:E501
)
- meta_data = {
- "url": url,
- }
- content = content
- doc_id = hashlib.sha256((content + url).encode()).hexdigest()
- return {
- "doc_id": doc_id,
- "data": [
- {
- "content": content,
- "meta_data": meta_data,
- }
- ],
- }
+ return content
diff --git a/tests/chunkers/test_base_chunker.py b/tests/chunkers/test_base_chunker.py
new file mode 100644
index 00000000..2a9deffb
--- /dev/null
+++ b/tests/chunkers/test_base_chunker.py
@@ -0,0 +1,84 @@
+import hashlib
+import pytest
+from unittest.mock import MagicMock
+from embedchain.chunkers.base_chunker import BaseChunker
+from embedchain.models.data_type import DataType
+
+
+@pytest.fixture
+def text_splitter_mock():  # MagicMock stand-in for the text splitter; tests script its split_text
+ return MagicMock()
+
+
+@pytest.fixture
+def loader_mock():  # MagicMock stand-in for a loader; tests script its load_data
+ return MagicMock()
+
+
+@pytest.fixture
+def app_id():  # app identifier the tests pass to create_chunks
+ return "test_app"
+
+
+@pytest.fixture
+def data_type():  # data type the chunker fixture is configured with
+ return DataType.TEXT
+
+
+@pytest.fixture
+def chunker(text_splitter_mock, data_type):
+    # BaseChunker wrapping the mocked splitter, with its data type already set.
+    base_chunker = BaseChunker(text_splitter_mock)
+    base_chunker.set_data_type(data_type)
+    return base_chunker
+
+
+def test_create_chunks(chunker, text_splitter_mock, loader_mock, app_id, data_type):
+    # One loaded document that the mocked splitter cuts into two chunks.
+    chunks = ["Chunk 1", "Chunk 2"]
+    source_url = "URL 1"
+    scoped_doc_id = f"{app_id}--DocID"
+    text_splitter_mock.split_text.return_value = list(chunks)
+    loader_mock.load_data.return_value = {
+        "data": [{"content": "Content 1", "meta_data": {"url": source_url}}],
+        "doc_id": "DocID",
+    }
+
+    result = chunker.create_chunks(loader_mock, "test_src", app_id)
+
+    # Expected ids: SHA-256 hex digest of each chunk's text followed by its URL.
+    expected_ids = [
+        hashlib.sha256((chunk + source_url).encode()).hexdigest() for chunk in chunks
+    ]
+    # Expected metadata: the loader's metadata plus data type and app-scoped doc id.
+    expected_metadata = {
+        "url": source_url,
+        "data_type": data_type.value,
+        "doc_id": scoped_doc_id,
+    }
+
+    assert result["documents"] == chunks
+    assert result["ids"] == expected_ids
+    assert result["metadatas"] == [expected_metadata, expected_metadata]
+    assert result["doc_id"] == scoped_doc_id
+
+
+def test_get_chunks(chunker, text_splitter_mock):
+    # get_chunks is expected to hand back exactly what the splitter produced.
+    expected_chunks = ["Chunk 1", "Chunk 2"]
+    text_splitter_mock.split_text.return_value = list(expected_chunks)
+
+    result = chunker.get_chunks("This is a test content.")
+    assert len(result) == 2
+    assert result == expected_chunks
+
+
+def test_set_data_type(chunker):  # the fixture starts as DataType.TEXT; the setter must replace it
+ chunker.set_data_type(DataType.MDX)
+ assert chunker.data_type == DataType.MDX
+
+
+def test_get_word_count(chunker):  # the count is expected to be totalled across all documents
+ documents = ["This is a test.", "Another test."]
+ result = chunker.get_word_count(documents)
+ assert result == 6  # 4 words in the first document + 2 in the second
diff --git a/tests/chunkers/test_chunkers.py b/tests/chunkers/test_chunkers.py
new file mode 100644
index 00000000..b8c72adf
--- /dev/null
+++ b/tests/chunkers/test_chunkers.py
@@ -0,0 +1,46 @@
+from embedchain.chunkers.docs_site import DocsSiteChunker
+from embedchain.chunkers.docx_file import DocxFileChunker
+from embedchain.chunkers.mdx import MdxChunker
+from embedchain.chunkers.notion import NotionChunker
+from embedchain.chunkers.pdf_file import PdfFileChunker
+from embedchain.chunkers.qna_pair import QnaPairChunker
+from embedchain.chunkers.sitemap import SitemapChunker
+from embedchain.chunkers.table import TableChunker
+from embedchain.chunkers.text import TextChunker
+from embedchain.chunkers.web_page import WebPageChunker
+from embedchain.chunkers.xml import XmlChunker
+from embedchain.chunkers.youtube_video import YoutubeVideoChunker
+from embedchain.config.add_config import ChunkerConfig
+
+chunker_config = ChunkerConfig(chunk_size=500, chunk_overlap=0, length_function=len)  # custom config handed to every chunker
+
+chunker_common_config = {  # expected default splitter settings, keyed by chunker class
+ DocsSiteChunker: {"chunk_size": 500, "chunk_overlap": 50, "length_function": len},
+ DocxFileChunker: {"chunk_size": 1000, "chunk_overlap": 0, "length_function": len},
+ PdfFileChunker: {"chunk_size": 1000, "chunk_overlap": 0, "length_function": len},
+ TextChunker: {"chunk_size": 300, "chunk_overlap": 0, "length_function": len},
+ MdxChunker: {"chunk_size": 1000, "chunk_overlap": 0, "length_function": len},
+ NotionChunker: {"chunk_size": 300, "chunk_overlap": 0, "length_function": len},
+ QnaPairChunker: {"chunk_size": 300, "chunk_overlap": 0, "length_function": len},
+ TableChunker: {"chunk_size": 300, "chunk_overlap": 0, "length_function": len},
+ SitemapChunker: {"chunk_size": 500, "chunk_overlap": 0, "length_function": len},
+ WebPageChunker: {"chunk_size": 500, "chunk_overlap": 0, "length_function": len},
+ XmlChunker: {"chunk_size": 500, "chunk_overlap": 50, "length_function": len},
+ YoutubeVideoChunker: {"chunk_size": 2000, "chunk_overlap": 0, "length_function": len},
+}
+
+
+def test_default_config_values():
+    # Built without a config, each chunker's splitter must carry its table defaults.
+    for chunker_class, expected in chunker_common_config.items():
+        splitter = chunker_class().text_splitter
+        actual = (splitter._chunk_size, splitter._chunk_overlap, splitter._length_function)
+        assert actual == (expected["chunk_size"], expected["chunk_overlap"], expected["length_function"])
+
+
+def test_custom_config_values():
+    # An explicit ChunkerConfig must reach every chunker's splitter unchanged.
+    for chunker_class in chunker_common_config:  # only the classes are needed here
+        splitter = chunker_class(config=chunker_config).text_splitter
+        assert (splitter._chunk_size, splitter._chunk_overlap) == (500, 0)
+        assert splitter._length_function == len
diff --git a/tests/loaders/test_csv.py b/tests/loaders/test_csv.py
index 07f06ae8..9cdcff39 100644
--- a/tests/loaders/test_csv.py
+++ b/tests/loaders/test_csv.py
@@ -2,6 +2,7 @@ import csv
import os
import pathlib
import tempfile
+from unittest.mock import MagicMock, patch
import pytest
@@ -84,3 +85,29 @@ def test_load_data_with_file_uri(delimiter):
# Cleaning up the temporary file
os.unlink(tmpfile.name)
+
+
+@pytest.mark.parametrize("content", ["ftp://example.com", "sftp://example.com", "mailto://example.com"])
+def test_get_file_content(content):
+    loader = CsvLoader()  # built outside raises(): only the call below may satisfy the check
+    with pytest.raises(ValueError):
+        loader._get_file_content(content)
+
+
+@pytest.mark.parametrize("content", ["http://example.com", "https://example.com"])
+def test_get_file_content_http(content):
+    """
+    Check that CsvLoader._get_file_content fetches http and https URLs through requests.get
+    """
+
+    csv_text = "Name,Age,Occupation\nAlice,28,Engineer\nBob,35,Doctor\nCharlie,22,Student"
+    fake_response = MagicMock(text=csv_text)
+
+    with patch("requests.get", return_value=fake_response) as mock_get:
+        file_content = CsvLoader()._get_file_content(content)
+
+        # The URL is requested exactly once and the HTTP status is checked.
+        mock_get.assert_called_once_with(content)
+        fake_response.raise_for_status.assert_called_once()
+        # The returned file-like object exposes the response body unchanged.
+        assert file_content.read() == csv_text
diff --git a/tests/loaders/test_docs_site.py b/tests/loaders/test_docs_site.py
new file mode 100644
index 00000000..e27bd1bf
--- /dev/null
+++ b/tests/loaders/test_docs_site.py
@@ -0,0 +1,128 @@
+import hashlib
+import pytest
+from unittest.mock import Mock, patch
+from requests import Response
+from embedchain.loaders.docs_site_loader import DocsSiteLoader
+
+
+@pytest.fixture
+def mock_requests_get():  # patches requests.get while a test runs and yields the mock
+ with patch("requests.get") as mock_get:
+ yield mock_get
+
+
+@pytest.fixture
+def docs_site_loader():  # DocsSiteLoader built with no arguments, one per test
+ return DocsSiteLoader()
+
+
+def test_get_child_links_recursive(mock_requests_get, docs_site_loader):
+    mock_response = Mock()  # NOTE(review): anchor markup below is reconstructed from the assertions - confirm
+    mock_response.status_code = 200
+    mock_response.text = """
+        <html>
+            <a href="/page1">Page 1</a>
+            <a href="/page2">Page 2</a>
+        </html>
+    """
+    mock_requests_get.return_value = mock_response
+    # Every requests.get call made by the loader returns this same page.
+    docs_site_loader._get_child_links_recursive("https://example.com")
+
+    assert len(docs_site_loader.visited_links) == 2
+    assert "https://example.com/page1" in docs_site_loader.visited_links
+    assert "https://example.com/page2" in docs_site_loader.visited_links
+
+
+def test_get_child_links_recursive_status_not_200(mock_requests_get, docs_site_loader):
+    # The mocked request answers with HTTP 404.
+    mock_requests_get.return_value = Mock(status_code=404)
+
+    docs_site_loader._get_child_links_recursive("https://example.com")
+
+    # Nothing may have been recorded as visited.
+    assert len(docs_site_loader.visited_links) == 0
+
+
+def test_get_all_urls(mock_requests_get, docs_site_loader):
+    mock_response = Mock()  # NOTE(review): anchor markup below is reconstructed from the assertions - confirm
+    mock_response.status_code = 200
+    mock_response.text = """
+        <html>
+            <a href="/page1">Page 1</a>
+            <a href="/page2">Page 2</a>
+            <a href="https://example.com/external">External</a>
+        </html>
+    """
+    mock_requests_get.return_value = mock_response
+    # Every requests.get call made by the loader returns this same page.
+    all_urls = docs_site_loader._get_all_urls("https://example.com")
+
+    assert len(all_urls) == 3
+    assert "https://example.com/page1" in all_urls
+    assert "https://example.com/page2" in all_urls
+    assert "https://example.com/external" in all_urls
+
+
+def test_load_data_from_url(mock_requests_get, docs_site_loader):
+ mock_response = Mock()
+ mock_response.status_code = 200
+ mock_response.content = """
+
+
+ Article Content
This is some test content.
+