Rename embedchain to mem0 and open sourcing code for long term memory (#1474)
Co-authored-by: Deshraj Yadav <deshrajdry@gmail.com>
This commit is contained in:
117
embedchain/tests/loaders/test_web_page.py
Normal file
117
embedchain/tests/loaders/test_web_page.py
Normal file
@@ -0,0 +1,117 @@
|
||||
import hashlib
|
||||
from unittest.mock import Mock, patch
|
||||
|
||||
import pytest
|
||||
|
||||
from embedchain.loaders.web_page import WebPageLoader
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def web_page_loader():
|
||||
return WebPageLoader()
|
||||
|
||||
|
||||
def test_load_data(web_page_loader):
|
||||
page_url = "https://example.com/page"
|
||||
mock_response = Mock()
|
||||
mock_response.status_code = 200
|
||||
mock_response.content = """
|
||||
<html>
|
||||
<head>
|
||||
<title>Test Page</title>
|
||||
</head>
|
||||
<body>
|
||||
<div id="content">
|
||||
<p>This is some test content.</p>
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
||||
"""
|
||||
with patch("embedchain.loaders.web_page.WebPageLoader._session.get", return_value=mock_response):
|
||||
result = web_page_loader.load_data(page_url)
|
||||
|
||||
content = web_page_loader._get_clean_content(mock_response.content, page_url)
|
||||
expected_doc_id = hashlib.sha256((content + page_url).encode()).hexdigest()
|
||||
assert result["doc_id"] == expected_doc_id
|
||||
|
||||
expected_data = [
|
||||
{
|
||||
"content": content,
|
||||
"meta_data": {
|
||||
"url": page_url,
|
||||
},
|
||||
}
|
||||
]
|
||||
|
||||
assert result["data"] == expected_data
|
||||
|
||||
|
||||
def test_get_clean_content_excludes_unnecessary_info(web_page_loader):
|
||||
mock_html = """
|
||||
<html>
|
||||
<head>
|
||||
<title>Sample HTML</title>
|
||||
<style>
|
||||
/* Stylesheet to be excluded */
|
||||
.elementor-location-header {
|
||||
background-color: #f0f0f0;
|
||||
}
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
<header id="header">Header Content</header>
|
||||
<nav class="nav">Nav Content</nav>
|
||||
<aside>Aside Content</aside>
|
||||
<form>Form Content</form>
|
||||
<main>Main Content</main>
|
||||
<footer class="footer">Footer Content</footer>
|
||||
<script>Some Script</script>
|
||||
<noscript>NoScript Content</noscript>
|
||||
<svg>SVG Content</svg>
|
||||
<canvas>Canvas Content</canvas>
|
||||
|
||||
<div id="sidebar">Sidebar Content</div>
|
||||
<div id="main-navigation">Main Navigation Content</div>
|
||||
<div id="menu-main-menu">Menu Main Menu Content</div>
|
||||
|
||||
<div class="header-sidebar-wrapper">Header Sidebar Wrapper Content</div>
|
||||
<div class="blog-sidebar-wrapper">Blog Sidebar Wrapper Content</div>
|
||||
<div class="related-posts">Related Posts Content</div>
|
||||
</body>
|
||||
</html>
|
||||
"""
|
||||
|
||||
tags_to_exclude = [
|
||||
"nav",
|
||||
"aside",
|
||||
"form",
|
||||
"header",
|
||||
"noscript",
|
||||
"svg",
|
||||
"canvas",
|
||||
"footer",
|
||||
"script",
|
||||
"style",
|
||||
]
|
||||
ids_to_exclude = ["sidebar", "main-navigation", "menu-main-menu"]
|
||||
classes_to_exclude = [
|
||||
"elementor-location-header",
|
||||
"navbar-header",
|
||||
"nav",
|
||||
"header-sidebar-wrapper",
|
||||
"blog-sidebar-wrapper",
|
||||
"related-posts",
|
||||
]
|
||||
|
||||
content = web_page_loader._get_clean_content(mock_html, "https://example.com/page")
|
||||
|
||||
for tag in tags_to_exclude:
|
||||
assert tag not in content
|
||||
|
||||
for id in ids_to_exclude:
|
||||
assert id not in content
|
||||
|
||||
for class_name in classes_to_exclude:
|
||||
assert class_name not in content
|
||||
|
||||
assert len(content) > 0
|
||||
Reference in New Issue
Block a user