Feat: Improve test coverage of DocsSiteLoader (#758)
This commit is contained in:
@@ -123,6 +123,7 @@ pytest-env = "^0.8.1"
|
|||||||
click = "^8.1.3"
|
click = "^8.1.3"
|
||||||
isort = "^5.12.0"
|
isort = "^5.12.0"
|
||||||
pytest-cov = "^4.1.0"
|
pytest-cov = "^4.1.0"
|
||||||
|
responses = "^0.23.3"
|
||||||
|
|
||||||
[tool.poetry.extras]
|
[tool.poetry.extras]
|
||||||
streamlit = ["streamlit"]
|
streamlit = ["streamlit"]
|
||||||
|
|||||||
218
tests/loaders/test_docs_site_loader.py
Normal file
218
tests/loaders/test_docs_site_loader.py
Normal file
@@ -0,0 +1,218 @@
|
|||||||
|
import pytest
|
||||||
|
import responses
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
    "ignored_tag",
    [
        "<nav>This is a navigation bar.</nav>",
        "<aside>This is an aside.</aside>",
        "<form>This is a form.</form>",
        "<header>This is a header.</header>",
        "<noscript>This is a noscript.</noscript>",
        "<svg>This is an SVG.</svg>",
        "<canvas>This is a canvas.</canvas>",
        "<footer>This is a footer.</footer>",
        "<script>This is a script.</script>",
        "<style>This is a style.</style>",
    ],
    ids=["nav", "aside", "form", "header", "noscript", "svg", "canvas", "footer", "script", "style"],
)
@pytest.mark.parametrize(
    "selectee",
    [
        # Each template is one content container; ``{ignored_tag}`` is filled
        # in by the test with the tag taken from the other parametrization.
        """
<article class="bd-article">
<h2>Article Title</h2>
<p>Article content goes here.</p>
{ignored_tag}
</article>
""",
        """
<article role="main">
<h2>Main Article Title</h2>
<p>Main article content goes here.</p>
{ignored_tag}
</article>
""",
        """
<div class="md-content">
<h2>Markdown Content</h2>
<p>Markdown content goes here.</p>
{ignored_tag}
</div>
""",
        """
<div role="main">
<h2>Main Content</h2>
<p>Main content goes here.</p>
{ignored_tag}
</div>
""",
        """
<div class="container">
<h2>Container</h2>
<p>Container content goes here.</p>
{ignored_tag}
</div>
""",
        """
<div class="section">
<h2>Section</h2>
<p>Section content goes here.</p>
{ignored_tag}
</div>
""",
        """
<article>
<h2>Generic Article</h2>
<p>Generic article content goes here.</p>
{ignored_tag}
</article>
""",
        """
<main>
<h2>Main Content</h2>
<p>Main content goes here.</p>
{ignored_tag}
</main>
""",
    ],
    ids=[
        "article.bd-article",
        'article[role="main"]',
        "div.md-content",
        'div[role="main"]',
        "div.container",
        "div.section",
        "article",
        "main",
    ],
)
def test_load_data_gets_by_selectors_and_ignored_tags(selectee, ignored_tag, loader, mocked_responses, mocker):
    """Check the content extracted for every container / ignored-tag pairing.

    A root page linking to ``/quickstart`` and a quickstart page wrapping the
    rendered container are served through ``mocked_responses``. The single
    returned entry must carry only the container's ``<h2>`` and ``<p>`` text,
    joined by a space, so the ignored tag's text must not appear in it.
    """
    root_url = "https://docs.embedchain.ai/"
    quickstart_url = "https://docs.embedchain.ai/quickstart"

    # Child page: the container under test, with the ignored tag spliced in.
    fragment_html = selectee.format(ignored_tag=ignored_tag)
    quickstart_page = """
<!DOCTYPE html>
<html lang="en">
<body>
{selectee}
</body>
</html>
""".format(selectee=fragment_html)
    mocked_responses.get(quickstart_url, body=quickstart_page, status=200, content_type="text/html")

    # Root page: only contributes the link to the child page.
    index_page = """
<!DOCTYPE html>
<html lang="en">
<body>
<li><a href="/quickstart">Quickstart</a></li>
</body>
</html>
"""
    mocked_responses.get(root_url, body=index_page, status=200, content_type="text/html")

    # Pin the digest so the doc_id assertion does not depend on hashing.
    fake_digest = "mocked_hash"
    sha256_patch = mocker.patch("embedchain.loaders.docs_site_loader.hashlib.sha256")
    sha256_patch.return_value.hexdigest.return_value = fake_digest

    result = loader.load_data(root_url)

    # Derive the expectation from the same fragment that was served.
    fragment = BeautifulSoup(fragment_html, "html.parser")
    expected_content = " ".join(fragment.select_one(tag).get_text() for tag in ("h2", "p"))
    expected_entry = {
        "content": expected_content,
        "meta_data": {"url": quickstart_url},
    }

    assert result["doc_id"] == fake_digest
    assert result["data"] == [expected_entry]
|
||||||
|
|
||||||
|
|
||||||
|
def test_load_data_gets_child_links_recursively(loader, mocked_responses, mocker):
    """Check that both pages linked from the root are loaded exactly once.

    The root page links to ``/quickstart`` and ``/introduction``; each child
    page links back to the root and to itself. The result must contain one
    entry per child page and nothing else.
    """
    quickstart_url = "https://docs.embedchain.ai/quickstart"
    html_body = """
<!DOCTYPE html>
<html lang="en">
<body>
<li><a href="/">..</a></li>
<li><a href="/quickstart">.</a></li>
</body>
</html>
"""
    mocked_responses.get(quickstart_url, body=html_body, status=200, content_type="text/html")

    introduction_url = "https://docs.embedchain.ai/introduction"
    html_body = """
<!DOCTYPE html>
<html lang="en">
<body>
<li><a href="/">..</a></li>
<li><a href="/introduction">.</a></li>
</body>
</html>
"""
    mocked_responses.get(introduction_url, body=html_body, status=200, content_type="text/html")

    url = "https://docs.embedchain.ai/"
    html_body = """
<!DOCTYPE html>
<html lang="en">
<body>
<li><a href="/quickstart">Quickstart</a></li>
<li><a href="/introduction">Introduction</a></li>
</body>
</html>
"""
    mocked_responses.get(url, body=html_body, status=200, content_type="text/html")

    # Pin the digest so the doc_id assertion does not depend on hashing.
    mock_sha256 = mocker.patch("embedchain.loaders.docs_site_loader.hashlib.sha256")
    doc_id = "mocked_hash"
    mock_sha256.return_value.hexdigest.return_value = doc_id

    result = loader.load_data(url)

    assert result["doc_id"] == doc_id
    expected_data = [
        {"content": "..\n.", "meta_data": {"url": quickstart_url}},
        {"content": "..\n.", "meta_data": {"url": introduction_url}},
    ]
    # The comparison stays order-insensitive, as in the original assertion
    # (presumably the loader's page order is not guaranteed -- confirm against
    # the loader). The length and reverse-membership checks close the gap
    # where an empty or partial result passed the one-directional ``all``.
    assert len(result["data"]) == len(expected_data)
    assert all(item in expected_data for item in result["data"])
    assert all(item in result["data"] for item in expected_data)
|
||||||
|
|
||||||
|
|
||||||
|
def test_load_data_fails_to_fetch_website(loader, mocked_responses, mocker):
    """Check that a child page answering 404 contributes no data.

    The root page links to ``/introduction``, which is mocked to return a 404.
    The result must still carry the (mocked) ``doc_id`` and an empty ``data``
    list.
    """
    child_url = "https://docs.embedchain.ai/introduction"
    mocked_responses.get(child_url, status=404)

    url = "https://docs.embedchain.ai/"
    html_body = """
<!DOCTYPE html>
<html lang="en">
<body>
<li><a href="/introduction">Introduction</a></li>
</body>
</html>
"""
    mocked_responses.get(url, body=html_body, status=200, content_type="text/html")

    # Pin the digest so the doc_id assertion does not depend on hashing.
    mock_sha256 = mocker.patch("embedchain.loaders.docs_site_loader.hashlib.sha256")
    doc_id = "mocked_hash"
    mock_sha256.return_value.hexdigest.return_value = doc_id

    result = loader.load_data(url)

    # Compare by value: identity (``is``) on strings only holds by accident of
    # object reuse, and the sibling tests in this module already use ``==``.
    assert result["doc_id"] == doc_id
    assert result["data"] == []
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def loader():
    """Return a newly constructed ``DocsSiteLoader`` for the requesting test."""
    # The import stays inside the fixture body, as in the original, so the
    # loader module is only imported when a test actually asks for it.
    from embedchain.loaders import docs_site_loader

    return docs_site_loader.DocsSiteLoader()
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def mocked_responses():
    """Yield an active ``responses.RequestsMock`` for registering fake HTTP replies.

    The mock is used as a context manager around the ``yield``, so it is
    exited once the requesting test has finished.
    """
    with responses.RequestsMock() as http_mock:
        yield http_mock
|
||||||
Reference in New Issue
Block a user