diff --git a/pyproject.toml b/pyproject.toml index 7bfc09d2..bc9c47cf 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -123,6 +123,7 @@ pytest-env = "^0.8.1" click = "^8.1.3" isort = "^5.12.0" pytest-cov = "^4.1.0" +responses = "^0.23.3" [tool.poetry.extras] streamlit = ["streamlit"] diff --git a/tests/loaders/test_docs_site_loader.py b/tests/loaders/test_docs_site_loader.py new file mode 100644 index 00000000..16b503b9 --- /dev/null +++ b/tests/loaders/test_docs_site_loader.py @@ -0,0 +1,218 @@ +import pytest +import responses +from bs4 import BeautifulSoup + + +@pytest.mark.parametrize( + "ignored_tag", + [ + "", + "", + "
This is a form.
", + "
This is a header.
", + "", + "This is an SVG.", + "This is a canvas.", + "", + "", + "", + ], + ids=["nav", "aside", "form", "header", "noscript", "svg", "canvas", "footer", "script", "style"], +) +@pytest.mark.parametrize( + "selectee", + [ + """ +
+

Article Title

+

Article content goes here.

+ {ignored_tag} +
+""", + """ +
+

Main Article Title

+

Main article content goes here.

+ {ignored_tag} +
+""", + """ +
+

Markdown Content

+

Markdown content goes here.

+ {ignored_tag} +
+""", + """ +
+

Main Content

+

Main content goes here.

+ {ignored_tag} +
+""", + """ +
+

Container

+

Container content goes here.

+ {ignored_tag} +
+ """, + """ +
+

Section

+

Section content goes here.

+ {ignored_tag} +
+ """, + """ +
+

Generic Article

+

Generic article content goes here.

+ {ignored_tag} +
+ """, + """ +
+

Main Content

+

Main content goes here.

+ {ignored_tag} +
+""", + ], + ids=[ + "article.bd-article", + 'article[role="main"]', + "div.md-content", + 'div[role="main"]', + "div.container", + "div.section", + "article", + "main", + ], +) +def test_load_data_gets_by_selectors_and_ignored_tags(selectee, ignored_tag, loader, mocked_responses, mocker): + child_url = "https://docs.embedchain.ai/quickstart" + selectee = selectee.format(ignored_tag=ignored_tag) + html_body = """ + + + + {selectee} + + +""" + html_body = html_body.format(selectee=selectee) + mocked_responses.get(child_url, body=html_body, status=200, content_type="text/html") + + url = "https://docs.embedchain.ai/" + html_body = """ + + + +
  • Quickstart
  • + + +""" + mocked_responses.get(url, body=html_body, status=200, content_type="text/html") + + mock_sha256 = mocker.patch("embedchain.loaders.docs_site_loader.hashlib.sha256") + doc_id = "mocked_hash" + mock_sha256.return_value.hexdigest.return_value = doc_id + + result = loader.load_data(url) + selector_soup = BeautifulSoup(selectee, "html.parser") + expected_content = " ".join((selector_soup.select_one("h2").get_text(), selector_soup.select_one("p").get_text())) + assert result["doc_id"] == doc_id + assert result["data"] == [ + { + "content": expected_content, + "meta_data": {"url": "https://docs.embedchain.ai/quickstart"}, + } + ] + + +def test_load_data_gets_child_links_recursively(loader, mocked_responses, mocker): + child_url = "https://docs.embedchain.ai/quickstart" + html_body = """ + + + +
  • ..
  • +
  • .
  • + + +""" + mocked_responses.get(child_url, body=html_body, status=200, content_type="text/html") + + child_url = "https://docs.embedchain.ai/introduction" + html_body = """ + + + +
  • ..
  • +
  • .
  • + + +""" + mocked_responses.get(child_url, body=html_body, status=200, content_type="text/html") + + url = "https://docs.embedchain.ai/" + html_body = """ + + + +
  • Quickstart
  • +
  • Introduction
  • + + +""" + mocked_responses.get(url, body=html_body, status=200, content_type="text/html") + + mock_sha256 = mocker.patch("embedchain.loaders.docs_site_loader.hashlib.sha256") + doc_id = "mocked_hash" + mock_sha256.return_value.hexdigest.return_value = doc_id + + result = loader.load_data(url) + assert result["doc_id"] == doc_id + expected_data = [ + {"content": "..\n.", "meta_data": {"url": "https://docs.embedchain.ai/quickstart"}}, + {"content": "..\n.", "meta_data": {"url": "https://docs.embedchain.ai/introduction"}}, + ] + assert all(item in expected_data for item in result["data"]) + + +def test_load_data_fails_to_fetch_website(loader, mocked_responses, mocker): + child_url = "https://docs.embedchain.ai/introduction" + mocked_responses.get(child_url, status=404) + + url = "https://docs.embedchain.ai/" + html_body = """ + + + +
  • Introduction
  • + + +""" + mocked_responses.get(url, body=html_body, status=200, content_type="text/html") + + mock_sha256 = mocker.patch("embedchain.loaders.docs_site_loader.hashlib.sha256") + doc_id = "mocked_hash" + mock_sha256.return_value.hexdigest.return_value = doc_id + + result = loader.load_data(url) + assert result["doc_id"] is doc_id + assert result["data"] == [] + + +@pytest.fixture +def loader(): + from embedchain.loaders.docs_site_loader import DocsSiteLoader + + return DocsSiteLoader() + + +@pytest.fixture +def mocked_responses(): + with responses.RequestsMock() as rsps: + yield rsps