From 51ebf3439bfdf439ca7e97aa2d5a60929ac09f44 Mon Sep 17 00:00:00 2001 From: Sidharth Mohanty Date: Fri, 8 Dec 2023 03:41:56 +0530 Subject: [PATCH] [New] Beehiiv loader (#963) --- docs/data-sources/beehiiv.mdx | 16 +++ docs/data-sources/overview.mdx | 1 + docs/mint.json | 3 +- embedchain/chunkers/beehiiv.py | 22 +++++ embedchain/data_formatter/data_formatter.py | 2 + embedchain/loaders/beehiiv.py | 104 ++++++++++++++++++++ embedchain/loaders/substack.py | 10 +- embedchain/models/data_type.py | 2 + 8 files changed, 153 insertions(+), 7 deletions(-) create mode 100644 docs/data-sources/beehiiv.mdx create mode 100644 embedchain/chunkers/beehiiv.py create mode 100644 embedchain/loaders/beehiiv.py diff --git a/docs/data-sources/beehiiv.mdx b/docs/data-sources/beehiiv.mdx new file mode 100644 index 00000000..077f6504 --- /dev/null +++ b/docs/data-sources/beehiiv.mdx @@ -0,0 +1,16 @@ +--- +title: "🐝 Beehiiv" +--- + +To add any Beehiiv data sources to your app, just add the base url as the source and set the data_type to `beehiiv`. + +```python +from embedchain import Pipeline as App + +app = App() + +# source: just add the base url and set the data_type to 'beehiiv' +app.add('https://aibreakfast.beehiiv.com', data_type='beehiiv') +app.query("How much is OpenAI paying developers?") +# Answer: OpenAI is aggressively recruiting Google's top AI researchers with offers ranging between $5 to $10 million annually, primarily in stock options. +``` diff --git a/docs/data-sources/overview.mdx b/docs/data-sources/overview.mdx index a35d76da..8bff8bce 100644 --- a/docs/data-sources/overview.mdx +++ b/docs/data-sources/overview.mdx @@ -27,6 +27,7 @@ Embedchain comes with built-in support for various data sources. We handle the c +
@register_deserializable
class BeehiivChunker(BaseChunker):
    """Chunker used for content loaded with the Beehiiv data type."""

    def __init__(self, config: Optional[ChunkerConfig] = None):
        """
        Create the recursive character splitter and pass it to the base chunker.

        :param config: Chunking options. When omitted, a configuration with
            chunk_size=1000, chunk_overlap=0 and `len` as the length function
            is used.
        """
        effective_config = (
            config
            if config is not None
            else ChunkerConfig(chunk_size=1000, chunk_overlap=0, length_function=len)
        )
        splitter_options = {
            "chunk_size": effective_config.chunk_size,
            "chunk_overlap": effective_config.chunk_overlap,
            "length_function": effective_config.length_function,
        }
        super().__init__(RecursiveCharacterTextSplitter(**splitter_options))
"embedchain.chunkers.common_chunker.CommonChunker", DataType.DISCORD: "embedchain.chunkers.common_chunker.CommonChunker", DataType.CUSTOM: "embedchain.chunkers.common_chunker.CommonChunker", + DataType.BEEHIIV: "embedchain.chunkers.beehiiv.BeehiivChunker", } if chunker is not None: diff --git a/embedchain/loaders/beehiiv.py b/embedchain/loaders/beehiiv.py new file mode 100644 index 00000000..66ccfa24 --- /dev/null +++ b/embedchain/loaders/beehiiv.py @@ -0,0 +1,104 @@ +import hashlib +import logging +import time +import requests +from xml.etree import ElementTree + +from embedchain.helpers.json_serializable import register_deserializable +from embedchain.loaders.base_loader import BaseLoader +from embedchain.utils import is_readable + + +@register_deserializable +class BeehiivLoader(BaseLoader): + """ + This loader is used to load data from Beehiiv URLs. + """ + + def load_data(self, url: str): + try: + from bs4 import BeautifulSoup + from bs4.builder import ParserRejectedMarkup + except ImportError: + raise ImportError( + 'Beehiiv requires extra dependencies. Install with `pip install --upgrade "embedchain[dataloaders]"`' + ) from None + + if not url.endswith("sitemap.xml"): + url = url + "/sitemap.xml" + + output = [] + # we need to set this as a header to avoid 403 + headers = { + "User-Agent": ( + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) " + "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 " + "Safari/537.36" + ), + } + response = requests.get(url, headers=headers) + try: + response.raise_for_status() + except requests.exceptions.HTTPError as e: + raise ValueError( + f""" + Failed to load {url}: {e}. Please use the root substack URL. For example, https://example.substack.com + """ + ) + + try: + ElementTree.fromstring(response.content) + except ElementTree.ParseError: + raise ValueError( + f""" + Failed to parse {url}. Please use the root substack URL. 
For example, https://example.substack.com + """ + ) + soup = BeautifulSoup(response.text, "xml") + links = [link.text for link in soup.find_all("loc") if link.parent.name == "url" and "/p/" in link.text] + if len(links) == 0: + links = [link.text for link in soup.find_all("loc") if "/p/" in link.text] + + doc_id = hashlib.sha256((" ".join(links) + url).encode()).hexdigest() + + def serialize_response(soup: BeautifulSoup): + data = {} + + h1_el = soup.find("h1") + if h1_el is not None: + data["title"] = h1_el.text + + description_el = soup.find("meta", {"name": "description"}) + if description_el is not None: + data["description"] = description_el["content"] + + content_el = soup.find("div", {"id": "content-blocks"}) + if content_el is not None: + data["content"] = content_el.text + + return data + + def load_link(link: str): + try: + beehiiv_data = requests.get(link, headers=headers) + beehiiv_data.raise_for_status() + + soup = BeautifulSoup(beehiiv_data.text, "html.parser") + data = serialize_response(soup) + data = str(data) + if is_readable(data): + return data + else: + logging.warning(f"Page is not readable (too many invalid characters): {link}") + except ParserRejectedMarkup as e: + logging.error(f"Failed to parse {link}: {e}") + return None + + for link in links: + data = load_link(link) + if data: + output.append({"content": data, "meta_data": {"url": link}}) + # TODO: allow users to configure this + time.sleep(1.0) # added to avoid rate limiting + + return {"doc_id": doc_id, "data": output} diff --git a/embedchain/loaders/substack.py b/embedchain/loaders/substack.py index 0d46b6d1..350ab3c2 100644 --- a/embedchain/loaders/substack.py +++ b/embedchain/loaders/substack.py @@ -12,9 +12,7 @@ from embedchain.utils import is_readable @register_deserializable class SubstackLoader(BaseLoader): """ - This method takes a sitemap URL as input and retrieves - all the URLs to use the WebPageLoader to load content - of each page. 
+ This loader is used to load data from Substack URLs. """ def load_data(self, url: str): @@ -62,10 +60,10 @@ class SubstackLoader(BaseLoader): def load_link(link: str): try: - each_load_data = requests.get(link) - each_load_data.raise_for_status() + substack_data = requests.get(link) + substack_data.raise_for_status() - soup = BeautifulSoup(response.text, "html.parser") + soup = BeautifulSoup(substack_data.text, "html.parser") data = serialize_response(soup) data = str(data) if is_readable(data): diff --git a/embedchain/models/data_type.py b/embedchain/models/data_type.py index a56f7f8d..3bc962f5 100644 --- a/embedchain/models/data_type.py +++ b/embedchain/models/data_type.py @@ -33,6 +33,7 @@ class IndirectDataType(Enum): YOUTUBE_CHANNEL = "youtube_channel" DISCORD = "discord" CUSTOM = "custom" + BEEHIIV = "beehiiv" class SpecialDataType(Enum): @@ -65,3 +66,4 @@ class DataType(Enum): YOUTUBE_CHANNEL = IndirectDataType.YOUTUBE_CHANNEL.value DISCORD = IndirectDataType.DISCORD.value CUSTOM = IndirectDataType.CUSTOM.value + BEEHIIV = IndirectDataType.BEEHIIV.value