[Feature] RSS Feed loader (#942)
This commit is contained in:
52
embedchain/loaders/rss_feed.py
Normal file
52
embedchain/loaders/rss_feed.py
Normal file
@@ -0,0 +1,52 @@
|
||||
import hashlib
|
||||
|
||||
from embedchain.helper.json_serializable import register_deserializable
|
||||
from embedchain.loaders.base_loader import BaseLoader
|
||||
|
||||
|
||||
@register_deserializable
|
||||
class RSSFeedLoader(BaseLoader):
|
||||
"""Loader for RSS Feed."""
|
||||
|
||||
def load_data(self, url):
|
||||
"""Load data from a rss feed."""
|
||||
output = self.get_rss_content(url)
|
||||
doc_id = hashlib.sha256((str(output) + url).encode()).hexdigest()
|
||||
return {
|
||||
"doc_id": doc_id,
|
||||
"data": output,
|
||||
}
|
||||
|
||||
@staticmethod
|
||||
def serialize_metadata(metadata):
|
||||
for key, value in metadata.items():
|
||||
if not isinstance(value, (str, int, float, bool)):
|
||||
metadata[key] = str(value)
|
||||
|
||||
return metadata
|
||||
|
||||
@staticmethod
|
||||
def get_rss_content(url: str):
|
||||
try:
|
||||
from langchain.document_loaders import RSSFeedLoader as LangchainRSSFeedLoader
|
||||
except ImportError:
|
||||
raise ImportError(
|
||||
"""RSSFeedLoader file requires extra dependencies.
|
||||
Install with `pip install --upgrade "embedchain[rss_feed]"`"""
|
||||
) from None
|
||||
|
||||
output = []
|
||||
loader = LangchainRSSFeedLoader(urls=[url])
|
||||
data = loader.load()
|
||||
|
||||
for entry in data:
|
||||
meta_data = RSSFeedLoader.serialize_metadata(entry.metadata)
|
||||
meta_data.update({"url": url})
|
||||
output.append(
|
||||
{
|
||||
"content": entry.page_content,
|
||||
"meta_data": meta_data,
|
||||
}
|
||||
)
|
||||
|
||||
return output
|
||||
Reference in New Issue
Block a user