Add simple app functionality

This commit enables anyone to create an app and add three types of data sources: PDF file, YouTube video, and website. It exposes a function called `query`, which first retrieves similar documents from the vector DB and then passes them to the LLM to get the final answer.
2023-06-20 14:42:52 +05:30
parent 775da3e481
commit 468db83337
11 changed files with 329 additions and 0 deletions
--- a/embedchain/loaders/__init__.py
+++ b/embedchain/loaders/__init__.py
--- a/embedchain/loaders/pdf_file.py
+++ b/embedchain/loaders/pdf_file.py
@@ -0,0 +1,23 @@
+from langchain.document_loaders import PyPDFLoader
+
+from embedchain.utils import clean_string
+
+
+class PdfFileLoader:
+    """Load a PDF through ``PyPDFLoader`` and return cleaned text entries."""
+
+    def load_data(self, url):
+        """Return one content/meta_data dict per document split from ``url``.
+
+        Raises ValueError("No data found") when the loader yields nothing.
+        """
+        pages = PyPDFLoader(url).load_and_split()
+        if not pages:
+            raise ValueError("No data found")
+        output = []
+        for page in pages:
+            content = clean_string(page.page_content)
+            meta_data = page.metadata
+            meta_data["url"] = url  # record the source alongside loader metadata
+            output.append({"content": content, "meta_data": meta_data})
+        return output
--- a/embedchain/loaders/website.py
+++ b/embedchain/loaders/website.py
@@ -0,0 +1,30 @@
+import requests
+
+from bs4 import BeautifulSoup
+
+from embedchain.utils import clean_string
+
+
+class WebsiteLoader:
+    """Fetch a web page and return its text content with metadata."""
+
+    def load_data(self, url):
+        """Return a one-item list with the cleaned text of the page at ``url``.
+
+        Raises ``requests.RequestException`` on timeout or HTTP error status.
+        """
+        # Bound the wait and reject 4xx/5xx so an error page is never returned
+        # as if it were the requested content.
+        response = requests.get(url, timeout=30)
+        response.raise_for_status()
+        soup = BeautifulSoup(response.content, 'html.parser')
+        # Blank out the text of elements that are not part of the main content.
+        for tag in soup([
+            "nav", "aside", "form", "header",
+            "noscript", "svg", "canvas",
+            "footer", "script", "style"
+        ]):
+            tag.string = " "
+        content = clean_string(soup.get_text())
+        meta_data = {"url": url}
+        return [{"content": content, "meta_data": meta_data}]
--- a/embedchain/loaders/youtube_video.py
+++ b/embedchain/loaders/youtube_video.py
@@ -0,0 +1,22 @@
+from langchain.document_loaders import YoutubeLoader
+
+from embedchain.utils import clean_string
+
+
+class YoutubeVideoLoader:
+    """Load a YouTube video through ``YoutubeLoader`` as one cleaned entry."""
+
+    def load_data(self, url):
+        """Return a one-item list built from the first document for ``url``.
+
+        Raises ValueError("No data found") when the loader yields nothing.
+        """
+        loader = YoutubeLoader.from_youtube_url(url, add_video_info=True)
+        docs = loader.load()
+        if not docs:
+            raise ValueError("No data found")
+        first = docs[0]  # only the first returned document is used
+        content = clean_string(first.page_content)
+        meta_data = first.metadata
+        meta_data["url"] = url
+        return [{"content": content, "meta_data": meta_data}]