Resolve conflicts (#208)

This commit is contained in:
Deshraj Yadav
2023-07-10 21:50:05 -07:00
committed by GitHub
parent 6936d6983d
commit 9ca836520f
32 changed files with 396 additions and 207 deletions

View File

@@ -1,8 +1,9 @@
from langchain.document_loaders import Docx2txtLoader
class DocxFileLoader:
def load_data(self, url):
''' Load data from a .docx file. '''
"""Load data from a .docx file."""
loader = Docx2txtLoader(url)
output = []
data = loader.load()

View File

@@ -1,13 +1,14 @@
class LocalQnaPairLoader:
    """Loader that wraps an in-memory question/answer pair as one document."""

    def load_data(self, content):
        """Load data from a local QnA pair.

        Args:
            content: A two-item iterable of ``(question, answer)``.

        Returns:
            A one-element list containing a dict with:
              * ``"content"``: the pair formatted as ``"Q: ...\\nA: ..."``.
              * ``"meta_data"``: ``{"url": "local"}``.

        Raises:
            ValueError: If ``content`` does not unpack into exactly two items.
        """
        question, answer = content
        content = f"Q: {question}\nA: {answer}"
        meta_data = {
            "url": "local",
        }
        return [
            {
                "content": content,
                "meta_data": meta_data,
            }
        ]

View File

@@ -1,11 +1,12 @@
class LocalTextLoader:
    """Loader that wraps a caller-supplied text value as one document."""

    def load_data(self, content):
        """Load data from a local text file.

        Args:
            content: The text to wrap; it is stored as given, without any
                cleaning or transformation.

        Returns:
            A one-element list containing a dict with:
              * ``"content"``: the ``content`` argument unchanged.
              * ``"meta_data"``: ``{"url": "local"}``.
        """
        meta_data = {
            "url": "local",
        }
        return [
            {
                "content": content,
                "meta_data": meta_data,
            }
        ]

View File

@@ -4,9 +4,8 @@ from embedchain.utils import clean_string
class PdfFileLoader:
def load_data(self, url):
''' Load data from a PDF file. '''
"""Load data from a PDF file."""
loader = PyPDFLoader(url)
output = []
pages = loader.load_and_split()
@@ -17,8 +16,10 @@ class PdfFileLoader:
content = clean_string(content)
meta_data = page.metadata
meta_data["url"] = url
output.append({
"content": content,
"meta_data": meta_data,
})
output.append(
{
"content": content,
"meta_data": meta_data,
}
)
return output

View File

@@ -1,22 +1,29 @@
import requests
from bs4 import BeautifulSoup
from embedchain.utils import clean_string
class WebPageLoader:
def load_data(self, url):
''' Load data from a web page. '''
"""Load data from a web page."""
response = requests.get(url)
data = response.content
soup = BeautifulSoup(data, 'html.parser')
for tag in soup([
"nav", "aside", "form", "header",
"noscript", "svg", "canvas",
"footer", "script", "style"
]):
soup = BeautifulSoup(data, "html.parser")
for tag in soup(
[
"nav",
"aside",
"form",
"header",
"noscript",
"svg",
"canvas",
"footer",
"script",
"style",
]
):
tag.string = " "
output = []
content = soup.get_text()
@@ -24,8 +31,10 @@ class WebPageLoader:
meta_data = {
"url": url,
}
output.append({
"content": content,
"meta_data": meta_data,
})
return output
output.append(
{
"content": content,
"meta_data": meta_data,
}
)
return output

View File

@@ -4,9 +4,8 @@ from embedchain.utils import clean_string
class YoutubeVideoLoader:
def load_data(self, url):
''' Load data from a Youtube video. '''
"""Load data from a Youtube video."""
loader = YoutubeLoader.from_youtube_url(url, add_video_info=True)
doc = loader.load()
output = []
@@ -16,8 +15,10 @@ class YoutubeVideoLoader:
content = clean_string(content)
meta_data = doc[0].metadata
meta_data["url"] = url
output.append({
"content": content,
"meta_data": meta_data,
})
output.append(
{
"content": content,
"meta_data": meta_data,
}
)
return output