feat: Add embedding manager (#570)
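
Every loader now returns a deterministic document ID alongside the loaded
data: instead of a bare list of {"content", "meta_data"} entries, load_data
returns {"doc_id": ..., "data": [...]}. The doc_id is the SHA-256 hex digest
of the loaded content concatenated with the source URL ("local" for local
sources).

A minimal sketch of the shared pattern, for reference only. The make_doc_id
helper is illustrative and not part of this commit; each loader inlines the
hash instead:

    import hashlib

    def make_doc_id(content: str, url: str) -> str:
        # Deterministic: identical content from the identical source always
        # yields the same doc_id, so repeat ingestion of an unchanged source
        # can be detected upstream.
        return hashlib.sha256((content + url).encode()).hexdigest()

    def load_local_text(content: str) -> dict:
        # Mirrors the new LocalTextLoader return shape (illustrative).
        url = "local"
        return {
            "doc_id": make_doc_id(content, url),
            "data": [{"content": content, "meta_data": {"url": url}}],
        }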
@@ -1,4 +1,5 @@
 import csv
+import hashlib
 from io import StringIO
 from urllib.parse import urlparse
 
@@ -34,7 +35,7 @@ class CsvLoader(BaseLoader):
     def load_data(content):
         """Load a csv file with headers. Each line is a document"""
         result = []
-
+        lines = []
         with CsvLoader._get_file_content(content) as file:
             first_line = file.readline()
             delimiter = CsvLoader._detect_delimiter(first_line)
@@ -42,5 +43,10 @@ class CsvLoader(BaseLoader):
             reader = csv.DictReader(file, delimiter=delimiter)
             for i, row in enumerate(reader):
                 line = ", ".join([f"{field}: {value}" for field, value in row.items()])
+                lines.append(line)
                 result.append({"content": line, "meta_data": {"url": content, "row": i + 1}})
-        return result
+        doc_id = hashlib.sha256((content + " ".join(lines)).encode()).hexdigest()
+        return {
+            "doc_id": doc_id,
+            "data": result
+        }

@@ -1,3 +1,4 @@
+import hashlib
 import logging
 from urllib.parse import urljoin, urlparse
 
@@ -99,4 +100,8 @@ class DocsSiteLoader(BaseLoader):
         output = []
         for u in all_urls:
             output.extend(self._load_data_from_url(u))
-        return output
+        doc_id = hashlib.sha256((" ".join(all_urls) + url).encode()).hexdigest()
+        return {
+            "doc_id": doc_id,
+            "data": output,
+        }

@@ -1,3 +1,5 @@
+import hashlib
+
 from langchain.document_loaders import Docx2txtLoader
 
 from embedchain.helper.json_serializable import register_deserializable
@@ -15,4 +17,8 @@ class DocxFileLoader(BaseLoader):
         meta_data = data[0].metadata
         meta_data["url"] = "local"
         output.append({"content": content, "meta_data": meta_data})
-        return output
+        doc_id = hashlib.sha256((content + url).encode()).hexdigest()
+        return {
+            "doc_id": doc_id,
+            "data": output,
+        }

@@ -1,3 +1,5 @@
+import hashlib
+
 from embedchain.helper.json_serializable import register_deserializable
 from embedchain.loaders.base_loader import BaseLoader
 
@@ -8,12 +10,17 @@ class LocalQnaPairLoader(BaseLoader):
         """Load data from a local QnA pair."""
         question, answer = content
         content = f"Q: {question}\nA: {answer}"
+        url = "local"
         meta_data = {
-            "url": "local",
+            "url": url,
         }
-        return [
-            {
-                "content": content,
-                "meta_data": meta_data,
-            }
-        ]
+        doc_id = hashlib.sha256((content + url).encode()).hexdigest()
+        return {
+            "doc_id": doc_id,
+            "data": [
+                {
+                    "content": content,
+                    "meta_data": meta_data,
+                }
+            ]
+        }

@@ -1,3 +1,5 @@
+import hashlib
+
 from embedchain.helper.json_serializable import register_deserializable
 from embedchain.loaders.base_loader import BaseLoader
 
@@ -6,12 +8,17 @@ from embedchain.loaders.base_loader import BaseLoader
 class LocalTextLoader(BaseLoader):
     def load_data(self, content):
         """Load data from a local text file."""
+        url = "local"
         meta_data = {
-            "url": "local",
+            "url": url,
         }
-        return [
-            {
-                "content": content,
-                "meta_data": meta_data,
-            }
-        ]
+        doc_id = hashlib.sha256((content + url).encode()).hexdigest()
+        return {
+            "doc_id": doc_id,
+            "data": [
+                {
+                    "content": content,
+                    "meta_data": meta_data,
+                }
+            ]
+        }

@@ -1,3 +1,4 @@
+import hashlib
 import logging
 import os
 
@@ -34,10 +35,13 @@ class NotionLoader(BaseLoader):
 
         # Clean text
         text = clean_string(raw_text)
 
-        return [
+        doc_id = hashlib.sha256((text + source).encode()).hexdigest()
+        return {
+            "doc_id": doc_id,
+            "data": [
             {
                 "content": text,
                 "meta_data": {"url": f"notion-{formatted_id}"},
             }
-        ]
+            ],
+        }

@@ -1,3 +1,5 @@
+import hashlib
+
 from langchain.document_loaders import PyPDFLoader
 
 from embedchain.helper.json_serializable import register_deserializable
@@ -10,7 +12,8 @@ class PdfFileLoader(BaseLoader):
     def load_data(self, url):
         """Load data from a PDF file."""
        loader = PyPDFLoader(url)
-        output = []
+        data = []
+        all_content = []
         pages = loader.load_and_split()
         if not len(pages):
             raise ValueError("No data found")
@@ -19,10 +22,15 @@ class PdfFileLoader(BaseLoader):
             content = clean_string(content)
             meta_data = page.metadata
             meta_data["url"] = url
-            output.append(
+            data.append(
                 {
                     "content": content,
                     "meta_data": meta_data,
                 }
             )
-        return output
+            all_content.append(content)
+        doc_id = hashlib.sha256((" ".join(all_content) + url).encode()).hexdigest()
+        return {
+            "doc_id": doc_id,
+            "data": data,
+        }

@@ -1,3 +1,4 @@
+import hashlib
 import logging
 
 import requests
@@ -30,6 +31,8 @@ class SitemapLoader(BaseLoader):
         # Get all <loc> tags as a fallback. This might include images.
         links = [link.text for link in soup.find_all("loc")]
 
+        doc_id = hashlib.sha256((" ".join(links) + sitemap_url).encode()).hexdigest()
+
         for link in links:
             try:
                 each_load_data = web_page_loader.load_data(link)
@@ -40,4 +43,7 @@ class SitemapLoader(BaseLoader):
                 logging.warning(f"Page is not readable (too many invalid characters): {link}")
             except ParserRejectedMarkup as e:
                 logging.error(f"Failed to parse {link}: {e}")
-        return [data[0] for data in output]
+        return {
+            "doc_id": doc_id,
+            "data": [data[0] for data in output]
+        }

@@ -1,3 +1,4 @@
+import hashlib
 import logging
 
 import requests
@@ -63,10 +64,14 @@ class WebPageLoader(BaseLoader):
         meta_data = {
             "url": url,
         }
-
-        return [
-            {
-                "content": content,
-                "meta_data": meta_data,
-            }
-        ]
+        content = content
+        doc_id = hashlib.sha256((content + url).encode()).hexdigest()
+        return {
+            "doc_id": doc_id,
+            "data": [
+                {
+                    "content": content,
+                    "meta_data": meta_data,
+                }
+            ],
+        }

@@ -1,3 +1,5 @@
+import hashlib
+
 from langchain.document_loaders import YoutubeLoader
 
 from embedchain.helper.json_serializable import register_deserializable
@@ -18,10 +20,15 @@ class YoutubeVideoLoader(BaseLoader):
         content = clean_string(content)
         meta_data = doc[0].metadata
         meta_data["url"] = url
+
         output.append(
             {
                 "content": content,
                 "meta_data": meta_data,
             }
         )
-        return output
+        doc_id = hashlib.sha256((content + url).encode()).hexdigest()
+        return {
+            "doc_id": doc_id,
+            "data": output,
+        }
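
For reference, a sketch of how a caller might consume the new return shape to
skip re-ingesting unchanged sources. The add_source wrapper and the
seen_doc_ids cache are assumptions for illustration, not code from this
commit:

    seen_doc_ids = set()

    def add_source(loader, source):
        """Load a source; return the number of chunks ingested."""
        result = loader.load_data(source)  # now {"doc_id": str, "data": [...]}
        if result["doc_id"] in seen_doc_ids:
            return 0  # identical content + URL was already ingested; skip
        seen_doc_ids.add(result["doc_id"])
        for chunk in result["data"]:
            # hand each chunk to the chunker/embedder (hypothetical hand-off)
            _ = (chunk["content"], chunk["meta_data"])
        return len(result["data"])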