feat: Add embedding manager (#570)

This commit is contained in:
Taranjeet Singh
2023-09-11 23:43:53 -07:00
committed by GitHub
parent ba208f5b48
commit 2bd6881361
16 changed files with 311 additions and 73 deletions

View File

@@ -1,4 +1,5 @@
import csv
import hashlib
from io import StringIO
from urllib.parse import urlparse
@@ -34,7 +35,7 @@ class CsvLoader(BaseLoader):
def load_data(content):
"""Load a csv file with headers. Each line is a document"""
result = []
lines = []
with CsvLoader._get_file_content(content) as file:
first_line = file.readline()
delimiter = CsvLoader._detect_delimiter(first_line)
@@ -42,5 +43,10 @@ class CsvLoader(BaseLoader):
reader = csv.DictReader(file, delimiter=delimiter)
for i, row in enumerate(reader):
line = ", ".join([f"{field}: {value}" for field, value in row.items()])
lines.append(line)
result.append({"content": line, "meta_data": {"url": content, "row": i + 1}})
return result
doc_id = hashlib.sha256((content + " ".join(lines)).encode()).hexdigest()
return {
"doc_id": doc_id,
"data": result
}

View File

@@ -1,3 +1,4 @@
import hashlib
import logging
from urllib.parse import urljoin, urlparse
@@ -99,4 +100,8 @@ class DocsSiteLoader(BaseLoader):
output = []
for u in all_urls:
output.extend(self._load_data_from_url(u))
return output
doc_id = hashlib.sha256((" ".join(all_urls) + url).encode()).hexdigest()
return {
"doc_id": doc_id,
"data": output,
}

View File

@@ -1,3 +1,5 @@
import hashlib
from langchain.document_loaders import Docx2txtLoader
from embedchain.helper.json_serializable import register_deserializable
@@ -15,4 +17,8 @@ class DocxFileLoader(BaseLoader):
meta_data = data[0].metadata
meta_data["url"] = "local"
output.append({"content": content, "meta_data": meta_data})
return output
doc_id = hashlib.sha256((content + url).encode()).hexdigest()
return {
"doc_id": doc_id,
"data": output,
}

View File

@@ -1,3 +1,5 @@
import hashlib
from embedchain.helper.json_serializable import register_deserializable
from embedchain.loaders.base_loader import BaseLoader
@@ -8,12 +10,17 @@ class LocalQnaPairLoader(BaseLoader):
"""Load data from a local QnA pair."""
question, answer = content
content = f"Q: {question}\nA: {answer}"
url = "local"
meta_data = {
"url": "local",
"url": url,
}
doc_id = hashlib.sha256((content + url).encode()).hexdigest()
return {
"doc_id": doc_id,
"data": [
{
"content": content,
"meta_data": meta_data,
}
]
}
return [
{
"content": content,
"meta_data": meta_data,
}
]

View File

@@ -1,3 +1,5 @@
import hashlib
from embedchain.helper.json_serializable import register_deserializable
from embedchain.loaders.base_loader import BaseLoader
@@ -6,12 +8,17 @@ from embedchain.loaders.base_loader import BaseLoader
class LocalTextLoader(BaseLoader):
def load_data(self, content):
"""Load data from a local text file."""
url = "local"
meta_data = {
"url": "local",
"url": url,
}
doc_id = hashlib.sha256((content + url).encode()).hexdigest()
return {
"doc_id": doc_id,
"data": [
{
"content": content,
"meta_data": meta_data,
}
]
}
return [
{
"content": content,
"meta_data": meta_data,
}
]

View File

@@ -1,3 +1,4 @@
import hashlib
import logging
import os
@@ -34,10 +35,13 @@ class NotionLoader(BaseLoader):
# Clean text
text = clean_string(raw_text)
return [
doc_id = hashlib.sha256((text + source).encode()).hexdigest()
return {
"doc_id": doc_id,
"data": [
{
"content": text,
"meta_data": {"url": f"notion-{formatted_id}"},
}
]
],
}

View File

@@ -1,3 +1,5 @@
import hashlib
from langchain.document_loaders import PyPDFLoader
from embedchain.helper.json_serializable import register_deserializable
@@ -10,7 +12,8 @@ class PdfFileLoader(BaseLoader):
def load_data(self, url):
"""Load data from a PDF file."""
loader = PyPDFLoader(url)
output = []
data = []
all_content = []
pages = loader.load_and_split()
if not len(pages):
raise ValueError("No data found")
@@ -19,10 +22,15 @@ class PdfFileLoader(BaseLoader):
content = clean_string(content)
meta_data = page.metadata
meta_data["url"] = url
output.append(
data.append(
{
"content": content,
"meta_data": meta_data,
}
)
return output
all_content.append(content)
doc_id = hashlib.sha256((" ".join(all_content) + url).encode()).hexdigest()
return {
"doc_id": doc_id,
"data": data,
}

View File

@@ -1,3 +1,4 @@
import hashlib
import logging
import requests
@@ -30,6 +31,8 @@ class SitemapLoader(BaseLoader):
# Get all <loc> tags as a fallback. This might include images.
links = [link.text for link in soup.find_all("loc")]
doc_id = hashlib.sha256((" ".join(links) + sitemap_url).encode()).hexdigest()
for link in links:
try:
each_load_data = web_page_loader.load_data(link)
@@ -40,4 +43,7 @@ class SitemapLoader(BaseLoader):
logging.warning(f"Page is not readable (too many invalid characters): {link}")
except ParserRejectedMarkup as e:
logging.error(f"Failed to parse {link}: {e}")
return [data[0] for data in output]
return {
"doc_id": doc_id,
"data": [data[0] for data in output]
}

View File

@@ -1,3 +1,4 @@
import hashlib
import logging
import requests
@@ -63,10 +64,14 @@ class WebPageLoader(BaseLoader):
meta_data = {
"url": url,
}
return [
{
"content": content,
"meta_data": meta_data,
}
]
content = content
doc_id = hashlib.sha256((content + url).encode()).hexdigest()
return {
"doc_id": doc_id,
"data": [
{
"content": content,
"meta_data": meta_data,
}
],
}

View File

@@ -1,3 +1,5 @@
import hashlib
from langchain.document_loaders import YoutubeLoader
from embedchain.helper.json_serializable import register_deserializable
@@ -18,10 +20,15 @@ class YoutubeVideoLoader(BaseLoader):
content = clean_string(content)
meta_data = doc[0].metadata
meta_data["url"] = url
output.append(
{
"content": content,
"meta_data": meta_data,
}
)
return output
doc_id = hashlib.sha256((content + url).encode()).hexdigest()
return {
"doc_id": doc_id,
"data": output,
}