Add simple app functionality
This commit enables anyone to create an app and add three types of data sources:

* PDF file
* YouTube video
* website

It exposes a function called query which first retrieves similar documents from the vector database and then passes them to the LLM to produce the final answer.
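To make that flow concrete, here is a minimal sketch. The collection object (a Chroma-style vector store) and the ask_llm callable are illustrative assumptions, not the API added by this commit.

# Minimal sketch of the described query flow, assuming a Chroma-style
# vector store (`collection`) and an LLM callable (`ask_llm`); both are
# hypothetical stand-ins, not the API introduced by this commit.
def query(input_query, collection, ask_llm, n_results=1):
    # 1. Fetch the chunks most similar to the query from the vector DB.
    result = collection.query(query_texts=[input_query], n_results=n_results)
    context = " | ".join(result["documents"][0])
    # 2. Hand the retrieved context plus the question to the LLM.
    prompt = (
        "Use the following pieces of context to answer the query.\n"
        f"Context: {context}\n"
        f"Query: {input_query}"
    )
    return ask_llm(prompt)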
embedchain/chunkers/__init__.py (new file, 0 lines)

embedchain/chunkers/pdf_file.py (new file, 36 lines)
@@ -0,0 +1,36 @@
import hashlib

from langchain.text_splitter import RecursiveCharacterTextSplitter


TEXT_SPLITTER_CHUNK_PARAMS = {
    "chunk_size": 1000,
    "chunk_overlap": 0,
    "length_function": len,
}

TEXT_SPLITTER = RecursiveCharacterTextSplitter(**TEXT_SPLITTER_CHUNK_PARAMS)


class PdfFileChunker:

    def create_chunks(self, loader, url):
        documents = []
        ids = []
        datas = loader.load_data(url)
        metadatas = []
        for data in datas:
            content = data["content"]
            meta_data = data["meta_data"]
            chunks = TEXT_SPLITTER.split_text(content)
            url = meta_data["url"]
            for chunk in chunks:
                chunk_id = hashlib.sha256((chunk + url).encode()).hexdigest()
                ids.append(chunk_id)
                documents.append(chunk)
                metadatas.append(meta_data)
        return {
            "documents": documents,
            "ids": ids,
            "metadatas": metadatas,
        }
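For orientation, a hedged usage sketch of the chunker above. FakePdfLoader is a hypothetical stand-in: create_chunks only needs an object whose load_data(url) returns dicts with "content" and "meta_data" keys; the actual loader classes are not shown in this section.

# Hypothetical usage sketch; FakePdfLoader stands in for a real loader.
from embedchain.chunkers.pdf_file import PdfFileChunker

class FakePdfLoader:
    def load_data(self, url):
        return [{"content": "Extracted PDF text ...", "meta_data": {"url": url}}]

chunker = PdfFileChunker()
result = chunker.create_chunks(FakePdfLoader(), "https://example.com/sample.pdf")

# ids are sha256(chunk + url) digests, so adding the same chunk from the
# same URL twice yields the same id (handy for deduplication upstream).
print(len(result["documents"]), result["ids"][0][:12])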
embedchain/chunkers/website.py (new file, 36 lines)
@@ -0,0 +1,36 @@
import hashlib

from langchain.text_splitter import RecursiveCharacterTextSplitter


TEXT_SPLITTER_CHUNK_PARAMS = {
    "chunk_size": 500,
    "chunk_overlap": 0,
    "length_function": len,
}

TEXT_SPLITTER = RecursiveCharacterTextSplitter(**TEXT_SPLITTER_CHUNK_PARAMS)


class WebsiteChunker:

    def create_chunks(self, loader, url):
        documents = []
        ids = []
        datas = loader.load_data(url)
        metadatas = []
        for data in datas:
            content = data["content"]
            meta_data = data["meta_data"]
            chunks = TEXT_SPLITTER.split_text(content)
            url = meta_data["url"]
            for chunk in chunks:
                chunk_id = hashlib.sha256((chunk + url).encode()).hexdigest()
                ids.append(chunk_id)
                documents.append(chunk)
                metadatas.append(meta_data)
        return {
            "documents": documents,
            "ids": ids,
            "metadatas": metadatas,
        }
embedchain/chunkers/youtube_video.py (new file, 36 lines)
@@ -0,0 +1,36 @@
import hashlib

from langchain.text_splitter import RecursiveCharacterTextSplitter


TEXT_SPLITTER_CHUNK_PARAMS = {
    "chunk_size": 2000,
    "chunk_overlap": 0,
    "length_function": len,
}

TEXT_SPLITTER = RecursiveCharacterTextSplitter(**TEXT_SPLITTER_CHUNK_PARAMS)


class YoutubeVideoChunker:

    def create_chunks(self, loader, url):
        documents = []
        ids = []
        datas = loader.load_data(url)
        metadatas = []
        for data in datas:
            content = data["content"]
            meta_data = data["meta_data"]
            chunks = TEXT_SPLITTER.split_text(content)
            url = meta_data["url"]
            for chunk in chunks:
                chunk_id = hashlib.sha256((chunk + url).encode()).hexdigest()
                ids.append(chunk_id)
                documents.append(chunk)
                metadatas.append(meta_data)
        return {
            "documents": documents,
            "ids": ids,
            "metadatas": metadatas,
        }
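Note: the three chunkers are identical apart from the class name and the chunk_size (1000 for PDF files, 500 for websites, 2000 for YouTube transcripts), so only the splitting granularity is tuned per source type.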