Files
t6_mem0/embedchain/chunkers/website.py
Taranjeet Singh 468db83337 Add simple app functionality
This commit enables anyone to create an app and add three types of data
sources:

* pdf file
* youtube video
* website

It exposes a function called query, which first retrieves similar documents from
the vector DB and then passes them to the LLM to get the final answer.
2023-06-20 14:42:55 +05:30

36 lines
990 B
Python

import hashlib
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Keyword arguments forwarded verbatim to RecursiveCharacterTextSplitter.
TEXT_SPLITTER_CHUNK_PARAMS = {
    # Target upper bound for a chunk, measured with ``length_function``.
    "chunk_size": 500,
    # Zero overlap requested between consecutive chunks.
    "chunk_overlap": 0,
    # Built-in ``len`` is the measuring function handed to the splitter.
    "length_function": len,
}

# Single splitter instance created at import time and shared by the module.
TEXT_SPLITTER = RecursiveCharacterTextSplitter(**TEXT_SPLITTER_CHUNK_PARAMS)
class WebsiteChunker:
    """Turn loaded web page content into chunks with content-derived ids."""

    def create_chunks(self, loader, url):
        """Load ``url`` through ``loader`` and split each entry into chunks.

        Args:
            loader: Object exposing ``load_data(url)``. Each item it yields is
                indexed with ``"content"`` (text passed to the splitter) and
                ``"meta_data"`` (a mapping that must contain ``"url"``).
            url: Address handed to ``loader.load_data``.

        Returns:
            A dict with three index-aligned lists:

            * ``"documents"``: chunk texts produced by the module-level
              ``TEXT_SPLITTER``.
            * ``"ids"``: SHA-256 hex digest of ``chunk + meta_data["url"]``.
            * ``"metadatas"``: the entry's ``meta_data`` object, appended once
              per chunk (the same object is shared, not copied).

            A chunk whose id was already emitted during this call is skipped,
            so ``"ids"`` never contains duplicates.

        Raises:
            KeyError: If a loaded entry lacks ``"content"`` or ``"meta_data"``,
                or its ``meta_data`` lacks ``"url"``.
        """
        documents = []
        ids = []
        metadatas = []
        # Ids already emitted in this call. Identical chunk text on the same
        # page hashes to the same id; id-keyed stores commonly reject or
        # overwrite duplicates, so only the first occurrence is kept.
        seen_ids = set()

        for data in loader.load_data(url):
            content = data["content"]
            meta_data = data["meta_data"]
            chunks = TEXT_SPLITTER.split_text(content)
            # Use the URL reported by the loader for hashing, under its own
            # name so the ``url`` parameter is not rebound inside the loop.
            page_url = meta_data["url"]
            for chunk in chunks:
                chunk_id = hashlib.sha256((chunk + page_url).encode()).hexdigest()
                if chunk_id in seen_ids:
                    continue
                seen_ids.add(chunk_id)
                ids.append(chunk_id)
                documents.append(chunk)
                metadatas.append(meta_data)

        return {
            "documents": documents,
            "ids": ids,
            "metadatas": metadatas,
        }