Adds a base chunker from which any chunker can inherit. Existing chunkers are refactored to inherit from this base chunker.
28 lines
819 B
Python
28 lines
819 B
Python
import hashlib
|
|
|
|
|
|
class BaseChunker:
    """Base chunker that concrete chunkers can inherit from.

    Holds a text splitter and uses it to turn the records returned by a
    loader into parallel lists of chunk texts, chunk ids and metadata.
    """

    def __init__(self, text_splitter):
        """Store the splitter used by ``create_chunks``.

        Args:
            text_splitter: Object exposing ``split_text(content)``; the
                values it yields are concatenated with a URL string and
                encoded, so they must be ``str``.
        """
        self.text_splitter = text_splitter

    def create_chunks(self, loader, url):
        """Load data for ``url`` and split every record into chunks.

        Args:
            loader: Object exposing ``load_data(url)``. It must return an
                iterable of mappings, each with a ``"content"`` entry and
                a ``"meta_data"`` mapping that contains a ``"url"`` entry.
            url: Value passed through unchanged to ``loader.load_data``.

        Returns:
            A dict with three index-aligned lists:
            ``"documents"`` (chunk texts), ``"ids"`` (SHA-256 hex digests
            of ``chunk + meta_data["url"]``) and ``"metadatas"`` (the
            ``meta_data`` mapping of the record each chunk came from).

        Raises:
            KeyError: If a record lacks ``"content"``/``"meta_data"`` or
                its metadata lacks ``"url"``.
        """
        documents = []
        ids = []
        metadatas = []
        # An id is derived only from (chunk text, record url), so repeated
        # text under the same url would yield the same id twice. Track what
        # has been emitted so the returned ids stay unique.
        seen_ids = set()

        for data in loader.load_data(url):
            content = data["content"]
            meta_data = data["meta_data"]
            chunks = self.text_splitter.split_text(content)
            # Named distinctly from the ``url`` parameter: the id uses the
            # url recorded in the metadata, not the caller's argument.
            source_url = meta_data["url"]

            for chunk in chunks:
                chunk_id = hashlib.sha256((chunk + source_url).encode()).hexdigest()
                if chunk_id in seen_ids:
                    continue
                seen_ids.add(chunk_id)
                ids.append(chunk_id)
                documents.append(chunk)
                # The same meta_data object is appended for every chunk of
                # a record (no copy is made).
                metadatas.append(meta_data)

        return {
            "documents": documents,
            "ids": ids,
            "metadatas": metadatas,
        }
|