Chunkers: Refactor each chunker & add base class

Adds a base chunker from which any chunker can inherit.
Existing chunkers are refactored to inherit from this base
chunker.
This commit is contained in:
Taranjeet Singh
2023-06-20 16:30:18 +05:30
parent d2da80f5bc
commit 4329caa17c
4 changed files with 42 additions and 75 deletions

View File

@@ -0,0 +1,27 @@
import hashlib
class BaseChunker:
    """Shared chunking pipeline that every concrete chunker inherits.

    A subclass supplies a text splitter (any object exposing
    ``split_text(str) -> list[str]``); this class converts the loader's
    raw documents into parallel lists of chunk texts, ids and metadata.
    """

    def __init__(self, text_splitter):
        # Splitter strategy injected by the concrete chunker subclass.
        self.text_splitter = text_splitter

    def create_chunks(self, loader, url):
        """Load *url* via *loader* and split each document into chunks.

        Returns a dict of three parallel lists:
        - "documents": the chunk texts,
        - "ids": sha256 hex digest of chunk text + the document's own
          metadata url (deterministic, content-addressed ids),
        - "metadatas": the source document's metadata, repeated per chunk.
        """
        chunk_texts = []
        chunk_ids = []
        chunk_metas = []
        for record in loader.load_data(url):
            meta = record["meta_data"]
            # Hash against the per-document url from metadata, not the
            # url argument (they may differ, e.g. after redirects).
            source = meta["url"]
            for piece in self.text_splitter.split_text(record["content"]):
                digest = hashlib.sha256((piece + source).encode()).hexdigest()
                chunk_ids.append(digest)
                chunk_texts.append(piece)
                chunk_metas.append(meta)
        return {
            "documents": chunk_texts,
            "ids": chunk_ids,
            "metadatas": chunk_metas,
        }

View File

@@ -1,4 +1,4 @@
import hashlib
from embedchain.chunkers.base_chunker import BaseChunker
from langchain.text_splitter import RecursiveCharacterTextSplitter
@@ -9,28 +9,8 @@ TEXT_SPLITTER_CHUNK_PARAMS = {
"length_function": len,
}
TEXT_SPLITTER = RecursiveCharacterTextSplitter(**TEXT_SPLITTER_CHUNK_PARAMS)
class PdfFileChunker:
    """Chunk documents produced by a PDF loader (pre-BaseChunker version)."""

    def create_chunks(self, loader, url):
        """Split every document loaded from *url* into chunks.

        Returns a dict of parallel lists: "documents" (chunk texts),
        "ids" (sha256 of chunk text + metadata url) and "metadatas".
        """
        result = {"documents": [], "ids": [], "metadatas": []}
        for record in loader.load_data(url):
            meta = record["meta_data"]
            source_url = meta["url"]
            # Module-level TEXT_SPLITTER performs the actual splitting.
            for piece in TEXT_SPLITTER.split_text(record["content"]):
                digest = hashlib.sha256((piece + source_url).encode()).hexdigest()
                result["ids"].append(digest)
                result["documents"].append(piece)
                result["metadatas"].append(meta)
        return result
class PdfFileChunker(BaseChunker):
    """Chunker for PDF files; all chunking logic lives in BaseChunker."""

    def __init__(self):
        # Configure the recursive splitter with the module-wide defaults.
        super().__init__(RecursiveCharacterTextSplitter(**TEXT_SPLITTER_CHUNK_PARAMS))

View File

@@ -1,4 +1,4 @@
import hashlib
from embedchain.chunkers.base_chunker import BaseChunker
from langchain.text_splitter import RecursiveCharacterTextSplitter
@@ -9,28 +9,8 @@ TEXT_SPLITTER_CHUNK_PARAMS = {
"length_function": len,
}
TEXT_SPLITTER = RecursiveCharacterTextSplitter(**TEXT_SPLITTER_CHUNK_PARAMS)
class WebsiteChunker:
    """Chunk documents produced by a website loader (pre-BaseChunker version)."""

    def create_chunks(self, loader, url):
        """Split every document loaded from *url* into chunks.

        Returns a dict of parallel lists: "documents" (chunk texts),
        "ids" (sha256 of chunk text + metadata url) and "metadatas".
        """
        result = {"documents": [], "ids": [], "metadatas": []}
        for record in loader.load_data(url):
            meta = record["meta_data"]
            source_url = meta["url"]
            # Module-level TEXT_SPLITTER performs the actual splitting.
            for piece in TEXT_SPLITTER.split_text(record["content"]):
                digest = hashlib.sha256((piece + source_url).encode()).hexdigest()
                result["ids"].append(digest)
                result["documents"].append(piece)
                result["metadatas"].append(meta)
        return result
class WebsiteChunker(BaseChunker):
    """Chunker for website content; all chunking logic lives in BaseChunker."""

    def __init__(self):
        # Configure the recursive splitter with the module-wide defaults.
        super().__init__(RecursiveCharacterTextSplitter(**TEXT_SPLITTER_CHUNK_PARAMS))

View File

@@ -1,4 +1,4 @@
import hashlib
from embedchain.chunkers.base_chunker import BaseChunker
from langchain.text_splitter import RecursiveCharacterTextSplitter
@@ -9,28 +9,8 @@ TEXT_SPLITTER_CHUNK_PARAMS = {
"length_function": len,
}
TEXT_SPLITTER = RecursiveCharacterTextSplitter(**TEXT_SPLITTER_CHUNK_PARAMS)
class YoutubeVideoChunker:
    """Chunk documents produced by a YouTube loader (pre-BaseChunker version)."""

    def create_chunks(self, loader, url):
        """Split every document loaded from *url* into chunks.

        Returns a dict of parallel lists: "documents" (chunk texts),
        "ids" (sha256 of chunk text + metadata url) and "metadatas".
        """
        result = {"documents": [], "ids": [], "metadatas": []}
        for record in loader.load_data(url):
            meta = record["meta_data"]
            source_url = meta["url"]
            # Module-level TEXT_SPLITTER performs the actual splitting.
            for piece in TEXT_SPLITTER.split_text(record["content"]):
                digest = hashlib.sha256((piece + source_url).encode()).hexdigest()
                result["ids"].append(digest)
                result["documents"].append(piece)
                result["metadatas"].append(meta)
        return result
class YoutubeVideoChunker(BaseChunker):
    """Chunker for YouTube transcripts; all chunking logic lives in BaseChunker."""

    def __init__(self):
        # Configure the recursive splitter with the module-wide defaults.
        super().__init__(RecursiveCharacterTextSplitter(**TEXT_SPLITTER_CHUNK_PARAMS))