Chunkers: Refactor each chunker & add base class
Adds a base chunker from which any chunker can inherit. Existing chunkers are refactored to inherit from this base chunker.
embedchain/chunkers/base_chunker.py (new file, +27 lines)
@@ -0,0 +1,27 @@
+import hashlib
+
+
+class BaseChunker:
+    def __init__(self, text_splitter):
+        self.text_splitter = text_splitter
+
+    def create_chunks(self, loader, url):
+        documents = []
+        ids = []
+        datas = loader.load_data(url)
+        metadatas = []
+        for data in datas:
+            content = data["content"]
+            meta_data = data["meta_data"]
+            chunks = self.text_splitter.split_text(content)
+            url = meta_data["url"]
+            for chunk in chunks:
+                chunk_id = hashlib.sha256((chunk + url).encode()).hexdigest()
+                ids.append(chunk_id)
+                documents.append(chunk)
+                metadatas.append(meta_data)
+        return {
+            "documents": documents,
+            "ids": ids,
+            "metadatas": metadatas,
+        }
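
For orientation, here is a minimal sketch of how a concrete chunker plugs into this base class and what create_chunks returns. The MarkdownChunker and StubLoader names below are hypothetical illustrations, not part of this commit; BaseChunker only assumes the loader's load_data(url) returns dicts with "content" and "meta_data" (containing a "url") keys.

from langchain.text_splitter import RecursiveCharacterTextSplitter

from embedchain.chunkers.base_chunker import BaseChunker


class MarkdownChunker(BaseChunker):
    # Hypothetical subclass: configure a splitter and hand it to the base class.
    def __init__(self):
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0, length_function=len)
        super().__init__(text_splitter)


class StubLoader:
    # Hypothetical loader returning the shape BaseChunker.create_chunks expects.
    def load_data(self, url):
        return [{"content": "Some long document text...", "meta_data": {"url": url}}]


chunker = MarkdownChunker()
result = chunker.create_chunks(StubLoader(), "https://example.com/doc")
# Parallel lists: chunk texts, sha256(chunk + url) ids, and one meta_data dict per chunk.
print(len(result["documents"]), len(result["ids"]), len(result["metadatas"]))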
@@ -1,4 +1,4 @@
-import hashlib
+from embedchain.chunkers.base_chunker import BaseChunker
 
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 
@@ -9,28 +9,8 @@ TEXT_SPLITTER_CHUNK_PARAMS = {
     "length_function": len,
 }
 
-TEXT_SPLITTER = RecursiveCharacterTextSplitter(**TEXT_SPLITTER_CHUNK_PARAMS)
 
-
-class PdfFileChunker:
-    def create_chunks(self, loader, url):
-        documents = []
-        ids = []
-        datas = loader.load_data(url)
-        metadatas = []
-        for data in datas:
-            content = data["content"]
-            meta_data = data["meta_data"]
-            chunks = TEXT_SPLITTER.split_text(content)
-            url = meta_data["url"]
-            for chunk in chunks:
-                chunk_id = hashlib.sha256((chunk + url).encode()).hexdigest()
-                ids.append(chunk_id)
-                documents.append(chunk)
-                metadatas.append(meta_data)
-        return {
-            "documents": documents,
-            "ids": ids,
-            "metadatas": metadatas,
-        }
+class PdfFileChunker(BaseChunker):
+    def __init__(self):
+        text_splitter = RecursiveCharacterTextSplitter(**TEXT_SPLITTER_CHUNK_PARAMS)
+        super().__init__(text_splitter)
@@ -1,4 +1,4 @@
-import hashlib
+from embedchain.chunkers.base_chunker import BaseChunker
 
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 
@@ -9,28 +9,8 @@ TEXT_SPLITTER_CHUNK_PARAMS = {
     "length_function": len,
 }
 
-TEXT_SPLITTER = RecursiveCharacterTextSplitter(**TEXT_SPLITTER_CHUNK_PARAMS)
 
-
-class WebsiteChunker:
-    def create_chunks(self, loader, url):
-        documents = []
-        ids = []
-        datas = loader.load_data(url)
-        metadatas = []
-        for data in datas:
-            content = data["content"]
-            meta_data = data["meta_data"]
-            chunks = TEXT_SPLITTER.split_text(content)
-            url = meta_data["url"]
-            for chunk in chunks:
-                chunk_id = hashlib.sha256((chunk + url).encode()).hexdigest()
-                ids.append(chunk_id)
-                documents.append(chunk)
-                metadatas.append(meta_data)
-        return {
-            "documents": documents,
-            "ids": ids,
-            "metadatas": metadatas,
-        }
+class WebsiteChunker(BaseChunker):
+    def __init__(self):
+        text_splitter = RecursiveCharacterTextSplitter(**TEXT_SPLITTER_CHUNK_PARAMS)
+        super().__init__(text_splitter)
@@ -1,4 +1,4 @@
-import hashlib
+from embedchain.chunkers.base_chunker import BaseChunker
 
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 
@@ -9,28 +9,8 @@ TEXT_SPLITTER_CHUNK_PARAMS = {
     "length_function": len,
 }
 
-TEXT_SPLITTER = RecursiveCharacterTextSplitter(**TEXT_SPLITTER_CHUNK_PARAMS)
 
-
-class YoutubeVideoChunker:
-    def create_chunks(self, loader, url):
-        documents = []
-        ids = []
-        datas = loader.load_data(url)
-        metadatas = []
-        for data in datas:
-            content = data["content"]
-            meta_data = data["meta_data"]
-            chunks = TEXT_SPLITTER.split_text(content)
-            url = meta_data["url"]
-            for chunk in chunks:
-                chunk_id = hashlib.sha256((chunk + url).encode()).hexdigest()
-                ids.append(chunk_id)
-                documents.append(chunk)
-                metadatas.append(meta_data)
-        return {
-            "documents": documents,
-            "ids": ids,
-            "metadatas": metadatas,
-        }
+class YoutubeVideoChunker(BaseChunker):
+    def __init__(self):
+        text_splitter = RecursiveCharacterTextSplitter(**TEXT_SPLITTER_CHUNK_PARAMS)
+        super().__init__(text_splitter)
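
After this refactor the three chunkers differ only in the text splitter they construct; their create_chunks behavior comes entirely from BaseChunker, so callers can treat them interchangeably. A hedged sketch of that shared call pattern (the index_source helper and the collection object are illustrative assumptions, not part of this commit):

def index_source(chunker, loader, url, collection):
    # Works the same for PdfFileChunker, WebsiteChunker, or YoutubeVideoChunker,
    # since all of them now inherit create_chunks from BaseChunker.
    result = chunker.create_chunks(loader, url)
    # 'collection' is assumed to expose a Chroma-style add() taking parallel lists.
    collection.add(
        documents=result["documents"],
        ids=result["ids"],
        metadatas=result["metadatas"],
    )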