diff --git a/embedchain/chunkers/base_chunker.py b/embedchain/chunkers/base_chunker.py new file mode 100644 index 00000000..bcd5a86d --- /dev/null +++ b/embedchain/chunkers/base_chunker.py @@ -0,0 +1,27 @@ +import hashlib + + +class BaseChunker: + def __init__(self, text_splitter): + self.text_splitter = text_splitter + + def create_chunks(self, loader, url): + documents = [] + ids = [] + datas = loader.load_data(url) + metadatas = [] + for data in datas: + content = data["content"] + meta_data = data["meta_data"] + chunks = self.text_splitter.split_text(content) + url = meta_data["url"] + for chunk in chunks: + chunk_id = hashlib.sha256((chunk + url).encode()).hexdigest() + ids.append(chunk_id) + documents.append(chunk) + metadatas.append(meta_data) + return { + "documents": documents, + "ids": ids, + "metadatas": metadatas, + } diff --git a/embedchain/chunkers/pdf_file.py b/embedchain/chunkers/pdf_file.py index ad760594..47a23c7a 100644 --- a/embedchain/chunkers/pdf_file.py +++ b/embedchain/chunkers/pdf_file.py @@ -1,4 +1,4 @@ -import hashlib +from embedchain.chunkers.base_chunker import BaseChunker from langchain.text_splitter import RecursiveCharacterTextSplitter @@ -9,28 +9,8 @@ TEXT_SPLITTER_CHUNK_PARAMS = { "length_function": len, } -TEXT_SPLITTER = RecursiveCharacterTextSplitter(**TEXT_SPLITTER_CHUNK_PARAMS) - -class PdfFileChunker: - - def create_chunks(self, loader, url): - documents = [] - ids = [] - datas = loader.load_data(url) - metadatas = [] - for data in datas: - content = data["content"] - meta_data = data["meta_data"] - chunks = TEXT_SPLITTER.split_text(content) - url = meta_data["url"] - for chunk in chunks: - chunk_id = hashlib.sha256((chunk + url).encode()).hexdigest() - ids.append(chunk_id) - documents.append(chunk) - metadatas.append(meta_data) - return { - "documents": documents, - "ids": ids, - "metadatas": metadatas, - } \ No newline at end of file +class PdfFileChunker(BaseChunker): + def __init__(self): + text_splitter = RecursiveCharacterTextSplitter(**TEXT_SPLITTER_CHUNK_PARAMS) + super().__init__(text_splitter) \ No newline at end of file diff --git a/embedchain/chunkers/website.py b/embedchain/chunkers/website.py index 6d70131d..090eb730 100644 --- a/embedchain/chunkers/website.py +++ b/embedchain/chunkers/website.py @@ -1,4 +1,4 @@ -import hashlib +from embedchain.chunkers.base_chunker import BaseChunker from langchain.text_splitter import RecursiveCharacterTextSplitter @@ -9,28 +9,8 @@ TEXT_SPLITTER_CHUNK_PARAMS = { "length_function": len, } -TEXT_SPLITTER = RecursiveCharacterTextSplitter(**TEXT_SPLITTER_CHUNK_PARAMS) - -class WebsiteChunker: - - def create_chunks(self, loader, url): - documents = [] - ids = [] - datas = loader.load_data(url) - metadatas = [] - for data in datas: - content = data["content"] - meta_data = data["meta_data"] - chunks = TEXT_SPLITTER.split_text(content) - url = meta_data["url"] - for chunk in chunks: - chunk_id = hashlib.sha256((chunk + url).encode()).hexdigest() - ids.append(chunk_id) - documents.append(chunk) - metadatas.append(meta_data) - return { - "documents": documents, - "ids": ids, - "metadatas": metadatas, - } \ No newline at end of file +class WebsiteChunker(BaseChunker): + def __init__(self): + text_splitter = RecursiveCharacterTextSplitter(**TEXT_SPLITTER_CHUNK_PARAMS) + super().__init__(text_splitter) diff --git a/embedchain/chunkers/youtube_video.py b/embedchain/chunkers/youtube_video.py index 6c4b1ae1..7435c02d 100644 --- a/embedchain/chunkers/youtube_video.py +++ b/embedchain/chunkers/youtube_video.py @@ -1,4 +1,4 @@ -import hashlib +from embedchain.chunkers.base_chunker import BaseChunker from langchain.text_splitter import RecursiveCharacterTextSplitter @@ -9,28 +9,8 @@ TEXT_SPLITTER_CHUNK_PARAMS = { "length_function": len, } -TEXT_SPLITTER = RecursiveCharacterTextSplitter(**TEXT_SPLITTER_CHUNK_PARAMS) - -class YoutubeVideoChunker: - - def create_chunks(self, loader, url): - documents = [] - ids = [] - datas = loader.load_data(url) - metadatas = [] - for data in datas: - content = data["content"] - meta_data = data["meta_data"] - chunks = TEXT_SPLITTER.split_text(content) - url = meta_data["url"] - for chunk in chunks: - chunk_id = hashlib.sha256((chunk + url).encode()).hexdigest() - ids.append(chunk_id) - documents.append(chunk) - metadatas.append(meta_data) - return { - "documents": documents, - "ids": ids, - "metadatas": metadatas, - } \ No newline at end of file +class YoutubeVideoChunker(BaseChunker): + def __init__(self): + text_splitter = RecursiveCharacterTextSplitter(**TEXT_SPLITTER_CHUNK_PARAMS) + super().__init__(text_splitter) \ No newline at end of file