From c12362486ff8aa7a23fc850f97dd7a17f71773b8 Mon Sep 17 00:00:00 2001 From: aaishikdutta <107566376+aaishikdutta@users.noreply.github.com> Date: Wed, 19 Jul 2023 05:35:43 +0530 Subject: [PATCH] feat: added data format to metadata internally (#314) --- embedchain/chunkers/base_chunker.py | 10 ++++++++++ embedchain/data_formatter/data_formatter.py | 4 +++- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/embedchain/chunkers/base_chunker.py b/embedchain/chunkers/base_chunker.py index e2334488..e44366b1 100644 --- a/embedchain/chunkers/base_chunker.py +++ b/embedchain/chunkers/base_chunker.py @@ -5,6 +5,7 @@ class BaseChunker: def __init__(self, text_splitter): """Initialize the chunker.""" self.text_splitter = text_splitter + self.data_type = None def create_chunks(self, loader, src): """ @@ -22,7 +23,10 @@ class BaseChunker: metadatas = [] for data in datas: content = data["content"] + meta_data = data["meta_data"] + # add data type to meta data to allow query using data type + meta_data["data_type"] = self.data_type url = meta_data["url"] chunks = self.get_chunks(content) @@ -47,3 +51,9 @@ class BaseChunker: Override in child class if custom logic. """ return self.text_splitter.split_text(content) + + def set_data_type(self, data_type): + """ + set the data type of chunker + """ + self.data_type = data_type diff --git a/embedchain/data_formatter/data_formatter.py b/embedchain/data_formatter/data_formatter.py index 3e941fea..fb878e54 100644 --- a/embedchain/data_formatter/data_formatter.py +++ b/embedchain/data_formatter/data_formatter.py @@ -69,6 +69,8 @@ class DataFormatter: "docs_site": DocsSiteChunker(config), } if data_type in chunkers: - return chunkers[data_type] + chunker = chunkers[data_type] + chunker.set_data_type(data_type) + return chunker else: raise ValueError(f"Unsupported data type: {data_type}")