feat: added data format to metadata internally (#314)

This commit is contained in:
aaishikdutta
2023-07-19 05:35:43 +05:30
committed by GitHub
parent d16eafae05
commit c12362486f
2 changed files with 13 additions and 1 deletions

View File

@@ -5,6 +5,7 @@ class BaseChunker:
def __init__(self, text_splitter):
"""Initialize the chunker."""
self.text_splitter = text_splitter
self.data_type = None
def create_chunks(self, loader, src):
"""
@@ -22,7 +23,10 @@ class BaseChunker:
metadatas = []
for data in datas:
content = data["content"]
meta_data = data["meta_data"]
# add data type to meta data to allow query using data type
meta_data["data_type"] = self.data_type
url = meta_data["url"]
chunks = self.get_chunks(content)
@@ -47,3 +51,9 @@ class BaseChunker:
Override in child class if custom logic.
"""
return self.text_splitter.split_text(content)
def set_data_type(self, data_type):
    """
    Record which data type this chunker handles.

    The stored value is what `create_chunks` writes into each item's
    `meta_data["data_type"]`.

    :param data_type: value to keep on `self.data_type`.
    """
    self.data_type = data_type

View File

@@ -69,6 +69,8 @@ class DataFormatter:
"docs_site": DocsSiteChunker(config),
}
if data_type in chunkers:
return chunkers[data_type]
chunker = chunkers[data_type]
chunker.set_data_type(data_type)
return chunker
else:
raise ValueError(f"Unsupported data type: {data_type}")