feat: added data format to metadata internally (#314)
This commit is contained in:
@@ -5,6 +5,7 @@ class BaseChunker:
|
||||
def __init__(self, text_splitter):
    """Initialize the chunker.

    Args:
        text_splitter: Splitter object kept on the instance as
            ``text_splitter``; its ``split_text(content)`` method is
            called elsewhere in this class to break content into chunks.
    """
    self.text_splitter = text_splitter
    # No data type is known at construction time; it stays None until
    # set_data_type() assigns one.
    self.data_type = None
|
||||
|
||||
def create_chunks(self, loader, src):
|
||||
"""
|
||||
@@ -22,7 +23,10 @@ class BaseChunker:
|
||||
metadatas = []
|
||||
for data in datas:
|
||||
content = data["content"]
|
||||
|
||||
meta_data = data["meta_data"]
|
||||
# Add the data type to the metadata so it can be used when querying.
|
||||
meta_data["data_type"] = self.data_type
|
||||
url = meta_data["url"]
|
||||
|
||||
chunks = self.get_chunks(content)
|
||||
@@ -47,3 +51,9 @@ class BaseChunker:
|
||||
Override in a child class if custom logic is needed.
|
||||
"""
|
||||
return self.text_splitter.split_text(content)
|
||||
|
||||
def set_data_type(self, data_type):
    """Set the data type of this chunker.

    The stored value is what ``create_chunks`` writes into each item's
    ``meta_data["data_type"]`` entry.

    Args:
        data_type: Value to store on the instance as ``data_type``.
    """
    self.data_type = data_type
|
||||
|
||||
@@ -69,6 +69,8 @@ class DataFormatter:
|
||||
"docs_site": DocsSiteChunker(config),
|
||||
}
|
||||
if data_type in chunkers:
|
||||
return chunkers[data_type]
|
||||
chunker = chunkers[data_type]
|
||||
chunker.set_data_type(data_type)
|
||||
return chunker
|
||||
else:
|
||||
raise ValueError(f"Unsupported data type: {data_type}")
|
||||
|
||||
Reference in New Issue
Block a user