[Improvement] Add support for min chunk size (#1007)
This commit is contained in:
@@ -1,4 +1,5 @@
|
||||
import builtins
|
||||
import logging
|
||||
from importlib import import_module
|
||||
from typing import Callable, Optional
|
||||
|
||||
@@ -14,12 +15,21 @@ class ChunkerConfig(BaseConfig):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
chunk_size: Optional[int] = None,
|
||||
chunk_overlap: Optional[int] = None,
|
||||
chunk_size: Optional[int] = 2000,
|
||||
chunk_overlap: Optional[int] = 0,
|
||||
length_function: Optional[Callable[[str], int]] = None,
|
||||
min_chunk_size: Optional[int] = 0,
|
||||
):
|
||||
self.chunk_size = chunk_size if chunk_size else 2000
|
||||
self.chunk_overlap = chunk_overlap if chunk_overlap else 0
|
||||
self.chunk_size = chunk_size
|
||||
self.chunk_overlap = chunk_overlap
|
||||
self.min_chunk_size = min_chunk_size
|
||||
if self.min_chunk_size >= self.chunk_size:
|
||||
raise ValueError(f"min_chunk_size {min_chunk_size} should be less than chunk_size {chunk_size}")
|
||||
if self.min_chunk_size <= self.chunk_overlap:
|
||||
logging.warn(
|
||||
f"min_chunk_size {min_chunk_size} should be greater than chunk_overlap {chunk_overlap}, otherwise it is redundant." # noqa:E501
|
||||
)
|
||||
|
||||
if isinstance(length_function, str):
|
||||
self.length_function = self.load_func(length_function)
|
||||
else:
|
||||
@@ -37,7 +47,7 @@ class ChunkerConfig(BaseConfig):
|
||||
@register_deserializable
|
||||
class LoaderConfig(BaseConfig):
|
||||
"""
|
||||
Config for the chunker used in `add` method
|
||||
Config for the loader used in `add` method
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
|
||||
Reference in New Issue
Block a user