[Improvement] Add support for min chunk size (#1007)

This commit is contained in:
Deven Patel
2023-12-15 05:59:15 +05:30
committed by GitHub
parent 9303a1bf81
commit c0ee680546
11 changed files with 59 additions and 25 deletions

View File

@@ -1,4 +1,5 @@
import builtins
import logging
from importlib import import_module
from typing import Callable, Optional
@@ -14,12 +15,21 @@ class ChunkerConfig(BaseConfig):
def __init__(
self,
chunk_size: Optional[int] = None,
chunk_overlap: Optional[int] = None,
chunk_size: Optional[int] = 2000,
chunk_overlap: Optional[int] = 0,
length_function: Optional[Callable[[str], int]] = None,
min_chunk_size: Optional[int] = 0,
):
self.chunk_size = chunk_size if chunk_size else 2000
self.chunk_overlap = chunk_overlap if chunk_overlap else 0
self.chunk_size = chunk_size
self.chunk_overlap = chunk_overlap
self.min_chunk_size = min_chunk_size
if self.min_chunk_size >= self.chunk_size:
raise ValueError(f"min_chunk_size {min_chunk_size} should be less than chunk_size {chunk_size}")
if self.min_chunk_size <= self.chunk_overlap:
logging.warn(
f"min_chunk_size {min_chunk_size} should be greater than chunk_overlap {chunk_overlap}, otherwise it is redundant." # noqa:E501
)
if isinstance(length_function, str):
self.length_function = self.load_func(length_function)
else:
@@ -37,7 +47,7 @@ class ChunkerConfig(BaseConfig):
@register_deserializable
class LoaderConfig(BaseConfig):
"""
Config for the chunker used in `add` method
Config for the loader used in `add` method
"""
def __init__(self):