diff --git a/docs/advanced/query_configuration.mdx b/docs/advanced/query_configuration.mdx index 42aabaa0..e726076e 100644 --- a/docs/advanced/query_configuration.mdx +++ b/docs/advanced/query_configuration.mdx @@ -36,6 +36,7 @@ Default values of chunker config parameters for different `data_type`: |web_page|500|0|len| |pdf_file|1000|0|len| |youtube_video|2000|0|len| +|docs_site|500|50|len| ### LoaderConfig diff --git a/embedchain/chunkers/docs_site.py b/embedchain/chunkers/docs_site.py index 18e2a42b..c84621c2 100644 --- a/embedchain/chunkers/docs_site.py +++ b/embedchain/chunkers/docs_site.py @@ -5,18 +5,16 @@ from langchain.text_splitter import RecursiveCharacterTextSplitter from embedchain.chunkers.base_chunker import BaseChunker from embedchain.config.AddConfig import ChunkerConfig -TEXT_SPLITTER_CHUNK_PARAMS = { - "chunk_size": 500, - "chunk_overlap": 50, - "length_function": len, -} - class DocsSiteChunker(BaseChunker): """Chunker for code docs site.""" def __init__(self, config: Optional[ChunkerConfig] = None): if config is None: - config = TEXT_SPLITTER_CHUNK_PARAMS - text_splitter = RecursiveCharacterTextSplitter(**config) + config = ChunkerConfig(chunk_size=500, chunk_overlap=50, length_function=len) + text_splitter = RecursiveCharacterTextSplitter( + chunk_size=config.chunk_size, + chunk_overlap=config.chunk_overlap, + length_function=config.length_function, + ) super().__init__(text_splitter)