From a681d47bce53e245c44c0d14d4aa241417e7169c Mon Sep 17 00:00:00 2001 From: cachho Date: Thu, 20 Jul 2023 08:29:59 +0200 Subject: [PATCH] fix: `docs_site` use chunker config implementation (#326) --- docs/advanced/query_configuration.mdx | 1 + embedchain/chunkers/docs_site.py | 14 ++++++-------- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/docs/advanced/query_configuration.mdx b/docs/advanced/query_configuration.mdx index 42aabaa0..e726076e 100644 --- a/docs/advanced/query_configuration.mdx +++ b/docs/advanced/query_configuration.mdx @@ -36,6 +36,7 @@ Default values of chunker config parameters for different `data_type`: |web_page|500|0|len| |pdf_file|1000|0|len| |youtube_video|2000|0|len| +|docs_site|500|50|len| ### LoaderConfig diff --git a/embedchain/chunkers/docs_site.py b/embedchain/chunkers/docs_site.py index 18e2a42b..c84621c2 100644 --- a/embedchain/chunkers/docs_site.py +++ b/embedchain/chunkers/docs_site.py @@ -5,18 +5,16 @@ from langchain.text_splitter import RecursiveCharacterTextSplitter from embedchain.chunkers.base_chunker import BaseChunker from embedchain.config.AddConfig import ChunkerConfig -TEXT_SPLITTER_CHUNK_PARAMS = { - "chunk_size": 500, - "chunk_overlap": 50, - "length_function": len, -} - class DocsSiteChunker(BaseChunker): """Chunker for code docs site.""" def __init__(self, config: Optional[ChunkerConfig] = None): if config is None: - config = TEXT_SPLITTER_CHUNK_PARAMS - text_splitter = RecursiveCharacterTextSplitter(**config) + config = ChunkerConfig(chunk_size=500, chunk_overlap=50, length_function=len) + text_splitter = RecursiveCharacterTextSplitter( + chunk_size=config.chunk_size, + chunk_overlap=config.chunk_overlap, + length_function=config.length_function, + ) super().__init__(text_splitter)