fix: docs_site use chunker config implementation (#326)
This commit is contained in:
@@ -36,6 +36,7 @@ Default values of chunker config parameters for different `data_type`:
|
|||||||
|web_page|500|0|len|
|
|web_page|500|0|len|
|
||||||
|pdf_file|1000|0|len|
|
|pdf_file|1000|0|len|
|
||||||
|youtube_video|2000|0|len|
|
|youtube_video|2000|0|len|
|
||||||
|
|docs_site|500|50|len|
|
||||||
|
|
||||||
### LoaderConfig
|
### LoaderConfig
|
||||||
|
|
||||||
|
|||||||
@@ -5,18 +5,16 @@ from langchain.text_splitter import RecursiveCharacterTextSplitter
|
|||||||
from embedchain.chunkers.base_chunker import BaseChunker
|
from embedchain.chunkers.base_chunker import BaseChunker
|
||||||
from embedchain.config.AddConfig import ChunkerConfig
|
from embedchain.config.AddConfig import ChunkerConfig
|
||||||
|
|
||||||
TEXT_SPLITTER_CHUNK_PARAMS = {
|
|
||||||
"chunk_size": 500,
|
|
||||||
"chunk_overlap": 50,
|
|
||||||
"length_function": len,
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
class DocsSiteChunker(BaseChunker):
|
class DocsSiteChunker(BaseChunker):
|
||||||
"""Chunker for code docs site."""
|
"""Chunker for code docs site."""
|
||||||
|
|
||||||
def __init__(self, config: Optional[ChunkerConfig] = None):
|
def __init__(self, config: Optional[ChunkerConfig] = None):
|
||||||
if config is None:
|
if config is None:
|
||||||
config = TEXT_SPLITTER_CHUNK_PARAMS
|
config = ChunkerConfig(chunk_size=500, chunk_overlap=50, length_function=len)
|
||||||
text_splitter = RecursiveCharacterTextSplitter(**config)
|
text_splitter = RecursiveCharacterTextSplitter(
|
||||||
|
chunk_size=config.chunk_size,
|
||||||
|
chunk_overlap=config.chunk_overlap,
|
||||||
|
length_function=config.length_function,
|
||||||
|
)
|
||||||
super().__init__(text_splitter)
|
super().__init__(text_splitter)
|
||||||
|
|||||||
Reference in New Issue
Block a user