fix: docs_site use chunker config implementation (#326)

This commit is contained in:
cachho
2023-07-20 08:29:59 +02:00
committed by GitHub
parent 4bb06147c1
commit a681d47bce
2 changed files with 7 additions and 8 deletions

View File

@@ -36,6 +36,7 @@ Default values of chunker config parameters for different `data_type`:
|web_page|500|0|len|
|pdf_file|1000|0|len|
|youtube_video|2000|0|len|
|docs_site|500|50|len|
### LoaderConfig

View File

@@ -5,18 +5,16 @@ from langchain.text_splitter import RecursiveCharacterTextSplitter
from embedchain.chunkers.base_chunker import BaseChunker
from embedchain.config.AddConfig import ChunkerConfig
# Keyword arguments for the text splitter: chunk size, overlap between
# consecutive chunks, and the callable used to measure text length.
TEXT_SPLITTER_CHUNK_PARAMS = dict(
    chunk_size=500,
    chunk_overlap=50,
    length_function=len,
)
class DocsSiteChunker(BaseChunker):
    """Chunker for code docs site."""

    def __init__(self, config: Optional[ChunkerConfig] = None):
        """Create the chunker and its underlying text splitter.

        Args:
            config: Chunking parameters. When ``None``, a ``ChunkerConfig``
                with ``chunk_size=500``, ``chunk_overlap=50`` and
                ``length_function=len`` is used.
        """
        # Only fall back to the defaults when the caller gave no config, so a
        # caller-supplied config is never overwritten.
        if config is None:
            config = ChunkerConfig(chunk_size=500, chunk_overlap=50, length_function=len)
        # Build the splitter once, passing each setting explicitly from the
        # config's attributes.
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=config.chunk_size,
            chunk_overlap=config.chunk_overlap,
            length_function=config.length_function,
        )
        super().__init__(text_splitter)