feat: Add new data type: code_docs_loader (#274)
This commit is contained in:
@@ -25,7 +25,7 @@ class BaseChunker:
|
||||
meta_data = data["meta_data"]
|
||||
url = meta_data["url"]
|
||||
|
||||
chunks = self.text_splitter.split_text(content)
|
||||
chunks = self.get_chunks(content)
|
||||
|
||||
for chunk in chunks:
|
||||
chunk_id = hashlib.sha256((chunk + url).encode()).hexdigest()
|
||||
@@ -39,3 +39,11 @@ class BaseChunker:
|
||||
"ids": ids,
|
||||
"metadatas": metadatas,
|
||||
}
|
||||
|
||||
def get_chunks(self, content):
    """
    Split ``content`` into chunks using the text splitter instance.

    This is the default chunking strategy: it simply delegates to
    ``self.text_splitter.split_text``. Override in a child class if
    custom chunking logic is required.

    :param content: the loaded content to split; it is passed unchanged
        to the text splitter.
    :return: whatever ``self.text_splitter.split_text(content)`` returns
        (presumably a list of strings -- confirm against the splitter used).
    """
    return self.text_splitter.split_text(content)
|
||||
22
embedchain/chunkers/code_docs_page.py
Normal file
22
embedchain/chunkers/code_docs_page.py
Normal file
@@ -0,0 +1,22 @@
|
||||
from typing import Optional
|
||||
|
||||
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
||||
|
||||
from embedchain.chunkers.base_chunker import BaseChunker
|
||||
from embedchain.config.AddConfig import ChunkerConfig
|
||||
|
||||
# Default keyword arguments handed to the text splitter when the caller
# supplies no chunker config.
TEXT_SPLITTER_CHUNK_PARAMS = {
    "chunk_size": 500,
    "chunk_overlap": 50,
    "length_function": len,
}
|
||||
|
||||
|
||||
class CodeDocsPageChunker(BaseChunker):
    """Chunker for code docs page."""

    def __init__(self, config: Optional[ChunkerConfig] = None):
        """
        Build a ``RecursiveCharacterTextSplitter`` and hand it to ``BaseChunker``.

        :param config: splitter settings; when ``None`` the module-level
            ``TEXT_SPLITTER_CHUNK_PARAMS`` defaults are used.
        """
        if config is None:
            config = TEXT_SPLITTER_CHUNK_PARAMS
        # NOTE(review): ``config`` is unpacked with ``**``, so it must behave
        # like a mapping of keyword arguments even though it is annotated as
        # ``ChunkerConfig`` -- confirm what callers actually pass.
        text_splitter = RecursiveCharacterTextSplitter(**config)
        super().__init__(text_splitter)
|
||||
Reference in New Issue
Block a user