feat: Add new data type: code_docs_loader (#274)

This commit is contained in:
Taranjeet Singh
2023-07-15 09:02:11 +05:30
committed by GitHub
parent cd0c7bc971
commit 86e4146126
7 changed files with 133 additions and 6 deletions

View File

@@ -25,7 +25,7 @@ class BaseChunker:
meta_data = data["meta_data"]
url = meta_data["url"]
chunks = self.text_splitter.split_text(content)
chunks = self.get_chunks(content)
for chunk in chunks:
chunk_id = hashlib.sha256((chunk + url).encode()).hexdigest()
@@ -39,3 +39,11 @@ class BaseChunker:
"ids": ids,
"metadatas": metadatas,
}
def get_chunks(self, content):
    """
    Split ``content`` into chunks using this chunker's text splitter.

    Subclasses that need custom chunking logic can override this method.
    """
    splitter = self.text_splitter
    return splitter.split_text(content)

View File

@@ -0,0 +1,22 @@
from typing import Optional
from langchain.text_splitter import RecursiveCharacterTextSplitter
from embedchain.chunkers.base_chunker import BaseChunker
from embedchain.config.AddConfig import ChunkerConfig
# Default keyword arguments for the text splitter; used by the chunker below
# whenever it is constructed without an explicit config.
TEXT_SPLITTER_CHUNK_PARAMS = {
    "chunk_size": 500,
    "chunk_overlap": 50,
    # Lengths are measured with the builtin ``len``.
    "length_function": len,
}
class CodeDocsPageChunker(BaseChunker):
    """Chunker for code docs page."""

    def __init__(self, config: Optional[ChunkerConfig] = None):
        """
        Build the chunker around a ``RecursiveCharacterTextSplitter``.

        :param config: Splitter settings. It is unpacked with ``**`` into the
            splitter constructor, so it must support keyword unpacking. When
            ``None``, ``TEXT_SPLITTER_CHUNK_PARAMS`` is used instead.
        """
        splitter_kwargs = TEXT_SPLITTER_CHUNK_PARAMS if config is None else config
        super().__init__(RecursiveCharacterTextSplitter(**splitter_kwargs))