Resolve conflicts (#208)
This commit is contained in:
@@ -3,15 +3,17 @@ import hashlib
|
||||
|
||||
class BaseChunker:
|
||||
def __init__(self, text_splitter):
|
||||
''' Initialize the chunker. '''
|
||||
"""Initialize the chunker."""
|
||||
self.text_splitter = text_splitter
|
||||
|
||||
def create_chunks(self, loader, src):
|
||||
"""
|
||||
Loads data and chunks it.
|
||||
|
||||
:param loader: The loader which's `load_data` method is used to create the raw data.
|
||||
:param src: The data to be handled by the loader. Can be a URL for remote sources or local content for local loaders.
|
||||
:param loader: The loader whose `load_data` method is used to create
|
||||
the raw data.
|
||||
:param src: The data to be handled by the loader. Can be a URL for
|
||||
remote sources or local content for local loaders.
|
||||
"""
|
||||
documents = []
|
||||
ids = []
|
||||
@@ -27,7 +29,7 @@ class BaseChunker:
|
||||
|
||||
for chunk in chunks:
|
||||
chunk_id = hashlib.sha256((chunk + url).encode()).hexdigest()
|
||||
if (idMap.get(chunk_id) is None):
|
||||
if idMap.get(chunk_id) is None:
|
||||
idMap[chunk_id] = True
|
||||
ids.append(chunk_id)
|
||||
documents.append(chunk)
|
||||
|
||||
@@ -1,10 +1,9 @@
|
||||
from typing import Optional
|
||||
from embedchain.chunkers.base_chunker import BaseChunker
|
||||
from embedchain.config.AddConfig import ChunkerConfig
|
||||
|
||||
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
||||
|
||||
|
||||
from embedchain.chunkers.base_chunker import BaseChunker
|
||||
from embedchain.config.AddConfig import ChunkerConfig
|
||||
|
||||
TEXT_SPLITTER_CHUNK_PARAMS = {
|
||||
"chunk_size": 1000,
|
||||
@@ -14,7 +13,8 @@ TEXT_SPLITTER_CHUNK_PARAMS = {
|
||||
|
||||
|
||||
class DocxFileChunker(BaseChunker):
|
||||
''' Chunker for .docx file. '''
|
||||
"""Chunker for .docx file."""
|
||||
|
||||
def __init__(self, config: Optional[ChunkerConfig] = None):
|
||||
if config is None:
|
||||
config = TEXT_SPLITTER_CHUNK_PARAMS
|
||||
|
||||
@@ -1,9 +1,9 @@
|
||||
from typing import Optional
|
||||
from embedchain.chunkers.base_chunker import BaseChunker
|
||||
from embedchain.config.AddConfig import ChunkerConfig
|
||||
|
||||
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
||||
|
||||
from embedchain.chunkers.base_chunker import BaseChunker
|
||||
from embedchain.config.AddConfig import ChunkerConfig
|
||||
|
||||
TEXT_SPLITTER_CHUNK_PARAMS = {
|
||||
"chunk_size": 1000,
|
||||
@@ -13,7 +13,8 @@ TEXT_SPLITTER_CHUNK_PARAMS = {
|
||||
|
||||
|
||||
class PdfFileChunker(BaseChunker):
|
||||
''' Chunker for PDF file. '''
|
||||
"""Chunker for PDF file."""
|
||||
|
||||
def __init__(self, config: Optional[ChunkerConfig] = None):
|
||||
if config is None:
|
||||
config = TEXT_SPLITTER_CHUNK_PARAMS
|
||||
|
||||
@@ -1,9 +1,9 @@
|
||||
from typing import Optional
|
||||
from embedchain.chunkers.base_chunker import BaseChunker
|
||||
from embedchain.config.AddConfig import ChunkerConfig
|
||||
|
||||
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
||||
|
||||
from embedchain.chunkers.base_chunker import BaseChunker
|
||||
from embedchain.config.AddConfig import ChunkerConfig
|
||||
|
||||
TEXT_SPLITTER_CHUNK_PARAMS = {
|
||||
"chunk_size": 300,
|
||||
@@ -13,7 +13,8 @@ TEXT_SPLITTER_CHUNK_PARAMS = {
|
||||
|
||||
|
||||
class QnaPairChunker(BaseChunker):
|
||||
''' Chunker for QnA pair. '''
|
||||
"""Chunker for QnA pair."""
|
||||
|
||||
def __init__(self, config: Optional[ChunkerConfig] = None):
|
||||
if config is None:
|
||||
config = TEXT_SPLITTER_CHUNK_PARAMS
|
||||
|
||||
@@ -1,9 +1,9 @@
|
||||
from typing import Optional
|
||||
from embedchain.chunkers.base_chunker import BaseChunker
|
||||
from embedchain.config.AddConfig import ChunkerConfig
|
||||
|
||||
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
||||
|
||||
from embedchain.chunkers.base_chunker import BaseChunker
|
||||
from embedchain.config.AddConfig import ChunkerConfig
|
||||
|
||||
TEXT_SPLITTER_CHUNK_PARAMS = {
|
||||
"chunk_size": 300,
|
||||
@@ -13,7 +13,8 @@ TEXT_SPLITTER_CHUNK_PARAMS = {
|
||||
|
||||
|
||||
class TextChunker(BaseChunker):
|
||||
''' Chunker for text. '''
|
||||
"""Chunker for text."""
|
||||
|
||||
def __init__(self, config: Optional[ChunkerConfig] = None):
|
||||
if config is None:
|
||||
config = TEXT_SPLITTER_CHUNK_PARAMS
|
||||
|
||||
@@ -1,9 +1,9 @@
|
||||
from typing import Optional
|
||||
from embedchain.chunkers.base_chunker import BaseChunker
|
||||
from embedchain.config.AddConfig import ChunkerConfig
|
||||
|
||||
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
||||
|
||||
from embedchain.chunkers.base_chunker import BaseChunker
|
||||
from embedchain.config.AddConfig import ChunkerConfig
|
||||
|
||||
TEXT_SPLITTER_CHUNK_PARAMS = {
|
||||
"chunk_size": 500,
|
||||
@@ -13,7 +13,8 @@ TEXT_SPLITTER_CHUNK_PARAMS = {
|
||||
|
||||
|
||||
class WebPageChunker(BaseChunker):
|
||||
''' Chunker for web page. '''
|
||||
"""Chunker for web page."""
|
||||
|
||||
def __init__(self, config: Optional[ChunkerConfig] = None):
|
||||
if config is None:
|
||||
config = TEXT_SPLITTER_CHUNK_PARAMS
|
||||
|
||||
@@ -1,9 +1,9 @@
|
||||
from typing import Optional
|
||||
from embedchain.chunkers.base_chunker import BaseChunker
|
||||
from embedchain.config.AddConfig import ChunkerConfig
|
||||
|
||||
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
||||
|
||||
from embedchain.chunkers.base_chunker import BaseChunker
|
||||
from embedchain.config.AddConfig import ChunkerConfig
|
||||
|
||||
TEXT_SPLITTER_CHUNK_PARAMS = {
|
||||
"chunk_size": 2000,
|
||||
@@ -13,7 +13,8 @@ TEXT_SPLITTER_CHUNK_PARAMS = {
|
||||
|
||||
|
||||
class YoutubeVideoChunker(BaseChunker):
|
||||
''' Chunker for Youtube video. '''
|
||||
"""Chunker for YouTube video."""
|
||||
|
||||
def __init__(self, config: Optional[ChunkerConfig] = None):
|
||||
if config is None:
|
||||
config = TEXT_SPLITTER_CHUNK_PARAMS
|
||||
|
||||
Reference in New Issue
Block a user