Added documentation (#219)
This commit is contained in:
@@ -3,6 +3,7 @@ import hashlib
|
||||
|
||||
class BaseChunker:
|
||||
def __init__(self, text_splitter):
|
||||
''' Initialize the chunker. '''
|
||||
self.text_splitter = text_splitter
|
||||
|
||||
def create_chunks(self, loader, src):
|
||||
|
||||
@@ -14,6 +14,7 @@ TEXT_SPLITTER_CHUNK_PARAMS = {
|
||||
|
||||
|
||||
class DocxFileChunker(BaseChunker):
|
||||
''' Chunker for .docx file. '''
|
||||
def __init__(self, config: Optional[ChunkerConfig] = None):
|
||||
if config is None:
|
||||
config = TEXT_SPLITTER_CHUNK_PARAMS
|
||||
|
||||
@@ -13,6 +13,7 @@ TEXT_SPLITTER_CHUNK_PARAMS = {
|
||||
|
||||
|
||||
class PdfFileChunker(BaseChunker):
|
||||
''' Chunker for PDF file. '''
|
||||
def __init__(self, config: Optional[ChunkerConfig] = None):
|
||||
if config is None:
|
||||
config = TEXT_SPLITTER_CHUNK_PARAMS
|
||||
|
||||
@@ -13,6 +13,7 @@ TEXT_SPLITTER_CHUNK_PARAMS = {
|
||||
|
||||
|
||||
class QnaPairChunker(BaseChunker):
|
||||
''' Chunker for QnA pair. '''
|
||||
def __init__(self, config: Optional[ChunkerConfig] = None):
|
||||
if config is None:
|
||||
config = TEXT_SPLITTER_CHUNK_PARAMS
|
||||
|
||||
@@ -13,6 +13,7 @@ TEXT_SPLITTER_CHUNK_PARAMS = {
|
||||
|
||||
|
||||
class TextChunker(BaseChunker):
|
||||
''' Chunker for text. '''
|
||||
def __init__(self, config: Optional[ChunkerConfig] = None):
|
||||
if config is None:
|
||||
config = TEXT_SPLITTER_CHUNK_PARAMS
|
||||
|
||||
@@ -13,6 +13,7 @@ TEXT_SPLITTER_CHUNK_PARAMS = {
|
||||
|
||||
|
||||
class WebPageChunker(BaseChunker):
|
||||
''' Chunker for web page. '''
|
||||
def __init__(self, config: Optional[ChunkerConfig] = None):
|
||||
if config is None:
|
||||
config = TEXT_SPLITTER_CHUNK_PARAMS
|
||||
|
||||
@@ -13,6 +13,7 @@ TEXT_SPLITTER_CHUNK_PARAMS = {
|
||||
|
||||
|
||||
class YoutubeVideoChunker(BaseChunker):
|
||||
''' Chunker for YouTube video. '''
|
||||
def __init__(self, config: Optional[ChunkerConfig] = None):
|
||||
if config is None:
|
||||
config = TEXT_SPLITTER_CHUNK_PARAMS
|
||||
|
||||
@@ -2,6 +2,7 @@ from langchain.document_loaders import Docx2txtLoader
|
||||
|
||||
class DocxFileLoader:
|
||||
def load_data(self, url):
|
||||
''' Load data from a .docx file. '''
|
||||
loader = Docx2txtLoader(url)
|
||||
output = []
|
||||
data = loader.load()
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
class LocalQnaPairLoader:
|
||||
|
||||
def load_data(self, content):
|
||||
''' Load data from a local QnA pair. '''
|
||||
question, answer = content
|
||||
content = f"Q: {question}\nA: {answer}"
|
||||
meta_data = {
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
class LocalTextLoader:
|
||||
|
||||
def load_data(self, content):
|
||||
''' Load data from a local text file. '''
|
||||
meta_data = {
|
||||
"url": "local",
|
||||
}
|
||||
|
||||
@@ -6,6 +6,7 @@ from embedchain.utils import clean_string
|
||||
class PdfFileLoader:
|
||||
|
||||
def load_data(self, url):
|
||||
''' Load data from a PDF file. '''
|
||||
loader = PyPDFLoader(url)
|
||||
output = []
|
||||
pages = loader.load_and_split()
|
||||
|
||||
@@ -8,6 +8,7 @@ from embedchain.utils import clean_string
|
||||
class WebPageLoader:
|
||||
|
||||
def load_data(self, url):
|
||||
''' Load data from a web page. '''
|
||||
response = requests.get(url)
|
||||
data = response.content
|
||||
soup = BeautifulSoup(data, 'html.parser')
|
||||
|
||||
@@ -6,6 +6,7 @@ from embedchain.utils import clean_string
|
||||
class YoutubeVideoLoader:
|
||||
|
||||
def load_data(self, url):
|
||||
''' Load data from a YouTube video. '''
|
||||
loader = YoutubeLoader.from_youtube_url(url, add_video_info=True)
|
||||
doc = loader.load()
|
||||
output = []
|
||||
|
||||
@@ -1,10 +1,13 @@
|
||||
class BaseVectorDB:
|
||||
''' Base class for vector database. '''
|
||||
|
||||
def __init__(self):
|
||||
self.client = self._get_or_create_db()
|
||||
self.collection = self._get_or_create_collection()
|
||||
|
||||
def _get_or_create_db(self):
|
||||
''' Get or create the database. '''
|
||||
raise NotImplementedError
|
||||
|
||||
def _get_or_create_collection(self):
|
||||
raise NotImplementedError
|
||||
raise NotImplementedError
|
||||
|
||||
@@ -7,6 +7,8 @@ from embedchain.vectordb.base_vector_db import BaseVectorDB
|
||||
|
||||
|
||||
class ChromaDB(BaseVectorDB):
|
||||
''' Vector database using ChromaDB. '''
|
||||
|
||||
def __init__(self, db_dir=None, ef=None):
|
||||
if ef:
|
||||
self.ef = ef
|
||||
@@ -26,9 +28,11 @@ class ChromaDB(BaseVectorDB):
|
||||
super().__init__()
|
||||
|
||||
def _get_or_create_db(self):
|
||||
''' Get or create the database. '''
|
||||
return chromadb.Client(self.client_settings)
|
||||
|
||||
def _get_or_create_collection(self):
|
||||
''' Get or create the collection. '''
|
||||
return self.client.get_or_create_collection(
|
||||
'embedchain_store', embedding_function=self.ef,
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user