Added documentation (#219)
This commit is contained in:
@@ -3,6 +3,7 @@ import hashlib
|
|||||||
|
|
||||||
class BaseChunker:
|
class BaseChunker:
|
||||||
def __init__(self, text_splitter):
|
def __init__(self, text_splitter):
|
||||||
|
''' Initialize the chunker. '''
|
||||||
self.text_splitter = text_splitter
|
self.text_splitter = text_splitter
|
||||||
|
|
||||||
def create_chunks(self, loader, src):
|
def create_chunks(self, loader, src):
|
||||||
|
|||||||
@@ -14,6 +14,7 @@ TEXT_SPLITTER_CHUNK_PARAMS = {
|
|||||||
|
|
||||||
|
|
||||||
class DocxFileChunker(BaseChunker):
|
class DocxFileChunker(BaseChunker):
|
||||||
|
''' Chunker for .docx file. '''
|
||||||
def __init__(self, config: Optional[ChunkerConfig] = None):
|
def __init__(self, config: Optional[ChunkerConfig] = None):
|
||||||
if config is None:
|
if config is None:
|
||||||
config = TEXT_SPLITTER_CHUNK_PARAMS
|
config = TEXT_SPLITTER_CHUNK_PARAMS
|
||||||
|
|||||||
@@ -13,6 +13,7 @@ TEXT_SPLITTER_CHUNK_PARAMS = {
|
|||||||
|
|
||||||
|
|
||||||
class PdfFileChunker(BaseChunker):
|
class PdfFileChunker(BaseChunker):
|
||||||
|
''' Chunker for PDF file. '''
|
||||||
def __init__(self, config: Optional[ChunkerConfig] = None):
|
def __init__(self, config: Optional[ChunkerConfig] = None):
|
||||||
if config is None:
|
if config is None:
|
||||||
config = TEXT_SPLITTER_CHUNK_PARAMS
|
config = TEXT_SPLITTER_CHUNK_PARAMS
|
||||||
|
|||||||
@@ -13,6 +13,7 @@ TEXT_SPLITTER_CHUNK_PARAMS = {
|
|||||||
|
|
||||||
|
|
||||||
class QnaPairChunker(BaseChunker):
|
class QnaPairChunker(BaseChunker):
|
||||||
|
''' Chunker for QnA pair. '''
|
||||||
def __init__(self, config: Optional[ChunkerConfig] = None):
|
def __init__(self, config: Optional[ChunkerConfig] = None):
|
||||||
if config is None:
|
if config is None:
|
||||||
config = TEXT_SPLITTER_CHUNK_PARAMS
|
config = TEXT_SPLITTER_CHUNK_PARAMS
|
||||||
|
|||||||
@@ -13,6 +13,7 @@ TEXT_SPLITTER_CHUNK_PARAMS = {
|
|||||||
|
|
||||||
|
|
||||||
class TextChunker(BaseChunker):
|
class TextChunker(BaseChunker):
|
||||||
|
''' Chunker for text. '''
|
||||||
def __init__(self, config: Optional[ChunkerConfig] = None):
|
def __init__(self, config: Optional[ChunkerConfig] = None):
|
||||||
if config is None:
|
if config is None:
|
||||||
config = TEXT_SPLITTER_CHUNK_PARAMS
|
config = TEXT_SPLITTER_CHUNK_PARAMS
|
||||||
|
|||||||
@@ -13,6 +13,7 @@ TEXT_SPLITTER_CHUNK_PARAMS = {
|
|||||||
|
|
||||||
|
|
||||||
class WebPageChunker(BaseChunker):
|
class WebPageChunker(BaseChunker):
|
||||||
|
''' Chunker for web page. '''
|
||||||
def __init__(self, config: Optional[ChunkerConfig] = None):
|
def __init__(self, config: Optional[ChunkerConfig] = None):
|
||||||
if config is None:
|
if config is None:
|
||||||
config = TEXT_SPLITTER_CHUNK_PARAMS
|
config = TEXT_SPLITTER_CHUNK_PARAMS
|
||||||
|
|||||||
@@ -13,6 +13,7 @@ TEXT_SPLITTER_CHUNK_PARAMS = {
|
|||||||
|
|
||||||
|
|
||||||
class YoutubeVideoChunker(BaseChunker):
|
class YoutubeVideoChunker(BaseChunker):
|
||||||
|
''' Chunker for Youtube video. '''
|
||||||
def __init__(self, config: Optional[ChunkerConfig] = None):
|
def __init__(self, config: Optional[ChunkerConfig] = None):
|
||||||
if config is None:
|
if config is None:
|
||||||
config = TEXT_SPLITTER_CHUNK_PARAMS
|
config = TEXT_SPLITTER_CHUNK_PARAMS
|
||||||
|
|||||||
@@ -2,6 +2,7 @@ from langchain.document_loaders import Docx2txtLoader
|
|||||||
|
|
||||||
class DocxFileLoader:
|
class DocxFileLoader:
|
||||||
def load_data(self, url):
|
def load_data(self, url):
|
||||||
|
''' Load data from a .docx file. '''
|
||||||
loader = Docx2txtLoader(url)
|
loader = Docx2txtLoader(url)
|
||||||
output = []
|
output = []
|
||||||
data = loader.load()
|
data = loader.load()
|
||||||
|
|||||||
@@ -1,6 +1,7 @@
|
|||||||
class LocalQnaPairLoader:
|
class LocalQnaPairLoader:
|
||||||
|
|
||||||
def load_data(self, content):
|
def load_data(self, content):
|
||||||
|
''' Load data from a local QnA pair. '''
|
||||||
question, answer = content
|
question, answer = content
|
||||||
content = f"Q: {question}\nA: {answer}"
|
content = f"Q: {question}\nA: {answer}"
|
||||||
meta_data = {
|
meta_data = {
|
||||||
|
|||||||
@@ -1,6 +1,7 @@
|
|||||||
class LocalTextLoader:
|
class LocalTextLoader:
|
||||||
|
|
||||||
def load_data(self, content):
|
def load_data(self, content):
|
||||||
|
''' Load data from a local text file. '''
|
||||||
meta_data = {
|
meta_data = {
|
||||||
"url": "local",
|
"url": "local",
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -6,6 +6,7 @@ from embedchain.utils import clean_string
|
|||||||
class PdfFileLoader:
|
class PdfFileLoader:
|
||||||
|
|
||||||
def load_data(self, url):
|
def load_data(self, url):
|
||||||
|
''' Load data from a PDF file. '''
|
||||||
loader = PyPDFLoader(url)
|
loader = PyPDFLoader(url)
|
||||||
output = []
|
output = []
|
||||||
pages = loader.load_and_split()
|
pages = loader.load_and_split()
|
||||||
|
|||||||
@@ -8,6 +8,7 @@ from embedchain.utils import clean_string
|
|||||||
class WebPageLoader:
|
class WebPageLoader:
|
||||||
|
|
||||||
def load_data(self, url):
|
def load_data(self, url):
|
||||||
|
''' Load data from a web page. '''
|
||||||
response = requests.get(url)
|
response = requests.get(url)
|
||||||
data = response.content
|
data = response.content
|
||||||
soup = BeautifulSoup(data, 'html.parser')
|
soup = BeautifulSoup(data, 'html.parser')
|
||||||
|
|||||||
@@ -6,6 +6,7 @@ from embedchain.utils import clean_string
|
|||||||
class YoutubeVideoLoader:
|
class YoutubeVideoLoader:
|
||||||
|
|
||||||
def load_data(self, url):
|
def load_data(self, url):
|
||||||
|
''' Load data from a Youtube video. '''
|
||||||
loader = YoutubeLoader.from_youtube_url(url, add_video_info=True)
|
loader = YoutubeLoader.from_youtube_url(url, add_video_info=True)
|
||||||
doc = loader.load()
|
doc = loader.load()
|
||||||
output = []
|
output = []
|
||||||
|
|||||||
@@ -1,9 +1,12 @@
|
|||||||
class BaseVectorDB:
|
class BaseVectorDB:
|
||||||
|
''' Base class for vector database. '''
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self.client = self._get_or_create_db()
|
self.client = self._get_or_create_db()
|
||||||
self.collection = self._get_or_create_collection()
|
self.collection = self._get_or_create_collection()
|
||||||
|
|
||||||
def _get_or_create_db(self):
|
def _get_or_create_db(self):
|
||||||
|
''' Get or create the database. '''
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
||||||
def _get_or_create_collection(self):
|
def _get_or_create_collection(self):
|
||||||
|
|||||||
@@ -7,6 +7,8 @@ from embedchain.vectordb.base_vector_db import BaseVectorDB
|
|||||||
|
|
||||||
|
|
||||||
class ChromaDB(BaseVectorDB):
|
class ChromaDB(BaseVectorDB):
|
||||||
|
''' Vector database using ChromaDB. '''
|
||||||
|
|
||||||
def __init__(self, db_dir=None, ef=None):
|
def __init__(self, db_dir=None, ef=None):
|
||||||
if ef:
|
if ef:
|
||||||
self.ef = ef
|
self.ef = ef
|
||||||
@@ -26,9 +28,11 @@ class ChromaDB(BaseVectorDB):
|
|||||||
super().__init__()
|
super().__init__()
|
||||||
|
|
||||||
def _get_or_create_db(self):
|
def _get_or_create_db(self):
|
||||||
|
''' Get or create the database. '''
|
||||||
return chromadb.Client(self.client_settings)
|
return chromadb.Client(self.client_settings)
|
||||||
|
|
||||||
def _get_or_create_collection(self):
|
def _get_or_create_collection(self):
|
||||||
|
''' Get or create the collection. '''
|
||||||
return self.client.get_or_create_collection(
|
return self.client.get_or_create_collection(
|
||||||
'embedchain_store', embedding_function=self.ef,
|
'embedchain_store', embedding_function=self.ef,
|
||||||
)
|
)
|
||||||
|
|||||||
Reference in New Issue
Block a user