Added documentation (#219)

This commit is contained in:
aaishikdutta
2023-07-11 08:31:42 +05:30
committed by GitHub
parent eda28cc491
commit 6936d6983d
15 changed files with 21 additions and 1 deletions

View File

@@ -3,6 +3,7 @@ import hashlib
class BaseChunker: class BaseChunker:
def __init__(self, text_splitter): def __init__(self, text_splitter):
''' Initialize the chunker. '''
self.text_splitter = text_splitter self.text_splitter = text_splitter
def create_chunks(self, loader, src): def create_chunks(self, loader, src):

View File

@@ -14,6 +14,7 @@ TEXT_SPLITTER_CHUNK_PARAMS = {
class DocxFileChunker(BaseChunker): class DocxFileChunker(BaseChunker):
''' Chunker for .docx file. '''
def __init__(self, config: Optional[ChunkerConfig] = None): def __init__(self, config: Optional[ChunkerConfig] = None):
if config is None: if config is None:
config = TEXT_SPLITTER_CHUNK_PARAMS config = TEXT_SPLITTER_CHUNK_PARAMS

View File

@@ -13,6 +13,7 @@ TEXT_SPLITTER_CHUNK_PARAMS = {
class PdfFileChunker(BaseChunker): class PdfFileChunker(BaseChunker):
''' Chunker for PDF file. '''
def __init__(self, config: Optional[ChunkerConfig] = None): def __init__(self, config: Optional[ChunkerConfig] = None):
if config is None: if config is None:
config = TEXT_SPLITTER_CHUNK_PARAMS config = TEXT_SPLITTER_CHUNK_PARAMS

View File

@@ -13,6 +13,7 @@ TEXT_SPLITTER_CHUNK_PARAMS = {
class QnaPairChunker(BaseChunker): class QnaPairChunker(BaseChunker):
''' Chunker for QnA pair. '''
def __init__(self, config: Optional[ChunkerConfig] = None): def __init__(self, config: Optional[ChunkerConfig] = None):
if config is None: if config is None:
config = TEXT_SPLITTER_CHUNK_PARAMS config = TEXT_SPLITTER_CHUNK_PARAMS

View File

@@ -13,6 +13,7 @@ TEXT_SPLITTER_CHUNK_PARAMS = {
class TextChunker(BaseChunker): class TextChunker(BaseChunker):
''' Chunker for text. '''
def __init__(self, config: Optional[ChunkerConfig] = None): def __init__(self, config: Optional[ChunkerConfig] = None):
if config is None: if config is None:
config = TEXT_SPLITTER_CHUNK_PARAMS config = TEXT_SPLITTER_CHUNK_PARAMS

View File

@@ -13,6 +13,7 @@ TEXT_SPLITTER_CHUNK_PARAMS = {
class WebPageChunker(BaseChunker): class WebPageChunker(BaseChunker):
''' Chunker for web page. '''
def __init__(self, config: Optional[ChunkerConfig] = None): def __init__(self, config: Optional[ChunkerConfig] = None):
if config is None: if config is None:
config = TEXT_SPLITTER_CHUNK_PARAMS config = TEXT_SPLITTER_CHUNK_PARAMS

View File

@@ -13,6 +13,7 @@ TEXT_SPLITTER_CHUNK_PARAMS = {
class YoutubeVideoChunker(BaseChunker): class YoutubeVideoChunker(BaseChunker):
''' Chunker for YouTube video. '''
def __init__(self, config: Optional[ChunkerConfig] = None): def __init__(self, config: Optional[ChunkerConfig] = None):
if config is None: if config is None:
config = TEXT_SPLITTER_CHUNK_PARAMS config = TEXT_SPLITTER_CHUNK_PARAMS

View File

@@ -2,6 +2,7 @@ from langchain.document_loaders import Docx2txtLoader
class DocxFileLoader: class DocxFileLoader:
def load_data(self, url): def load_data(self, url):
''' Load data from a .docx file. '''
loader = Docx2txtLoader(url) loader = Docx2txtLoader(url)
output = [] output = []
data = loader.load() data = loader.load()

View File

@@ -1,6 +1,7 @@
class LocalQnaPairLoader: class LocalQnaPairLoader:
def load_data(self, content): def load_data(self, content):
''' Load data from a local QnA pair. '''
question, answer = content question, answer = content
content = f"Q: {question}\nA: {answer}" content = f"Q: {question}\nA: {answer}"
meta_data = { meta_data = {

View File

@@ -1,6 +1,7 @@
class LocalTextLoader: class LocalTextLoader:
def load_data(self, content): def load_data(self, content):
''' Load data from a local text file. '''
meta_data = { meta_data = {
"url": "local", "url": "local",
} }

View File

@@ -6,6 +6,7 @@ from embedchain.utils import clean_string
class PdfFileLoader: class PdfFileLoader:
def load_data(self, url): def load_data(self, url):
''' Load data from a PDF file. '''
loader = PyPDFLoader(url) loader = PyPDFLoader(url)
output = [] output = []
pages = loader.load_and_split() pages = loader.load_and_split()

View File

@@ -8,6 +8,7 @@ from embedchain.utils import clean_string
class WebPageLoader: class WebPageLoader:
def load_data(self, url): def load_data(self, url):
''' Load data from a web page. '''
response = requests.get(url) response = requests.get(url)
data = response.content data = response.content
soup = BeautifulSoup(data, 'html.parser') soup = BeautifulSoup(data, 'html.parser')

View File

@@ -6,6 +6,7 @@ from embedchain.utils import clean_string
class YoutubeVideoLoader: class YoutubeVideoLoader:
def load_data(self, url): def load_data(self, url):
''' Load data from a YouTube video. '''
loader = YoutubeLoader.from_youtube_url(url, add_video_info=True) loader = YoutubeLoader.from_youtube_url(url, add_video_info=True)
doc = loader.load() doc = loader.load()
output = [] output = []

View File

@@ -1,10 +1,13 @@
class BaseVectorDB: class BaseVectorDB:
''' Base class for vector database. '''
def __init__(self): def __init__(self):
self.client = self._get_or_create_db() self.client = self._get_or_create_db()
self.collection = self._get_or_create_collection() self.collection = self._get_or_create_collection()
def _get_or_create_db(self): def _get_or_create_db(self):
''' Get or create the database. '''
raise NotImplementedError raise NotImplementedError
def _get_or_create_collection(self): def _get_or_create_collection(self):
raise NotImplementedError raise NotImplementedError

View File

@@ -7,6 +7,8 @@ from embedchain.vectordb.base_vector_db import BaseVectorDB
class ChromaDB(BaseVectorDB): class ChromaDB(BaseVectorDB):
''' Vector database using ChromaDB. '''
def __init__(self, db_dir=None, ef=None): def __init__(self, db_dir=None, ef=None):
if ef: if ef:
self.ef = ef self.ef = ef
@@ -26,9 +28,11 @@ class ChromaDB(BaseVectorDB):
super().__init__() super().__init__()
def _get_or_create_db(self): def _get_or_create_db(self):
''' Get or create the database. '''
return chromadb.Client(self.client_settings) return chromadb.Client(self.client_settings)
def _get_or_create_collection(self): def _get_or_create_collection(self):
''' Get or create the collection. '''
return self.client.get_or_create_collection( return self.client.get_or_create_collection(
'embedchain_store', embedding_function=self.ef, 'embedchain_store', embedding_function=self.ef,
) )