feat: change doc_file to docx and update readme (#157)

This commit is contained in:
Sahil Kumar Yadav
2023-07-07 16:18:05 +05:30
committed by GitHub
parent 51adc5c886
commit 0bb3d0afe9
5 changed files with 21 additions and 18 deletions

View File

@@ -4,13 +4,13 @@ from langchain.text_splitter import RecursiveCharacterTextSplitter
TEXT_SPLITTER_CHUNK_PARAMS = {
"chunk_size": 500,
"chunk_size": 1000,
"chunk_overlap": 0,
"length_function": len,
}
class DocFileChunker(BaseChunker):
class DocxFileChunker(BaseChunker):
def __init__(self):
text_splitter = RecursiveCharacterTextSplitter(**TEXT_SPLITTER_CHUNK_PARAMS)
super().__init__(text_splitter)

View File

@@ -13,13 +13,13 @@ from embedchain.loaders.pdf_file import PdfFileLoader
from embedchain.loaders.web_page import WebPageLoader
from embedchain.loaders.local_qna_pair import LocalQnaPairLoader
from embedchain.loaders.local_text import LocalTextLoader
from embedchain.loaders.doc_file import DocFileLoader
from embedchain.loaders.docx_file import DocxFileLoader
from embedchain.chunkers.youtube_video import YoutubeVideoChunker
from embedchain.chunkers.pdf_file import PdfFileChunker
from embedchain.chunkers.web_page import WebPageChunker
from embedchain.chunkers.qna_pair import QnaPairChunker
from embedchain.chunkers.text import TextChunker
from embedchain.chunkers.doc_file import DocFileChunker
from embedchain.chunkers.docx_file import DocxFileChunker
from embedchain.vectordb.chroma_db import ChromaDB
@@ -61,6 +61,7 @@ class EmbedChain:
'web_page': WebPageLoader(),
'qna_pair': LocalQnaPairLoader(),
'text': LocalTextLoader(),
'docx': DocxFileLoader(),
}
if data_type in loaders:
return loaders[data_type]
@@ -81,6 +82,7 @@ class EmbedChain:
'web_page': WebPageChunker(),
'qna_pair': QnaPairChunker(),
'text': TextChunker(),
'docx': DocxFileChunker(),
}
if data_type in chunkers:
return chunkers[data_type]

View File

@@ -1,9 +1,8 @@
from langchain.document_loaders import UnstructuredWordDocumentLoader
from langchain.document_loaders import Docx2txtLoader
class DocFileLoader:
class DocxFileLoader:
def load_data(self, url):
loader = UnstructuredWordDocumentLoader(url)
loader = Docx2txtLoader(url)
output = []
data = loader.load()
content = data[0].page_content