feat: change doc_file to docx and update readme (#157)

2023-07-07 16:18:05 +05:30
parent 51adc5c886
commit 0bb3d0afe9
5 changed files with 21 additions and 18 deletions
--- a/embedchain/chunkers/docx_file.py
+++ b/embedchain/chunkers/docx_file.py
@@ -4,13 +4,13 @@ from langchain.text_splitter import RecursiveCharacterTextSplitter


 TEXT_SPLITTER_CHUNK_PARAMS = {
-    "chunk_size": 500,
+    "chunk_size": 1000,
    "chunk_overlap": 0,
    "length_function": len,
 }


-class DocFileChunker(BaseChunker):
+class DocxFileChunker(BaseChunker):
    def __init__(self):
        text_splitter = RecursiveCharacterTextSplitter(**TEXT_SPLITTER_CHUNK_PARAMS)
        super().__init__(text_splitter)
--- a/embedchain/embedchain.py
+++ b/embedchain/embedchain.py
@@ -13,13 +13,13 @@ from embedchain.loaders.pdf_file import PdfFileLoader
 from embedchain.loaders.web_page import WebPageLoader
 from embedchain.loaders.local_qna_pair import LocalQnaPairLoader
 from embedchain.loaders.local_text import LocalTextLoader
-from embedchain.loaders.doc_file import DocFileLoader
+from embedchain.loaders.docx_file import DocxFileLoader
 from embedchain.chunkers.youtube_video import YoutubeVideoChunker
 from embedchain.chunkers.pdf_file import PdfFileChunker
 from embedchain.chunkers.web_page import WebPageChunker
 from embedchain.chunkers.qna_pair import QnaPairChunker
 from embedchain.chunkers.text import TextChunker
-from embedchain.chunkers.doc_file import DocFileChunker
+from embedchain.chunkers.docx_file import DocxFileChunker
 from embedchain.vectordb.chroma_db import ChromaDB


@@ -61,6 +61,7 @@ class EmbedChain:
            'web_page': WebPageLoader(),
            'qna_pair': LocalQnaPairLoader(),
            'text': LocalTextLoader(),
+            'docx': DocxFileLoader(),
        }
        if data_type in loaders:
            return loaders[data_type]
@@ -81,6 +82,7 @@ class EmbedChain:
            'web_page': WebPageChunker(),
            'qna_pair': QnaPairChunker(),
            'text': TextChunker(),
+            'docx': DocxFileChunker(),
        }
        if data_type in chunkers:
            return chunkers[data_type]
--- a/embedchain/loaders/docx_file.py
+++ b/embedchain/loaders/docx_file.py
@@ -1,9 +1,8 @@
-from langchain.document_loaders import UnstructuredWordDocumentLoader
+from langchain.document_loaders import Docx2txtLoader

-
-class DocFileLoader:
+class DocxFileLoader:
    def load_data(self, url):
-        loader = UnstructuredWordDocumentLoader(url)
+        loader = Docx2txtLoader(url)
        output = []
        data = loader.load()
        content = data[0].page_content