diff --git a/README.md b/README.md index d9d7c7da..6317ebf9 100644 --- a/README.md +++ b/README.md @@ -1,15 +1,17 @@ # embedchain -[![](https://dcbadge.vercel.app/api/server/nhvCbCtKV?style=flat)](https://discord.gg/6PzXDgEjG5) [![PyPI](https://img.shields.io/pypi/v/embedchain)](https://pypi.org/project/embedchain/) +[![Discord](https://dcbadge.vercel.app/api/server/nhvCbCtKV?style=flat)](https://discord.gg/6PzXDgEjG5) +[![Twitter](https://img.shields.io/twitter/follow/embedchain)](https://twitter.com/embedchain) +[![Substack](https://img.shields.io/badge/Substack-%23006f5c.svg?logo=substack)](https://embedchain.substack.com/) embedchain is a framework to easily create LLM powered bots over any dataset. If you want a javascript version, check out [embedchain-js](https://github.com/embedchain/embedchainjs) # Latest Updates -* Introduce a new interface called `chat`. It remembers the history (last 5 messages) and can be used to powerful stateful bots. You can use it by calling `.chat` on any app instance. Works for both OpenAI and OpenSourceApp. +- Introduce a new interface called `chat`. It remembers the history (last 5 messages) and can be used to build powerful stateful bots. You can use it by calling `.chat` on any app instance. Works for both OpenAI and OpenSourceApp. -* Introduce a new app type called `OpenSourceApp`. It uses `gpt4all` as the LLM and `sentence transformers` all-MiniLM-L6-v2 as the embedding model. If you use this app, you dont have to pay for anything. +- Introduce a new app type called `OpenSourceApp`. It uses `gpt4all` as the LLM and `sentence transformers` all-MiniLM-L6-v2 as the embedding model. If you use this app, you don't have to pay for anything. # What is embedchain? @@ -130,9 +132,9 @@ from embedchain import OpenSourceApp as ECOSApp ### Query Interface -* This interface is like a question answering bot. It takes a question and gets the answer. It does not maintain context about the previous chats. 
+- This interface is like a question answering bot. It takes a question and gets the answer. It does not maintain context about the previous chats. -* To use this, call `.query` function to get the answer for any query. +- To use this, call the `.query` function to get the answer for any query. ```python print(naval_chat_bot.query("What unique capacity does Naval argue humans possess when it comes to understanding explanations or concepts?")) @@ -141,9 +143,9 @@ print(naval_chat_bot.query("What unique capacity does Naval argue humans possess ### Chat Interface -* This interface is chat interface where it remembers previous conversation. Right now it remembers 5 conversation by default. +- This interface is a chat interface where it remembers previous conversations. Right now it remembers 5 conversations by default. -* To use this, call `.chat` function to get the answer for any query. +- To use this, call the `.chat` function to get the answer for any query. ```python print(naval_chat_bot.chat("How to be happy in life?")) @@ -188,10 +190,10 @@ app.add('web_page', 'a_valid_web_page_url') ### Doc File -To add any doc/docx file, use the data_type as `doc_file`. Eg: +To add any doc/docx file, use the data_type as `docx`. 
Eg: ```python -app.add('doc_file', 'a_local_doc_file_path') +app.add('docx', 'a_local_docx_file_path') ``` ### Text diff --git a/embedchain/chunkers/doc_file.py b/embedchain/chunkers/docx_file.py similarity index 85% rename from embedchain/chunkers/doc_file.py rename to embedchain/chunkers/docx_file.py index 49ee6403..03db7bea 100644 --- a/embedchain/chunkers/doc_file.py +++ b/embedchain/chunkers/docx_file.py @@ -4,13 +4,13 @@ from langchain.text_splitter import RecursiveCharacterTextSplitter TEXT_SPLITTER_CHUNK_PARAMS = { - "chunk_size": 500, + "chunk_size": 1000, "chunk_overlap": 0, "length_function": len, } -class DocFileChunker(BaseChunker): +class DocxFileChunker(BaseChunker): def __init__(self): text_splitter = RecursiveCharacterTextSplitter(**TEXT_SPLITTER_CHUNK_PARAMS) super().__init__(text_splitter) diff --git a/embedchain/embedchain.py b/embedchain/embedchain.py index 66cc1e56..ea8892ac 100644 --- a/embedchain/embedchain.py +++ b/embedchain/embedchain.py @@ -13,13 +13,13 @@ from embedchain.loaders.pdf_file import PdfFileLoader from embedchain.loaders.web_page import WebPageLoader from embedchain.loaders.local_qna_pair import LocalQnaPairLoader from embedchain.loaders.local_text import LocalTextLoader -from embedchain.loaders.doc_file import DocFileLoader +from embedchain.loaders.docx_file import DocxFileLoader from embedchain.chunkers.youtube_video import YoutubeVideoChunker from embedchain.chunkers.pdf_file import PdfFileChunker from embedchain.chunkers.web_page import WebPageChunker from embedchain.chunkers.qna_pair import QnaPairChunker from embedchain.chunkers.text import TextChunker -from embedchain.chunkers.doc_file import DocFileChunker +from embedchain.chunkers.docx_file import DocxFileChunker from embedchain.vectordb.chroma_db import ChromaDB @@ -61,6 +61,7 @@ class EmbedChain: 'web_page': WebPageLoader(), 'qna_pair': LocalQnaPairLoader(), 'text': LocalTextLoader(), + 'docx': DocxFileLoader(), } if data_type in loaders: return loaders[data_type] 
@@ -81,6 +82,7 @@ class EmbedChain: 'web_page': WebPageChunker(), 'qna_pair': QnaPairChunker(), 'text': TextChunker(), + 'docx': DocxFileChunker(), } if data_type in chunkers: return chunkers[data_type] diff --git a/embedchain/loaders/doc_file.py b/embedchain/loaders/docx_file.py similarity index 65% rename from embedchain/loaders/doc_file.py rename to embedchain/loaders/docx_file.py index e7351010..d9872e23 100644 --- a/embedchain/loaders/doc_file.py +++ b/embedchain/loaders/docx_file.py @@ -1,9 +1,8 @@ -from langchain.document_loaders import UnstructuredWordDocumentLoader +from langchain.document_loaders import Docx2txtLoader - -class DocFileLoader: +class DocxFileLoader: def load_data(self, url): - loader = UnstructuredWordDocumentLoader(url) + loader = Docx2txtLoader(url) output = [] data = loader.load() content = data[0].page_content diff --git a/setup.py b/setup.py index c6aa8e32..a200bbf5 100644 --- a/setup.py +++ b/setup.py @@ -31,6 +31,6 @@ setuptools.setup( "pytube", "gpt4all", "sentence_transformers", - "unstructured", + "docx2txt", ], )