feat: changed doc_file to docx and update readme (#157)
This commit is contained in:
committed by
GitHub
parent
51adc5c886
commit
0bb3d0afe9
20
README.md
20
README.md
@@ -1,15 +1,17 @@
|
|||||||
# embedchain
|
# embedchain
|
||||||
|
|
||||||
[](https://discord.gg/6PzXDgEjG5)
|
|
||||||
[](https://pypi.org/project/embedchain/)
|
[](https://pypi.org/project/embedchain/)
|
||||||
|
[](https://discord.gg/6PzXDgEjG5)
|
||||||
|
[](https://twitter.com/embedchain)
|
||||||
|
[](https://embedchain.substack.com/)
|
||||||
|
|
||||||
embedchain is a framework to easily create LLM powered bots over any dataset. If you want a javascript version, check out [embedchain-js](https://github.com/embedchain/embedchainjs)
|
embedchain is a framework to easily create LLM powered bots over any dataset. If you want a javascript version, check out [embedchain-js](https://github.com/embedchain/embedchainjs)
|
||||||
|
|
||||||
# Latest Updates
|
# Latest Updates
|
||||||
|
|
||||||
* Introduce a new interface called `chat`. It remembers the history (last 5 messages) and can be used to powerful stateful bots. You can use it by calling `.chat` on any app instance. Works for both OpenAI and OpenSourceApp.
|
- Introduce a new interface called `chat`. It remembers the history (last 5 messages) and can be used to power stateful bots. You can use it by calling `.chat` on any app instance. Works for both OpenAI and OpenSourceApp.
|
||||||
|
|
||||||
* Introduce a new app type called `OpenSourceApp`. It uses `gpt4all` as the LLM and `sentence transformers` all-MiniLM-L6-v2 as the embedding model. If you use this app, you dont have to pay for anything.
|
- Introduce a new app type called `OpenSourceApp`. It uses `gpt4all` as the LLM and `sentence transformers` all-MiniLM-L6-v2 as the embedding model. If you use this app, you don't have to pay for anything.
|
||||||
|
|
||||||
# What is embedchain?
|
# What is embedchain?
|
||||||
|
|
||||||
@@ -130,9 +132,9 @@ from embedchain import OpenSourceApp as ECOSApp
|
|||||||
|
|
||||||
### Query Interface
|
### Query Interface
|
||||||
|
|
||||||
* This interface is like a question answering bot. It takes a question and gets the answer. It does not maintain context about the previous chats.
|
- This interface is like a question answering bot. It takes a question and gets the answer. It does not maintain context about the previous chats.
|
||||||
|
|
||||||
* To use this, call `.query` function to get the answer for any query.
|
- To use this, call `.query` function to get the answer for any query.
|
||||||
|
|
||||||
```python
|
```python
|
||||||
print(naval_chat_bot.query("What unique capacity does Naval argue humans possess when it comes to understanding explanations or concepts?"))
|
print(naval_chat_bot.query("What unique capacity does Naval argue humans possess when it comes to understanding explanations or concepts?"))
|
||||||
@@ -141,9 +143,9 @@ print(naval_chat_bot.query("What unique capacity does Naval argue humans possess
|
|||||||
|
|
||||||
### Chat Interface
|
### Chat Interface
|
||||||
|
|
||||||
* This interface is chat interface where it remembers previous conversation. Right now it remembers 5 conversation by default.
|
- This interface is a chat interface that remembers previous conversations. Right now it remembers the last 5 conversations by default.
|
||||||
|
|
||||||
* To use this, call `.chat` function to get the answer for any query.
|
- To use this, call `.chat` function to get the answer for any query.
|
||||||
|
|
||||||
```python
|
```python
|
||||||
print(naval_chat_bot.chat("How to be happy in life?"))
|
print(naval_chat_bot.chat("How to be happy in life?"))
|
||||||
@@ -188,10 +190,10 @@ app.add('web_page', 'a_valid_web_page_url')
|
|||||||
|
|
||||||
### Doc File
|
### Doc File
|
||||||
|
|
||||||
To add any doc/docx file, use the data_type as `doc_file`. Eg:
|
To add any doc/docx file, use the data_type as `docx`. Eg:
|
||||||
|
|
||||||
```python
|
```python
|
||||||
app.add('doc_file', 'a_local_doc_file_path')
|
app.add('docx', 'a_local_docx_file_path')
|
||||||
```
|
```
|
||||||
|
|
||||||
### Text
|
### Text
|
||||||
|
|||||||
@@ -4,13 +4,13 @@ from langchain.text_splitter import RecursiveCharacterTextSplitter
|
|||||||
|
|
||||||
|
|
||||||
TEXT_SPLITTER_CHUNK_PARAMS = {
|
TEXT_SPLITTER_CHUNK_PARAMS = {
|
||||||
"chunk_size": 500,
|
"chunk_size": 1000,
|
||||||
"chunk_overlap": 0,
|
"chunk_overlap": 0,
|
||||||
"length_function": len,
|
"length_function": len,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
class DocFileChunker(BaseChunker):
|
class DocxFileChunker(BaseChunker):
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
text_splitter = RecursiveCharacterTextSplitter(**TEXT_SPLITTER_CHUNK_PARAMS)
|
text_splitter = RecursiveCharacterTextSplitter(**TEXT_SPLITTER_CHUNK_PARAMS)
|
||||||
super().__init__(text_splitter)
|
super().__init__(text_splitter)
|
||||||
@@ -13,13 +13,13 @@ from embedchain.loaders.pdf_file import PdfFileLoader
|
|||||||
from embedchain.loaders.web_page import WebPageLoader
|
from embedchain.loaders.web_page import WebPageLoader
|
||||||
from embedchain.loaders.local_qna_pair import LocalQnaPairLoader
|
from embedchain.loaders.local_qna_pair import LocalQnaPairLoader
|
||||||
from embedchain.loaders.local_text import LocalTextLoader
|
from embedchain.loaders.local_text import LocalTextLoader
|
||||||
from embedchain.loaders.doc_file import DocFileLoader
|
from embedchain.loaders.docx_file import DocxFileLoader
|
||||||
from embedchain.chunkers.youtube_video import YoutubeVideoChunker
|
from embedchain.chunkers.youtube_video import YoutubeVideoChunker
|
||||||
from embedchain.chunkers.pdf_file import PdfFileChunker
|
from embedchain.chunkers.pdf_file import PdfFileChunker
|
||||||
from embedchain.chunkers.web_page import WebPageChunker
|
from embedchain.chunkers.web_page import WebPageChunker
|
||||||
from embedchain.chunkers.qna_pair import QnaPairChunker
|
from embedchain.chunkers.qna_pair import QnaPairChunker
|
||||||
from embedchain.chunkers.text import TextChunker
|
from embedchain.chunkers.text import TextChunker
|
||||||
from embedchain.chunkers.doc_file import DocFileChunker
|
from embedchain.chunkers.docx_file import DocxFileChunker
|
||||||
from embedchain.vectordb.chroma_db import ChromaDB
|
from embedchain.vectordb.chroma_db import ChromaDB
|
||||||
|
|
||||||
|
|
||||||
@@ -61,6 +61,7 @@ class EmbedChain:
|
|||||||
'web_page': WebPageLoader(),
|
'web_page': WebPageLoader(),
|
||||||
'qna_pair': LocalQnaPairLoader(),
|
'qna_pair': LocalQnaPairLoader(),
|
||||||
'text': LocalTextLoader(),
|
'text': LocalTextLoader(),
|
||||||
|
'docx': DocxFileLoader(),
|
||||||
}
|
}
|
||||||
if data_type in loaders:
|
if data_type in loaders:
|
||||||
return loaders[data_type]
|
return loaders[data_type]
|
||||||
@@ -81,6 +82,7 @@ class EmbedChain:
|
|||||||
'web_page': WebPageChunker(),
|
'web_page': WebPageChunker(),
|
||||||
'qna_pair': QnaPairChunker(),
|
'qna_pair': QnaPairChunker(),
|
||||||
'text': TextChunker(),
|
'text': TextChunker(),
|
||||||
|
'docx': DocxFileChunker(),
|
||||||
}
|
}
|
||||||
if data_type in chunkers:
|
if data_type in chunkers:
|
||||||
return chunkers[data_type]
|
return chunkers[data_type]
|
||||||
|
|||||||
@@ -1,9 +1,8 @@
|
|||||||
from langchain.document_loaders import UnstructuredWordDocumentLoader
|
from langchain.document_loaders import Docx2txtLoader
|
||||||
|
|
||||||
|
class DocxFileLoader:
|
||||||
class DocFileLoader:
|
|
||||||
def load_data(self, url):
|
def load_data(self, url):
|
||||||
loader = UnstructuredWordDocumentLoader(url)
|
loader = Docx2txtLoader(url)
|
||||||
output = []
|
output = []
|
||||||
data = loader.load()
|
data = loader.load()
|
||||||
content = data[0].page_content
|
content = data[0].page_content
|
||||||
Reference in New Issue
Block a user