feat: add google doc support (#155)
This commit is contained in:
committed by
GitHub
parent
c6dbbf5dd3
commit
68e732a426
16
embedchain/chunkers/doc_file.py
Normal file
16
embedchain/chunkers/doc_file.py
Normal file
@@ -0,0 +1,16 @@
|
||||
from embedchain.chunkers.base_chunker import BaseChunker
|
||||
|
||||
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
||||
|
||||
|
||||
# Keyword arguments unpacked into RecursiveCharacterTextSplitter by
# DocFileChunker: chunk size, overlap between chunks, and the callable
# used to measure length.
TEXT_SPLITTER_CHUNK_PARAMS = dict(
    chunk_size=500,
    chunk_overlap=0,
    length_function=len,
)
|
||||
|
||||
|
||||
class DocFileChunker(BaseChunker):
    """Chunker initialised with a RecursiveCharacterTextSplitter.

    The splitter is configured from the module-level
    TEXT_SPLITTER_CHUNK_PARAMS and handed to BaseChunker's constructor.
    """

    def __init__(self):
        """Build the text splitter and pass it to the base chunker."""
        super().__init__(
            RecursiveCharacterTextSplitter(**TEXT_SPLITTER_CHUNK_PARAMS)
        )
|
||||
@@ -12,11 +12,13 @@ from embedchain.loaders.pdf_file import PdfFileLoader
|
||||
from embedchain.loaders.web_page import WebPageLoader
|
||||
from embedchain.loaders.local_qna_pair import LocalQnaPairLoader
|
||||
from embedchain.loaders.local_text import LocalTextLoader
|
||||
from embedchain.loaders.doc_file import DocFileLoader
|
||||
from embedchain.chunkers.youtube_video import YoutubeVideoChunker
|
||||
from embedchain.chunkers.pdf_file import PdfFileChunker
|
||||
from embedchain.chunkers.web_page import WebPageChunker
|
||||
from embedchain.chunkers.qna_pair import QnaPairChunker
|
||||
from embedchain.chunkers.text import TextChunker
|
||||
from embedchain.chunkers.doc_file import DocFileChunker
|
||||
from embedchain.vectordb.chroma_db import ChromaDB
|
||||
|
||||
|
||||
|
||||
13
embedchain/loaders/doc_file.py
Normal file
13
embedchain/loaders/doc_file.py
Normal file
@@ -0,0 +1,13 @@
|
||||
from langchain.document_loaders import UnstructuredWordDocumentLoader
|
||||
|
||||
|
||||
class DocFileLoader:
    """Loader that reads a document via UnstructuredWordDocumentLoader."""

    def load_data(self, url):
        """Load the document at ``url`` and wrap it in a one-element list.

        Only the first document returned by the underlying loader is used.
        Its metadata dict is updated in place with ``"url"`` set to
        ``"local"``.

        Returns:
            A list holding a single dict with the keys ``"content"``
            (the document's page content) and ``"meta_data"`` (its
            metadata dict).
        """
        documents = UnstructuredWordDocumentLoader(url).load()
        first_document = documents[0]
        text = first_document.page_content
        metadata = first_document.metadata
        metadata["url"] = "local"
        return [{"content": text, "meta_data": metadata}]
|
||||
Reference in New Issue
Block a user