feat: add google doc support (#155)

This commit is contained in:
Sahil Kumar Yadav
2023-07-06 14:04:27 +05:30
committed by GitHub
parent c6dbbf5dd3
commit 68e732a426
5 changed files with 70 additions and 27 deletions

View File

@@ -0,0 +1,16 @@
from embedchain.chunkers.base_chunker import BaseChunker
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Keyword arguments unpacked into the text splitter below: a chunk size of
# 500 as measured by ``len``, with no overlap between consecutive chunks.
TEXT_SPLITTER_CHUNK_PARAMS = {
    "chunk_size": 500,
    "chunk_overlap": 0,
    "length_function": len,
}
class DocFileChunker(BaseChunker):
    """Chunker for doc-file content.

    Configures the base chunker with a ``RecursiveCharacterTextSplitter``
    built from ``TEXT_SPLITTER_CHUNK_PARAMS``.
    """

    def __init__(self):
        """Create the splitter and hand it to ``BaseChunker``."""
        super().__init__(
            RecursiveCharacterTextSplitter(**TEXT_SPLITTER_CHUNK_PARAMS)
        )

View File

@@ -12,11 +12,13 @@ from embedchain.loaders.pdf_file import PdfFileLoader
from embedchain.loaders.web_page import WebPageLoader
from embedchain.loaders.local_qna_pair import LocalQnaPairLoader
from embedchain.loaders.local_text import LocalTextLoader
from embedchain.loaders.doc_file import DocFileLoader
from embedchain.chunkers.youtube_video import YoutubeVideoChunker
from embedchain.chunkers.pdf_file import PdfFileChunker
from embedchain.chunkers.web_page import WebPageChunker
from embedchain.chunkers.qna_pair import QnaPairChunker
from embedchain.chunkers.text import TextChunker
from embedchain.chunkers.doc_file import DocFileChunker
from embedchain.vectordb.chroma_db import ChromaDB

View File

@@ -0,0 +1,13 @@
from langchain.document_loaders import UnstructuredWordDocumentLoader
class DocFileLoader:
    """Load a Word document into content/metadata records."""

    def load_data(self, url):
        """Read the document at ``url`` and return its text with metadata.

        Args:
            url: Location passed unchanged to
                ``UnstructuredWordDocumentLoader``.

        Returns:
            A list of ``{"content": ..., "meta_data": ...}`` dicts, one per
            document the loader returns. Each ``meta_data`` holds the
            loader's metadata with ``"url"`` set to ``"local"``. The list is
            empty when the loader returns no documents.
        """
        documents = UnstructuredWordDocumentLoader(url).load()
        # Convert every returned document rather than only the first, so no
        # content is silently dropped and an empty result cannot raise
        # IndexError. The metadata is copied so the loader's own objects are
        # left unmodified.
        return [
            {
                "content": document.page_content,
                "meta_data": {**document.metadata, "url": "local"},
            }
            for document in documents
        ]