From 8b64deab40544e2cbb196c7830d8a6f14fbabca0 Mon Sep 17 00:00:00 2001 From: Muhammad Muzammil Date: Thu, 19 Oct 2023 04:43:41 +0500 Subject: [PATCH] [Feature]: Unstructured File Loader Support - USF (#815) --- README.md | 3 +- embedchain/chunkers/unstructured_file.py | 22 ++++++++++++ embedchain/data_formatter/data_formatter.py | 4 +++ embedchain/loaders/unstructured_file.py | 40 +++++++++++++++++++++ embedchain/models/data_type.py | 2 ++ 5 files changed, 70 insertions(+), 1 deletion(-) create mode 100644 embedchain/chunkers/unstructured_file.py create mode 100644 embedchain/loaders/unstructured_file.py diff --git a/README.md b/README.md index 7ed2ffee..427d58c2 100644 --- a/README.md +++ b/README.md @@ -47,7 +47,8 @@ Embedchain empowers you to create ChatGPT like apps, on your own dynamic dataset * Doc file * JSON file * Code documentation website loader -* Notion and many more. +* Notion +* Unstructured file loader and many more. You can find the full list of data types on [our documentation](https://docs.embedchain.ai/data-sources/csv). 
diff --git a/embedchain/chunkers/unstructured_file.py b/embedchain/chunkers/unstructured_file.py
new file mode 100644
index 00000000..ab0322b0
--- /dev/null
+++ b/embedchain/chunkers/unstructured_file.py
@@ -0,0 +1,30 @@
+from typing import Optional
+
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+
+from embedchain.chunkers.base_chunker import BaseChunker
+from embedchain.config.add_config import ChunkerConfig
+from embedchain.helper.json_serializable import register_deserializable
+
+
+@register_deserializable
+class UnstructuredFileChunker(BaseChunker):
+    """Chunker for content loaded from unstructured files."""
+
+    def __init__(self, config: Optional[ChunkerConfig] = None):
+        """Create the text splitter and pass it to the base chunker.
+
+        Args:
+            config: Chunking settings. When ``None``, ``chunk_size=1000``,
+                ``chunk_overlap=0`` and ``length_function=len`` are used.
+        """
+        # Only build the default settings when the caller supplied none.
+        settings = (
+            ChunkerConfig(chunk_size=1000, chunk_overlap=0, length_function=len) if config is None else config
+        )
+        splitter_options = {
+            "chunk_size": settings.chunk_size,
+            "chunk_overlap": settings.chunk_overlap,
+            "length_function": settings.length_function,
+        }
+        super().__init__(RecursiveCharacterTextSplitter(**splitter_options))
diff --git a/embedchain/data_formatter/data_formatter.py b/embedchain/data_formatter/data_formatter.py
index e0344f89..86af515d 100644
--- a/embedchain/data_formatter/data_formatter.py
+++ b/embedchain/data_formatter/data_formatter.py
@@ -6,6 +6,7 @@ from embedchain.chunkers.json import JSONChunker
 from embedchain.chunkers.mdx import MdxChunker
 from embedchain.chunkers.notion import NotionChunker
 from embedchain.chunkers.pdf_file import PdfFileChunker
+from embedchain.chunkers.unstructured_file import UnstructuredFileChunker
 from embedchain.chunkers.qna_pair import QnaPairChunker
 from embedchain.chunkers.sitemap import SitemapChunker
 from embedchain.chunkers.table import TableChunker
@@ -30,6 +31,7 @@ from embedchain.loaders.sitemap import SitemapLoader
 from embedchain.loaders.web_page import WebPageLoader
 from embedchain.loaders.xml import XmlLoader
 from embedchain.loaders.youtube_video import YoutubeVideoLoader
+from embedchain.loaders.unstructured_file import UnstructuredLoader
 from 
embedchain.models.data_type import DataType
 
 
@@ -77,6 +79,7 @@ class DataFormatter(JSONSerializable):
             DataType.CSV: CsvLoader,
             DataType.MDX: MdxLoader,
             DataType.IMAGES: ImagesLoader,
+            DataType.UNSTRUCTURED: UnstructuredLoader,
             DataType.JSON: JSONLoader,
         }
         lazy_loaders = {DataType.NOTION}
@@ -119,6 +122,7 @@ class DataFormatter(JSONSerializable):
             DataType.MDX: MdxChunker,
             DataType.IMAGES: ImagesChunker,
             DataType.XML: XmlChunker,
+            DataType.UNSTRUCTURED: UnstructuredFileChunker,
             DataType.JSON: JSONChunker,
         }
         if data_type in chunker_classes:
diff --git a/embedchain/loaders/unstructured_file.py b/embedchain/loaders/unstructured_file.py
new file mode 100644
index 00000000..1343a72d
--- /dev/null
+++ b/embedchain/loaders/unstructured_file.py
@@ -0,0 +1,50 @@
+import hashlib
+
+try:
+    from langchain.document_loaders import UnstructuredFileLoader
+except ImportError:
+    raise ImportError(
+        'Unstructured file requires extra dependencies. Install with `pip install --upgrade "embedchain[dataloaders]"`'
+    ) from None
+from embedchain.helper.json_serializable import register_deserializable
+from embedchain.loaders.base_loader import BaseLoader
+from embedchain.utils import clean_string
+
+
+@register_deserializable
+class UnstructuredLoader(BaseLoader):
+    """Loader for files read through langchain's ``UnstructuredFileLoader``."""
+
+    def load_data(self, url):
+        """Load and clean the content of an unstructured file.
+
+        Args:
+            url: Location of the file, passed as-is to ``UnstructuredFileLoader``.
+
+        Returns:
+            A dict with ``doc_id`` (SHA-256 hex digest of the space-joined
+            cleaned content followed by ``url``) and ``data`` (one
+            ``content``/``meta_data`` entry per page).
+
+        Raises:
+            ValueError: If the loader returns no pages.
+        """
+        pages = UnstructuredFileLoader(url).load_and_split()
+        if not pages:
+            raise ValueError("No data found")
+
+        data = [
+            {
+                "content": clean_string(page.page_content),
+                # Copy the metadata so adding the url does not mutate the page object.
+                "meta_data": {**page.metadata, "url": url},
+            }
+            for page in pages
+        ]
+        # The id covers both the cleaned content and the source location.
+        joined_content = " ".join(entry["content"] for entry in data)
+        doc_id = hashlib.sha256((joined_content + url).encode()).hexdigest()
+        return {
+            "doc_id": doc_id,
+            "data": data,
+        }
diff --git a/embedchain/models/data_type.py b/embedchain/models/data_type.py
index 57cb0ba3..2c646a9e 100644
--- 
a/embedchain/models/data_type.py +++ b/embedchain/models/data_type.py @@ -25,6 +25,7 @@ class IndirectDataType(Enum): CSV = "csv" MDX = "mdx" IMAGES = "images" + UNSTRUCTURED = 'unstructured' JSON = "json" @@ -50,4 +51,5 @@ class DataType(Enum): MDX = IndirectDataType.MDX.value QNA_PAIR = SpecialDataType.QNA_PAIR.value IMAGES = IndirectDataType.IMAGES.value + UNSTRUCTURED = IndirectDataType.UNSTRUCTURED.value JSON = IndirectDataType.JSON.value