[Feature]: Unstructured File Loader Support - USF (#815)

This commit is contained in:
Muhammad Muzammil
2023-10-19 04:43:41 +05:00
committed by GitHub
parent c8846e0e93
commit 8b64deab40
5 changed files with 70 additions and 1 deletions

View File

@@ -0,0 +1,22 @@
from typing import Optional
from langchain.text_splitter import RecursiveCharacterTextSplitter
from embedchain.chunkers.base_chunker import BaseChunker
from embedchain.config.add_config import ChunkerConfig
from embedchain.helper.json_serializable import register_deserializable
@register_deserializable
class UnstructuredFileChunker(BaseChunker):
    """Chunker used for content loaded from unstructured files."""

    def __init__(self, config: Optional[ChunkerConfig] = None):
        """Build the text splitter and pass it to the base chunker.

        Args:
            config: Chunking settings. When ``None``, a ``ChunkerConfig`` with
                ``chunk_size=1000``, ``chunk_overlap=0`` and
                ``length_function=len`` is used.
        """
        settings = (
            config
            if config is not None
            else ChunkerConfig(chunk_size=1000, chunk_overlap=0, length_function=len)
        )
        # The splitter is configured purely from the three config attributes.
        splitter_kwargs = {
            "chunk_size": settings.chunk_size,
            "chunk_overlap": settings.chunk_overlap,
            "length_function": settings.length_function,
        }
        super().__init__(RecursiveCharacterTextSplitter(**splitter_kwargs))

View File

@@ -6,6 +6,7 @@ from embedchain.chunkers.json import JSONChunker
from embedchain.chunkers.mdx import MdxChunker
from embedchain.chunkers.notion import NotionChunker
from embedchain.chunkers.pdf_file import PdfFileChunker
from embedchain.chunkers.unstructured_file import UnstructuredFileChunker
from embedchain.chunkers.qna_pair import QnaPairChunker
from embedchain.chunkers.sitemap import SitemapChunker
from embedchain.chunkers.table import TableChunker
@@ -30,6 +31,7 @@ from embedchain.loaders.sitemap import SitemapLoader
from embedchain.loaders.web_page import WebPageLoader
from embedchain.loaders.xml import XmlLoader
from embedchain.loaders.youtube_video import YoutubeVideoLoader
from embedchain.loaders.unstructured_file import UnstructuredLoader
from embedchain.models.data_type import DataType
@@ -77,6 +79,7 @@ class DataFormatter(JSONSerializable):
DataType.CSV: CsvLoader,
DataType.MDX: MdxLoader,
DataType.IMAGES: ImagesLoader,
DataType.UNSTRUCTURED: UnstructuredLoader,
DataType.JSON: JSONLoader,
}
lazy_loaders = {DataType.NOTION}
@@ -119,6 +122,7 @@ class DataFormatter(JSONSerializable):
DataType.MDX: MdxChunker,
DataType.IMAGES: ImagesChunker,
DataType.XML: XmlChunker,
DataType.UNSTRUCTURED: UnstructuredFileChunker,
DataType.JSON: JSONChunker,
}
if data_type in chunker_classes:

View File

@@ -0,0 +1,40 @@
import hashlib
# langchain's UnstructuredFileLoader is an optional dependency; fail at import
# time with an actionable message instead of a bare ImportError.
try:
    from langchain.document_loaders import UnstructuredFileLoader
except ImportError:
    # `from None` hides the original traceback so only the install hint shows.
    raise ImportError(
        'Unstructured file requires extra dependencies. Install with `pip install --upgrade "embedchain[dataloaders]"`'
    ) from None
from embedchain.helper.json_serializable import register_deserializable
from embedchain.loaders.base_loader import BaseLoader
from embedchain.utils import clean_string
@register_deserializable
class UnstructuredLoader(BaseLoader):
    """Loader that reads a file through langchain's ``UnstructuredFileLoader``."""

    def load_data(self, url):
        """Load the file at ``url`` and return its cleaned contents.

        Args:
            url: Location handed directly to ``UnstructuredFileLoader``.

        Returns:
            A dict with two keys: ``"doc_id"``, the SHA-256 hex digest of the
            space-joined cleaned contents followed by ``url``; and ``"data"``,
            a list of ``{"content", "meta_data"}`` dicts, one per loaded page.

        Raises:
            ValueError: If the loader returns no pages.
        """
        documents = UnstructuredFileLoader(url).load_and_split()
        if len(documents) == 0:
            raise ValueError("No data found")

        records = []
        cleaned_texts = []
        for document in documents:
            text = clean_string(document.page_content)
            # The page's own metadata dict is updated in place with the source.
            metadata = document.metadata
            metadata["url"] = url
            records.append({"content": text, "meta_data": metadata})
            cleaned_texts.append(text)

        # The id depends on both the cleaned text and the source location.
        digest_input = " ".join(cleaned_texts) + url
        return {
            "doc_id": hashlib.sha256(digest_input.encode()).hexdigest(),
            "data": records,
        }

View File

@@ -25,6 +25,7 @@ class IndirectDataType(Enum):
CSV = "csv"
MDX = "mdx"
IMAGES = "images"
UNSTRUCTURED = 'unstructured'
JSON = "json"
@@ -50,4 +51,5 @@ class DataType(Enum):
MDX = IndirectDataType.MDX.value
QNA_PAIR = SpecialDataType.QNA_PAIR.value
IMAGES = IndirectDataType.IMAGES.value
UNSTRUCTURED = IndirectDataType.UNSTRUCTURED.value
JSON = IndirectDataType.JSON.value