[Feature]: Unstructured File Loader Support - USF (#815)
This commit is contained in:
committed by
GitHub
parent
c8846e0e93
commit
8b64deab40
22
embedchain/chunkers/unstructured_file.py
Normal file
22
embedchain/chunkers/unstructured_file.py
Normal file
@@ -0,0 +1,22 @@
|
||||
from typing import Optional
|
||||
|
||||
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
||||
|
||||
from embedchain.chunkers.base_chunker import BaseChunker
|
||||
from embedchain.config.add_config import ChunkerConfig
|
||||
from embedchain.helper.json_serializable import register_deserializable
|
||||
|
||||
|
||||
@register_deserializable
class UnstructuredFileChunker(BaseChunker):
    """Chunker for Unstructured file.

    Wraps a ``RecursiveCharacterTextSplitter`` configured from a
    ``ChunkerConfig`` and hands it to ``BaseChunker``.
    """

    def __init__(self, config: Optional[ChunkerConfig] = None):
        """Build the text splitter and initialise the base chunker.

        Args:
            config: Chunking settings. When ``None``, a default config with
                ``chunk_size=1000``, ``chunk_overlap=0`` and ``len`` as the
                length function is used.
        """
        chunker_config = (
            config
            if config is not None
            else ChunkerConfig(chunk_size=1000, chunk_overlap=0, length_function=len)
        )
        # Only these three settings are forwarded to the splitter.
        splitter_kwargs = {
            "chunk_size": chunker_config.chunk_size,
            "chunk_overlap": chunker_config.chunk_overlap,
            "length_function": chunker_config.length_function,
        }
        super().__init__(RecursiveCharacterTextSplitter(**splitter_kwargs))
|
||||
@@ -6,6 +6,7 @@ from embedchain.chunkers.json import JSONChunker
|
||||
from embedchain.chunkers.mdx import MdxChunker
|
||||
from embedchain.chunkers.notion import NotionChunker
|
||||
from embedchain.chunkers.pdf_file import PdfFileChunker
|
||||
from embedchain.chunkers.unstructured_file import UnstructuredFileChunker
|
||||
from embedchain.chunkers.qna_pair import QnaPairChunker
|
||||
from embedchain.chunkers.sitemap import SitemapChunker
|
||||
from embedchain.chunkers.table import TableChunker
|
||||
@@ -30,6 +31,7 @@ from embedchain.loaders.sitemap import SitemapLoader
|
||||
from embedchain.loaders.web_page import WebPageLoader
|
||||
from embedchain.loaders.xml import XmlLoader
|
||||
from embedchain.loaders.youtube_video import YoutubeVideoLoader
|
||||
from embedchain.loaders.unstructured_file import UnstructuredLoader
|
||||
from embedchain.models.data_type import DataType
|
||||
|
||||
|
||||
@@ -77,6 +79,7 @@ class DataFormatter(JSONSerializable):
|
||||
DataType.CSV: CsvLoader,
|
||||
DataType.MDX: MdxLoader,
|
||||
DataType.IMAGES: ImagesLoader,
|
||||
DataType.UNSTRUCTURED: UnstructuredLoader,
|
||||
DataType.JSON: JSONLoader,
|
||||
}
|
||||
lazy_loaders = {DataType.NOTION}
|
||||
@@ -119,6 +122,7 @@ class DataFormatter(JSONSerializable):
|
||||
DataType.MDX: MdxChunker,
|
||||
DataType.IMAGES: ImagesChunker,
|
||||
DataType.XML: XmlChunker,
|
||||
DataType.UNSTRUCTURED: UnstructuredFileChunker,
|
||||
DataType.JSON: JSONChunker,
|
||||
}
|
||||
if data_type in chunker_classes:
|
||||
|
||||
40
embedchain/loaders/unstructured_file.py
Normal file
40
embedchain/loaders/unstructured_file.py
Normal file
@@ -0,0 +1,40 @@
|
||||
import hashlib
|
||||
|
||||
# The unstructured-file loader is an optional dependency; fail early with an
# actionable install hint if it is missing.
try:
    from langchain.document_loaders import UnstructuredFileLoader
except ImportError:
    # `from None` suppresses the original exception context so the user only
    # sees the install instruction below.
    raise ImportError(
        'Unstructured file requires extra dependencies. Install with `pip install --upgrade "embedchain[dataloaders]"`'
    ) from None
|
||||
from embedchain.helper.json_serializable import register_deserializable
|
||||
from embedchain.loaders.base_loader import BaseLoader
|
||||
from embedchain.utils import clean_string
|
||||
|
||||
|
||||
@register_deserializable
class UnstructuredLoader(BaseLoader):
    """Loader that reads a file through langchain's ``UnstructuredFileLoader``."""

    def load_data(self, url):
        """Load data from an unstructured file.

        Args:
            url: Location handed directly to ``UnstructuredFileLoader``
                (presumably a local file path -- confirm against callers).

        Returns:
            A dict with two keys:
                ``"doc_id"``: SHA-256 hex digest of all cleaned page contents
                joined by single spaces, followed by ``url``.
                ``"data"``: one ``{"content", "meta_data"}`` dict per page.

        Raises:
            ValueError: If the loader returns no pages.
        """
        documents = UnstructuredFileLoader(url).load_and_split()
        if not documents:
            raise ValueError("No data found")

        records = []
        cleaned_texts = []
        for document in documents:
            text = clean_string(document.page_content)
            # The page's own metadata dict is tagged with the source in place
            # (not copied), so the returned record shares it with the page.
            metadata = document.metadata
            metadata["url"] = url
            records.append({"content": text, "meta_data": metadata})
            cleaned_texts.append(text)

        # The id depends on both the cleaned content and the source location.
        digest = hashlib.sha256((" ".join(cleaned_texts) + url).encode())
        return {"doc_id": digest.hexdigest(), "data": records}
|
||||
@@ -25,6 +25,7 @@ class IndirectDataType(Enum):
|
||||
CSV = "csv"
|
||||
MDX = "mdx"
|
||||
IMAGES = "images"
|
||||
UNSTRUCTURED = 'unstructured'
|
||||
JSON = "json"
|
||||
|
||||
|
||||
@@ -50,4 +51,5 @@ class DataType(Enum):
|
||||
MDX = IndirectDataType.MDX.value
|
||||
QNA_PAIR = SpecialDataType.QNA_PAIR.value
|
||||
IMAGES = IndirectDataType.IMAGES.value
|
||||
UNSTRUCTURED = IndirectDataType.UNSTRUCTURED.value
|
||||
JSON = IndirectDataType.JSON.value
|
||||
|
||||
Reference in New Issue
Block a user