diff --git a/embedchain/data_formatter/data_formatter.py b/embedchain/data_formatter/data_formatter.py
index 28702871..c3486c62 100644
--- a/embedchain/data_formatter/data_formatter.py
+++ b/embedchain/data_formatter/data_formatter.py
@@ -1,41 +1,10 @@
+from importlib import import_module
+
 from embedchain.chunkers.base_chunker import BaseChunker
-from embedchain.chunkers.docs_site import DocsSiteChunker
-from embedchain.chunkers.docx_file import DocxFileChunker
-from embedchain.chunkers.gmail import GmailChunker
-from embedchain.chunkers.images import ImagesChunker
-from embedchain.chunkers.json import JSONChunker
-from embedchain.chunkers.mdx import MdxChunker
-from embedchain.chunkers.notion import NotionChunker
-from embedchain.chunkers.openapi import OpenAPIChunker
-from embedchain.chunkers.pdf_file import PdfFileChunker
-from embedchain.chunkers.qna_pair import QnaPairChunker
-from embedchain.chunkers.sitemap import SitemapChunker
-from embedchain.chunkers.table import TableChunker
-from embedchain.chunkers.text import TextChunker
-from embedchain.chunkers.unstructured_file import UnstructuredFileChunker
-from embedchain.chunkers.web_page import WebPageChunker
-from embedchain.chunkers.xml import XmlChunker
-from embedchain.chunkers.youtube_video import YoutubeVideoChunker
 from embedchain.config import AddConfig
 from embedchain.config.add_config import ChunkerConfig, LoaderConfig
 from embedchain.helper.json_serializable import JSONSerializable
 from embedchain.loaders.base_loader import BaseLoader
-from embedchain.loaders.csv import CsvLoader
-from embedchain.loaders.docs_site_loader import DocsSiteLoader
-from embedchain.loaders.docx_file import DocxFileLoader
-from embedchain.loaders.gmail import GmailLoader
-from embedchain.loaders.images import ImagesLoader
-from embedchain.loaders.json import JSONLoader
-from embedchain.loaders.local_qna_pair import LocalQnaPairLoader
-from embedchain.loaders.local_text import LocalTextLoader
-from embedchain.loaders.mdx import MdxLoader
-from embedchain.loaders.openapi import OpenAPILoader
-from embedchain.loaders.pdf_file import PdfFileLoader
-from embedchain.loaders.sitemap import SitemapLoader
-from embedchain.loaders.unstructured_file import UnstructuredLoader
-from embedchain.loaders.web_page import WebPageLoader
-from embedchain.loaders.xml import XmlLoader
-from embedchain.loaders.youtube_video import YoutubeVideoLoader
 from embedchain.models.data_type import DataType
 
 
@@ -58,6 +27,19 @@ class DataFormatter(JSONSerializable):
         self.loader = self._get_loader(data_type=data_type, config=config.loader)
         self.chunker = self._get_chunker(data_type=data_type, config=config.chunker)
 
+    def _lazy_load(self, module_path: str) -> type:
+        """
+        Import a module on demand and return one attribute from it.
+
+        :param module_path: Dotted path whose last component is the attribute name.
+        :type module_path: str
+        :return: The attribute named by the last component of ``module_path``.
+        :rtype: type
+        """
+        module_name, class_name = module_path.rsplit(".", 1)
+        module = import_module(module_name)
+        return getattr(module, class_name)
+
     def _get_loader(self, data_type: DataType, config: LoaderConfig) -> BaseLoader:
         """
         Returns the appropriate data loader for the given data type.
@@ -71,71 +53,64 @@ class DataFormatter(JSONSerializable):
         :rtype: BaseLoader
         """
         loaders = {
-            DataType.YOUTUBE_VIDEO: YoutubeVideoLoader,
-            DataType.PDF_FILE: PdfFileLoader,
-            DataType.WEB_PAGE: WebPageLoader,
-            DataType.QNA_PAIR: LocalQnaPairLoader,
-            DataType.TEXT: LocalTextLoader,
-            DataType.DOCX: DocxFileLoader,
-            DataType.SITEMAP: SitemapLoader,
-            DataType.XML: XmlLoader,
-            DataType.DOCS_SITE: DocsSiteLoader,
-            DataType.CSV: CsvLoader,
-            DataType.MDX: MdxLoader,
-            DataType.IMAGES: ImagesLoader,
-            DataType.UNSTRUCTURED: UnstructuredLoader,
-            DataType.JSON: JSONLoader,
-            DataType.OPENAPI: OpenAPILoader,
-            DataType.GMAIL: GmailLoader,
+            DataType.YOUTUBE_VIDEO: "embedchain.loaders.youtube_video.YoutubeVideoLoader",
+            DataType.PDF_FILE: "embedchain.loaders.pdf_file.PdfFileLoader",
+            DataType.WEB_PAGE: "embedchain.loaders.web_page.WebPageLoader",
+            DataType.QNA_PAIR: "embedchain.loaders.local_qna_pair.LocalQnaPairLoader",
+            DataType.TEXT: "embedchain.loaders.local_text.LocalTextLoader",
+            DataType.DOCX: "embedchain.loaders.docx_file.DocxFileLoader",
+            DataType.SITEMAP: "embedchain.loaders.sitemap.SitemapLoader",
+            DataType.XML: "embedchain.loaders.xml.XmlLoader",
+            DataType.DOCS_SITE: "embedchain.loaders.docs_site_loader.DocsSiteLoader",
+            DataType.CSV: "embedchain.loaders.csv.CsvLoader",
+            DataType.MDX: "embedchain.loaders.mdx.MdxLoader",
+            DataType.IMAGES: "embedchain.loaders.images.ImagesLoader",
+            DataType.UNSTRUCTURED: "embedchain.loaders.unstructured_file.UnstructuredLoader",
+            DataType.JSON: "embedchain.loaders.json.JSONLoader",
+            DataType.OPENAPI: "embedchain.loaders.openapi.OpenAPILoader",
+            DataType.GMAIL: "embedchain.loaders.gmail.GmailLoader",
+            DataType.NOTION: "embedchain.loaders.notion.NotionLoader",
         }
-        lazy_loaders = {DataType.NOTION}
         if data_type in loaders:
-            loader_class: type = loaders[data_type]
-            loader: BaseLoader = loader_class()
-            return loader
-        elif data_type in lazy_loaders:
-            if data_type == DataType.NOTION:
-                from embedchain.loaders.notion import NotionLoader
-
-                return NotionLoader()
-            else:
-                raise ValueError(f"Unsupported data type: {data_type}")
+            loader_class: type = self._lazy_load(loaders[data_type])
+            return loader_class()
         else:
             raise ValueError(f"Unsupported data type: {data_type}")
 
     def _get_chunker(self, data_type: DataType, config: ChunkerConfig) -> BaseChunker:
         """Returns the appropriate chunker for the given data type.
 
         :param data_type: The type of the data to chunk.
         :type data_type: DataType
         :param config: Config to initialize the chunker with.
         :type config: ChunkerConfig
         :raises ValueError: If an unsupported data type is provided.
         :return: The chunker for the given data type.
         :rtype: BaseChunker
         """
         chunker_classes = {
-            DataType.YOUTUBE_VIDEO: YoutubeVideoChunker,
-            DataType.PDF_FILE: PdfFileChunker,
-            DataType.WEB_PAGE: WebPageChunker,
-            DataType.QNA_PAIR: QnaPairChunker,
-            DataType.TEXT: TextChunker,
-            DataType.DOCX: DocxFileChunker,
-            DataType.DOCS_SITE: DocsSiteChunker,
-            DataType.SITEMAP: SitemapChunker,
-            DataType.NOTION: NotionChunker,
-            DataType.CSV: TableChunker,
-            DataType.MDX: MdxChunker,
-            DataType.IMAGES: ImagesChunker,
-            DataType.XML: XmlChunker,
-            DataType.UNSTRUCTURED: UnstructuredFileChunker,
-            DataType.JSON: JSONChunker,
-            DataType.OPENAPI: OpenAPIChunker,
-            DataType.GMAIL: GmailChunker,
+            DataType.YOUTUBE_VIDEO: "embedchain.chunkers.youtube_video.YoutubeVideoChunker",
+            DataType.PDF_FILE: "embedchain.chunkers.pdf_file.PdfFileChunker",
+            DataType.WEB_PAGE: "embedchain.chunkers.web_page.WebPageChunker",
+            DataType.QNA_PAIR: "embedchain.chunkers.qna_pair.QnaPairChunker",
+            DataType.TEXT: "embedchain.chunkers.text.TextChunker",
+            DataType.DOCX: "embedchain.chunkers.docx_file.DocxFileChunker",
+            DataType.SITEMAP: "embedchain.chunkers.sitemap.SitemapChunker",
+            DataType.XML: "embedchain.chunkers.xml.XmlChunker",
+            DataType.DOCS_SITE: "embedchain.chunkers.docs_site.DocsSiteChunker",
+            DataType.CSV: "embedchain.chunkers.table.TableChunker",
+            DataType.MDX: "embedchain.chunkers.mdx.MdxChunker",
+            DataType.IMAGES: "embedchain.chunkers.images.ImagesChunker",
+            DataType.UNSTRUCTURED: "embedchain.chunkers.unstructured_file.UnstructuredFileChunker",
+            DataType.JSON: "embedchain.chunkers.json.JSONChunker",
+            DataType.OPENAPI: "embedchain.chunkers.openapi.OpenAPIChunker",
+            DataType.GMAIL: "embedchain.chunkers.gmail.GmailChunker",
+            DataType.NOTION: "embedchain.chunkers.notion.NotionChunker",
         }
+
         if data_type in chunker_classes:
-            chunker_class: type = chunker_classes[data_type]
+            chunker_class: type = self._lazy_load(chunker_classes[data_type])
             chunker: BaseChunker = chunker_class(config)
             chunker.set_data_type(data_type)
             return chunker
         else:
diff --git a/pyproject.toml b/pyproject.toml
index efec4599..cb747c81 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "embedchain"
-version = "0.0.84"
+version = "0.0.85"
 description = "Data platform for LLMs - Load, index, retrieve and sync any unstructured data"
 authors = [
     "Taranjeet Singh ",