[Improvement] customize add method (#988)

2023-12-05 00:55:33 -08:00
parent 541b1cb7c7
commit 512cfc9466
7 changed files with 70 additions and 48 deletions
--- a/docs/data-sources/custom.mdx
+++ b/docs/data-sources/custom.mdx
@@ -0,0 +1,41 @@
 ---
 title: '⚙️ Custom'
 ---
 When we say "custom", we mean that you can customize the loader and chunker to your needs. This is done by passing a custom loader and chunker to the `add` method.
 ```python
 from embedchain import Pipeline as App
 import your_loader
 import your_chunker
 app = App()
 loader = your_loader()
 chunker = your_chunker()
 app.add("source", data_type="custom", loader=loader, chunker=chunker)
 ```
 <Note>
    The custom loader and chunker must be classes that inherit from the [`BaseLoader`](https://github.com/embedchain/embedchain/blob/main/embedchain/loaders/base_loader.py) and [`BaseChunker`](https://github.com/embedchain/embedchain/blob/main/embedchain/chunkers/base_chunker.py) classes, respectively.
 </Note>
 <Note>
    If the `data_type` is not a valid data type, the `add` method will fall back to the `custom` data type and expect a custom loader and chunker to be passed by the user.
 </Note>
 Example:
 ```python
 from embedchain import Pipeline as App
 from embedchain.loaders.github import GithubLoader
 app = App()
 loader = GithubLoader(config={"token": "ghp_xxx"})
 app.add("repo:embedchain/embedchain type:repo", data_type="github", loader=loader)
 app.query("What is Embedchain?")
 # Answer: Embedchain is a Data Platform for Large Language Models (LLMs). It allows users to seamlessly load, index, retrieve, and sync unstructured data in order to build dynamic, LLM-powered applications. There is also a JavaScript implementation called embedchain-js available on GitHub.
 ```
--- a/docs/data-sources/overview.mdx
+++ b/docs/data-sources/overview.mdx
@@ -26,6 +26,7 @@ Embedchain comes with built-in support for various data sources. We handle the c
  <Card title="🗨️ Discourse" href="/data-sources/discourse"></Card>
  <Card title="💬 Discord" href="/data-sources/discord"></Card>
  <Card title="📝 Github" href="/data-sources/github"></Card>
  <Card title="⚙️ Custom" href="/data-sources/custom"></Card>
 </CardGroup>
 <br/ >
--- a/embedchain/chunkers/common_chunker.py
+++ b/embedchain/chunkers/common_chunker.py
@@ -13,7 +13,7 @@ class CommonChunker(BaseChunker):
    def __init__(self, config: Optional[ChunkerConfig] = None):
        if config is None:
-            config = ChunkerConfig(chunk_size=1000, chunk_overlap=0, length_function=len)
+            config = ChunkerConfig(chunk_size=2000, chunk_overlap=0, length_function=len)
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=config.chunk_size,
            chunk_overlap=config.chunk_overlap,
--- a/embedchain/data_formatter/data_formatter.py
+++ b/embedchain/data_formatter/data_formatter.py
@@ -68,23 +68,13 @@ class DataFormatter(JSONSerializable):
            DataType.DISCORD: "embedchain.loaders.discord.DiscordLoader",
        }
-        custom_loaders = set(
+        if data_type == DataType.CUSTOM or ("loader" in kwargs):
-            [
+            loader_class: type = kwargs.get("loader", None)
-                DataType.POSTGRES,
+            if loader_class:
-                DataType.MYSQL,
+                return loader_class
-                DataType.SLACK,
+        elif data_type in loaders:
                DataType.DISCOURSE,
                DataType.GITHUB,
            ]
        )
        if data_type in loaders:
            loader_class: type = self._lazy_load(loaders[data_type])
            return loader_class()
        elif data_type in custom_loaders:
            loader_class: type = kwargs.get("loader", None)
            if loader_class is not None:
                return loader_class
        raise ValueError(
            f"Cant find the loader for {data_type}.\
@@ -112,28 +102,26 @@ class DataFormatter(JSONSerializable):
            DataType.OPENAPI: "embedchain.chunkers.openapi.OpenAPIChunker",
            DataType.GMAIL: "embedchain.chunkers.gmail.GmailChunker",
            DataType.NOTION: "embedchain.chunkers.notion.NotionChunker",
            DataType.POSTGRES: "embedchain.chunkers.postgres.PostgresChunker",
            DataType.MYSQL: "embedchain.chunkers.mysql.MySQLChunker",
            DataType.SLACK: "embedchain.chunkers.slack.SlackChunker",
            DataType.DISCOURSE: "embedchain.chunkers.discourse.DiscourseChunker",
            DataType.SUBSTACK: "embedchain.chunkers.substack.SubstackChunker",
            DataType.GITHUB: "embedchain.chunkers.common_chunker.CommonChunker",
            DataType.YOUTUBE_CHANNEL: "embedchain.chunkers.common_chunker.CommonChunker",
            DataType.DISCORD: "embedchain.chunkers.common_chunker.CommonChunker",
            DataType.CUSTOM: "embedchain.chunkers.common_chunker.CommonChunker",
        }
-        if data_type in chunker_classes:
+        if "chunker" in kwargs:
-            if "chunker" in kwargs:
+            chunker_class = kwargs.get("chunker", None)
-                chunker_class = kwargs.get("chunker")
+            if chunker_class:
-            else:
+                chunker = chunker_class(config)
-                chunker_class = self._lazy_load(chunker_classes[data_type])
+                chunker.set_data_type(data_type)
-
+                return chunker
        elif data_type in chunker_classes:
            chunker_class = self._lazy_load(chunker_classes[data_type])
            chunker = chunker_class(config)
            chunker.set_data_type(data_type)
            return chunker
-        else:
+
-            raise ValueError(
+        raise ValueError(
-                f"Cant find the chunker for {data_type}.\
+            f"Cant find the chunker for {data_type}.\
-                    We recommend to pass the chunker to use data_type: {data_type},\
+                We recommend to pass the chunker to use data_type: {data_type},\
-                        check `https://docs.embedchain.ai/data-sources/overview`."
+                    check `https://docs.embedchain.ai/data-sources/overview`."
-            )
+        )
--- a/embedchain/embedchain.py
+++ b/embedchain/embedchain.py
@@ -178,10 +178,10 @@ class EmbedChain(JSONSerializable):
            try:
                data_type = DataType(data_type)
            except ValueError:
-                raise ValueError(
+                logging.info(
-                    f"Invalid data_type: '{data_type}'.",
+                    f"Invalid data_type: '{data_type}', using `custom` instead.\n Check docs to pass the valid data type: `https://docs.embedchain.ai/data-sources/overview`"  # noqa: E501
-                    f"Please use one of the following: {[data_type.value for data_type in DataType]}",
+                )
-                ) from None
+                data_type = DataType.CUSTOM
        if not data_type:
            data_type = detect_datatype(source)
--- a/embedchain/models/data_type.py
+++ b/embedchain/models/data_type.py
@@ -29,14 +29,10 @@ class IndirectDataType(Enum):
    JSON = "json"
    OPENAPI = "openapi"
    GMAIL = "gmail"
    POSTGRES = "postgres"
    MYSQL = "mysql"
    SLACK = "slack"
    DISCOURSE = "discourse"
    SUBSTACK = "substack"
    GITHUB = "github"
    YOUTUBE_CHANNEL = "youtube_channel"
    DISCORD = "discord"
    CUSTOM = "custom"
 class SpecialDataType(Enum):
@@ -65,11 +61,7 @@ class DataType(Enum):
    JSON = IndirectDataType.JSON.value
    OPENAPI = IndirectDataType.OPENAPI.value
    GMAIL = IndirectDataType.GMAIL.value
    POSTGRES = IndirectDataType.POSTGRES.value
    MYSQL = IndirectDataType.MYSQL.value
    SLACK = IndirectDataType.SLACK.value
    DISCOURSE = IndirectDataType.DISCOURSE.value
    SUBSTACK = IndirectDataType.SUBSTACK.value
    GITHUB = IndirectDataType.GITHUB.value
    YOUTUBE_CHANNEL = IndirectDataType.YOUTUBE_CHANNEL.value
    DISCORD = IndirectDataType.DISCORD.value
    CUSTOM = IndirectDataType.CUSTOM.value
--- a/tests/chunkers/test_chunkers.py
+++ b/tests/chunkers/test_chunkers.py
@@ -40,7 +40,7 @@ chunker_common_config = {
    PostgresChunker: {"chunk_size": 1000, "chunk_overlap": 0, "length_function": len},
    SlackChunker: {"chunk_size": 1000, "chunk_overlap": 0, "length_function": len},
    DiscourseChunker: {"chunk_size": 1000, "chunk_overlap": 0, "length_function": len},
-    CommonChunker: {"chunk_size": 1000, "chunk_overlap": 0, "length_function": len},
+    CommonChunker: {"chunk_size": 2000, "chunk_overlap": 0, "length_function": len},
 }