[Feature] JSON data loader support (#816)

2023-10-18 13:53:15 -07:00
parent 4dc1785ef1
commit 7641cba01d
10 changed files with 99 additions and 4 deletions
--- a/embedchain/chunkers/json.py
+++ b/embedchain/chunkers/json.py
@@ -0,0 +1,22 @@
+from typing import Optional
+
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+
+from embedchain.chunkers.base_chunker import BaseChunker
+from embedchain.config.add_config import ChunkerConfig
+from embedchain.helper.json_serializable import register_deserializable
+
+
+@register_deserializable
+class JSONChunker(BaseChunker):
+    """Splits JSON-derived text with a recursive character text splitter."""
+
+    def __init__(self, config: Optional[ChunkerConfig] = None):
+        # With no config supplied: chunk size 1000, no overlap, length measured by `len`.
+        cfg = config if config is not None else ChunkerConfig(chunk_size=1000, chunk_overlap=0, length_function=len)
+        splitter_kwargs = {
+            "chunk_size": cfg.chunk_size,
+            "chunk_overlap": cfg.chunk_overlap,
+            "length_function": cfg.length_function,
+        }
+        super().__init__(RecursiveCharacterTextSplitter(**splitter_kwargs))
--- a/embedchain/data_formatter/data_formatter.py
+++ b/embedchain/data_formatter/data_formatter.py
@@ -2,6 +2,7 @@ from embedchain.chunkers.base_chunker import BaseChunker
 from embedchain.chunkers.docs_site import DocsSiteChunker
 from embedchain.chunkers.docx_file import DocxFileChunker
 from embedchain.chunkers.images import ImagesChunker
+from embedchain.chunkers.json import JSONChunker
 from embedchain.chunkers.mdx import MdxChunker
 from embedchain.chunkers.notion import NotionChunker
 from embedchain.chunkers.pdf_file import PdfFileChunker
@@ -20,6 +21,7 @@ from embedchain.loaders.csv import CsvLoader
 from embedchain.loaders.docs_site_loader import DocsSiteLoader
 from embedchain.loaders.docx_file import DocxFileLoader
 from embedchain.loaders.images import ImagesLoader
+from embedchain.loaders.json import JSONLoader
 from embedchain.loaders.local_qna_pair import LocalQnaPairLoader
 from embedchain.loaders.local_text import LocalTextLoader
 from embedchain.loaders.mdx import MdxLoader
@@ -75,6 +77,7 @@ class DataFormatter(JSONSerializable):
            DataType.CSV: CsvLoader,
            DataType.MDX: MdxLoader,
            DataType.IMAGES: ImagesLoader,
+            DataType.JSON: JSONLoader,
        }
        lazy_loaders = {DataType.NOTION}
        if data_type in loaders:
@@ -116,6 +119,7 @@ class DataFormatter(JSONSerializable):
            DataType.MDX: MdxChunker,
            DataType.IMAGES: ImagesChunker,
            DataType.XML: XmlChunker,
+            DataType.JSON: JSONChunker,
        }
        if data_type in chunker_classes:
            chunker_class: type = chunker_classes[data_type]
--- a/embedchain/loaders/json.py
+++ b/embedchain/loaders/json.py
@@ -0,0 +1,23 @@
+import hashlib
+
+from langchain.document_loaders.json_loader import JSONLoader as LcJSONLoader
+
+from embedchain.loaders.base_loader import BaseLoader
+
+langchain_json_jq_schema = r'to_entries | map("\(.key): \(.value|tostring)") | .[]'  # raw: \( is jq interpolation
+
+
+class JSONLoader(BaseLoader):
+    @staticmethod
+    def load_data(content):
+        """Read the JSON file at `content`; each key/value pair becomes one data point."""
+        documents = LcJSONLoader(content, text_content=False, jq_schema=langchain_json_jq_schema).load()
+        texts = [document.page_content for document in documents]
+        # Each entry records its source path and the `seq_num` found in the document's metadata.
+        entries = [
+            {"content": document.page_content, "meta_data": {"url": content, "row": document.metadata["seq_num"]}}
+            for document in documents
+        ]
+        # The id hashes the source path together with all extracted text.
+        digest = hashlib.sha256((content + ", ".join(texts)).encode())
+        return {"doc_id": digest.hexdigest(), "data": entries}
--- a/embedchain/models/data_type.py
+++ b/embedchain/models/data_type.py
@@ -25,6 +25,7 @@ class IndirectDataType(Enum):
    CSV = "csv"
    MDX = "mdx"
    IMAGES = "images"
+    JSON = "json"


 class SpecialDataType(Enum):
@@ -49,3 +50,4 @@ class DataType(Enum):
    MDX = IndirectDataType.MDX.value
    QNA_PAIR = SpecialDataType.QNA_PAIR.value
    IMAGES = IndirectDataType.IMAGES.value
+    JSON = IndirectDataType.JSON.value
--- a/embedchain/utils.py
+++ b/embedchain/utils.py
@@ -155,6 +155,10 @@ def detect_datatype(source: Any) -> DataType:
            logging.debug(f"Source of `{formatted_source}` detected as `docx`.")
            return DataType.DOCX

+        if url.path.endswith(".json"):
+            logging.debug(f"Source of `{formatted_source}` detected as `json_file`.")
+            return DataType.JSON
+
        if "docs" in url.netloc or ("docs" in url.path and url.scheme != "file"):
            # `docs_site` detection via path is not accepted for local filesystem URIs,
            # because that would mean all paths that contain `docs` are now doc sites, which is too aggressive.
@@ -194,6 +198,10 @@ def detect_datatype(source: Any) -> DataType:
            logging.debug(f"Source of `{formatted_source}` detected as `xml`.")
            return DataType.XML

+        if source.endswith(".json"):
+            logging.debug(f"Source of `{formatted_source}` detected as `json`.")
+            return DataType.JSON
+
        # If the source is a valid file, that's not detectable as a type, an error is raised.
        # It does not fallback to text.
        raise ValueError(