[Feature] JSON data loader support (#816)

This commit is contained in:
Deven Patel
2023-10-18 13:53:15 -07:00
committed by GitHub
parent 4dc1785ef1
commit 7641cba01d
10 changed files with 99 additions and 4 deletions

Makefile

@@ -38,7 +38,7 @@ lint:
 	poetry run ruff .

 test:
-	poetry run pytest
+	poetry run pytest $(file)

 coverage:
 	poetry run pytest --cov=$(PROJECT_NAME) --cov-report=xml
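The new `$(file)` variable makes the pytest target parameterizable: plain `make test` still runs the whole suite (an unset Make variable expands to nothing), while something like `make test file=tests/loaders/test_json.py` (path illustrative) runs a single file.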

README.md

@@ -45,6 +45,7 @@ Embedchain empowers you to create ChatGPT like apps, on your own dynamic dataset
 * Web page
 * Sitemap
 * Doc file
+* JSON file
 * Code documentation website loader
 * Notion and many more.

embedchain/chunkers/json.py

@@ -0,0 +1,22 @@
from typing import Optional

from langchain.text_splitter import RecursiveCharacterTextSplitter

from embedchain.chunkers.base_chunker import BaseChunker
from embedchain.config.add_config import ChunkerConfig
from embedchain.helper.json_serializable import register_deserializable


@register_deserializable
class JSONChunker(BaseChunker):
    """Chunker for json."""

    def __init__(self, config: Optional[ChunkerConfig] = None):
        if config is None:
            config = ChunkerConfig(chunk_size=1000, chunk_overlap=0, length_function=len)
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=config.chunk_size,
            chunk_overlap=config.chunk_overlap,
            length_function=config.length_function,
        )
        super().__init__(text_splitter)
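A minimal usage sketch for the new chunker; the override values below are illustrative, not part of the commit:

    from embedchain.chunkers.json import JSONChunker
    from embedchain.config.add_config import ChunkerConfig

    # No config: falls back to chunk_size=1000, chunk_overlap=0, as above.
    default_chunker = JSONChunker()

    # An explicit config overrides the splitter settings.
    custom_chunker = JSONChunker(ChunkerConfig(chunk_size=500, chunk_overlap=50, length_function=len))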

embedchain/data_formatter/data_formatter.py

@@ -2,6 +2,7 @@ from embedchain.chunkers.base_chunker import BaseChunker
 from embedchain.chunkers.docs_site import DocsSiteChunker
 from embedchain.chunkers.docx_file import DocxFileChunker
 from embedchain.chunkers.images import ImagesChunker
+from embedchain.chunkers.json import JSONChunker
 from embedchain.chunkers.mdx import MdxChunker
 from embedchain.chunkers.notion import NotionChunker
 from embedchain.chunkers.pdf_file import PdfFileChunker
@@ -20,6 +21,7 @@ from embedchain.loaders.csv import CsvLoader
 from embedchain.loaders.docs_site_loader import DocsSiteLoader
 from embedchain.loaders.docx_file import DocxFileLoader
 from embedchain.loaders.images import ImagesLoader
+from embedchain.loaders.json import JSONLoader
 from embedchain.loaders.local_qna_pair import LocalQnaPairLoader
 from embedchain.loaders.local_text import LocalTextLoader
 from embedchain.loaders.mdx import MdxLoader
@@ -75,6 +77,7 @@ class DataFormatter(JSONSerializable):
             DataType.CSV: CsvLoader,
             DataType.MDX: MdxLoader,
             DataType.IMAGES: ImagesLoader,
+            DataType.JSON: JSONLoader,
         }
         lazy_loaders = {DataType.NOTION}
         if data_type in loaders:
@@ -116,6 +119,7 @@ class DataFormatter(JSONSerializable):
             DataType.MDX: MdxChunker,
             DataType.IMAGES: ImagesChunker,
             DataType.XML: XmlChunker,
+            DataType.JSON: JSONChunker,
         }
         if data_type in chunker_classes:
             chunker_class: type = chunker_classes[data_type]
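With both registries updated, DataFormatter resolves DataType.JSON to JSONLoader plus JSONChunker, so the end-to-end flow is just the usual add call. A sketch, assuming a default App (which expects an OpenAI key) and an illustrative file name:

    from embedchain import App

    app = App()
    app.add("sample.json")  # type auto-detected as json; see detect_datatype below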

embedchain/loaders/json.py

@@ -0,0 +1,23 @@
import hashlib

from langchain.document_loaders.json_loader import JSONLoader as LcJSONLoader

from embedchain.loaders.base_loader import BaseLoader

langchain_json_jq_schema = 'to_entries | map("\(.key): \(.value|tostring)") | .[]'


class JSONLoader(BaseLoader):
    @staticmethod
    def load_data(content):
        """Load a json file. Each data point is a key value pair."""
        data = []
        data_content = []
        loader = LcJSONLoader(content, text_content=False, jq_schema=langchain_json_jq_schema)
        docs = loader.load()
        for doc in docs:
            meta_data = doc.metadata
            data.append({"content": doc.page_content, "meta_data": {"url": content, "row": meta_data["seq_num"]}})
            data_content.append(doc.page_content)
        doc_id = hashlib.sha256((content + ", ".join(data_content)).encode()).hexdigest()
        return {"doc_id": doc_id, "data": data}

embedchain/models/data_type.py

@@ -25,6 +25,7 @@ class IndirectDataType(Enum):
CSV = "csv" CSV = "csv"
MDX = "mdx" MDX = "mdx"
IMAGES = "images" IMAGES = "images"
JSON = "json"
class SpecialDataType(Enum): class SpecialDataType(Enum):
@@ -49,3 +50,4 @@ class DataType(Enum):
     MDX = IndirectDataType.MDX.value
     QNA_PAIR = SpecialDataType.QNA_PAIR.value
     IMAGES = IndirectDataType.IMAGES.value
+    JSON = IndirectDataType.JSON.value
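Because the enum value is the plain string "json", callers can also name the type explicitly instead of relying on detection. A small sketch (the app.add form is illustrative):

    from embedchain.models.data_type import DataType

    assert DataType.JSON.value == "json"
    # equivalent to letting detection infer it: app.add("sample.json", data_type="json")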

embedchain/utils.py

@@ -155,6 +155,10 @@ def detect_datatype(source: Any) -> DataType:
logging.debug(f"Source of `{formatted_source}` detected as `docx`.") logging.debug(f"Source of `{formatted_source}` detected as `docx`.")
return DataType.DOCX return DataType.DOCX
if url.path.endswith(".json"):
logging.debug(f"Source of `{formatted_source}` detected as `json_file`.")
return DataType.JSON
if "docs" in url.netloc or ("docs" in url.path and url.scheme != "file"): if "docs" in url.netloc or ("docs" in url.path and url.scheme != "file"):
# `docs_site` detection via path is not accepted for local filesystem URIs, # `docs_site` detection via path is not accepted for local filesystem URIs,
# because that would mean all paths that contain `docs` are now doc sites, which is too aggressive. # because that would mean all paths that contain `docs` are now doc sites, which is too aggressive.
@@ -194,6 +198,10 @@ def detect_datatype(source: Any) -> DataType:
logging.debug(f"Source of `{formatted_source}` detected as `xml`.") logging.debug(f"Source of `{formatted_source}` detected as `xml`.")
return DataType.XML return DataType.XML
if source.endswith(".json"):
logging.debug(f"Source of `{formatted_source}` detected as `json`.")
return DataType.JSON
# If the source is a valid file, that's not detectable as a type, an error is raised. # If the source is a valid file, that's not detectable as a type, an error is raised.
# It does not fallback to text. # It does not fallback to text.
raise ValueError( raise ValueError(
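Detection stays purely extension-based in both branches, for URLs and local paths alike. A sketch, assuming detect_datatype is importable from embedchain.utils as in this commit:

    from embedchain.models.data_type import DataType
    from embedchain.utils import detect_datatype

    # URL branch: matched on the parsed URL's path.
    assert detect_datatype("https://example.com/data.json") == DataType.JSON

    # Local branch: matched on the filename, assuming the file exists on disk
    # (the checks above run inside the valid-file branch):
    # assert detect_datatype("existing_local.json") == DataType.JSON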

pyproject.toml

@@ -120,9 +120,10 @@ torchvision = { version = ">=0.15.1, !=0.15.2", optional = true }
 ftfy = { version = "6.1.1", optional = true }
 regex = { version = "2023.8.8", optional = true }
 huggingface_hub = { version = "^0.17.3", optional = true }
-pymilvus = { version="2.3.1", optional = true }
-google-cloud-aiplatform = { version="^1.26.1", optional = true }
-replicate = { version="^0.15.4", optional = true }
+pymilvus = { version = "2.3.1", optional = true }
+google-cloud-aiplatform = { version = "^1.26.1", optional = true }
+replicate = { version = "^0.15.4", optional = true }
+jq = { version=">=1.6.0", optional = true}

 [tool.poetry.group.dev.dependencies]
 black = "^23.3.0"
@@ -163,6 +164,7 @@ dataloaders=[
"docx2txt", "docx2txt",
"unstructured", "unstructured",
"sentence-transformers", "sentence-transformers",
"jq",
] ]
vertexai = ["google-cloud-aiplatform"] vertexai = ["google-cloud-aiplatform"]
llama2 = ["replicate"] llama2 = ["replicate"]

tests/chunkers/test_chunkers.py

@@ -10,6 +10,7 @@ from embedchain.chunkers.text import TextChunker
 from embedchain.chunkers.web_page import WebPageChunker
 from embedchain.chunkers.xml import XmlChunker
 from embedchain.chunkers.youtube_video import YoutubeVideoChunker
+from embedchain.chunkers.json import JSONChunker
 from embedchain.config.add_config import ChunkerConfig

 chunker_config = ChunkerConfig(chunk_size=500, chunk_overlap=0, length_function=len)
@@ -27,6 +28,7 @@ chunker_common_config = {
     WebPageChunker: {"chunk_size": 500, "chunk_overlap": 0, "length_function": len},
     XmlChunker: {"chunk_size": 500, "chunk_overlap": 50, "length_function": len},
     YoutubeVideoChunker: {"chunk_size": 2000, "chunk_overlap": 0, "length_function": len},
+    JSONChunker: {"chunk_size": 1000, "chunk_overlap": 0, "length_function": len},
 }

tests/loaders/test_json.py

@@ -0,0 +1,31 @@
import hashlib
from unittest.mock import patch

from langchain.docstore.document import Document
from langchain.document_loaders.json_loader import JSONLoader as LcJSONLoader

from embedchain.loaders.json import JSONLoader


def test_load_data():
    mock_document = [
        Document(page_content="content1", metadata={"seq_num": 1}),
        Document(page_content="content2", metadata={"seq_num": 2}),
    ]

    with patch.object(LcJSONLoader, "load", return_value=mock_document):
        content = "temp.json"
        result = JSONLoader.load_data(content)

        assert "doc_id" in result
        assert "data" in result

        expected_data = [
            {"content": "content1", "meta_data": {"url": content, "row": 1}},
            {"content": "content2", "meta_data": {"url": content, "row": 2}},
        ]
        assert result["data"] == expected_data

        expected_doc_id = hashlib.sha256((content + ", ".join(["content1", "content2"])).encode()).hexdigest()
        assert result["doc_id"] == expected_doc_id