From 797bb567c6d05e8561bab202eb85efd1b7c5a197 Mon Sep 17 00:00:00 2001
From: Deven Patel <iamdevenpatel@gmail.com>
Date: Wed, 25 Oct 2023 14:19:13 -0700
Subject: [PATCH] [feat]: Add openapi spec data loader (#818)

---
 README.md                                   |  1 +
 docs/data-sources/json.mdx                  | 36 ++++++++++++++++
 docs/data-sources/openapi.mdx               | 23 ++++++++++
 docs/data-sources/overview.mdx              |  2 +
 docs/mint.json                              |  2 +
 embedchain/chunkers/openapi.py              | 18 ++++++++
 embedchain/data_formatter/data_formatter.py |  4 ++
 embedchain/loaders/openapi.py               | 42 ++++++++++++++++++
 embedchain/models/data_type.py              |  2 +
 embedchain/utils.py                         | 48 +++++++++++++++++++++
 tests/chunkers/test_chunkers.py             |  2 +
 tests/embedchain/test_utils.py              |  6 +++
 tests/loaders/test_openapi.py               | 26 +++++++++++
 13 files changed, 212 insertions(+)
 create mode 100644 docs/data-sources/json.mdx
 create mode 100644 docs/data-sources/openapi.mdx
 create mode 100644 embedchain/chunkers/openapi.py
 create mode 100644 embedchain/loaders/openapi.py
 create mode 100644 tests/loaders/test_openapi.py

diff --git a/README.md b/README.md
index 6f545736..4e72b36b 100644
--- a/README.md
+++ b/README.md
@@ -47,6 +47,7 @@ Embedchain empowers you to create ChatGPT like apps, on your own dynamic dataset
 * Doc file
 * JSON file
 * Code documentation website loader
+* OpenAPI specs
 * Notion
 * Unstructured file loader and many more
 
diff --git a/docs/data-sources/json.mdx b/docs/data-sources/json.mdx
new file mode 100644
index 00000000..268009e9
--- /dev/null
+++ b/docs/data-sources/json.mdx
@@ -0,0 +1,36 @@
+---
+title: '📃 JSON'
+---
+
+To add any JSON file, use the data_type as `json`. `json` allows remote URLs and conventional file paths. Headers are included for each line, so if you have an `age` column, `18` will be added as `age: 18`. E.g.:
+
+```python
+import os
+
+from embedchain.apps.app import App
+
+os.environ["OPENAI_API_KEY"] = "openai_api_key"
+
+app = App()
+
+response = app.query("What is the net worth of Elon Musk as of October 2023?")
+
+print(response)
+"I'm sorry, but I don't have access to real-time information or future predictions. Therefore, I don't know the net worth of Elon Musk as of October 2023."
+
+source_id = app.add("temp.json")
+
+response = app.query("What is the net worth of Elon Musk as of October 2023?")
+
+print(response)
+"As of October 2023, Elon Musk's net worth is $255.2 billion."
+```
+
+```temp.json
+{
+    "question": "What is your net worth, Elon Musk?",
+    "answer": "As of October 2023, Elon Musk's net worth is $255.2 billion, making him one of the wealthiest individuals in the world."
+}
+```
+
+
diff --git a/docs/data-sources/openapi.mdx b/docs/data-sources/openapi.mdx
new file mode 100644
index 00000000..41c2b080
--- /dev/null
+++ b/docs/data-sources/openapi.mdx
@@ -0,0 +1,23 @@
+---
+title: 🙌 OpenAPI
+---
+
+To add any OpenAPI spec YAML file (a JSON spec file is currently detected as the JSON data type instead), use the data_type as `openapi`. `openapi` allows remote URLs and conventional file paths. Each top-level key of the spec is included with its value, so if the spec has an `openapi` key with the value `3.0.0`, it will be added as `openapi: 3.0.0`. E.g.:
+
+```python
+from embedchain.apps.app import App
+import os
+
+os.environ["OPENAI_API_KEY"] = "sk-xxx"
+
+app = App()
+
+app.add("https://github.com/openai/openai-openapi/blob/master/openapi.yaml", data_type="openapi")
+# Or add using the local file path
+# app.add("configs/openai_openapi.yaml", data_type="openapi")
+
+response = app.query("What can OpenAI API endpoint do? Can you list the things it can learn from?")
+# Answer: The OpenAI API endpoint allows users to interact with OpenAI's models and perform various tasks such as generating text, answering questions, summarizing documents, translating languages, and more. The specific capabilities and tasks that the API can learn from may vary depending on the models and features provided by OpenAI. For more detailed information, it is recommended to refer to the OpenAI API documentation at https://platform.openai.com/docs/api-reference.
+```
+
+NOTE: The YAML file added to the App must contain the required OpenAPI fields; otherwise, adding the OpenAPI spec will fail. Please refer to the [OpenAPI Spec Doc](https://spec.openapis.org/oas/v3.1.0).
\ No newline at end of file
diff --git a/docs/data-sources/overview.mdx b/docs/data-sources/overview.mdx
index c5372ce5..e31df662 100644
--- a/docs/data-sources/overview.mdx
+++ b/docs/data-sources/overview.mdx
@@ -6,6 +6,7 @@ Embedchain comes with built-in support for various data sources. We handle the c
 
 <CardGroup cols={4}>
   <Card title="📊 csv" href="/data-sources/csv"></Card>
+  <Card title="📃 JSON" href="/data-sources/json"></Card>
   <Card title="📚🌐 docs site" href="/data-sources/docs-site"></Card>
   <Card title="📄 docx" href="/data-sources/docx"></Card>
   <Card title="📝 mdx" href="/data-sources/mdx"></Card>
@@ -16,6 +17,7 @@ Embedchain comes with built-in support for various data sources. We handle the c
   <Card title="📝 text" href="/data-sources/text"></Card>
   <Card title="🌐📄 web page" href="/data-sources/web-page"></Card>
   <Card title="🧾 xml" href="/data-sources/xml"></Card>
+  <Card title="🙌 OpenAPI" href="/data-sources/openapi"></Card>
   <Card title="🎥📺 youtube video" href="/data-sources/youtube-video"></Card>
 </CardGroup>
 
diff --git a/docs/mint.json b/docs/mint.json
index 4aeb1b83..0aee9c78 100644
--- a/docs/mint.json
+++ b/docs/mint.json
@@ -46,6 +46,7 @@
           "group": "Supported data sources",
           "pages": [
             "data-sources/csv",
+            "data-sources/json",
             "data-sources/docs-site",
             "data-sources/docx",
             "data-sources/mdx",
@@ -55,6 +56,7 @@
             "data-sources/sitemap",
             "data-sources/text",
             "data-sources/web-page",
+            "data-sources/openapi",
             "data-sources/youtube-video"
           ]
         },
diff --git a/embedchain/chunkers/openapi.py b/embedchain/chunkers/openapi.py
new file mode 100644
index 00000000..fbe7b708
--- /dev/null
+++ b/embedchain/chunkers/openapi.py
@@ -0,0 +1,18 @@
+from typing import Optional
+
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+
+from embedchain.chunkers.base_chunker import BaseChunker
+from embedchain.config.add_config import ChunkerConfig
+
+
+class OpenAPIChunker(BaseChunker):
+    """Chunker for OpenAPI spec text, backed by a RecursiveCharacterTextSplitter."""
+
+    def __init__(self, config: Optional[ChunkerConfig] = None):
+        # Without an explicit config, use chunks of 1000 (measured by len) and no overlap.
+        if config is None:
+            config = ChunkerConfig(chunk_size=1000, chunk_overlap=0, length_function=len)
+        size, overlap, measure = config.chunk_size, config.chunk_overlap, config.length_function
+        splitter = RecursiveCharacterTextSplitter(chunk_size=size, chunk_overlap=overlap, length_function=measure)
+        super().__init__(splitter)
diff --git a/embedchain/data_formatter/data_formatter.py b/embedchain/data_formatter/data_formatter.py
index e45414f6..d9006421 100644
--- a/embedchain/data_formatter/data_formatter.py
+++ b/embedchain/data_formatter/data_formatter.py
@@ -5,6 +5,7 @@ from embedchain.chunkers.images import ImagesChunker
 from embedchain.chunkers.json import JSONChunker
 from embedchain.chunkers.mdx import MdxChunker
 from embedchain.chunkers.notion import NotionChunker
+from embedchain.chunkers.openapi import OpenAPIChunker
 from embedchain.chunkers.pdf_file import PdfFileChunker
 from embedchain.chunkers.qna_pair import QnaPairChunker
 from embedchain.chunkers.sitemap import SitemapChunker
@@ -26,6 +27,7 @@ from embedchain.loaders.json import JSONLoader
 from embedchain.loaders.local_qna_pair import LocalQnaPairLoader
 from embedchain.loaders.local_text import LocalTextLoader
 from embedchain.loaders.mdx import MdxLoader
+from embedchain.loaders.openapi import OpenAPILoader
 from embedchain.loaders.pdf_file import PdfFileLoader
 from embedchain.loaders.sitemap import SitemapLoader
 from embedchain.loaders.unstructured_file import UnstructuredLoader
@@ -81,6 +83,7 @@ class DataFormatter(JSONSerializable):
             DataType.IMAGES: ImagesLoader,
             DataType.UNSTRUCTURED: UnstructuredLoader,
             DataType.JSON: JSONLoader,
+            DataType.OPENAPI: OpenAPILoader,
         }
         lazy_loaders = {DataType.NOTION}
         if data_type in loaders:
@@ -124,6 +127,7 @@ class DataFormatter(JSONSerializable):
             DataType.XML: XmlChunker,
             DataType.UNSTRUCTURED: UnstructuredFileChunker,
             DataType.JSON: JSONChunker,
+            DataType.OPENAPI: OpenAPIChunker,
         }
         if data_type in chunker_classes:
             chunker_class: type = chunker_classes[data_type]
diff --git a/embedchain/loaders/openapi.py b/embedchain/loaders/openapi.py
new file mode 100644
index 00000000..f898b8ef
--- /dev/null
+++ b/embedchain/loaders/openapi.py
@@ -0,0 +1,42 @@
+import hashlib
+from io import StringIO
+from urllib.parse import urlparse
+
+import requests
+import yaml
+
+from embedchain.loaders.base_loader import BaseLoader
+
+
+class OpenAPILoader(BaseLoader):
+    """Load an OpenAPI YAML spec from a URL or a local path, one document per top-level key."""
+
+    @staticmethod
+    def _get_file_content(content):
+        """Return a text stream for an http(s)/file URL or plain path; other URL schemes raise ValueError."""
+        url = urlparse(content)
+        if all([url.scheme, url.netloc]) and url.scheme not in ["file", "http", "https"]:
+            raise ValueError("Not a valid URL.")
+
+        if url.scheme in ["http", "https"]:
+            response = requests.get(content)
+            response.raise_for_status()
+            return StringIO(response.text)
+        # A file:// URL carries the filesystem path in `url.path`; anything else is a plain path.
+        return open(url.path if url.scheme == "file" else content)
+
+    @staticmethod
+    def load_data(content):
+        """Load an OpenAPI yaml file as one document per top-level pair; raise ValueError if it is not a mapping."""
+        with OpenAPILoader._get_file_content(content=content) as file:
+            # The spec may come from a remote URL, so parse it without constructing arbitrary objects.
+            yaml_data = yaml.safe_load(file)
+        if not isinstance(yaml_data, dict):
+            raise ValueError("Invalid OpenAPI spec: the YAML document must be a mapping at the top level.")
+        data = []
+        for i, (key, value) in enumerate(yaml_data.items()):
+            string_data = f"{key}: {value}"
+            data.append({"content": string_data, "meta_data": {"url": content, "row": i + 1}})
+        # The id covers both the source location and the extracted text.
+        doc_id = hashlib.sha256((content + ", ".join(doc["content"] for doc in data)).encode()).hexdigest()
+        return {"doc_id": doc_id, "data": data}
diff --git a/embedchain/models/data_type.py b/embedchain/models/data_type.py
index 9b3eb80d..dd256fb6 100644
--- a/embedchain/models/data_type.py
+++ b/embedchain/models/data_type.py
@@ -27,6 +27,7 @@ class IndirectDataType(Enum):
     IMAGES = "images"
     UNSTRUCTURED = "unstructured"
     JSON = "json"
+    OPENAPI = "openapi"
 
 
 class SpecialDataType(Enum):
@@ -53,3 +54,4 @@ class DataType(Enum):
     IMAGES = IndirectDataType.IMAGES.value
     UNSTRUCTURED = IndirectDataType.UNSTRUCTURED.value
     JSON = IndirectDataType.JSON.value
+    OPENAPI = IndirectDataType.OPENAPI.value
diff --git a/embedchain/utils.py b/embedchain/utils.py
index e974b7f0..9b3e1629 100644
--- a/embedchain/utils.py
+++ b/embedchain/utils.py
@@ -115,6 +115,13 @@ def detect_datatype(source: Any) -> DataType:
     """
     from urllib.parse import urlparse
 
+    import requests
+    import yaml
+
+    def is_openapi_yaml(yaml_content):
+        # "openapi" and "info" are required top-level fields; non-mappings (empty or scalar YAML) never qualify
+        return isinstance(yaml_content, dict) and "openapi" in yaml_content and "info" in yaml_content
+
     try:
         if not isinstance(source, str):
             raise ValueError("Source is not a string and thus cannot be a URL.")
@@ -155,6 +162,31 @@ def detect_datatype(source: Any) -> DataType:
             logging.debug(f"Source of `{formatted_source}` detected as `docx`.")
             return DataType.DOCX
 
+        if url.path.endswith(".yaml"):
+            try:
+                response = requests.get(source)
+                response.raise_for_status()
+                try:
+                    yaml_content = yaml.safe_load(response.text)
+                except yaml.YAMLError as exc:
+                    logging.error(f"Error parsing YAML: {exc}")
+                    raise TypeError(f"Not a valid data type. Error loading YAML: {exc}")
+
+                if is_openapi_yaml(yaml_content):
+                    logging.debug(f"Source of `{formatted_source}` detected as `openapi`.")
+                    return DataType.OPENAPI
+                else:
+                    logging.error(
+                        f"Source of `{formatted_source}` does not contain all the required \
+                        fields of OpenAPI yaml. Check 'https://spec.openapis.org/oas/v3.1.0'"
+                    )
+                    raise TypeError(
+                        "Not a valid data type. Check 'https://spec.openapis.org/oas/v3.1.0', \
+                        make sure you have all the required fields in YAML config data"
+                    )
+            except requests.exceptions.RequestException as e:
+                logging.error(f"Error fetching URL {formatted_source}: {e}")
+
         if url.path.endswith(".json"):
             logging.debug(f"Source of `{formatted_source}` detected as `json_file`.")
             return DataType.JSON
@@ -198,6 +230,22 @@ def detect_datatype(source: Any) -> DataType:
             logging.debug(f"Source of `{formatted_source}` detected as `xml`.")
             return DataType.XML
 
+        if source.endswith(".yaml"):
+            with open(source, "r") as file:
+                yaml_content = yaml.safe_load(file)
+                if is_openapi_yaml(yaml_content):
+                    logging.debug(f"Source of `{formatted_source}` detected as `openapi`.")
+                    return DataType.OPENAPI
+                else:
+                    logging.error(
+                        f"Source of `{formatted_source}` does not contain all the required \
+                                  fields of OpenAPI yaml. Check 'https://spec.openapis.org/oas/v3.1.0'"
+                    )
+                    raise ValueError(
+                        "Invalid YAML data. Check 'https://spec.openapis.org/oas/v3.1.0', \
+                        make sure to add all the required params"
+                    )
+
         if source.endswith(".json"):
             logging.debug(f"Source of `{formatted_source}` detected as `json`.")
             return DataType.JSON
diff --git a/tests/chunkers/test_chunkers.py b/tests/chunkers/test_chunkers.py
index 29ddd28c..b7ee136e 100644
--- a/tests/chunkers/test_chunkers.py
+++ b/tests/chunkers/test_chunkers.py
@@ -3,6 +3,7 @@ from embedchain.chunkers.docx_file import DocxFileChunker
 from embedchain.chunkers.json import JSONChunker
 from embedchain.chunkers.mdx import MdxChunker
 from embedchain.chunkers.notion import NotionChunker
+from embedchain.chunkers.openapi import OpenAPIChunker
 from embedchain.chunkers.pdf_file import PdfFileChunker
 from embedchain.chunkers.qna_pair import QnaPairChunker
 from embedchain.chunkers.sitemap import SitemapChunker
@@ -29,6 +30,7 @@ chunker_common_config = {
     XmlChunker: {"chunk_size": 500, "chunk_overlap": 50, "length_function": len},
     YoutubeVideoChunker: {"chunk_size": 2000, "chunk_overlap": 0, "length_function": len},
     JSONChunker: {"chunk_size": 1000, "chunk_overlap": 0, "length_function": len},
+    OpenAPIChunker: {"chunk_size": 1000, "chunk_overlap": 0, "length_function": len},
 }
 
 
diff --git a/tests/embedchain/test_utils.py b/tests/embedchain/test_utils.py
index 2b23b094..819b1a14 100644
--- a/tests/embedchain/test_utils.py
+++ b/tests/embedchain/test_utils.py
@@ -39,6 +39,12 @@ class TestApp(unittest.TestCase):
     def test_detect_datatype_local_docx(self):
         self.assertEqual(detect_datatype("file:///home/user/document.docx"), DataType.DOCX)
 
+    def test_detect_data_type_json(self):
+        self.assertEqual(detect_datatype("https://www.example.com/data.json"), DataType.JSON)
+
+    def test_detect_data_type_local_json(self):
+        self.assertEqual(detect_datatype("file:///home/user/data.json"), DataType.JSON)
+
     @patch("os.path.isfile")
     def test_detect_datatype_regular_filesystem_docx(self, mock_isfile):
         with tempfile.NamedTemporaryFile(suffix=".docx", delete=True) as tmp:
diff --git a/tests/loaders/test_openapi.py b/tests/loaders/test_openapi.py
new file mode 100644
index 00000000..b39462c2
--- /dev/null
+++ b/tests/loaders/test_openapi.py
@@ -0,0 +1,26 @@
+import pytest
+
+from embedchain.loaders.openapi import OpenAPILoader
+
+
+@pytest.fixture
+def openapi_loader():  # provides an OpenAPILoader instance to the tests that request it
+    return OpenAPILoader()
+
+
+def test_load_data(openapi_loader, mocker):
+    """load_data yields one document per top-level YAML pair, tagged with url and row."""
+    spec_path = "configs/openai_openapi.yaml"
+    expected_documents = [
+        {"content": "key1: value1", "meta_data": {"url": spec_path, "row": 1}},
+        {"content": "key2: value2", "meta_data": {"url": spec_path, "row": 2}},
+    ]
+
+    # Avoid real I/O and hashing: `open` serves two YAML pairs and sha256 gives a fixed digest.
+    mocker.patch("builtins.open", mocker.mock_open(read_data="key1: value1\nkey2: value2"))
+    mocker.patch("hashlib.sha256", return_value=mocker.Mock(hexdigest=lambda: "mock_hash"))
+
+    loaded = openapi_loader.load_data(spec_path)
+
+    assert loaded["doc_id"] == "mock_hash"
+    assert loaded["data"] == expected_documents