From f5f5e7edd1b3bf94145fc3965bc5e05d23c14210 Mon Sep 17 00:00:00 2001
From: cachho <admin@ch-webdev.com>
Date: Sun, 25 Jun 2023 19:43:41 +0200
Subject: [PATCH] feat: add local text (#44)

This commit extends the "add_local" function. It
adds support for taking text and indexing/embedding it.
---
 README.md                        |  9 +++++++++
 embedchain/chunkers/text.py      | 16 ++++++++++++++++
 embedchain/embedchain.py         |  6 +++++-
 embedchain/loaders/local_text.py | 10 ++++++++++
 4 files changed, 40 insertions(+), 1 deletion(-)
 create mode 100644 embedchain/chunkers/text.py
 create mode 100644 embedchain/loaders/local_text.py

diff --git a/README.md b/README.md
index 39812955..016f8528 100644
--- a/README.md
+++ b/README.md
@@ -116,6 +116,15 @@ To add any web page, use the data_type as `web_page`. Eg:
 app.add('web_page', 'a_valid_web_page_url')
 ```
 
+### Text
+
+To supply your own text, use the data_type as `text` and enter a string. The text is not processed, which makes this very versatile. Eg:
+
+```python
+app.add_local('text', 'Seek wealth, not money or status. Wealth is having assets that earn while you sleep. Money is how we transfer time and wealth. Status is your place in the social hierarchy.')
+```
+Note: This data type is not used in the examples because in most cases you will supply a whole paragraph or file, which would not fit there.
+
 ### QnA Pair
 
 To supply your own QnA pair, use the data_type as `qna_pair` and enter a tuple. Eg:
diff --git a/embedchain/chunkers/text.py b/embedchain/chunkers/text.py
new file mode 100644
index 00000000..bbf8e6b6
--- /dev/null
+++ b/embedchain/chunkers/text.py
@@ -0,0 +1,16 @@
+from embedchain.chunkers.base_chunker import BaseChunker
+
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+
+
+TEXT_SPLITTER_CHUNK_PARAMS = {  # keyword arguments that TextChunker unpacks into RecursiveCharacterTextSplitter
+    "chunk_size": 300,  # presumably the maximum size of one chunk, in length_function units -- confirm in langchain docs
+    "chunk_overlap": 0,  # presumably no text shared between consecutive chunks -- confirm in langchain docs
+    "length_function": len,  # builtin len is passed as the splitter's length_function
+}
+
+
+class TextChunker(BaseChunker):  # chunker that EmbedChain registers under the 'text' data type
+    def __init__(self):  # takes no arguments; configuration comes from TEXT_SPLITTER_CHUNK_PARAMS
+        text_splitter = RecursiveCharacterTextSplitter(**TEXT_SPLITTER_CHUNK_PARAMS)  # splitter built from the module-level params
+        super().__init__(text_splitter)  # hand the splitter to BaseChunker; how it is used is not visible in this patch
diff --git a/embedchain/embedchain.py b/embedchain/embedchain.py
index 0f12b230..e2b8302c 100644
--- a/embedchain/embedchain.py
+++ b/embedchain/embedchain.py
@@ -9,10 +9,12 @@ from embedchain.loaders.youtube_video import YoutubeVideoLoader
 from embedchain.loaders.pdf_file import PdfFileLoader
 from embedchain.loaders.web_page import WebPageLoader
 from embedchain.loaders.local_qna_pair import LocalQnaPairLoader
+from embedchain.loaders.local_text import LocalTextLoader
 from embedchain.chunkers.youtube_video import YoutubeVideoChunker
 from embedchain.chunkers.pdf_file import PdfFileChunker
 from embedchain.chunkers.web_page import WebPageChunker
 from embedchain.chunkers.qna_pair import QnaPairChunker
+from embedchain.chunkers.text import TextChunker
 from embedchain.vectordb.chroma_db import ChromaDB
 
 load_dotenv()
@@ -49,7 +51,8 @@ class EmbedChain:
             'youtube_video': YoutubeVideoLoader(),
             'pdf_file': PdfFileLoader(),
             'web_page': WebPageLoader(),
-            'qna_pair': LocalQnaPairLoader()
+            'qna_pair': LocalQnaPairLoader(),
+            'text': LocalTextLoader(),
         }
         if data_type in loaders:
             return loaders[data_type]
@@ -69,6 +72,7 @@ class EmbedChain:
             'pdf_file': PdfFileChunker(),
             'web_page': WebPageChunker(),
             'qna_pair': QnaPairChunker(),
+            'text': TextChunker(),
         }
         if data_type in chunkers:
             return chunkers[data_type]
diff --git a/embedchain/loaders/local_text.py b/embedchain/loaders/local_text.py
new file mode 100644
index 00000000..52ea143f
--- /dev/null
+++ b/embedchain/loaders/local_text.py
@@ -0,0 +1,10 @@
+class LocalTextLoader:  # loader that EmbedChain registers under the 'text' data type
+
+    def load_data(self, content):  # wraps `content` unchanged in a one-element list of dicts
+        meta_data = {
+            "url": "local",  # fixed marker value; no real URL is recorded for caller-supplied text
+        }
+        return [{  # always exactly one entry, regardless of the content's length
+            "content": content,  # passed through as given; no validation or conversion happens here
+            "meta_data": meta_data,
+        }]