feat: add local text (#44)

This commit extends the "add_local" function. It adds support for taking raw text and indexing/embedding it.
2023-06-25 19:43:41 +02:00
parent b9277c84c8
commit f5f5e7edd1
4 changed files with 40 additions and 1 deletions
--- a/README.md
+++ b/README.md
@@ -116,6 +116,15 @@ To add any web page, use the data_type as `web_page`. Eg:
 app.add('web_page', 'a_valid_web_page_url')
 ```

+### Text
+
+To supply your own text, use the data_type as `text` and enter a string. The text is not processed, so this can be very versatile. Eg:
+
+```python
+app.add_local('text', 'Seek wealth, not money or status. Wealth is having assets that earn while you sleep. Money is how we transfer time and wealth. Status is your place in the social hierarchy.')
+```
+Note: This is not used in the examples because in most cases you will supply a whole paragraph or file, which would not fit there.
+
 ### QnA Pair

 To supply your own QnA pair, use the data_type as `qna_pair` and enter a tuple. Eg:
--- a/embedchain/chunkers/text.py
+++ b/embedchain/chunkers/text.py
@@ -0,0 +1,16 @@
+from embedchain.chunkers.base_chunker import BaseChunker
+
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+
+
+TEXT_SPLITTER_CHUNK_PARAMS = {
+    "chunk_size": 300,
+    "chunk_overlap": 0,
+    "length_function": len,
+}
+
+
+class TextChunker(BaseChunker):
+    def __init__(self):
+        text_splitter = RecursiveCharacterTextSplitter(**TEXT_SPLITTER_CHUNK_PARAMS)
+        super().__init__(text_splitter)
--- a/embedchain/embedchain.py
+++ b/embedchain/embedchain.py
@@ -9,10 +9,12 @@ from embedchain.loaders.youtube_video import YoutubeVideoLoader
 from embedchain.loaders.pdf_file import PdfFileLoader
 from embedchain.loaders.web_page import WebPageLoader
 from embedchain.loaders.local_qna_pair import LocalQnaPairLoader
+from embedchain.loaders.local_text import LocalTextLoader
 from embedchain.chunkers.youtube_video import YoutubeVideoChunker
 from embedchain.chunkers.pdf_file import PdfFileChunker
 from embedchain.chunkers.web_page import WebPageChunker
 from embedchain.chunkers.qna_pair import QnaPairChunker
+from embedchain.chunkers.text import TextChunker
 from embedchain.vectordb.chroma_db import ChromaDB

 load_dotenv()
@@ -49,7 +51,8 @@ class EmbedChain:
            'youtube_video': YoutubeVideoLoader(),
            'pdf_file': PdfFileLoader(),
            'web_page': WebPageLoader(),
-            'qna_pair': LocalQnaPairLoader()
+            'qna_pair': LocalQnaPairLoader(),
+            'text': LocalTextLoader(),
        }
        if data_type in loaders:
            return loaders[data_type]
@@ -69,6 +72,7 @@ class EmbedChain:
            'pdf_file': PdfFileChunker(),
            'web_page': WebPageChunker(),
            'qna_pair': QnaPairChunker(),
+            'text': TextChunker(),
        }
        if data_type in chunkers:
            return chunkers[data_type]
--- a/embedchain/loaders/local_text.py
+++ b/embedchain/loaders/local_text.py
@@ -0,0 +1,10 @@
+class LocalTextLoader:
+
+    def load_data(self, content):
+        meta_data = {
+            "url": "local",
+        }
+        return [{
+            "content": content,
+            "meta_data": meta_data,
+        }]