From f5f5e7edd1b3bf94145fc3965bc5e05d23c14210 Mon Sep 17 00:00:00 2001 From: cachho Date: Sun, 25 Jun 2023 19:43:41 +0200 Subject: [PATCH] feat: add local text (#44) This commit extends the "add_local" function. It adds support for taking text and indexing/embedding it. --- README.md | 9 +++++++++ embedchain/chunkers/text.py | 16 ++++++++++++++++ embedchain/embedchain.py | 6 +++++- embedchain/loaders/local_text.py | 10 ++++++++++ 4 files changed, 40 insertions(+), 1 deletion(-) create mode 100644 embedchain/chunkers/text.py create mode 100644 embedchain/loaders/local_text.py diff --git a/README.md b/README.md index 39812955..016f8528 100644 --- a/README.md +++ b/README.md @@ -116,6 +116,15 @@ To add any web page, use the data_type as `web_page`. Eg: app.add('web_page', 'a_valid_web_page_url') ``` +### Text + +To supply your own text, use the data_type as `text` and enter a string. The text is not processed, this can be very versatile. Eg: + +```python +app.add_local('text', 'Seek wealth, not money or status. Wealth is having assets that earn while you sleep. Money is how we transfer time and wealth. Status is your place in the social hierarchy.') +``` +Note: This is not used in the examples because in most cases you will supply a whole paragraph or file, which did not fit. + ### QnA Pair To supply your own QnA pair, use the data_type as `qna_pair` and enter a tuple. 
Eg: diff --git a/embedchain/chunkers/text.py b/embedchain/chunkers/text.py new file mode 100644 index 00000000..bbf8e6b6 --- /dev/null +++ b/embedchain/chunkers/text.py @@ -0,0 +1,16 @@ +from embedchain.chunkers.base_chunker import BaseChunker + +from langchain.text_splitter import RecursiveCharacterTextSplitter + + +TEXT_SPLITTER_CHUNK_PARAMS = { + "chunk_size": 300, + "chunk_overlap": 0, + "length_function": len, +} + + +class TextChunker(BaseChunker): + def __init__(self): + text_splitter = RecursiveCharacterTextSplitter(**TEXT_SPLITTER_CHUNK_PARAMS) + super().__init__(text_splitter) diff --git a/embedchain/embedchain.py b/embedchain/embedchain.py index 0f12b230..e2b8302c 100644 --- a/embedchain/embedchain.py +++ b/embedchain/embedchain.py @@ -9,10 +9,12 @@ from embedchain.loaders.youtube_video import YoutubeVideoLoader from embedchain.loaders.pdf_file import PdfFileLoader from embedchain.loaders.web_page import WebPageLoader from embedchain.loaders.local_qna_pair import LocalQnaPairLoader +from embedchain.loaders.local_text import LocalTextLoader from embedchain.chunkers.youtube_video import YoutubeVideoChunker from embedchain.chunkers.pdf_file import PdfFileChunker from embedchain.chunkers.web_page import WebPageChunker from embedchain.chunkers.qna_pair import QnaPairChunker +from embedchain.chunkers.text import TextChunker from embedchain.vectordb.chroma_db import ChromaDB load_dotenv() @@ -49,7 +51,8 @@ class EmbedChain: 'youtube_video': YoutubeVideoLoader(), 'pdf_file': PdfFileLoader(), 'web_page': WebPageLoader(), - 'qna_pair': LocalQnaPairLoader() + 'qna_pair': LocalQnaPairLoader(), + 'text': LocalTextLoader(), } if data_type in loaders: return loaders[data_type] @@ -69,6 +72,7 @@ class EmbedChain: 'pdf_file': PdfFileChunker(), 'web_page': WebPageChunker(), 'qna_pair': QnaPairChunker(), + 'text': TextChunker(), } if data_type in chunkers: return chunkers[data_type] diff --git a/embedchain/loaders/local_text.py b/embedchain/loaders/local_text.py new file 
mode 100644 index 00000000..52ea143f --- /dev/null +++ b/embedchain/loaders/local_text.py @@ -0,0 +1,10 @@ +class LocalTextLoader: + + def load_data(self, content): + meta_data = { + "url": "local", + } + return [{ + "content": content, + "meta_data": meta_data, + }]