Add support for image dataset (#571)

Co-authored-by: Rupesh Bansal <rupeshbansal@Shankars-MacBook-Air.local>
2023-10-04 09:50:40 +05:30
parent 55e9a1cbd6
commit d0af018b8d
19 changed files with 498 additions and 31 deletions
--- a/embedchain/chunkers/base_chunker.py
+++ b/embedchain/chunkers/base_chunker.py
@@ -66,3 +66,6 @@ class BaseChunker(JSONSerializable):
        self.data_type = data_type

        # TODO: This should be done during initialization. This means it has to be done in the child classes.
+
+    def get_word_count(self, documents):
+        return sum([len(document.split(" ")) for document in documents])
--- a/embedchain/chunkers/images.py
+++ b/embedchain/chunkers/images.py
@@ -0,0 +1,63 @@
+import hashlib
+from typing import Optional
+
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+
+from embedchain.chunkers.base_chunker import BaseChunker
+from embedchain.config.add_config import ChunkerConfig
+
+
+class ImagesChunker(BaseChunker):
+    """Chunker for an Image."""
+
+    def __init__(self, config: Optional[ChunkerConfig] = None):
+        if config is None:
+            config = ChunkerConfig(chunk_size=300, chunk_overlap=0, length_function=len)
+        image_splitter = RecursiveCharacterTextSplitter(
+            chunk_size=config.chunk_size,
+            chunk_overlap=config.chunk_overlap,
+            length_function=config.length_function,
+        )
+        super().__init__(image_splitter)
+
+    def create_chunks(self, loader, src):
+        """
+        Loads the image(s), and creates their corresponding embedding. This creates one chunk for each image
+
+        :param loader: The loader whose `load_data` method is used to create
+        the raw data.
+        :param src: The data to be handled by the loader. Can be a URL for
+        remote sources or local content for local loaders.
+        """
+        documents = []
+        embeddings = []
+        ids = []
+        data_result = loader.load_data(src)
+        data_records = data_result["data"]
+        doc_id = data_result["doc_id"]
+        metadatas = []
+        for data in data_records:
+            meta_data = data["meta_data"]
+            # add data type to meta data to allow query using data type
+            meta_data["data_type"] = self.data_type.value
+            chunk_id = hashlib.sha256(meta_data["url"].encode()).hexdigest()
+            ids.append(chunk_id)
+            documents.append(data["content"])
+            embeddings.append(data["embedding"])
+            meta_data["doc_id"] = doc_id
+            metadatas.append(meta_data)
+
+        return {
+            "documents": documents,
+            "embeddings": embeddings,
+            "ids": ids,
+            "metadatas": metadatas,
+            "doc_id": doc_id,
+        }
+
+    def get_word_count(self, documents):
+        """
+        The number of chunks and the corresponding word count for an image is fixed to 1, as 1 embedding is created for
+        each image
+        """
+        return 1