Add GPT4Vision Image loader (#1089)
Co-authored-by: Deshraj Yadav <deshrajdry@gmail.com>
This commit is contained in:
22
embedchain/chunkers/image.py
Normal file
22
embedchain/chunkers/image.py
Normal file
@@ -0,0 +1,22 @@
|
||||
from typing import Optional
|
||||
|
||||
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
||||
|
||||
from embedchain.chunkers.base_chunker import BaseChunker
|
||||
from embedchain.config.add_config import ChunkerConfig
|
||||
from embedchain.helpers.json_serializable import register_deserializable
|
||||
|
||||
|
||||
@register_deserializable
class ImageChunker(BaseChunker):
    """Chunker for Images."""

    def __init__(self, config: Optional[ChunkerConfig] = None):
        """Set up the chunker with a text splitter built from *config*.

        :param config: optional chunking configuration; when omitted, a
            default of 2000-character chunks with no overlap is used.
        """
        # Fall back to sensible defaults when the caller supplies no config.
        if config is None:
            config = ChunkerConfig(chunk_size=2000, chunk_overlap=0, length_function=len)
        # Collect the splitter parameters once, then hand the splitter to the base class.
        splitter_kwargs = {
            "chunk_size": config.chunk_size,
            "chunk_overlap": config.chunk_overlap,
            "length_function": config.length_function,
        }
        super().__init__(RecursiveCharacterTextSplitter(**splitter_kwargs))
|
||||
@@ -1,67 +0,0 @@
|
||||
import hashlib
|
||||
import logging
|
||||
from typing import Optional
|
||||
|
||||
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
||||
|
||||
from embedchain.chunkers.base_chunker import BaseChunker
|
||||
from embedchain.config.add_config import ChunkerConfig
|
||||
|
||||
|
||||
class ImagesChunker(BaseChunker):
    """Chunker for an Image.

    Each image produces exactly one chunk (one embedding per image); the
    text splitter passed to the base class is never used to split content.
    """

    def __init__(self, config: Optional[ChunkerConfig] = None):
        """Initialize the chunker.

        :param config: optional chunking configuration; when omitted, a
            default of 300-character chunks with no overlap is used.
        """
        if config is None:
            config = ChunkerConfig(chunk_size=300, chunk_overlap=0, length_function=len)
        image_splitter = RecursiveCharacterTextSplitter(
            chunk_size=config.chunk_size,
            chunk_overlap=config.chunk_overlap,
            length_function=config.length_function,
        )
        super().__init__(image_splitter)

    def create_chunks(self, loader, src, app_id=None, config: Optional[ChunkerConfig] = None):
        """
        Loads the image(s), and creates their corresponding embedding. This creates one chunk for each image.

        :param loader: The loader whose `load_data` method is used to create
            the raw data. Expected to return a dict with "data" (records) and "doc_id".
        :param src: The data to be handled by the loader. Can be a URL for
            remote sources or local content for local loaders.
        :param app_id: Optional application id, prefixed onto the doc id as "{app_id}--{doc_id}".
        :param config: Optional chunker config; its ``min_chunk_size`` (default 0)
            filters out images whose content is shorter than the threshold.
        :return: dict with keys "documents", "embeddings", "ids", "metadatas", "doc_id".
        """
        documents = []
        embeddings = []
        ids = []
        metadatas = []
        min_chunk_size = config.min_chunk_size if config is not None else 0
        logging.info(f"[INFO] Skipping chunks smaller than {min_chunk_size} characters")
        data_result = loader.load_data(src)
        data_records = data_result["data"]
        doc_id = data_result["doc_id"]
        doc_id = f"{app_id}--{doc_id}" if app_id is not None else doc_id
        for data in data_records:
            content = data["content"]
            # BUG FIX: min_chunk_size was logged above but never enforced.
            # Guarded so the default of 0 keeps every record, exactly as before.
            if min_chunk_size and len(content) < min_chunk_size:
                continue
            meta_data = data["meta_data"]
            # add data type to meta data to allow query using data type
            meta_data["data_type"] = self.data_type.value
            meta_data["doc_id"] = doc_id
            # one chunk per image, keyed by a hash of the image URL
            chunk_id = hashlib.sha256(meta_data["url"].encode()).hexdigest()
            ids.append(chunk_id)
            documents.append(content)
            embeddings.append(data["embedding"])
            metadatas.append(meta_data)

        return {
            "documents": documents,
            "embeddings": embeddings,
            "ids": ids,
            "metadatas": metadatas,
            "doc_id": doc_id,
        }

    def get_word_count(self, documents):
        """
        The number of chunks and the corresponding word count for an image is fixed to 1, as 1 embedding is created for
        each image.
        """
        return 1
|
||||
Reference in New Issue
Block a user