Added Clip dependency (#778)

2023-10-10 00:32:45 +05:30
parent bc649b9a85
commit 19a9141c2d
4 changed files with 47 additions and 70 deletions
--- a/embedchain/loaders/images.py
+++ b/embedchain/loaders/images.py
@@ -16,15 +16,15 @@ class ImagesLoader(BaseLoader):
        # load model and image preprocessing
        from embedchain.models.clip_processor import ClipProcessor

-        model, preprocess = ClipProcessor.load_model()
+        model = ClipProcessor.load_model()
        if os.path.isfile(image_url):
-            data = [ClipProcessor.get_image_features(image_url, model, preprocess)]
+            data = [ClipProcessor.get_image_features(image_url, model)]
        else:
            data = []
            for filename in os.listdir(image_url):
                filepath = os.path.join(image_url, filename)
                try:
-                    data.append(ClipProcessor.get_image_features(filepath, model, preprocess))
+                    data.append(ClipProcessor.get_image_features(filepath, model))
                except Exception as e:
                    # Log the file that was not loaded
                    logging.exception("Failed to load the file {}. Exception {}".format(filepath, e))
--- a/embedchain/models/clip_processor.py
+++ b/embedchain/models/clip_processor.py
@@ -1,31 +1,27 @@
 try:
-    import clip
-    import torch
    from PIL import Image, UnidentifiedImageError
+    from sentence_transformers import SentenceTransformer
 except ImportError:
    raise ImportError(
-        "Images requires extra dependencies. Install with `pip install 'embedchain[images]' git+https://github.com/openai/CLIP.git#a1d0717`"  # noqa: E501
+        "Images requires extra dependencies. Install with `pip install 'embedchain[images]'`"
    ) from None

-MODEL_NAME = "ViT-B/32"
+MODEL_NAME = "clip-ViT-B-32"


 class ClipProcessor:
    @staticmethod
    def load_model():
        """Load data from a director of images."""
-        device = "cuda" if torch.cuda.is_available() else "cpu"
-
        # load model and image preprocessing
-        model, preprocess = clip.load(MODEL_NAME, device=device, jit=False)
-        return model, preprocess
+        model = SentenceTransformer(MODEL_NAME)
+        return model

    @staticmethod
-    def get_image_features(image_url, model, preprocess):
+    def get_image_features(image_url, model):
        """
        Applies the CLIP model to evaluate the vector representation of the supplied image
        """
-        device = "cuda" if torch.cuda.is_available() else "cpu"
        try:
            # load image
            image = Image.open(image_url)
@@ -34,27 +30,15 @@ class ClipProcessor:
        except UnidentifiedImageError:
            raise UnidentifiedImageError("The supplied file is not an image`")

-        # pre-process image
-        processed_image = preprocess(image).unsqueeze(0).to(device)
-        with torch.no_grad():
-            image_features = model.encode_image(processed_image)
-            image_features /= image_features.norm(dim=-1, keepdim=True)
-
-        image_features = image_features.cpu().detach().numpy().tolist()[0]
+        image_features = model.encode(image)
        meta_data = {"url": image_url}
-        return {"content": image_url, "embedding": image_features, "meta_data": meta_data}
+        return {"content": image_url, "embedding": image_features.tolist(), "meta_data": meta_data}

    @staticmethod
    def get_text_features(query):
        """
        Applies the CLIP model to evaluate the vector representation of the supplied text
        """
-        device = "cuda" if torch.cuda.is_available() else "cpu"
-
-        model, preprocess = ClipProcessor.load_model()
-        text = clip.tokenize(query).to(device)
-        with torch.no_grad():
-            text_features = model.encode_text(text)
-            text_features /= text_features.norm(dim=-1, keepdim=True)
-
-        return text_features.cpu().numpy().tolist()[0]
+        model = ClipProcessor.load_model()
+        text_features = model.encode(query)
+        return text_features.tolist()
--- a/embedchain/utils.py
+++ b/embedchain/utils.py
@@ -128,7 +128,8 @@ def detect_datatype(source: Any) -> DataType:
    formatted_source = format_source(str(source), 30)

    if url:
-        from langchain.document_loaders.youtube import ALLOWED_NETLOCK as YOUTUBE_ALLOWED_NETLOCS
+        from langchain.document_loaders.youtube import \
+            ALLOWED_NETLOCK as YOUTUBE_ALLOWED_NETLOCS

        if url.netloc in YOUTUBE_ALLOWED_NETLOCS:
            logging.debug(f"Source of `{formatted_source}` detected as `youtube_video`.")