diff --git a/docs/open-source/multimodal-support.mdx b/docs/open-source/multimodal-support.mdx
index 7cff75df..fcf91049 100644
--- a/docs/open-source/multimodal-support.mdx
+++ b/docs/open-source/multimodal-support.mdx
@@ -10,11 +10,25 @@ Mem0 extends its capabilities beyond text by supporting multimodal data, includi
When a user provides an image, Mem0 processes the image to extract textual information and relevant details, which are then added to the user's memory. This feature enhances the system's ability to understand and remember details based on visual inputs.
+
+To enable multimodal support, set `enable_vision` to `True` in your LLM configuration. The `vision_details` parameter controls the level of detail used when processing images and can be set to "auto" (default), "low", or "high".
+
+
```python Code
from mem0 import Memory
-client = Memory()
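+# Enable vision on the LLM; "high" requests more detailed image analysis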
+config = {
+ "llm": {
+ "provider": "openai",
+ "config": {
+ "enable_vision": True,
+ "vision_details": "high"
+ }
+ }
+}
+
+client = Memory.from_config(config=config)
messages = [
{
@@ -182,11 +196,72 @@ await client.add([imageMessage], { userId: "alice" })
```
-By utilizing these methods, you can effectively incorporate images into user interactions, enhancing the multimodal capabilities of your Mem0 instance.
+## 3. OpenAI-Compatible Message Format
-
-Currently, we support only OpenAI models for image description.
-
+You can also use the OpenAI-compatible format to combine text and images in a single message:
+
+
+```python Python
+import base64
+
+# Path to the image file
+image_path = "path/to/your/image.jpg"
+
+# Encode the image in Base64
+with open(image_path, "rb") as image_file:
+ base64_image = base64.b64encode(image_file.read()).decode("utf-8")
+
+# Create the message using OpenAI-compatible format
+message = {
+ "role": "user",
+ "content": [
+ {
+ "type": "text",
+ "text": "What is in this image?",
+ },
+ {
+ "type": "image_url",
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
+ },
+ ],
+}
+
+# Add the message to memory
+client.add([message], user_id="alice")
+```
+
+```typescript TypeScript
+import fs from "fs";
+import { Memory, Message } from "mem0ai/oss";
+
+const client = new Memory();
+
+const imagePath = "path/to/your/image.jpg";
+
+const base64Image = fs.readFileSync(imagePath, { encoding: 'base64' });
+
+const message: Message = {
+ role: "user",
+ content: [
+ {
+ type: "text",
+ text: "What is in this image?",
+ },
+ {
+ type: "image_url",
+ image_url: {
+ url: `data:image/jpeg;base64,${base64Image}`
+ }
+ },
+ ],
+}
+
+await client.add([message], { userId: "alice" })
+```
+
+
+Combining text and images in a single message lets you provide context alongside the visual content in a single `add` call.
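+
+Once the message is added, the image is converted to a text description by your configured LLM before being stored. As a minimal sketch (reusing the `client` and user ID from the examples above), you can then retrieve it like any other memory:
+
+```python Code
+# Retrieve memories created from the image message. The exact return shape
+# (a list, or a dict with a "results" key) depends on your mem0 version.
+related = client.search("What is in the image?", user_id="alice")
+print(related)
+```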
+
+By utilizing these methods, you can effectively incorporate images into user interactions, enhancing the multimodal capabilities of your Mem0 instance.
If you have any questions, please feel free to reach out to us using one of the following methods:
diff --git a/mem0/configs/llms/base.py b/mem0/configs/llms/base.py
index a681db33..f55e9e9a 100644
--- a/mem0/configs/llms/base.py
+++ b/mem0/configs/llms/base.py
@@ -19,6 +19,8 @@ class BaseLlmConfig(ABC):
max_tokens: int = 2000,
top_p: float = 0.1,
top_k: int = 1,
+ enable_vision: bool = False,
+ vision_details: Optional[str] = "auto",
# Openrouter specific
models: Optional[list[str]] = None,
route: Optional[str] = "fallback",
@@ -55,6 +57,10 @@ class BaseLlmConfig(ABC):
:type top_p: float, optional
:param top_k: Controls the diversity of words. Higher values make word selection more diverse, defaults to 0
:type top_k: int, optional
+        :param enable_vision: Whether to enable image (vision) processing for the LLM, defaults to False
+        :type enable_vision: bool, optional
+        :param vision_details: Level of image detail to request from the vision model [low, high, auto], defaults to "auto"
+        :type vision_details: Optional[str], optional
:param models: Openrouter models to use, defaults to None
:type models: Optional[list[str]], optional
:param route: Openrouter route to be used, defaults to "fallback"
@@ -85,6 +91,8 @@ class BaseLlmConfig(ABC):
self.max_tokens = max_tokens
self.top_p = top_p
self.top_k = top_k
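+        # Vision settings: when enable_vision is True, image content in messages is
+        # described by the LLM; vision_details ("low", "high", or "auto") controls the
+        # requested level of image detail.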
+ self.enable_vision = enable_vision
+ self.vision_details = vision_details
# AzureOpenAI specific
self.http_client = httpx.Client(proxies=http_client_proxies) if http_client_proxies else None
diff --git a/mem0/memory/main.py b/mem0/memory/main.py
index 28bef030..1772b418 100644
--- a/mem0/memory/main.py
+++ b/mem0/memory/main.py
@@ -115,7 +115,10 @@ class Memory(MemoryBase):
if isinstance(messages, str):
messages = [{"role": "user", "content": messages}]
- messages = parse_vision_messages(messages)
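+        # When vision is enabled, hand the configured LLM and detail level to the
+        # parser so image content can be described before it is stored.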
+ if self.config.llm.config.get("enable_vision"):
+ messages = parse_vision_messages(messages, self.llm, self.config.llm.config.get("vision_details"))
+ else:
+ messages = parse_vision_messages(messages)
with concurrent.futures.ThreadPoolExecutor() as executor:
future1 = executor.submit(self._add_to_vector_store, messages, metadata, filters)
diff --git a/mem0/memory/utils.py b/mem0/memory/utils.py
index 4847f306..ad14bf46 100644
--- a/mem0/memory/utils.py
+++ b/mem0/memory/utils.py
@@ -1,6 +1,6 @@
import re
+
from mem0.configs.prompts import FACT_RETRIEVAL_PROMPT
-from mem0.llms.openai import OpenAILLM
def get_fact_retrieval_messages(message):
@@ -45,13 +45,13 @@ def remove_code_blocks(content: str) -> str:
return match.group(1).strip() if match else content.strip()
-def get_image_description(image_url):
+def get_image_description(image_obj, llm, vision_details):
"""
Get the description of the image
"""
- llm = OpenAILLM()
- response = llm.generate_response(
- messages=[
+
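+    # A plain string is treated as an image URL (or base64 data URI) and wrapped in a
+    # vision prompt; anything else is assumed to be an already-formed multimodal message.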
+ if isinstance(image_obj, str):
+ messages = [
{
"role": "user",
"content": [
@@ -59,31 +59,42 @@ def get_image_description(image_url):
"type": "text",
"text": "A user is providing an image. Provide a high level description of the image and do not include any additional text.",
},
- {"type": "image_url", "image_url": {"url": image_url}},
+ {"type": "image_url", "image_url": {"url": image_obj, "detail": vision_details}},
],
},
]
- )
+ else:
+ messages = [image_obj]
+
+ response = llm.generate_response(messages=messages)
return response
-def parse_vision_messages(messages):
+def parse_vision_messages(messages, llm=None, vision_details="auto"):
"""
Parse the vision messages from the messages
"""
returned_messages = []
for msg in messages:
- if msg["role"] != "system":
- if not isinstance(msg["content"], str) and msg["content"]["type"] == "image_url":
- image_url = msg["content"]["image_url"]["url"]
- try:
- description = get_image_description(image_url)
- msg["content"]["text"] = description
- returned_messages.append({"role": msg["role"], "content": description})
- except Exception:
- raise Exception(f"Error while downloading {image_url}.")
- else:
- returned_messages.append(msg)
- else:
+ if msg["role"] == "system":
returned_messages.append(msg)
+ continue
+
+ # Handle message content
+ if isinstance(msg["content"], list):
+            # OpenAI-compatible content list (text and/or image parts)
+ description = get_image_description(msg, llm, vision_details)
+ returned_messages.append({"role": msg["role"], "content": description})
+ elif isinstance(msg["content"], dict) and msg["content"].get("type") == "image_url":
+ # Single image content
+ image_url = msg["content"]["image_url"]["url"]
+ try:
+ description = get_image_description(image_url, llm, vision_details)
+ returned_messages.append({"role": msg["role"], "content": description})
+ except Exception:
+ raise Exception(f"Error while downloading {image_url}.")
+ else:
+ # Regular text content
+ returned_messages.append(msg)
+
return returned_messages