Commit

Change multimodal to one client

fm1320 committed Jan 6, 2025
1 parent 73089ff commit c6c4663
Showing 5 changed files with 154 additions and 138 deletions.
5 changes: 0 additions & 5 deletions adalflow/adalflow/components/model_client/__init__.py
@@ -64,10 +64,6 @@
"adalflow.components.model_client.openai_client.get_probabilities",
OptionalPackages.OPENAI,
)
OpenAIMultimodalClient = LazyImport(
"adalflow.components.model_client.openai_multimodal_client.OpenAIMultimodalClient",
OptionalPackages.OPENAI,
)

__all__ = [
"CohereAPIClient",
@@ -80,7 +76,6 @@
"GroqAPIClient",
"OpenAIClient",
"GoogleGenAIClient",
"OpenAIMultimodalClient",
]

for name in __all__:
105 changes: 105 additions & 0 deletions adalflow/adalflow/components/model_client/openai_client.py
@@ -1,6 +1,7 @@
"""OpenAI ModelClient integration."""

import os
import base64
from typing import (
Dict,
Sequence,
@@ -51,6 +52,14 @@
log = logging.getLogger(__name__)
T = TypeVar("T")

# Models that support multimodal inputs
MULTIMODAL_MODELS = {
"gpt-4o", # Versatile, high-intelligence flagship model
"gpt-4o-mini", # Fast, affordable small model for focused tasks
"o1", # Reasoning model that excels at complex, multi-step tasks
"o1-mini", # Smaller reasoning model for complex tasks
}


# completion parsing functions and you can combine them into one single chat completion parser
def get_first_message_content(completion: ChatCompletion) -> str:
@@ -332,6 +341,102 @@ def to_dict(self) -> Dict[str, Any]:
output = super().to_dict(exclude=exclude)
return output

def _encode_image(self, image_path: str) -> str:
"""Encode image to base64 string.
Args:
image_path: Path to image file.
Returns:
Base64 encoded image string.
"""
with open(image_path, "rb") as image_file:
return base64.b64encode(image_file.read()).decode("utf-8")

def _prepare_image_content(
self, image_source: Union[str, Dict[str, Any]], detail: str = "auto"
) -> Dict[str, Any]:
"""Prepare image content for API request.
Args:
image_source: Either a path to local image or a URL.
detail: Image detail level ('auto', 'low', or 'high').
Returns:
Formatted image content for API request.
"""
if isinstance(image_source, str):
if image_source.startswith(("http://", "https://")):
return {
"type": "image_url",
"image_url": {"url": image_source, "detail": detail},
}
else:
base64_image = self._encode_image(image_source)
return {
"type": "image_url",
"image_url": {
"url": f"data:image/jpeg;base64,{base64_image}",
"detail": detail,
},
}
return image_source

def generate(
self,
prompt: str,
images: Optional[
Union[str, List[str], Dict[str, Any], List[Dict[str, Any]]]
] = None,
model_kwargs: Optional[Dict[str, Any]] = None,
) -> GeneratorOutput:
"""Generate text response for given prompt and optionally images.
Args:
prompt: Text prompt.
images: Optional image source(s) - can be path(s), URL(s), or formatted dict(s).
model_kwargs: Additional model parameters.
Returns:
GeneratorOutput containing the model's response.
"""
model_kwargs = model_kwargs or {}
model = model_kwargs.get("model", "gpt-4o-mini")
max_tokens = model_kwargs.get("max_tokens", 300)
detail = model_kwargs.get("detail", "auto")

# Check if model supports multimodal inputs when images are provided
if images and model not in MULTIMODAL_MODELS:
return GeneratorOutput(
error=f"Model {model} does not support multimodal inputs. Supported models: {MULTIMODAL_MODELS}"
)

# Prepare message content
if images:
content = [{"type": "text", "text": prompt}]
if not isinstance(images, list):
images = [images]
for img in images:
content.append(self._prepare_image_content(img, detail))
messages = [{"role": "user", "content": content}]
else:
messages = [{"role": "user", "content": prompt}]

try:
response = self.client.chat.completions.create(
model=model,
messages=messages,
max_tokens=max_tokens,
)
return GeneratorOutput(
id=response.id,
data=response.choices[0].message.content,
usage=response.usage.model_dump() if response.usage else None,
raw_response=response.model_dump(),
)
except Exception as e:
return GeneratorOutput(error=str(e))


# if __name__ == "__main__":
# from adalflow.core import Generator
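For context, a minimal sketch of how the new generate() method added above might be exercised directly on the client (illustrative only, not part of the commit; assumes OPENAI_API_KEY is set, and "photo.jpg" is a hypothetical local file):

    from adalflow.components.model_client.openai_client import OpenAIClient

    # Sketch under the assumptions above: a local file (encoded to a base64
    # data URL by _prepare_image_content) is mixed with a plain image URL.
    client = OpenAIClient()
    output = client.generate(
        prompt="Describe what you see in these images.",
        images=[
            "photo.jpg",  # local path -> base64 data URL
            "https://example.com/image.jpg",  # URL, passed through unchanged
        ],
        model_kwargs={"model": "gpt-4o", "max_tokens": 300, "detail": "low"},
    )
    if output.error:
        print("Generation failed:", output.error)
    else:
        print(output.data)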
112 changes: 0 additions & 112 deletions adalflow/adalflow/components/model_client/openai_multimodal_client.py

This file was deleted.

70 changes: 49 additions & 21 deletions docs/source/tutorials/multimodal.rst
@@ -18,15 +18,22 @@ Multimodal Generation
What you will learn?
--------------------

1. How to use the OpenAI multimodal client for image understanding
1. How to use OpenAI's multimodal capabilities in AdalFlow
2. Different ways to input images (local files, URLs)
3. Controlling image detail levels
4. Working with multiple images

The OpenAIMultimodalClient
------------------------
Multimodal Support in OpenAIClient
----------------------------------

The :class:`OpenAIMultimodalClient` extends AdalFlow's model client capabilities to handle images along with text. It supports:
The :class:`OpenAIClient` supports both text and image inputs. For multimodal generation, you can use the following models:

- ``gpt-4o``: Versatile, high-intelligence flagship model
- ``gpt-4o-mini``: Fast, affordable small model for focused tasks (default)
- ``o1``: Reasoning model that excels at complex, multi-step tasks
- ``o1-mini``: Smaller reasoning model for complex tasks

The client supports:

- Local image files (automatically encoded to base64)
- Image URLs
@@ -42,16 +49,17 @@ First, install AdalFlow with OpenAI support:
pip install "adalflow[openai]"
Then you can use the client with the Generator:
Then you can use the client with the Generator. By default, it uses ``gpt-4o-mini``, but you can specify any supported model:

.. code-block:: python
from adalflow import Generator, OpenAIMultimodalClient
from adalflow import Generator, OpenAIClient
# Using the default gpt-4o-mini model
generator = Generator(
model_client=OpenAIMultimodalClient(),
model_client=OpenAIClient(),
model_kwargs={
"model": "gpt-4o-mini",
"model": "gpt-4o-mini", # or "gpt-4o", "o1", "o1-mini"
"max_tokens": 300
}
)
@@ -62,6 +70,15 @@
images="https://example.com/image.jpg"
)
# Using the flagship model for more complex tasks
generator_flagship = Generator(
model_client=OpenAIClient(),
model_kwargs={
"model": "gpt-4o",
"max_tokens": 300
}
)
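As a further sketch (assuming a hypothetical local file ``photo.jpg``; the call shape follows the URL example above), a local file and a URL can be passed together as a list:

.. code-block:: python

    # Local files are encoded to base64 automatically; URLs pass through.
    response = generator(
        prompt_kwargs={"input_str": "Compare these two images."},
        images=["photo.jpg", "https://example.com/image.jpg"],
    )
    print(response.data)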
Image Detail Levels
-------------------

Expand All @@ -74,7 +91,7 @@ The client supports three detail levels:
.. code-block:: python
generator = Generator(
model_client=OpenAIMultimodalClient(),
model_client=OpenAIClient(),
model_kwargs={
"model": "gpt-4o-mini",
"detail": "high" # or "low" or "auto"
@@ -111,6 +128,7 @@ The client handles:
2. API Integration:
- Proper message formatting for OpenAI's vision models
- Error handling and response parsing
- Model compatibility checking
- Usage tracking

3. Output Format:
@@ -121,23 +139,33 @@
Limitations
-----------

Be aware of these limitations when using the multimodal client:
Be aware of these limitations when using multimodal features:

1. Model Support and Capabilities:
- Four models available with different strengths:
- ``gpt-4o``: Best for complex visual analysis and detailed understanding
- ``gpt-4o-mini``: Good balance of speed and accuracy for common tasks
- ``o1``: Excels at multi-step reasoning with visual inputs
- ``o1-mini``: Efficient for focused visual reasoning tasks
- The client returns an error if an unsupported model is used with images (see the sketch after this list)

1. Image Size:
2. Image Size and Format:
- Maximum file size: 20MB per image
- Supported formats: PNG, JPEG, WEBP, non-animated GIF

2. Model Capabilities:
- Best for general visual understanding
3. Common Limitations:
- May struggle with:
- Small text
- Precise spatial relationships
- Complex graphs
- Non-Latin text
- Very small or blurry text
- Complex spatial relationships
- Detailed technical diagrams
- Non-Latin text or symbols

3. Cost Considerations:
- Image inputs are metered in tokens
4. Cost and Performance Considerations:
- Image inputs increase token usage
- High detail mode uses more tokens
- Consider using low detail mode for cost efficiency
- Consider using:
- ``gpt-4o-mini`` for routine tasks
- ``o1-mini`` for basic reasoning tasks
- ``gpt-4o`` or ``o1`` for complex analysis
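
A minimal sketch of the compatibility check from item 1, calling the client directly (``gpt-3.5-turbo`` stands in for any non-multimodal model; assumes ``OPENAI_API_KEY`` is set):

.. code-block:: python

    # Passing images to a model outside the supported set returns a
    # GeneratorOutput with its error field set, instead of calling the API.
    out = OpenAIClient().generate(
        prompt="Describe this image.",
        images="photo.jpg",
        model_kwargs={"model": "gpt-3.5-turbo"},
    )
    assert out.error is not None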

For more details, see the :class:`OpenAIMultimodalClient` API reference.
For more details, see the :class:`OpenAIClient` API reference.
Empty file.
