diff --git a/adalflow/adalflow/components/model_client/__init__.py b/adalflow/adalflow/components/model_client/__init__.py index 258285ec..ae508ece 100644 --- a/adalflow/adalflow/components/model_client/__init__.py +++ b/adalflow/adalflow/components/model_client/__init__.py @@ -64,10 +64,6 @@ "adalflow.components.model_client.openai_client.get_probabilities", OptionalPackages.OPENAI, ) -OpenAIMultimodalClient = LazyImport( - "adalflow.components.model_client.openai_multimodal_client.OpenAIMultimodalClient", - OptionalPackages.OPENAI, -) __all__ = [ "CohereAPIClient", @@ -80,7 +76,6 @@ "GroqAPIClient", "OpenAIClient", "GoogleGenAIClient", - "OpenAIMultimodalClient", ] for name in __all__: diff --git a/adalflow/adalflow/components/model_client/openai_client.py b/adalflow/adalflow/components/model_client/openai_client.py index 809fd3e0..7b08f887 100644 --- a/adalflow/adalflow/components/model_client/openai_client.py +++ b/adalflow/adalflow/components/model_client/openai_client.py @@ -1,6 +1,7 @@ """OpenAI ModelClient integration.""" import os +import base64 from typing import ( Dict, Sequence, @@ -51,6 +52,14 @@ log = logging.getLogger(__name__) T = TypeVar("T") +# Models that support multimodal inputs +MULTIMODAL_MODELS = { + "gpt-4o", # Versatile, high-intelligence flagship model + "gpt-4o-mini", # Fast, affordable small model for focused tasks + "o1", # Reasoning model that excels at complex, multi-step tasks + "o1-mini", # Smaller reasoning model for complex tasks +} + # completion parsing functions and you can combine them into one singple chat completion parser def get_first_message_content(completion: ChatCompletion) -> str: @@ -332,6 +341,102 @@ def to_dict(self) -> Dict[str, Any]: output = super().to_dict(exclude=exclude) return output + def _encode_image(self, image_path: str) -> str: + """Encode image to base64 string. + + Args: + image_path: Path to image file. + + Returns: + Base64 encoded image string. 
+ """ + with open(image_path, "rb") as image_file: + return base64.b64encode(image_file.read()).decode("utf-8") + + def _prepare_image_content( + self, image_source: Union[str, Dict[str, Any]], detail: str = "auto" + ) -> Dict[str, Any]: + """Prepare image content for API request. + + Args: + image_source: Either a path to local image or a URL. + detail: Image detail level ('auto', 'low', or 'high'). + + Returns: + Formatted image content for API request. + """ + if isinstance(image_source, str): + if image_source.startswith(("http://", "https://")): + return { + "type": "image_url", + "image_url": {"url": image_source, "detail": detail}, + } + else: + base64_image = self._encode_image(image_source) + return { + "type": "image_url", + "image_url": { + "url": f"data:image/jpeg;base64,{base64_image}", + "detail": detail, + }, + } + return image_source + + def generate( + self, + prompt: str, + images: Optional[ + Union[str, List[str], Dict[str, Any], List[Dict[str, Any]]] + ] = None, + model_kwargs: Optional[Dict[str, Any]] = None, + ) -> GeneratorOutput: + """Generate text response for given prompt and optionally images. + + Args: + prompt: Text prompt. + images: Optional image source(s) - can be path(s), URL(s), or formatted dict(s). + model_kwargs: Additional model parameters. + + Returns: + GeneratorOutput containing the model's response. + """ + model_kwargs = model_kwargs or {} + model = model_kwargs.get("model", "gpt-4o-mini") + max_tokens = model_kwargs.get("max_tokens", 300) + detail = model_kwargs.get("detail", "auto") + + # Check if model supports multimodal inputs when images are provided + if images and model not in MULTIMODAL_MODELS: + return GeneratorOutput( + error=f"Model {model} does not support multimodal inputs. 
Supported models: {MULTIMODAL_MODELS}" + ) + + # Prepare message content + if images: + content = [{"type": "text", "text": prompt}] + if not isinstance(images, list): + images = [images] + for img in images: + content.append(self._prepare_image_content(img, detail)) + messages = [{"role": "user", "content": content}] + else: + messages = [{"role": "user", "content": prompt}] + + try: + response = self.client.chat.completions.create( + model=model, + messages=messages, + max_tokens=max_tokens, + ) + return GeneratorOutput( + id=response.id, + data=response.choices[0].message.content, + usage=response.usage.model_dump() if response.usage else None, + raw_response=response.model_dump(), + ) + except Exception as e: + return GeneratorOutput(error=str(e)) + # if __name__ == "__main__": # from adalflow.core import Generator diff --git a/adalflow/adalflow/components/model_client/openai_multimodal_client.py b/adalflow/adalflow/components/model_client/openai_multimodal_client.py deleted file mode 100644 index 9cb8053c..00000000 --- a/adalflow/adalflow/components/model_client/openai_multimodal_client.py +++ /dev/null @@ -1,112 +0,0 @@ -"""OpenAI multimodal client for handling image and text inputs.""" - -import base64 -from typing import Any, Dict, List, Optional, Union -from adalflow.utils.lazy_import import safe_import, OptionalPackages - -openai = safe_import(OptionalPackages.OPENAI.value[0], OptionalPackages.OPENAI.value[1]) -from openai import OpenAI - -from adalflow.core.model_client import ModelClient -from adalflow.core.types import GeneratorOutput - - -class OpenAIMultimodalClient(ModelClient): - """OpenAI client for multimodal models.""" - - def __init__(self, api_key: Optional[str] = None): - """Initialize the OpenAI multimodal client. - - Args: - api_key: OpenAI API key. If None, will try to get from environment variable. 
- """ - super().__init__() - self.client = OpenAI(api_key=api_key) - - def _encode_image(self, image_path: str) -> str: - """Encode image to base64 string. - - Args: - image_path: Path to image file. - - Returns: - Base64 encoded image string. - """ - with open(image_path, "rb") as image_file: - return base64.b64encode(image_file.read()).decode("utf-8") - - def _prepare_image_content( - self, image_source: Union[str, Dict[str, Any]], detail: str = "auto" - ) -> Dict[str, Any]: - """Prepare image content for API request. - - Args: - image_source: Either a path to local image or a URL. - detail: Image detail level ('auto', 'low', or 'high'). - - Returns: - Formatted image content for API request. - """ - if isinstance(image_source, str): - if image_source.startswith(("http://", "https://")): - return { - "type": "image_url", - "image_url": {"url": image_source, "detail": detail}, - } - else: - base64_image = self._encode_image(image_source) - return { - "type": "image_url", - "image_url": { - "url": f"data:image/jpeg;base64,{base64_image}", - "detail": detail, - }, - } - return image_source - - def generate( - self, - prompt: str, - images: Optional[ - Union[str, List[str], Dict[str, Any], List[Dict[str, Any]]] - ] = None, - model_kwargs: Optional[Dict[str, Any]] = None, - ) -> GeneratorOutput: - """Generate text response for given prompt and images. - - Args: - prompt: Text prompt. - images: Image source(s) - can be path(s), URL(s), or formatted dict(s). - model_kwargs: Additional model parameters. - - Returns: - GeneratorOutput containing the model's response. 
- """ - model_kwargs = model_kwargs or {} - model = model_kwargs.get("model", "gpt-4o-mini") - max_tokens = model_kwargs.get("max_tokens", 300) - detail = model_kwargs.get("detail", "auto") - - # Prepare message content - content = [{"type": "text", "text": prompt}] - - if images: - if not isinstance(images, list): - images = [images] - for img in images: - content.append(self._prepare_image_content(img, detail)) - - try: - response = self.client.chat.completions.create( - model=model, - messages=[{"role": "user", "content": content}], - max_tokens=max_tokens, - ) - return GeneratorOutput( - id=response.id, - data=response.choices[0].message.content, - usage=response.usage.model_dump() if response.usage else None, - raw_response=response.model_dump(), - ) - except Exception as e: - return GeneratorOutput(error=str(e)) diff --git a/docs/source/tutorials/multimodal.rst b/docs/source/tutorials/multimodal.rst index 2cbb08ab..6c32c60d 100644 --- a/docs/source/tutorials/multimodal.rst +++ b/docs/source/tutorials/multimodal.rst @@ -18,15 +18,22 @@ Multimodal Generation What you will learn? ------------------ -1. How to use the OpenAI multimodal client for image understanding +1. How to use OpenAI's multimodal capabilities in AdalFlow 2. Different ways to input images (local files, URLs) 3. Controlling image detail levels 4. Working with multiple images -The OpenAIMultimodalClient ------------------------- +Multimodal Support in OpenAIClient +---------------------------------- +The :class:`OpenAIClient` supports both text and image inputs.
For multimodal generation, you can use the following models: + +- ``gpt-4o``: Versatile, high-intelligence flagship model +- ``gpt-4o-mini``: Fast, affordable small model for focused tasks (default) +- ``o1``: Reasoning model that excels at complex, multi-step tasks +- ``o1-mini``: Smaller reasoning model for complex tasks + +The client supports: - Local image files (automatically encoded to base64) - Image URLs @@ -42,16 +49,17 @@ First, install AdalFlow with OpenAI support: pip install "adalflow[openai]" -Then you can use the client with the Generator: +Then you can use the client with the Generator. By default, it uses ``gpt-4o-mini``, but you can specify any supported model: .. code-block:: python - from adalflow import Generator, OpenAIMultimodalClient + from adalflow import Generator, OpenAIClient + # Using the default gpt-4o-mini model generator = Generator( - model_client=OpenAIMultimodalClient(), + model_client=OpenAIClient(), model_kwargs={ - "model": "gpt-4o-mini", + "model": "gpt-4o-mini", # or "gpt-4o", "o1", "o1-mini" "max_tokens": 300 } ) @@ -62,6 +70,15 @@ Then you can use the client with the Generator: images="https://example.com/image.jpg" ) + # Using the flagship model for more complex tasks + generator_flagship = Generator( + model_client=OpenAIClient(), + model_kwargs={ + "model": "gpt-4o", + "max_tokens": 300 + } + ) + Image Detail Levels ----------------- @@ -74,7 +91,7 @@ The client supports three detail levels: .. code-block:: python generator = Generator( - model_client=OpenAIMultimodalClient(), + model_client=OpenAIClient(), model_kwargs={ "model": "gpt-4o-mini", "detail": "high" # or "low" or "auto" @@ -111,6 +128,7 @@ The client handles: 2. API Integration: - Proper message formatting for OpenAI's vision models - Error handling and response parsing + - Model compatibility checking - Usage tracking 3. 
Output Format: @@ -121,23 +139,33 @@ The client handles: Limitations --------- -Be aware of these limitations when using the multimodal client: +Be aware of these limitations when using multimodal features: + +1. Model Support and Capabilities: + - Four models available with different strengths: + - ``gpt-4o``: Best for complex visual analysis and detailed understanding + - ``gpt-4o-mini``: Good balance of speed and accuracy for common tasks + - ``o1``: Excels at multi-step reasoning with visual inputs + - ``o1-mini``: Efficient for focused visual reasoning tasks + - The client will return an error if using an unsupported model with images -1. Image Size: +2. Image Size and Format: - Maximum file size: 20MB per image - Supported formats: PNG, JPEG, WEBP, non-animated GIF -2. Model Capabilities: - - Best for general visual understanding +3. Common Limitations: - May struggle with: - - Small text - - Precise spatial relationships - - Complex graphs - - Non-Latin text + - Very small or blurry text + - Complex spatial relationships + - Detailed technical diagrams + - Non-Latin text or symbols -3. Cost Considerations: - - Image inputs are metered in tokens +4. Cost and Performance Considerations: + - Image inputs increase token usage - High detail mode uses more tokens - - Consider using low detail mode for cost efficiency + - Consider using: + - ``gpt-4o-mini`` for routine tasks + - ``o1-mini`` for basic reasoning tasks + - ``gpt-4o`` or ``o1`` for complex analysis -For more details, see the :class:`OpenAIMultimodalClient` API reference. +For more details, see the :class:`OpenAIClient` API reference. diff --git a/notebooks/tutorials/adalflow_multimodal.ipynb b/notebooks/tutorials/adalflow_multimodal.ipynb deleted file mode 100644 index e69de29b..00000000