Merge pull request #17137 from BerriAI/litellm_gemini3_media_res_fix

Make sure that media resolution is only applied for Gemini 3 models
2025-12-06 11:33:26 +08:00 · 2025-12-06 00:06:55 +05:30
parent 85d73403f4 3907667892
commit e924b6978a
7 changed files with 138 additions and 66 deletions
--- a/litellm/llms/gemini/chat/transformation.py
+++ b/litellm/llms/gemini/chat/transformation.py
@@ -114,20 +114,27 @@ class GoogleAIStudioGeminiConfig(VertexGeminiConfig):
                        img_element = element
                        _image_url: Optional[str] = None
                        format: Optional[str] = None
+                        detail: Optional[str] = None
                        if isinstance(img_element.get("image_url"), dict):
                            _image_url = img_element["image_url"].get("url")  # type: ignore
                            format = img_element["image_url"].get("format")  # type: ignore
+                            detail = img_element["image_url"].get("detail")  # type: ignore
                        else:
                            _image_url = img_element.get("image_url")  # type: ignore
                        if _image_url and "https://" in _image_url:
                            image_obj = convert_to_anthropic_image_obj(
                                _image_url, format=format
                            )
-                            img_element["image_url"] = (  # type: ignore
-                                convert_generic_image_chunk_to_openai_image_obj(
-                                    image_obj
-                                )
+                            converted_image_url = convert_generic_image_chunk_to_openai_image_obj(
+                                image_obj
                            )
+                            if detail is not None:
+                                img_element["image_url"] = {  # type: ignore
+                                    "url": converted_image_url,
+                                    "detail": detail
+                                }
+                            else:
+                                img_element["image_url"] = converted_image_url  # type: ignore
                    elif element.get("type") == "file":
                        file_element = cast(ChatCompletionFileObject, element)
                        file_id = file_element["file"].get("file_id")
--- a/litellm/llms/vertex_ai/common_utils.py
+++ b/litellm/llms/vertex_ai/common_utils.py
@@ -280,18 +280,24 @@ def _get_gemini_url(
    stream: Optional[bool],
    gemini_api_key: Optional[str],
 ) -> Tuple[str, str]:
+    from litellm.llms.vertex_ai.gemini.vertex_and_google_ai_studio_gemini import (
+        VertexGeminiConfig,
+    )
+    
    _gemini_model_name = "models/{}".format(model)
+    api_version = "v1alpha" if VertexGeminiConfig._is_gemini_3_or_newer(model) else "v1beta"
+    
    if mode == "chat":
        endpoint = "generateContent"
        if stream is True:
            endpoint = "streamGenerateContent"
-            url = "https://generativelanguage.googleapis.com/v1beta/{}:{}?key={}&alt=sse".format(
-                _gemini_model_name, endpoint, gemini_api_key
+            url = "https://generativelanguage.googleapis.com/{}/{}:{}?key={}&alt=sse".format(
+                api_version, _gemini_model_name, endpoint, gemini_api_key
            )
        else:
            url = (
-                "https://generativelanguage.googleapis.com/v1beta/{}:{}?key={}".format(
-                    _gemini_model_name, endpoint, gemini_api_key
+                "https://generativelanguage.googleapis.com/{}/{}:{}?key={}".format(
+                    api_version, _gemini_model_name, endpoint, gemini_api_key
                )
            )
    elif mode == "embedding":
--- a/litellm/llms/vertex_ai/gemini/transformation.py
+++ b/litellm/llms/vertex_ai/gemini/transformation.py
@@ -5,7 +5,7 @@ Why separate file? Make it easy to see how transformation works
 """

 import os
-from typing import TYPE_CHECKING, List, Literal, Optional, Tuple, Union, cast
+from typing import TYPE_CHECKING, Dict, List, Literal, Optional, Tuple, Union, cast

 import httpx
 from pydantic import BaseModel
@@ -28,7 +28,6 @@ from litellm.types.files import (
    get_file_type_from_extension,
    is_gemini_1_5_accepted_file_type,
 )
-from litellm.types.utils import LlmProviders
 from litellm.types.llms.openai import (
    AllMessageValues,
    ChatCompletionAssistantMessage,
@@ -48,7 +47,7 @@ from litellm.types.llms.vertex_ai import (
    ToolConfig,
    Tools,
 )
-from litellm.types.utils import GenericImageParsingChunk
+from litellm.types.utils import GenericImageParsingChunk, LlmProviders

 from ..common_utils import (
    _check_text_in_content,
@@ -64,24 +63,21 @@ else:
    LiteLLMLoggingObj = Any


-def _map_openai_detail_to_media_resolution(
+def _convert_detail_to_media_resolution_enum(
    detail: Optional[str],
-) -> Optional[Literal["low", "medium", "high"]]:
-    """
-    Map OpenAI's "detail" parameter to Gemini's "media_resolution" parameter.
-    """
+) -> Optional[Dict[str, str]]:
    if detail == "low":
-        return "low"
+        return {"level": "MEDIA_RESOLUTION_LOW"}
    elif detail == "high":
-        return "high"
-    # "auto" or None means let the model decide, so we don't set media_resolution
+        return {"level": "MEDIA_RESOLUTION_HIGH"}
    return None


 def _process_gemini_image(
    image_url: str, 
    format: Optional[str] = None,
-    media_resolution: Optional[Literal["low", "medium", "high"]] = None,
+    media_resolution_enum: Optional[Dict[str, str]] = None,
+    model: Optional[str] = None,
 ) -> PartType:
    """
    Given an image URL, return the appropriate PartType for Gemini
@@ -105,31 +101,43 @@ def _process_gemini_image(
            else:
                mime_type = format
            file_data = FileDataType(mime_type=mime_type, file_uri=image_url)
-
-            return PartType(file_data=file_data)
+            part: PartType = {"file_data": file_data}
+            
+            if media_resolution_enum is not None and model is not None:
+                from .vertex_and_google_ai_studio_gemini import VertexGeminiConfig
+                if VertexGeminiConfig._is_gemini_3_or_newer(model):
+                    part_dict = dict(part)
+                    part_dict["media_resolution"] = media_resolution_enum
+                    return cast(PartType, part_dict)
+            return part
        elif (
            "https://" in image_url
            and (image_type := format or _get_image_mime_type_from_url(image_url))
            is not None
        ):
            file_data = FileDataType(file_uri=image_url, mime_type=image_type)
-            return PartType(file_data=file_data)
+            part: PartType = {"file_data": file_data}
+            
+            if media_resolution_enum is not None and model is not None:
+                from .vertex_and_google_ai_studio_gemini import VertexGeminiConfig
+                if VertexGeminiConfig._is_gemini_3_or_newer(model):
+                    part_dict = dict(part)
+                    part_dict["media_resolution"] = media_resolution_enum
+                    return cast(PartType, part_dict)
+            return part
        elif "http://" in image_url or "https://" in image_url or "base64" in image_url:
-            # https links for unsupported mime types and base64 images
            image = convert_to_anthropic_image_obj(image_url, format=format)
            _blob: BlobType = {"data": image["data"], "mime_type": image["media_type"]}
-            if media_resolution is not None:
-                _blob["media_resolution"] = media_resolution
            
-            # Convert snake_case keys to camelCase for JSON serialization
-            # The TypedDict uses snake_case, but the API expects camelCase
-            _blob_dict = dict(_blob)
-            if "media_resolution" in _blob_dict:
-                _blob_dict["mediaResolution"] = _blob_dict.pop("media_resolution")
-            if "mime_type" in _blob_dict:
-                _blob_dict["mimeType"] = _blob_dict.pop("mime_type")
+            part: PartType = {"inline_data": cast(BlobType, _blob)}
            
-            return PartType(inline_data=cast(BlobType, _blob_dict))
+            if media_resolution_enum is not None and model is not None:
+                from .vertex_and_google_ai_studio_gemini import VertexGeminiConfig
+                if VertexGeminiConfig._is_gemini_3_or_newer(model):
+                    part_dict = dict(part)
+                    part_dict["media_resolution"] = media_resolution_enum
+                    return cast(PartType, part_dict)
+            return part
        raise Exception("Invalid image received - {}".format(image_url))
    except Exception as e:
        raise e
@@ -235,18 +243,19 @@ def _gemini_convert_messages_with_history(  # noqa: PLR0915
                            element = cast(ChatCompletionImageObject, element)
                            img_element = element
                            format: Optional[str] = None
-                            media_resolution: Optional[Literal["low", "medium", "high"]] = None
+                            media_resolution_enum: Optional[Dict[str, str]] = None
                            if isinstance(img_element["image_url"], dict):
                                image_url = img_element["image_url"]["url"]
                                format = img_element["image_url"].get("format")
                                detail = img_element["image_url"].get("detail")
-                                media_resolution = _map_openai_detail_to_media_resolution(detail)
+                                media_resolution_enum = _convert_detail_to_media_resolution_enum(detail)
                            else:
                                image_url = img_element["image_url"]
                            _part = _process_gemini_image(
                                image_url=image_url, 
                                format=format,
-                                media_resolution=media_resolution,
+                                media_resolution_enum=media_resolution_enum,
+                                model=model,
                            )
                            _parts.append(_part)
                        elif element["type"] == "input_audio":
@@ -271,6 +280,7 @@ def _gemini_convert_messages_with_history(  # noqa: PLR0915
                                _part = _process_gemini_image(
                                    image_url=openai_image_str,
                                    format=audio_format_modified,
+                                    model=model,
                                )
                                _parts.append(_part)
                        elif element["type"] == "file":
@@ -287,6 +297,7 @@ def _gemini_convert_messages_with_history(  # noqa: PLR0915
                                _part = _process_gemini_image(
                                    image_url=passed_file, 
                                    format=format,
+                                    model=model,
                                )
                                _parts.append(_part)
                            except Exception:
--- a/litellm/types/llms/vertex_ai.py
+++ b/litellm/types/llms/vertex_ai.py
@@ -32,7 +32,6 @@ class FileDataType(TypedDict):
 class BlobType(TypedDict, total=False):
    mime_type: Required[str]
    data: Required[str]
-    media_resolution: Literal["low", "medium", "high"]


 class PartType(TypedDict, total=False):
@@ -43,6 +42,7 @@ class PartType(TypedDict, total=False):
    function_response: FunctionResponse
    thought: bool
    thoughtSignature: str
+    media_resolution: Literal["low", "medium", "high"]


 class HttpxFunctionCall(TypedDict):
@@ -63,7 +63,6 @@ class HttpxCodeExecutionResult(TypedDict):
 class HttpxBlobType(TypedDict, total=False):
    mimeType: str
    data: str
-    mediaResolution: Literal["low", "medium", "high"]


 class HttpxPartType(TypedDict, total=False):
@@ -76,6 +75,7 @@ class HttpxPartType(TypedDict, total=False):
    codeExecutionResult: HttpxCodeExecutionResult
    thought: bool
    thoughtSignature: str
+    mediaResolution: Literal["low", "medium", "high"]


 class HttpxContentType(TypedDict, total=False):
--- a/tests/llm_translation/test_prompt_factory.py
+++ b/tests/llm_translation/test_prompt_factory.py
@@ -567,7 +567,7 @@ def test_vertex_only_image_user_message():
        },
    ]

-    response = _gemini_convert_messages_with_history(messages=messages)
+    response = _gemini_convert_messages_with_history(messages=messages, model="gemini-1.5-pro")

    expected_response = [
        {
@@ -576,7 +576,7 @@ def test_vertex_only_image_user_message():
                {
                    "inline_data": {
                        "data": "/9j/2wCEAAgGBgcGBQ",
-                        "mimeType": "image/jpeg",
+                        "mime_type": "image/jpeg",
                    }
                },
                {"text": " "},
--- a/tests/test_litellm/llms/vertex_ai/gemini/test_vertex_and_google_ai_studio_gemini.py
+++ b/tests/test_litellm/llms/vertex_ai/gemini/test_vertex_and_google_ai_studio_gemini.py
@@ -1767,15 +1767,15 @@ def test_temperature_default_for_gemini_3():
 def test_media_resolution_from_detail_parameter():
    """Test that OpenAI's detail parameter is correctly mapped to media_resolution"""
    from litellm.llms.vertex_ai.gemini.transformation import (
+        _convert_detail_to_media_resolution_enum,
        _gemini_convert_messages_with_history,
-        _map_openai_detail_to_media_resolution,
    )

-    # Test detail -> media_resolution mapping
-    assert _map_openai_detail_to_media_resolution("low") == "low"
-    assert _map_openai_detail_to_media_resolution("high") == "high"
-    assert _map_openai_detail_to_media_resolution("auto") is None
-    assert _map_openai_detail_to_media_resolution(None) is None
+    # Test detail -> media_resolution enum mapping
+    assert _convert_detail_to_media_resolution_enum("low") == {"level": "MEDIA_RESOLUTION_LOW"}
+    assert _convert_detail_to_media_resolution_enum("high") == {"level": "MEDIA_RESOLUTION_HIGH"}
+    assert _convert_detail_to_media_resolution_enum("auto") is None
+    assert _convert_detail_to_media_resolution_enum(None) is None

    # Test with actual message transformation using base64 image
    # Using a minimal valid base64-encoded 1x1 PNG
@@ -1795,27 +1795,28 @@ def test_media_resolution_from_detail_parameter():
        }
    ]

-    contents = _gemini_convert_messages_with_history(messages=messages)
+    contents = _gemini_convert_messages_with_history(
+        messages=messages, model="gemini-3-pro-preview"
+    )
    
-    # Verify media_resolution is set in the inline_data
-    # Note: Gemini adds a blank text part when there's no text, so we expect 2 parts
+    # Verify media_resolution is set at the Part level (not inside inline_data)
    assert len(contents) == 1
    assert len(contents[0]["parts"]) >= 1
    # Find the part with inline_data
    image_part = None
    for part in contents[0]["parts"]:
-        if "inline_data" in part:
+        if "inline_data" in part or "inlineData" in part:
            image_part = part
            break
    assert image_part is not None
-    assert "inline_data" in image_part
-    # The TypedDict uses snake_case internally, but mediaResolution is camelCase in the dict
-    assert "mediaResolution" in image_part["inline_data"]
-    assert image_part["inline_data"]["mediaResolution"] == "high"
+    # media_resolution should be at the Part level, not inside inline_data
+    assert "media_resolution" in image_part
+    media_res = image_part.get("media_resolution")
+    assert media_res == {"level": "MEDIA_RESOLUTION_HIGH"}


 def test_media_resolution_low_detail():
-    """Test that detail='low' maps to media_resolution='low'"""
+    """Test that detail='low' maps to media_resolution enum with MEDIA_RESOLUTION_LOW"""
    from litellm.llms.vertex_ai.gemini.transformation import (
        _gemini_convert_messages_with_history,
    )
@@ -1837,7 +1838,9 @@ def test_media_resolution_low_detail():
        }
    ]

-    contents = _gemini_convert_messages_with_history(messages=messages)
+    contents = _gemini_convert_messages_with_history(
+        messages=messages, model="gemini-3-pro-preview"
+    )
    
    # Find the part with inline_data
    image_part = None
@@ -1847,7 +1850,9 @@ def test_media_resolution_low_detail():
            break
    assert image_part is not None
    assert "inline_data" in image_part
-    assert image_part["inline_data"]["mediaResolution"] == "low"
+    # media_resolution should be at the Part level, not inside inline_data
+    assert "media_resolution" in image_part
+    assert image_part["media_resolution"] == {"level": "MEDIA_RESOLUTION_LOW"}


 def test_media_resolution_auto_detail():
@@ -1884,8 +1889,8 @@ def test_media_resolution_auto_detail():
            break
    assert image_part is not None
    assert "inline_data" in image_part
-    # mediaResolution should not be set for auto
-    assert "mediaResolution" not in image_part["inline_data"] or image_part["inline_data"].get("mediaResolution") is None
+    # media_resolution should not be set for auto (check Part level, not inline_data)
+    assert "media_resolution" not in image_part

    # Test with None
    messages_none = [
@@ -1911,8 +1916,8 @@ def test_media_resolution_auto_detail():
            break
    assert image_part is not None
    assert "inline_data" in image_part
-    # mediaResolution should not be set
-    assert "mediaResolution" not in image_part["inline_data"] or image_part["inline_data"].get("mediaResolution") is None
+    # media_resolution should not be set (check Part level, not inline_data)
+    assert "media_resolution" not in image_part


 def test_media_resolution_per_part():
@@ -1951,7 +1956,9 @@ def test_media_resolution_per_part():
        }
    ]

-    contents = _gemini_convert_messages_with_history(messages=messages)
+    contents = _gemini_convert_messages_with_history(
+        messages=messages, model="gemini-3-pro-preview"
+    )
    
    # Should have one content with multiple parts
    assert len(contents) == 1
@@ -1960,12 +1967,53 @@ def test_media_resolution_per_part():
    # First image should have low resolution (first part is the image)
    image1_part = contents[0]["parts"][0]
    assert "inline_data" in image1_part
-    assert image1_part["inline_data"]["mediaResolution"] == "low"
+    # media_resolution should be at the Part level, not inside inline_data
+    assert "media_resolution" in image1_part
+    assert image1_part["media_resolution"] == {"level": "MEDIA_RESOLUTION_LOW"}
    
    # Second image should have high resolution (third part is the second image)
    image2_part = contents[0]["parts"][2]
    assert "inline_data" in image2_part
-    assert image2_part["inline_data"]["mediaResolution"] == "high"
+    # media_resolution should be at the Part level, not inside inline_data
+    assert "media_resolution" in image2_part
+    assert image2_part["media_resolution"] == {"level": "MEDIA_RESOLUTION_HIGH"}
+
+
+def test_media_resolution_only_for_gemini_3_models():
+    """Ensure media_resolution is not added for non-Gemini 3 models."""
+    from litellm.llms.vertex_ai.gemini.transformation import (
+        _gemini_convert_messages_with_history,
+    )
+
+    base64_image = "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNk+M9QDwADhgGAWjR9awAAAABJRU5ErkJggg=="
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                {
+                    "type": "image_url",
+                    "image_url": {
+                        "url": base64_image,
+                        "detail": "high",
+                    },
+                }
+            ],
+        }
+    ]
+
+    contents = _gemini_convert_messages_with_history(
+        messages=messages, model="gemini-2.5-pro"
+    )
+    image_part = None
+    for part in contents[0]["parts"]:
+        if "inline_data" in part:
+            image_part = part
+            break
+    assert image_part is not None
+    assert "inline_data" in image_part
+    # media_resolution should not be at the Part level for non-Gemini 3 models
+    assert "media_resolution" not in image_part
+    assert "mediaResolution" not in image_part


 def test_gemini_3_image_models_no_thinking_config():
--- a/tests/test_litellm/llms/vertex_ai/test_vertex.py
+++ b/tests/test_litellm/llms/vertex_ai/test_vertex.py
@@ -1241,7 +1241,7 @@ def test_process_gemini_image():
    base64_image = "data:image/jpeg;base64,/9j/4AAQSkZJRg..."
    base64_result = _process_gemini_image(base64_image)
    print("base64_result", base64_result)
-    assert base64_result["inline_data"]["mimeType"] == "image/jpeg"
+    assert base64_result["inline_data"]["mime_type"] == "image/jpeg"
    assert base64_result["inline_data"]["data"] == "/9j/4AAQSkZJRg..."