Merge pull request #17137 from BerriAI/litellm_gemini3_media_res_fix

Make sure that media resolution is only applied for Gemini 3 models
This commit is contained in:
Sameer Kankute
2025-12-06 00:06:55 +05:30
committed by GitHub
7 changed files with 138 additions and 66 deletions

View File

@@ -114,20 +114,27 @@ class GoogleAIStudioGeminiConfig(VertexGeminiConfig):
img_element = element
_image_url: Optional[str] = None
format: Optional[str] = None
detail: Optional[str] = None
if isinstance(img_element.get("image_url"), dict):
_image_url = img_element["image_url"].get("url") # type: ignore
format = img_element["image_url"].get("format") # type: ignore
detail = img_element["image_url"].get("detail") # type: ignore
else:
_image_url = img_element.get("image_url") # type: ignore
if _image_url and "https://" in _image_url:
image_obj = convert_to_anthropic_image_obj(
_image_url, format=format
)
img_element["image_url"] = ( # type: ignore
convert_generic_image_chunk_to_openai_image_obj(
image_obj
)
converted_image_url = convert_generic_image_chunk_to_openai_image_obj(
image_obj
)
if detail is not None:
img_element["image_url"] = { # type: ignore
"url": converted_image_url,
"detail": detail
}
else:
img_element["image_url"] = converted_image_url # type: ignore
elif element.get("type") == "file":
file_element = cast(ChatCompletionFileObject, element)
file_id = file_element["file"].get("file_id")

View File

@@ -280,18 +280,24 @@ def _get_gemini_url(
stream: Optional[bool],
gemini_api_key: Optional[str],
) -> Tuple[str, str]:
from litellm.llms.vertex_ai.gemini.vertex_and_google_ai_studio_gemini import (
VertexGeminiConfig,
)
_gemini_model_name = "models/{}".format(model)
api_version = "v1alpha" if VertexGeminiConfig._is_gemini_3_or_newer(model) else "v1beta"
if mode == "chat":
endpoint = "generateContent"
if stream is True:
endpoint = "streamGenerateContent"
url = "https://generativelanguage.googleapis.com/v1beta/{}:{}?key={}&alt=sse".format(
_gemini_model_name, endpoint, gemini_api_key
url = "https://generativelanguage.googleapis.com/{}/{}:{}?key={}&alt=sse".format(
api_version, _gemini_model_name, endpoint, gemini_api_key
)
else:
url = (
"https://generativelanguage.googleapis.com/v1beta/{}:{}?key={}".format(
_gemini_model_name, endpoint, gemini_api_key
"https://generativelanguage.googleapis.com/{}/{}:{}?key={}".format(
api_version, _gemini_model_name, endpoint, gemini_api_key
)
)
elif mode == "embedding":

View File

@@ -5,7 +5,7 @@ Why separate file? Make it easy to see how transformation works
"""
import os
from typing import TYPE_CHECKING, List, Literal, Optional, Tuple, Union, cast
from typing import TYPE_CHECKING, Dict, List, Literal, Optional, Tuple, Union, cast
import httpx
from pydantic import BaseModel
@@ -28,7 +28,6 @@ from litellm.types.files import (
get_file_type_from_extension,
is_gemini_1_5_accepted_file_type,
)
from litellm.types.utils import LlmProviders
from litellm.types.llms.openai import (
AllMessageValues,
ChatCompletionAssistantMessage,
@@ -48,7 +47,7 @@ from litellm.types.llms.vertex_ai import (
ToolConfig,
Tools,
)
from litellm.types.utils import GenericImageParsingChunk
from litellm.types.utils import GenericImageParsingChunk, LlmProviders
from ..common_utils import (
_check_text_in_content,
@@ -64,24 +63,21 @@ else:
LiteLLMLoggingObj = Any
def _map_openai_detail_to_media_resolution(
def _convert_detail_to_media_resolution_enum(
detail: Optional[str],
) -> Optional[Literal["low", "medium", "high"]]:
"""
Map OpenAI's "detail" parameter to Gemini's "media_resolution" parameter.
"""
) -> Optional[Dict[str, str]]:
if detail == "low":
return "low"
return {"level": "MEDIA_RESOLUTION_LOW"}
elif detail == "high":
return "high"
# "auto" or None means let the model decide, so we don't set media_resolution
return {"level": "MEDIA_RESOLUTION_HIGH"}
return None
def _process_gemini_image(
image_url: str,
format: Optional[str] = None,
media_resolution: Optional[Literal["low", "medium", "high"]] = None,
media_resolution_enum: Optional[Dict[str, str]] = None,
model: Optional[str] = None,
) -> PartType:
"""
Given an image URL, return the appropriate PartType for Gemini
@@ -105,31 +101,43 @@ def _process_gemini_image(
else:
mime_type = format
file_data = FileDataType(mime_type=mime_type, file_uri=image_url)
return PartType(file_data=file_data)
part: PartType = {"file_data": file_data}
if media_resolution_enum is not None and model is not None:
from .vertex_and_google_ai_studio_gemini import VertexGeminiConfig
if VertexGeminiConfig._is_gemini_3_or_newer(model):
part_dict = dict(part)
part_dict["media_resolution"] = media_resolution_enum
return cast(PartType, part_dict)
return part
elif (
"https://" in image_url
and (image_type := format or _get_image_mime_type_from_url(image_url))
is not None
):
file_data = FileDataType(file_uri=image_url, mime_type=image_type)
return PartType(file_data=file_data)
part: PartType = {"file_data": file_data}
if media_resolution_enum is not None and model is not None:
from .vertex_and_google_ai_studio_gemini import VertexGeminiConfig
if VertexGeminiConfig._is_gemini_3_or_newer(model):
part_dict = dict(part)
part_dict["media_resolution"] = media_resolution_enum
return cast(PartType, part_dict)
return part
elif "http://" in image_url or "https://" in image_url or "base64" in image_url:
# https links for unsupported mime types and base64 images
image = convert_to_anthropic_image_obj(image_url, format=format)
_blob: BlobType = {"data": image["data"], "mime_type": image["media_type"]}
if media_resolution is not None:
_blob["media_resolution"] = media_resolution
# Convert snake_case keys to camelCase for JSON serialization
# The TypedDict uses snake_case, but the API expects camelCase
_blob_dict = dict(_blob)
if "media_resolution" in _blob_dict:
_blob_dict["mediaResolution"] = _blob_dict.pop("media_resolution")
if "mime_type" in _blob_dict:
_blob_dict["mimeType"] = _blob_dict.pop("mime_type")
part: PartType = {"inline_data": cast(BlobType, _blob)}
return PartType(inline_data=cast(BlobType, _blob_dict))
if media_resolution_enum is not None and model is not None:
from .vertex_and_google_ai_studio_gemini import VertexGeminiConfig
if VertexGeminiConfig._is_gemini_3_or_newer(model):
part_dict = dict(part)
part_dict["media_resolution"] = media_resolution_enum
return cast(PartType, part_dict)
return part
raise Exception("Invalid image received - {}".format(image_url))
except Exception as e:
raise e
@@ -235,18 +243,19 @@ def _gemini_convert_messages_with_history( # noqa: PLR0915
element = cast(ChatCompletionImageObject, element)
img_element = element
format: Optional[str] = None
media_resolution: Optional[Literal["low", "medium", "high"]] = None
media_resolution_enum: Optional[Dict[str, str]] = None
if isinstance(img_element["image_url"], dict):
image_url = img_element["image_url"]["url"]
format = img_element["image_url"].get("format")
detail = img_element["image_url"].get("detail")
media_resolution = _map_openai_detail_to_media_resolution(detail)
media_resolution_enum = _convert_detail_to_media_resolution_enum(detail)
else:
image_url = img_element["image_url"]
_part = _process_gemini_image(
image_url=image_url,
format=format,
media_resolution=media_resolution,
media_resolution_enum=media_resolution_enum,
model=model,
)
_parts.append(_part)
elif element["type"] == "input_audio":
@@ -271,6 +280,7 @@ def _gemini_convert_messages_with_history( # noqa: PLR0915
_part = _process_gemini_image(
image_url=openai_image_str,
format=audio_format_modified,
model=model,
)
_parts.append(_part)
elif element["type"] == "file":
@@ -287,6 +297,7 @@ def _gemini_convert_messages_with_history( # noqa: PLR0915
_part = _process_gemini_image(
image_url=passed_file,
format=format,
model=model,
)
_parts.append(_part)
except Exception:

View File

@@ -32,7 +32,6 @@ class FileDataType(TypedDict):
class BlobType(TypedDict, total=False):
mime_type: Required[str]
data: Required[str]
media_resolution: Literal["low", "medium", "high"]
class PartType(TypedDict, total=False):
@@ -43,6 +42,7 @@ class PartType(TypedDict, total=False):
function_response: FunctionResponse
thought: bool
thoughtSignature: str
media_resolution: Literal["low", "medium", "high"]
class HttpxFunctionCall(TypedDict):
@@ -63,7 +63,6 @@ class HttpxCodeExecutionResult(TypedDict):
class HttpxBlobType(TypedDict, total=False):
mimeType: str
data: str
mediaResolution: Literal["low", "medium", "high"]
class HttpxPartType(TypedDict, total=False):
@@ -76,6 +75,7 @@ class HttpxPartType(TypedDict, total=False):
codeExecutionResult: HttpxCodeExecutionResult
thought: bool
thoughtSignature: str
mediaResolution: Literal["low", "medium", "high"]
class HttpxContentType(TypedDict, total=False):

View File

@@ -567,7 +567,7 @@ def test_vertex_only_image_user_message():
},
]
response = _gemini_convert_messages_with_history(messages=messages)
response = _gemini_convert_messages_with_history(messages=messages, model="gemini-1.5-pro")
expected_response = [
{
@@ -576,7 +576,7 @@ def test_vertex_only_image_user_message():
{
"inline_data": {
"data": "/9j/2wCEAAgGBgcGBQ",
"mimeType": "image/jpeg",
"mime_type": "image/jpeg",
}
},
{"text": " "},

View File

@@ -1767,15 +1767,15 @@ def test_temperature_default_for_gemini_3():
def test_media_resolution_from_detail_parameter():
"""Test that OpenAI's detail parameter is correctly mapped to media_resolution"""
from litellm.llms.vertex_ai.gemini.transformation import (
_convert_detail_to_media_resolution_enum,
_gemini_convert_messages_with_history,
_map_openai_detail_to_media_resolution,
)
# Test detail -> media_resolution mapping
assert _map_openai_detail_to_media_resolution("low") == "low"
assert _map_openai_detail_to_media_resolution("high") == "high"
assert _map_openai_detail_to_media_resolution("auto") is None
assert _map_openai_detail_to_media_resolution(None) is None
# Test detail -> media_resolution enum mapping
assert _convert_detail_to_media_resolution_enum("low") == {"level": "MEDIA_RESOLUTION_LOW"}
assert _convert_detail_to_media_resolution_enum("high") == {"level": "MEDIA_RESOLUTION_HIGH"}
assert _convert_detail_to_media_resolution_enum("auto") is None
assert _convert_detail_to_media_resolution_enum(None) is None
# Test with actual message transformation using base64 image
# Using a minimal valid base64-encoded 1x1 PNG
@@ -1795,27 +1795,28 @@ def test_media_resolution_from_detail_parameter():
}
]
contents = _gemini_convert_messages_with_history(messages=messages)
contents = _gemini_convert_messages_with_history(
messages=messages, model="gemini-3-pro-preview"
)
# Verify media_resolution is set in the inline_data
# Note: Gemini adds a blank text part when there's no text, so we expect 2 parts
# Verify media_resolution is set at the Part level (not inside inline_data)
assert len(contents) == 1
assert len(contents[0]["parts"]) >= 1
# Find the part with inline_data
image_part = None
for part in contents[0]["parts"]:
if "inline_data" in part:
if "inline_data" in part or "inlineData" in part:
image_part = part
break
assert image_part is not None
assert "inline_data" in image_part
# The TypedDict uses snake_case internally, but mediaResolution is camelCase in the dict
assert "mediaResolution" in image_part["inline_data"]
assert image_part["inline_data"]["mediaResolution"] == "high"
# media_resolution should be at the Part level, not inside inline_data
assert "media_resolution" in image_part
media_res = image_part.get("media_resolution")
assert media_res == {"level": "MEDIA_RESOLUTION_HIGH"}
def test_media_resolution_low_detail():
"""Test that detail='low' maps to media_resolution='low'"""
"""Test that detail='low' maps to media_resolution enum with MEDIA_RESOLUTION_LOW"""
from litellm.llms.vertex_ai.gemini.transformation import (
_gemini_convert_messages_with_history,
)
@@ -1837,7 +1838,9 @@ def test_media_resolution_low_detail():
}
]
contents = _gemini_convert_messages_with_history(messages=messages)
contents = _gemini_convert_messages_with_history(
messages=messages, model="gemini-3-pro-preview"
)
# Find the part with inline_data
image_part = None
@@ -1847,7 +1850,9 @@ def test_media_resolution_low_detail():
break
assert image_part is not None
assert "inline_data" in image_part
assert image_part["inline_data"]["mediaResolution"] == "low"
# media_resolution should be at the Part level, not inside inline_data
assert "media_resolution" in image_part
assert image_part["media_resolution"] == {"level": "MEDIA_RESOLUTION_LOW"}
def test_media_resolution_auto_detail():
@@ -1884,8 +1889,8 @@ def test_media_resolution_auto_detail():
break
assert image_part is not None
assert "inline_data" in image_part
# mediaResolution should not be set for auto
assert "mediaResolution" not in image_part["inline_data"] or image_part["inline_data"].get("mediaResolution") is None
# media_resolution should not be set for auto (check Part level, not inline_data)
assert "media_resolution" not in image_part
# Test with None
messages_none = [
@@ -1911,8 +1916,8 @@ def test_media_resolution_auto_detail():
break
assert image_part is not None
assert "inline_data" in image_part
# mediaResolution should not be set
assert "mediaResolution" not in image_part["inline_data"] or image_part["inline_data"].get("mediaResolution") is None
# media_resolution should not be set (check Part level, not inline_data)
assert "media_resolution" not in image_part
def test_media_resolution_per_part():
@@ -1951,7 +1956,9 @@ def test_media_resolution_per_part():
}
]
contents = _gemini_convert_messages_with_history(messages=messages)
contents = _gemini_convert_messages_with_history(
messages=messages, model="gemini-3-pro-preview"
)
# Should have one content with multiple parts
assert len(contents) == 1
@@ -1960,12 +1967,53 @@ def test_media_resolution_per_part():
# First image should have low resolution (first part is the image)
image1_part = contents[0]["parts"][0]
assert "inline_data" in image1_part
assert image1_part["inline_data"]["mediaResolution"] == "low"
# media_resolution should be at the Part level, not inside inline_data
assert "media_resolution" in image1_part
assert image1_part["media_resolution"] == {"level": "MEDIA_RESOLUTION_LOW"}
# Second image should have high resolution (third part is the second image)
image2_part = contents[0]["parts"][2]
assert "inline_data" in image2_part
assert image2_part["inline_data"]["mediaResolution"] == "high"
# media_resolution should be at the Part level, not inside inline_data
assert "media_resolution" in image2_part
assert image2_part["media_resolution"] == {"level": "MEDIA_RESOLUTION_HIGH"}
def test_media_resolution_only_for_gemini_3_models():
    """A `detail` hint on an image must not produce media resolution for non-Gemini 3 models."""
    from litellm.llms.vertex_ai.gemini.transformation import (
        _gemini_convert_messages_with_history,
    )

    # Inline PNG passed as a base64 data URL, so the image is sent as inline_data.
    png_data_url = "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNk+M9QDwADhgGAWjR9awAAAABJRU5ErkJggg=="
    image_content = {
        "type": "image_url",
        "image_url": {"url": png_data_url, "detail": "high"},
    }
    messages = [{"role": "user", "content": [image_content]}]

    # Convert for a model that is not Gemini 3, even though detail="high" was requested.
    contents = _gemini_convert_messages_with_history(
        messages=messages, model="gemini-2.5-pro"
    )

    # Pick the first part that carries the inline image payload (None if absent).
    image_part = next(
        (part for part in contents[0]["parts"] if "inline_data" in part),
        None,
    )

    assert image_part is not None
    assert "inline_data" in image_part
    # Neither spelling of the key may appear at the Part level for this model.
    assert "media_resolution" not in image_part
    assert "mediaResolution" not in image_part
def test_gemini_3_image_models_no_thinking_config():

View File

@@ -1241,7 +1241,7 @@ def test_process_gemini_image():
base64_image = "data:image/jpeg;base64,/9j/4AAQSkZJRg..."
base64_result = _process_gemini_image(base64_image)
print("base64_result", base64_result)
assert base64_result["inline_data"]["mimeType"] == "image/jpeg"
assert base64_result["inline_data"]["mime_type"] == "image/jpeg"
assert base64_result["inline_data"]["data"] == "/9j/4AAQSkZJRg..."