mirror of
https://github.com/BerriAI/litellm.git
synced 2025-12-06 11:33:26 +08:00
Merge pull request #17137 from BerriAI/litellm_gemini3_media_res_fix
Make sure that media resolution is only applied for Gemini 3 models
This commit is contained in:
@@ -114,20 +114,27 @@ class GoogleAIStudioGeminiConfig(VertexGeminiConfig):
|
||||
img_element = element
|
||||
_image_url: Optional[str] = None
|
||||
format: Optional[str] = None
|
||||
detail: Optional[str] = None
|
||||
if isinstance(img_element.get("image_url"), dict):
|
||||
_image_url = img_element["image_url"].get("url") # type: ignore
|
||||
format = img_element["image_url"].get("format") # type: ignore
|
||||
detail = img_element["image_url"].get("detail") # type: ignore
|
||||
else:
|
||||
_image_url = img_element.get("image_url") # type: ignore
|
||||
if _image_url and "https://" in _image_url:
|
||||
image_obj = convert_to_anthropic_image_obj(
|
||||
_image_url, format=format
|
||||
)
|
||||
img_element["image_url"] = ( # type: ignore
|
||||
convert_generic_image_chunk_to_openai_image_obj(
|
||||
image_obj
|
||||
)
|
||||
converted_image_url = convert_generic_image_chunk_to_openai_image_obj(
|
||||
image_obj
|
||||
)
|
||||
if detail is not None:
|
||||
img_element["image_url"] = { # type: ignore
|
||||
"url": converted_image_url,
|
||||
"detail": detail
|
||||
}
|
||||
else:
|
||||
img_element["image_url"] = converted_image_url # type: ignore
|
||||
elif element.get("type") == "file":
|
||||
file_element = cast(ChatCompletionFileObject, element)
|
||||
file_id = file_element["file"].get("file_id")
|
||||
|
||||
@@ -280,18 +280,24 @@ def _get_gemini_url(
|
||||
stream: Optional[bool],
|
||||
gemini_api_key: Optional[str],
|
||||
) -> Tuple[str, str]:
|
||||
from litellm.llms.vertex_ai.gemini.vertex_and_google_ai_studio_gemini import (
|
||||
VertexGeminiConfig,
|
||||
)
|
||||
|
||||
_gemini_model_name = "models/{}".format(model)
|
||||
api_version = "v1alpha" if VertexGeminiConfig._is_gemini_3_or_newer(model) else "v1beta"
|
||||
|
||||
if mode == "chat":
|
||||
endpoint = "generateContent"
|
||||
if stream is True:
|
||||
endpoint = "streamGenerateContent"
|
||||
url = "https://generativelanguage.googleapis.com/v1beta/{}:{}?key={}&alt=sse".format(
|
||||
_gemini_model_name, endpoint, gemini_api_key
|
||||
url = "https://generativelanguage.googleapis.com/{}/{}:{}?key={}&alt=sse".format(
|
||||
api_version, _gemini_model_name, endpoint, gemini_api_key
|
||||
)
|
||||
else:
|
||||
url = (
|
||||
"https://generativelanguage.googleapis.com/v1beta/{}:{}?key={}".format(
|
||||
_gemini_model_name, endpoint, gemini_api_key
|
||||
"https://generativelanguage.googleapis.com/{}/{}:{}?key={}".format(
|
||||
api_version, _gemini_model_name, endpoint, gemini_api_key
|
||||
)
|
||||
)
|
||||
elif mode == "embedding":
|
||||
|
||||
@@ -5,7 +5,7 @@ Why separate file? Make it easy to see how transformation works
|
||||
"""
|
||||
|
||||
import os
|
||||
from typing import TYPE_CHECKING, List, Literal, Optional, Tuple, Union, cast
|
||||
from typing import TYPE_CHECKING, Dict, List, Literal, Optional, Tuple, Union, cast
|
||||
|
||||
import httpx
|
||||
from pydantic import BaseModel
|
||||
@@ -28,7 +28,6 @@ from litellm.types.files import (
|
||||
get_file_type_from_extension,
|
||||
is_gemini_1_5_accepted_file_type,
|
||||
)
|
||||
from litellm.types.utils import LlmProviders
|
||||
from litellm.types.llms.openai import (
|
||||
AllMessageValues,
|
||||
ChatCompletionAssistantMessage,
|
||||
@@ -48,7 +47,7 @@ from litellm.types.llms.vertex_ai import (
|
||||
ToolConfig,
|
||||
Tools,
|
||||
)
|
||||
from litellm.types.utils import GenericImageParsingChunk
|
||||
from litellm.types.utils import GenericImageParsingChunk, LlmProviders
|
||||
|
||||
from ..common_utils import (
|
||||
_check_text_in_content,
|
||||
@@ -64,24 +63,21 @@ else:
|
||||
LiteLLMLoggingObj = Any
|
||||
|
||||
|
||||
def _map_openai_detail_to_media_resolution(
|
||||
def _convert_detail_to_media_resolution_enum(
|
||||
detail: Optional[str],
|
||||
) -> Optional[Literal["low", "medium", "high"]]:
|
||||
"""
|
||||
Map OpenAI's "detail" parameter to Gemini's "media_resolution" parameter.
|
||||
"""
|
||||
) -> Optional[Dict[str, str]]:
|
||||
if detail == "low":
|
||||
return "low"
|
||||
return {"level": "MEDIA_RESOLUTION_LOW"}
|
||||
elif detail == "high":
|
||||
return "high"
|
||||
# "auto" or None means let the model decide, so we don't set media_resolution
|
||||
return {"level": "MEDIA_RESOLUTION_HIGH"}
|
||||
return None
|
||||
|
||||
|
||||
def _process_gemini_image(
|
||||
image_url: str,
|
||||
format: Optional[str] = None,
|
||||
media_resolution: Optional[Literal["low", "medium", "high"]] = None,
|
||||
media_resolution_enum: Optional[Dict[str, str]] = None,
|
||||
model: Optional[str] = None,
|
||||
) -> PartType:
|
||||
"""
|
||||
Given an image URL, return the appropriate PartType for Gemini
|
||||
@@ -105,31 +101,43 @@ def _process_gemini_image(
|
||||
else:
|
||||
mime_type = format
|
||||
file_data = FileDataType(mime_type=mime_type, file_uri=image_url)
|
||||
|
||||
return PartType(file_data=file_data)
|
||||
part: PartType = {"file_data": file_data}
|
||||
|
||||
if media_resolution_enum is not None and model is not None:
|
||||
from .vertex_and_google_ai_studio_gemini import VertexGeminiConfig
|
||||
if VertexGeminiConfig._is_gemini_3_or_newer(model):
|
||||
part_dict = dict(part)
|
||||
part_dict["media_resolution"] = media_resolution_enum
|
||||
return cast(PartType, part_dict)
|
||||
return part
|
||||
elif (
|
||||
"https://" in image_url
|
||||
and (image_type := format or _get_image_mime_type_from_url(image_url))
|
||||
is not None
|
||||
):
|
||||
file_data = FileDataType(file_uri=image_url, mime_type=image_type)
|
||||
return PartType(file_data=file_data)
|
||||
part: PartType = {"file_data": file_data}
|
||||
|
||||
if media_resolution_enum is not None and model is not None:
|
||||
from .vertex_and_google_ai_studio_gemini import VertexGeminiConfig
|
||||
if VertexGeminiConfig._is_gemini_3_or_newer(model):
|
||||
part_dict = dict(part)
|
||||
part_dict["media_resolution"] = media_resolution_enum
|
||||
return cast(PartType, part_dict)
|
||||
return part
|
||||
elif "http://" in image_url or "https://" in image_url or "base64" in image_url:
|
||||
# https links for unsupported mime types and base64 images
|
||||
image = convert_to_anthropic_image_obj(image_url, format=format)
|
||||
_blob: BlobType = {"data": image["data"], "mime_type": image["media_type"]}
|
||||
if media_resolution is not None:
|
||||
_blob["media_resolution"] = media_resolution
|
||||
|
||||
# Convert snake_case keys to camelCase for JSON serialization
|
||||
# The TypedDict uses snake_case, but the API expects camelCase
|
||||
_blob_dict = dict(_blob)
|
||||
if "media_resolution" in _blob_dict:
|
||||
_blob_dict["mediaResolution"] = _blob_dict.pop("media_resolution")
|
||||
if "mime_type" in _blob_dict:
|
||||
_blob_dict["mimeType"] = _blob_dict.pop("mime_type")
|
||||
part: PartType = {"inline_data": cast(BlobType, _blob)}
|
||||
|
||||
return PartType(inline_data=cast(BlobType, _blob_dict))
|
||||
if media_resolution_enum is not None and model is not None:
|
||||
from .vertex_and_google_ai_studio_gemini import VertexGeminiConfig
|
||||
if VertexGeminiConfig._is_gemini_3_or_newer(model):
|
||||
part_dict = dict(part)
|
||||
part_dict["media_resolution"] = media_resolution_enum
|
||||
return cast(PartType, part_dict)
|
||||
return part
|
||||
raise Exception("Invalid image received - {}".format(image_url))
|
||||
except Exception as e:
|
||||
raise e
|
||||
@@ -235,18 +243,19 @@ def _gemini_convert_messages_with_history( # noqa: PLR0915
|
||||
element = cast(ChatCompletionImageObject, element)
|
||||
img_element = element
|
||||
format: Optional[str] = None
|
||||
media_resolution: Optional[Literal["low", "medium", "high"]] = None
|
||||
media_resolution_enum: Optional[Dict[str, str]] = None
|
||||
if isinstance(img_element["image_url"], dict):
|
||||
image_url = img_element["image_url"]["url"]
|
||||
format = img_element["image_url"].get("format")
|
||||
detail = img_element["image_url"].get("detail")
|
||||
media_resolution = _map_openai_detail_to_media_resolution(detail)
|
||||
media_resolution_enum = _convert_detail_to_media_resolution_enum(detail)
|
||||
else:
|
||||
image_url = img_element["image_url"]
|
||||
_part = _process_gemini_image(
|
||||
image_url=image_url,
|
||||
format=format,
|
||||
media_resolution=media_resolution,
|
||||
media_resolution_enum=media_resolution_enum,
|
||||
model=model,
|
||||
)
|
||||
_parts.append(_part)
|
||||
elif element["type"] == "input_audio":
|
||||
@@ -271,6 +280,7 @@ def _gemini_convert_messages_with_history( # noqa: PLR0915
|
||||
_part = _process_gemini_image(
|
||||
image_url=openai_image_str,
|
||||
format=audio_format_modified,
|
||||
model=model,
|
||||
)
|
||||
_parts.append(_part)
|
||||
elif element["type"] == "file":
|
||||
@@ -287,6 +297,7 @@ def _gemini_convert_messages_with_history( # noqa: PLR0915
|
||||
_part = _process_gemini_image(
|
||||
image_url=passed_file,
|
||||
format=format,
|
||||
model=model,
|
||||
)
|
||||
_parts.append(_part)
|
||||
except Exception:
|
||||
|
||||
@@ -32,7 +32,6 @@ class FileDataType(TypedDict):
|
||||
class BlobType(TypedDict, total=False):
|
||||
mime_type: Required[str]
|
||||
data: Required[str]
|
||||
media_resolution: Literal["low", "medium", "high"]
|
||||
|
||||
|
||||
class PartType(TypedDict, total=False):
|
||||
@@ -43,6 +42,7 @@ class PartType(TypedDict, total=False):
|
||||
function_response: FunctionResponse
|
||||
thought: bool
|
||||
thoughtSignature: str
|
||||
media_resolution: Literal["low", "medium", "high"]
|
||||
|
||||
|
||||
class HttpxFunctionCall(TypedDict):
|
||||
@@ -63,7 +63,6 @@ class HttpxCodeExecutionResult(TypedDict):
|
||||
class HttpxBlobType(TypedDict, total=False):
|
||||
mimeType: str
|
||||
data: str
|
||||
mediaResolution: Literal["low", "medium", "high"]
|
||||
|
||||
|
||||
class HttpxPartType(TypedDict, total=False):
|
||||
@@ -76,6 +75,7 @@ class HttpxPartType(TypedDict, total=False):
|
||||
codeExecutionResult: HttpxCodeExecutionResult
|
||||
thought: bool
|
||||
thoughtSignature: str
|
||||
mediaResolution: Literal["low", "medium", "high"]
|
||||
|
||||
|
||||
class HttpxContentType(TypedDict, total=False):
|
||||
|
||||
@@ -567,7 +567,7 @@ def test_vertex_only_image_user_message():
|
||||
},
|
||||
]
|
||||
|
||||
response = _gemini_convert_messages_with_history(messages=messages)
|
||||
response = _gemini_convert_messages_with_history(messages=messages, model="gemini-1.5-pro")
|
||||
|
||||
expected_response = [
|
||||
{
|
||||
@@ -576,7 +576,7 @@ def test_vertex_only_image_user_message():
|
||||
{
|
||||
"inline_data": {
|
||||
"data": "/9j/2wCEAAgGBgcGBQ",
|
||||
"mimeType": "image/jpeg",
|
||||
"mime_type": "image/jpeg",
|
||||
}
|
||||
},
|
||||
{"text": " "},
|
||||
|
||||
@@ -1767,15 +1767,15 @@ def test_temperature_default_for_gemini_3():
|
||||
def test_media_resolution_from_detail_parameter():
|
||||
"""Test that OpenAI's detail parameter is correctly mapped to media_resolution"""
|
||||
from litellm.llms.vertex_ai.gemini.transformation import (
|
||||
_convert_detail_to_media_resolution_enum,
|
||||
_gemini_convert_messages_with_history,
|
||||
_map_openai_detail_to_media_resolution,
|
||||
)
|
||||
|
||||
# Test detail -> media_resolution mapping
|
||||
assert _map_openai_detail_to_media_resolution("low") == "low"
|
||||
assert _map_openai_detail_to_media_resolution("high") == "high"
|
||||
assert _map_openai_detail_to_media_resolution("auto") is None
|
||||
assert _map_openai_detail_to_media_resolution(None) is None
|
||||
# Test detail -> media_resolution enum mapping
|
||||
assert _convert_detail_to_media_resolution_enum("low") == {"level": "MEDIA_RESOLUTION_LOW"}
|
||||
assert _convert_detail_to_media_resolution_enum("high") == {"level": "MEDIA_RESOLUTION_HIGH"}
|
||||
assert _convert_detail_to_media_resolution_enum("auto") is None
|
||||
assert _convert_detail_to_media_resolution_enum(None) is None
|
||||
|
||||
# Test with actual message transformation using base64 image
|
||||
# Using a minimal valid base64-encoded 1x1 PNG
|
||||
@@ -1795,27 +1795,28 @@ def test_media_resolution_from_detail_parameter():
|
||||
}
|
||||
]
|
||||
|
||||
contents = _gemini_convert_messages_with_history(messages=messages)
|
||||
contents = _gemini_convert_messages_with_history(
|
||||
messages=messages, model="gemini-3-pro-preview"
|
||||
)
|
||||
|
||||
# Verify media_resolution is set in the inline_data
|
||||
# Note: Gemini adds a blank text part when there's no text, so we expect 2 parts
|
||||
# Verify media_resolution is set at the Part level (not inside inline_data)
|
||||
assert len(contents) == 1
|
||||
assert len(contents[0]["parts"]) >= 1
|
||||
# Find the part with inline_data
|
||||
image_part = None
|
||||
for part in contents[0]["parts"]:
|
||||
if "inline_data" in part:
|
||||
if "inline_data" in part or "inlineData" in part:
|
||||
image_part = part
|
||||
break
|
||||
assert image_part is not None
|
||||
assert "inline_data" in image_part
|
||||
# The TypedDict uses snake_case internally, but mediaResolution is camelCase in the dict
|
||||
assert "mediaResolution" in image_part["inline_data"]
|
||||
assert image_part["inline_data"]["mediaResolution"] == "high"
|
||||
# media_resolution should be at the Part level, not inside inline_data
|
||||
assert "media_resolution" in image_part
|
||||
media_res = image_part.get("media_resolution")
|
||||
assert media_res == {"level": "MEDIA_RESOLUTION_HIGH"}
|
||||
|
||||
|
||||
def test_media_resolution_low_detail():
|
||||
"""Test that detail='low' maps to media_resolution='low'"""
|
||||
"""Test that detail='low' maps to media_resolution enum with MEDIA_RESOLUTION_LOW"""
|
||||
from litellm.llms.vertex_ai.gemini.transformation import (
|
||||
_gemini_convert_messages_with_history,
|
||||
)
|
||||
@@ -1837,7 +1838,9 @@ def test_media_resolution_low_detail():
|
||||
}
|
||||
]
|
||||
|
||||
contents = _gemini_convert_messages_with_history(messages=messages)
|
||||
contents = _gemini_convert_messages_with_history(
|
||||
messages=messages, model="gemini-3-pro-preview"
|
||||
)
|
||||
|
||||
# Find the part with inline_data
|
||||
image_part = None
|
||||
@@ -1847,7 +1850,9 @@ def test_media_resolution_low_detail():
|
||||
break
|
||||
assert image_part is not None
|
||||
assert "inline_data" in image_part
|
||||
assert image_part["inline_data"]["mediaResolution"] == "low"
|
||||
# media_resolution should be at the Part level, not inside inline_data
|
||||
assert "media_resolution" in image_part
|
||||
assert image_part["media_resolution"] == {"level": "MEDIA_RESOLUTION_LOW"}
|
||||
|
||||
|
||||
def test_media_resolution_auto_detail():
|
||||
@@ -1884,8 +1889,8 @@ def test_media_resolution_auto_detail():
|
||||
break
|
||||
assert image_part is not None
|
||||
assert "inline_data" in image_part
|
||||
# mediaResolution should not be set for auto
|
||||
assert "mediaResolution" not in image_part["inline_data"] or image_part["inline_data"].get("mediaResolution") is None
|
||||
# media_resolution should not be set for auto (check Part level, not inline_data)
|
||||
assert "media_resolution" not in image_part
|
||||
|
||||
# Test with None
|
||||
messages_none = [
|
||||
@@ -1911,8 +1916,8 @@ def test_media_resolution_auto_detail():
|
||||
break
|
||||
assert image_part is not None
|
||||
assert "inline_data" in image_part
|
||||
# mediaResolution should not be set
|
||||
assert "mediaResolution" not in image_part["inline_data"] or image_part["inline_data"].get("mediaResolution") is None
|
||||
# media_resolution should not be set (check Part level, not inline_data)
|
||||
assert "media_resolution" not in image_part
|
||||
|
||||
|
||||
def test_media_resolution_per_part():
|
||||
@@ -1951,7 +1956,9 @@ def test_media_resolution_per_part():
|
||||
}
|
||||
]
|
||||
|
||||
contents = _gemini_convert_messages_with_history(messages=messages)
|
||||
contents = _gemini_convert_messages_with_history(
|
||||
messages=messages, model="gemini-3-pro-preview"
|
||||
)
|
||||
|
||||
# Should have one content with multiple parts
|
||||
assert len(contents) == 1
|
||||
@@ -1960,12 +1967,53 @@ def test_media_resolution_per_part():
|
||||
# First image should have low resolution (first part is the image)
|
||||
image1_part = contents[0]["parts"][0]
|
||||
assert "inline_data" in image1_part
|
||||
assert image1_part["inline_data"]["mediaResolution"] == "low"
|
||||
# media_resolution should be at the Part level, not inside inline_data
|
||||
assert "media_resolution" in image1_part
|
||||
assert image1_part["media_resolution"] == {"level": "MEDIA_RESOLUTION_LOW"}
|
||||
|
||||
# Second image should have high resolution (third part is the second image)
|
||||
image2_part = contents[0]["parts"][2]
|
||||
assert "inline_data" in image2_part
|
||||
assert image2_part["inline_data"]["mediaResolution"] == "high"
|
||||
# media_resolution should be at the Part level, not inside inline_data
|
||||
assert "media_resolution" in image2_part
|
||||
assert image2_part["media_resolution"] == {"level": "MEDIA_RESOLUTION_HIGH"}
|
||||
|
||||
|
||||
def test_media_resolution_only_for_gemini_3_models():
|
||||
"""Ensure media_resolution is not added for non-Gemini 3 models."""
|
||||
from litellm.llms.vertex_ai.gemini.transformation import (
|
||||
_gemini_convert_messages_with_history,
|
||||
)
|
||||
|
||||
base64_image = ""
|
||||
messages = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {
|
||||
"url": base64_image,
|
||||
"detail": "high",
|
||||
},
|
||||
}
|
||||
],
|
||||
}
|
||||
]
|
||||
|
||||
contents = _gemini_convert_messages_with_history(
|
||||
messages=messages, model="gemini-2.5-pro"
|
||||
)
|
||||
image_part = None
|
||||
for part in contents[0]["parts"]:
|
||||
if "inline_data" in part:
|
||||
image_part = part
|
||||
break
|
||||
assert image_part is not None
|
||||
assert "inline_data" in image_part
|
||||
# media_resolution should not be at the Part level for non-Gemini 3 models
|
||||
assert "media_resolution" not in image_part
|
||||
assert "mediaResolution" not in image_part
|
||||
|
||||
|
||||
def test_gemini_3_image_models_no_thinking_config():
|
||||
|
||||
@@ -1241,7 +1241,7 @@ def test_process_gemini_image():
|
||||
base64_image = "..."
|
||||
base64_result = _process_gemini_image(base64_image)
|
||||
print("base64_result", base64_result)
|
||||
assert base64_result["inline_data"]["mimeType"] == "image/jpeg"
|
||||
assert base64_result["inline_data"]["mime_type"] == "image/jpeg"
|
||||
assert base64_result["inline_data"]["data"] == "/9j/4AAQSkZJRg..."
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user