mirror of
https://github.com/BerriAI/litellm.git
synced 2025-12-06 11:33:26 +08:00
Fix: Gemini image_tokens incorrectly treated as text tokens in cost calculation (#17554)
When Gemini image generation models return `text_tokens=0` with `image_tokens > 0`, the cost calculator was assuming no token breakdown existed and treating all completion tokens as text tokens, resulting in ~10x underestimation of costs. Changes: - Fix cost calculation logic to respect token breakdown when image/audio/reasoning tokens are present, even if text_tokens=0 - Add `output_cost_per_image_token` pricing for gemini-3-pro-image-preview models - Add test case reproducing the issue - Add documentation explaining image token pricing Fixes #17410
This commit is contained in:
@@ -2006,3 +2006,34 @@ curl -L -X POST 'http://localhost:4000/v1/chat/completions' \
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
### Image Generation Pricing
|
||||
|
||||
Gemini image generation models (like `gemini-3-pro-image-preview`) return `image_tokens` in the response usage. These tokens are priced differently from text tokens:
|
||||
|
||||
| Token Type | Price per 1M tokens | Price per token |
|------------|---------------------|-----------------|
| Text output | $12 | $0.000012 |
| Image output | $120 | $0.00012 |
|
||||
|
||||
The number of image tokens depends on the output resolution:
|
||||
|
||||
| Resolution | Tokens per image | Cost per image |
|------------|------------------|----------------|
| 1K-2K (1024x1024 to 2048x2048) | 1,120 | $0.134 |
| 4K (4096x4096) | 2,000 | $0.24 |
|
||||
|
||||
LiteLLM automatically calculates costs using `output_cost_per_image_token` from the model pricing configuration.
|
||||
|
||||
**Example response usage:**
|
||||
```json
{
  "completion_tokens_details": {
    "reasoning_tokens": 225,
    "text_tokens": 0,
    "image_tokens": 1120
  }
}
```
|
||||
|
||||
For more details, see [Google's Gemini pricing documentation](https://ai.google.dev/gemini-api/docs/pricing).
|
||||
|
||||
|
||||
@@ -583,9 +583,11 @@ def generic_cost_per_token(
|
||||
reasoning_tokens = completion_tokens_details["reasoning_tokens"]
|
||||
image_tokens = completion_tokens_details["image_tokens"]
|
||||
|
||||
if text_tokens == 0:
|
||||
# Only assume all tokens are text if there's NO breakdown at all
|
||||
# If image_tokens, audio_tokens, or reasoning_tokens exist, respect text_tokens=0
|
||||
has_token_breakdown = image_tokens > 0 or audio_tokens > 0 or reasoning_tokens > 0
|
||||
if text_tokens == 0 and not has_token_breakdown:
|
||||
text_tokens = usage.completion_tokens
|
||||
if text_tokens == usage.completion_tokens:
|
||||
is_text_tokens_total = True
|
||||
## TEXT COST
|
||||
completion_cost = float(text_tokens) * completion_base_cost
|
||||
|
||||
@@ -12147,6 +12147,7 @@
|
||||
"max_tokens": 65536,
|
||||
"mode": "image_generation",
|
||||
"output_cost_per_image": 0.134,
|
||||
"output_cost_per_image_token": 1.2e-04,
|
||||
"output_cost_per_token": 1.2e-05,
|
||||
"output_cost_per_token_batches": 6e-06,
|
||||
"source": "https://ai.google.dev/gemini-api/docs/pricing",
|
||||
@@ -13884,6 +13885,7 @@
|
||||
"max_tokens": 65536,
|
||||
"mode": "image_generation",
|
||||
"output_cost_per_image": 0.134,
|
||||
"output_cost_per_image_token": 1.2e-04,
|
||||
"output_cost_per_token": 1.2e-05,
|
||||
"rpm": 1000,
|
||||
"tpm": 4000000,
|
||||
@@ -25802,6 +25804,7 @@
|
||||
"max_tokens": 65536,
|
||||
"mode": "image_generation",
|
||||
"output_cost_per_image": 0.134,
|
||||
"output_cost_per_image_token": 1.2e-04,
|
||||
"output_cost_per_token": 1.2e-05,
|
||||
"output_cost_per_token_batches": 6e-06,
|
||||
"source": "https://docs.cloud.google.com/vertex-ai/generative-ai/docs/models/gemini/3-pro-image"
|
||||
|
||||
@@ -12147,6 +12147,7 @@
|
||||
"max_tokens": 65536,
|
||||
"mode": "image_generation",
|
||||
"output_cost_per_image": 0.134,
|
||||
"output_cost_per_image_token": 1.2e-04,
|
||||
"output_cost_per_token": 1.2e-05,
|
||||
"output_cost_per_token_batches": 6e-06,
|
||||
"source": "https://ai.google.dev/gemini-api/docs/pricing",
|
||||
@@ -13884,6 +13885,7 @@
|
||||
"max_tokens": 65536,
|
||||
"mode": "image_generation",
|
||||
"output_cost_per_image": 0.134,
|
||||
"output_cost_per_image_token": 1.2e-04,
|
||||
"output_cost_per_token": 1.2e-05,
|
||||
"rpm": 1000,
|
||||
"tpm": 4000000,
|
||||
@@ -25802,6 +25804,7 @@
|
||||
"max_tokens": 65536,
|
||||
"mode": "image_generation",
|
||||
"output_cost_per_image": 0.134,
|
||||
"output_cost_per_image_token": 1.2e-04,
|
||||
"output_cost_per_token": 1.2e-05,
|
||||
"output_cost_per_token_batches": 6e-06,
|
||||
"source": "https://docs.cloud.google.com/vertex-ai/generative-ai/docs/models/gemini/3-pro-image"
|
||||
|
||||
@@ -720,6 +720,72 @@ def test_service_tier_fallback_pricing():
|
||||
assert abs(std_cost[1] - expected_standard_completion) < 1e-10, f"Standard completion cost mismatch: {std_cost[1]} vs {expected_standard_completion}"
|
||||
|
||||
|
||||
def test_gemini_image_generation_cost_with_zero_text_tokens():
    """
    Regression test for https://github.com/BerriAI/litellm/issues/17410.

    When a Gemini image-generation response reports text_tokens=0 alongside
    non-zero image_tokens and reasoning_tokens, the image tokens must be
    billed at ``output_cost_per_image_token`` rather than lumped in with the
    text tokens at the base per-token output price.
    """
    # Use the bundled (local) model cost map so pricing is deterministic.
    os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
    litellm.model_cost = litellm.get_model_cost_map(url="")

    model_name = "gemini-3-pro-image-preview"

    # Token breakdown taken verbatim from the issue report:
    # 1345 completion tokens = 0 text + 1120 image + 225 reasoning.
    completion_details = CompletionTokensDetailsWrapper(
        accepted_prediction_tokens=None,
        audio_tokens=None,
        reasoning_tokens=225,
        rejected_prediction_tokens=None,
        text_tokens=0,  # This is the key: text_tokens=0
        image_tokens=1120,
    )
    prompt_details = PromptTokensDetailsWrapper(
        audio_tokens=None, cached_tokens=None, text_tokens=10, image_tokens=None
    )
    usage = Usage(
        completion_tokens=1345,
        prompt_tokens=10,
        total_tokens=1355,
        completion_tokens_details=completion_details,
        prompt_tokens_details=prompt_details,
    )

    prompt_cost, completion_cost = generic_cost_per_token(
        model=model_name,
        usage=usage,
        custom_llm_provider="vertex_ai",
    )

    # Expected: image tokens use image pricing, reasoning tokens use the base
    # per-token output price, and the zero text tokens contribute nothing.
    # 1120 * 1.2e-04 + 225 * 1.2e-05 ≈ 0.1344 + 0.0027 ≈ 0.1371
    pricing = litellm.model_cost[model_name]
    per_image_token = pricing.get("output_cost_per_image_token", 0)
    per_output_token = pricing.get("output_cost_per_token", 0)
    expected_completion_cost = 1120 * per_image_token + 225 * per_output_token

    # Pre-fix behavior priced all 1345 tokens as text (1345 * 1.2e-05 ≈ $0.016);
    # the fixed path yields ~$0.137, so a $0.10 floor cleanly separates the two.
    assert completion_cost > 0.10, (
        f"Completion cost should be > $0.10 (image tokens are expensive), got ${completion_cost:.6f}. "
        f"Bug: tokens may be incorrectly treated as text tokens."
    )
    assert round(completion_cost, 4) == round(expected_completion_cost, 4), (
        f"Expected completion cost ${expected_completion_cost:.6f}, got ${completion_cost:.6f}"
    )
|
||||
|
||||
|
||||
def test_bedrock_anthropic_prompt_caching():
|
||||
"""Test Bedrock Anthropic models with prompt caching return correct costs."""
|
||||
model = "us.anthropic.claude-sonnet-4-5-20250929-v1:0"
|
||||
|
||||
Reference in New Issue
Block a user