mirror of
https://github.com/BerriAI/litellm.git
Add support for Cursor BYOK with its own configuration
This commit is contained in:
docs/my-website/docs/proxy/cursor.md (new file, 94 lines added)
@@ -0,0 +1,94 @@
---
id: cursor
title: Cursor Endpoint (/cursor/chat/completions)
description: Accept Responses API input from Cursor and return OpenAI Chat Completions output
---

LiteLLM provides a Cursor-specific endpoint so that the Cursor IDE works seamlessly with the LiteLLM Proxy when using BYOK with a custom `base_url`.

- Accepts requests in the OpenAI Responses API input format (Cursor sends this)
- Returns responses in the OpenAI Chat Completions format (Cursor expects this)
- Supports streaming and non-streaming

## Endpoint

- Path: `/cursor/chat/completions`
- Auth: Standard LiteLLM Proxy auth (`Authorization: Bearer <key>`)
- Behavior: Internally routes to the LiteLLM `/responses` flow and transforms the output to Chat Completions

## Why this exists

When setting up Cursor with BYOK against a custom `base_url`, Cursor sends requests to the Chat Completions endpoint but in the OpenAI Responses API input shape. Without translation, Cursor won't display streamed output. This endpoint bridges the two formats:

- Input: Responses API (`input`, tool calls, etc.)
- Output: Chat Completions (`choices`, `delta`, `finish_reason`, etc.)

## Usage

### Non-streaming

```bash
curl -X POST http://localhost:4000/cursor/chat/completions \
  -H "Content-Type: application/json" \
  -H "Authorization: Bearer sk-1234" \
  -d '{
    "model": "gpt-4o",
    "input": [{"role": "user", "content": "Hello"}]
  }'
```

Example response (shape):

```json
{
  "id": "chatcmpl-123",
  "object": "chat.completion",
  "created": 1733333333,
  "model": "gpt-4o",
  "choices": [
    {
      "index": 0,
      "message": {
        "role": "assistant",
        "content": "Hello! How can I help you?"
      },
      "finish_reason": "stop"
    }
  ],
  "usage": {
    "prompt_tokens": 10,
    "completion_tokens": 8,
    "total_tokens": 18
  }
}
```
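For quick local testing outside of Cursor, here is a minimal Python equivalent of the curl call above. This is a sketch: it assumes the same proxy URL and API key as the example, and uses the `httpx` client, which is not part of this change.

```python
import httpx  # assumption: any HTTP client works; httpx is used here for brevity

# Same body shape Cursor sends: Responses API `input`, not Chat Completions `messages`.
resp = httpx.post(
    "http://localhost:4000/cursor/chat/completions",
    headers={"Authorization": "Bearer sk-1234"},
    json={
        "model": "gpt-4o",
        "input": [{"role": "user", "content": "Hello"}],
    },
    timeout=60.0,
)
resp.raise_for_status()

# The response body comes back in Chat Completions shape.
print(resp.json()["choices"][0]["message"]["content"])
```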
### Streaming

```bash
curl -N -X POST http://localhost:4000/cursor/chat/completions \
  -H "Content-Type: application/json" \
  -H "Authorization: Bearer sk-1234" \
  -d '{
    "model": "gpt-4o",
    "input": [{"role": "user", "content": "Hello"}],
    "stream": true
  }'
```

- The response is streamed as Server-Sent Events (SSE)
- Emits `chat.completion.chunk` deltas (`choices[].delta`) and ends with `data: [DONE]` (see the example stream below)
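For illustration, a successful stream has roughly this shape (values are placeholders and intermediate chunks are abridged):

```text
data: {"id":"chatcmpl-123","object":"chat.completion.chunk","created":1733333333,"model":"gpt-4o","choices":[{"index":0,"delta":{"role":"assistant","content":""},"finish_reason":null}]}

data: {"id":"chatcmpl-123","object":"chat.completion.chunk","created":1733333333,"model":"gpt-4o","choices":[{"index":0,"delta":{"content":"Hello! How can I help you?"},"finish_reason":null}]}

data: {"id":"chatcmpl-123","object":"chat.completion.chunk","created":1733333333,"model":"gpt-4o","choices":[{"index":0,"delta":{},"finish_reason":"stop"}]}

data: [DONE]
```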
## Configuration

No special configuration is required beyond your normal LiteLLM Proxy setup. Ensure that:

- Your `config.yaml` includes the models you want to call via this endpoint (see the sketch below)
- Your Cursor project uses your LiteLLM Proxy `base_url` and a valid API key
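As an illustration, a minimal `config.yaml` sketch that exposes `gpt-4o` through the proxy; the model alias, provider route, environment variable, and master key are placeholders, not requirements of this change:

```yaml
model_list:
  - model_name: gpt-4o                      # name requested by Cursor
    litellm_params:
      model: openai/gpt-4o                  # underlying provider/model (placeholder)
      api_key: os.environ/OPENAI_API_KEY    # read from the environment

general_settings:
  master_key: sk-1234                       # key used in the Authorization header above
```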
## Notes

- Only this page documents the Cursor endpoint. The native `/responses` docs remain unchanged.
- This endpoint is intended specifically for Cursor’s request/response expectations. Other clients should continue to use `/v1/chat/completions` or `/v1/responses` as appropriate.
@@ -448,6 +448,7 @@ const sidebars = {
        "realtime",
        "rerank",
        "response_api",
        "proxy/cursor",
        {
          type: "category",
          label: "/search",
@@ -85,6 +85,150 @@ async def responses_api(
        )


@router.post(
    "/cursor/chat/completions",
    dependencies=[Depends(user_api_key_auth)],
    tags=["responses"],
)
async def cursor_chat_completions(
    request: Request,
    fastapi_response: Response,
    user_api_key_dict: UserAPIKeyAuth = Depends(user_api_key_auth),
):
    """
    Cursor-specific endpoint that accepts Responses API input format but returns Chat Completions format.

    This endpoint handles requests from the Cursor IDE, which sends the Responses API format (`input` field)
    but expects a Chat Completions format response (`choices`, `message`, etc.).

    ```bash
    curl -X POST http://localhost:4000/cursor/chat/completions \
      -H "Content-Type: application/json" \
      -H "Authorization: Bearer sk-1234" \
      -d '{
        "model": "gpt-4o",
        "input": [{"role": "user", "content": "Hello"}]
      }'
    ```

    Responds back in Chat Completions format.
    """
    from litellm.completion_extras.litellm_responses_transformation.handler import (
        responses_api_bridge,
    )
    from litellm.litellm_core_utils.streaming_handler import CustomStreamWrapper
    from litellm.proxy.proxy_server import (
        _read_request_body,
        async_data_generator,
        general_settings,
        llm_router,
        proxy_config,
        proxy_logging_obj,
        user_api_base,
        user_max_tokens,
        user_model,
        user_request_timeout,
        user_temperature,
        version,
    )
    from litellm.responses.streaming_iterator import BaseResponsesAPIStreamingIterator
    from litellm.types.llms.openai import ResponsesAPIResponse

    data = await _read_request_body(request=request)
    processor = ProxyBaseLLMRequestProcessing(data=data)

    def cursor_data_generator(response, user_api_key_dict, request_data):
        """
        Custom generator that transforms Responses API streaming chunks into chat completion chunks.

        This generator is used for the cursor endpoint to convert Responses API format responses
        to the chat completion format that the Cursor IDE expects.

        Args:
            response: The streaming response (BaseResponsesAPIStreamingIterator or other)
            user_api_key_dict: User API key authentication dict
            request_data: Request data containing model, logging_obj, etc.

        Returns:
            Async generator that yields SSE-formatted chat completion chunks
        """
        # If response is a BaseResponsesAPIStreamingIterator, transform it first
        if isinstance(response, BaseResponsesAPIStreamingIterator):
            # Transform the Responses API iterator into a chat completion iterator
            completion_stream = responses_api_bridge.transformation_handler.get_model_response_iterator(
                streaming_response=response,
                sync_stream=False,
                json_mode=False,
            )
            # Wrap in CustomStreamWrapper to get the async generator
            logging_obj = request_data.get("litellm_logging_obj")
            streamwrapper = CustomStreamWrapper(
                completion_stream=completion_stream,
                model=request_data.get("model", ""),
                custom_llm_provider=None,
                logging_obj=logging_obj,
            )
            # Use async_data_generator to format the chunks as SSE
            return async_data_generator(
                response=streamwrapper,
                user_api_key_dict=user_api_key_dict,
                request_data=request_data,
            )
        # Otherwise, use the default generator
        return async_data_generator(
            response=response,
            user_api_key_dict=user_api_key_dict,
            request_data=request_data,
        )

    try:
        response = await processor.base_process_llm_request(
            request=request,
            fastapi_response=fastapi_response,
            user_api_key_dict=user_api_key_dict,
            route_type="aresponses",
            proxy_logging_obj=proxy_logging_obj,
            llm_router=llm_router,
            general_settings=general_settings,
            proxy_config=proxy_config,
            select_data_generator=cursor_data_generator,
            model=None,
            user_model=user_model,
            user_temperature=user_temperature,
            user_request_timeout=user_request_timeout,
            user_max_tokens=user_max_tokens,
            user_api_base=user_api_base,
            version=version,
        )

        # Transform a non-streaming Responses API response to Chat Completions format
        if isinstance(response, ResponsesAPIResponse):
            logging_obj = processor.data.get("litellm_logging_obj")
            transformed_response = responses_api_bridge.transformation_handler.transform_response(
                model=processor.data.get("model", ""),
                raw_response=response,
                model_response=None,
                logging_obj=logging_obj,
                request_data=processor.data,
                messages=processor.data.get("input", []),
                optional_params={},
                litellm_params={},
                encoding=None,
                api_key=None,
                json_mode=None,
            )
            return transformed_response

        # Streaming responses are already transformed by cursor_data_generator
        return response
    except Exception as e:
        raise await processor._handle_llm_api_exception(
            e=e,
            user_api_key_dict=user_api_key_dict,
            proxy_logging_obj=proxy_logging_obj,
            version=version,
        )


@router.get(
    "/v1/responses/{response_id}",
    dependencies=[Depends(user_api_key_auth)],
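For context, here is a minimal client-side sketch of how the transformed SSE stream produced by `cursor_data_generator` could be consumed outside of Cursor. This is illustrative only: it assumes a proxy at `http://localhost:4000` with the `sk-1234` key from the docs above, and uses `httpx`, which is not part of this diff.

```python
import json

import httpx  # assumption: any HTTP client works; httpx is used here for brevity

payload = {
    "model": "gpt-4o",
    "input": [{"role": "user", "content": "Hello"}],
    "stream": True,
}
headers = {"Authorization": "Bearer sk-1234"}  # placeholder key

# Consume the SSE stream of chat.completion.chunk events, as Cursor would.
with httpx.stream(
    "POST",
    "http://localhost:4000/cursor/chat/completions",
    json=payload,
    headers=headers,
    timeout=None,
) as resp:
    resp.raise_for_status()
    for line in resp.iter_lines():
        if not line.startswith("data: "):
            continue
        data = line[len("data: "):]
        if data.strip() == "[DONE]":
            break
        chunk = json.loads(data)
        delta = chunk["choices"][0].get("delta", {})
        print(delta.get("content") or "", end="", flush=True)
print()
```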
@@ -51,3 +51,66 @@ class TestResponsesAPIEndpoints(unittest.TestCase):
        assert response.status_code in [200, 401, 500]

    @pytest.mark.asyncio
    @patch("litellm.proxy.proxy_server.llm_router")
    @patch("litellm.proxy.proxy_server.user_api_key_auth")
    async def test_cursor_chat_completions_route(self, mock_auth, mock_router):
        """
        Test that the /cursor/chat/completions endpoint:
        1. Accepts Responses API input format
        2. Returns a chat completions format response
        3. Transforms streaming responses correctly
        """
        from litellm.types.llms.openai import ResponsesAPIResponse
        from litellm.types.utils import ResponseOutputMessage, ResponseOutputText

        mock_auth.return_value = MagicMock(
            token="test_token",
            user_id="test_user",
            team_id=None,
        )

        # Mock a Responses API response
        mock_responses_response = ResponsesAPIResponse(
            id="resp_cursor123",
            created_at=1234567890,
            model="gpt-4o",
            object="response",
            output=[
                ResponseOutputMessage(
                    type="message",
                    role="assistant",
                    content=[
                        ResponseOutputText(type="output_text", text="Hello from Cursor!")
                    ],
                )
            ],
        )

        mock_router.aresponses = AsyncMock(return_value=mock_responses_response)

        client = TestClient(app)

        # Test with the Responses API input format (what Cursor sends)
        test_data = {
            "model": "gpt-4o",
            "input": [{"role": "user", "content": "Hello"}],
        }

        response = client.post(
            "/cursor/chat/completions",
            json=test_data,
            headers={"Authorization": "Bearer sk-1234"},
        )

        # Should return 200 (or 401/500 if auth fails)
        assert response.status_code in [200, 401, 500]

        # If successful, verify it returns chat completions format
        if response.status_code == 200:
            response_data = response.json()
            # Should have the chat completion structure
            assert "choices" in response_data or "id" in response_data
            # Should not have the Responses API structure
            assert "output" not in response_data or "status" not in response_data