Add support for cursor BYOK with its own configuration

Sameer Kankute
2025-12-05 09:34:49 +05:30
parent 4f3b843efe
commit a6006e698c
4 changed files with 302 additions and 0 deletions

View File

@@ -0,0 +1,94 @@
---
id: cursor
title: Cursor Endpoint (/cursor/chat/completions)
description: Accept Responses API input from Cursor and return OpenAI Chat Completions output
---
LiteLLM provides a Cursor-specific endpoint so that the Cursor IDE works seamlessly with the LiteLLM Proxy when using BYOK (Bring Your Own Key) with a custom `base_url`.
- Accepts requests in the OpenAI Responses API input format (what Cursor sends)
- Returns responses in the OpenAI Chat Completions format (what Cursor expects)
- Supports both streaming and non-streaming requests
## Endpoint
- Path: `/cursor/chat/completions`
- Auth: Standard LiteLLM Proxy auth (`Authorization: Bearer <key>`)
- Behavior: Internally routes to LiteLLM `/responses` flow and transforms output to Chat Completions
## Why this exists
When setting up Cursor with BYOK against a custom `base_url`, Cursor sends requests to the Chat Completions endpoint but in the OpenAI Responses API input shape. Without translation, Cursor won't display streamed output. This endpoint bridges the two formats:
- Input: Responses API (`input`, tool calls, etc.)
- Output: Chat Completions (`choices`, `delta`, `finish_reason`, etc.)
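To make the bridging concrete, here is a rough sketch of the two shapes involved (the field values are illustrative only):
```python
# What Cursor sends (Responses API input shape):
cursor_request = {
    "model": "gpt-4o",
    "input": [{"role": "user", "content": "Hello"}],
    "stream": True,
}

# What Cursor expects back for each streamed event (Chat Completions chunk shape):
chat_completion_chunk = {
    "id": "chatcmpl-123",
    "object": "chat.completion.chunk",
    "choices": [
        {"index": 0, "delta": {"content": "Hello!"}, "finish_reason": None}
    ],
}
```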
## Usage
### Non-streaming
```bash
curl -X POST http://localhost:4000/cursor/chat/completions \
  -H "Content-Type: application/json" \
  -H "Authorization: Bearer sk-1234" \
  -d '{
    "model": "gpt-4o",
    "input": [{"role": "user", "content": "Hello"}]
  }'
```
Example response (shape):
```json
{
  "id": "chatcmpl-123",
  "object": "chat.completion",
  "created": 1733333333,
  "model": "gpt-4o",
  "choices": [
    {
      "index": 0,
      "message": {
        "role": "assistant",
        "content": "Hello! How can I help you?"
      },
      "finish_reason": "stop"
    }
  ],
  "usage": {
    "prompt_tokens": 10,
    "completion_tokens": 8,
    "total_tokens": 18
  }
}
```
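The same call can be made programmatically; here is a minimal Python sketch using the `requests` package, with the proxy URL and key taken from the example above:
```python
import requests

# Non-streaming call to the Cursor endpoint; the body uses the Responses API `input` field.
resp = requests.post(
    "http://localhost:4000/cursor/chat/completions",
    headers={"Authorization": "Bearer sk-1234"},
    json={
        "model": "gpt-4o",
        "input": [{"role": "user", "content": "Hello"}],
    },
    timeout=30,
)
resp.raise_for_status()

completion = resp.json()
# The response body is standard Chat Completions JSON, so the reply text lives under choices[0].
print(completion["choices"][0]["message"]["content"])
```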
### Streaming
```bash
curl -N -X POST http://localhost:4000/cursor/chat/completions \
  -H "Content-Type: application/json" \
  -H "Authorization: Bearer sk-1234" \
  -d '{
    "model": "gpt-4o",
    "input": [{"role": "user", "content": "Hello"}],
    "stream": true
  }'
```
- Responses are streamed as Server-Sent Events (SSE)
- Each event is a `chat.completion.chunk` delta (`choices[].delta`); the stream ends with `data: [DONE]`
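Here is a minimal Python sketch of consuming this stream with `requests`; it only handles the `data:` lines and the `[DONE]` sentinel described above:
```python
import json
import requests

# Stream the response and decode each SSE "data:" line into a chat.completion.chunk.
with requests.post(
    "http://localhost:4000/cursor/chat/completions",
    headers={"Authorization": "Bearer sk-1234"},
    json={
        "model": "gpt-4o",
        "input": [{"role": "user", "content": "Hello"}],
        "stream": True,
    },
    stream=True,
    timeout=30,
) as resp:
    resp.raise_for_status()
    for line in resp.iter_lines(decode_unicode=True):
        if not line or not line.startswith("data: "):
            continue  # ignore blank keep-alive lines
        payload = line[len("data: "):]
        if payload == "[DONE]":
            break
        chunk = json.loads(payload)
        if not chunk.get("choices"):
            continue
        # Each chunk carries an incremental delta; print the text as it arrives.
        delta = chunk["choices"][0].get("delta", {})
        print(delta.get("content") or "", end="", flush=True)
print()
```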
## Configuration
No special configuration is required beyond your normal LiteLLM Proxy setup. Ensure that:
- Your `config.yaml` includes the models you want to call via this endpoint
- Your Cursor project uses your LiteLLM Proxy `base_url` and a valid API key
## Notes
- Only this page documents the Cursor endpoint. The native `/responses` docs remain unchanged.
- This endpoint is tailored specifically to Cursor's request/response expectations. Other clients should continue to use `/v1/chat/completions` or `/v1/responses` as appropriate.

View File

@@ -448,6 +448,7 @@ const sidebars = {
"realtime",
"rerank",
"response_api",
"proxy/cursor",
{
type: "category",
label: "/search",

View File

@@ -85,6 +85,150 @@ async def responses_api(
)
@router.post(
    "/cursor/chat/completions",
    dependencies=[Depends(user_api_key_auth)],
    tags=["responses"],
)
async def cursor_chat_completions(
    request: Request,
    fastapi_response: Response,
    user_api_key_dict: UserAPIKeyAuth = Depends(user_api_key_auth),
):
    """
    Cursor-specific endpoint that accepts Responses API input format but returns chat completions format.

    This endpoint handles requests from Cursor IDE which sends Responses API format (`input` field)
    but expects chat completions format response (`choices`, `messages`, etc.).

    ```bash
    curl -X POST http://localhost:4000/cursor/chat/completions \
      -H "Content-Type: application/json" \
      -H "Authorization: Bearer sk-1234" \
      -d '{
        "model": "gpt-4o",
        "input": [{"role": "user", "content": "Hello"}]
      }'
    ```

    Responds back in chat completions format.
    """
    from litellm.completion_extras.litellm_responses_transformation.handler import (
        responses_api_bridge,
    )
    from litellm.litellm_core_utils.streaming_handler import CustomStreamWrapper
    from litellm.proxy.proxy_server import (
        _read_request_body,
        async_data_generator,
        general_settings,
        llm_router,
        proxy_config,
        proxy_logging_obj,
        user_api_base,
        user_max_tokens,
        user_model,
        user_request_timeout,
        user_temperature,
        version,
    )
    from litellm.responses.streaming_iterator import BaseResponsesAPIStreamingIterator
    from litellm.types.llms.openai import ResponsesAPIResponse
    data = await _read_request_body(request=request)
    processor = ProxyBaseLLMRequestProcessing(data=data)

    def cursor_data_generator(response, user_api_key_dict, request_data):
        """
        Custom generator that transforms Responses API streaming chunks to chat completion chunks.

        This generator is used for the cursor endpoint to convert Responses API format responses
        to chat completion format that Cursor IDE expects.

        Args:
            response: The streaming response (BaseResponsesAPIStreamingIterator or other)
            user_api_key_dict: User API key authentication dict
            request_data: Request data containing model, logging_obj, etc.

        Returns:
            Async generator that yields SSE-formatted chat completion chunks
        """
        # If response is a BaseResponsesAPIStreamingIterator, transform it first
        if isinstance(response, BaseResponsesAPIStreamingIterator):
            # Transform Responses API iterator to chat completion iterator
            completion_stream = responses_api_bridge.transformation_handler.get_model_response_iterator(
                streaming_response=response,
                sync_stream=False,
                json_mode=False,
            )
            # Wrap in CustomStreamWrapper to get the async generator
            logging_obj = request_data.get("litellm_logging_obj")
            streamwrapper = CustomStreamWrapper(
                completion_stream=completion_stream,
                model=request_data.get("model", ""),
                custom_llm_provider=None,
                logging_obj=logging_obj,
            )
            # Use async_data_generator to format as SSE
            return async_data_generator(
                response=streamwrapper,
                user_api_key_dict=user_api_key_dict,
                request_data=request_data,
            )
        # Otherwise, use the default generator
        return async_data_generator(
            response=response,
            user_api_key_dict=user_api_key_dict,
            request_data=request_data,
        )
    try:
        response = await processor.base_process_llm_request(
            request=request,
            fastapi_response=fastapi_response,
            user_api_key_dict=user_api_key_dict,
            route_type="aresponses",
            proxy_logging_obj=proxy_logging_obj,
            llm_router=llm_router,
            general_settings=general_settings,
            proxy_config=proxy_config,
            select_data_generator=cursor_data_generator,
            model=None,
            user_model=user_model,
            user_temperature=user_temperature,
            user_request_timeout=user_request_timeout,
            user_max_tokens=user_max_tokens,
            user_api_base=user_api_base,
            version=version,
        )
        # Transform non-streaming Responses API response to chat completions format
        if isinstance(response, ResponsesAPIResponse):
            logging_obj = processor.data.get("litellm_logging_obj")
            transformed_response = responses_api_bridge.transformation_handler.transform_response(
                model=processor.data.get("model", ""),
                raw_response=response,
                model_response=None,
                logging_obj=logging_obj,
                request_data=processor.data,
                messages=processor.data.get("input", []),
                optional_params={},
                litellm_params={},
                encoding=None,
                api_key=None,
                json_mode=None,
            )
            return transformed_response
        # Streaming responses are already transformed by cursor_data_generator
        return response
    except Exception as e:
        raise await processor._handle_llm_api_exception(
            e=e,
            user_api_key_dict=user_api_key_dict,
            proxy_logging_obj=proxy_logging_obj,
            version=version,
        )
@router.get(
    "/v1/responses/{response_id}",
    dependencies=[Depends(user_api_key_auth)],

View File

@@ -51,3 +51,66 @@ class TestResponsesAPIEndpoints(unittest.TestCase):
        assert response.status_code in [200, 401, 500]

    @pytest.mark.asyncio
    @patch("litellm.proxy.proxy_server.llm_router")
    @patch("litellm.proxy.proxy_server.user_api_key_auth")
    async def test_cursor_chat_completions_route(self, mock_auth, mock_router):
        """
        Test that /cursor/chat/completions endpoint:
        1. Accepts Responses API input format
        2. Returns chat completions format response
        3. Transforms streaming responses correctly
        """
        from litellm.types.llms.openai import ResponsesAPIResponse
        from litellm.types.utils import ResponseOutputMessage, ResponseOutputText

        mock_auth.return_value = MagicMock(
            token="test_token",
            user_id="test_user",
            team_id=None,
        )
        # Mock a Responses API response
        mock_responses_response = ResponsesAPIResponse(
            id="resp_cursor123",
            created_at=1234567890,
            model="gpt-4o",
            object="response",
            output=[
                ResponseOutputMessage(
                    type="message",
                    role="assistant",
                    content=[
                        ResponseOutputText(type="output_text", text="Hello from Cursor!")
                    ],
                )
            ],
        )
        mock_router.aresponses = AsyncMock(return_value=mock_responses_response)

        client = TestClient(app)
        # Test with Responses API input format (what Cursor sends)
        test_data = {
            "model": "gpt-4o",
            "input": [{"role": "user", "content": "Hello"}],
        }
        response = client.post(
            "/cursor/chat/completions",
            json=test_data,
            headers={"Authorization": "Bearer sk-1234"},
        )
        # Should return 200 (or 401/500 if auth fails)
        assert response.status_code in [200, 401, 500]
        # If successful, verify it returns chat completions format
        if response.status_code == 200:
            response_data = response.json()
            # Should have chat completion structure
            assert "choices" in response_data or "id" in response_data
assert "output" not in response_data or "status" not in response_data