[Doc] ruff format some Python examples (#26767)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
@@ -11,8 +11,7 @@ The following code splits the model across 2 GPUs.
```python
from vllm import LLM

llm = LLM(model="ibm-granite/granite-3.1-8b-instruct",
          tensor_parallel_size=2)
llm = LLM(model="ibm-granite/granite-3.1-8b-instruct", tensor_parallel_size=2)
```

!!! warning
@@ -43,9 +42,7 @@ and the maximum batch size (`max_num_seqs` option).
```python
from vllm import LLM

llm = LLM(model="adept/fuyu-8b",
          max_model_len=2048,
          max_num_seqs=2)
llm = LLM(model="adept/fuyu-8b", max_model_len=2048, max_num_seqs=2)
```

## Reduce CUDA Graphs
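The body of this section is not included in the hunks shown here. Purely as a hedged illustration (the `compilation_config` argument and its `cudagraph_capture_sizes` field are assumptions on my part, not part of this diff), limiting how many batch sizes get captured looks roughly like:

```python
from vllm import LLM

# Hedged sketch: capture CUDA graphs only for a few small batch sizes to save
# capture time and memory; the exact config field name is an assumption.
llm = LLM(
    model="meta-llama/Llama-3.1-8B-Instruct",
    compilation_config={"cudagraph_capture_sizes": [1, 2, 4, 8]},
)
```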
@@ -78,8 +75,7 @@ You can disable graph capturing completely via the `enforce_eager` flag:
```python
from vllm import LLM

llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct",
          enforce_eager=True)
llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct", enforce_eager=True)
```

## Adjust cache size
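The body of this section is likewise elided by the hunks. Purely as a hedged, related illustration (borrowing the `mm_processor_cache_gb` knob that appears in a later hunk of this same diff), shrinking a cache to conserve memory looks like:

```python
from vllm import LLM

# Hedged sketch: a smaller multi-modal processor cache trades hit rate for memory.
llm = LLM(
    model="Qwen/Qwen2.5-VL-3B-Instruct",
    mm_processor_cache_gb=2,
)
```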
@@ -97,8 +93,10 @@ You can allow a smaller number of multi-modal items per prompt to reduce the mem
from vllm import LLM

# Accept up to 3 images and 1 video per prompt
llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct",
          limit_mm_per_prompt={"image": 3, "video": 1})
llm = LLM(
    model="Qwen/Qwen2.5-VL-3B-Instruct",
    limit_mm_per_prompt={"image": 3, "video": 1},
)
```

You can go a step further and disable unused modalities completely by setting its limit to zero.
@@ -108,8 +106,10 @@ For example, if your application only accepts image input, there is no need to a
from vllm import LLM

# Accept any number of images but no videos
llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct",
          limit_mm_per_prompt={"video": 0})
llm = LLM(
    model="Qwen/Qwen2.5-VL-3B-Instruct",
    limit_mm_per_prompt={"video": 0},
)
```

You can even run a multi-modal model for text-only inference:
@@ -118,8 +118,10 @@ You can even run a multi-modal model for text-only inference:
from vllm import LLM

# Don't accept images. Just text.
llm = LLM(model="google/gemma-3-27b-it",
          limit_mm_per_prompt={"image": 0})
llm = LLM(
    model="google/gemma-3-27b-it",
    limit_mm_per_prompt={"image": 0},
)
```

### Configurable options
@@ -173,14 +175,14 @@ Here are some examples:
from vllm import LLM

# Available for Qwen2-VL series models
llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct",
          mm_processor_kwargs={
              "max_pixels": 768 * 768,  # Default is 1280 * 28 * 28
          })
llm = LLM(
    model="Qwen/Qwen2.5-VL-3B-Instruct",
    mm_processor_kwargs={"max_pixels": 768 * 768},  # Default is 1280 * 28 * 28
)

# Available for InternVL series models
llm = LLM(model="OpenGVLab/InternVL2-2B",
          mm_processor_kwargs={
              "max_dynamic_patch": 4,  # Default is 12
          })
llm = LLM(
    model="OpenGVLab/InternVL2-2B",
    mm_processor_kwargs={"max_dynamic_patch": 4},  # Default is 12
)
```
@@ -100,7 +100,7 @@ from vllm import LLM
llm = LLM(
    model="meta-llama/Llama-3.3-70B-Instruct",
    tensor_parallel_size=4,
    pipeline_parallel_size=2
    pipeline_parallel_size=2,
)
```
@@ -257,18 +257,24 @@ Examples:

```python
# Use a larger cache
llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct",
          mm_processor_cache_gb=8)
llm = LLM(
    model="Qwen/Qwen2.5-VL-3B-Instruct",
    mm_processor_cache_gb=8,
)

# Use a shared-memory based IPC cache
llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct",
          tensor_parallel_size=2,
          mm_processor_cache_type="shm",
          mm_processor_cache_gb=8)
llm = LLM(
    model="Qwen/Qwen2.5-VL-3B-Instruct",
    tensor_parallel_size=2,
    mm_processor_cache_type="shm",
    mm_processor_cache_gb=8,
)

# Disable the cache
llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct",
          mm_processor_cache_gb=0)
llm = LLM(
    model="Qwen/Qwen2.5-VL-3B-Instruct",
    mm_processor_cache_gb=0,
)
```

### Cache Placement
@@ -73,8 +73,8 @@ def forward(
    self,
    input_ids: torch.Tensor,
    positions: torch.Tensor,
    intermediate_tensors: Optional[IntermediateTensors] = None,
    inputs_embeds: Optional[torch.Tensor] = None,
    intermediate_tensors: IntermediateTensors | None = None,
    inputs_embeds: torch.Tensor | None = None,
) -> torch.Tensor:
    ...
```

@@ -16,7 +16,7 @@ Further update the model as follows:
    ...

    @classmethod
    def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
        if modality.startswith("image"):
            return "<image>"

@@ -45,14 +45,14 @@ Further update the model as follows:
    ...

    def _process_image_input(self, image_input: YourModelImageInputs) -> torch.Tensor:

        assert self.vision_encoder is not None
        image_features = self.vision_encoder(image_input)
        return self.multi_modal_projector(image_features)

    def get_multimodal_embeddings(
            self, **kwargs: object) -> Optional[MultiModalEmbeddings]:

        self,
        **kwargs: object,
    ) -> MultiModalEmbeddings | None:
        # Validate the multimodal input keyword arguments
        image_input = self._parse_and_validate_image_input(**kwargs)
        if image_input is None:
@@ -110,7 +110,7 @@ to return the maximum number of input items for each modality supported by the m
For example, if the model supports any number of images but only one video per prompt:

```python
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
def get_supported_mm_limits(self) -> Mapping[str, int | None]:
    return {"image": None, "video": 1}
```

@@ -258,7 +258,7 @@ Assuming that the memory usage increases with the number of tokens, the dummy in
    self,
    seq_len: int,
    mm_counts: Mapping[str, int],
    mm_options: Optional[Mapping[str, BaseDummyOptions]] = None,
    mm_options: Mapping[str, BaseDummyOptions] | None = None,
) -> MultiModalDataDict:
    num_images = mm_counts.get("image", 0)

@@ -421,8 +421,10 @@ Assuming that the memory usage increases with the number of tokens, the dummy in
```python
def get_image_size_with_most_features(self) -> ImageSize:
    image_processor = self.get_image_processor()
    return ImageSize(width=image_processor.size["width"],
                     height=image_processor.size["height"])
    return ImageSize(
        width=image_processor.size["width"],
        height=image_processor.size["height"],
    )
```

Fuyu does not expect image placeholders in the inputs to HF processor, so
@@ -452,10 +454,12 @@ Assuming that the memory usage increases with the number of tokens, the dummy in

    return {
        "image":
        self._get_dummy_images(width=target_width,
                               height=target_height,
                               num_images=num_images,
                               overrides=image_overrides)
        self._get_dummy_images(
            width=target_width,
            height=target_height,
            num_images=num_images,
            overrides=image_overrides,
        )
    }
```
@@ -744,8 +748,7 @@ Each [PromptUpdate][vllm.multimodal.processing.PromptUpdate] instance specifies
        image_width=image_size.width,
        image_height=image_size.height,
    )
    image_tokens = ([_IMAGE_TOKEN_ID] * ncols +
                    [_NEWLINE_TOKEN_ID]) * nrows
    image_tokens = ([_IMAGE_TOKEN_ID] * ncols + [_NEWLINE_TOKEN_ID]) * nrows

    return PromptUpdateDetails.select_token_id(
        image_tokens + [bos_token_id],
@@ -781,8 +784,7 @@ Each [PromptUpdate][vllm.multimodal.processing.PromptUpdate] instance specifies
        image_width=image_size.width,
        image_height=image_size.height,
    )
    image_tokens = ([_IMAGE_TOKEN_ID] * ncols +
                    [_NEWLINE_TOKEN_ID]) * nrows
    image_tokens = ([_IMAGE_TOKEN_ID] * ncols + [_NEWLINE_TOKEN_ID]) * nrows

    return PromptUpdateDetails.select_token_id(
        image_tokens + [bos_token_id],
@@ -810,9 +812,11 @@ to register them to the multi-modal registry:
  from vllm.model_executor.models.interfaces import SupportsMultiModal
+ from vllm.multimodal import MULTIMODAL_REGISTRY

+ @MULTIMODAL_REGISTRY.register_processor(YourMultiModalProcessor,
+                                         info=YourProcessingInfo,
+                                         dummy_inputs=YourDummyInputsBuilder)
+ @MULTIMODAL_REGISTRY.register_processor(
+     YourMultiModalProcessor,
+     info=YourProcessingInfo,
+     dummy_inputs=YourDummyInputsBuilder,
+ )
  class YourModelForImage2Seq(nn.Module, SupportsMultiModal):
```

@@ -42,7 +42,7 @@ def register():

    ModelRegistry.register_model(
        "YourModelForCausalLM",
        "your_code:YourModelForCausalLM"
        "your_code:YourModelForCausalLM",
    )
```
@@ -15,6 +15,7 @@ Declare supported languages and capabilities:
- Set `supports_transcription_only=True` if the model should not serve text generation (eg Whisper).

??? code "supported_languages and supports_transcription_only"

    ```python
    from typing import ClassVar, Mapping, Literal
    import numpy as np
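    # Hedged illustration (not part of this hunk): declaring the class-level
    # capabilities described above. The exact types and the elided imports
    # (`nn`, `SupportsTranscription`) are assumptions.
    class YourASRModel(nn.Module, SupportsTranscription):
        # ISO 639-1 code -> display name for each supported language.
        supported_languages: ClassVar[Mapping[str, str]] = {"en": "English", "it": "Italian"}
        # Serve only transcription endpoints, no plain text generation (like Whisper).
        supports_transcription_only: ClassVar[bool] = True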
@@ -43,6 +44,7 @@ Provide an ASR configuration via [get_speech_to_text_config][vllm.model_executor
This is for controlling general behavior of the API when serving your model:

??? code "get_speech_to_text_config()"

    ```python
    class YourASRModel(nn.Module, SupportsTranscription):
        ...
@@ -71,6 +73,7 @@ Implement the prompt construction via [get_generation_prompt][vllm.model_executo
Return a dict containing `multi_modal_data` with the audio, and either a `prompt` string or `prompt_token_ids`:

??? code "get_generation_prompt()"

    ```python
    class YourASRModel(nn.Module, SupportsTranscription):
        ...
@@ -107,6 +110,7 @@ Return a dict containing `multi_modal_data` with the audio, and either a `prompt
Return a dict with separate `encoder_prompt` and `decoder_prompt` entries:

??? code "get_generation_prompt()"

    ```python
    class YourASRModel(nn.Module, SupportsTranscription):
        ...
@@ -148,12 +152,16 @@ Language validation via [validate_language][vllm.model_executor.models.interface
If your model requires a language and you want a default, override this method (see Whisper):

??? code "validate_language()"

    ```python
    @classmethod
    def validate_language(cls, language: str | None) -> str | None:
        if language is None:
            logger.warning(
                "Defaulting to language='en'. If you wish to transcribe audio in a different language, pass the `language` field.")
                "Defaulting to language='en'. If you wish to transcribe "
                "audio in a different language, pass the `language` field "
                "in the TranscriptionRequest."
            )
            language = "en"
        return super().validate_language(language)
    ```
@@ -165,6 +173,7 @@ Token accounting for streaming via [get_num_audio_tokens][vllm.model_executor.mo
Provide a fast duration→token estimate to improve streaming usage statistics:

??? code "get_num_audio_tokens()"

    ```python
    class YourASRModel(nn.Module, SupportsTranscription):
        ...
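        # Hedged sketch (not part of this hunk; the signature is an assumption):
        @classmethod
        def get_num_audio_tokens(cls, audio_duration_s, stt_config, model_config):
            # A cheap linear duration -> token estimate is enough for streaming
            # usage statistics; tune the rate to your audio encoder.
            tokens_per_second = 25
            return int(audio_duration_s * tokens_per_second)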
@@ -191,6 +200,7 @@ The API server takes care of basic audio I/O and optional chunking before buildi
Relevant server logic:

??? code "_preprocess_speech_to_text()"

    ```python
    # vllm/entrypoints/openai/speech_to_text.py
    async def _preprocess_speech_to_text(...):

@@ -63,7 +63,7 @@ If successful, you should be returned a CURL command that you can call inference

??? console "Command"

    ```python
    ```bash
    curl -X POST https://api.cortex.cerebrium.ai/v4/p-xxxxxx/vllm/run \
        -H 'Content-Type: application/json' \
        -H 'Authorization: <JWT TOKEN>' \
@@ -81,7 +81,7 @@ You should get a response like:

??? console "Response"

    ```python
    ```json
    {
        "run_id": "52911756-3066-9ae8-bcc9-d9129d1bd262",
        "result": {

@@ -83,7 +83,7 @@ After the provisioning, you can interact with the model by using the OpenAI SDK:

client = OpenAI(
    base_url="https://gateway.<gateway domain>",
    api_key="<YOUR-DSTACK-SERVER-ACCESS-TOKEN>"
    api_key="<YOUR-DSTACK-SERVER-ACCESS-TOKEN>",
)

completion = client.chat.completions.create(
@@ -93,7 +93,7 @@ After the provisioning, you can interact with the model by using the OpenAI SDK:
            "role": "user",
            "content": "Compose a poem that explains the concept of recursion in programming.",
        }
    ]
    ],
)

print(completion.choices[0].message.content)
@@ -34,7 +34,7 @@ pip install vllm haystack-ai
    api_key=Secret.from_token("VLLM-PLACEHOLDER-API-KEY"),
    model="mistralai/Mistral-7B-Instruct-v0.1",
    api_base_url="http://{your-vLLM-host-ip}:{your-vLLM-host-port}/v1",
    generation_kwargs = {"max_tokens": 512}
    generation_kwargs={"max_tokens": 512},
)

response = generator.run(

@@ -32,28 +32,28 @@ This is the easiest way to get started with vLLM on Hugging Face Inference Endpo
import os

client = OpenAI(
    base_url = DEPLOYMENT_URL,
    api_key = os.environ["HF_TOKEN"] # https://huggingface.co/settings/tokens
    base_url=DEPLOYMENT_URL,
    api_key=os.environ["HF_TOKEN"],  # https://huggingface.co/settings/tokens
)

chat_completion = client.chat.completions.create(
    model = "HuggingFaceTB/SmolLM3-3B",
    messages = [
    model="HuggingFaceTB/SmolLM3-3B",
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "Give me a brief explanation of gravity in simple terms."
                    "text": "Give me a brief explanation of gravity in simple terms.",
                }
            ]
            ],
        }
    ],
    stream = True
    stream=True,
)

for message in chat_completion:
    print(message.choices[0].delta.content, end = "")
    print(message.choices[0].delta.content, end="")
```

!!! note
@@ -86,34 +86,34 @@ This method applies to models with the [`transformers` library tag](https://hugg
import os

client = OpenAI(
    base_url = DEPLOYMENT_URL,
    api_key = os.environ["HF_TOKEN"] # https://huggingface.co/settings/tokens
    base_url=DEPLOYMENT_URL,
    api_key=os.environ["HF_TOKEN"],  # https://huggingface.co/settings/tokens
)

chat_completion = client.chat.completions.create(
    model = "ibm-granite/granite-docling-258M",
    messages = [
    model="ibm-granite/granite-docling-258M",
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "image_url",
                    "image_url": {
                        "url": "https://huggingface.co/ibm-granite/granite-docling-258M/resolve/main/assets/new_arxiv.png"
                    }
                        "url": "https://huggingface.co/ibm-granite/granite-docling-258M/resolve/main/assets/new_arxiv.png",
                    },
                },
                {
                    "type": "text",
                    "text": "Convert this page to docling."
                }
                    "text": "Convert this page to docling.",
                },
            ]
        }
    ],
    stream = True
    stream=True,
)

for message in chat_completion:
    print(message.choices[0].delta.content, end = "")
    print(message.choices[0].delta.content, end="")
```

!!! note
@@ -36,15 +36,16 @@ pip install vllm litellm
```python
import litellm

messages = [{ "content": "Hello, how are you?","role": "user"}]
messages = [{"content": "Hello, how are you?", "role": "user"}]

# hosted_vllm is prefix key word and necessary
response = litellm.completion(
    model="hosted_vllm/qwen/Qwen1.5-0.5B-Chat", # pass the vllm model name
    messages=messages,
    api_base="http://{your-vllm-server-host}:{your-vllm-server-port}/v1",
    temperature=0.2,
    max_tokens=80)
    model="hosted_vllm/qwen/Qwen1.5-0.5B-Chat",  # pass the vllm model name
    messages=messages,
    api_base="http://{your-vllm-server-host}:{your-vllm-server-port}/v1",
    temperature=0.2,
    max_tokens=80,
)

print(response)
```

@@ -40,7 +40,7 @@ pip install -U vllm \

1. Run the script

    ```python
    ```bash
    python retrieval_augmented_generation_with_langchain.py
    ```

@@ -78,6 +78,6 @@ pip install vllm \

1. Run the script:

    ```python
    ```bash
    python retrieval_augmented_generation_with_llamaindex.py
    ```

@@ -106,9 +106,11 @@ The dispatch code looks like:
batch_descriptor=BatchDescriptor(num_tokens=num_input_tokens, uniform_decode=...)
runtime_mode, batch_descriptor = cudagraphdispatcher.dispatch(batch_descriptor)
# execution
with set_forward_context(...,
                         cudagraph_runtime_mode=runtime_mode,
                         batch_descriptor=batch_descriptor):
with set_forward_context(
    ...,
    cudagraph_runtime_mode=runtime_mode,
    batch_descriptor=batch_descriptor,
):
    output = self.model(...)
```
@@ -202,10 +204,10 @@ from vllm.config import CUDAGraphMode

compilation_config = {"level": 3, "cudagraph_mode": "FULL_AND_PIECEWISE"}
model = vllm.LLM(
    model="meta-llama/Llama-3.1-8B-Instruct",
    dtype='auto',
    compilation_config = compilation_config,
)
    model="meta-llama/Llama-3.1-8B-Instruct",
    dtype="auto",
    compilation_config=compilation_config,
)
sampling_params = vllm.SamplingParams(
    temperature=0, # greedy decoding
    max_tokens=1024,

@@ -9,8 +9,8 @@ When performing an inference with IO Processor plugins, the prompt type is defin
IO Processor plugins implement the `IOProcessor` interface (<gh-file:vllm/plugins/io_processors/interface.py>):

```python
IOProcessorInput = TypeVar('IOProcessorInput')
IOProcessorOutput = TypeVar('IOProcessorOutput')
IOProcessorInput = TypeVar("IOProcessorInput")
IOProcessorOutput = TypeVar("IOProcessorOutput")

class IOProcessor(ABC, Generic[IOProcessorInput, IOProcessorOutput]):
@@ -21,30 +21,32 @@ class IOProcessor(ABC, Generic[IOProcessorInput, IOProcessorOutput]):
    def pre_process(
        self,
        prompt: IOProcessorInput,
        request_id: Optional[str] = None,
        request_id: str | None = None,
        **kwargs,
    ) -> Union[PromptType, Sequence[PromptType]]:
    ) -> PromptType | Sequence[PromptType]:
        raise NotImplementedError

    async def pre_process_async(
        self,
        prompt: IOProcessorInput,
        request_id: Optional[str] = None,
        request_id: str | None = None,
        **kwargs,
    ) -> Union[PromptType, Sequence[PromptType]]:
    ) -> PromptType | Sequence[PromptType]:
        return self.pre_process(prompt, request_id, **kwargs)

    @abstractmethod
    def post_process(self,
                     model_output: Sequence[PoolingRequestOutput],
                     request_id: Optional[str] = None,
                     **kwargs) -> IOProcessorOutput:
    def post_process(
        self,
        model_output: Sequence[PoolingRequestOutput],
        request_id: str | None = None,
        **kwargs,
    ) -> IOProcessorOutput:
        raise NotImplementedError

    async def post_process_async(
        self,
        model_output: AsyncGenerator[tuple[int, PoolingRequestOutput]],
        request_id: Optional[str] = None,
        request_id: str | None = None,
        **kwargs,
    ) -> IOProcessorOutput:
        collected_output = [item async for i, item in model_output]
@@ -56,7 +58,8 @@ class IOProcessor(ABC, Generic[IOProcessorInput, IOProcessorOutput]):

    @abstractmethod
    def output_to_response(
            self, plugin_output: IOProcessorOutput) -> IOProcessorResponse:
        self, plugin_output: IOProcessorOutput
    ) -> IOProcessorResponse:
        raise NotImplementedError
```
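As a hedged, hypothetical sketch of how a concrete plugin might fill in this interface (the prompt/output types, the pooled-output handling, and the `IOProcessorResponse` construction below are illustrative assumptions, not APIs confirmed by this diff):

```python
from collections.abc import Sequence


class MyEmbeddingIOProcessor(IOProcessor[dict, dict]):
    """Toy plugin: accepts {"text": ...} and returns pooled embeddings as lists."""

    def pre_process(
        self,
        prompt: dict,
        request_id: str | None = None,
        **kwargs,
    ) -> PromptType | Sequence[PromptType]:
        # Turn the plugin-specific input into a plain vLLM prompt.
        return {"prompt": prompt["text"]}

    def post_process(
        self,
        model_output: Sequence[PoolingRequestOutput],
        request_id: str | None = None,
        **kwargs,
    ) -> dict:
        # Collapse the pooling outputs into a JSON-friendly structure.
        return {"embeddings": [out.outputs.data.tolist() for out in model_output]}

    def output_to_response(self, plugin_output: dict) -> IOProcessorResponse:
        # The response field name here is an assumption.
        return IOProcessorResponse(data=plugin_output)
```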
@@ -478,15 +478,17 @@ us with:

```python
if seq_group.is_finished():
    if (seq_group.metrics.first_scheduled_time is not None and
            seq_group.metrics.first_token_time is not None):
    if (
        seq_group.metrics.first_scheduled_time is not None
        and seq_group.metrics.first_token_time is not None
    ):
        time_queue_requests.append(
            seq_group.metrics.first_scheduled_time -
            seq_group.metrics.arrival_time)
            seq_group.metrics.arrival_time
        )
    ...
    if seq_group.metrics.time_in_queue is not None:
        time_in_queue_requests.append(
            seq_group.metrics.time_in_queue)
        time_in_queue_requests.append(seq_group.metrics.time_in_queue)
```

This seems duplicative, and one of them should be removed. The latter

@@ -112,8 +112,8 @@ class KVCacheBlock:
    ref_cnt: int

    # The pointers to form a doubly linked list for the free queue.
    prev_free_block: Optional["KVCacheBlock"] = None
    next_free_block: Optional["KVCacheBlock"] = None
    prev_free_block: "KVCacheBlock | None" = None
    next_free_block: "KVCacheBlock | None" = None
```
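As a hedged illustration of what those pointers buy (not taken from the diff): a block can be unlinked from anywhere in the free queue in O(1), which matters when a cached block is reused out of order:

```python
def unlink_from_free_queue(block: "KVCacheBlock") -> None:
    # Splice the block out of the doubly linked free list in O(1),
    # then clear its pointers so it can be safely re-linked later.
    if block.prev_free_block is not None:
        block.prev_free_block.next_free_block = block.next_free_block
    if block.next_free_block is not None:
        block.next_free_block.prev_free_block = block.prev_free_block
    block.prev_free_block = None
    block.next_free_block = None
```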
There are two design points to highlight:

@@ -32,7 +32,7 @@ the third parameter is the path to the LoRA adapter.
sampling_params = SamplingParams(
    temperature=0,
    max_tokens=256,
    stop=["[/assistant]"]
    stop=["[/assistant]"],
)

prompts = [
@@ -43,7 +43,7 @@ the third parameter is the path to the LoRA adapter.
outputs = llm.generate(
    prompts,
    sampling_params,
    lora_request=LoRARequest("sql_adapter", 1, sql_lora_path)
    lora_request=LoRARequest("sql_adapter", 1, sql_lora_path),
)
```

@@ -197,7 +197,7 @@ Alternatively, follow these example steps to implement your own plugin:
    lora_request = LoRARequest(
        lora_name=lora_name,
        lora_path=local_path,
        lora_int_id=abs(hash(lora_name))
        lora_int_id=abs(hash(lora_name)),
    )
    return lora_request
```
@@ -296,10 +296,7 @@ To this end, we allow registration of default multimodal LoRAs to handle this au
    if has_audio:
        question = f"<|audio|>{question}"
    chat = [
        {
            "role": "user",
            "content": question
        }
        {"role": "user", "content": question},
    ]
    return tokenizer.apply_chat_template(chat, tokenize=False)
@@ -154,9 +154,7 @@ To substitute multiple images inside the same text prompt, you can pass in a lis

outputs = llm.generate({
    "prompt": prompt,
    "multi_modal_data": {
        "image": [image1, image2]
    },
    "multi_modal_data": {"image": [image1, image2]},
})

for o in outputs:
@@ -183,21 +181,24 @@ conversation = [
    {"role": "assistant", "content": "Hello! How can I assist you today?"},
    {
        "role": "user",
        "content": [{
            "type": "image_url",
            "image_url": {
                "url": image_url
            }
        },{
            "type": "image_pil",
            "image_pil": image_pil
        }, {
            "type": "image_embeds",
            "image_embeds": image_embeds
        }, {
            "type": "text",
            "text": "What's in these images?"
        }],
        "content": [
            {
                "type": "image_url",
                "image_url": {"url": image_url},
            },
            {
                "type": "image_pil",
                "image_pil": image_pil,
            },
            {
                "type": "image_embeds",
                "image_embeds": image_embeds,
            },
            {
                "type": "text",
                "text": "What's in these images?",
            },
        ],
    },
]
@@ -224,7 +225,10 @@ Multi-image input can be extended to perform video captioning. We show this with
message = {
    "role": "user",
    "content": [
        {"type": "text", "text": "Describe this set of frames. Consider the frames to be a part of the same video."},
        {
            "type": "text",
            "text": "Describe this set of frames. Consider the frames to be a part of the same video.",
        },
    ],
}
for i in range(len(video_frames)):
@@ -255,13 +259,13 @@ When loading RGBA images (images with transparency), vLLM converts them to RGB f
# Custom black background for dark theme
llm = LLM(
    model="llava-hf/llava-1.5-7b-hf",
    media_io_kwargs={"image": {"rgba_background_color": [0, 0, 0]}}
    media_io_kwargs={"image": {"rgba_background_color": [0, 0, 0]}},
)

# Custom brand color background (e.g., blue)
llm = LLM(
    model="llava-hf/llava-1.5-7b-hf",
    media_io_kwargs={"image": {"rgba_background_color": [0, 0, 255]}}
    media_io_kwargs={"image": {"rgba_background_color": [0, 0, 255]}},
)
```

@@ -294,20 +298,23 @@ Instead of NumPy arrays, you can also pass `'torch.Tensor'` instances, as shown 
    limit_mm_per_prompt={"video": 1},
)

sampling_params = SamplingParams(
    max_tokens=1024,
)
sampling_params = SamplingParams(max_tokens=1024)

video_messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": [
    {
        "role": "system",
        "content": "You are a helpful assistant.",
    },
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "describe this video."},
            {
                "type": "video",
                "video": video_path,
                "total_pixels": 20480 * 28 * 28,
                "min_pixels": 16 * 28 * 28
            }
                "min_pixels": 16 * 28 * 28,
            },
        ]
    },
]
@@ -465,21 +472,24 @@ Then, you can use the OpenAI client as follows:

chat_response = client.chat.completions.create(
    model="microsoft/Phi-3.5-vision-instruct",
    messages=[{
        "role": "user",
        "content": [
            # NOTE: The prompt formatting with the image token `<image>` is not needed
            # since the prompt will be processed automatically by the API server.
            {"type": "text", "text": "What’s in this image?"},
            {
                "type": "image_url",
                "image_url": {
                    "url": image_url
    messages=[
        {
            "role": "user",
            "content": [
                # NOTE: The prompt formatting with the image token `<image>` is not needed
                # since the prompt will be processed automatically by the API server.
                {
                    "type": "text",
                    "text": "What’s in this image?",
                },
                "uuid": image_url # Optional
            },
        ],
    }],
                {
                    "type": "image_url",
                    "image_url": {"url": image_url},
                    "uuid": image_url, # Optional
                },
            ],
        }
    ],
)
print("Chat completion output:", chat_response.choices[0].message.content)

@@ -489,26 +499,27 @@ Then, you can use the OpenAI client as follows:

chat_response = client.chat.completions.create(
    model="microsoft/Phi-3.5-vision-instruct",
    messages=[{
        "role": "user",
        "content": [
            {"type": "text", "text": "What are the animals in these images?"},
            {
                "type": "image_url",
                "image_url": {
                    "url": image_url_duck
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "What are the animals in these images?",
                },
                "uuid": image_url_duck # Optional
            },
            {
                "type": "image_url",
                "image_url": {
                    "url": image_url_lion
                {
                    "type": "image_url",
                    "image_url": {"url": image_url_duck},
                    "uuid": image_url_duck, # Optional
                },
                "uuid": image_url_lion # Optional
            },
        ],
    }],
                {
                    "type": "image_url",
                    "image_url": {"url": image_url_lion},
                    "uuid": image_url_lion, # Optional
                },
            ],
        }
    ],
)
print("Chat completion output:", chat_response.choices[0].message.content)
```
@@ -560,23 +571,22 @@ Then, you can use the OpenAI client as follows:

## Use video url in the payload
chat_completion_from_url = client.chat.completions.create(
    messages=[{
        "role":
        "user",
        "content": [
            {
                "type": "text",
                "text": "What's in this video?"
            },
            {
                "type": "video_url",
                "video_url": {
                    "url": video_url
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "What's in this video?",
                },
                "uuid": video_url # Optional
            },
        ],
    }],
                {
                    "type": "video_url",
                    "video_url": {"url": video_url},
                    "uuid": video_url, # Optional
                },
            ],
        }
    ],
    model=model,
    max_completion_tokens=64,
)
@@ -652,23 +662,25 @@ Then, you can use the OpenAI client as follows:
audio_base64 = encode_base64_content_from_url(audio_url)

chat_completion_from_base64 = client.chat.completions.create(
    messages=[{
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": "What's in this audio?"
            },
            {
                "type": "input_audio",
                "input_audio": {
                    "data": audio_base64,
                    "format": "wav"
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "What's in this audio?",
                },
                "uuid": audio_url # Optional
            },
        ],
    }],
                {
                    "type": "input_audio",
                    "input_audio": {
                        "data": audio_base64,
                        "format": "wav",
                    },
                    "uuid": audio_url, # Optional
                },
            ],
        },
    ],
    model=model,
    max_completion_tokens=64,
)
@@ -683,22 +695,22 @@ Alternatively, you can pass `audio_url`, which is the audio counterpart of `imag

```python
chat_completion_from_url = client.chat.completions.create(
    messages=[{
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": "What's in this audio?"
            },
            {
                "type": "audio_url",
                "audio_url": {
                    "url": audio_url
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "What's in this audio?",
                },
                "uuid": audio_url # Optional
            },
        ],
    }],
                {
                    "type": "audio_url",
                    "audio_url": {"url": audio_url},
                    "uuid": audio_url, # Optional
                },
            ],
        }
    ],
    model=model,
    max_completion_tokens=64,
)
@@ -747,43 +759,48 @@ The following example demonstrates how to pass image embeddings to the OpenAI se

# Basic usage - this is equivalent to the LLaVA example for offline inference
model = "llava-hf/llava-1.5-7b-hf"
embeds = {
embeds = {
    "type": "image_embeds",
    "image_embeds": f"{base64_image_embedding}",
    "uuid": image_url # Optional
    "uuid": image_url, # Optional
}

# Pass additional parameters (available to Qwen2-VL and MiniCPM-V)
model = "Qwen/Qwen2-VL-2B-Instruct"
embeds = {
embeds = {
    "type": "image_embeds",
    "image_embeds": {
        "image_embeds": f"{base64_image_embedding}" , # Required
        "image_grid_thw": f"{base64_image_grid_thw}" # Required by Qwen/Qwen2-VL-2B-Instruct
        "image_embeds": f"{base64_image_embedding}", # Required
        "image_grid_thw": f"{base64_image_grid_thw}", # Required by Qwen/Qwen2-VL-2B-Instruct
    },
    "uuid": image_url # Optional
    "uuid": image_url, # Optional
}
model = "openbmb/MiniCPM-V-2_6"
embeds = {
embeds = {
    "type": "image_embeds",
    "image_embeds": {
        "image_embeds": f"{base64_image_embedding}" , # Required
        "image_sizes": f"{base64_image_sizes}" # Required by openbmb/MiniCPM-V-2_6
        "image_embeds": f"{base64_image_embedding}", # Required
        "image_sizes": f"{base64_image_sizes}", # Required by openbmb/MiniCPM-V-2_6
    },
    "uuid": image_url # Optional
    "uuid": image_url, # Optional
}
chat_completion = client.chat.completions.create(
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": [
            {
                "type": "text",
                "text": "What's in this image?",
        {
            "role": "system",
            "content": "You are a helpful assistant.",
        },
            embeds,
        ],
    },
],
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "What's in this image?",
                },
                embeds,
            ],
        },
    ],
    model=model,
)
```
@@ -802,22 +819,22 @@ For Online Serving, you can also skip sending media if you expect cache hits wit
{
    "type": "image_embeds",
    "image_embeds": None,
    "uuid": image_uuid
    "uuid": image_uuid,
},

# input_audio:
{
    "type": "input_audio",
    "input_audio": None,
    "uuid": audio_uuid
    "uuid": audio_uuid,
},

# PIL Image:
{
    "type": "image_pil",
    "image_pil": None
    "uuid": image_uuid
}
    "image_pil": None,
    "uuid": image_uuid,
},

```
@@ -117,9 +117,11 @@ OpenAI Python client library does not officially support `reasoning_content` att
# For granite, add: `extra_body={"chat_template_kwargs": {"thinking": True}}`
# For Qwen3 series, if you want to disable thinking in reasoning mode, add:
# extra_body={"chat_template_kwargs": {"enable_thinking": False}}
stream = client.chat.completions.create(model=model,
                                         messages=messages,
                                         stream=True)
stream = client.chat.completions.create(
    model=model,
    messages=messages,
    stream=True,
)

print("client: Start streaming chat completions...")
printed_reasoning_content = False
@@ -159,27 +161,29 @@ The reasoning content is also available when both tool calling and the reasoning

client = OpenAI(base_url="http://localhost:8000/v1", api_key="dummy")

tools = [{
    "type": "function",
    "function": {
        "name": "get_weather",
        "description": "Get the current weather in a given location",
        "parameters": {
            "type": "object",
            "properties": {
                "location": {"type": "string", "description": "City and state, e.g., 'San Francisco, CA'"},
                "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]}
            },
            "required": ["location", "unit"]
        }
tools = [
    {
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Get the current weather in a given location",
            "parameters": {
                "type": "object",
                "properties": {
                    "location": {"type": "string", "description": "City and state, e.g., 'San Francisco, CA'"},
                    "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
                },
                "required": ["location", "unit"],
            }
        },
    }
}]
]

response = client.chat.completions.create(
    model=client.models.list().data[0].id,
    messages=[{"role": "user", "content": "What's the weather like in San Francisco?"}],
    tools=tools,
    tool_choice="auto"
    tool_choice="auto",
)

print(response)
@@ -225,7 +229,7 @@ You can add a new `ReasoningParser` similar to <gh-file:vllm/reasoning/deepseek_
        previous_token_ids: Sequence[int],
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
    ) -> Union[DeltaMessage, None]:
    ) -> DeltaMessage | None:
        """
        Instance method that should be implemented for extracting reasoning
        from an incomplete response; for use when handling reasoning calls and
@@ -235,8 +239,10 @@ You can add a new `ReasoningParser` similar to <gh-file:vllm/reasoning/deepseek_
        """

    def extract_reasoning_content(
            self, model_output: str, request: ChatCompletionRequest
    ) -> tuple[Optional[str], Optional[str]]:
        self,
        model_output: str,
        request: ChatCompletionRequest | ResponsesRequest,
    ) -> tuple[str | None, str | None]:
        """
        Extract reasoning content from a complete model-generated string.

@@ -274,10 +280,10 @@ Additionally, to enable structured output, you'll need to create a new `Reasoner

    @classmethod
    def from_tokenizer(cls, tokenizer: PreTrainedTokenizer) -> Reasoner:
        return cls(start_token_id=tokenizer.encode(
            "<think>", add_special_tokens=False)[0],
                   end_token_id=tokenizer.encode("</think>",
                                                 add_special_tokens=False)[0])
        return cls(
            start_token_id=tokenizer.encode("<think>", add_special_tokens=False)[0],
            end_token_id=tokenizer.encode("</think>", add_special_tokens=False)[0],
        )

    def is_reasoning_end(self, input_ids: list[int]) -> bool:
        return self.end_token_id in input_ids
@@ -27,27 +27,29 @@ Next, make a request that triggers the model to use the available tools:
    return f"Getting the weather for {location} in {unit}..."
tool_functions = {"get_weather": get_weather}

tools = [{
    "type": "function",
    "function": {
        "name": "get_weather",
        "description": "Get the current weather in a given location",
        "parameters": {
            "type": "object",
            "properties": {
                "location": {"type": "string", "description": "City and state, e.g., 'San Francisco, CA'"},
                "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]}
tools = [
    {
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Get the current weather in a given location",
            "parameters": {
                "type": "object",
                "properties": {
                    "location": {"type": "string", "description": "City and state, e.g., 'San Francisco, CA'"},
                    "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]}
                },
                "required": ["location", "unit"],
            },
            "required": ["location", "unit"]
        }
    }
}]
        },
    },
]

response = client.chat.completions.create(
    model=client.models.list().data[0].id,
    messages=[{"role": "user", "content": "What's the weather like in San Francisco?"}],
    tools=tools,
    tool_choice="auto"
    tool_choice="auto",
)

tool_call = response.choices[0].message.tool_calls[0].function
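A hedged sketch of the natural next step (using only the `tool_functions` mapping and the `tool_call` fields already shown above) is to dispatch the call locally and print the result:

```python
import json

# Look up the requested function and invoke it with the model-provided arguments.
args = json.loads(tool_call.arguments)
print(f"Function called: {tool_call.name}")
print(f"Arguments: {tool_call.arguments}")
print(f"Result: {tool_functions[tool_call.name](**args)}")
```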
@@ -402,8 +404,7 @@ Here is a summary of a plugin file:

    # adjust request. e.g.: set skip special tokens
    # to False for tool call output.
    def adjust_request(
            self, request: ChatCompletionRequest) -> ChatCompletionRequest:
    def adjust_request(self, request: ChatCompletionRequest) -> ChatCompletionRequest:
        return request

    # implement the tool call parse for stream call
@@ -416,7 +417,7 @@ Here is a summary of a plugin file:
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
        request: ChatCompletionRequest,
    ) -> Union[DeltaMessage, None]:
    ) -> DeltaMessage | None:
        return delta

    # implement the tool parse for non-stream call