[V0 deprecation] Remove VLLM_USE_V1 env (#28204)

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
Author: wangxiyuan
Date: 2025-11-12 09:22:16 +08:00
Committed by: GitHub
Parent: 3f770f4427
Commit: e1710393c4

8 changed files with 15 additions and 59 deletions

View File

@@ -76,7 +76,7 @@ function cpu_tests() {
# Run AWQ test
# docker exec cpu-test-"$NUMA_NODE" bash -c "
# set -e
-# VLLM_USE_V1=0 pytest -x -s -v \
+# pytest -x -s -v \
# tests/quantization/test_ipex_quant.py"
# Run multi-lora tests

View File

@@ -4,8 +4,7 @@
This file demonstrates the usage of text generation with an LLM model,
comparing the performance with and without speculative decoding.
-Note that still not support `v1`:
-VLLM_USE_V1=0 python examples/offline_inference/mlpspeculator.py
+Note that this example is out of date and not supported in vLLM v1.
"""
import gc

View File

@@ -11,12 +11,10 @@ python examples/offline_inference/qwen2_5_omni/only_thinker.py \
# Read vision and audio inputs from a single video file
# NOTE: V1 engine does not support interleaved modalities yet.
-VLLM_USE_V1=0 \
python examples/offline_inference/qwen2_5_omni/only_thinker.py \
-q use_audio_in_video
# Multiple audios
-VLLM_USE_V1=0 \
python examples/offline_inference/qwen2_5_omni/only_thinker.py \
-q multi_audios
```

View File

@@ -7,7 +7,6 @@ with the correct prompt format on Qwen2.5-Omni (thinker only).
from typing import NamedTuple
-import vllm.envs as envs
from vllm import LLM, SamplingParams
from vllm.assets.audio import AudioAsset
from vllm.assets.image import ImageAsset
@@ -72,11 +71,7 @@ def get_use_audio_in_video_query() -> QueryResult:
)
asset = VideoAsset(name="baby_reading", num_frames=16)
audio = asset.get_audio(sampling_rate=16000)
-assert not envs.VLLM_USE_V1, (
-"V1 does not support use_audio_in_video. "
-"Please launch this example with "
-"`VLLM_USE_V1=0`."
-)
return QueryResult(
inputs={
"prompt": prompt,

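For context, the QueryResult built above (prompt plus multi-modal inputs in its inputs dict) is what the example ultimately feeds to LLM.generate. A minimal sketch of that consumption path, assuming the get_use_audio_in_video_query() helper defined in this file; the model name, multi-modal limits, and sampling settings below are illustrative, not taken from the diff:

from vllm import LLM, SamplingParams

# Illustrative configuration; the real example sets up its own LLM instance.
llm = LLM(
    model="Qwen/Qwen2.5-Omni-7B",
    limit_mm_per_prompt={"audio": 1, "video": 1},
)
sampling_params = SamplingParams(temperature=0.2, max_tokens=64)

query = get_use_audio_in_video_query()  # defined earlier in this file
outputs = llm.generate(query.inputs, sampling_params=sampling_params)
print(outputs[0].outputs[0].text)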
View File

@@ -37,7 +37,7 @@ from vllm.config import KVTransferConfig
from vllm.engine.arg_utils import EngineArgs
-def setup_environment_variables(vllm_version: str):
+def setup_environment_variables():
# LMCache-related environment variables
# Use experimental features in LMCache
os.environ["LMCACHE_USE_EXPERIMENTAL"] = "True"
@@ -47,12 +47,10 @@ def setup_environment_variables(vllm_version: str):
os.environ["LMCACHE_LOCAL_CPU"] = "True"
# Set local CPU memory limit to 5.0 GB
os.environ["LMCACHE_MAX_LOCAL_CPU_SIZE"] = "5.0"
-if vllm_version == "v0":
-os.environ["VLLM_USE_V1"] = "0"
@contextlib.contextmanager
-def build_llm_with_lmcache(lmcache_connector: str, model: str, vllm_version: str):
+def build_llm_with_lmcache(lmcache_connector: str, model: str):
ktc = KVTransferConfig(
kv_connector=lmcache_connector,
kv_role="kv_both",
@@ -60,21 +58,12 @@ def build_llm_with_lmcache(lmcache_connector: str, model: str, vllm_version: str
# Set GPU memory utilization to 0.8 for an A40 GPU with 40GB
# memory. Reduce the value if your GPU has less memory.
# Note: LMCache supports chunked prefill (see vLLM#14505, LMCache#392).
-if vllm_version == "v0":
-llm_args = EngineArgs(
-model=model,
-kv_transfer_config=ktc,
-max_model_len=8000,
-gpu_memory_utilization=0.8,
-enable_chunked_prefill=True, # Only in v0
-)
-else:
-llm_args = EngineArgs(
-model=model,
-kv_transfer_config=ktc,
-max_model_len=8000,
-gpu_memory_utilization=0.8,
-)
+llm_args = EngineArgs(
+model=model,
+kv_transfer_config=ktc,
+max_model_len=8000,
+gpu_memory_utilization=0.8,
+)
llm = LLM(**asdict(llm_args))
try:
@@ -116,18 +105,10 @@ def parse_args():
def main():
args = parse_args()
-if args.version == "v0":
-lmcache_connector = "LMCacheConnector"
-model = "mistralai/Mistral-7B-Instruct-v0.2"
-else:
-lmcache_connector = "LMCacheConnectorV1"
-model = "meta-llama/Meta-Llama-3.1-8B-Instruct"
-setup_environment_variables(args.version)
-with build_llm_with_lmcache(lmcache_connector, model, args.version) as llm:
+lmcache_connector = "LMCacheConnectorV1"
+model = "meta-llama/Meta-Llama-3.1-8B-Instruct"
+setup_environment_variables()
+with build_llm_with_lmcache(lmcache_connector, model) as llm:
# This example script runs two requests with a shared prefix.
# Define the shared prompt and specific prompts
shared_prompt = "Hello, how are you?" * 1000

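Taken together, the example above now always drives LMCache through the V1 connector. A minimal sketch of the resulting flow, restating pieces visible in this diff (the LMCACHE_* environment settings, connector name, model, and EngineArgs values come from the surrounding lines; the final generate call and its prompts are illustrative):

import os
from dataclasses import asdict

from vllm import LLM, SamplingParams
from vllm.config import KVTransferConfig
from vllm.engine.arg_utils import EngineArgs

# LMCache settings from the example: experimental features on, local CPU
# backend enabled, with a 5.0 GB CPU memory limit.
os.environ["LMCACHE_USE_EXPERIMENTAL"] = "True"
os.environ["LMCACHE_LOCAL_CPU"] = "True"
os.environ["LMCACHE_MAX_LOCAL_CPU_SIZE"] = "5.0"

# Wire the engine to LMCache through the V1 connector.
ktc = KVTransferConfig(kv_connector="LMCacheConnectorV1", kv_role="kv_both")
llm_args = EngineArgs(
    model="meta-llama/Meta-Llama-3.1-8B-Instruct",
    kv_transfer_config=ktc,
    max_model_len=8000,
    gpu_memory_utilization=0.8,
)
llm = LLM(**asdict(llm_args))

# Two requests sharing a long prefix exercise the KV-cache offload path.
shared_prompt = "Hello, how are you?" * 1000
outputs = llm.generate(
    [shared_prompt + " Please tell me a joke.",
     shared_prompt + " Please introduce yourself."],
    SamplingParams(temperature=0.0, max_tokens=32),
)
for out in outputs:
    print(out.outputs[0].text)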
View File

@@ -22,9 +22,6 @@ def monkeypatch_module():
@pytest.fixture(scope="module", params=[True])
def server(request, monkeypatch_module):
-use_v1 = request.param
-monkeypatch_module.setenv("VLLM_USE_V1", "1" if use_v1 else "0")
args = [
"--dtype",
"bfloat16",

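The fixture above used the module-scoped monkeypatch only to pin VLLM_USE_V1; with the env var gone, just the server arguments remain. For reference, a common way to build such a module-scoped monkeypatch fixture (a sketch of the usual pattern, not necessarily this test file's exact implementation):

import pytest

@pytest.fixture(scope="module")
def monkeypatch_module():
    # pytest's built-in monkeypatch fixture is function-scoped, so a
    # module-scoped variant instantiates MonkeyPatch directly and undoes
    # its changes once the module's tests have finished.
    mp = pytest.MonkeyPatch()
    yield mp
    mp.undo()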
View File

@@ -100,7 +100,6 @@ if TYPE_CHECKING:
VLLM_SKIP_P2P_CHECK: bool = False
VLLM_DISABLED_KERNELS: list[str] = []
VLLM_DISABLE_PYNCCL: bool = False
-VLLM_USE_V1: bool = True
VLLM_ROCM_USE_AITER: bool = False
VLLM_ROCM_USE_AITER_PAGED_ATTN: bool = False
VLLM_ROCM_USE_AITER_LINEAR: bool = True
@@ -884,8 +883,6 @@ environment_variables: dict[str, Callable[[], Any]] = {
"VLLM_DISABLE_PYNCCL": lambda: (
os.getenv("VLLM_DISABLE_PYNCCL", "False").lower() in ("true", "1")
),
-# If set, use the V1 code path.
-"VLLM_USE_V1": lambda: bool(int(os.getenv("VLLM_USE_V1", "1"))),
# Disable aiter ops unless specifically enabled.
# Acts as a parent switch to enable the rest of the other operations.
"VLLM_ROCM_USE_AITER": lambda: (
@@ -1538,16 +1535,6 @@ def is_set(name: str):
raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
-def set_vllm_use_v1(use_v1: bool):
-if is_set("VLLM_USE_V1"):
-raise ValueError(
-"Should not call set_vllm_use_v1() if VLLM_USE_V1 is set "
-"explicitly by the user. Please raise this as a Github "
-"Issue and explicitly set VLLM_USE_V1=0 or 1."
-)
-os.environ["VLLM_USE_V1"] = "1" if use_v1 else "0"
def compute_hash() -> str:
"""
WARNING: Whenever a new key is added to this environment

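For readers unfamiliar with vllm/envs.py, the deleted VLLM_USE_V1 entry followed its lazy-lookup pattern: environment_variables maps a name to a zero-argument callable, and a module-level __getattr__ evaluates that callable on attribute access (falling through to the AttributeError shown above). A stripped-down sketch of the pattern; the surviving variable used here is only an example and its default is assumed:

import os
from typing import Any, Callable

environment_variables: dict[str, Callable[[], Any]] = {
    # Same shape as the deleted VLLM_USE_V1 entry: read the variable lazily
    # and coerce it to the right type. The default here is assumed.
    "VLLM_ENABLE_V1_MULTIPROCESSING": lambda: bool(
        int(os.getenv("VLLM_ENABLE_V1_MULTIPROCESSING", "1"))
    ),
}

def __getattr__(name: str) -> Any:
    # Evaluate the accessor on attribute access, mirroring vllm.envs.
    if name in environment_variables:
        return environment_variables[name]()
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")

def is_set(name: str) -> bool:
    # A flag counts as explicitly set only if the user exported it.
    return name in environment_variables and name in os.environ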
View File

@@ -42,7 +42,6 @@ _USAGE_ENV_VARS_TO_COLLECT = [
"VLLM_USE_FLASHINFER_SAMPLER",
"VLLM_PP_LAYER_PARTITION",
"VLLM_USE_TRITON_AWQ",
"VLLM_USE_V1",
"VLLM_ENABLE_V1_MULTIPROCESSING",
]
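
With VLLM_USE_V1 dropped from the collection list, usage reporting records only the remaining flags. A hedged sketch of how a list like this can be turned into a report payload; collect_env_vars is an illustrative helper, not the actual usage_lib implementation:

import os

_USAGE_ENV_VARS_TO_COLLECT = [
    "VLLM_USE_FLASHINFER_SAMPLER",
    "VLLM_PP_LAYER_PARTITION",
    "VLLM_USE_TRITON_AWQ",
    "VLLM_ENABLE_V1_MULTIPROCESSING",
]

def collect_env_vars() -> dict[str, str]:
    # Only report variables the user actually exported.
    return {
        name: os.environ[name]
        for name in _USAGE_ENV_VARS_TO_COLLECT
        if name in os.environ
    }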