mirror of https://github.com/vllm-project/vllm.git
synced 2025-12-06 15:04:47 +08:00

[V0 deprecation] Remove VLLM_USE_V1 env (#28204)

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
@@ -76,7 +76,7 @@ function cpu_tests() {
 # Run AWQ test
 # docker exec cpu-test-"$NUMA_NODE" bash -c "
 # set -e
-# VLLM_USE_V1=0 pytest -x -s -v \
+# pytest -x -s -v \
 # tests/quantization/test_ipex_quant.py"

 # Run multi-lora tests
@@ -4,8 +4,7 @@
 This file demonstrates the usage of text generation with an LLM model,
 comparing the performance with and without speculative decoding.

-Note that still not support `v1`:
-    VLLM_USE_V1=0 python examples/offline_inference/mlpspeculator.py
+Note that this example is out of date and not supported in vLLM v1.
 """

 import gc
@@ -11,12 +11,10 @@ python examples/offline_inference/qwen2_5_omni/only_thinker.py \

 # Read vision and audio inputs from a single video file
 # NOTE: V1 engine does not support interleaved modalities yet.
-VLLM_USE_V1=0 \
 python examples/offline_inference/qwen2_5_omni/only_thinker.py \
     -q use_audio_in_video

 # Multiple audios
-VLLM_USE_V1=0 \
 python examples/offline_inference/qwen2_5_omni/only_thinker.py \
     -q multi_audios
 ```
@@ -7,7 +7,6 @@ with the correct prompt format on Qwen2.5-Omni (thinker only).

 from typing import NamedTuple

-import vllm.envs as envs
 from vllm import LLM, SamplingParams
 from vllm.assets.audio import AudioAsset
 from vllm.assets.image import ImageAsset
@@ -72,11 +71,7 @@ def get_use_audio_in_video_query() -> QueryResult:
     )
     asset = VideoAsset(name="baby_reading", num_frames=16)
     audio = asset.get_audio(sampling_rate=16000)
-    assert not envs.VLLM_USE_V1, (
-        "V1 does not support use_audio_in_video. "
-        "Please launch this example with "
-        "`VLLM_USE_V1=0`."
-    )

     return QueryResult(
         inputs={
             "prompt": prompt,
@@ -37,7 +37,7 @@ from vllm.config import KVTransferConfig
 from vllm.engine.arg_utils import EngineArgs


-def setup_environment_variables(vllm_version: str):
+def setup_environment_variables():
     # LMCache-related environment variables
     # Use experimental features in LMCache
     os.environ["LMCACHE_USE_EXPERIMENTAL"] = "True"
@@ -47,12 +47,10 @@ def setup_environment_variables(vllm_version: str):
     os.environ["LMCACHE_LOCAL_CPU"] = "True"
     # Set local CPU memory limit to 5.0 GB
     os.environ["LMCACHE_MAX_LOCAL_CPU_SIZE"] = "5.0"
-    if vllm_version == "v0":
-        os.environ["VLLM_USE_V1"] = "0"


 @contextlib.contextmanager
-def build_llm_with_lmcache(lmcache_connector: str, model: str, vllm_version: str):
+def build_llm_with_lmcache(lmcache_connector: str, model: str):
     ktc = KVTransferConfig(
         kv_connector=lmcache_connector,
         kv_role="kv_both",
@@ -60,21 +58,12 @@ def build_llm_with_lmcache(lmcache_connector: str, model: str, vllm_version: str
     # Set GPU memory utilization to 0.8 for an A40 GPU with 40GB
     # memory. Reduce the value if your GPU has less memory.
     # Note: LMCache supports chunked prefill (see vLLM#14505, LMCache#392).
-    if vllm_version == "v0":
-        llm_args = EngineArgs(
-            model=model,
-            kv_transfer_config=ktc,
-            max_model_len=8000,
-            gpu_memory_utilization=0.8,
-            enable_chunked_prefill=True,  # Only in v0
-        )
-    else:
-        llm_args = EngineArgs(
-            model=model,
-            kv_transfer_config=ktc,
-            max_model_len=8000,
-            gpu_memory_utilization=0.8,
-        )
+    llm_args = EngineArgs(
+        model=model,
+        kv_transfer_config=ktc,
+        max_model_len=8000,
+        gpu_memory_utilization=0.8,
+    )

     llm = LLM(**asdict(llm_args))
     try:
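A side note on the construction kept by this hunk: `EngineArgs` is a dataclass (that is why `asdict` applies to it), so `LLM(**asdict(llm_args))` simply expands its fields into keyword arguments. A self-contained illustration of that idiom with generic, made-up names (not vLLM's real classes):

```python
from dataclasses import asdict, dataclass


@dataclass
class Args:
    model: str
    max_model_len: int = 8000
    gpu_memory_utilization: float = 0.8


def build_engine(**kwargs):
    # Stand-in for a constructor invoked as LLM(**asdict(llm_args)).
    return kwargs


args = Args(model="some-model")
engine = build_engine(**asdict(args))
# -> {'model': 'some-model', 'max_model_len': 8000, 'gpu_memory_utilization': 0.8}
```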
@@ -116,18 +105,10 @@ def parse_args():


 def main():
     args = parse_args()

-    if args.version == "v0":
-        lmcache_connector = "LMCacheConnector"
-        model = "mistralai/Mistral-7B-Instruct-v0.2"
-    else:
-        lmcache_connector = "LMCacheConnectorV1"
-        model = "meta-llama/Meta-Llama-3.1-8B-Instruct"
-
-    setup_environment_variables(args.version)
-
-    with build_llm_with_lmcache(lmcache_connector, model, args.version) as llm:
+    lmcache_connector = "LMCacheConnectorV1"
+    model = "meta-llama/Meta-Llama-3.1-8B-Instruct"
+
+    setup_environment_variables()
+
+    with build_llm_with_lmcache(lmcache_connector, model) as llm:
         # This example script runs two requests with a shared prefix.
         # Define the shared prompt and specific prompts
         shared_prompt = "Hello, how are you?" * 1000
@@ -22,9 +22,6 @@ def monkeypatch_module():

 @pytest.fixture(scope="module", params=[True])
 def server(request, monkeypatch_module):
-    use_v1 = request.param
-    monkeypatch_module.setenv("VLLM_USE_V1", "1" if use_v1 else "0")
-
     args = [
         "--dtype",
         "bfloat16",
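The test hunk above keeps the module-scoped `monkeypatch_module` fixture but no longer uses it to force `VLLM_USE_V1`. For readers unfamiliar with the pattern, a module-scoped monkeypatch is usually built by hand because pytest's built-in `monkeypatch` fixture is function-scoped; a minimal sketch of what such a fixture commonly looks like (an assumption about its shape, not copied from the test file):

```python
import pytest
from _pytest.monkeypatch import MonkeyPatch


@pytest.fixture(scope="module")
def monkeypatch_module():
    # Create a MonkeyPatch instance that lives for the whole module and
    # undo all patches once every test in the module has run.
    mpatch = MonkeyPatch()
    yield mpatch
    mpatch.undo()
```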
vllm/envs.py
@@ -100,7 +100,6 @@ if TYPE_CHECKING:
     VLLM_SKIP_P2P_CHECK: bool = False
     VLLM_DISABLED_KERNELS: list[str] = []
     VLLM_DISABLE_PYNCCL: bool = False
-    VLLM_USE_V1: bool = True
     VLLM_ROCM_USE_AITER: bool = False
     VLLM_ROCM_USE_AITER_PAGED_ATTN: bool = False
     VLLM_ROCM_USE_AITER_LINEAR: bool = True
@@ -884,8 +883,6 @@ environment_variables: dict[str, Callable[[], Any]] = {
     "VLLM_DISABLE_PYNCCL": lambda: (
         os.getenv("VLLM_DISABLE_PYNCCL", "False").lower() in ("true", "1")
     ),
-    # If set, use the V1 code path.
-    "VLLM_USE_V1": lambda: bool(int(os.getenv("VLLM_USE_V1", "1"))),
     # Disable aiter ops unless specifically enabled.
     # Acts as a parent switch to enable the rest of the other operations.
     "VLLM_ROCM_USE_AITER": lambda: (
@@ -1538,16 +1535,6 @@ def is_set(name: str):
     raise AttributeError(f"module {__name__!r} has no attribute {name!r}")


-def set_vllm_use_v1(use_v1: bool):
-    if is_set("VLLM_USE_V1"):
-        raise ValueError(
-            "Should not call set_vllm_use_v1() if VLLM_USE_V1 is set "
-            "explicitly by the user. Please raise this as a Github "
-            "Issue and explicitly set VLLM_USE_V1=0 or 1."
-        )
-    os.environ["VLLM_USE_V1"] = "1" if use_v1 else "0"
-
-
 def compute_hash() -> str:
     """
     WARNING: Whenever a new key is added to this environment
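As background for the two `vllm/envs.py` hunks above: values such as `envs.VLLM_DISABLE_PYNCCL` are resolved lazily through the `environment_variables` registry, so deleting an entry removes the attribute entirely. A minimal sketch of that lookup pattern (simplified and partly assumed, not copied verbatim from `vllm/envs.py`):

```python
import os

# Registry pattern from the hunk above: each key maps to a lambda that
# parses the corresponding environment variable when the attribute is read.
environment_variables = {
    "VLLM_DISABLE_PYNCCL": lambda: (
        os.getenv("VLLM_DISABLE_PYNCCL", "False").lower() in ("true", "1")
    ),
}


def __getattr__(name: str):
    # Module-level __getattr__ (PEP 562): accessing envs.SOME_VAR from another
    # module triggers the matching lambda; unknown names raise AttributeError.
    if name in environment_variables:
        return environment_variables[name]()
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
```

With `VLLM_USE_V1` gone from the registry, any remaining `envs.VLLM_USE_V1` access would now raise `AttributeError`, which is why the example and test call sites above are updated in the same commit.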
@@ -42,7 +42,6 @@ _USAGE_ENV_VARS_TO_COLLECT = [
     "VLLM_USE_FLASHINFER_SAMPLER",
     "VLLM_PP_LAYER_PARTITION",
     "VLLM_USE_TRITON_AWQ",
-    "VLLM_USE_V1",
     "VLLM_ENABLE_V1_MULTIPROCESSING",
 ]
