[XPU] Upgrade NIXL to remove CUDA dependency (#26570)
Signed-off-by: zhenwei-intel <zhenwei.liu@intel.com>
This commit is contained in:
@@ -44,6 +44,5 @@ docker run \
|
||||
pytest -v -s v1/structured_output
|
||||
pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py
|
||||
pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_shared_storage_connector.py
|
||||
pytest -v -s v1/test_metrics
|
||||
pytest -v -s v1/test_serial_utils.py
|
||||
'
|
||||
|
||||
@@ -69,4 +69,9 @@ RUN --mount=type=cache,target=/root/.cache/pip \
|
||||
|
||||
# install development dependencies (for testing)
|
||||
RUN python3 -m pip install -e tests/vllm_test_utils
|
||||
|
||||
# install nixl from source code
|
||||
RUN python3 /workspace/vllm/tools/install_nixl_from_source_ubuntu.py
|
||||
ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/lib/python3.12/dist-packages/.nixl.mesonpy.libs/plugins/"
|
||||
|
||||
ENTRYPOINT ["vllm", "serve"]
|
||||
|
||||
@@ -10,7 +10,6 @@ wheel
|
||||
jinja2>=3.1.6
|
||||
datasets # for benchmark scripts
|
||||
numba == 0.61.2 # Required for N-gram speculative decoding
|
||||
nixl==0.3.0 # for PD disaggregation
|
||||
torch==2.8.0+xpu
|
||||
torchaudio
|
||||
torchvision
|
||||
|
||||
@@ -135,6 +135,7 @@ def build_and_install_prerequisites(args):
|
||||
"--enable-devel-headers",
|
||||
"--with-verbs",
|
||||
"--enable-mt",
|
||||
"--with-ze=no",
|
||||
]
|
||||
run_command(configure_command, cwd=ucx_source_path)
|
||||
run_command(["make", "-j", str(os.cpu_count() or 1)], cwd=ucx_source_path)
|
||||
|
||||
@@ -54,6 +54,14 @@ class XPUPlatform(Platform):
|
||||
has_sink: bool,
|
||||
use_sparse,
|
||||
) -> str:
|
||||
from vllm.v1.attention.backends.utils import set_kv_cache_layout
|
||||
|
||||
set_kv_cache_layout("NHD")
|
||||
logger.info(
|
||||
"Setting VLLM_KV_CACHE_LAYOUT to 'NHD' for XPU; "
|
||||
"only NHD layout is supported by XPU attention kernels."
|
||||
)
|
||||
|
||||
from vllm.attention.backends.registry import _Backend
|
||||
|
||||
if use_sparse:
|
||||
@@ -190,13 +198,6 @@ class XPUPlatform(Platform):
|
||||
vllm_config.scheduler_config.max_model_len,
|
||||
DEFAULT_MAX_NUM_BATCHED_TOKENS,
|
||||
)
|
||||
from vllm.v1.attention.backends.utils import set_kv_cache_layout
|
||||
|
||||
set_kv_cache_layout("NHD")
|
||||
logger.info(
|
||||
"Setting VLLM_KV_CACHE_LAYOUT to 'NHD' for XPU; "
|
||||
"only NHD layout is supported by XPU attention kernels."
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def support_hybrid_kv_cache(cls) -> bool:
|
||||
|
||||
Reference in New Issue
Block a user