[XPU] Upgrade NIXL to remove CUDA dependency (#26570)

Signed-off-by: zhenwei-intel <zhenwei.liu@intel.com>
Author: liuzhenwei
Date: 2025-10-11 13:15:23 +08:00
Committed by: GitHub
Parent: 8f8474fbe3
Commit: 27ed39a347
5 changed files with 14 additions and 9 deletions

View File

@@ -44,6 +44,5 @@ docker run \
pytest -v -s v1/structured_output
pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py
pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_shared_storage_connector.py
pytest -v -s v1/test_metrics
pytest -v -s v1/test_serial_utils.py
'

View File

@@ -69,4 +69,9 @@ RUN --mount=type=cache,target=/root/.cache/pip \
# install development dependencies (for testing)
RUN python3 -m pip install -e tests/vllm_test_utils
# install nixl from source code
RUN python3 /workspace/vllm/tools/install_nixl_from_source_ubuntu.py
ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/lib/python3.12/dist-packages/.nixl.mesonpy.libs/plugins/"
ENTRYPOINT ["vllm", "serve"]
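A quick sanity check for the from-source install above is to confirm that the nixl Python package imports without a CUDA runtime in the image and that the plugin directory wired into LD_LIBRARY_PATH exists. A minimal sketch, assuming the same plugin path as the ENV line above; the nixl._api import mirrors what vLLM's NIXL KV connector uses, and the agent name and check logic are purely illustrative:

import os
import sys

PLUGIN_DIR = "/usr/local/lib/python3.12/dist-packages/.nixl.mesonpy.libs/plugins/"

# The plugin directory added to LD_LIBRARY_PATH above should exist after the
# from-source build.
if not os.path.isdir(PLUGIN_DIR):
    sys.exit(f"NIXL plugin dir missing: {PLUGIN_DIR}")

# Importing the agent API should succeed without CUDA; this is the same
# import path vLLM's NIXL KV connector uses.
from nixl._api import nixl_agent

agent = nixl_agent("smoke-test", None)  # arbitrary name; None = default config
print("NIXL agent created; plugins:", os.listdir(PLUGIN_DIR))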

View File

@@ -10,7 +10,6 @@ wheel
jinja2>=3.1.6
datasets # for benchmark scripts
numba == 0.61.2 # Required for N-gram speculative decoding
nixl==0.3.0 # for PD disaggregation
torch==2.8.0+xpu
torchaudio
torchvision

View File

@@ -135,6 +135,7 @@ def build_and_install_prerequisites(args):
"--enable-devel-headers",
"--with-verbs",
"--enable-mt",
"--with-ze=no",
]
run_command(configure_command, cwd=ucx_source_path)
run_command(["make", "-j", str(os.cpu_count() or 1)], cwd=ucx_source_path)

View File

@@ -54,6 +54,14 @@ class XPUPlatform(Platform):
        has_sink: bool,
        use_sparse,
    ) -> str:
        from vllm.v1.attention.backends.utils import set_kv_cache_layout
        set_kv_cache_layout("NHD")
        logger.info(
            "Setting VLLM_KV_CACHE_LAYOUT to 'NHD' for XPU; "
            "only NHD layout is supported by XPU attention kernels."
        )
        from vllm.attention.backends.registry import _Backend
        if use_sparse:
@@ -190,13 +198,6 @@ class XPUPlatform(Platform):
                vllm_config.scheduler_config.max_model_len,
                DEFAULT_MAX_NUM_BATCHED_TOKENS,
            )
        from vllm.v1.attention.backends.utils import set_kv_cache_layout
        set_kv_cache_layout("NHD")
        logger.info(
            "Setting VLLM_KV_CACHE_LAYOUT to 'NHD' for XPU; "
            "only NHD layout is supported by XPU attention kernels."
        )
    @classmethod
    def support_hybrid_kv_cache(cls) -> bool:
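Net effect of these last two hunks: the forced "NHD" KV-cache layout moves out of check_and_update_config (removed in the second hunk) and into get_attn_backend_cls (added in the first), so it is applied at the point where the attention backend is selected. For readers unfamiliar with the utility, a minimal sketch of a layout override in the same spirit; the real set_kv_cache_layout lives in vllm.v1.attention.backends.utils, and the module-level variable and fallback default here are illustrative:

import os
from typing import Literal, Optional

KVCacheLayout = Literal["NHD", "HND"]  # token-major vs. head-major layouts

# Illustrative process-wide override; the real utility keeps comparable state.
_layout_override: Optional[KVCacheLayout] = None

def set_kv_cache_layout(layout: KVCacheLayout) -> None:
    # Record an override that wins over the VLLM_KV_CACHE_LAYOUT env var.
    global _layout_override
    _layout_override = layout

def get_kv_cache_layout() -> KVCacheLayout:
    # Precedence: explicit override, then env var, then an assumed default.
    if _layout_override is not None:
        return _layout_override
    return os.environ.get("VLLM_KV_CACHE_LAYOUT", "NHD")  # type: ignore[return-value]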