[XPU] Upgrade NIXL to remove CUDA dependency (#26570)

Signed-off-by: zhenwei-intel <zhenwei.liu@intel.com>
2025-10-11 13:15:23 +08:00
parent 8f8474fbe3
commit 27ed39a347
5 changed files with 14 additions and 9 deletions
--- a/.buildkite/scripts/hardware_ci/run-xpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-xpu-test.sh
@@ -44,6 +44,5 @@ docker run \
    pytest -v -s v1/structured_output
    pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py
    pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_shared_storage_connector.py
-    pytest -v -s v1/test_metrics
    pytest -v -s v1/test_serial_utils.py
 '
--- a/docker/Dockerfile.xpu
+++ b/docker/Dockerfile.xpu
@@ -69,4 +69,9 @@ RUN --mount=type=cache,target=/root/.cache/pip \

 # install development dependencies (for testing)
 RUN python3 -m pip install -e tests/vllm_test_utils
+
+# install nixl from source code
+RUN python3 /workspace/vllm/tools/install_nixl_from_source_ubuntu.py
+ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/lib/python3.12/dist-packages/.nixl.mesonpy.libs/plugins/"
+
 ENTRYPOINT ["vllm", "serve"]
--- a/requirements/xpu.txt
+++ b/requirements/xpu.txt
@@ -10,7 +10,6 @@ wheel
 jinja2>=3.1.6
 datasets # for benchmark scripts
 numba == 0.61.2 # Required for N-gram speculative decoding
-nixl==0.3.0 # for PD disaggregation
 torch==2.8.0+xpu
 torchaudio
 torchvision
--- a/tools/install_nixl_from_source_ubuntu.py
+++ b/tools/install_nixl_from_source_ubuntu.py
@@ -135,6 +135,7 @@ def build_and_install_prerequisites(args):
        "--enable-devel-headers",
        "--with-verbs",
        "--enable-mt",
+        "--with-ze=no",
    ]
    run_command(configure_command, cwd=ucx_source_path)
    run_command(["make", "-j", str(os.cpu_count() or 1)], cwd=ucx_source_path)
--- a/vllm/platforms/xpu.py
+++ b/vllm/platforms/xpu.py
@@ -54,6 +54,14 @@ class XPUPlatform(Platform):
        has_sink: bool,
        use_sparse,
    ) -> str:
+        from vllm.v1.attention.backends.utils import set_kv_cache_layout
+
+        set_kv_cache_layout("NHD")
+        logger.info(
+            "Setting VLLM_KV_CACHE_LAYOUT to 'NHD' for XPU; "
+            "only NHD layout is supported by XPU attention kernels."
+        )
+
        from vllm.attention.backends.registry import _Backend

        if use_sparse:
@@ -190,13 +198,6 @@ class XPUPlatform(Platform):
                vllm_config.scheduler_config.max_model_len,
                DEFAULT_MAX_NUM_BATCHED_TOKENS,
            )
-        from vllm.v1.attention.backends.utils import set_kv_cache_layout
-
-        set_kv_cache_layout("NHD")
-        logger.info(
-            "Setting VLLM_KV_CACHE_LAYOUT to 'NHD' for XPU; "
-            "only NHD layout is supported by XPU attention kernels."
-        )

    @classmethod
    def support_hybrid_kv_cache(cls) -> bool: