[Compressed Tensors] Add XPU wNa16 support (#29484)
Signed-off-by: yiliu30 <yi4.liu@intel.com>
@@ -38,6 +38,7 @@ docker run \
 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -cc.cudagraph_mode=NONE
 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
+python3 examples/offline_inference/basic/generate.py --model Intel/Qwen2.5-0.5B-W4A16-G128-AutoRound-LLMC-TEST-ONLY --enforce-eager
 VLLM_ATTENTION_BACKEND=TRITON_ATTN python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
 cd tests
 pytest -v -s v1/core
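The added CI command above exercises the new XPU wNa16 path end to end with a compressed-tensors AutoRound checkpoint. For local testing, a minimal offline-inference sketch (assuming an XPU-enabled vLLM build; the model name is taken from the CI command above) could look like this:

from vllm import LLM, SamplingParams

# Minimal sketch: run the W4A16 AutoRound test checkpoint through vLLM.
llm = LLM(
    model="Intel/Qwen2.5-0.5B-W4A16-G128-AutoRound-LLMC-TEST-ONLY",
    enforce_eager=True,  # mirrors --enforce-eager in the CI command
)
outputs = llm.generate(["Hello, my name is"], SamplingParams(max_tokens=32))
print(outputs[0].outputs[0].text)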
@@ -30,6 +30,9 @@ from vllm.model_executor.layers.quantization.kernels.mixed_precision.MPLinearKer
     MPLinearKernel,
     MPLinearLayerConfig,
 )
+from vllm.model_executor.layers.quantization.kernels.mixed_precision.xpu import ( # noqa: E501
+    XPUwNa16LinearKernel,
+)
 from vllm.platforms import current_platform

 # in priority/performance order (when available)
@@ -42,6 +45,7 @@ _POSSIBLE_KERNELS: list[type[MPLinearKernel]] = [
     BitBLASLinearKernel,
     ConchLinearKernel,
     ExllamaLinearKernel,
+    XPUwNa16LinearKernel,
 ]
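Per the "priority/performance order" comment, _POSSIBLE_KERNELS is scanned in order and the first kernel that reports it can implement the layer config is chosen; XPUwNa16LinearKernel is appended at the end, so it is tried after the other kernels. The sketch below is illustrative only (select_kernel is a hypothetical helper, not vLLM's actual selector) and shows that selection pattern:

def select_kernel(config, kernels=None):
    # Hypothetical helper: walk the priority-ordered list and return the
    # first kernel whose can_implement() accepts this layer config.
    failures = []
    for kernel in kernels or _POSSIBLE_KERNELS:
        ok, reason = kernel.can_implement(config)
        if ok:
            return kernel
        failures.append(f"{kernel.__name__}: {reason}")
    raise ValueError("No compatible kernel:\n" + "\n".join(failures))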
vllm/model_executor/layers/quantization/kernels/mixed_precision/xpu.py (new file)
@@ -0,0 +1,97 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import torch

from vllm.platforms import current_platform

from .MPLinearKernel import MPLinearKernel, MPLinearLayerConfig


class XPUwNa16LinearKernel(MPLinearKernel):
    """wNa16 mixed-precision linear kernel backed by IPEX weight-only
    quantization on Intel XPU devices."""

    @classmethod
    def get_min_capability(cls) -> int:
        return 0

    @classmethod
    def can_implement(cls, c: MPLinearLayerConfig) -> tuple[bool, str | None]:
        if not current_platform.is_xpu():
            return False, "IPEX wNa16 is only supported on XPU devices"

        # TODO: (yiliu30) relax these restrictions in later PRs
        if c.zero_points:
            return False, "Zero points are not supported for now"

        return True, None

    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
        from packaging import version

        MIN_IPEX_VERSION = "2.6.0"
        bias = layer.bias if not layer.skip_bias_add else None

        try:
            import intel_extension_for_pytorch as ipex

            if version.parse(ipex.__version__) < version.parse(MIN_IPEX_VERSION):
                raise ImportError(
                    "intel_extension_for_pytorch version is "
                    "too old. Please install "
                    f"intel_extension_for_pytorch>={MIN_IPEX_VERSION}."
                )
        except ImportError as err:
            raise ImportError(
                "Please install "
                f"intel_extension_for_pytorch>={MIN_IPEX_VERSION} via "
                f"`pip install intel_extension_for_pytorch>={MIN_IPEX_VERSION}`"
                " to use the IPEX weight-only quantized linear method."
            ) from err
        # Use INT8 as the compute dtype (lowp_mode) to leverage
        # higher-performance instructions.
        lowp_mode = ipex.quantization.WoqLowpMode.INT8
        # The weight will be de-packed from INT4 to INT8.
        weight_dtype = ipex.quantization.WoqWeightDtype.INT4
        # The float activation will be quantized (dynamic, per-token) to INT8.
        act_quant_mode = ipex.quantization.WoqActQuantMode.PER_BATCH

        qconfig = ipex.quantization.get_weight_only_quant_qconfig_mapping(
            weight_dtype=weight_dtype,
            lowp_mode=lowp_mode,
            act_quant_mode=act_quant_mode,
            group_size=self.config.group_size,
            weight_qscheme=ipex.quantization.WoqWeightQScheme.SYMMETRIC,
        )
        qweight = layer.weight_packed
        g_idx = layer.weight_g_idx if self.config.has_g_idx else None
        scales = layer.weight_scale
        qzeros = None
        if self.config.zero_points:
            qzeros = layer.weight_zero_point.contiguous()
        qweight = qweight.t().contiguous()
        scales = scales.t().contiguous()
        layer.ipex_output_size = self.config.partition_weight_shape[1]
        layer.ipex_qlinear = (
            ipex.llm.quantization.woq_linear.IPEXWeightOnlyQuantizedLinear.from_weight(
                qweight,
                scales,
                qzeros,
                in_features=self.config.partition_weight_shape[0],
                out_features=self.config.partition_weight_shape[1],
                qconfig=qconfig,
                g_idx=g_idx,
                bias=bias,
                group_size=self.config.group_size,
                quant_method=0,  # `0` stands for the IPEX GPTQ format
            )
        )

    def apply_weights(
        self,
        layer: torch.nn.Module,
        x: torch.Tensor,
        bias: torch.Tensor | None = None,
    ) -> torch.Tensor:
        reshaped_x = x.reshape(-1, x.shape[-1])
        out = layer.ipex_qlinear(reshaped_x)
        return out.reshape(x.shape[:-1] + (layer.ipex_output_size,))
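apply_weights() flattens all leading dimensions of the input, runs the prepacked IPEX qlinear, and then restores the original shape with ipex_output_size as the last dimension. The standalone sketch below reproduces just that shape handling, with a plain torch.nn.Linear standing in for the IPEX module (illustration only, no IPEX required):

import torch

# Stand-in for layer.ipex_qlinear; only the flatten/restore pattern matters here.
in_features, out_features = 64, 128
stand_in = torch.nn.Linear(in_features, out_features, bias=False)

x = torch.randn(2, 5, in_features)                  # (batch, seq, hidden)
reshaped_x = x.reshape(-1, x.shape[-1])             # (batch * seq, hidden)
out = stand_in(reshaped_x)
out = out.reshape(x.shape[:-1] + (out_features,))   # (batch, seq, out_features)
assert out.shape == (2, 5, out_features)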