Compare commits


3 Commits

Author SHA1 Message Date
Animesh Jain
01de02e8b4 [gguf][torch.compile time] Convert to plain tensor earlier in dequantize_gguf_tensor (#13166)
[gguf] Convert to plain tensor earlier in dequantize_gguf_tensor

Once dequantize_gguf_tensor fetches the quant_type attribute from the
GGUFParameter tensor subclass, there is no further need to run the
actual dequantize operations on the Tensor subclass; we can convert
to a plain tensor right away.

This not only makes PyTorch eager execution faster, but also reduces
torch.compile tracing time from 36 seconds to 10 seconds, because there
is a lot less code to trace.
2026-02-20 09:31:52 +05:30
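
For context, a minimal, self-contained sketch of the pattern this commit describes. The QuantTensor class, SCALES table, and toy_dequantize function are invented for illustration and are not the GGUFParameter implementation; as_subclass(torch.Tensor) stands in for the codebase's as_tensor() helper.

import torch


class QuantTensor(torch.Tensor):
    """Toy tensor subclass carrying quantization metadata (hypothetical)."""

    @staticmethod
    def __new__(cls, data, quant_type):
        instance = torch.Tensor._make_subclass(cls, data)
        instance.quant_type = quant_type
        return instance

    @classmethod
    def __torch_function__(cls, func, types, args=(), kwargs=None):
        # Every op on the subclass passes through this hook: extra Python work
        # in eager mode and extra frames for the torch.compile tracer.
        return super().__torch_function__(func, types, args, kwargs or {})


SCALES = {"Q4_0": 0.0625}  # made-up scale, just to have something to compute


def toy_dequantize(tensor):
    quant_type = tensor.quant_type            # metadata read needs the subclass
    plain = tensor.as_subclass(torch.Tensor)  # drop the subclass right away
    # Everything below runs on a plain tensor, bypassing __torch_function__.
    return plain.float() * SCALES[quant_type]


x = QuantTensor(torch.randint(0, 16, (8,), dtype=torch.uint8), "Q4_0")
print(toy_dequantize(x))

Because every operation on a subclass is routed through __torch_function__, dropping the subclass as soon as the metadata has been read removes that per-op overhead and gives the tracer far less code to walk through.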
Dhruv Nair
db2d7e7bc4 [CI] Fix new LoRAHotswap tests (#13163)
update

Co-authored-by: Sayak Paul <spsayakpaul@gmail.com>
2026-02-20 09:01:20 +05:30
Sayak Paul
f8d3db9ca7 remove deps related to test from ci (#13164)
2026-02-20 08:35:35 +05:30
8 changed files with 16 additions and 40 deletions

View File

@@ -117,7 +117,7 @@ jobs:
      - name: Install dependencies
        run: |
-          uv pip install -e ".[quality,test]"
+          uv pip install -e ".[quality]"
          #uv pip uninstall transformers huggingface_hub && uv pip install --prerelease allow -U transformers@git+https://github.com/huggingface/transformers.git
          uv pip uninstall transformers huggingface_hub && uv pip install transformers==4.57.1
          uv pip uninstall accelerate && uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git --no-deps

View File

@@ -114,7 +114,7 @@ jobs:
      - name: Install dependencies
        run: |
-          uv pip install -e ".[quality,test]"
+          uv pip install -e ".[quality]"
          #uv pip uninstall transformers huggingface_hub && uv pip install --prerelease allow -U transformers@git+https://github.com/huggingface/transformers.git
          uv pip uninstall transformers huggingface_hub && uv pip install transformers==4.57.1
          uv pip uninstall accelerate && uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git --no-deps
@@ -191,7 +191,7 @@ jobs:
      - name: Install dependencies
        run: |
-          uv pip install -e ".[quality,test]"
+          uv pip install -e ".[quality]"
      - name: Environment
        run: |
@@ -242,7 +242,7 @@ jobs:
      - name: Install dependencies
        run: |
-          uv pip install -e ".[quality,test]"
+          uv pip install -e ".[quality]"
          # TODO (sayakpaul, DN6): revisit `--no-deps`
          uv pip install -U peft@git+https://github.com/huggingface/peft.git --no-deps
          uv pip install -U tokenizers

View File

@@ -41,7 +41,7 @@ jobs:
        shell: arch -arch arm64 bash {0}
        run: |
          ${CONDA_RUN} python -m pip install --upgrade pip uv
-          ${CONDA_RUN} python -m uv pip install -e ".[quality,test]"
+          ${CONDA_RUN} python -m uv pip install -e ".[quality]"
          ${CONDA_RUN} python -m uv pip install torch torchvision torchaudio
          ${CONDA_RUN} python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate.git
          ${CONDA_RUN} python -m uv pip install transformers --upgrade

View File

@@ -38,7 +38,6 @@ from ..utils import (
    is_flash_attn_available,
    is_flash_attn_version,
    is_kernels_available,
-    is_kernels_version,
    is_sageattention_available,
    is_sageattention_version,
    is_torch_npu_available,
@@ -266,7 +265,6 @@ class _HubKernelConfig:
    repo_id: str
    function_attr: str
    revision: str | None = None
-    version: int | None = None
    kernel_fn: Callable | None = None
    wrapped_forward_attr: str | None = None
    wrapped_backward_attr: str | None = None
@@ -276,31 +274,27 @@ class _HubKernelConfig:
# Registry for hub-based attention kernels
_HUB_KERNELS_REGISTRY: dict["AttentionBackendName", _HubKernelConfig] = {
    # TODO: temporary revision for now. Remove when merged upstream into `main`.
    AttentionBackendName._FLASH_3_HUB: _HubKernelConfig(
-        repo_id="kernels-community/flash-attn3", function_attr="flash_attn_func", version=1
+        repo_id="kernels-community/flash-attn3", function_attr="flash_attn_func", revision="fake-ops-return-probs"
    ),
    AttentionBackendName._FLASH_3_VARLEN_HUB: _HubKernelConfig(
        repo_id="kernels-community/flash-attn3",
        function_attr="flash_attn_varlen_func",
-        version=1,
+        # revision="fake-ops-return-probs",
    ),
    AttentionBackendName.FLASH_HUB: _HubKernelConfig(
        repo_id="kernels-community/flash-attn2",
        function_attr="flash_attn_func",
-        version=1,
+        revision=None,
        wrapped_forward_attr="flash_attn_interface._wrapped_flash_attn_forward",
        wrapped_backward_attr="flash_attn_interface._wrapped_flash_attn_backward",
    ),
    AttentionBackendName.FLASH_VARLEN_HUB: _HubKernelConfig(
-        repo_id="kernels-community/flash-attn2",
-        function_attr="flash_attn_varlen_func",
-        version=1,
+        repo_id="kernels-community/flash-attn2", function_attr="flash_attn_varlen_func", revision=None
    ),
    AttentionBackendName.SAGE_HUB: _HubKernelConfig(
-        repo_id="kernels-community/sage-attention",
-        function_attr="sageattn",
-        version=1,
+        repo_id="kernels-community/sage_attention", function_attr="sageattn", revision=None
    ),
}
@@ -470,10 +464,6 @@ def _check_attention_backend_requirements(backend: AttentionBackendName) -> None
            raise RuntimeError(
                f"Backend '{backend.value}' is not usable because the `kernels` package isn't available. Please install it with `pip install kernels`."
            )
-        if not is_kernels_version(">=", "0.12"):
-            raise RuntimeError(
-                f"Backend '{backend.value}' needs to be used with a `kernels` version of at least 0.12. Please update with `pip install -U kernels`."
-            )
    elif backend == AttentionBackendName.AITER:
        if not _CAN_USE_AITER_ATTN:
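
For context, a minimal sketch of how a registry entry like the ones above could be resolved into a callable with the `kernels` package. The helper name resolve_hub_kernel is hypothetical; only get_kernel (and its revision argument) comes from the library, so treat the exact call as an assumption rather than the diffusers code path.

from functools import lru_cache

from kernels import get_kernel  # pip install kernels


@lru_cache(maxsize=None)
def resolve_hub_kernel(repo_id: str, function_attr: str, revision: str | None = None):
    # Fetch (or reuse the cached copy of) the kernel repo, optionally pinned to a
    # revision, then pull out the requested function, e.g. `flash_attn_func`.
    kernel_module = get_kernel(repo_id, revision=revision) if revision else get_kernel(repo_id)
    return getattr(kernel_module, function_attr)


# Example mirroring the _FLASH_3_HUB entry above.
flash_attn_func = resolve_hub_kernel(
    "kernels-community/flash-attn3", "flash_attn_func", revision="fake-ops-return-probs"
)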

View File

@@ -516,6 +516,9 @@ def dequantize_gguf_tensor(tensor):
    block_size, type_size = GGML_QUANT_SIZES[quant_type]

+    # Convert to plain tensor to avoid unnecessary __torch_function__ overhead.
+    tensor = tensor.as_tensor()
+
    tensor = tensor.view(torch.uint8)
    shape = _quant_shape_from_byte_shape(tensor.shape, type_size, block_size)
@@ -525,7 +528,7 @@ def dequantize_gguf_tensor(tensor):
    dequant = dequant_fn(blocks, block_size, type_size)
    dequant = dequant.reshape(shape)

-    return dequant.as_tensor()
+    return dequant


class GGUFParameter(torch.nn.Parameter):

View File

@@ -86,7 +86,6 @@ from .import_utils import (
    is_inflect_available,
    is_invisible_watermark_available,
    is_kernels_available,
-    is_kernels_version,
    is_kornia_available,
    is_librosa_available,
    is_matplotlib_available,

View File

@@ -724,22 +724,6 @@ def is_transformers_version(operation: str, version: str):
    return compare_versions(parse(_transformers_version), operation, version)


-@cache
-def is_kernels_version(operation: str, version: str):
-    """
-    Compares the current Kernels version to a given reference with an operation.
-
-    Args:
-        operation (`str`):
-            A string representation of an operator, such as `">"` or `"<="`
-        version (`str`):
-            A version string
-    """
-    if not _kernels_available:
-        return False
-    return compare_versions(parse(_kernels_version), operation, version)
-
-
@cache
def is_hf_hub_version(operation: str, version: str):
    """

View File

@@ -375,7 +375,7 @@ class LoraHotSwappingForModelTesterMixin:
# additionally check if dynamic compilation works.
if different_shapes is not None:
for height, width in different_shapes:
new_inputs_dict = self.prepare_dummy_input(height=height, width=width)
new_inputs_dict = self.get_dummy_inputs(height=height, width=width)
_ = model(**new_inputs_dict)
else:
output0_after = model(**inputs_dict)["sample"]
@@ -390,7 +390,7 @@ class LoraHotSwappingForModelTesterMixin:
with torch.inference_mode():
if different_shapes is not None:
for height, width in different_shapes:
new_inputs_dict = self.prepare_dummy_input(height=height, width=width)
new_inputs_dict = self.get_dummy_inputs(height=height, width=width)
_ = model(**new_inputs_dict)
else:
output1_after = model(**inputs_dict)["sample"]