@@ -38,6 +38,7 @@ from ..utils import (
     is_flash_attn_available,
     is_flash_attn_version,
     is_kernels_available,
+    is_kernels_version,
     is_sageattention_available,
     is_sageattention_version,
     is_torch_npu_available,
@@ -62,6 +63,8 @@ _REQUIRED_FLEX_VERSION = "2.5.0"
 _REQUIRED_XLA_VERSION = "2.2"
 _REQUIRED_XFORMERS_VERSION = "0.0.29"
 
+logger = get_logger(__name__)  # pylint: disable=invalid-name
+
 _CAN_USE_FLASH_ATTN = is_flash_attn_available() and is_flash_attn_version(">=", _REQUIRED_FLASH_VERSION)
 _CAN_USE_FLASH_ATTN_3 = is_flash_attn_3_available()
 _CAN_USE_AITER_ATTN = is_aiter_available() and is_aiter_version(">=", _REQUIRED_AITER_VERSION)
@@ -73,8 +76,18 @@ _CAN_USE_XFORMERS_ATTN = is_xformers_available() and is_xformers_version(">=", _REQUIRED_XFORMERS_VERSION)
 
 
 if _CAN_USE_FLASH_ATTN:
-    from flash_attn import flash_attn_func, flash_attn_varlen_func
-    from flash_attn.flash_attn_interface import _wrapped_flash_attn_backward, _wrapped_flash_attn_forward
+    try:
+        from flash_attn import flash_attn_func, flash_attn_varlen_func
+        from flash_attn.flash_attn_interface import _wrapped_flash_attn_backward, _wrapped_flash_attn_forward
+    except (ImportError, OSError, RuntimeError) as e:
+        # Handle ABI mismatch or other import failures gracefully.
+        # This can happen when flash_attn was compiled against a different PyTorch version.
+        logger.warning(f"flash_attn is installed but failed to import: {e}. Falling back to native PyTorch attention.")
+        _CAN_USE_FLASH_ATTN = False
+        flash_attn_func = None
+        flash_attn_varlen_func = None
+        _wrapped_flash_attn_backward = None
+        _wrapped_flash_attn_forward = None
 else:
     flash_attn_func = None
     flash_attn_varlen_func = None
@@ -83,26 +96,47 @@ else:
 if _CAN_USE_FLASH_ATTN_3:
-    from flash_attn_interface import flash_attn_func as flash_attn_3_func
-    from flash_attn_interface import flash_attn_varlen_func as flash_attn_3_varlen_func
+    try:
+        from flash_attn_interface import flash_attn_func as flash_attn_3_func
+        from flash_attn_interface import flash_attn_varlen_func as flash_attn_3_varlen_func
+    except (ImportError, OSError, RuntimeError) as e:
+        logger.warning(f"flash_attn_3 failed to import: {e}. Falling back to native attention.")
+        _CAN_USE_FLASH_ATTN_3 = False
+        flash_attn_3_func = None
+        flash_attn_3_varlen_func = None
 else:
     flash_attn_3_func = None
     flash_attn_3_varlen_func = None
 
 
 if _CAN_USE_AITER_ATTN:
-    from aiter import flash_attn_func as aiter_flash_attn_func
+    try:
+        from aiter import flash_attn_func as aiter_flash_attn_func
+    except (ImportError, OSError, RuntimeError) as e:
+        logger.warning(f"aiter failed to import: {e}. Falling back to native attention.")
+        _CAN_USE_AITER_ATTN = False
+        aiter_flash_attn_func = None
 else:
     aiter_flash_attn_func = None
 
 
 if _CAN_USE_SAGE_ATTN:
-    from sageattention import (
-        sageattn,
-        sageattn_qk_int8_pv_fp8_cuda,
-        sageattn_qk_int8_pv_fp8_cuda_sm90,
-        sageattn_qk_int8_pv_fp16_cuda,
-        sageattn_qk_int8_pv_fp16_triton,
-        sageattn_varlen,
-    )
+    try:
+        from sageattention import (
+            sageattn,
+            sageattn_qk_int8_pv_fp8_cuda,
+            sageattn_qk_int8_pv_fp8_cuda_sm90,
+            sageattn_qk_int8_pv_fp16_cuda,
+            sageattn_qk_int8_pv_fp16_triton,
+            sageattn_varlen,
+        )
+    except (ImportError, OSError, RuntimeError) as e:
+        logger.warning(f"sageattention failed to import: {e}. Falling back to native attention.")
+        _CAN_USE_SAGE_ATTN = False
+        sageattn = None
+        sageattn_qk_int8_pv_fp8_cuda = None
+        sageattn_qk_int8_pv_fp8_cuda_sm90 = None
+        sageattn_qk_int8_pv_fp16_cuda = None
+        sageattn_qk_int8_pv_fp16_triton = None
+        sageattn_varlen = None
 else:
     sageattn = None
     sageattn_qk_int8_pv_fp16_cuda = None
@@ -113,26 +147,48 @@ else:
 
 if _CAN_USE_FLEX_ATTN:
-    # We cannot import the flex_attention function from the package directly because it is expected (from the
-    # pytorch documentation) that the user may compile it. If we import directly, we will not have access to the
-    # compiled function.
-    import torch.nn.attention.flex_attention as flex_attention
+    try:
+        # We cannot import the flex_attention function from the package directly because it is expected (from the
+        # pytorch documentation) that the user may compile it. If we import directly, we will not have access to the
+        # compiled function.
+        import torch.nn.attention.flex_attention as flex_attention
+    except (ImportError, OSError, RuntimeError) as e:
+        logger.warning(f"flex_attention failed to import: {e}. Falling back to native attention.")
+        _CAN_USE_FLEX_ATTN = False
+        flex_attention = None
 else:
     flex_attention = None
 
 
 if _CAN_USE_NPU_ATTN:
-    from torch_npu import npu_fusion_attention
+    try:
+        from torch_npu import npu_fusion_attention
+    except (ImportError, OSError, RuntimeError) as e:
+        logger.warning(f"torch_npu failed to import: {e}. Falling back to native attention.")
+        _CAN_USE_NPU_ATTN = False
+        npu_fusion_attention = None
 else:
     npu_fusion_attention = None
 
 
 if _CAN_USE_XLA_ATTN:
-    from torch_xla.experimental.custom_kernel import flash_attention as xla_flash_attention
+    try:
+        from torch_xla.experimental.custom_kernel import flash_attention as xla_flash_attention
+    except (ImportError, OSError, RuntimeError) as e:
+        logger.warning(f"torch_xla failed to import: {e}. Falling back to native attention.")
+        _CAN_USE_XLA_ATTN = False
+        xla_flash_attention = None
 else:
     xla_flash_attention = None
 
 
 if _CAN_USE_XFORMERS_ATTN:
-    import xformers.ops as xops
+    try:
+        import xformers.ops as xops
+    except (ImportError, OSError, RuntimeError) as e:
+        logger.warning(f"xformers failed to import: {e}. Falling back to native attention.")
+        _CAN_USE_XFORMERS_ATTN = False
+        xops = None
 else:
     xops = None
@@ -158,8 +214,6 @@ else:
     _register_fake = register_fake_no_op
 
 
-logger = get_logger(__name__)  # pylint: disable=invalid-name
-
 # TODO(aryan): Add support for the following:
 # - Sage Attention++
 # - block sparse, radial and other attention methods
@@ -265,6 +319,7 @@ class _HubKernelConfig:
     repo_id: str
     function_attr: str
     revision: str | None = None
+    version: int | None = None
    kernel_fn: Callable | None = None
     wrapped_forward_attr: str | None = None
     wrapped_backward_attr: str | None = None
@@ -274,27 +329,31 @@ class _HubKernelConfig:
 
 # Registry for hub-based attention kernels
 _HUB_KERNELS_REGISTRY: dict["AttentionBackendName", _HubKernelConfig] = {
-    # TODO: temporary revision for now. Remove when merged upstream into `main`.
     AttentionBackendName._FLASH_3_HUB: _HubKernelConfig(
-        repo_id="kernels-community/flash-attn3", function_attr="flash_attn_func", revision="fake-ops-return-probs"
+        repo_id="kernels-community/flash-attn3", function_attr="flash_attn_func", version=1
     ),
     AttentionBackendName._FLASH_3_VARLEN_HUB: _HubKernelConfig(
         repo_id="kernels-community/flash-attn3",
         function_attr="flash_attn_varlen_func",
-        # revision="fake-ops-return-probs",
+        version=1,
     ),
     AttentionBackendName.FLASH_HUB: _HubKernelConfig(
         repo_id="kernels-community/flash-attn2",
         function_attr="flash_attn_func",
+        version=1,
         revision=None,
         wrapped_forward_attr="flash_attn_interface._wrapped_flash_attn_forward",
         wrapped_backward_attr="flash_attn_interface._wrapped_flash_attn_backward",
     ),
     AttentionBackendName.FLASH_VARLEN_HUB: _HubKernelConfig(
-        repo_id="kernels-community/flash-attn2", function_attr="flash_attn_varlen_func", revision=None
+        repo_id="kernels-community/flash-attn2",
+        function_attr="flash_attn_varlen_func",
+        version=1,
     ),
     AttentionBackendName.SAGE_HUB: _HubKernelConfig(
-        repo_id="kernels-community/sage_attention", function_attr="sageattn", revision=None
+        repo_id="kernels-community/sage-attention",
+        function_attr="sageattn",
+        version=1,
     ),
 }
 
@@ -464,6 +523,10 @@ def _check_attention_backend_requirements(backend: AttentionBackendName) -> None:
             raise RuntimeError(
                 f"Backend '{backend.value}' is not usable because the `kernels` package isn't available. Please install it with `pip install kernels`."
             )
+        if not is_kernels_version(">=", "0.12"):
+            raise RuntimeError(
+                f"Backend '{backend.value}' needs to be used with a `kernels` version of at least 0.12. Please update with `pip install -U kernels`."
+            )
 
     elif backend == AttentionBackendName.AITER:
         if not _CAN_USE_AITER_ATTN: