update

2026-03-17 22:18:03 +08:00 · 2026-03-16 12:43:24 +01:00 · 2026-03-16 12:28:33 +01:00
13 changed files with 164 additions and 106 deletions
--- a/docs/source/en/_toctree.yml
+++ b/docs/source/en/_toctree.yml
@@ -22,8 +22,6 @@
    title: Reproducibility
  - local: using-diffusers/schedulers
    title: Schedulers
-  - local: using-diffusers/guiders
-    title: Guiders
  - local: using-diffusers/automodel
    title: AutoModel
  - local: using-diffusers/other-formats
@@ -112,6 +110,8 @@
    title: ModularPipeline
  - local: modular_diffusers/components_manager
    title: ComponentsManager
+  - local: modular_diffusers/guiders
+    title: Guiders
  - local: modular_diffusers/custom_blocks
    title: Building Custom Blocks
  - local: modular_diffusers/mellon
--- a/docs/source/en/api/pipelines/hunyuan_video15.md
+++ b/docs/source/en/api/pipelines/hunyuan_video15.md
@@ -99,7 +99,7 @@ To update guider configuration, you can run `pipe.guider = pipe.guider.new(...)`
 pipe.guider = pipe.guider.new(guidance_scale=5.0)
 ```

-Read more on Guider [here](../../using-diffusers/guiders).
+Read more on Guider [here](../../modular_diffusers/guiders).



--- a/docs/source/en/api/pipelines/hunyuanimage21.md
+++ b/docs/source/en/api/pipelines/hunyuanimage21.md
@@ -30,7 +30,7 @@ HunyuanImage-2.1 comes in the following variants:

 ## HunyuanImage-2.1

-HunyuanImage-2.1 applies [Adaptive Projected Guidance (APG)](https://huggingface.co/papers/2410.02416) combined with Classifier-Free Guidance (CFG) in the denoising loop. `HunyuanImagePipeline` has a `guider` component (read more about [Guider](../../using-diffusers/guiders)) and does not take a `guidance_scale` parameter at runtime. To change guider-related parameters, e.g., `guidance_scale`, you can update the `guider` configuration instead.
+HunyuanImage-2.1 applies [Adaptive Projected Guidance (APG)](https://huggingface.co/papers/2410.02416) combined with Classifier-Free Guidance (CFG) in the denoising loop. `HunyuanImagePipeline` has a `guider` component (read more about [Guider](../../modular_diffusers/guiders)) and does not take a `guidance_scale` parameter at runtime. To change guider-related parameters, e.g., `guidance_scale`, you can update the `guider` configuration instead.

 ```python
 import torch
--- a/docs/source/en/modular_diffusers/guiders.md
+++ b/docs/source/en/modular_diffusers/guiders.md
--- a/docs/source/en/modular_diffusers/modular_pipeline.md
+++ b/docs/source/en/modular_diffusers/modular_pipeline.md
@@ -338,7 +338,7 @@ guider = ClassifierFreeGuidance(guidance_scale=5.0)
 pipeline.update_components(guider=guider)
 ```

-See the [Guiders](../using-diffusers/guiders) guide for more details on available guiders and how to configure them.
+See the [Guiders](./guiders) guide for more details on available guiders and how to configure them.

 ## Splitting a pipeline into stages

--- a/docs/source/en/modular_diffusers/overview.md
+++ b/docs/source/en/modular_diffusers/overview.md
@@ -39,7 +39,7 @@ The Modular Diffusers docs are organized as shown below.

 - [ModularPipeline](./modular_pipeline) shows you how to create and convert pipeline blocks into an executable [`ModularPipeline`].
 - [ComponentsManager](./components_manager) shows you how to manage and reuse components across multiple pipelines.
-- [Guiders](../using-diffusers/guiders) shows you how to use different guidance methods in the pipeline.
+- [Guiders](./guiders) shows you how to use different guidance methods in the pipeline.

 ## Mellon Integration

--- a/docs/source/en/optimization/memory.md
+++ b/docs/source/en/optimization/memory.md
@@ -482,6 +482,144 @@ print(
 )  # (2880, 1, 960, 320) having a stride of 1 for the 2nd dimension proves that it works
 ```

+## torch.jit.trace
+
+[torch.jit.trace](https://pytorch.org/docs/stable/generated/torch.jit.trace.html) records the operations a model performs on a sample input and creates a new, optimized representation of the model based on the recorded execution path. During tracing, the model is optimized to reduce overhead from Python and dynamic control flow, and operations are fused together for more efficiency. The returned executable or [ScriptFunction](https://pytorch.org/docs/stable/generated/torch.jit.ScriptFunction.html) can be compiled.
+
+```py
+import time
+import torch
+from diffusers import StableDiffusionPipeline
+import functools
+
+# torch disable grad
+torch.set_grad_enabled(False)
+
+# set variables
+n_experiments = 2
+unet_runs_per_experiment = 50
+
+# load sample inputs
+def generate_inputs():
+    sample = torch.randn((2, 4, 64, 64), device="cuda", dtype=torch.float16)
+    timestep = torch.rand(1, device="cuda", dtype=torch.float16) * 999
+    encoder_hidden_states = torch.randn((2, 77, 768), device="cuda", dtype=torch.float16)
+    return sample, timestep, encoder_hidden_states
+
+
+pipeline = StableDiffusionPipeline.from_pretrained(
+    "stable-diffusion-v1-5/stable-diffusion-v1-5",
+    torch_dtype=torch.float16,
+    use_safetensors=True,
+).to("cuda")
+unet = pipeline.unet
+unet.eval()
+unet.to(memory_format=torch.channels_last)  # use channels_last memory format
+unet.forward = functools.partial(unet.forward, return_dict=False)  # set return_dict=False as default
+
+# warmup
+for _ in range(3):
+    with torch.inference_mode():
+        inputs = generate_inputs()
+        orig_output = unet(*inputs)
+
+# trace
+print("tracing..")
+unet_traced = torch.jit.trace(unet, inputs)
+unet_traced.eval()
+print("done tracing")
+
+# warmup and optimize graph
+for _ in range(5):
+    with torch.inference_mode():
+        inputs = generate_inputs()
+        orig_output = unet_traced(*inputs)
+
+# benchmarking
+with torch.inference_mode():
+    for _ in range(n_experiments):
+        torch.cuda.synchronize()
+        start_time = time.time()
+        for _ in range(unet_runs_per_experiment):
+            orig_output = unet_traced(*inputs)
+        torch.cuda.synchronize()
+        print(f"unet traced inference took {time.time() - start_time:.2f} seconds")
+    for _ in range(n_experiments):
+        torch.cuda.synchronize()
+        start_time = time.time()
+        for _ in range(unet_runs_per_experiment):
+            orig_output = unet(*inputs)
+        torch.cuda.synchronize()
+        print(f"unet inference took {time.time() - start_time:.2f} seconds")
+
+# save the model
+unet_traced.save("unet_traced.pt")
+```
+
+Replace the pipeline's UNet with the traced version.
+
+```py
+import torch
+from diffusers import StableDiffusionPipeline
+from dataclasses import dataclass
+
+@dataclass
+class UNet2DConditionOutput:
+    sample: torch.Tensor
+
+pipeline = StableDiffusionPipeline.from_pretrained(
+    "stable-diffusion-v1-5/stable-diffusion-v1-5",
+    torch_dtype=torch.float16,
+    use_safetensors=True,
+).to("cuda")
+
+# use jitted unet
+unet_traced = torch.jit.load("unet_traced.pt")
+
+# del pipeline.unet
+class TracedUNet(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.in_channels = pipeline.unet.config.in_channels
+        self.device = pipeline.unet.device
+
+    def forward(self, latent_model_input, t, encoder_hidden_states):
+        sample = unet_traced(latent_model_input, t, encoder_hidden_states)[0]
+        return UNet2DConditionOutput(sample=sample)
+
+pipeline.unet = TracedUNet()
+
+with torch.inference_mode():
+    image = pipeline([prompt] * 1, num_inference_steps=50).images[0]
+```
+
 ## Memory-efficient attention

-Diffusers supports multiple memory-efficient attention backends (FlashAttention, xFormers, SageAttention, and more) through [`~ModelMixin.set_attention_backend`]. Refer to the [Attention backends](./attention_backends) guide to learn how to switch between them.
+> [!TIP]
+> Memory-efficient attention optimizes for memory usage *and* [inference speed](./fp16#scaled-dot-product-attention)!
+
+The Transformer attention mechanism is memory-intensive, especially for long sequences, so you can try using different and more memory-efficient attention types.
+
+By default, if PyTorch >= 2.0 is installed, [scaled dot-product attention (SDPA)](https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html) is used. You don't need to make any additional changes to your code.
+
+SDPA supports [FlashAttention](https://github.com/Dao-AILab/flash-attention) and [xFormers](https://github.com/facebookresearch/xformers) as well as a native C++ PyTorch implementation. It automatically selects the optimal implementation based on your input.
+
+You can explicitly use xFormers with the [`~ModelMixin.enable_xformers_memory_efficient_attention`] method.
+
+```py
+# pip install xformers
+import torch
+from diffusers import StableDiffusionXLPipeline
+
+pipeline = StableDiffusionXLPipeline.from_pretrained(
+    "stabilityai/stable-diffusion-xl-base-1.0",
+    torch_dtype=torch.float16,
+).to("cuda")
+pipeline.enable_xformers_memory_efficient_attention()
+```
+
+Call [`~ModelMixin.disable_xformers_memory_efficient_attention`] to disable it.
+
+```py
+pipeline.disable_xformers_memory_efficient_attention()
+```
--- a/docs/source/en/optimization/xformers.md
+++ b/docs/source/en/optimization/xformers.md
@@ -23,7 +23,7 @@ pip install xformers
 > [!TIP]
 > The xFormers `pip` package requires the latest version of PyTorch. If you need to use a previous version of PyTorch, then we recommend [installing xFormers from the source](https://github.com/facebookresearch/xformers#installing-xformers).

-After xFormers is installed, you can use it with [`~ModelMixin.set_attention_backend`] as shown in the [Attention backends](./attention_backends) guide.
+After xFormers is installed, you can use `enable_xformers_memory_efficient_attention()` for faster inference and reduced memory consumption as shown in this [section](memory#memory-efficient-attention).

 > [!WARNING]
 > According to this [issue](https://github.com/huggingface/diffusers/issues/2234#issuecomment-1416931212), xFormers `v0.0.16` cannot be used for training (fine-tune or DreamBooth) in some GPUs. If you observe this problem, please install a development version as indicated in the issue comments.
--- a/docs/source/zh/_toctree.yml
+++ b/docs/source/zh/_toctree.yml
@@ -14,8 +14,6 @@
  sections:
  - local: using-diffusers/schedulers
    title: Load schedulers and models
-  - local: using-diffusers/guiders
-    title: Guiders

 - title: Inference
  isExpanded: false
@@ -82,6 +80,8 @@
    title: ModularPipeline
  - local: modular_diffusers/components_manager
    title: ComponentsManager
+  - local: modular_diffusers/guiders
+    title: Guiders

 - title: Training
  isExpanded: false
--- a/docs/source/zh/modular_diffusers/guiders.md
+++ b/docs/source/zh/modular_diffusers/guiders.md
--- a/src/diffusers/hooks/group_offloading.py
+++ b/src/diffusers/hooks/group_offloading.py
@@ -22,7 +22,7 @@ from typing import Set
 import safetensors.torch
 import torch

-from ..utils import get_logger, is_accelerate_available, is_torchao_available
+from ..utils import get_logger, is_accelerate_available
 from ._common import _GO_LC_SUPPORTED_PYTORCH_LAYERS
 from .hooks import HookRegistry, ModelHook

@@ -35,41 +35,6 @@ if is_accelerate_available():
 logger = get_logger(__name__)  # pylint: disable=invalid-name


-def _is_torchao_tensor(tensor: torch.Tensor) -> bool:
-    """Check if a tensor is a TorchAO quantized tensor subclass."""
-    if not is_torchao_available():
-        return False
-    from torchao.utils import TorchAOBaseTensor
-
-    return isinstance(tensor, TorchAOBaseTensor)
-
-
-def _get_torchao_inner_tensor_names(tensor: torch.Tensor) -> list[str]:
-    """Get names of all internal tensor data attributes from a TorchAO tensor."""
-    cls = type(tensor)
-    names = list(getattr(cls, "tensor_data_names", []))
-    for attr_name in getattr(cls, "optional_tensor_data_names", []):
-        if getattr(tensor, attr_name, None) is not None:
-            names.append(attr_name)
-    return names
-
-
-def _update_torchao_tensor_in_place(param: torch.Tensor, source: torch.Tensor) -> None:
-    """Update internal tensor data of a TorchAO parameter in-place from source.
-
-    Must operate on the parameter/buffer object directly (not ``param.data``) because ``_make_wrapper_subclass``
-    returns a fresh wrapper from ``.data`` each time, so attribute mutations on ``.data`` are lost.
-    """
-    for attr_name in _get_torchao_inner_tensor_names(source):
-        setattr(param, attr_name, getattr(source, attr_name))
-
-
-def _record_stream_torchao_tensor(param: torch.Tensor, stream) -> None:
-    """Record stream for all internal tensors of a TorchAO parameter."""
-    for attr_name in _get_torchao_inner_tensor_names(param):
-        getattr(param, attr_name).record_stream(stream)
-
-
 # fmt: off
 _GROUP_OFFLOADING = "group_offloading"
 _LAYER_EXECUTION_TRACKER = "layer_execution_tracker"
@@ -192,16 +157,9 @@ class ModuleGroup:
            pinned_dict = None

    def _transfer_tensor_to_device(self, tensor, source_tensor, default_stream):
-        moved = source_tensor.to(self.onload_device, non_blocking=self.non_blocking)
-        if _is_torchao_tensor(tensor):
-            _update_torchao_tensor_in_place(tensor, moved)
-        else:
-            tensor.data = moved
+        tensor.data = source_tensor.to(self.onload_device, non_blocking=self.non_blocking)
        if self.record_stream:
-            if _is_torchao_tensor(tensor):
-                _record_stream_torchao_tensor(tensor, default_stream)
-            else:
-                tensor.data.record_stream(default_stream)
+            tensor.data.record_stream(default_stream)

    def _process_tensors_from_modules(self, pinned_memory=None, default_stream=None):
        for group_module in self.modules:
@@ -287,35 +245,18 @@ class ModuleGroup:

            for group_module in self.modules:
                for param in group_module.parameters():
-                    if _is_torchao_tensor(param):
-                        _update_torchao_tensor_in_place(param, self.cpu_param_dict[param])
-                    else:
-                        param.data = self.cpu_param_dict[param]
-            for param in self.parameters:
-                if _is_torchao_tensor(param):
-                    _update_torchao_tensor_in_place(param, self.cpu_param_dict[param])
-                else:
                    param.data = self.cpu_param_dict[param]
+            for param in self.parameters:
+                param.data = self.cpu_param_dict[param]
            for buffer in self.buffers:
-                if _is_torchao_tensor(buffer):
-                    _update_torchao_tensor_in_place(buffer, self.cpu_param_dict[buffer])
-                else:
-                    buffer.data = self.cpu_param_dict[buffer]
+                buffer.data = self.cpu_param_dict[buffer]
        else:
            for group_module in self.modules:
                group_module.to(self.offload_device, non_blocking=False)
            for param in self.parameters:
-                if _is_torchao_tensor(param):
-                    moved = param.data.to(self.offload_device, non_blocking=False)
-                    _update_torchao_tensor_in_place(param, moved)
-                else:
-                    param.data = param.data.to(self.offload_device, non_blocking=False)
+                param.data = param.data.to(self.offload_device, non_blocking=False)
            for buffer in self.buffers:
-                if _is_torchao_tensor(buffer):
-                    moved = buffer.data.to(self.offload_device, non_blocking=False)
-                    _update_torchao_tensor_in_place(buffer, moved)
-                else:
-                    buffer.data = buffer.data.to(self.offload_device, non_blocking=False)
+                buffer.data = buffer.data.to(self.offload_device, non_blocking=False)

    @torch.compiler.disable()
    def onload_(self):
--- a/tests/models/testing_utils/parallelism.py
+++ b/tests/models/testing_utils/parallelism.py
@@ -26,17 +26,9 @@ from diffusers.models._modeling_parallel import ContextParallelConfig
 from ...testing_utils import (
    is_context_parallel,
    require_torch_multi_accelerator,
-    torch_device,
 )


-# Device configuration mapping
-DEVICE_CONFIG = {
-    "cuda": {"backend": "nccl", "module": torch.cuda},
-    "xpu": {"backend": "xccl", "module": torch.xpu},
-}
-
-
 def _find_free_port():
    """Find a free port on localhost."""
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
@@ -55,17 +47,12 @@ def _context_parallel_worker(rank, world_size, master_port, model_class, init_di
        os.environ["RANK"] = str(rank)
        os.environ["WORLD_SIZE"] = str(world_size)

-        # Get device configuration
-        device_config = DEVICE_CONFIG.get(torch_device, DEVICE_CONFIG["cuda"])
-        backend = device_config["backend"]
-        device_module = device_config["module"]
-
        # Initialize process group
-        dist.init_process_group(backend=backend, rank=rank, world_size=world_size)
+        dist.init_process_group(backend="nccl", rank=rank, world_size=world_size)

        # Set device for this process
-        device_module.set_device(rank)
-        device = torch.device(f"{torch_device}:{rank}")
+        torch.cuda.set_device(rank)
+        device = torch.device(f"cuda:{rank}")

        # Create model
        model = model_class(**init_dict)
@@ -116,16 +103,10 @@ def _custom_mesh_worker(
        os.environ["RANK"] = str(rank)
        os.environ["WORLD_SIZE"] = str(world_size)

-        # Get device configuration
-        device_config = DEVICE_CONFIG.get(torch_device, DEVICE_CONFIG["cuda"])
-        backend = device_config["backend"]
-        device_module = device_config["module"]
+        dist.init_process_group(backend="nccl", rank=rank, world_size=world_size)

-        dist.init_process_group(backend=backend, rank=rank, world_size=world_size)
-
-        # Set device for this process
-        device_module.set_device(rank)
-        device = torch.device(f"{torch_device}:{rank}")
+        torch.cuda.set_device(rank)
+        device = torch.device(f"cuda:{rank}")

        model = model_class(**init_dict)
        model.to(device)
@@ -135,7 +116,7 @@ def _custom_mesh_worker(

        # DeviceMesh must be created after init_process_group, inside each worker process.
        mesh = torch.distributed.device_mesh.init_device_mesh(
-            torch_device, mesh_shape=mesh_shape, mesh_dim_names=mesh_dim_names
+            "cuda", mesh_shape=mesh_shape, mesh_dim_names=mesh_dim_names
        )
        cp_config = ContextParallelConfig(**cp_dict, mesh=mesh)
        model.enable_parallelism(config=cp_config)
--- a/tests/pipelines/hunyuan_video/test_hunyuan_image2video.py
+++ b/tests/pipelines/hunyuan_video/test_hunyuan_image2video.py
@@ -139,9 +139,7 @@ class HunyuanVideoImageToVideoPipelineFastTests(
            num_hidden_layers=2,
            image_size=224,
        )
-        llava_text_encoder_config = LlavaConfig(
-            vision_config=vision_config, text_config=text_config, pad_token_id=100, image_token_index=101
-        )
+        llava_text_encoder_config = LlavaConfig(vision_config, text_config, pad_token_id=100, image_token_index=101)

        clip_text_encoder_config = CLIPTextConfig(
            bos_token_id=0,
Author	SHA1	Message	Date
Dhruv Nair	514dd552d4	update	2026-03-16 12:43:24 +01:00
Dhruv Nair	0d87803e80	update	2026-03-16 12:28:33 +01:00