update

2026-03-17 22:18:03 +08:00 · 2026-03-16 12:43:24 +01:00 · 2026-03-16 12:28:33 +01:00
14 changed files with 234 additions and 579 deletions
--- a/docs/source/en/_toctree.yml
+++ b/docs/source/en/_toctree.yml
@@ -22,8 +22,6 @@
    title: Reproducibility
  - local: using-diffusers/schedulers
    title: Schedulers
-  - local: using-diffusers/guiders
-    title: Guiders
  - local: using-diffusers/automodel
    title: AutoModel
  - local: using-diffusers/other-formats
@@ -112,6 +110,8 @@
    title: ModularPipeline
  - local: modular_diffusers/components_manager
    title: ComponentsManager
+  - local: modular_diffusers/guiders
+    title: Guiders
  - local: modular_diffusers/custom_blocks
    title: Building Custom Blocks
  - local: modular_diffusers/mellon
--- a/docs/source/en/api/pipelines/hunyuan_video15.md
+++ b/docs/source/en/api/pipelines/hunyuan_video15.md
@@ -99,7 +99,7 @@ To update guider configuration, you can run `pipe.guider = pipe.guider.new(...)`
 pipe.guider = pipe.guider.new(guidance_scale=5.0)
 ```

-Read more on Guider [here](../../using-diffusers/guiders).
+Read more on Guider [here](../../modular_diffusers/guiders).



--- a/docs/source/en/api/pipelines/hunyuanimage21.md
+++ b/docs/source/en/api/pipelines/hunyuanimage21.md
@@ -30,7 +30,7 @@ HunyuanImage-2.1 comes in the following variants:

 ## HunyuanImage-2.1

-HunyuanImage-2.1 applies [Adaptive Projected Guidance (APG)](https://huggingface.co/papers/2410.02416) combined with Classifier-Free Guidance (CFG) in the denoising loop. `HunyuanImagePipeline` has a `guider` component (read more about [Guider](../../using-diffusers/guiders)) and does not take a `guidance_scale` parameter at runtime. To change guider-related parameters, e.g., `guidance_scale`, you can update the `guider` configuration instead.
+HunyuanImage-2.1 applies [Adaptive Projected Guidance (APG)](https://huggingface.co/papers/2410.02416) combined with Classifier-Free Guidance (CFG) in the denoising loop. `HunyuanImagePipeline` has a `guider` component (read more about [Guider](../../modular_diffusers/guiders)) and does not take a `guidance_scale` parameter at runtime. To change guider-related parameters, e.g., `guidance_scale`, you can update the `guider` configuration instead.

 ```python
 import torch
--- a/docs/source/en/modular_diffusers/guiders.md
+++ b/docs/source/en/modular_diffusers/guiders.md
--- a/docs/source/en/modular_diffusers/modular_pipeline.md
+++ b/docs/source/en/modular_diffusers/modular_pipeline.md
@@ -338,7 +338,7 @@ guider = ClassifierFreeGuidance(guidance_scale=5.0)
 pipeline.update_components(guider=guider)
 ```

-See the [Guiders](../using-diffusers/guiders) guide for more details on available guiders and how to configure them.
+See the [Guiders](./guiders) guide for more details on available guiders and how to configure them.

 ## Splitting a pipeline into stages

--- a/docs/source/en/modular_diffusers/overview.md
+++ b/docs/source/en/modular_diffusers/overview.md
@@ -39,7 +39,7 @@ The Modular Diffusers docs are organized as shown below.

 - [ModularPipeline](./modular_pipeline) shows you how to create and convert pipeline blocks into an executable [`ModularPipeline`].
 - [ComponentsManager](./components_manager) shows you how to manage and reuse components across multiple pipelines.
- [Guiders](../using-diffusers/guiders) shows you how to use different guidance methods in the pipeline.
+- [Guiders](./guiders) shows you how to use different guidance methods in the pipeline.

 ## Mellon Integration

--- a/docs/source/en/optimization/memory.md
+++ b/docs/source/en/optimization/memory.md
@@ -482,6 +482,144 @@ print(
 )  # (2880, 1, 960, 320) having a stride of 1 for the 2nd dimension proves that it works
 ```

+## torch.jit.trace
+
+[torch.jit.trace](https://pytorch.org/docs/stable/generated/torch.jit.trace.html) records the operations a model performs on a sample input and creates a new, optimized representation of the model based on the recorded execution path. During tracing, the model is optimized to reduce overhead from Python and dynamic control flow, and operations are fused together for greater efficiency. The returned executable or [ScriptFunction](https://pytorch.org/docs/stable/generated/torch.jit.ScriptFunction.html) can be compiled.
+
+```py
+import time
+import torch
+from diffusers import StableDiffusionPipeline
+import functools
+
+# torch disable grad
+torch.set_grad_enabled(False)
+
+# set variables
+n_experiments = 2
+unet_runs_per_experiment = 50
+
+# load sample inputs
+def generate_inputs():
+    sample = torch.randn((2, 4, 64, 64), device="cuda", dtype=torch.float16)
+    timestep = torch.rand(1, device="cuda", dtype=torch.float16) * 999
+    encoder_hidden_states = torch.randn((2, 77, 768), device="cuda", dtype=torch.float16)
+    return sample, timestep, encoder_hidden_states
+
+
+pipeline = StableDiffusionPipeline.from_pretrained(
+    "stable-diffusion-v1-5/stable-diffusion-v1-5",
+    torch_dtype=torch.float16,
+    use_safetensors=True,
+).to("cuda")
+unet = pipeline.unet
+unet.eval()
+unet.to(memory_format=torch.channels_last)  # use channels_last memory format
+unet.forward = functools.partial(unet.forward, return_dict=False)  # set return_dict=False as default
+
+# warmup
+for _ in range(3):
+    with torch.inference_mode():
+        inputs = generate_inputs()
+        orig_output = unet(*inputs)
+
+# trace
+print("tracing..")
+unet_traced = torch.jit.trace(unet, inputs)
+unet_traced.eval()
+print("done tracing")
+
+# warmup and optimize graph
+for _ in range(5):
+    with torch.inference_mode():
+        inputs = generate_inputs()
+        orig_output = unet_traced(*inputs)
+
+# benchmarking
+with torch.inference_mode():
+    for _ in range(n_experiments):
+        torch.cuda.synchronize()
+        start_time = time.time()
+        for _ in range(unet_runs_per_experiment):
+            orig_output = unet_traced(*inputs)
+        torch.cuda.synchronize()
+        print(f"unet traced inference took {time.time() - start_time:.2f} seconds")
+    for _ in range(n_experiments):
+        torch.cuda.synchronize()
+        start_time = time.time()
+        for _ in range(unet_runs_per_experiment):
+            orig_output = unet(*inputs)
+        torch.cuda.synchronize()
+        print(f"unet inference took {time.time() - start_time:.2f} seconds")
+
+# save the model
+unet_traced.save("unet_traced.pt")
+```
+
+Replace the pipeline's UNet with the traced version.
+
+```py
+import torch
+from diffusers import StableDiffusionPipeline
+from dataclasses import dataclass
+
+@dataclass
+class UNet2DConditionOutput:
+    sample: torch.Tensor
+
+pipeline = StableDiffusionPipeline.from_pretrained(
+    "stable-diffusion-v1-5/stable-diffusion-v1-5",
+    torch_dtype=torch.float16,
+    use_safetensors=True,
+).to("cuda")
+
+# use jitted unet
+unet_traced = torch.jit.load("unet_traced.pt")
+
+# del pipeline.unet
+class TracedUNet(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.in_channels = pipeline.unet.config.in_channels
+        self.device = pipeline.unet.device
+
+    def forward(self, latent_model_input, t, encoder_hidden_states):
+        sample = unet_traced(latent_model_input, t, encoder_hidden_states)[0]
+        return UNet2DConditionOutput(sample=sample)
+
+pipeline.unet = TracedUNet()
+
+with torch.inference_mode():
+    image = pipeline([prompt] * 1, num_inference_steps=50).images[0]
+```
+
 ## Memory-efficient attention

-Diffusers supports multiple memory-efficient attention backends (FlashAttention, xFormers, SageAttention, and more) through [`~ModelMixin.set_attention_backend`]. Refer to the [Attention backends](./attention_backends) guide to learn how to switch between them.
+> [!TIP]
+> Memory-efficient attention optimizes for memory usage *and* [inference speed](./fp16#scaled-dot-product-attention)!
+
+The Transformers attention mechanism is memory-intensive, especially for long sequences, so you can try using different and more memory-efficient attention types.
+
+By default, if PyTorch >= 2.0 is installed, [scaled dot-product attention (SDPA)](https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html) is used. You don't need to make any additional changes to your code.
+
+SDPA supports [FlashAttention](https://github.com/Dao-AILab/flash-attention) and [xFormers](https://github.com/facebookresearch/xformers) as well as a native C++ PyTorch implementation. It automatically selects the optimal implementation based on your input.
+
+You can explicitly use xFormers with the [`~ModelMixin.enable_xformers_memory_efficient_attention`] method.
+
+```py
+# pip install xformers
+import torch
+from diffusers import StableDiffusionXLPipeline
+
+pipeline = StableDiffusionXLPipeline.from_pretrained(
+    "stabilityai/stable-diffusion-xl-base-1.0",
+    torch_dtype=torch.float16,
+).to("cuda")
+pipeline.enable_xformers_memory_efficient_attention()
+```
+
+Call [`~ModelMixin.disable_xformers_memory_efficient_attention`] to disable it.
+
+```py
+pipeline.disable_xformers_memory_efficient_attention()
+```
--- a/docs/source/en/optimization/xformers.md
+++ b/docs/source/en/optimization/xformers.md
@@ -23,7 +23,7 @@ pip install xformers
 > [!TIP]
 > The xFormers `pip` package requires the latest version of PyTorch. If you need to use a previous version of PyTorch, then we recommend [installing xFormers from the source](https://github.com/facebookresearch/xformers#installing-xformers).

-After xFormers is installed, you can use it with [`~ModelMixin.set_attention_backend`] as shown in the [Attention backends](./attention_backends) guide.
+After xFormers is installed, you can use `enable_xformers_memory_efficient_attention()` for faster inference and reduced memory consumption as shown in this [section](memory#memory-efficient-attention).

 > [!WARNING]
 > According to this [issue](https://github.com/huggingface/diffusers/issues/2234#issuecomment-1416931212), xFormers `v0.0.16` cannot be used for training (fine-tune or DreamBooth) in some GPUs. If you observe this problem, please install a development version as indicated in the issue comments.
--- a/docs/source/zh/_toctree.yml
+++ b/docs/source/zh/_toctree.yml
@@ -14,8 +14,6 @@
  sections:
  - local: using-diffusers/schedulers
    title: Load schedulers and models
-  - local: using-diffusers/guiders
-    title: Guiders

 - title: Inference
  isExpanded: false
@@ -82,6 +80,8 @@
    title: ModularPipeline
  - local: modular_diffusers/components_manager
    title: ComponentsManager
+  - local: modular_diffusers/guiders
+    title: Guiders

 - title: Training
  isExpanded: false
--- a/docs/source/zh/modular_diffusers/guiders.md
+++ b/docs/source/zh/modular_diffusers/guiders.md
--- a/tests/models/testing_utils/parallelism.py
+++ b/tests/models/testing_utils/parallelism.py
@@ -26,17 +26,9 @@ from diffusers.models._modeling_parallel import ContextParallelConfig
 from ...testing_utils import (
    is_context_parallel,
    require_torch_multi_accelerator,
-    torch_device,
 )


-# Device configuration mapping
-DEVICE_CONFIG = {
-    "cuda": {"backend": "nccl", "module": torch.cuda},
-    "xpu": {"backend": "xccl", "module": torch.xpu},
-}
-
-
 def _find_free_port():
    """Find a free port on localhost."""
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
@@ -55,17 +47,12 @@ def _context_parallel_worker(rank, world_size, master_port, model_class, init_di
        os.environ["RANK"] = str(rank)
        os.environ["WORLD_SIZE"] = str(world_size)

-        # Get device configuration
-        device_config = DEVICE_CONFIG.get(torch_device, DEVICE_CONFIG["cuda"])
-        backend = device_config["backend"]
-        device_module = device_config["module"]
-
        # Initialize process group
-        dist.init_process_group(backend=backend, rank=rank, world_size=world_size)
+        dist.init_process_group(backend="nccl", rank=rank, world_size=world_size)

        # Set device for this process
-        device_module.set_device(rank)
-        device = torch.device(f"{torch_device}:{rank}")
+        torch.cuda.set_device(rank)
+        device = torch.device(f"cuda:{rank}")

        # Create model
        model = model_class(**init_dict)
@@ -116,16 +103,10 @@ def _custom_mesh_worker(
        os.environ["RANK"] = str(rank)
        os.environ["WORLD_SIZE"] = str(world_size)

-        # Get device configuration
-        device_config = DEVICE_CONFIG.get(torch_device, DEVICE_CONFIG["cuda"])
-        backend = device_config["backend"]
-        device_module = device_config["module"]
+        dist.init_process_group(backend="nccl", rank=rank, world_size=world_size)

-        dist.init_process_group(backend=backend, rank=rank, world_size=world_size)
-
-        # Set device for this process
-        device_module.set_device(rank)
-        device = torch.device(f"{torch_device}:{rank}")
+        torch.cuda.set_device(rank)
+        device = torch.device(f"cuda:{rank}")

        model = model_class(**init_dict)
        model.to(device)
@@ -135,7 +116,7 @@ def _custom_mesh_worker(

        # DeviceMesh must be created after init_process_group, inside each worker process.
        mesh = torch.distributed.device_mesh.init_device_mesh(
-            torch_device, mesh_shape=mesh_shape, mesh_dim_names=mesh_dim_names
+            "cuda", mesh_shape=mesh_shape, mesh_dim_names=mesh_dim_names
        )
        cp_config = ContextParallelConfig(**cp_dict, mesh=mesh)
        model.enable_parallelism(config=cp_config)
--- a/tests/models/transformers/test_models_transformer_flux.py
+++ b/tests/models/transformers/test_models_transformer_flux.py
@@ -41,6 +41,7 @@ from ..testing_utils import (
    ModelOptCompileTesterMixin,
    ModelOptTesterMixin,
    ModelTesterMixin,
+    PyramidAttentionBroadcastTesterMixin,
    QuantoCompileTesterMixin,
    QuantoTesterMixin,
    SingleFileTesterMixin,
@@ -218,10 +219,6 @@ class TestFluxTransformerMemory(FluxTransformerTesterConfig, MemoryTesterMixin):
 class TestFluxTransformerTraining(FluxTransformerTesterConfig, TrainingTesterMixin):
    """Training tests for Flux Transformer."""

-    def test_gradient_checkpointing_is_applied(self):
-        expected_set = {"FluxTransformer2DModel"}
-        super().test_gradient_checkpointing_is_applied(expected_set=expected_set)
-

 class TestFluxTransformerAttention(FluxTransformerTesterConfig, AttentionTesterMixin):
    """Attention processor tests for Flux Transformer."""
@@ -415,6 +412,10 @@ class TestFluxTransformerBitsAndBytesCompile(FluxTransformerTesterConfig, BitsAn
    """BitsAndBytes + compile tests for Flux Transformer."""


+class TestFluxTransformerPABCache(FluxTransformerTesterConfig, PyramidAttentionBroadcastTesterMixin):
+    """PyramidAttentionBroadcast cache tests for Flux Transformer."""
+
+
 class TestFluxTransformerFBCCache(FluxTransformerTesterConfig, FirstBlockCacheTesterMixin):
    """FirstBlockCache tests for Flux Transformer."""

--- a/tests/models/transformers/test_models_transformer_flux2.py
+++ b/tests/models/transformers/test_models_transformer_flux2.py
@@ -13,95 +13,48 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+import unittest
+
 import torch

-from diffusers import Flux2Transformer2DModel
-from diffusers.models.transformers.transformer_flux2 import (
-    Flux2KVAttnProcessor,
-    Flux2KVCache,
-    Flux2KVLayerCache,
-    Flux2KVParallelSelfAttnProcessor,
-)
-from diffusers.utils.torch_utils import randn_tensor
+from diffusers import Flux2Transformer2DModel, attention_backend

 from ...testing_utils import enable_full_determinism, torch_device
-from ..testing_utils import (
-    AttentionTesterMixin,
-    BaseModelTesterConfig,
-    BitsAndBytesTesterMixin,
-    ContextParallelTesterMixin,
-    GGUFCompileTesterMixin,
-    GGUFTesterMixin,
-    LoraHotSwappingForModelTesterMixin,
-    LoraTesterMixin,
-    MemoryTesterMixin,
-    ModelTesterMixin,
-    TorchAoCompileTesterMixin,
-    TorchAoTesterMixin,
-    TorchCompileTesterMixin,
-    TrainingTesterMixin,
-)
+from ..test_modeling_common import LoraHotSwappingForModelTesterMixin, ModelTesterMixin, TorchCompileTesterMixin


 enable_full_determinism()


-class Flux2TransformerTesterConfig(BaseModelTesterConfig):
-    @property
-    def model_class(self):
-        return Flux2Transformer2DModel
+class Flux2TransformerTests(ModelTesterMixin, unittest.TestCase):
+    model_class = Flux2Transformer2DModel
+    main_input_name = "hidden_states"
+    # We override the items here because the transformer under consideration is small.
+    model_split_percents = [0.7, 0.6, 0.6]
+
+    # Skip setting testing with default: AttnProcessor
+    uses_custom_attn_processor = True

    @property
-    def output_shape(self) -> tuple[int, int]:
+    def dummy_input(self):
+        return self.prepare_dummy_input()
+
+    @property
+    def input_shape(self):
        return (16, 4)

    @property
-    def input_shape(self) -> tuple[int, int]:
+    def output_shape(self):
        return (16, 4)

-    @property
-    def model_split_percents(self) -> list:
-        # We override the items here because the transformer under consideration is small.
-        return [0.7, 0.6, 0.6]
-
-    @property
-    def main_input_name(self) -> str:
-        return "hidden_states"
-
-    @property
-    def uses_custom_attn_processor(self) -> bool:
-        # Skip setting testing with default: AttnProcessor
-        return True
-
-    @property
-    def generator(self):
-        return torch.Generator("cpu").manual_seed(0)
-
-    def get_init_dict(self) -> dict[str, int | list[int]]:
-        return {
-            "patch_size": 1,
-            "in_channels": 4,
-            "num_layers": 1,
-            "num_single_layers": 1,
-            "attention_head_dim": 16,
-            "num_attention_heads": 2,
-            "joint_attention_dim": 32,
-            "timestep_guidance_channels": 256,  # Hardcoded in original code
-            "axes_dims_rope": [4, 4, 4, 4],
-        }
-
-    def get_dummy_inputs(self, height: int = 4, width: int = 4) -> dict[str, torch.Tensor]:
+    def prepare_dummy_input(self, height=4, width=4):
        batch_size = 1
        num_latent_channels = 4
        sequence_length = 48
        embedding_dim = 32

-        hidden_states = randn_tensor(
-            (batch_size, height * width, num_latent_channels), generator=self.generator, device=torch_device
-        )
-        encoder_hidden_states = randn_tensor(
-            (batch_size, sequence_length, embedding_dim), generator=self.generator, device=torch_device
-        )
+        hidden_states = torch.randn((batch_size, height * width, num_latent_channels)).to(torch_device)
+        encoder_hidden_states = torch.randn((batch_size, sequence_length, embedding_dim)).to(torch_device)

        t_coords = torch.arange(1)
        h_coords = torch.arange(height)
@@ -129,286 +82,8 @@ class Flux2TransformerTesterConfig(BaseModelTesterConfig):
            "guidance": guidance,
        }

-
-class TestFlux2Transformer(Flux2TransformerTesterConfig, ModelTesterMixin):
-    pass
-
-
-class TestFlux2TransformerMemory(Flux2TransformerTesterConfig, MemoryTesterMixin):
-    """Memory optimization tests for Flux2 Transformer."""
-
-
-class TestFlux2TransformerTraining(Flux2TransformerTesterConfig, TrainingTesterMixin):
-    """Training tests for Flux2 Transformer."""
-
-    def test_gradient_checkpointing_is_applied(self):
-        expected_set = {"Flux2Transformer2DModel"}
-        super().test_gradient_checkpointing_is_applied(expected_set=expected_set)
-
-
-class TestFlux2TransformerAttention(Flux2TransformerTesterConfig, AttentionTesterMixin):
-    """Attention processor tests for Flux2 Transformer."""
-
-
-class TestFlux2TransformerContextParallel(Flux2TransformerTesterConfig, ContextParallelTesterMixin):
-    """Context Parallel inference tests for Flux2 Transformer."""
-
-
-class TestFlux2TransformerLoRA(Flux2TransformerTesterConfig, LoraTesterMixin):
-    """LoRA adapter tests for Flux2 Transformer."""
-
-
-class TestFlux2TransformerLoRAHotSwap(Flux2TransformerTesterConfig, LoraHotSwappingForModelTesterMixin):
-    """LoRA hot-swapping tests for Flux2 Transformer."""
-
-    @property
-    def different_shapes_for_compilation(self):
-        return [(4, 4), (4, 8), (8, 8)]
-
-    def get_dummy_inputs(self, height: int = 4, width: int = 4) -> dict[str, torch.Tensor]:
-        """Override to support dynamic height/width for LoRA hotswap tests."""
-        batch_size = 1
-        num_latent_channels = 4
-        sequence_length = 48
-        embedding_dim = 32
-
-        hidden_states = randn_tensor(
-            (batch_size, height * width, num_latent_channels), generator=self.generator, device=torch_device
-        )
-        encoder_hidden_states = randn_tensor(
-            (batch_size, sequence_length, embedding_dim), generator=self.generator, device=torch_device
-        )
-
-        t_coords = torch.arange(1)
-        h_coords = torch.arange(height)
-        w_coords = torch.arange(width)
-        l_coords = torch.arange(1)
-        image_ids = torch.cartesian_prod(t_coords, h_coords, w_coords, l_coords)
-        image_ids = image_ids.unsqueeze(0).expand(batch_size, -1, -1).to(torch_device)
-
-        text_t_coords = torch.arange(1)
-        text_h_coords = torch.arange(1)
-        text_w_coords = torch.arange(1)
-        text_l_coords = torch.arange(sequence_length)
-        text_ids = torch.cartesian_prod(text_t_coords, text_h_coords, text_w_coords, text_l_coords)
-        text_ids = text_ids.unsqueeze(0).expand(batch_size, -1, -1).to(torch_device)
-
-        timestep = torch.tensor([1.0]).to(torch_device).expand(batch_size)
-        guidance = torch.tensor([1.0]).to(torch_device).expand(batch_size)
-
-        return {
-            "hidden_states": hidden_states,
-            "encoder_hidden_states": encoder_hidden_states,
-            "img_ids": image_ids,
-            "txt_ids": text_ids,
-            "timestep": timestep,
-            "guidance": guidance,
-        }
-
-
-class TestFlux2TransformerCompile(Flux2TransformerTesterConfig, TorchCompileTesterMixin):
-    @property
-    def different_shapes_for_compilation(self):
-        return [(4, 4), (4, 8), (8, 8)]
-
-    def get_dummy_inputs(self, height: int = 4, width: int = 4) -> dict[str, torch.Tensor]:
-        """Override to support dynamic height/width for compilation tests."""
-        batch_size = 1
-        num_latent_channels = 4
-        sequence_length = 48
-        embedding_dim = 32
-
-        hidden_states = randn_tensor(
-            (batch_size, height * width, num_latent_channels), generator=self.generator, device=torch_device
-        )
-        encoder_hidden_states = randn_tensor(
-            (batch_size, sequence_length, embedding_dim), generator=self.generator, device=torch_device
-        )
-
-        t_coords = torch.arange(1)
-        h_coords = torch.arange(height)
-        w_coords = torch.arange(width)
-        l_coords = torch.arange(1)
-        image_ids = torch.cartesian_prod(t_coords, h_coords, w_coords, l_coords)
-        image_ids = image_ids.unsqueeze(0).expand(batch_size, -1, -1).to(torch_device)
-
-        text_t_coords = torch.arange(1)
-        text_h_coords = torch.arange(1)
-        text_w_coords = torch.arange(1)
-        text_l_coords = torch.arange(sequence_length)
-        text_ids = torch.cartesian_prod(text_t_coords, text_h_coords, text_w_coords, text_l_coords)
-        text_ids = text_ids.unsqueeze(0).expand(batch_size, -1, -1).to(torch_device)
-
-        timestep = torch.tensor([1.0]).to(torch_device).expand(batch_size)
-        guidance = torch.tensor([1.0]).to(torch_device).expand(batch_size)
-
-        return {
-            "hidden_states": hidden_states,
-            "encoder_hidden_states": encoder_hidden_states,
-            "img_ids": image_ids,
-            "txt_ids": text_ids,
-            "timestep": timestep,
-            "guidance": guidance,
-        }
-
-
-class TestFlux2TransformerBitsAndBytes(Flux2TransformerTesterConfig, BitsAndBytesTesterMixin):
-    """BitsAndBytes quantization tests for Flux2 Transformer."""
-
-
-class TestFlux2TransformerTorchAo(Flux2TransformerTesterConfig, TorchAoTesterMixin):
-    """TorchAO quantization tests for Flux2 Transformer."""
-
-
-class TestFlux2TransformerGGUF(Flux2TransformerTesterConfig, GGUFTesterMixin):
-    """GGUF quantization tests for Flux2 Transformer."""
-
-    @property
-    def gguf_filename(self):
-        return "https://huggingface.co/unsloth/FLUX.2-dev-GGUF/blob/main/flux2-dev-Q2_K.gguf"
-
-    @property
-    def torch_dtype(self):
-        return torch.bfloat16
-
-    def get_dummy_inputs(self):
-        """Override to provide inputs matching the real FLUX2 model dimensions.
-
-        Flux2 defaults: in_channels=128, joint_attention_dim=15360
-        """
-        batch_size = 1
-        height = 64
-        width = 64
-        sequence_length = 512
-
-        hidden_states = randn_tensor(
-            (batch_size, height * width, 128), generator=self.generator, device=torch_device, dtype=self.torch_dtype
-        )
-        encoder_hidden_states = randn_tensor(
-            (batch_size, sequence_length, 15360), generator=self.generator, device=torch_device, dtype=self.torch_dtype
-        )
-
-        # Flux2 uses 4D image/text IDs (t, h, w, l)
-        t_coords = torch.arange(1)
-        h_coords = torch.arange(height)
-        w_coords = torch.arange(width)
-        l_coords = torch.arange(1)
-        image_ids = torch.cartesian_prod(t_coords, h_coords, w_coords, l_coords)
-        image_ids = image_ids.unsqueeze(0).expand(batch_size, -1, -1).to(torch_device)
-
-        text_t_coords = torch.arange(1)
-        text_h_coords = torch.arange(1)
-        text_w_coords = torch.arange(1)
-        text_l_coords = torch.arange(sequence_length)
-        text_ids = torch.cartesian_prod(text_t_coords, text_h_coords, text_w_coords, text_l_coords)
-        text_ids = text_ids.unsqueeze(0).expand(batch_size, -1, -1).to(torch_device)
-
-        timestep = torch.tensor([1.0]).to(torch_device, self.torch_dtype)
-        guidance = torch.tensor([3.5]).to(torch_device, self.torch_dtype)
-
-        return {
-            "hidden_states": hidden_states,
-            "encoder_hidden_states": encoder_hidden_states,
-            "img_ids": image_ids,
-            "txt_ids": text_ids,
-            "timestep": timestep,
-            "guidance": guidance,
-        }
-
-
-class TestFlux2TransformerTorchAoCompile(Flux2TransformerTesterConfig, TorchAoCompileTesterMixin):
-    """TorchAO + compile tests for Flux2 Transformer."""
-
-
-class TestFlux2TransformerGGUFCompile(Flux2TransformerTesterConfig, GGUFCompileTesterMixin):
-    """GGUF + compile tests for Flux2 Transformer."""
-
-    @property
-    def gguf_filename(self):
-        return "https://huggingface.co/unsloth/FLUX.2-dev-GGUF/blob/main/flux2-dev-Q2_K.gguf"
-
-    @property
-    def torch_dtype(self):
-        return torch.bfloat16
-
-    def get_dummy_inputs(self):
-        """Override to provide inputs matching the real FLUX2 model dimensions.
-
-        Flux2 defaults: in_channels=128, joint_attention_dim=15360
-        """
-        batch_size = 1
-        height = 64
-        width = 64
-        sequence_length = 512
-
-        hidden_states = randn_tensor(
-            (batch_size, height * width, 128), generator=self.generator, device=torch_device, dtype=self.torch_dtype
-        )
-        encoder_hidden_states = randn_tensor(
-            (batch_size, sequence_length, 15360), generator=self.generator, device=torch_device, dtype=self.torch_dtype
-        )
-
-        # Flux2 uses 4D image/text IDs (t, h, w, l)
-        t_coords = torch.arange(1)
-        h_coords = torch.arange(height)
-        w_coords = torch.arange(width)
-        l_coords = torch.arange(1)
-        image_ids = torch.cartesian_prod(t_coords, h_coords, w_coords, l_coords)
-        image_ids = image_ids.unsqueeze(0).expand(batch_size, -1, -1).to(torch_device)
-
-        text_t_coords = torch.arange(1)
-        text_h_coords = torch.arange(1)
-        text_w_coords = torch.arange(1)
-        text_l_coords = torch.arange(sequence_length)
-        text_ids = torch.cartesian_prod(text_t_coords, text_h_coords, text_w_coords, text_l_coords)
-        text_ids = text_ids.unsqueeze(0).expand(batch_size, -1, -1).to(torch_device)
-
-        timestep = torch.tensor([1.0]).to(torch_device, self.torch_dtype)
-        guidance = torch.tensor([3.5]).to(torch_device, self.torch_dtype)
-
-        return {
-            "hidden_states": hidden_states,
-            "encoder_hidden_states": encoder_hidden_states,
-            "img_ids": image_ids,
-            "txt_ids": text_ids,
-            "timestep": timestep,
-            "guidance": guidance,
-        }
-
-
-class Flux2TransformerKVCacheTesterConfig(BaseModelTesterConfig):
-    num_ref_tokens = 4
-
-    @property
-    def model_class(self):
-        return Flux2Transformer2DModel
-
-    @property
-    def output_shape(self) -> tuple[int, int]:
-        return (16, 4)
-
-    @property
-    def input_shape(self) -> tuple[int, int]:
-        return (16, 4)
-
-    @property
-    def model_split_percents(self) -> list:
-        return [0.7, 0.6, 0.6]
-
-    @property
-    def main_input_name(self) -> str:
-        return "hidden_states"
-
-    @property
-    def uses_custom_attn_processor(self) -> bool:
-        return True
-
-    @property
-    def generator(self):
-        return torch.Generator("cpu").manual_seed(0)
-
-    def get_init_dict(self) -> dict[str, int | list[int]]:
-        return {
+    def prepare_init_args_and_inputs_for_common(self):
+        init_dict = {
            "patch_size": 1,
            "in_channels": 4,
            "num_layers": 1,
@@ -416,210 +91,72 @@ class Flux2TransformerKVCacheTesterConfig(BaseModelTesterConfig):
            "attention_head_dim": 16,
            "num_attention_heads": 2,
            "joint_attention_dim": 32,
-            "timestep_guidance_channels": 256,
+            "timestep_guidance_channels": 256,  # Hardcoded in original code
            "axes_dims_rope": [4, 4, 4, 4],
        }

-    def get_dummy_inputs(self, height: int = 4, width: int = 4) -> dict[str, torch.Tensor]:
-        batch_size = 1
-        num_latent_channels = 4
-        sequence_length = 48
-        embedding_dim = 32
-        num_ref_tokens = self.num_ref_tokens
+        inputs_dict = self.dummy_input
+        return init_dict, inputs_dict

-        ref_hidden_states = randn_tensor(
-            (batch_size, num_ref_tokens, num_latent_channels), generator=self.generator, device=torch_device
-        )
-        img_hidden_states = randn_tensor(
-            (batch_size, height * width, num_latent_channels), generator=self.generator, device=torch_device
-        )
-        hidden_states = torch.cat([ref_hidden_states, img_hidden_states], dim=1)
+    # TODO (Daniel, Sayak): We can remove this test.
+    def test_flux2_consistency(self, seed=0):
+        torch.manual_seed(seed)
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()

-        encoder_hidden_states = randn_tensor(
-            (batch_size, sequence_length, embedding_dim), generator=self.generator, device=torch_device
-        )
-
-        ref_t_coords = torch.arange(1)
-        ref_h_coords = torch.arange(num_ref_tokens)
-        ref_w_coords = torch.arange(1)
-        ref_l_coords = torch.arange(1)
-        ref_ids = torch.cartesian_prod(ref_t_coords, ref_h_coords, ref_w_coords, ref_l_coords)
-        ref_ids = ref_ids.unsqueeze(0).expand(batch_size, -1, -1).to(torch_device)
-
-        t_coords = torch.arange(1)
-        h_coords = torch.arange(height)
-        w_coords = torch.arange(width)
-        l_coords = torch.arange(1)
-        image_ids = torch.cartesian_prod(t_coords, h_coords, w_coords, l_coords)
-        image_ids = image_ids.unsqueeze(0).expand(batch_size, -1, -1).to(torch_device)
-        image_ids = torch.cat([ref_ids, image_ids], dim=1)
-
-        text_t_coords = torch.arange(1)
-        text_h_coords = torch.arange(1)
-        text_w_coords = torch.arange(1)
-        text_l_coords = torch.arange(sequence_length)
-        text_ids = torch.cartesian_prod(text_t_coords, text_h_coords, text_w_coords, text_l_coords)
-        text_ids = text_ids.unsqueeze(0).expand(batch_size, -1, -1).to(torch_device)
-
-        timestep = torch.tensor([1.0]).to(torch_device).expand(batch_size)
-        guidance = torch.tensor([1.0]).to(torch_device).expand(batch_size)
-
-        return {
-            "hidden_states": hidden_states,
-            "encoder_hidden_states": encoder_hidden_states,
-            "img_ids": image_ids,
-            "txt_ids": text_ids,
-            "timestep": timestep,
-            "guidance": guidance,
-        }
-
-
-class TestFlux2TransformerKVCache(Flux2TransformerKVCacheTesterConfig):
-    """KV cache tests for Flux2 Transformer."""
-
-    def test_kv_layer_cache_store_and_get(self):
-        cache = Flux2KVLayerCache()
-        k = torch.randn(1, 4, 2, 16)
-        v = torch.randn(1, 4, 2, 16)
-        cache.store(k, v)
-        k_out, v_out = cache.get()
-        assert torch.equal(k, k_out)
-        assert torch.equal(v, v_out)
-
-    def test_kv_layer_cache_get_before_store_raises(self):
-        cache = Flux2KVLayerCache()
-        try:
-            cache.get()
-            assert False, "Expected RuntimeError"
-        except RuntimeError:
-            pass
-
-    def test_kv_layer_cache_clear(self):
-        cache = Flux2KVLayerCache()
-        cache.store(torch.randn(1, 4, 2, 16), torch.randn(1, 4, 2, 16))
-        cache.clear()
-        assert cache.k_ref is None
-        assert cache.v_ref is None
-
-    def test_kv_cache_structure(self):
-        num_double = 3
-        num_single = 2
-        cache = Flux2KVCache(num_double, num_single)
-        assert len(cache.double_block_caches) == num_double
-        assert len(cache.single_block_caches) == num_single
-        assert cache.num_ref_tokens == 0
-
-        for i in range(num_double):
-            assert isinstance(cache.get_double(i), Flux2KVLayerCache)
-        for i in range(num_single):
-            assert isinstance(cache.get_single(i), Flux2KVLayerCache)
-
-    def test_kv_cache_clear(self):
-        cache = Flux2KVCache(2, 1)
-        cache.num_ref_tokens = 4
-        cache.get_double(0).store(torch.randn(1, 4, 2, 16), torch.randn(1, 4, 2, 16))
-        cache.clear()
-        assert cache.num_ref_tokens == 0
-        assert cache.get_double(0).k_ref is None
-
-    def _set_kv_attn_processors(self, model):
-        for block in model.transformer_blocks:
-            block.attn.set_processor(Flux2KVAttnProcessor())
-        for block in model.single_transformer_blocks:
-            block.attn.set_processor(Flux2KVParallelSelfAttnProcessor())
-
-    @torch.no_grad()
-    def test_extract_mode_returns_cache(self):
-        model = self.model_class(**self.get_init_dict())
-        model.to(torch_device)
-        model.eval()
-        self._set_kv_attn_processors(model)
-
-        output = model(
-            **self.get_dummy_inputs(),
-            kv_cache_mode="extract",
-            num_ref_tokens=self.num_ref_tokens,
-            ref_fixed_timestep=0.0,
-        )
-
-        assert output.kv_cache is not None
-        assert isinstance(output.kv_cache, Flux2KVCache)
-        assert output.kv_cache.num_ref_tokens == self.num_ref_tokens
-
-        for layer_cache in output.kv_cache.double_block_caches:
-            assert layer_cache.k_ref is not None
-            assert layer_cache.v_ref is not None
-
-        for layer_cache in output.kv_cache.single_block_caches:
-            assert layer_cache.k_ref is not None
-            assert layer_cache.v_ref is not None
-
-    @torch.no_grad()
-    def test_extract_mode_output_shape(self):
-        model = self.model_class(**self.get_init_dict())
+        torch.manual_seed(seed)
+        model = self.model_class(**init_dict)
+        # state_dict = model.state_dict()
+        # for key, param in state_dict.items():
+        #     print(f"{key} | {param.shape}")
+        # torch.save(state_dict, "/raid/daniel_gu/test_flux2_params/diffusers.pt")
        model.to(torch_device)
        model.eval()

-        height, width = 4, 4
-        output = model(
-            **self.get_dummy_inputs(height=height, width=width),
-            kv_cache_mode="extract",
-            num_ref_tokens=self.num_ref_tokens,
-            ref_fixed_timestep=0.0,
-        )
+        with attention_backend("native"):
+            with torch.no_grad():
+                output = model(**inputs_dict)

-        assert output.sample.shape == (1, height * width, 4)
+                if isinstance(output, dict):
+                    output = output.to_tuple()[0]

-    @torch.no_grad()
-    def test_cached_mode_uses_cache(self):
-        model = self.model_class(**self.get_init_dict())
-        model.to(torch_device)
-        model.eval()
+        self.assertIsNotNone(output)

-        height, width = 4, 4
-        extract_output = model(
-            **self.get_dummy_inputs(height=height, width=width),
-            kv_cache_mode="extract",
-            num_ref_tokens=self.num_ref_tokens,
-            ref_fixed_timestep=0.0,
-        )
+        # input & output have to have the same shape
+        input_tensor = inputs_dict[self.main_input_name]
+        expected_shape = input_tensor.shape
+        self.assertEqual(output.shape, expected_shape, "Input and output shapes do not match")

-        base_config = Flux2TransformerTesterConfig()
-        cached_inputs = base_config.get_dummy_inputs(height=height, width=width)
-        cached_output = model(
-            **cached_inputs,
-            kv_cache=extract_output.kv_cache,
-            kv_cache_mode="cached",
-        )
+        # Check against expected slice
+        # fmt: off
+        expected_slice = torch.tensor([-0.3662, 0.4844, 0.6334, -0.3497, 0.2162, 0.0188, 0.0521, -0.2061, -0.2041, -0.0342, -0.7107, 0.4797, -0.3280, 0.7059, -0.0849, 0.4416])
+        # fmt: on

-        assert cached_output.sample.shape == (1, height * width, 4)
-        assert cached_output.kv_cache is None
+        flat_output = output.cpu().flatten()
+        generated_slice = torch.cat([flat_output[:8], flat_output[-8:]])
+        self.assertTrue(torch.allclose(generated_slice, expected_slice, atol=1e-4))

-    @torch.no_grad()
-    def test_extract_return_dict_false(self):
-        model = self.model_class(**self.get_init_dict())
-        model.to(torch_device)
-        model.eval()
+    def test_gradient_checkpointing_is_applied(self):
+        expected_set = {"Flux2Transformer2DModel"}
+        super().test_gradient_checkpointing_is_applied(expected_set=expected_set)

-        output = model(
-            **self.get_dummy_inputs(),
-            kv_cache_mode="extract",
-            num_ref_tokens=self.num_ref_tokens,
-            ref_fixed_timestep=0.0,
-            return_dict=False,
-        )

-        assert isinstance(output, tuple)
-        assert len(output) == 2
-        assert isinstance(output[1], Flux2KVCache)
+class Flux2TransformerCompileTests(TorchCompileTesterMixin, unittest.TestCase):
+    model_class = Flux2Transformer2DModel
+    different_shapes_for_compilation = [(4, 4), (4, 8), (8, 8)]

-    @torch.no_grad()
-    def test_no_kv_cache_mode_returns_no_cache(self):
-        model = self.model_class(**self.get_init_dict())
-        model.to(torch_device)
-        model.eval()
+    def prepare_init_args_and_inputs_for_common(self):
+        return Flux2TransformerTests().prepare_init_args_and_inputs_for_common()

-        base_config = Flux2TransformerTesterConfig()
-        output = model(**base_config.get_dummy_inputs())
+    def prepare_dummy_input(self, height, width):
+        return Flux2TransformerTests().prepare_dummy_input(height=height, width=width)

-        assert output.kv_cache is None
+
+class Flux2TransformerLoRAHotSwapTests(LoraHotSwappingForModelTesterMixin, unittest.TestCase):
+    model_class = Flux2Transformer2DModel
+    different_shapes_for_compilation = [(4, 4), (4, 8), (8, 8)]
+
+    def prepare_init_args_and_inputs_for_common(self):
+        return Flux2TransformerTests().prepare_init_args_and_inputs_for_common()
+
+    def prepare_dummy_input(self, height, width):
+        return Flux2TransformerTests().prepare_dummy_input(height=height, width=width)
--- a/tests/pipelines/hunyuan_video/test_hunyuan_image2video.py
+++ b/tests/pipelines/hunyuan_video/test_hunyuan_image2video.py
@@ -139,9 +139,7 @@ class HunyuanVideoImageToVideoPipelineFastTests(
            num_hidden_layers=2,
            image_size=224,
        )
-        llava_text_encoder_config = LlavaConfig(
-            vision_config=vision_config, text_config=text_config, pad_token_id=100, image_token_index=101
-        )
+        llava_text_encoder_config = LlavaConfig(vision_config, text_config, pad_token_id=100, image_token_index=101)

        clip_text_encoder_config = CLIPTextConfig(
            bos_token_id=0,
Author	SHA1	Message	Date
Dhruv Nair	514dd552d4	update	2026-03-16 12:43:24 +01:00
Dhruv Nair	0d87803e80	update	2026-03-16 12:28:33 +01:00