Revert "another fix."

This reverts commit ab07b603ab.
another fix.
2026-03-24 17:38:15 +08:00 · 2026-03-20 15:35:53 +05:30 · 2026-03-20 15:35:18 +05:30 · 2026-03-20 12:33:51 +05:30 · 2026-03-17 16:16:31 +05:30 · 2026-03-12 11:19:09 +05:30
11 changed files with 530 additions and 940 deletions
--- a/docs/source/en/optimization/attention_backends.md
+++ b/docs/source/en/optimization/attention_backends.md
@@ -143,7 +143,6 @@ Refer to the table below for a complete list of available attention backends and
 | `flash_varlen` | [FlashAttention](https://github.com/Dao-AILab/flash-attention) | Variable length FlashAttention |
 | `flash_varlen_hub` | [FlashAttention](https://github.com/Dao-AILab/flash-attention) | Variable length FlashAttention from kernels |
 | `aiter` | [AI Tensor Engine for ROCm](https://github.com/ROCm/aiter) | FlashAttention for AMD ROCm |
-| `flash_4_hub` | [FlashAttention](https://github.com/Dao-AILab/flash-attention) | FlashAttention-4 |
 | `_flash_3` | [FlashAttention](https://github.com/Dao-AILab/flash-attention) | FlashAttention-3 |
 | `_flash_varlen_3` | [FlashAttention](https://github.com/Dao-AILab/flash-attention) | Variable length FlashAttention-3 |
 | `_flash_3_hub` | [FlashAttention](https://github.com/Dao-AILab/flash-attention) | FlashAttention-3 from kernels |
--- a/src/diffusers/models/attention_dispatch.py
+++ b/src/diffusers/models/attention_dispatch.py
@@ -229,7 +229,6 @@ class AttentionBackendName(str, Enum):
    FLASH_HUB = "flash_hub"
    FLASH_VARLEN = "flash_varlen"
    FLASH_VARLEN_HUB = "flash_varlen_hub"
-    FLASH_4_HUB = "flash_4_hub"
    _FLASH_3 = "_flash_3"
    _FLASH_VARLEN_3 = "_flash_varlen_3"
    _FLASH_3_HUB = "_flash_3_hub"
@@ -359,11 +358,6 @@ _HUB_KERNELS_REGISTRY: dict["AttentionBackendName", _HubKernelConfig] = {
        function_attr="sageattn",
        version=1,
    ),
-    AttentionBackendName.FLASH_4_HUB: _HubKernelConfig(
-        repo_id="kernels-staging/flash-attn4",
-        function_attr="flash_attn_func",
-        version=0,
-    ),
 }


@@ -527,7 +521,6 @@ def _check_attention_backend_requirements(backend: AttentionBackendName) -> None
        AttentionBackendName._FLASH_3_HUB,
        AttentionBackendName._FLASH_3_VARLEN_HUB,
        AttentionBackendName.SAGE_HUB,
-        AttentionBackendName.FLASH_4_HUB,
    ]:
        if not is_kernels_available():
            raise RuntimeError(
@@ -538,11 +531,6 @@ def _check_attention_backend_requirements(backend: AttentionBackendName) -> None
                f"Backend '{backend.value}' needs to be used with a `kernels` version of at least 0.12. Please update with `pip install -U kernels`."
            )

-        if backend == AttentionBackendName.FLASH_4_HUB and not is_kernels_available(">=", "0.12.3"):
-            raise RuntimeError(
-                f"Backend '{backend.value}' needs to be used with a `kernels` version of at least 0.12.3. Please update with `pip install -U kernels`."
-            )
-
    elif backend == AttentionBackendName.AITER:
        if not _CAN_USE_AITER_ATTN:
            raise RuntimeError(
@@ -2688,37 +2676,6 @@ def _flash_attention_3_varlen_hub(
    return (out, lse) if return_lse else out


-@_AttentionBackendRegistry.register(
-    AttentionBackendName.FLASH_4_HUB,
-    constraints=[_check_device, _check_qkv_dtype_bf16_or_fp16, _check_shape],
-    supports_context_parallel=False,
-)
-def _flash_attention_4_hub(
-    query: torch.Tensor,
-    key: torch.Tensor,
-    value: torch.Tensor,
-    attn_mask: torch.Tensor | None = None,
-    scale: float | None = None,
-    is_causal: bool = False,
-    return_lse: bool = False,
-    _parallel_config: "ParallelConfig" | None = None,
-) -> torch.Tensor:
-    if attn_mask is not None:
-        raise ValueError("`attn_mask` is not supported for flash-attn 4.")
-
-    func = _HUB_KERNELS_REGISTRY[AttentionBackendName.FLASH_4_HUB].kernel_fn
-    out = func(
-        q=query,
-        k=key,
-        v=value,
-        softmax_scale=scale,
-        causal=is_causal,
-    )
-    if isinstance(out, tuple):
-        return (out[0], out[1]) if return_lse else out[0]
-    return out
-
-
@_AttentionBackendRegistry.register(
    AttentionBackendName._FLASH_VARLEN_3,
    constraints=[_check_device, _check_qkv_dtype_bf16_or_fp16, _check_shape],
--- a/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py
+++ b/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py
@@ -324,18 +324,17 @@ class AudioLDM2Pipeline(DiffusionPipeline):
            `inputs_embeds (`torch.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
                The sequence of generated hidden-states.
        """
+        cache_position_kwargs = {}
+        if is_transformers_version("<", "4.52.1"):
+            cache_position_kwargs["input_ids"] = inputs_embeds
+        else:
+            cache_position_kwargs["seq_length"] = inputs_embeds.shape[0]
+            cache_position_kwargs["device"] = (
+                self.language_model.device if getattr(self, "language_model", None) is not None else self.device
+            )
+        cache_position_kwargs["model_kwargs"] = model_kwargs
        max_new_tokens = max_new_tokens if max_new_tokens is not None else self.language_model.config.max_new_tokens
-        if hasattr(self.language_model, "_get_initial_cache_position"):
-            cache_position_kwargs = {}
-            if is_transformers_version("<", "4.52.1"):
-                cache_position_kwargs["input_ids"] = inputs_embeds
-            else:
-                cache_position_kwargs["seq_length"] = inputs_embeds.shape[0]
-                cache_position_kwargs["device"] = (
-                    self.language_model.device if getattr(self, "language_model", None) is not None else self.device
-                )
-            cache_position_kwargs["model_kwargs"] = model_kwargs
-            model_kwargs = self.language_model._get_initial_cache_position(**cache_position_kwargs)
+        model_kwargs = self.language_model._get_initial_cache_position(**cache_position_kwargs)

        for _ in range(max_new_tokens):
            # prepare model inputs
--- a/tests/lora/test_lora_layers_wanvace.py
+++ b/tests/lora/test_lora_layers_wanvace.py
@@ -28,6 +28,7 @@ from diffusers.utils.import_utils import is_peft_available

 from ..testing_utils import (
    floats_tensor,
+    is_flaky,
    require_peft_backend,
    require_peft_version_greater,
    skip_mps,
@@ -45,6 +46,7 @@ from .utils import PeftLoraLoaderMixinTests  # noqa: E402

@require_peft_backend
@skip_mps
+@is_flaky(max_attempts=10, description="very flaky class")
 class WanVACELoRATests(unittest.TestCase, PeftLoraLoaderMixinTests):
    pipeline_class = WanVACEPipeline
    scheduler_cls = FlowMatchEulerDiscreteScheduler
@@ -71,8 +73,8 @@ class WanVACELoRATests(unittest.TestCase, PeftLoraLoaderMixinTests):
        "base_dim": 3,
        "z_dim": 4,
        "dim_mult": [1, 1, 1, 1],
-        "latents_mean": [-0.7571, -0.7089, -0.9113, -0.7245],
-        "latents_std": [2.8184, 1.4541, 2.3275, 2.6558],
+        "latents_mean": torch.randn(4).numpy().tolist(),
+        "latents_std": torch.randn(4).numpy().tolist(),
        "num_res_blocks": 1,
        "temperal_downsample": [False, True, True],
    }
--- a/tests/models/transformers/test_models_transformer_flux.py
+++ b/tests/models/transformers/test_models_transformer_flux.py
@@ -41,6 +41,7 @@ from ..testing_utils import (
    ModelOptCompileTesterMixin,
    ModelOptTesterMixin,
    ModelTesterMixin,
+    PyramidAttentionBroadcastTesterMixin,
    QuantoCompileTesterMixin,
    QuantoTesterMixin,
    SingleFileTesterMixin,
@@ -218,10 +219,6 @@ class TestFluxTransformerMemory(FluxTransformerTesterConfig, MemoryTesterMixin):
 class TestFluxTransformerTraining(FluxTransformerTesterConfig, TrainingTesterMixin):
    """Training tests for Flux Transformer."""

-    def test_gradient_checkpointing_is_applied(self):
-        expected_set = {"FluxTransformer2DModel"}
-        super().test_gradient_checkpointing_is_applied(expected_set=expected_set)
-

 class TestFluxTransformerAttention(FluxTransformerTesterConfig, AttentionTesterMixin):
    """Attention processor tests for Flux Transformer."""
@@ -415,6 +412,10 @@ class TestFluxTransformerBitsAndBytesCompile(FluxTransformerTesterConfig, BitsAn
    """BitsAndBytes + compile tests for Flux Transformer."""


+class TestFluxTransformerPABCache(FluxTransformerTesterConfig, PyramidAttentionBroadcastTesterMixin):
+    """PyramidAttentionBroadcast cache tests for Flux Transformer."""
+
+
 class TestFluxTransformerFBCCache(FluxTransformerTesterConfig, FirstBlockCacheTesterMixin):
    """FirstBlockCache tests for Flux Transformer."""

--- a/tests/models/transformers/test_models_transformer_flux2.py
+++ b/tests/models/transformers/test_models_transformer_flux2.py
@@ -13,95 +13,48 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+import unittest
+
 import torch

-from diffusers import Flux2Transformer2DModel
-from diffusers.models.transformers.transformer_flux2 import (
-    Flux2KVAttnProcessor,
-    Flux2KVCache,
-    Flux2KVLayerCache,
-    Flux2KVParallelSelfAttnProcessor,
-)
-from diffusers.utils.torch_utils import randn_tensor
+from diffusers import Flux2Transformer2DModel, attention_backend

 from ...testing_utils import enable_full_determinism, torch_device
-from ..testing_utils import (
-    AttentionTesterMixin,
-    BaseModelTesterConfig,
-    BitsAndBytesTesterMixin,
-    ContextParallelTesterMixin,
-    GGUFCompileTesterMixin,
-    GGUFTesterMixin,
-    LoraHotSwappingForModelTesterMixin,
-    LoraTesterMixin,
-    MemoryTesterMixin,
-    ModelTesterMixin,
-    TorchAoCompileTesterMixin,
-    TorchAoTesterMixin,
-    TorchCompileTesterMixin,
-    TrainingTesterMixin,
-)
+from ..test_modeling_common import LoraHotSwappingForModelTesterMixin, ModelTesterMixin, TorchCompileTesterMixin


 enable_full_determinism()


-class Flux2TransformerTesterConfig(BaseModelTesterConfig):
-    @property
-    def model_class(self):
-        return Flux2Transformer2DModel
+class Flux2TransformerTests(ModelTesterMixin, unittest.TestCase):
+    model_class = Flux2Transformer2DModel
+    main_input_name = "hidden_states"
+    # We override the items here because the transformer under consideration is small.
+    model_split_percents = [0.7, 0.6, 0.6]
+
+    # Skip setting testing with default: AttnProcessor
+    uses_custom_attn_processor = True

    @property
-    def output_shape(self) -> tuple[int, int]:
+    def dummy_input(self):
+        return self.prepare_dummy_input()
+
+    @property
+    def input_shape(self):
        return (16, 4)

    @property
-    def input_shape(self) -> tuple[int, int]:
+    def output_shape(self):
        return (16, 4)

-    @property
-    def model_split_percents(self) -> list:
-        # We override the items here because the transformer under consideration is small.
-        return [0.7, 0.6, 0.6]
-
-    @property
-    def main_input_name(self) -> str:
-        return "hidden_states"
-
-    @property
-    def uses_custom_attn_processor(self) -> bool:
-        # Skip setting testing with default: AttnProcessor
-        return True
-
-    @property
-    def generator(self):
-        return torch.Generator("cpu").manual_seed(0)
-
-    def get_init_dict(self) -> dict[str, int | list[int]]:
-        return {
-            "patch_size": 1,
-            "in_channels": 4,
-            "num_layers": 1,
-            "num_single_layers": 1,
-            "attention_head_dim": 16,
-            "num_attention_heads": 2,
-            "joint_attention_dim": 32,
-            "timestep_guidance_channels": 256,  # Hardcoded in original code
-            "axes_dims_rope": [4, 4, 4, 4],
-        }
-
-    def get_dummy_inputs(self, height: int = 4, width: int = 4) -> dict[str, torch.Tensor]:
+    def prepare_dummy_input(self, height=4, width=4):
        batch_size = 1
        num_latent_channels = 4
        sequence_length = 48
        embedding_dim = 32

-        hidden_states = randn_tensor(
-            (batch_size, height * width, num_latent_channels), generator=self.generator, device=torch_device
-        )
-        encoder_hidden_states = randn_tensor(
-            (batch_size, sequence_length, embedding_dim), generator=self.generator, device=torch_device
-        )
+        hidden_states = torch.randn((batch_size, height * width, num_latent_channels)).to(torch_device)
+        encoder_hidden_states = torch.randn((batch_size, sequence_length, embedding_dim)).to(torch_device)

        t_coords = torch.arange(1)
        h_coords = torch.arange(height)
@@ -129,286 +82,8 @@ class Flux2TransformerTesterConfig(BaseModelTesterConfig):
            "guidance": guidance,
        }

-
-class TestFlux2Transformer(Flux2TransformerTesterConfig, ModelTesterMixin):
-    pass
-
-
-class TestFlux2TransformerMemory(Flux2TransformerTesterConfig, MemoryTesterMixin):
-    """Memory optimization tests for Flux2 Transformer."""
-
-
-class TestFlux2TransformerTraining(Flux2TransformerTesterConfig, TrainingTesterMixin):
-    """Training tests for Flux2 Transformer."""
-
-    def test_gradient_checkpointing_is_applied(self):
-        expected_set = {"Flux2Transformer2DModel"}
-        super().test_gradient_checkpointing_is_applied(expected_set=expected_set)
-
-
-class TestFlux2TransformerAttention(Flux2TransformerTesterConfig, AttentionTesterMixin):
-    """Attention processor tests for Flux2 Transformer."""
-
-
-class TestFlux2TransformerContextParallel(Flux2TransformerTesterConfig, ContextParallelTesterMixin):
-    """Context Parallel inference tests for Flux2 Transformer."""
-
-
-class TestFlux2TransformerLoRA(Flux2TransformerTesterConfig, LoraTesterMixin):
-    """LoRA adapter tests for Flux2 Transformer."""
-
-
-class TestFlux2TransformerLoRAHotSwap(Flux2TransformerTesterConfig, LoraHotSwappingForModelTesterMixin):
-    """LoRA hot-swapping tests for Flux2 Transformer."""
-
-    @property
-    def different_shapes_for_compilation(self):
-        return [(4, 4), (4, 8), (8, 8)]
-
-    def get_dummy_inputs(self, height: int = 4, width: int = 4) -> dict[str, torch.Tensor]:
-        """Override to support dynamic height/width for LoRA hotswap tests."""
-        batch_size = 1
-        num_latent_channels = 4
-        sequence_length = 48
-        embedding_dim = 32
-
-        hidden_states = randn_tensor(
-            (batch_size, height * width, num_latent_channels), generator=self.generator, device=torch_device
-        )
-        encoder_hidden_states = randn_tensor(
-            (batch_size, sequence_length, embedding_dim), generator=self.generator, device=torch_device
-        )
-
-        t_coords = torch.arange(1)
-        h_coords = torch.arange(height)
-        w_coords = torch.arange(width)
-        l_coords = torch.arange(1)
-        image_ids = torch.cartesian_prod(t_coords, h_coords, w_coords, l_coords)
-        image_ids = image_ids.unsqueeze(0).expand(batch_size, -1, -1).to(torch_device)
-
-        text_t_coords = torch.arange(1)
-        text_h_coords = torch.arange(1)
-        text_w_coords = torch.arange(1)
-        text_l_coords = torch.arange(sequence_length)
-        text_ids = torch.cartesian_prod(text_t_coords, text_h_coords, text_w_coords, text_l_coords)
-        text_ids = text_ids.unsqueeze(0).expand(batch_size, -1, -1).to(torch_device)
-
-        timestep = torch.tensor([1.0]).to(torch_device).expand(batch_size)
-        guidance = torch.tensor([1.0]).to(torch_device).expand(batch_size)
-
-        return {
-            "hidden_states": hidden_states,
-            "encoder_hidden_states": encoder_hidden_states,
-            "img_ids": image_ids,
-            "txt_ids": text_ids,
-            "timestep": timestep,
-            "guidance": guidance,
-        }
-
-
-class TestFlux2TransformerCompile(Flux2TransformerTesterConfig, TorchCompileTesterMixin):
-    @property
-    def different_shapes_for_compilation(self):
-        return [(4, 4), (4, 8), (8, 8)]
-
-    def get_dummy_inputs(self, height: int = 4, width: int = 4) -> dict[str, torch.Tensor]:
-        """Override to support dynamic height/width for compilation tests."""
-        batch_size = 1
-        num_latent_channels = 4
-        sequence_length = 48
-        embedding_dim = 32
-
-        hidden_states = randn_tensor(
-            (batch_size, height * width, num_latent_channels), generator=self.generator, device=torch_device
-        )
-        encoder_hidden_states = randn_tensor(
-            (batch_size, sequence_length, embedding_dim), generator=self.generator, device=torch_device
-        )
-
-        t_coords = torch.arange(1)
-        h_coords = torch.arange(height)
-        w_coords = torch.arange(width)
-        l_coords = torch.arange(1)
-        image_ids = torch.cartesian_prod(t_coords, h_coords, w_coords, l_coords)
-        image_ids = image_ids.unsqueeze(0).expand(batch_size, -1, -1).to(torch_device)
-
-        text_t_coords = torch.arange(1)
-        text_h_coords = torch.arange(1)
-        text_w_coords = torch.arange(1)
-        text_l_coords = torch.arange(sequence_length)
-        text_ids = torch.cartesian_prod(text_t_coords, text_h_coords, text_w_coords, text_l_coords)
-        text_ids = text_ids.unsqueeze(0).expand(batch_size, -1, -1).to(torch_device)
-
-        timestep = torch.tensor([1.0]).to(torch_device).expand(batch_size)
-        guidance = torch.tensor([1.0]).to(torch_device).expand(batch_size)
-
-        return {
-            "hidden_states": hidden_states,
-            "encoder_hidden_states": encoder_hidden_states,
-            "img_ids": image_ids,
-            "txt_ids": text_ids,
-            "timestep": timestep,
-            "guidance": guidance,
-        }
-
-
-class TestFlux2TransformerBitsAndBytes(Flux2TransformerTesterConfig, BitsAndBytesTesterMixin):
-    """BitsAndBytes quantization tests for Flux2 Transformer."""
-
-
-class TestFlux2TransformerTorchAo(Flux2TransformerTesterConfig, TorchAoTesterMixin):
-    """TorchAO quantization tests for Flux2 Transformer."""
-
-
-class TestFlux2TransformerGGUF(Flux2TransformerTesterConfig, GGUFTesterMixin):
-    """GGUF quantization tests for Flux2 Transformer."""
-
-    @property
-    def gguf_filename(self):
-        return "https://huggingface.co/unsloth/FLUX.2-dev-GGUF/blob/main/flux2-dev-Q2_K.gguf"
-
-    @property
-    def torch_dtype(self):
-        return torch.bfloat16
-
-    def get_dummy_inputs(self):
-        """Override to provide inputs matching the real FLUX2 model dimensions.
-
-        Flux2 defaults: in_channels=128, joint_attention_dim=15360
-        """
-        batch_size = 1
-        height = 64
-        width = 64
-        sequence_length = 512
-
-        hidden_states = randn_tensor(
-            (batch_size, height * width, 128), generator=self.generator, device=torch_device, dtype=self.torch_dtype
-        )
-        encoder_hidden_states = randn_tensor(
-            (batch_size, sequence_length, 15360), generator=self.generator, device=torch_device, dtype=self.torch_dtype
-        )
-
-        # Flux2 uses 4D image/text IDs (t, h, w, l)
-        t_coords = torch.arange(1)
-        h_coords = torch.arange(height)
-        w_coords = torch.arange(width)
-        l_coords = torch.arange(1)
-        image_ids = torch.cartesian_prod(t_coords, h_coords, w_coords, l_coords)
-        image_ids = image_ids.unsqueeze(0).expand(batch_size, -1, -1).to(torch_device)
-
-        text_t_coords = torch.arange(1)
-        text_h_coords = torch.arange(1)
-        text_w_coords = torch.arange(1)
-        text_l_coords = torch.arange(sequence_length)
-        text_ids = torch.cartesian_prod(text_t_coords, text_h_coords, text_w_coords, text_l_coords)
-        text_ids = text_ids.unsqueeze(0).expand(batch_size, -1, -1).to(torch_device)
-
-        timestep = torch.tensor([1.0]).to(torch_device, self.torch_dtype)
-        guidance = torch.tensor([3.5]).to(torch_device, self.torch_dtype)
-
-        return {
-            "hidden_states": hidden_states,
-            "encoder_hidden_states": encoder_hidden_states,
-            "img_ids": image_ids,
-            "txt_ids": text_ids,
-            "timestep": timestep,
-            "guidance": guidance,
-        }
-
-
-class TestFlux2TransformerTorchAoCompile(Flux2TransformerTesterConfig, TorchAoCompileTesterMixin):
-    """TorchAO + compile tests for Flux2 Transformer."""
-
-
-class TestFlux2TransformerGGUFCompile(Flux2TransformerTesterConfig, GGUFCompileTesterMixin):
-    """GGUF + compile tests for Flux2 Transformer."""
-
-    @property
-    def gguf_filename(self):
-        return "https://huggingface.co/unsloth/FLUX.2-dev-GGUF/blob/main/flux2-dev-Q2_K.gguf"
-
-    @property
-    def torch_dtype(self):
-        return torch.bfloat16
-
-    def get_dummy_inputs(self):
-        """Override to provide inputs matching the real FLUX2 model dimensions.
-
-        Flux2 defaults: in_channels=128, joint_attention_dim=15360
-        """
-        batch_size = 1
-        height = 64
-        width = 64
-        sequence_length = 512
-
-        hidden_states = randn_tensor(
-            (batch_size, height * width, 128), generator=self.generator, device=torch_device, dtype=self.torch_dtype
-        )
-        encoder_hidden_states = randn_tensor(
-            (batch_size, sequence_length, 15360), generator=self.generator, device=torch_device, dtype=self.torch_dtype
-        )
-
-        # Flux2 uses 4D image/text IDs (t, h, w, l)
-        t_coords = torch.arange(1)
-        h_coords = torch.arange(height)
-        w_coords = torch.arange(width)
-        l_coords = torch.arange(1)
-        image_ids = torch.cartesian_prod(t_coords, h_coords, w_coords, l_coords)
-        image_ids = image_ids.unsqueeze(0).expand(batch_size, -1, -1).to(torch_device)
-
-        text_t_coords = torch.arange(1)
-        text_h_coords = torch.arange(1)
-        text_w_coords = torch.arange(1)
-        text_l_coords = torch.arange(sequence_length)
-        text_ids = torch.cartesian_prod(text_t_coords, text_h_coords, text_w_coords, text_l_coords)
-        text_ids = text_ids.unsqueeze(0).expand(batch_size, -1, -1).to(torch_device)
-
-        timestep = torch.tensor([1.0]).to(torch_device, self.torch_dtype)
-        guidance = torch.tensor([3.5]).to(torch_device, self.torch_dtype)
-
-        return {
-            "hidden_states": hidden_states,
-            "encoder_hidden_states": encoder_hidden_states,
-            "img_ids": image_ids,
-            "txt_ids": text_ids,
-            "timestep": timestep,
-            "guidance": guidance,
-        }
-
-
-class Flux2TransformerKVCacheTesterConfig(BaseModelTesterConfig):
-    num_ref_tokens = 4
-
-    @property
-    def model_class(self):
-        return Flux2Transformer2DModel
-
-    @property
-    def output_shape(self) -> tuple[int, int]:
-        return (16, 4)
-
-    @property
-    def input_shape(self) -> tuple[int, int]:
-        return (16, 4)
-
-    @property
-    def model_split_percents(self) -> list:
-        return [0.7, 0.6, 0.6]
-
-    @property
-    def main_input_name(self) -> str:
-        return "hidden_states"
-
-    @property
-    def uses_custom_attn_processor(self) -> bool:
-        return True
-
-    @property
-    def generator(self):
-        return torch.Generator("cpu").manual_seed(0)
-
-    def get_init_dict(self) -> dict[str, int | list[int]]:
-        return {
+    def prepare_init_args_and_inputs_for_common(self):
+        init_dict = {
            "patch_size": 1,
            "in_channels": 4,
            "num_layers": 1,
@@ -416,210 +91,72 @@ class Flux2TransformerKVCacheTesterConfig(BaseModelTesterConfig):
            "attention_head_dim": 16,
            "num_attention_heads": 2,
            "joint_attention_dim": 32,
-            "timestep_guidance_channels": 256,
+            "timestep_guidance_channels": 256,  # Hardcoded in original code
            "axes_dims_rope": [4, 4, 4, 4],
        }

-    def get_dummy_inputs(self, height: int = 4, width: int = 4) -> dict[str, torch.Tensor]:
-        batch_size = 1
-        num_latent_channels = 4
-        sequence_length = 48
-        embedding_dim = 32
-        num_ref_tokens = self.num_ref_tokens
+        inputs_dict = self.dummy_input
+        return init_dict, inputs_dict

-        ref_hidden_states = randn_tensor(
-            (batch_size, num_ref_tokens, num_latent_channels), generator=self.generator, device=torch_device
-        )
-        img_hidden_states = randn_tensor(
-            (batch_size, height * width, num_latent_channels), generator=self.generator, device=torch_device
-        )
-        hidden_states = torch.cat([ref_hidden_states, img_hidden_states], dim=1)
+    # TODO (Daniel, Sayak): We can remove this test.
+    def test_flux2_consistency(self, seed=0):
+        torch.manual_seed(seed)
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()

-        encoder_hidden_states = randn_tensor(
-            (batch_size, sequence_length, embedding_dim), generator=self.generator, device=torch_device
-        )
-
-        ref_t_coords = torch.arange(1)
-        ref_h_coords = torch.arange(num_ref_tokens)
-        ref_w_coords = torch.arange(1)
-        ref_l_coords = torch.arange(1)
-        ref_ids = torch.cartesian_prod(ref_t_coords, ref_h_coords, ref_w_coords, ref_l_coords)
-        ref_ids = ref_ids.unsqueeze(0).expand(batch_size, -1, -1).to(torch_device)
-
-        t_coords = torch.arange(1)
-        h_coords = torch.arange(height)
-        w_coords = torch.arange(width)
-        l_coords = torch.arange(1)
-        image_ids = torch.cartesian_prod(t_coords, h_coords, w_coords, l_coords)
-        image_ids = image_ids.unsqueeze(0).expand(batch_size, -1, -1).to(torch_device)
-        image_ids = torch.cat([ref_ids, image_ids], dim=1)
-
-        text_t_coords = torch.arange(1)
-        text_h_coords = torch.arange(1)
-        text_w_coords = torch.arange(1)
-        text_l_coords = torch.arange(sequence_length)
-        text_ids = torch.cartesian_prod(text_t_coords, text_h_coords, text_w_coords, text_l_coords)
-        text_ids = text_ids.unsqueeze(0).expand(batch_size, -1, -1).to(torch_device)
-
-        timestep = torch.tensor([1.0]).to(torch_device).expand(batch_size)
-        guidance = torch.tensor([1.0]).to(torch_device).expand(batch_size)
-
-        return {
-            "hidden_states": hidden_states,
-            "encoder_hidden_states": encoder_hidden_states,
-            "img_ids": image_ids,
-            "txt_ids": text_ids,
-            "timestep": timestep,
-            "guidance": guidance,
-        }
-
-
-class TestFlux2TransformerKVCache(Flux2TransformerKVCacheTesterConfig):
-    """KV cache tests for Flux2 Transformer."""
-
-    def test_kv_layer_cache_store_and_get(self):
-        cache = Flux2KVLayerCache()
-        k = torch.randn(1, 4, 2, 16)
-        v = torch.randn(1, 4, 2, 16)
-        cache.store(k, v)
-        k_out, v_out = cache.get()
-        assert torch.equal(k, k_out)
-        assert torch.equal(v, v_out)
-
-    def test_kv_layer_cache_get_before_store_raises(self):
-        cache = Flux2KVLayerCache()
-        try:
-            cache.get()
-            assert False, "Expected RuntimeError"
-        except RuntimeError:
-            pass
-
-    def test_kv_layer_cache_clear(self):
-        cache = Flux2KVLayerCache()
-        cache.store(torch.randn(1, 4, 2, 16), torch.randn(1, 4, 2, 16))
-        cache.clear()
-        assert cache.k_ref is None
-        assert cache.v_ref is None
-
-    def test_kv_cache_structure(self):
-        num_double = 3
-        num_single = 2
-        cache = Flux2KVCache(num_double, num_single)
-        assert len(cache.double_block_caches) == num_double
-        assert len(cache.single_block_caches) == num_single
-        assert cache.num_ref_tokens == 0
-
-        for i in range(num_double):
-            assert isinstance(cache.get_double(i), Flux2KVLayerCache)
-        for i in range(num_single):
-            assert isinstance(cache.get_single(i), Flux2KVLayerCache)
-
-    def test_kv_cache_clear(self):
-        cache = Flux2KVCache(2, 1)
-        cache.num_ref_tokens = 4
-        cache.get_double(0).store(torch.randn(1, 4, 2, 16), torch.randn(1, 4, 2, 16))
-        cache.clear()
-        assert cache.num_ref_tokens == 0
-        assert cache.get_double(0).k_ref is None
-
-    def _set_kv_attn_processors(self, model):
-        for block in model.transformer_blocks:
-            block.attn.set_processor(Flux2KVAttnProcessor())
-        for block in model.single_transformer_blocks:
-            block.attn.set_processor(Flux2KVParallelSelfAttnProcessor())
-
-    @torch.no_grad()
-    def test_extract_mode_returns_cache(self):
-        model = self.model_class(**self.get_init_dict())
-        model.to(torch_device)
-        model.eval()
-        self._set_kv_attn_processors(model)
-
-        output = model(
-            **self.get_dummy_inputs(),
-            kv_cache_mode="extract",
-            num_ref_tokens=self.num_ref_tokens,
-            ref_fixed_timestep=0.0,
-        )
-
-        assert output.kv_cache is not None
-        assert isinstance(output.kv_cache, Flux2KVCache)
-        assert output.kv_cache.num_ref_tokens == self.num_ref_tokens
-
-        for layer_cache in output.kv_cache.double_block_caches:
-            assert layer_cache.k_ref is not None
-            assert layer_cache.v_ref is not None
-
-        for layer_cache in output.kv_cache.single_block_caches:
-            assert layer_cache.k_ref is not None
-            assert layer_cache.v_ref is not None
-
-    @torch.no_grad()
-    def test_extract_mode_output_shape(self):
-        model = self.model_class(**self.get_init_dict())
+        torch.manual_seed(seed)
+        model = self.model_class(**init_dict)
+        # state_dict = model.state_dict()
+        # for key, param in state_dict.items():
+        #     print(f"{key} | {param.shape}")
+        # torch.save(state_dict, "/raid/daniel_gu/test_flux2_params/diffusers.pt")
        model.to(torch_device)
        model.eval()

-        height, width = 4, 4
-        output = model(
-            **self.get_dummy_inputs(height=height, width=width),
-            kv_cache_mode="extract",
-            num_ref_tokens=self.num_ref_tokens,
-            ref_fixed_timestep=0.0,
-        )
+        with attention_backend("native"):
+            with torch.no_grad():
+                output = model(**inputs_dict)

-        assert output.sample.shape == (1, height * width, 4)
+                if isinstance(output, dict):
+                    output = output.to_tuple()[0]

-    @torch.no_grad()
-    def test_cached_mode_uses_cache(self):
-        model = self.model_class(**self.get_init_dict())
-        model.to(torch_device)
-        model.eval()
+        self.assertIsNotNone(output)

-        height, width = 4, 4
-        extract_output = model(
-            **self.get_dummy_inputs(height=height, width=width),
-            kv_cache_mode="extract",
-            num_ref_tokens=self.num_ref_tokens,
-            ref_fixed_timestep=0.0,
-        )
+        # input & output have to have the same shape
+        input_tensor = inputs_dict[self.main_input_name]
+        expected_shape = input_tensor.shape
+        self.assertEqual(output.shape, expected_shape, "Input and output shapes do not match")

-        base_config = Flux2TransformerTesterConfig()
-        cached_inputs = base_config.get_dummy_inputs(height=height, width=width)
-        cached_output = model(
-            **cached_inputs,
-            kv_cache=extract_output.kv_cache,
-            kv_cache_mode="cached",
-        )
+        # Check against expected slice
+        # fmt: off
+        expected_slice = torch.tensor([-0.3662, 0.4844, 0.6334, -0.3497, 0.2162, 0.0188, 0.0521, -0.2061, -0.2041, -0.0342, -0.7107, 0.4797, -0.3280, 0.7059, -0.0849, 0.4416])
+        # fmt: on

-        assert cached_output.sample.shape == (1, height * width, 4)
-        assert cached_output.kv_cache is None
+        flat_output = output.cpu().flatten()
+        generated_slice = torch.cat([flat_output[:8], flat_output[-8:]])
+        self.assertTrue(torch.allclose(generated_slice, expected_slice, atol=1e-4))

-    @torch.no_grad()
-    def test_extract_return_dict_false(self):
-        model = self.model_class(**self.get_init_dict())
-        model.to(torch_device)
-        model.eval()
+    def test_gradient_checkpointing_is_applied(self):
+        expected_set = {"Flux2Transformer2DModel"}
+        super().test_gradient_checkpointing_is_applied(expected_set=expected_set)

-        output = model(
-            **self.get_dummy_inputs(),
-            kv_cache_mode="extract",
-            num_ref_tokens=self.num_ref_tokens,
-            ref_fixed_timestep=0.0,
-            return_dict=False,
-        )

-        assert isinstance(output, tuple)
-        assert len(output) == 2
-        assert isinstance(output[1], Flux2KVCache)
+class Flux2TransformerCompileTests(TorchCompileTesterMixin, unittest.TestCase):
+    model_class = Flux2Transformer2DModel
+    different_shapes_for_compilation = [(4, 4), (4, 8), (8, 8)]

-    @torch.no_grad()
-    def test_no_kv_cache_mode_returns_no_cache(self):
-        model = self.model_class(**self.get_init_dict())
-        model.to(torch_device)
-        model.eval()
+    def prepare_init_args_and_inputs_for_common(self):
+        return Flux2TransformerTests().prepare_init_args_and_inputs_for_common()

-        base_config = Flux2TransformerTesterConfig()
-        output = model(**base_config.get_dummy_inputs())
+    def prepare_dummy_input(self, height, width):
+        return Flux2TransformerTests().prepare_dummy_input(height=height, width=width)

-        assert output.kv_cache is None
+
+class Flux2TransformerLoRAHotSwapTests(LoraHotSwappingForModelTesterMixin, unittest.TestCase):
+    model_class = Flux2Transformer2DModel
+    different_shapes_for_compilation = [(4, 4), (4, 8), (8, 8)]
+
+    def prepare_init_args_and_inputs_for_common(self):
+        return Flux2TransformerTests().prepare_init_args_and_inputs_for_common()
+
+    def prepare_dummy_input(self, height, width):
+        return Flux2TransformerTests().prepare_dummy_input(height=height, width=width)
--- a/tests/models/transformers/test_models_transformer_sana.py
+++ b/tests/models/transformers/test_models_transformer_sana.py
@@ -1,4 +1,3 @@
-# coding=utf-8
 # Copyright 2025 HuggingFace Inc.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -13,58 +12,57 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+import unittest
+
 import torch

 from diffusers import SanaTransformer2DModel
-from diffusers.utils.torch_utils import randn_tensor

-from ...testing_utils import enable_full_determinism, torch_device
-from ..testing_utils import (
-    AttentionTesterMixin,
-    BaseModelTesterConfig,
-    BitsAndBytesTesterMixin,
-    MemoryTesterMixin,
-    ModelTesterMixin,
-    TorchAoTesterMixin,
-    TorchCompileTesterMixin,
-    TrainingTesterMixin,
+from ...testing_utils import (
+    enable_full_determinism,
+    torch_device,
 )
+from ..test_modeling_common import ModelTesterMixin


 enable_full_determinism()


-class SanaTransformer2DTesterConfig(BaseModelTesterConfig):
-    @property
-    def model_class(self):
-        return SanaTransformer2DModel
+class SanaTransformerTests(ModelTesterMixin, unittest.TestCase):
+    model_class = SanaTransformer2DModel
+    main_input_name = "hidden_states"
+    uses_custom_attn_processor = True
+    model_split_percents = [0.7, 0.7, 0.9]

    @property
-    def output_shape(self) -> tuple[int, ...]:
-        return (4, 32, 32)
+    def dummy_input(self):
+        batch_size = 2
+        num_channels = 4
+        height = 32
+        width = 32
+        embedding_dim = 8
+        sequence_length = 8

-    @property
-    def input_shape(self) -> tuple[int, ...]:
-        return (4, 32, 32)
+        hidden_states = torch.randn((batch_size, num_channels, height, width)).to(torch_device)
+        encoder_hidden_states = torch.randn((batch_size, sequence_length, embedding_dim)).to(torch_device)
+        timestep = torch.randint(0, 1000, size=(batch_size,)).to(torch_device)

-    @property
-    def main_input_name(self) -> str:
-        return "hidden_states"
-
-    @property
-    def uses_custom_attn_processor(self) -> bool:
-        return True
-
-    @property
-    def model_split_percents(self) -> list:
-        return [0.7, 0.7, 0.9]
-
-    @property
-    def generator(self):
-        return torch.Generator("cpu").manual_seed(0)
-
-    def get_init_dict(self) -> dict[str, int | list[int] | tuple | str | bool]:
        return {
+            "hidden_states": hidden_states,
+            "encoder_hidden_states": encoder_hidden_states,
+            "timestep": timestep,
+        }
+
+    @property
+    def input_shape(self):
+        return (4, 32, 32)
+
+    @property
+    def output_shape(self):
+        return (4, 32, 32)
+
+    def prepare_init_args_and_inputs_for_common(self):
+        init_dict = {
            "patch_size": 1,
            "in_channels": 4,
            "out_channels": 4,
@@ -77,53 +75,9 @@ class SanaTransformer2DTesterConfig(BaseModelTesterConfig):
            "caption_channels": 8,
            "sample_size": 32,
        }
-
-    def get_dummy_inputs(self) -> dict[str, torch.Tensor]:
-        batch_size = 2
-        num_channels = 4
-        height = 32
-        width = 32
-        embedding_dim = 8
-        sequence_length = 8
-
-        return {
-            "hidden_states": randn_tensor(
-                (batch_size, num_channels, height, width), generator=self.generator, device=torch_device
-            ),
-            "encoder_hidden_states": randn_tensor(
-                (batch_size, sequence_length, embedding_dim), generator=self.generator, device=torch_device
-            ),
-            "timestep": torch.randint(0, 1000, size=(batch_size,)).to(torch_device),
-        }
-
-
-class TestSanaTransformer2D(SanaTransformer2DTesterConfig, ModelTesterMixin):
-    """Core model tests for Sana Transformer 2D."""
-
-
-class TestSanaTransformer2DMemory(SanaTransformer2DTesterConfig, MemoryTesterMixin):
-    """Memory optimization tests for Sana Transformer 2D."""
-
-
-class TestSanaTransformer2DTraining(SanaTransformer2DTesterConfig, TrainingTesterMixin):
-    """Training tests for Sana Transformer 2D."""
+        inputs_dict = self.dummy_input
+        return init_dict, inputs_dict

    def test_gradient_checkpointing_is_applied(self):
        expected_set = {"SanaTransformer2DModel"}
        super().test_gradient_checkpointing_is_applied(expected_set=expected_set)
-
-
-class TestSanaTransformer2DAttention(SanaTransformer2DTesterConfig, AttentionTesterMixin):
-    """Attention processor tests for Sana Transformer 2D."""
-
-
-class TestSanaTransformer2DCompile(SanaTransformer2DTesterConfig, TorchCompileTesterMixin):
-    """Torch compile tests for Sana Transformer 2D."""
-
-
-class TestSanaTransformer2DBitsAndBytes(SanaTransformer2DTesterConfig, BitsAndBytesTesterMixin):
-    """BitsAndBytes quantization tests for Sana Transformer 2D."""
-
-
-class TestSanaTransformer2DTorchAo(SanaTransformer2DTesterConfig, TorchAoTesterMixin):
-    """TorchAO quantization tests for Sana Transformer 2D."""
--- a/tests/models/transformers/test_models_transformer_sana_video.py
+++ b/tests/models/transformers/test_models_transformer_sana_video.py
@@ -1,4 +1,3 @@
-# coding=utf-8
 # Copyright 2025 HuggingFace Inc.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -13,54 +12,57 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+import unittest
+
 import torch

 from diffusers import SanaVideoTransformer3DModel
-from diffusers.utils.torch_utils import randn_tensor

-from ...testing_utils import enable_full_determinism, torch_device
-from ..testing_utils import (
-    AttentionTesterMixin,
-    BaseModelTesterConfig,
-    BitsAndBytesTesterMixin,
-    MemoryTesterMixin,
-    ModelTesterMixin,
-    TorchAoTesterMixin,
-    TorchCompileTesterMixin,
-    TrainingTesterMixin,
+from ...testing_utils import (
+    enable_full_determinism,
+    torch_device,
 )
+from ..test_modeling_common import ModelTesterMixin, TorchCompileTesterMixin


 enable_full_determinism()


-class SanaVideoTransformer3DTesterConfig(BaseModelTesterConfig):
-    @property
-    def model_class(self):
-        return SanaVideoTransformer3DModel
+class SanaVideoTransformer3DTests(ModelTesterMixin, unittest.TestCase):
+    model_class = SanaVideoTransformer3DModel
+    main_input_name = "hidden_states"
+    uses_custom_attn_processor = True

    @property
-    def output_shape(self) -> tuple[int, ...]:
-        return (16, 2, 16, 16)
+    def dummy_input(self):
+        batch_size = 1
+        num_channels = 16
+        num_frames = 2
+        height = 16
+        width = 16
+        text_encoder_embedding_dim = 16
+        sequence_length = 12

-    @property
-    def input_shape(self) -> tuple[int, ...]:
-        return (16, 2, 16, 16)
+        hidden_states = torch.randn((batch_size, num_channels, num_frames, height, width)).to(torch_device)
+        timestep = torch.randint(0, 1000, size=(batch_size,)).to(torch_device)
+        encoder_hidden_states = torch.randn((batch_size, sequence_length, text_encoder_embedding_dim)).to(torch_device)

-    @property
-    def main_input_name(self) -> str:
-        return "hidden_states"
-
-    @property
-    def uses_custom_attn_processor(self) -> bool:
-        return True
-
-    @property
-    def generator(self):
-        return torch.Generator("cpu").manual_seed(0)
-
-    def get_init_dict(self) -> dict[str, int | float | list[int] | tuple | str | bool]:
        return {
+            "hidden_states": hidden_states,
+            "encoder_hidden_states": encoder_hidden_states,
+            "timestep": timestep,
+        }
+
+    @property
+    def input_shape(self):
+        return (16, 2, 16, 16)
+
+    @property
+    def output_shape(self):
+        return (16, 2, 16, 16)
+
+    def prepare_init_args_and_inputs_for_common(self):
+        init_dict = {
            "in_channels": 16,
            "out_channels": 16,
            "num_attention_heads": 2,
@@ -80,56 +82,16 @@ class SanaVideoTransformer3DTesterConfig(BaseModelTesterConfig):
            "qk_norm": "rms_norm_across_heads",
            "rope_max_seq_len": 32,
        }
-
-    def get_dummy_inputs(self) -> dict[str, torch.Tensor]:
-        batch_size = 1
-        num_channels = 16
-        num_frames = 2
-        height = 16
-        width = 16
-        text_encoder_embedding_dim = 16
-        sequence_length = 12
-
-        return {
-            "hidden_states": randn_tensor(
-                (batch_size, num_channels, num_frames, height, width), generator=self.generator, device=torch_device
-            ),
-            "encoder_hidden_states": randn_tensor(
-                (batch_size, sequence_length, text_encoder_embedding_dim),
-                generator=self.generator,
-                device=torch_device,
-            ),
-            "timestep": torch.randint(0, 1000, size=(batch_size,)).to(torch_device),
-        }
-
-
-class TestSanaVideoTransformer3D(SanaVideoTransformer3DTesterConfig, ModelTesterMixin):
-    """Core model tests for Sana Video Transformer 3D."""
-
-
-class TestSanaVideoTransformer3DMemory(SanaVideoTransformer3DTesterConfig, MemoryTesterMixin):
-    """Memory optimization tests for Sana Video Transformer 3D."""
-
-
-class TestSanaVideoTransformer3DTraining(SanaVideoTransformer3DTesterConfig, TrainingTesterMixin):
-    """Training tests for Sana Video Transformer 3D."""
+        inputs_dict = self.dummy_input
+        return init_dict, inputs_dict

    def test_gradient_checkpointing_is_applied(self):
        expected_set = {"SanaVideoTransformer3DModel"}
        super().test_gradient_checkpointing_is_applied(expected_set=expected_set)


-class TestSanaVideoTransformer3DAttention(SanaVideoTransformer3DTesterConfig, AttentionTesterMixin):
-    """Attention processor tests for Sana Video Transformer 3D."""
+class SanaVideoTransformerCompileTests(TorchCompileTesterMixin, unittest.TestCase):
+    model_class = SanaVideoTransformer3DModel

-
-class TestSanaVideoTransformer3DCompile(SanaVideoTransformer3DTesterConfig, TorchCompileTesterMixin):
-    """Torch compile tests for Sana Video Transformer 3D."""
-
-
-class TestSanaVideoTransformer3DBitsAndBytes(SanaVideoTransformer3DTesterConfig, BitsAndBytesTesterMixin):
-    """BitsAndBytes quantization tests for Sana Video Transformer 3D."""
-
-
-class TestSanaVideoTransformer3DTorchAo(SanaVideoTransformer3DTesterConfig, TorchAoTesterMixin):
-    """TorchAO quantization tests for Sana Video Transformer 3D."""
+    def prepare_init_args_and_inputs_for_common(self):
+        return SanaVideoTransformer3DTests().prepare_init_args_and_inputs_for_common()
--- a/tests/modular_pipelines/test_conditional_pipeline_blocks.py
+++ b/tests/modular_pipelines/test_conditional_pipeline_blocks.py
@@ -0,0 +1,242 @@
+# Copyright 2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from diffusers.modular_pipelines import (
+    AutoPipelineBlocks,
+    ConditionalPipelineBlocks,
+    InputParam,
+    ModularPipelineBlocks,
+)
+
+
+class TextToImageBlock(ModularPipelineBlocks):
+    model_name = "text2img"
+
+    @property
+    def inputs(self):
+        return [InputParam(name="prompt")]
+
+    @property
+    def intermediate_outputs(self):
+        return []
+
+    @property
+    def description(self):
+        return "text-to-image workflow"
+
+    def __call__(self, components, state):
+        block_state = self.get_block_state(state)
+        block_state.workflow = "text2img"
+        self.set_block_state(state, block_state)
+        return components, state
+
+
+class ImageToImageBlock(ModularPipelineBlocks):
+    model_name = "img2img"
+
+    @property
+    def inputs(self):
+        return [InputParam(name="prompt"), InputParam(name="image")]
+
+    @property
+    def intermediate_outputs(self):
+        return []
+
+    @property
+    def description(self):
+        return "image-to-image workflow"
+
+    def __call__(self, components, state):
+        block_state = self.get_block_state(state)
+        block_state.workflow = "img2img"
+        self.set_block_state(state, block_state)
+        return components, state
+
+
+class InpaintBlock(ModularPipelineBlocks):
+    model_name = "inpaint"
+
+    @property
+    def inputs(self):
+        return [InputParam(name="prompt"), InputParam(name="image"), InputParam(name="mask")]
+
+    @property
+    def intermediate_outputs(self):
+        return []
+
+    @property
+    def description(self):
+        return "inpaint workflow"
+
+    def __call__(self, components, state):
+        block_state = self.get_block_state(state)
+        block_state.workflow = "inpaint"
+        self.set_block_state(state, block_state)
+        return components, state
+
+
+class ConditionalImageBlocks(ConditionalPipelineBlocks):
+    block_classes = [InpaintBlock, ImageToImageBlock, TextToImageBlock]
+    block_names = ["inpaint", "img2img", "text2img"]
+    block_trigger_inputs = ["mask", "image"]
+    default_block_name = "text2img"
+
+    @property
+    def description(self):
+        return "Conditional image blocks for testing"
+
+    def select_block(self, mask=None, image=None) -> str | None:
+        if mask is not None:
+            return "inpaint"
+        if image is not None:
+            return "img2img"
+        return None  # falls back to default_block_name
+
+
+class OptionalConditionalBlocks(ConditionalPipelineBlocks):
+    block_classes = [InpaintBlock, ImageToImageBlock]
+    block_names = ["inpaint", "img2img"]
+    block_trigger_inputs = ["mask", "image"]
+    default_block_name = None  # no default; block can be skipped
+
+    @property
+    def description(self):
+        return "Optional conditional blocks (skippable)"
+
+    def select_block(self, mask=None, image=None) -> str | None:
+        if mask is not None:
+            return "inpaint"
+        if image is not None:
+            return "img2img"
+        return None
+
+
+class AutoImageBlocks(AutoPipelineBlocks):
+    block_classes = [InpaintBlock, ImageToImageBlock, TextToImageBlock]
+    block_names = ["inpaint", "img2img", "text2img"]
+    block_trigger_inputs = ["mask", "image", None]
+
+    @property
+    def description(self):
+        return "Auto image blocks for testing"
+
+
+class TestConditionalPipelineBlocksSelectBlock:
+    def test_select_block_with_mask(self):
+        blocks = ConditionalImageBlocks()
+        assert blocks.select_block(mask="something") == "inpaint"
+
+    def test_select_block_with_image(self):
+        blocks = ConditionalImageBlocks()
+        assert blocks.select_block(image="something") == "img2img"
+
+    def test_select_block_with_mask_and_image(self):
+        blocks = ConditionalImageBlocks()
+        assert blocks.select_block(mask="m", image="i") == "inpaint"
+
+    def test_select_block_no_triggers_returns_none(self):
+        blocks = ConditionalImageBlocks()
+        assert blocks.select_block() is None
+
+    def test_select_block_explicit_none_values(self):
+        blocks = ConditionalImageBlocks()
+        assert blocks.select_block(mask=None, image=None) is None
+
+
+class TestConditionalPipelineBlocksWorkflowSelection:
+    def test_default_workflow_when_no_triggers(self):
+        blocks = ConditionalImageBlocks()
+        execution = blocks.get_execution_blocks()
+        assert execution is not None
+        assert isinstance(execution, TextToImageBlock)
+
+    def test_mask_trigger_selects_inpaint(self):
+        blocks = ConditionalImageBlocks()
+        execution = blocks.get_execution_blocks(mask=True)
+        assert isinstance(execution, InpaintBlock)
+
+    def test_image_trigger_selects_img2img(self):
+        blocks = ConditionalImageBlocks()
+        execution = blocks.get_execution_blocks(image=True)
+        assert isinstance(execution, ImageToImageBlock)
+
+    def test_mask_and_image_selects_inpaint(self):
+        blocks = ConditionalImageBlocks()
+        execution = blocks.get_execution_blocks(mask=True, image=True)
+        assert isinstance(execution, InpaintBlock)
+
+    def test_skippable_block_returns_none(self):
+        blocks = OptionalConditionalBlocks()
+        execution = blocks.get_execution_blocks()
+        assert execution is None
+
+    def test_skippable_block_still_selects_when_triggered(self):
+        blocks = OptionalConditionalBlocks()
+        execution = blocks.get_execution_blocks(image=True)
+        assert isinstance(execution, ImageToImageBlock)
+
+
+class TestAutoPipelineBlocksSelectBlock:
+    def test_auto_select_mask(self):
+        blocks = AutoImageBlocks()
+        assert blocks.select_block(mask="m") == "inpaint"
+
+    def test_auto_select_image(self):
+        blocks = AutoImageBlocks()
+        assert blocks.select_block(image="i") == "img2img"
+
+    def test_auto_select_default(self):
+        blocks = AutoImageBlocks()
+        # No trigger -> returns None -> falls back to default (text2img)
+        assert blocks.select_block() is None
+
+    def test_auto_select_priority_order(self):
+        blocks = AutoImageBlocks()
+        assert blocks.select_block(mask="m", image="i") == "inpaint"
+
+
+class TestAutoPipelineBlocksWorkflowSelection:
+    def test_auto_default_workflow(self):
+        blocks = AutoImageBlocks()
+        execution = blocks.get_execution_blocks()
+        assert isinstance(execution, TextToImageBlock)
+
+    def test_auto_mask_workflow(self):
+        blocks = AutoImageBlocks()
+        execution = blocks.get_execution_blocks(mask=True)
+        assert isinstance(execution, InpaintBlock)
+
+    def test_auto_image_workflow(self):
+        blocks = AutoImageBlocks()
+        execution = blocks.get_execution_blocks(image=True)
+        assert isinstance(execution, ImageToImageBlock)
+
+
+class TestConditionalPipelineBlocksStructure:
+    def test_block_names_accessible(self):
+        blocks = ConditionalImageBlocks()
+        sub = dict(blocks.sub_blocks)
+        assert set(sub.keys()) == {"inpaint", "img2img", "text2img"}
+
+    def test_sub_block_types(self):
+        blocks = ConditionalImageBlocks()
+        sub = dict(blocks.sub_blocks)
+        assert isinstance(sub["inpaint"], InpaintBlock)
+        assert isinstance(sub["img2img"], ImageToImageBlock)
+        assert isinstance(sub["text2img"], TextToImageBlock)
+
+    def test_description(self):
+        blocks = ConditionalImageBlocks()
+        assert "Conditional" in blocks.description
--- a/tests/modular_pipelines/test_modular_pipelines_common.py
+++ b/tests/modular_pipelines/test_modular_pipelines_common.py
@@ -5,16 +5,10 @@ from typing import Callable

 import pytest
 import torch
-from huggingface_hub import hf_hub_download

 import diffusers
 from diffusers import AutoModel, ComponentsManager, ModularPipeline, ModularPipelineBlocks
 from diffusers.guiders import ClassifierFreeGuidance
-from diffusers.modular_pipelines import (
-    ConditionalPipelineBlocks,
-    LoopSequentialPipelineBlocks,
-    SequentialPipelineBlocks,
-)
 from diffusers.modular_pipelines.modular_pipeline_utils import (
    ComponentSpec,
    ConfigSpec,
@@ -25,7 +19,6 @@ from diffusers.modular_pipelines.modular_pipeline_utils import (
 from diffusers.utils import logging

 from ..testing_utils import (
-    CaptureLogger,
    backend_empty_cache,
    numpy_cosine_similarity_distance,
    require_accelerator,
@@ -33,33 +26,6 @@ from ..testing_utils import (
 )


-def _get_specified_components(path_or_repo_id, cache_dir=None):
-    if os.path.isdir(path_or_repo_id):
-        config_path = os.path.join(path_or_repo_id, "modular_model_index.json")
-    else:
-        try:
-            config_path = hf_hub_download(
-                repo_id=path_or_repo_id,
-                filename="modular_model_index.json",
-                local_dir=cache_dir,
-            )
-        except Exception:
-            return None
-
-    with open(config_path) as f:
-        config = json.load(f)
-
-    components = set()
-    for k, v in config.items():
-        if isinstance(v, (str, int, float, bool)):
-            continue
-        for entry in v:
-            if isinstance(entry, dict) and (entry.get("repo") or entry.get("pretrained_model_name_or_path")):
-                components.add(k)
-                break
-    return components
-
-
 class ModularPipelineTesterMixin:
    """
    It provides a set of common tests for each modular pipeline,
@@ -388,39 +354,6 @@ class ModularPipelineTesterMixin:

        assert torch.abs(image_slices[0] - image_slices[1]).max() < 1e-3

-    def test_load_expected_components_from_pretrained(self, tmp_path):
-        pipe = self.get_pipeline()
-        expected = _get_specified_components(self.pretrained_model_name_or_path, cache_dir=tmp_path)
-        if not expected:
-            pytest.skip("Skipping test as we couldn't fetch the expected components.")
-
-        actual = {
-            name
-            for name in pipe.components
-            if getattr(pipe, name, None) is not None
-            and getattr(getattr(pipe, name), "_diffusers_load_id", None) not in (None, "null")
-        }
-        assert expected == actual, f"Component mismatch: missing={expected - actual}, unexpected={actual - expected}"
-
-    def test_load_expected_components_from_save_pretrained(self, tmp_path):
-        pipe = self.get_pipeline()
-        save_dir = str(tmp_path / "saved-pipeline")
-        pipe.save_pretrained(save_dir)
-
-        expected = _get_specified_components(save_dir)
-        loaded_pipe = ModularPipeline.from_pretrained(save_dir)
-        loaded_pipe.load_components(torch_dtype=torch.float32)
-
-        actual = {
-            name
-            for name in loaded_pipe.components
-            if getattr(loaded_pipe, name, None) is not None
-            and getattr(getattr(loaded_pipe, name), "_diffusers_load_id", None) not in (None, "null")
-        }
-        assert expected == actual, (
-            f"Component mismatch after save/load: missing={expected - actual}, unexpected={actual - expected}"
-        )
-
    def test_modular_index_consistency(self, tmp_path):
        pipe = self.get_pipeline()
        components_spec = pipe._component_specs
@@ -498,117 +431,6 @@ class ModularGuiderTesterMixin:
        assert max_diff > expected_max_diff, "Output with CFG must be different from normal inference"


-class TestCustomBlockRequirements:
-    def get_dummy_block_pipe(self):
-        class DummyBlockOne:
-            # keep two arbitrary deps so that we can test warnings.
-            _requirements = {"xyz": ">=0.8.0", "abc": ">=10.0.0"}
-
-        class DummyBlockTwo:
-            # keep two dependencies that will be available during testing.
-            _requirements = {"transformers": ">=4.44.0", "diffusers": ">=0.2.0"}
-
-        pipe = SequentialPipelineBlocks.from_blocks_dict(
-            {"dummy_block_one": DummyBlockOne, "dummy_block_two": DummyBlockTwo}
-        )
-        return pipe
-
-    def get_dummy_conditional_block_pipe(self):
-        class DummyBlockOne:
-            _requirements = {"xyz": ">=0.8.0", "abc": ">=10.0.0"}
-
-        class DummyBlockTwo:
-            _requirements = {"transformers": ">=4.44.0", "diffusers": ">=0.2.0"}
-
-        class DummyConditionalBlocks(ConditionalPipelineBlocks):
-            block_classes = [DummyBlockOne, DummyBlockTwo]
-            block_names = ["block_one", "block_two"]
-            block_trigger_inputs = []
-
-            def select_block(self, **kwargs):
-                return "block_one"
-
-        return DummyConditionalBlocks()
-
-    def get_dummy_loop_block_pipe(self):
-        class DummyBlockOne:
-            _requirements = {"xyz": ">=0.8.0", "abc": ">=10.0.0"}
-
-        class DummyBlockTwo:
-            _requirements = {"transformers": ">=4.44.0", "diffusers": ">=0.2.0"}
-
-        return LoopSequentialPipelineBlocks.from_blocks_dict({"block_one": DummyBlockOne, "block_two": DummyBlockTwo})
-
-    def test_sequential_block_requirements_save_load(self, tmp_path):
-        pipe = self.get_dummy_block_pipe()
-        pipe.save_pretrained(str(tmp_path))
-
-        config_path = tmp_path / "modular_config.json"
-
-        with open(config_path, "r") as f:
-            config = json.load(f)
-
-        assert "requirements" in config
-        requirements = config["requirements"]
-
-        expected_requirements = {
-            "xyz": ">=0.8.0",
-            "abc": ">=10.0.0",
-            "transformers": ">=4.44.0",
-            "diffusers": ">=0.2.0",
-        }
-        assert expected_requirements == requirements
-
-    def test_sequential_block_requirements_warnings(self, tmp_path):
-        pipe = self.get_dummy_block_pipe()
-
-        logger = logging.get_logger("diffusers.modular_pipelines.modular_pipeline_utils")
-        logger.setLevel(30)
-
-        with CaptureLogger(logger) as cap_logger:
-            pipe.save_pretrained(str(tmp_path))
-
-        template = "{req} was specified in the requirements but wasn't found in the current environment"
-        msg_xyz = template.format(req="xyz")
-        msg_abc = template.format(req="abc")
-        assert msg_xyz in str(cap_logger.out)
-        assert msg_abc in str(cap_logger.out)
-
-    def test_conditional_block_requirements_save_load(self, tmp_path):
-        pipe = self.get_dummy_conditional_block_pipe()
-        pipe.save_pretrained(str(tmp_path))
-
-        config_path = tmp_path / "modular_config.json"
-        with open(config_path, "r") as f:
-            config = json.load(f)
-
-        assert "requirements" in config
-        expected_requirements = {
-            "xyz": ">=0.8.0",
-            "abc": ">=10.0.0",
-            "transformers": ">=4.44.0",
-            "diffusers": ">=0.2.0",
-        }
-        assert expected_requirements == config["requirements"]
-
-    def test_loop_block_requirements_save_load(self, tmp_path):
-        pipe = self.get_dummy_loop_block_pipe()
-        pipe.save_pretrained(str(tmp_path))
-
-        config_path = tmp_path / "modular_config.json"
-        with open(config_path, "r") as f:
-            config = json.load(f)
-
-        assert "requirements" in config
-        expected_requirements = {
-            "xyz": ">=0.8.0",
-            "abc": ">=10.0.0",
-            "transformers": ">=4.44.0",
-            "diffusers": ">=0.2.0",
-        }
-        assert expected_requirements == config["requirements"]
-
-
 class TestModularModelCardContent:
    def create_mock_block(self, name="TestBlock", description="Test block description"):
        class MockBlock:
--- a/tests/modular_pipelines/test_modular_pipelines_custom_blocks.py
+++ b/tests/modular_pipelines/test_modular_pipelines_custom_blocks.py
@@ -24,14 +24,18 @@ import torch
 from diffusers import FluxTransformer2DModel
 from diffusers.modular_pipelines import (
    ComponentSpec,
+    ConditionalPipelineBlocks,
    InputParam,
+    LoopSequentialPipelineBlocks,
    ModularPipelineBlocks,
    OutputParam,
    PipelineState,
+    SequentialPipelineBlocks,
    WanModularPipeline,
 )
+from diffusers.utils import logging

-from ..testing_utils import nightly, require_torch, require_torch_accelerator, slow, torch_device
+from ..testing_utils import CaptureLogger, nightly, require_torch, require_torch_accelerator, slow, torch_device


 def _create_tiny_model_dir(model_dir):
@@ -463,6 +467,117 @@ class TestModularCustomBlocks:
        assert output_prompt.startswith("Modular diffusers + ")


+class TestCustomBlockRequirements:
+    def get_dummy_block_pipe(self):
+        class DummyBlockOne:
+            # keep two arbitrary deps so that we can test warnings.
+            _requirements = {"xyz": ">=0.8.0", "abc": ">=10.0.0"}
+
+        class DummyBlockTwo:
+            # keep two dependencies that will be available during testing.
+            _requirements = {"transformers": ">=4.44.0", "diffusers": ">=0.2.0"}
+
+        pipe = SequentialPipelineBlocks.from_blocks_dict(
+            {"dummy_block_one": DummyBlockOne, "dummy_block_two": DummyBlockTwo}
+        )
+        return pipe
+
+    def get_dummy_conditional_block_pipe(self):
+        class DummyBlockOne:
+            _requirements = {"xyz": ">=0.8.0", "abc": ">=10.0.0"}
+
+        class DummyBlockTwo:
+            _requirements = {"transformers": ">=4.44.0", "diffusers": ">=0.2.0"}
+
+        class DummyConditionalBlocks(ConditionalPipelineBlocks):
+            block_classes = [DummyBlockOne, DummyBlockTwo]
+            block_names = ["block_one", "block_two"]
+            block_trigger_inputs = []
+
+            def select_block(self, **kwargs):
+                return "block_one"
+
+        return DummyConditionalBlocks()
+
+    def get_dummy_loop_block_pipe(self):
+        class DummyBlockOne:
+            _requirements = {"xyz": ">=0.8.0", "abc": ">=10.0.0"}
+
+        class DummyBlockTwo:
+            _requirements = {"transformers": ">=4.44.0", "diffusers": ">=0.2.0"}
+
+        return LoopSequentialPipelineBlocks.from_blocks_dict({"block_one": DummyBlockOne, "block_two": DummyBlockTwo})
+
+    def test_sequential_block_requirements_save_load(self, tmp_path):
+        pipe = self.get_dummy_block_pipe()
+        pipe.save_pretrained(str(tmp_path))
+
+        config_path = tmp_path / "modular_config.json"
+
+        with open(config_path, "r") as f:
+            config = json.load(f)
+
+        assert "requirements" in config
+        requirements = config["requirements"]
+
+        expected_requirements = {
+            "xyz": ">=0.8.0",
+            "abc": ">=10.0.0",
+            "transformers": ">=4.44.0",
+            "diffusers": ">=0.2.0",
+        }
+        assert expected_requirements == requirements
+
+    def test_sequential_block_requirements_warnings(self, tmp_path):
+        pipe = self.get_dummy_block_pipe()
+
+        logger = logging.get_logger("diffusers.modular_pipelines.modular_pipeline_utils")
+        logger.setLevel(30)
+
+        with CaptureLogger(logger) as cap_logger:
+            pipe.save_pretrained(str(tmp_path))
+
+        template = "{req} was specified in the requirements but wasn't found in the current environment"
+        msg_xyz = template.format(req="xyz")
+        msg_abc = template.format(req="abc")
+        assert msg_xyz in str(cap_logger.out)
+        assert msg_abc in str(cap_logger.out)
+
+    def test_conditional_block_requirements_save_load(self, tmp_path):
+        pipe = self.get_dummy_conditional_block_pipe()
+        pipe.save_pretrained(str(tmp_path))
+
+        config_path = tmp_path / "modular_config.json"
+        with open(config_path, "r") as f:
+            config = json.load(f)
+
+        assert "requirements" in config
+        expected_requirements = {
+            "xyz": ">=0.8.0",
+            "abc": ">=10.0.0",
+            "transformers": ">=4.44.0",
+            "diffusers": ">=0.2.0",
+        }
+        assert expected_requirements == config["requirements"]
+
+    def test_loop_block_requirements_save_load(self, tmp_path):
+        pipe = self.get_dummy_loop_block_pipe()
+        pipe.save_pretrained(str(tmp_path))
+
+        config_path = tmp_path / "modular_config.json"
+        with open(config_path, "r") as f:
+            config = json.load(f)
+
+        assert "requirements" in config
+        expected_requirements = {
+            "xyz": ">=0.8.0",
+            "abc": ">=10.0.0",
+            "transformers": ">=4.44.0",
+            "diffusers": ">=0.2.0",
+        }
+        assert expected_requirements == config["requirements"]
+
+
@slow
@nightly
@require_torch
Author	SHA1	Message	Date
sayakpaul	585aa6667f	Revert "another fix." This reverts commit `ab07b603ab`.	2026-03-20 15:35:53 +05:30
sayakpaul	ab07b603ab	another fix.	2026-03-20 15:35:18 +05:30
sayakpaul	7601432849	resolve conflicts.	2026-03-20 12:33:51 +05:30
Sayak Paul	2a5f136142	Merge branch 'main' into tests-conditional-pipeline-blocks	2026-03-17 16:16:31 +05:30
Sayak Paul	4ade16db58	Merge branch 'main' into tests-conditional-pipeline-blocks	2026-03-12 11:19:09 +05:30
sayakpaul	58c304595d	remove	2026-03-10 18:25:02 +05:30
sayakpaul	55c563281a	implement test suite for conditional blocks.	2026-03-10 18:24:49 +05:30