update

2026-02-13 06:15:31 +08:00 · 2026-02-10 13:24:17 +01:00 · 2026-02-10 09:45:06 +01:00 · 2026-02-06 13:08:28 +01:00 · 2026-02-06 12:06:55 +01:00 · 2026-02-06 12:00:14 +01:00
11 changed files with 738 additions and 138 deletions
--- a/src/diffusers/models/transformers/transformer_chronoedit.py
+++ b/src/diffusers/models/transformers/transformer_chronoedit.py
@@ -43,7 +43,7 @@ def _get_qkv_projections(attn: "WanAttention", hidden_states: torch.Tensor, enco
        encoder_hidden_states = hidden_states
    if attn.fused_projections:
-        if attn.cross_attention_dim_head is None:
+        if not attn.is_cross_attention:
            # In self-attention layers, we can fuse the entire QKV projection into a single linear
            query, key, value = attn.to_qkv(hidden_states).chunk(3, dim=-1)
        else:
@@ -219,7 +219,10 @@ class WanAttention(torch.nn.Module, AttentionModuleMixin):
            self.add_v_proj = torch.nn.Linear(added_kv_proj_dim, self.inner_dim, bias=True)
            self.norm_added_k = torch.nn.RMSNorm(dim_head * heads, eps=eps)
-        self.is_cross_attention = cross_attention_dim_head is not None
+        if is_cross_attention is not None:
            self.is_cross_attention = is_cross_attention
        else:
            self.is_cross_attention = cross_attention_dim_head is not None
        self.set_processor(processor)
@@ -227,7 +230,7 @@ class WanAttention(torch.nn.Module, AttentionModuleMixin):
        if getattr(self, "fused_projections", False):
            return
-        if self.cross_attention_dim_head is None:
+        if not self.is_cross_attention:
            concatenated_weights = torch.cat([self.to_q.weight.data, self.to_k.weight.data, self.to_v.weight.data])
            concatenated_bias = torch.cat([self.to_q.bias.data, self.to_k.bias.data, self.to_v.bias.data])
            out_features, in_features = concatenated_weights.shape
--- a/src/diffusers/models/transformers/transformer_wan.py
+++ b/src/diffusers/models/transformers/transformer_wan.py
@@ -42,7 +42,7 @@ def _get_qkv_projections(attn: "WanAttention", hidden_states: torch.Tensor, enco
        encoder_hidden_states = hidden_states
    if attn.fused_projections:
-        if attn.cross_attention_dim_head is None:
+        if not attn.is_cross_attention:
            # In self-attention layers, we can fuse the entire QKV projection into a single linear
            query, key, value = attn.to_qkv(hidden_states).chunk(3, dim=-1)
        else:
@@ -214,7 +214,10 @@ class WanAttention(torch.nn.Module, AttentionModuleMixin):
            self.add_v_proj = torch.nn.Linear(added_kv_proj_dim, self.inner_dim, bias=True)
            self.norm_added_k = torch.nn.RMSNorm(dim_head * heads, eps=eps)
-        self.is_cross_attention = cross_attention_dim_head is not None
+        if is_cross_attention is not None:
            self.is_cross_attention = is_cross_attention
        else:
            self.is_cross_attention = cross_attention_dim_head is not None
        self.set_processor(processor)
@@ -222,7 +225,7 @@ class WanAttention(torch.nn.Module, AttentionModuleMixin):
        if getattr(self, "fused_projections", False):
            return
-        if self.cross_attention_dim_head is None:
+        if not self.is_cross_attention:
            concatenated_weights = torch.cat([self.to_q.weight.data, self.to_k.weight.data, self.to_v.weight.data])
            concatenated_bias = torch.cat([self.to_q.bias.data, self.to_k.bias.data, self.to_v.bias.data])
            out_features, in_features = concatenated_weights.shape
--- a/src/diffusers/models/transformers/transformer_wan_animate.py
+++ b/src/diffusers/models/transformers/transformer_wan_animate.py
@@ -54,7 +54,7 @@ def _get_qkv_projections(attn: "WanAttention", hidden_states: torch.Tensor, enco
        encoder_hidden_states = hidden_states
    if attn.fused_projections:
-        if attn.cross_attention_dim_head is None:
+        if not attn.is_cross_attention:
            # In self-attention layers, we can fuse the entire QKV projection into a single linear
            query, key, value = attn.to_qkv(hidden_states).chunk(3, dim=-1)
        else:
@@ -502,13 +502,16 @@ class WanAnimateFaceBlockCrossAttention(nn.Module, AttentionModuleMixin):
        dim_head: int = 64,
        eps: float = 1e-6,
        cross_attention_dim_head: Optional[int] = None,
        bias: bool = True,
        processor=None,
    ):
        super().__init__()
        self.inner_dim = dim_head * heads
        self.heads = heads
-        self.cross_attention_head_dim = cross_attention_dim_head
+        self.cross_attention_dim_head = cross_attention_dim_head
        self.kv_inner_dim = self.inner_dim if cross_attention_dim_head is None else cross_attention_dim_head * heads
        self.use_bias = bias
        self.is_cross_attention = cross_attention_dim_head is not None
        # 1. Pre-Attention Norms for the hidden_states (video latents) and encoder_hidden_states (motion vector).
        # NOTE: this is not used in "vanilla" WanAttention
@@ -516,10 +519,10 @@ class WanAnimateFaceBlockCrossAttention(nn.Module, AttentionModuleMixin):
        self.pre_norm_kv = nn.LayerNorm(dim, eps, elementwise_affine=False)
        # 2. QKV and Output Projections
-        self.to_q = torch.nn.Linear(dim, self.inner_dim, bias=True)
+        self.to_q = torch.nn.Linear(dim, self.inner_dim, bias=bias)
-        self.to_k = torch.nn.Linear(dim, self.kv_inner_dim, bias=True)
+        self.to_k = torch.nn.Linear(dim, self.kv_inner_dim, bias=bias)
-        self.to_v = torch.nn.Linear(dim, self.kv_inner_dim, bias=True)
+        self.to_v = torch.nn.Linear(dim, self.kv_inner_dim, bias=bias)
-        self.to_out = torch.nn.Linear(self.inner_dim, dim, bias=True)
+        self.to_out = torch.nn.Linear(self.inner_dim, dim, bias=bias)
        # 3. QK Norm
        # NOTE: this is applied after the reshape, so only over dim_head rather than dim_head * heads
@@ -682,7 +685,10 @@ class WanAttention(torch.nn.Module, AttentionModuleMixin):
            self.add_v_proj = torch.nn.Linear(added_kv_proj_dim, self.inner_dim, bias=True)
            self.norm_added_k = torch.nn.RMSNorm(dim_head * heads, eps=eps)
-        self.is_cross_attention = cross_attention_dim_head is not None
+        if is_cross_attention is not None:
            self.is_cross_attention = is_cross_attention
        else:
            self.is_cross_attention = cross_attention_dim_head is not None
        self.set_processor(processor)
@@ -690,7 +696,7 @@ class WanAttention(torch.nn.Module, AttentionModuleMixin):
        if getattr(self, "fused_projections", False):
            return
-        if self.cross_attention_dim_head is None:
+        if not self.is_cross_attention:
            concatenated_weights = torch.cat([self.to_q.weight.data, self.to_k.weight.data, self.to_v.weight.data])
            concatenated_bias = torch.cat([self.to_q.bias.data, self.to_k.bias.data, self.to_v.bias.data])
            out_features, in_features = concatenated_weights.shape
--- a/src/diffusers/models/transformers/transformer_wan_vace.py
+++ b/src/diffusers/models/transformers/transformer_wan_vace.py
@@ -76,6 +76,7 @@ class WanVACETransformerBlock(nn.Module):
            eps=eps,
            added_kv_proj_dim=added_kv_proj_dim,
            processor=WanAttnProcessor(),
            is_cross_attention=True,
        )
        self.norm2 = FP32LayerNorm(dim, eps, elementwise_affine=True) if cross_attn_norm else nn.Identity()
@@ -178,6 +179,7 @@ class WanVACETransformer3DModel(
    _no_split_modules = ["WanTransformerBlock", "WanVACETransformerBlock"]
    _keep_in_fp32_modules = ["time_embedder", "scale_shift_table", "norm1", "norm2", "norm3"]
    _keys_to_ignore_on_load_unexpected = ["norm_added_q"]
    _repeated_blocks = ["WanTransformerBlock", "WanVACETransformerBlock"]
    @register_to_config
    def __init__(
--- a/src/diffusers/quantizers/gguf/gguf_quantizer.py
+++ b/src/diffusers/quantizers/gguf/gguf_quantizer.py
@@ -41,7 +41,7 @@ class GGUFQuantizer(DiffusersQuantizer):
        self.compute_dtype = quantization_config.compute_dtype
        self.pre_quantized = quantization_config.pre_quantized
-        self.modules_to_not_convert = quantization_config.modules_to_not_convert
+        self.modules_to_not_convert = quantization_config.modules_to_not_convert or []
        if not isinstance(self.modules_to_not_convert, list):
            self.modules_to_not_convert = [self.modules_to_not_convert]
--- a/tests/models/testing_utils/common.py
+++ b/tests/models/testing_utils/common.py
@@ -446,16 +446,17 @@ class ModelTesterMixin:
        torch_device not in ["cuda", "xpu"],
        reason="float16 and bfloat16 can only be used with an accelerator",
    )
-    def test_keep_in_fp32_modules(self):
+    def test_keep_in_fp32_modules(self, tmp_path):
        model = self.model_class(**self.get_init_dict())
        fp32_modules = model._keep_in_fp32_modules
        if fp32_modules is None or len(fp32_modules) == 0:
            pytest.skip("Model does not have _keep_in_fp32_modules defined.")
-        # Test with float16
+        # Save the model and reload with float16 dtype
-        model.to(torch_device)
+        # _keep_in_fp32_modules is only enforced during from_pretrained loading
-        model.to(torch.float16)
+        model.save_pretrained(tmp_path)
        model = self.model_class.from_pretrained(tmp_path, torch_dtype=torch.float16).to(torch_device)
        for name, param in model.named_parameters():
            if any(module_to_keep_in_fp32 in name.split(".") for module_to_keep_in_fp32 in fp32_modules):
@@ -470,7 +471,7 @@ class ModelTesterMixin:
    )
    @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16], ids=["fp16", "bf16"])
    @torch.no_grad()
-    def test_from_save_pretrained_dtype_inference(self, tmp_path, dtype):
+    def test_from_save_pretrained_dtype_inference(self, tmp_path, dtype, atol=1e-4, rtol=0):
        model = self.model_class(**self.get_init_dict())
        model.to(torch_device)
        fp32_modules = model._keep_in_fp32_modules or []
@@ -490,10 +491,6 @@ class ModelTesterMixin:
        output = model(**inputs, return_dict=False)[0]
        output_loaded = model_loaded(**inputs, return_dict=False)[0]
        self._check_dtype_inference_output(output, output_loaded, dtype)
    def _check_dtype_inference_output(self, output, output_loaded, dtype, atol=1e-4, rtol=0):
        """Check dtype inference output with configurable tolerance."""
        assert_tensors_close(
            output, output_loaded, atol=atol, rtol=rtol, msg=f"Loaded model output differs for {dtype}"
        )
--- a/tests/models/testing_utils/quantization.py
+++ b/tests/models/testing_utils/quantization.py
@@ -176,15 +176,7 @@ class QuantizationTesterMixin:
        model_quantized = self._create_quantized_model(config_kwargs)
        model_quantized.to(torch_device)
        # Get model dtype from first parameter
        model_dtype = next(model_quantized.parameters()).dtype
        inputs = self.get_dummy_inputs()
        # Cast inputs to model dtype
        inputs = {
            k: v.to(model_dtype) if isinstance(v, torch.Tensor) and v.is_floating_point() else v
            for k, v in inputs.items()
        }
        output = model_quantized(**inputs, return_dict=False)[0]
        assert output is not None, "Model output is None"
@@ -229,6 +221,8 @@ class QuantizationTesterMixin:
            init_lora_weights=False,
        )
        model.add_adapter(lora_config)
        # Move LoRA adapter weights to device (they default to CPU)
        model.to(torch_device)
        inputs = self.get_dummy_inputs()
        output = model(**inputs, return_dict=False)[0]
@@ -1021,9 +1015,6 @@ class GGUFTesterMixin(GGUFConfigMixin, QuantizationTesterMixin):
        """Test that dequantize() works correctly."""
        self._test_dequantize({"compute_dtype": torch.bfloat16})
    def test_gguf_quantized_layers(self):
        self._test_quantized_layers({"compute_dtype": torch.bfloat16})
@is_quantization
@is_modelopt
--- a/tests/models/transformers/test_models_transformer_wan.py
+++ b/tests/models/transformers/test_models_transformer_wan.py
@@ -12,57 +12,57 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import unittest
+import pytest
 import torch
 from diffusers import WanTransformer3DModel
 from diffusers.utils.torch_utils import randn_tensor
-from ...testing_utils import (
+from ...testing_utils import enable_full_determinism, torch_device
-    enable_full_determinism,
+from ..testing_utils import (
-    torch_device,
+    AttentionTesterMixin,
    BaseModelTesterConfig,
    BitsAndBytesTesterMixin,
    GGUFCompileTesterMixin,
    GGUFTesterMixin,
    MemoryTesterMixin,
    ModelTesterMixin,
    TorchAoTesterMixin,
    TorchCompileTesterMixin,
    TrainingTesterMixin,
 )
 from ..test_modeling_common import ModelTesterMixin, TorchCompileTesterMixin
 enable_full_determinism()
-class WanTransformer3DTests(ModelTesterMixin, unittest.TestCase):
+class WanTransformer3DTesterConfig(BaseModelTesterConfig):
-    model_class = WanTransformer3DModel
+    @property
-    main_input_name = "hidden_states"
+    def model_class(self):
-    uses_custom_attn_processor = True
+        return WanTransformer3DModel
    @property
-    def dummy_input(self):
+    def pretrained_model_name_or_path(self):
-        batch_size = 1
+        return "hf-internal-testing/tiny-wan22-transformer"
        num_channels = 4
        num_frames = 2
        height = 16
        width = 16
        text_encoder_embedding_dim = 16
        sequence_length = 12
-        hidden_states = torch.randn((batch_size, num_channels, num_frames, height, width)).to(torch_device)
+    @property
-        timestep = torch.randint(0, 1000, size=(batch_size,)).to(torch_device)
+    def output_shape(self) -> tuple[int, ...]:
-        encoder_hidden_states = torch.randn((batch_size, sequence_length, text_encoder_embedding_dim)).to(torch_device)
+        return (4, 2, 16, 16)
    @property
    def input_shape(self) -> tuple[int, ...]:
        return (4, 2, 16, 16)
    @property
    def main_input_name(self) -> str:
        return "hidden_states"
    @property
    def generator(self):
        return torch.Generator("cpu").manual_seed(0)
    def get_init_dict(self) -> dict[str, int | list[int] | tuple | str | bool]:
        return {
            "hidden_states": hidden_states,
            "encoder_hidden_states": encoder_hidden_states,
            "timestep": timestep,
        }
    @property
    def input_shape(self):
        return (4, 1, 16, 16)
    @property
    def output_shape(self):
        return (4, 1, 16, 16)
    def prepare_init_args_and_inputs_for_common(self):
        init_dict = {
            "patch_size": (1, 2, 2),
            "num_attention_heads": 2,
            "attention_head_dim": 12,
@@ -76,16 +76,160 @@ class WanTransformer3DTests(ModelTesterMixin, unittest.TestCase):
            "qk_norm": "rms_norm_across_heads",
            "rope_max_seq_len": 32,
        }
-        inputs_dict = self.dummy_input
+
-        return init_dict, inputs_dict
+    def get_dummy_inputs(self) -> dict[str, torch.Tensor]:
        batch_size = 1
        num_channels = 4
        num_frames = 2
        height = 16
        width = 16
        text_encoder_embedding_dim = 16
        sequence_length = 12
        return {
            "hidden_states": randn_tensor(
                (batch_size, num_channels, num_frames, height, width),
                generator=self.generator,
                device=torch_device,
            ),
            "encoder_hidden_states": randn_tensor(
                (batch_size, sequence_length, text_encoder_embedding_dim),
                generator=self.generator,
                device=torch_device,
            ),
            "timestep": torch.randint(0, 1000, size=(batch_size,), generator=self.generator).to(torch_device),
        }
 class TestWanTransformer3D(WanTransformer3DTesterConfig, ModelTesterMixin):
    """Core model tests for Wan Transformer 3D."""
    @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16], ids=["fp16", "bf16"])
    def test_from_save_pretrained_dtype_inference(self, tmp_path, dtype):
        # Skip: fp16/bf16 require very high atol to pass, providing little signal.
        # Dtype preservation is already tested by test_from_save_pretrained_dtype and test_keep_in_fp32_modules.
        pytest.skip("Tolerance requirements too high for meaningful test")
 class TestWanTransformer3DMemory(WanTransformer3DTesterConfig, MemoryTesterMixin):
    """Memory optimization tests for Wan Transformer 3D."""
 class TestWanTransformer3DTraining(WanTransformer3DTesterConfig, TrainingTesterMixin):
    """Training tests for Wan Transformer 3D."""
    def test_gradient_checkpointing_is_applied(self):
        expected_set = {"WanTransformer3DModel"}
        super().test_gradient_checkpointing_is_applied(expected_set=expected_set)
-class WanTransformerCompileTests(TorchCompileTesterMixin, unittest.TestCase):
+class TestWanTransformer3DAttention(WanTransformer3DTesterConfig, AttentionTesterMixin):
-    model_class = WanTransformer3DModel
+    """Attention processor tests for Wan Transformer 3D."""
-    def prepare_init_args_and_inputs_for_common(self):
+
-        return WanTransformer3DTests().prepare_init_args_and_inputs_for_common()
+class TestWanTransformer3DCompile(WanTransformer3DTesterConfig, TorchCompileTesterMixin):
    """Torch compile tests for Wan Transformer 3D."""
 class TestWanTransformer3DBitsAndBytes(WanTransformer3DTesterConfig, BitsAndBytesTesterMixin):
    """BitsAndBytes quantization tests for Wan Transformer 3D."""
    @property
    def torch_dtype(self):
        return torch.float16
    def get_dummy_inputs(self):
        """Override to provide inputs matching the tiny Wan model dimensions."""
        return {
            "hidden_states": randn_tensor(
                (1, 36, 2, 64, 64), generator=self.generator, device=torch_device, dtype=self.torch_dtype
            ),
            "encoder_hidden_states": randn_tensor(
                (1, 512, 4096), generator=self.generator, device=torch_device, dtype=self.torch_dtype
            ),
            "timestep": torch.tensor([1.0]).to(torch_device, self.torch_dtype),
        }
 class TestWanTransformer3DTorchAo(WanTransformer3DTesterConfig, TorchAoTesterMixin):
    """TorchAO quantization tests for Wan Transformer 3D."""
    @property
    def torch_dtype(self):
        return torch.bfloat16
    def get_dummy_inputs(self):
        """Override to provide inputs matching the tiny Wan model dimensions."""
        return {
            "hidden_states": randn_tensor(
                (1, 36, 2, 64, 64), generator=self.generator, device=torch_device, dtype=self.torch_dtype
            ),
            "encoder_hidden_states": randn_tensor(
                (1, 512, 4096), generator=self.generator, device=torch_device, dtype=self.torch_dtype
            ),
            "timestep": torch.tensor([1.0]).to(torch_device, self.torch_dtype),
        }
 class TestWanTransformer3DGGUF(WanTransformer3DTesterConfig, GGUFTesterMixin):
    """GGUF quantization tests for Wan Transformer 3D."""
    @property
    def gguf_filename(self):
        return "https://huggingface.co/QuantStack/Wan2.2-I2V-A14B-GGUF/blob/main/LowNoise/Wan2.2-I2V-A14B-LowNoise-Q2_K.gguf"
    @property
    def torch_dtype(self):
        return torch.bfloat16
    def _create_quantized_model(self, config_kwargs=None, **extra_kwargs):
        return super()._create_quantized_model(
            config_kwargs, config="Wan-AI/Wan2.2-I2V-A14B-Diffusers", subfolder="transformer", **extra_kwargs
        )
    def get_dummy_inputs(self):
        """Override to provide inputs matching the real Wan I2V model dimensions.
        Wan 2.2 I2V: in_channels=36, text_dim=4096
        """
        return {
            "hidden_states": randn_tensor(
                (1, 36, 2, 64, 64), generator=self.generator, device=torch_device, dtype=self.torch_dtype
            ),
            "encoder_hidden_states": randn_tensor(
                (1, 512, 4096), generator=self.generator, device=torch_device, dtype=self.torch_dtype
            ),
            "timestep": torch.tensor([1.0]).to(torch_device, self.torch_dtype),
        }
 class TestWanTransformer3DGGUFCompile(WanTransformer3DTesterConfig, GGUFCompileTesterMixin):
    """GGUF + compile tests for Wan Transformer 3D."""
    @property
    def gguf_filename(self):
        return "https://huggingface.co/QuantStack/Wan2.2-I2V-A14B-GGUF/blob/main/LowNoise/Wan2.2-I2V-A14B-LowNoise-Q2_K.gguf"
    @property
    def torch_dtype(self):
        return torch.bfloat16
    def _create_quantized_model(self, config_kwargs=None, **extra_kwargs):
        return super()._create_quantized_model(
            config_kwargs, config="Wan-AI/Wan2.2-I2V-A14B-Diffusers", subfolder="transformer", **extra_kwargs
        )
    def get_dummy_inputs(self):
        """Override to provide inputs matching the real Wan I2V model dimensions.
        Wan 2.2 I2V: in_channels=36, text_dim=4096
        """
        return {
            "hidden_states": randn_tensor(
                (1, 36, 2, 64, 64), generator=self.generator, device=torch_device, dtype=self.torch_dtype
            ),
            "encoder_hidden_states": randn_tensor(
                (1, 512, 4096), generator=self.generator, device=torch_device, dtype=self.torch_dtype
            ),
            "timestep": torch.tensor([1.0]).to(torch_device, self.torch_dtype),
        }
--- a/tests/models/transformers/test_models_transformer_wan_animate.py
+++ b/tests/models/transformers/test_models_transformer_wan_animate.py
@@ -12,76 +12,62 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import unittest
+import pytest
 import torch
 from diffusers import WanAnimateTransformer3DModel
 from diffusers.utils.torch_utils import randn_tensor
-from ...testing_utils import (
+from ...testing_utils import enable_full_determinism, torch_device
-    enable_full_determinism,
+from ..testing_utils import (
-    torch_device,
+    AttentionTesterMixin,
    BaseModelTesterConfig,
    BitsAndBytesTesterMixin,
    GGUFCompileTesterMixin,
    GGUFTesterMixin,
    MemoryTesterMixin,
    ModelTesterMixin,
    TorchAoTesterMixin,
    TorchCompileTesterMixin,
    TrainingTesterMixin,
 )
 from ..test_modeling_common import ModelTesterMixin, TorchCompileTesterMixin
 enable_full_determinism()
-class WanAnimateTransformer3DTests(ModelTesterMixin, unittest.TestCase):
+class WanAnimateTransformer3DTesterConfig(BaseModelTesterConfig):
-    model_class = WanAnimateTransformer3DModel
+    @property
-    main_input_name = "hidden_states"
+    def model_class(self):
-    uses_custom_attn_processor = True
+        return WanAnimateTransformer3DModel
    @property
-    def dummy_input(self):
+    def pretrained_model_name_or_path(self):
-        batch_size = 1
+        return "hf-internal-testing/tiny-wan-animate-transformer"
        num_channels = 4
        num_frames = 20  # To make the shapes work out; for complicated reasons we want 21 to divide num_frames + 1
        height = 16
        width = 16
        text_encoder_embedding_dim = 16
        sequence_length = 12
        clip_seq_len = 12
        clip_dim = 16
        inference_segment_length = 77  # The inference segment length in the full Wan2.2-Animate-14B model
        face_height = 16  # Should be square and match `motion_encoder_size` below
        face_width = 16
        hidden_states = torch.randn((batch_size, 2 * num_channels + 4, num_frames + 1, height, width)).to(torch_device)
        timestep = torch.randint(0, 1000, size=(batch_size,)).to(torch_device)
        encoder_hidden_states = torch.randn((batch_size, sequence_length, text_encoder_embedding_dim)).to(torch_device)
        clip_ref_features = torch.randn((batch_size, clip_seq_len, clip_dim)).to(torch_device)
        pose_latents = torch.randn((batch_size, num_channels, num_frames, height, width)).to(torch_device)
        face_pixel_values = torch.randn((batch_size, 3, inference_segment_length, face_height, face_width)).to(
            torch_device
        )
        return {
            "hidden_states": hidden_states,
            "timestep": timestep,
            "encoder_hidden_states": encoder_hidden_states,
            "encoder_hidden_states_image": clip_ref_features,
            "pose_hidden_states": pose_latents,
            "face_pixel_values": face_pixel_values,
        }
    @property
-    def input_shape(self):
+    def output_shape(self) -> tuple[int, ...]:
-        return (12, 1, 16, 16)
+        # Output has fewer channels than input (4 vs 12)
        return (4, 21, 16, 16)
    @property
-    def output_shape(self):
+    def input_shape(self) -> tuple[int, ...]:
-        return (4, 1, 16, 16)
+        return (12, 21, 16, 16)
-    def prepare_init_args_and_inputs_for_common(self):
+    @property
    def main_input_name(self) -> str:
        return "hidden_states"
    @property
    def generator(self):
        return torch.Generator("cpu").manual_seed(0)
    def get_init_dict(self) -> dict[str, int | list[int] | tuple | str | bool | float | dict]:
        # Use custom channel sizes since the default Wan Animate channel sizes will cause the motion encoder to
        # contain the vast majority of the parameters in the test model
        channel_sizes = {"4": 16, "8": 16, "16": 16}
-        init_dict = {
+        return {
            "patch_size": (1, 2, 2),
            "num_attention_heads": 2,
            "attention_head_dim": 12,
@@ -105,22 +91,219 @@ class WanAnimateTransformer3DTests(ModelTesterMixin, unittest.TestCase):
            "face_encoder_num_heads": 2,
            "inject_face_latents_blocks": 2,
        }
-        inputs_dict = self.dummy_input
+
-        return init_dict, inputs_dict
+    def get_dummy_inputs(self) -> dict[str, torch.Tensor]:
        batch_size = 1
        num_channels = 4
        num_frames = 20  # To make the shapes work out; for complicated reasons we want 21 to divide num_frames + 1
        height = 16
        width = 16
        text_encoder_embedding_dim = 16
        sequence_length = 12
        clip_seq_len = 12
        clip_dim = 16
        inference_segment_length = 77  # The inference segment length in the full Wan2.2-Animate-14B model
        face_height = 16  # Should be square and match `motion_encoder_size`
        face_width = 16
        return {
            "hidden_states": randn_tensor(
                (batch_size, 2 * num_channels + 4, num_frames + 1, height, width),
                generator=self.generator,
                device=torch_device,
            ),
            "timestep": torch.randint(0, 1000, size=(batch_size,), generator=self.generator).to(torch_device),
            "encoder_hidden_states": randn_tensor(
                (batch_size, sequence_length, text_encoder_embedding_dim),
                generator=self.generator,
                device=torch_device,
            ),
            "encoder_hidden_states_image": randn_tensor(
                (batch_size, clip_seq_len, clip_dim),
                generator=self.generator,
                device=torch_device,
            ),
            "pose_hidden_states": randn_tensor(
                (batch_size, num_channels, num_frames, height, width),
                generator=self.generator,
                device=torch_device,
            ),
            "face_pixel_values": randn_tensor(
                (batch_size, 3, inference_segment_length, face_height, face_width),
                generator=self.generator,
                device=torch_device,
            ),
        }
 class TestWanAnimateTransformer3D(WanAnimateTransformer3DTesterConfig, ModelTesterMixin):
    """Core model tests for Wan Animate Transformer 3D."""
    def test_output(self):
        # Override test_output because the transformer output is expected to have less channels
        # than the main transformer input.
        expected_output_shape = (1, 4, 21, 16, 16)
        super().test_output(expected_output_shape=expected_output_shape)
    @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16], ids=["fp16", "bf16"])
    def test_from_save_pretrained_dtype_inference(self, tmp_path, dtype):
        # Skip: fp16/bf16 require very high atol (~1e-2) to pass, providing little signal.
        # Dtype preservation is already tested by test_from_save_pretrained_dtype and test_keep_in_fp32_modules.
        pytest.skip("Tolerance requirements too high for meaningful test")
 class TestWanAnimateTransformer3DMemory(WanAnimateTransformer3DTesterConfig, MemoryTesterMixin):
    """Memory optimization tests for Wan Animate Transformer 3D."""
 class TestWanAnimateTransformer3DTraining(WanAnimateTransformer3DTesterConfig, TrainingTesterMixin):
    """Training tests for Wan Animate Transformer 3D."""
    def test_gradient_checkpointing_is_applied(self):
        expected_set = {"WanAnimateTransformer3DModel"}
        super().test_gradient_checkpointing_is_applied(expected_set=expected_set)
-    # Override test_output because the transformer output is expected to have less channels than the main transformer
+
-    # input.
+class TestWanAnimateTransformer3DAttention(WanAnimateTransformer3DTesterConfig, AttentionTesterMixin):
-    def test_output(self):
+    """Attention processor tests for Wan Animate Transformer 3D."""
        expected_output_shape = (1, 4, 21, 16, 16)
        super().test_output(expected_output_shape=expected_output_shape)
-class WanAnimateTransformerCompileTests(TorchCompileTesterMixin, unittest.TestCase):
+class TestWanAnimateTransformer3DCompile(WanAnimateTransformer3DTesterConfig, TorchCompileTesterMixin):
-    model_class = WanAnimateTransformer3DModel
+    """Torch compile tests for Wan Animate Transformer 3D."""
-    def prepare_init_args_and_inputs_for_common(self):
+    def test_torch_compile_recompilation_and_graph_break(self):
-        return WanAnimateTransformer3DTests().prepare_init_args_and_inputs_for_common()
+        # Skip: F.pad with mode="replicate" in WanAnimateFaceEncoder triggers importlib.import_module
        # internally, which dynamo doesn't support tracing through.
        pytest.skip("F.pad with replicate mode triggers unsupported import in torch.compile")
 class TestWanAnimateTransformer3DBitsAndBytes(WanAnimateTransformer3DTesterConfig, BitsAndBytesTesterMixin):
    """BitsAndBytes quantization tests for Wan Animate Transformer 3D."""
    @property
    def torch_dtype(self):
        return torch.float16
    def get_dummy_inputs(self):
        """Override to provide inputs matching the tiny Wan Animate model dimensions."""
        return {
            "hidden_states": randn_tensor(
                (1, 36, 21, 64, 64), generator=self.generator, device=torch_device, dtype=self.torch_dtype
            ),
            "encoder_hidden_states": randn_tensor(
                (1, 512, 4096), generator=self.generator, device=torch_device, dtype=self.torch_dtype
            ),
            "encoder_hidden_states_image": randn_tensor(
                (1, 257, 1280), generator=self.generator, device=torch_device, dtype=self.torch_dtype
            ),
            "pose_hidden_states": randn_tensor(
                (1, 16, 20, 64, 64), generator=self.generator, device=torch_device, dtype=self.torch_dtype
            ),
            "face_pixel_values": randn_tensor(
                (1, 3, 77, 512, 512), generator=self.generator, device=torch_device, dtype=self.torch_dtype
            ),
            "timestep": torch.tensor([1.0]).to(torch_device, self.torch_dtype),
        }
 class TestWanAnimateTransformer3DTorchAo(WanAnimateTransformer3DTesterConfig, TorchAoTesterMixin):
    """TorchAO quantization tests for Wan Animate Transformer 3D."""
    @property
    def torch_dtype(self):
        return torch.bfloat16
    def get_dummy_inputs(self):
        """Override to provide inputs matching the tiny Wan Animate model dimensions."""
        return {
            "hidden_states": randn_tensor(
                (1, 36, 21, 64, 64), generator=self.generator, device=torch_device, dtype=self.torch_dtype
            ),
            "encoder_hidden_states": randn_tensor(
                (1, 512, 4096), generator=self.generator, device=torch_device, dtype=self.torch_dtype
            ),
            "encoder_hidden_states_image": randn_tensor(
                (1, 257, 1280), generator=self.generator, device=torch_device, dtype=self.torch_dtype
            ),
            "pose_hidden_states": randn_tensor(
                (1, 16, 20, 64, 64), generator=self.generator, device=torch_device, dtype=self.torch_dtype
            ),
            "face_pixel_values": randn_tensor(
                (1, 3, 77, 512, 512), generator=self.generator, device=torch_device, dtype=self.torch_dtype
            ),
            "timestep": torch.tensor([1.0]).to(torch_device, self.torch_dtype),
        }
 class TestWanAnimateTransformer3DGGUF(WanAnimateTransformer3DTesterConfig, GGUFTesterMixin):
    """GGUF quantization tests for Wan Animate Transformer 3D."""
    @property
    def gguf_filename(self):
        return "https://huggingface.co/QuantStack/Wan2.2-Animate-14B-GGUF/blob/main/Wan2.2-Animate-14B-Q2_K.gguf"
    @property
    def torch_dtype(self):
        return torch.bfloat16
    def get_dummy_inputs(self):
        """Override to provide inputs matching the real Wan Animate model dimensions.
        Wan 2.2 Animate: in_channels=36 (2*16+4), text_dim=4096, image_dim=1280
        """
        return {
            "hidden_states": randn_tensor(
                (1, 36, 21, 64, 64), generator=self.generator, device=torch_device, dtype=self.torch_dtype
            ),
            "encoder_hidden_states": randn_tensor(
                (1, 512, 4096), generator=self.generator, device=torch_device, dtype=self.torch_dtype
            ),
            "encoder_hidden_states_image": randn_tensor(
                (1, 257, 1280), generator=self.generator, device=torch_device, dtype=self.torch_dtype
            ),
            "pose_hidden_states": randn_tensor(
                (1, 16, 20, 64, 64), generator=self.generator, device=torch_device, dtype=self.torch_dtype
            ),
            "face_pixel_values": randn_tensor(
                (1, 3, 77, 512, 512), generator=self.generator, device=torch_device, dtype=self.torch_dtype
            ),
            "timestep": torch.tensor([1.0]).to(torch_device, self.torch_dtype),
        }
 class TestWanAnimateTransformer3DGGUFCompile(WanAnimateTransformer3DTesterConfig, GGUFCompileTesterMixin):
    """GGUF + compile tests for Wan Animate Transformer 3D."""
    @property
    def gguf_filename(self):
        return "https://huggingface.co/QuantStack/Wan2.2-Animate-14B-GGUF/blob/main/Wan2.2-Animate-14B-Q2_K.gguf"
    @property
    def torch_dtype(self):
        return torch.bfloat16
    def get_dummy_inputs(self):
        """Override to provide inputs matching the real Wan Animate model dimensions.
        Wan 2.2 Animate: in_channels=36 (2*16+4), text_dim=4096, image_dim=1280
        """
        return {
            "hidden_states": randn_tensor(
                (1, 36, 21, 64, 64), generator=self.generator, device=torch_device, dtype=self.torch_dtype
            ),
            "encoder_hidden_states": randn_tensor(
                (1, 512, 4096), generator=self.generator, device=torch_device, dtype=self.torch_dtype
            ),
            "encoder_hidden_states_image": randn_tensor(
                (1, 257, 1280), generator=self.generator, device=torch_device, dtype=self.torch_dtype
            ),
            "pose_hidden_states": randn_tensor(
                (1, 16, 20, 64, 64), generator=self.generator, device=torch_device, dtype=self.torch_dtype
            ),
            "face_pixel_values": randn_tensor(
                (1, 3, 77, 512, 512), generator=self.generator, device=torch_device, dtype=self.torch_dtype
            ),
            "timestep": torch.tensor([1.0]).to(torch_device, self.torch_dtype),
        }
--- a/tests/models/transformers/test_models_transformer_wan_vace.py
+++ b/tests/models/transformers/test_models_transformer_wan_vace.py
@@ -0,0 +1,271 @@
 # Copyright 2025 HuggingFace Inc.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import pytest
 import torch
 from diffusers import WanVACETransformer3DModel
 from diffusers.utils.torch_utils import randn_tensor
 from ...testing_utils import enable_full_determinism, torch_device
 from ..testing_utils import (
    AttentionTesterMixin,
    BaseModelTesterConfig,
    BitsAndBytesTesterMixin,
    GGUFCompileTesterMixin,
    GGUFTesterMixin,
    MemoryTesterMixin,
    ModelTesterMixin,
    TorchAoTesterMixin,
    TorchCompileTesterMixin,
    TrainingTesterMixin,
 )
 enable_full_determinism()
 class WanVACETransformer3DTesterConfig(BaseModelTesterConfig):
    @property
    def model_class(self):
        return WanVACETransformer3DModel
    @property
    def pretrained_model_name_or_path(self):
        return "hf-internal-testing/tiny-wan-vace-transformer"
    @property
    def output_shape(self) -> tuple[int, ...]:
        return (16, 2, 16, 16)
    @property
    def input_shape(self) -> tuple[int, ...]:
        return (16, 2, 16, 16)
    @property
    def main_input_name(self) -> str:
        return "hidden_states"
    @property
    def generator(self):
        return torch.Generator("cpu").manual_seed(0)
    def get_init_dict(self) -> dict[str, int | list[int] | tuple | str | bool | None]:
        return {
            "patch_size": (1, 2, 2),
            "num_attention_heads": 2,
            "attention_head_dim": 12,
            "in_channels": 16,
            "out_channels": 16,
            "text_dim": 32,
            "freq_dim": 256,
            "ffn_dim": 32,
            "num_layers": 4,
            "cross_attn_norm": True,
            "qk_norm": "rms_norm_across_heads",
            "rope_max_seq_len": 32,
            "vace_layers": [0, 2],
            "vace_in_channels": 48,  # 3 * in_channels = 3 * 16 = 48
        }
    def get_dummy_inputs(self) -> dict[str, torch.Tensor]:
        batch_size = 1
        num_channels = 16
        num_frames = 2
        height = 16
        width = 16
        text_encoder_embedding_dim = 32
        sequence_length = 12
        # VACE requires control_hidden_states with vace_in_channels (3 * in_channels)
        vace_in_channels = 48
        return {
            "hidden_states": randn_tensor(
                (batch_size, num_channels, num_frames, height, width),
                generator=self.generator,
                device=torch_device,
            ),
            "encoder_hidden_states": randn_tensor(
                (batch_size, sequence_length, text_encoder_embedding_dim),
                generator=self.generator,
                device=torch_device,
            ),
            "control_hidden_states": randn_tensor(
                (batch_size, vace_in_channels, num_frames, height, width),
                generator=self.generator,
                device=torch_device,
            ),
            "timestep": torch.randint(0, 1000, size=(batch_size,), generator=self.generator).to(torch_device),
        }
 class TestWanVACETransformer3D(WanVACETransformer3DTesterConfig, ModelTesterMixin):
    """Core model tests for Wan VACE Transformer 3D."""
    @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16], ids=["fp16", "bf16"])
    def test_from_save_pretrained_dtype_inference(self, tmp_path, dtype):
        # Skip: fp16/bf16 require very high atol to pass, providing little signal.
        # Dtype preservation is already tested by test_from_save_pretrained_dtype and test_keep_in_fp32_modules.
        pytest.skip("Tolerance requirements too high for meaningful test")
    def test_model_parallelism(self, tmp_path):
        # Skip: Device mismatch between cuda:0 and cuda:1 in VACE control flow
        pytest.skip("Model parallelism not yet supported for WanVACE")
 class TestWanVACETransformer3DMemory(WanVACETransformer3DTesterConfig, MemoryTesterMixin):
    """Memory optimization tests for Wan VACE Transformer 3D."""
 class TestWanVACETransformer3DTraining(WanVACETransformer3DTesterConfig, TrainingTesterMixin):
    """Training tests for Wan VACE Transformer 3D."""
    def test_gradient_checkpointing_is_applied(self):
        expected_set = {"WanVACETransformer3DModel"}
        super().test_gradient_checkpointing_is_applied(expected_set=expected_set)
 class TestWanVACETransformer3DAttention(WanVACETransformer3DTesterConfig, AttentionTesterMixin):
    """Attention processor tests for Wan VACE Transformer 3D."""
 class TestWanVACETransformer3DCompile(WanVACETransformer3DTesterConfig, TorchCompileTesterMixin):
    """Torch compile tests for Wan VACE Transformer 3D."""
    def test_torch_compile_repeated_blocks(self):
        # WanVACE has two block types (WanTransformerBlock and WanVACETransformerBlock),
        # so we need recompile_limit=2 instead of the default 1.
        import torch._dynamo
        import torch._inductor.utils
        init_dict = self.get_init_dict()
        inputs_dict = self.get_dummy_inputs()
        model = self.model_class(**init_dict).to(torch_device)
        model.eval()
        model.compile_repeated_blocks(fullgraph=True)
        with (
            torch._inductor.utils.fresh_inductor_cache(),
            torch._dynamo.config.patch(recompile_limit=2),
        ):
            _ = model(**inputs_dict)
            _ = model(**inputs_dict)
 class TestWanVACETransformer3DBitsAndBytes(WanVACETransformer3DTesterConfig, BitsAndBytesTesterMixin):
    """BitsAndBytes quantization tests for Wan VACE Transformer 3D."""
    @property
    def torch_dtype(self):
        return torch.float16
    def get_dummy_inputs(self):
        """Override to provide inputs matching the tiny Wan VACE model dimensions."""
        return {
            "hidden_states": randn_tensor(
                (1, 16, 2, 64, 64), generator=self.generator, device=torch_device, dtype=self.torch_dtype
            ),
            "encoder_hidden_states": randn_tensor(
                (1, 512, 4096), generator=self.generator, device=torch_device, dtype=self.torch_dtype
            ),
            "control_hidden_states": randn_tensor(
                (1, 96, 2, 64, 64), generator=self.generator, device=torch_device, dtype=self.torch_dtype
            ),
            "timestep": torch.tensor([1.0]).to(torch_device, self.torch_dtype),
        }
 class TestWanVACETransformer3DTorchAo(WanVACETransformer3DTesterConfig, TorchAoTesterMixin):
    """TorchAO quantization tests for Wan VACE Transformer 3D."""
    @property
    def torch_dtype(self):
        return torch.bfloat16
    def get_dummy_inputs(self):
        """Override to provide inputs matching the tiny Wan VACE model dimensions."""
        return {
            "hidden_states": randn_tensor(
                (1, 16, 2, 64, 64), generator=self.generator, device=torch_device, dtype=self.torch_dtype
            ),
            "encoder_hidden_states": randn_tensor(
                (1, 512, 4096), generator=self.generator, device=torch_device, dtype=self.torch_dtype
            ),
            "control_hidden_states": randn_tensor(
                (1, 96, 2, 64, 64), generator=self.generator, device=torch_device, dtype=self.torch_dtype
            ),
            "timestep": torch.tensor([1.0]).to(torch_device, self.torch_dtype),
        }
 class TestWanVACETransformer3DGGUF(WanVACETransformer3DTesterConfig, GGUFTesterMixin):
    """GGUF quantization tests for Wan VACE Transformer 3D."""
    @property
    def gguf_filename(self):
        return "https://huggingface.co/QuantStack/Wan2.1_14B_VACE-GGUF/blob/main/Wan2.1_14B_VACE-Q3_K_S.gguf"
    @property
    def torch_dtype(self):
        return torch.bfloat16
    def get_dummy_inputs(self):
        """Override to provide inputs matching the real Wan VACE model dimensions.
        Wan 2.1 VACE: in_channels=16, text_dim=4096, vace_in_channels=96
        """
        return {
            "hidden_states": randn_tensor(
                (1, 16, 2, 64, 64), generator=self.generator, device=torch_device, dtype=self.torch_dtype
            ),
            "encoder_hidden_states": randn_tensor(
                (1, 512, 4096), generator=self.generator, device=torch_device, dtype=self.torch_dtype
            ),
            "control_hidden_states": randn_tensor(
                (1, 96, 2, 64, 64), generator=self.generator, device=torch_device, dtype=self.torch_dtype
            ),
            "timestep": torch.tensor([1.0]).to(torch_device, self.torch_dtype),
        }
 class TestWanVACETransformer3DGGUFCompile(WanVACETransformer3DTesterConfig, GGUFCompileTesterMixin):
    """GGUF + compile tests for Wan VACE Transformer 3D."""
    @property
    def gguf_filename(self):
        return "https://huggingface.co/QuantStack/Wan2.1_14B_VACE-GGUF/blob/main/Wan2.1_14B_VACE-Q3_K_S.gguf"
    @property
    def torch_dtype(self):
        return torch.bfloat16
    def get_dummy_inputs(self):
        """Override to provide inputs matching the real Wan VACE model dimensions.
        Wan 2.1 VACE: in_channels=16, text_dim=4096, vace_in_channels=96
        """
        return {
            "hidden_states": randn_tensor(
                (1, 16, 2, 64, 64), generator=self.generator, device=torch_device, dtype=self.torch_dtype
            ),
            "encoder_hidden_states": randn_tensor(
                (1, 512, 4096), generator=self.generator, device=torch_device, dtype=self.torch_dtype
            ),
            "control_hidden_states": randn_tensor(
                (1, 96, 2, 64, 64), generator=self.generator, device=torch_device, dtype=self.torch_dtype
            ),
            "timestep": torch.tensor([1.0]).to(torch_device, self.torch_dtype),
        }
--- a/tests/testing_utils.py
+++ b/tests/testing_utils.py
@@ -168,7 +168,7 @@ def assert_tensors_close(
        max_diff = abs_diff.max().item()
        flat_idx = abs_diff.argmax().item()
-        max_idx = tuple(torch.unravel_index(torch.tensor(flat_idx), actual.shape).tolist())
+        max_idx = tuple(idx.item() for idx in torch.unravel_index(torch.tensor(flat_idx), actual.shape))
        threshold = atol + rtol * expected.abs()
        mismatched = (abs_diff > threshold).sum().item()
Author	SHA1	Message	Date
Dhruv Nair	e99d30d120	update	2026-02-10 13:24:17 +01:00
Dhruv Nair	933736495e	update	2026-02-10 09:45:06 +01:00
Dhruv Nair	41a26b75a7	update	2026-02-06 13:08:28 +01:00
Dhruv Nair	5776aed870	update	2026-02-06 12:06:55 +01:00
Dhruv Nair	1886198ea1	update	2026-02-06 12:00:14 +01:00
Dhruv Nair	f12a9fdb76	update	2026-02-06 11:38:54 +01:00
Dhruv Nair	42cd24c572	update	2026-02-06 10:31:25 +01:00
Dhruv Nair	e8a3ef8a52	update	2026-02-04 13:56:50 +01:00