Mirror of https://github.com/huggingface/diffusers.git
Synced 2026-02-08 11:55:18 +08:00
Compare commits
1 commit
video-proc...fix-module
| Author | SHA1 | Date |
|---|---|---|
| | a1804cfa80 | |
```diff
@@ -125,9 +125,9 @@ class BriaFiboAttnProcessor:
             encoder_hidden_states, hidden_states = hidden_states.split_with_sizes(
                 [encoder_hidden_states.shape[1], hidden_states.shape[1] - encoder_hidden_states.shape[1]], dim=1
             )
-            hidden_states = attn.to_out[0](hidden_states.contiguous())
+            hidden_states = attn.to_out[0](hidden_states)
             hidden_states = attn.to_out[1](hidden_states)
-            encoder_hidden_states = attn.to_add_out(encoder_hidden_states.contiguous())
+            encoder_hidden_states = attn.to_add_out(encoder_hidden_states)

             return hidden_states, encoder_hidden_states
         else:
```
```diff
@@ -130,9 +130,9 @@ class FluxAttnProcessor:
             encoder_hidden_states, hidden_states = hidden_states.split_with_sizes(
                 [encoder_hidden_states.shape[1], hidden_states.shape[1] - encoder_hidden_states.shape[1]], dim=1
             )
-            hidden_states = attn.to_out[0](hidden_states.contiguous())
+            hidden_states = attn.to_out[0](hidden_states)
             hidden_states = attn.to_out[1](hidden_states)
-            encoder_hidden_states = attn.to_add_out(encoder_hidden_states.contiguous())
+            encoder_hidden_states = attn.to_add_out(encoder_hidden_states)

             return hidden_states, encoder_hidden_states
         else:
```
```diff
@@ -561,11 +561,11 @@ class QwenDoubleStreamAttnProcessor2_0:
         img_attn_output = joint_hidden_states[:, seq_txt:, :]  # Image part

         # Apply output projections
-        img_attn_output = attn.to_out[0](img_attn_output.contiguous())
+        img_attn_output = attn.to_out[0](img_attn_output)
         if len(attn.to_out) > 1:
             img_attn_output = attn.to_out[1](img_attn_output)  # dropout

-        txt_attn_output = attn.to_add_out(txt_attn_output.contiguous())
+        txt_attn_output = attn.to_add_out(txt_attn_output)

         return img_attn_output, txt_attn_output
```
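The dropped `.contiguous()` calls in these three processors were defensive copies in front of the output projections: the chunks returned by `split_with_sizes` (or by slicing) along `dim=1` are non-contiguous views, but `torch.nn.Linear` accepts such views, so the explicit copy only added memory traffic. A minimal standalone sketch of that behavior (shapes are illustrative, not taken from this diff):

```python
import torch

x = torch.randn(2, 10, 8)  # (batch, sequence, hidden)
txt, img = x.split_with_sizes([3, 7], dim=1)
print(txt.is_contiguous(), img.is_contiguous())  # False False: both are views into x

proj = torch.nn.Linear(8, 8)
# Linear handles the non-contiguous view; an explicit copy gives the same result.
assert torch.allclose(proj(img), proj(img.contiguous()))
```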
```diff
@@ -16,7 +16,7 @@ import warnings
 from typing import List, Optional, Tuple, Union

 import numpy as np
-import PIL.Image
+import PIL
 import torch
 import torch.nn.functional as F

```
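One caveat worth flagging on the `import PIL.Image` → `import PIL` change: importing the top-level `PIL` package does not itself import the `PIL.Image` submodule, so later references to `PIL.Image.Image` (as in `postprocess_video`'s return annotation) only work if some other imported module has already loaded `PIL.Image`. A quick illustration:

```python
import PIL.Image  # binds the submodule explicitly; a bare `import PIL` does not

img = PIL.Image.new("RGB", (8, 8))
print(isinstance(img, PIL.Image.Image))  # True
```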
```diff
@@ -26,11 +26,9 @@ from .image_processor import VaeImageProcessor, is_valid_image, is_valid_image_imagelist
 class VideoProcessor(VaeImageProcessor):
     r"""Simple video processor."""

-    def preprocess_video(
-        self, video, height: Optional[int] = None, width: Optional[int] = None, **kwargs
-    ) -> torch.Tensor:
+    def preprocess_video(self, video, height: Optional[int] = None, width: Optional[int] = None) -> torch.Tensor:
         r"""
-        Preprocesses input video(s). Keyword arguments will be forwarded to `VaeImageProcessor.preprocess`.
+        Preprocesses input video(s).

         Args:
             video (`List[PIL.Image]`, `List[List[PIL.Image]]`, `torch.Tensor`, `np.array`, `List[torch.Tensor]`, `List[np.array]`):
```
```diff
@@ -52,10 +50,6 @@ class VideoProcessor(VaeImageProcessor):
             width (`int`, *optional*, defaults to `None`):
                 The width in preprocessed frames of the video. If `None`, will use `get_default_height_width()` to get
                 the default width.
-
-        Returns:
-            `torch.Tensor` of shape `(batch_size, num_channels, num_frames, height, width)`:
-                A 5D tensor holding the batched channels-first video(s).
         """
         if isinstance(video, list) and isinstance(video[0], np.ndarray) and video[0].ndim == 5:
             warnings.warn(
```
```diff
@@ -73,47 +67,20 @@ class VideoProcessor(VaeImageProcessor):
             video = torch.cat(video, axis=0)

         # ensure the input is a list of videos:
-        # - if it is a batched array of videos (5d torch.Tensor or np.ndarray), it is converted to a list of video
-        #   arrays (a list of 4d torch.Tensor or np.ndarray). `VaeImageProcessor.preprocess` will then treat the first
-        #   (frame) dim as a batch dim.
-        # - if it is a single video, it is converted to a list of one video. (A single video is a list of images or a
-        #   single imagelist.)
-        # - if it is a list of imagelists, it will be kept as is (already a list of videos).
-        # - if it is a single image, it is expanded to a single frame video and then to a list of one video. The
-        #   expansion will depend on the image type:
-        #   - PIL.Image.Image --> one element list of PIL.Image.Image
-        #   - 3D np.ndarray --> interpret as (H, W, C), expand to (F=1, H, W, C)
-        #   - 3D torch.Tensor --> interpret as (C, H, W), expand to (F=1, C, H, W)
+        # - if it is a batch of videos (5d torch.Tensor or np.ndarray), it is converted to a list of videos (a list of 4d torch.Tensor or np.ndarray)
+        # - if it is a single video, it is converted to a list of one video.
         if isinstance(video, (np.ndarray, torch.Tensor)) and video.ndim == 5:
             video = list(video)
         elif isinstance(video, list) and is_valid_image(video[0]) or is_valid_image_imagelist(video):
             video = [video]
         elif isinstance(video, list) and is_valid_image_imagelist(video[0]):
             video = video
-        elif is_valid_image(video):
-            if isinstance(video, PIL.Image.Image):
-                video = [video]
-            elif isinstance(video, np.ndarray):
-                if video.ndim == 2:
-                    video = np.expand_dims(video, axis=-1)  # Unsqueeze channel dim in last axis
-                if video.ndim == 3:
-                    video = np.expand_dims(video, axis=0)
-                else:
-                    raise ValueError(f"Input numpy.ndarray is expected to have 2 or 3 dims but got {video.ndim} dims")
-            elif isinstance(video, torch.Tensor):
-                if video.ndim == 2:
-                    video = torch.unsqueeze(video, dim=0)  # Unsqueeze channel dim in first dim
-                if video.ndim == 3:
-                    video = torch.unsqueeze(video, dim=0)
-                else:
-                    raise ValueError(f"Input torch.Tensor is expected to have 2 or 3 dims but got {video.ndim} dims")
-            video = [video]
         else:
             raise ValueError(
                 "Input is in incorrect format. Currently, we only support numpy.ndarray, torch.Tensor, PIL.Image.Image"
             )

-        video = torch.stack([self.preprocess(img, height=height, width=width, **kwargs) for img in video], dim=0)
+        video = torch.stack([self.preprocess(img, height=height, width=width) for img in video], dim=0)

         # move the number of channels before the number of frames.
         video = video.permute(0, 2, 1, 3, 4)
```
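For context, a minimal sketch of calling the narrowed `preprocess_video` (shapes and values are illustrative; after this change, extra `VaeImageProcessor.preprocess` options can no longer be forwarded through `**kwargs`):

```python
import numpy as np
from diffusers.video_processor import VideoProcessor

processor = VideoProcessor()

# One 16-frame RGB video as a 5D array: (batch, frames, height, width, channels).
video = np.random.rand(1, 16, 256, 256, 3).astype(np.float32)

# Returns a 5D tensor with channels moved before frames:
# (batch, channels, frames, height, width).
tensor = processor.preprocess_video(video, height=128, width=128)
print(tensor.shape)  # torch.Size([1, 3, 16, 128, 128])
```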
```diff
@@ -121,11 +88,10 @@ class VideoProcessor(VaeImageProcessor):
         return video

     def postprocess_video(
-        self, video: torch.Tensor, output_type: str = "np", **kwargs
+        self, video: torch.Tensor, output_type: str = "np"
     ) -> Union[np.ndarray, torch.Tensor, List[PIL.Image.Image]]:
         r"""
-        Converts a video tensor to a list of frames for export. Keyword arguments will be forwarded to
-        `VaeImageProcessor.postprocess`.
+        Converts a video tensor to a list of frames for export.

         Args:
             video (`torch.Tensor`): The video as a tensor.
```
```diff
@@ -135,7 +101,7 @@ class VideoProcessor(VaeImageProcessor):
         outputs = []
         for batch_idx in range(batch_size):
             batch_vid = video[batch_idx].permute(1, 0, 2, 3)
-            batch_output = self.postprocess(batch_vid, output_type, **kwargs)
+            batch_output = self.postprocess(batch_vid, output_type)
             outputs.append(batch_output)

         if output_type == "np":
```
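And the matching export path, assuming the `processor` and `tensor` from the sketch above:

```python
# Back to numpy frames: (batch, frames, height, width, channels).
frames = processor.postprocess_video(tensor, output_type="np")
print(frames.shape)  # (1, 16, 128, 128, 3)

# Or one list of PIL images per video in the batch.
pil_videos = processor.postprocess_video(tensor, output_type="pil")
```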
```diff
@@ -21,11 +21,8 @@ import torch
 from diffusers import BitsAndBytesConfig, GGUFQuantizationConfig, NVIDIAModelOptConfig, QuantoConfig, TorchAoConfig
 from diffusers.utils.import_utils import (
     is_bitsandbytes_available,
-    is_gguf_available,
     is_nvidia_modelopt_available,
     is_optimum_quanto_available,
-    is_torchao_available,
-    is_torchao_version,
 )

 from ...testing_utils import (
```
```diff
@@ -59,13 +56,6 @@ if is_bitsandbytes_available():
 if is_optimum_quanto_available():
     from optimum.quanto import QLinear

-if is_gguf_available():
-    pass
-
-if is_torchao_available():
-    if is_torchao_version(">=", "0.9.0"):
-        pass
-

 class LoRALayer(torch.nn.Module):
     """Wraps a linear layer with LoRA-like adapter - Used for testing purposes only.
```
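The deleted `if is_gguf_available(): pass` and torchao blocks were empty leftovers from an earlier import cleanup; the guard pattern itself remains how these tests import optional backends. A hedged sketch of that pattern (the guarded `gguf` import is illustrative, not part of this diff):

```python
from diffusers.utils.import_utils import is_gguf_available

if is_gguf_available():
    # Import the optional backend only when it is installed, so simply
    # collecting this test module never fails on a missing dependency.
    import gguf  # noqa: F401
```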
```diff
@@ -318,6 +318,10 @@ class TestFluxSingleFile(FluxTransformerTesterConfig, SingleFileTesterMixin):
 class TestFluxTransformerBitsAndBytes(FluxTransformerTesterConfig, BitsAndBytesTesterMixin):
     """BitsAndBytes quantization tests for Flux Transformer."""

+    @property
+    def modules_to_not_convert_for_test(self):
+        return ["transformer_blocks.0"]
+

 class TestFluxTransformerQuanto(FluxTransformerTesterConfig, QuantoTesterMixin):
     """Quanto quantization tests for Flux Transformer."""
```
```diff
@@ -330,10 +334,18 @@ class TestFluxTransformerQuanto(FluxTransformerTesterConfig, QuantoTesterMixin):
     def pretrained_model_kwargs(self):
         return {}

+    @property
+    def modules_to_not_convert_for_test(self):
+        return ["transformer_blocks.0"]
+

 class TestFluxTransformerTorchAo(FluxTransformerTesterConfig, TorchAoTesterMixin):
     """TorchAO quantization tests for Flux Transformer."""

+    @property
+    def modules_to_not_convert_for_test(self):
+        return ["transformer_blocks.0"]
+

 class TestFluxTransformerGGUF(FluxTransformerTesterConfig, GGUFTesterMixin):
     @property
```
```diff
@@ -402,6 +414,10 @@ class TestFluxTransformerGGUFCompile(FluxTransformerTesterConfig, GGUFCompileTesterMixin):
 class TestFluxTransformerModelOpt(FluxTransformerTesterConfig, ModelOptTesterMixin):
     """ModelOpt quantization tests for Flux Transformer."""

+    @property
+    def modules_to_not_convert_for_test(self):
+        return ["transformer_blocks.0"]
+

 class TestFluxTransformerModelOptCompile(FluxTransformerTesterConfig, ModelOptCompileTesterMixin):
     """ModelOpt + compile tests for Flux Transformer."""
```
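Each new `modules_to_not_convert_for_test` property presumably feeds the corresponding mixin's quantization config so that `transformer_blocks.0` stays in full precision. A hedged sketch of what that setting looks like at the user level with bitsandbytes (the 4-bit options are illustrative, not taken from this PR):

```python
import torch
from diffusers import BitsAndBytesConfig, FluxTransformer2DModel

quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    # Keep the first block unquantized, mirroring the test property above.
    llm_int8_skip_modules=["transformer_blocks.0"],
)

transformer = FluxTransformer2DModel.from_pretrained(
    "black-forest-labs/FLUX.1-dev",
    subfolder="transformer",
    quantization_config=quant_config,
    torch_dtype=torch.bfloat16,
)
```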