Compare commits

...

7 Commits

Author / SHA1 / Message / Date

Sayak Paul
c30acad259 Merge branch 'main' into video-processor-accept-imagelike-inputs (2026-02-05 10:46:04 +05:30)

Sayak Paul
a3dcd9882f [core] make qwen hidden states contiguous to make torchao happy. (#13081) (2026-02-05 09:02:32 +05:30)

Sayak Paul
9fe0a9cac4 [core] make flux hidden states contiguous (#13068) (2026-02-05 08:39:44 +05:30)
* make flux hidden states contiguous
* make fix-copies

Daniel Gu
4ebcdb6ecb Allow VideoProcessor.preprocess_video to accept single-image inputs (2026-02-05 03:39:06 +01:00)

Daniel Gu
8a913577d9 Forward kwargs from preprocess/postprocess_video to preprocess/postprocess (2026-02-05 02:52:24 +01:00)

David El Malih
03af690b60 docs: improve docstring scheduling_dpmsolver_multistep_inverse.py (#13083) (2026-02-04 09:21:57 -08:00)

Sayak Paul
90818e82b3 [docs] Fix syntax error in quantization configuration (#13076) (2026-02-04 08:31:03 -08:00)
13 changed files with 207 additions and 64 deletions

View File

@@ -66,7 +66,7 @@ from diffusers import DiffusionPipeline, PipelineQuantizationConfig, TorchAoConf
from torchao.quantization import Int4WeightOnlyConfig
pipeline_quant_config = PipelineQuantizationConfig(
quant_mapping={"transformer": TorchAoConfig(Int4WeightOnlyConfig(group_size=128)))}
quant_mapping={"transformer": TorchAoConfig(Int4WeightOnlyConfig(group_size=128))}
)
pipeline = DiffusionPipeline.from_pretrained(
"black-forest-labs/FLUX.1-dev",

View File

@@ -125,9 +125,9 @@ class BriaFiboAttnProcessor:
encoder_hidden_states, hidden_states = hidden_states.split_with_sizes(
[encoder_hidden_states.shape[1], hidden_states.shape[1] - encoder_hidden_states.shape[1]], dim=1
)
-hidden_states = attn.to_out[0](hidden_states)
+hidden_states = attn.to_out[0](hidden_states.contiguous())
hidden_states = attn.to_out[1](hidden_states)
-encoder_hidden_states = attn.to_add_out(encoder_hidden_states)
+encoder_hidden_states = attn.to_add_out(encoder_hidden_states.contiguous())
return hidden_states, encoder_hidden_states
else:

View File

@@ -130,9 +130,9 @@ class FluxAttnProcessor:
encoder_hidden_states, hidden_states = hidden_states.split_with_sizes(
[encoder_hidden_states.shape[1], hidden_states.shape[1] - encoder_hidden_states.shape[1]], dim=1
)
-hidden_states = attn.to_out[0](hidden_states)
+hidden_states = attn.to_out[0](hidden_states.contiguous())
hidden_states = attn.to_out[1](hidden_states)
-encoder_hidden_states = attn.to_add_out(encoder_hidden_states)
+encoder_hidden_states = attn.to_add_out(encoder_hidden_states.contiguous())
return hidden_states, encoder_hidden_states
else:

View File

@@ -561,11 +561,11 @@ class QwenDoubleStreamAttnProcessor2_0:
img_attn_output = joint_hidden_states[:, seq_txt:, :] # Image part
# Apply output projections
-img_attn_output = attn.to_out[0](img_attn_output)
+img_attn_output = attn.to_out[0](img_attn_output.contiguous())
if len(attn.to_out) > 1:
img_attn_output = attn.to_out[1](img_attn_output) # dropout
-txt_attn_output = attn.to_add_out(txt_attn_output)
+txt_attn_output = attn.to_add_out(txt_attn_output.contiguous())
return img_attn_output, txt_attn_output
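Why the `.contiguous()` calls help: these processors split a fused tensor along the sequence dim, and `split_with_sizes`/slicing return non-contiguous views, which some torchao quantized-linear kernels reject. A minimal sketch of the issue (toy shapes, plain `nn.Linear` standing in for the quantized projection):

import torch

x = torch.randn(2, 16, 8)                      # (batch, seq, dim)
txt, img = x.split_with_sizes([4, 12], dim=1)  # views into x, not copies
print(img.is_contiguous())                     # False: slicing dim=1 leaves a strided view
proj = torch.nn.Linear(8, 8)                   # stand-in for attn.to_out[0]
out = proj(img.contiguous())                   # .contiguous() materializes a dense copy first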

View File

@@ -545,7 +545,9 @@ class CosineDPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin):
# Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.index_for_timestep
def index_for_timestep(
-    self, timestep: Union[int, torch.Tensor], schedule_timesteps: Optional[torch.Tensor] = None
+    self,
+    timestep: Union[int, torch.Tensor],
+    schedule_timesteps: Optional[torch.Tensor] = None,
) -> int:
"""
Find the index for a given timestep in the schedule.

View File

@@ -867,7 +867,9 @@ class DEISMultistepScheduler(SchedulerMixin, ConfigMixin):
# Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.index_for_timestep
def index_for_timestep(
-    self, timestep: Union[int, torch.Tensor], schedule_timesteps: Optional[torch.Tensor] = None
+    self,
+    timestep: Union[int, torch.Tensor],
+    schedule_timesteps: Optional[torch.Tensor] = None,
) -> int:
"""
Find the index for a given timestep in the schedule.

View File

@@ -245,13 +245,26 @@ class DPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin):
):
if self.config.use_beta_sigmas and not is_scipy_available():
raise ImportError("Make sure to install scipy if you want to use beta sigmas.")
-if sum([self.config.use_beta_sigmas, self.config.use_exponential_sigmas, self.config.use_karras_sigmas]) > 1:
+if (
+    sum(
+        [
+            self.config.use_beta_sigmas,
+            self.config.use_exponential_sigmas,
+            self.config.use_karras_sigmas,
+        ]
+    )
+    > 1
+):
raise ValueError(
"Only one of `config.use_beta_sigmas`, `config.use_exponential_sigmas`, `config.use_karras_sigmas` can be used."
)
if algorithm_type in ["dpmsolver", "sde-dpmsolver"]:
deprecation_message = f"algorithm_type {algorithm_type} is deprecated and will be removed in a future version. Choose from `dpmsolver++` or `sde-dpmsolver++` instead"
deprecate("algorithm_types dpmsolver and sde-dpmsolver", "1.0.0", deprecation_message)
deprecate(
"algorithm_types dpmsolver and sde-dpmsolver",
"1.0.0",
deprecation_message,
)
if trained_betas is not None:
self.betas = torch.tensor(trained_betas, dtype=torch.float32)
@@ -259,7 +272,15 @@ class DPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin):
self.betas = torch.linspace(beta_start, beta_end, num_train_timesteps, dtype=torch.float32)
elif beta_schedule == "scaled_linear":
# this schedule is very specific to the latent diffusion model.
-self.betas = torch.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=torch.float32) ** 2
+self.betas = (
+    torch.linspace(
+        beta_start**0.5,
+        beta_end**0.5,
+        num_train_timesteps,
+        dtype=torch.float32,
+    )
+    ** 2
+)
elif beta_schedule == "squaredcos_cap_v2":
# Glide cosine schedule
self.betas = betas_for_alpha_bar(num_train_timesteps)
@@ -287,7 +308,12 @@ class DPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin):
self.init_noise_sigma = 1.0
# settings for DPM-Solver
if algorithm_type not in ["dpmsolver", "dpmsolver++", "sde-dpmsolver", "sde-dpmsolver++"]:
if algorithm_type not in [
"dpmsolver",
"dpmsolver++",
"sde-dpmsolver",
"sde-dpmsolver++",
]:
if algorithm_type == "deis":
self.register_to_config(algorithm_type="dpmsolver++")
else:
@@ -724,7 +750,7 @@ class DPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin):
self,
model_output: torch.Tensor,
*args,
-sample: torch.Tensor = None,
+sample: Optional[torch.Tensor] = None,
**kwargs,
) -> torch.Tensor:
"""
@@ -738,7 +764,7 @@ class DPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin):
Args:
model_output (`torch.Tensor`):
The direct output from the learned diffusion model.
-sample (`torch.Tensor`):
+sample (`torch.Tensor`, *optional*):
A current instance of a sample created by the diffusion process.
Returns:
@@ -822,7 +848,7 @@ class DPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin):
self,
model_output: torch.Tensor,
*args,
-sample: torch.Tensor = None,
+sample: Optional[torch.Tensor] = None,
noise: Optional[torch.Tensor] = None,
**kwargs,
) -> torch.Tensor:
@@ -832,8 +858,10 @@ class DPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin):
Args:
model_output (`torch.Tensor`):
The direct output from the learned diffusion model.
-sample (`torch.Tensor`):
+sample (`torch.Tensor`, *optional*):
A current instance of a sample created by the diffusion process.
+noise (`torch.Tensor`, *optional*):
+    The noise tensor.
Returns:
`torch.Tensor`:
@@ -860,7 +888,10 @@ class DPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin):
"Passing `prev_timestep` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`",
)
-sigma_t, sigma_s = self.sigmas[self.step_index + 1], self.sigmas[self.step_index]
+sigma_t, sigma_s = (
+    self.sigmas[self.step_index + 1],
+    self.sigmas[self.step_index],
+)
alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma_t)
alpha_s, sigma_s = self._sigma_to_alpha_sigma_t(sigma_s)
lambda_t = torch.log(alpha_t) - torch.log(sigma_t)
@@ -891,7 +922,7 @@ class DPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin):
self,
model_output_list: List[torch.Tensor],
*args,
-sample: torch.Tensor = None,
+sample: Optional[torch.Tensor] = None,
noise: Optional[torch.Tensor] = None,
**kwargs,
) -> torch.Tensor:
@@ -901,7 +932,7 @@ class DPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin):
Args:
model_output_list (`List[torch.Tensor]`):
The direct outputs from learned diffusion model at current and latter timesteps.
-sample (`torch.Tensor`):
+sample (`torch.Tensor`, *optional*):
A current instance of a sample created by the diffusion process.
Returns:
@@ -1014,7 +1045,7 @@ class DPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin):
self,
model_output_list: List[torch.Tensor],
*args,
-sample: torch.Tensor = None,
+sample: Optional[torch.Tensor] = None,
noise: Optional[torch.Tensor] = None,
**kwargs,
) -> torch.Tensor:
@@ -1024,8 +1055,10 @@ class DPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin):
Args:
model_output_list (`List[torch.Tensor]`):
The direct outputs from learned diffusion model at current and latter timesteps.
-sample (`torch.Tensor`):
+sample (`torch.Tensor`, *optional*):
A current instance of a sample created by diffusion process.
+noise (`torch.Tensor`, *optional*):
+    The noise tensor.
Returns:
`torch.Tensor`:
@@ -1106,7 +1139,9 @@ class DPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin):
return x_t
def index_for_timestep(
-    self, timestep: Union[int, torch.Tensor], schedule_timesteps: Optional[torch.Tensor] = None
+    self,
+    timestep: Union[int, torch.Tensor],
+    schedule_timesteps: Optional[torch.Tensor] = None,
) -> int:
"""
Find the index for a given timestep in the schedule.
@@ -1216,7 +1251,10 @@ class DPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin):
sample = sample.to(torch.float32)
if self.config.algorithm_type in ["sde-dpmsolver", "sde-dpmsolver++"] and variance_noise is None:
noise = randn_tensor(
-    model_output.shape, generator=generator, device=model_output.device, dtype=torch.float32
+    model_output.shape,
+    generator=generator,
+    device=model_output.device,
+    dtype=torch.float32,
)
elif self.config.algorithm_type in ["sde-dpmsolver", "sde-dpmsolver++"]:
noise = variance_noise.to(device=model_output.device, dtype=torch.float32)
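Behaviorally, the reformatted mutual-exclusion check above is unchanged; a quick sketch of what it guards against (the flag combination is chosen only to trigger the error):

from diffusers import DPMSolverMultistepScheduler

try:
    # Only one of the beta/exponential/karras sigma schedules may be enabled at a time.
    DPMSolverMultistepScheduler(use_karras_sigmas=True, use_exponential_sigmas=True)
except ValueError as err:
    print(err)  # "Only one of `config.use_beta_sigmas`, `config.use_exponential_sigmas`, ..."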

View File

@@ -141,6 +141,10 @@ class DPMSolverMultistepInverseScheduler(SchedulerMixin, ConfigMixin):
use_beta_sigmas (`bool`, *optional*, defaults to `False`):
Whether to use beta sigmas for step sizes in the noise schedule during the sampling process. Refer to [Beta
Sampling is All You Need](https://huggingface.co/papers/2407.12173) for more information.
+use_flow_sigmas (`bool`, *optional*, defaults to `False`):
+    Whether to use flow sigmas for step sizes in the noise schedule during the sampling process.
+flow_shift (`float`, *optional*, defaults to 1.0):
+    The flow shift factor. Valid only when `use_flow_sigmas=True`.
lambda_min_clipped (`float`, defaults to `-inf`):
Clipping threshold for the minimum value of `lambda(t)` for numerical stability. This is critical for the
cosine (`squaredcos_cap_v2`) noise schedule.
@@ -163,15 +167,15 @@ class DPMSolverMultistepInverseScheduler(SchedulerMixin, ConfigMixin):
num_train_timesteps: int = 1000,
beta_start: float = 0.0001,
beta_end: float = 0.02,
-beta_schedule: str = "linear",
+beta_schedule: Literal["linear", "scaled_linear", "squaredcos_cap_v2"] = "linear",
trained_betas: Optional[Union[np.ndarray, List[float]]] = None,
solver_order: int = 2,
-prediction_type: str = "epsilon",
+prediction_type: Literal["epsilon", "sample", "v_prediction", "flow_prediction"] = "epsilon",
thresholding: bool = False,
dynamic_thresholding_ratio: float = 0.995,
sample_max_value: float = 1.0,
-algorithm_type: str = "dpmsolver++",
-solver_type: str = "midpoint",
+algorithm_type: Literal["dpmsolver", "dpmsolver++", "sde-dpmsolver", "sde-dpmsolver++"] = "dpmsolver++",
+solver_type: Literal["midpoint", "heun"] = "midpoint",
lower_order_final: bool = True,
euler_at_final: bool = False,
use_karras_sigmas: Optional[bool] = False,
@@ -180,19 +184,32 @@ class DPMSolverMultistepInverseScheduler(SchedulerMixin, ConfigMixin):
use_flow_sigmas: Optional[bool] = False,
flow_shift: Optional[float] = 1.0,
lambda_min_clipped: float = -float("inf"),
-variance_type: Optional[str] = None,
-timestep_spacing: str = "linspace",
+variance_type: Optional[Literal["learned", "learned_range"]] = None,
+timestep_spacing: Literal["linspace", "leading", "trailing"] = "linspace",
steps_offset: int = 0,
):
if self.config.use_beta_sigmas and not is_scipy_available():
raise ImportError("Make sure to install scipy if you want to use beta sigmas.")
-if sum([self.config.use_beta_sigmas, self.config.use_exponential_sigmas, self.config.use_karras_sigmas]) > 1:
+if (
+    sum(
+        [
+            self.config.use_beta_sigmas,
+            self.config.use_exponential_sigmas,
+            self.config.use_karras_sigmas,
+        ]
+    )
+    > 1
+):
raise ValueError(
"Only one of `config.use_beta_sigmas`, `config.use_exponential_sigmas`, `config.use_karras_sigmas` can be used."
)
if algorithm_type in ["dpmsolver", "sde-dpmsolver"]:
deprecation_message = f"algorithm_type {algorithm_type} is deprecated and will be removed in a future version. Choose from `dpmsolver++` or `sde-dpmsolver++` instead"
deprecate("algorithm_types dpmsolver and sde-dpmsolver", "1.0.0", deprecation_message)
deprecate(
"algorithm_types dpmsolver and sde-dpmsolver",
"1.0.0",
deprecation_message,
)
if trained_betas is not None:
self.betas = torch.tensor(trained_betas, dtype=torch.float32)
@@ -200,7 +217,15 @@ class DPMSolverMultistepInverseScheduler(SchedulerMixin, ConfigMixin):
self.betas = torch.linspace(beta_start, beta_end, num_train_timesteps, dtype=torch.float32)
elif beta_schedule == "scaled_linear":
# this schedule is very specific to the latent diffusion model.
-self.betas = torch.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=torch.float32) ** 2
+self.betas = (
+    torch.linspace(
+        beta_start**0.5,
+        beta_end**0.5,
+        num_train_timesteps,
+        dtype=torch.float32,
+    )
+    ** 2
+)
elif beta_schedule == "squaredcos_cap_v2":
# Glide cosine schedule
self.betas = betas_for_alpha_bar(num_train_timesteps)
@@ -219,7 +244,12 @@ class DPMSolverMultistepInverseScheduler(SchedulerMixin, ConfigMixin):
self.init_noise_sigma = 1.0
# settings for DPM-Solver
if algorithm_type not in ["dpmsolver", "dpmsolver++", "sde-dpmsolver", "sde-dpmsolver++"]:
if algorithm_type not in [
"dpmsolver",
"dpmsolver++",
"sde-dpmsolver",
"sde-dpmsolver++",
]:
if algorithm_type == "deis":
self.register_to_config(algorithm_type="dpmsolver++")
else:
@@ -250,7 +280,11 @@ class DPMSolverMultistepInverseScheduler(SchedulerMixin, ConfigMixin):
"""
return self._step_index
-def set_timesteps(self, num_inference_steps: int = None, device: Union[str, torch.device] = None):
+def set_timesteps(
+    self,
+    num_inference_steps: Optional[int] = None,
+    device: Optional[Union[str, torch.device]] = None,
+):
"""
Sets the discrete timesteps used for the diffusion chain (to be run before inference).
@@ -382,7 +416,7 @@ class DPMSolverMultistepInverseScheduler(SchedulerMixin, ConfigMixin):
return sample
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._sigma_to_t
-def _sigma_to_t(self, sigma, log_sigmas):
+def _sigma_to_t(self, sigma: np.ndarray, log_sigmas: np.ndarray) -> np.ndarray:
"""
Convert sigma values to corresponding timestep values through interpolation.
@@ -419,7 +453,7 @@ class DPMSolverMultistepInverseScheduler(SchedulerMixin, ConfigMixin):
return t
# Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler._sigma_to_alpha_sigma_t
-def _sigma_to_alpha_sigma_t(self, sigma):
+def _sigma_to_alpha_sigma_t(self, sigma: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
"""
Convert sigma values to alpha_t and sigma_t values.
@@ -441,7 +475,7 @@ class DPMSolverMultistepInverseScheduler(SchedulerMixin, ConfigMixin):
return alpha_t, sigma_t
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_karras
-def _convert_to_karras(self, in_sigmas: torch.Tensor, num_inference_steps) -> torch.Tensor:
+def _convert_to_karras(self, in_sigmas: torch.Tensor, num_inference_steps: int) -> torch.Tensor:
"""
Construct the noise schedule as proposed in [Elucidating the Design Space of Diffusion-Based Generative
Models](https://huggingface.co/papers/2206.00364).
@@ -567,7 +601,7 @@ class DPMSolverMultistepInverseScheduler(SchedulerMixin, ConfigMixin):
self,
model_output: torch.Tensor,
*args,
-sample: torch.Tensor = None,
+sample: Optional[torch.Tensor] = None,
**kwargs,
) -> torch.Tensor:
"""
@@ -581,7 +615,7 @@ class DPMSolverMultistepInverseScheduler(SchedulerMixin, ConfigMixin):
Args:
model_output (`torch.Tensor`):
The direct output from the learned diffusion model.
-sample (`torch.Tensor`):
+sample (`torch.Tensor`, *optional*):
A current instance of a sample created by the diffusion process.
Returns:
@@ -666,7 +700,7 @@ class DPMSolverMultistepInverseScheduler(SchedulerMixin, ConfigMixin):
self,
model_output: torch.Tensor,
*args,
-sample: torch.Tensor = None,
+sample: Optional[torch.Tensor] = None,
noise: Optional[torch.Tensor] = None,
**kwargs,
) -> torch.Tensor:
@@ -676,8 +710,10 @@ class DPMSolverMultistepInverseScheduler(SchedulerMixin, ConfigMixin):
Args:
model_output (`torch.Tensor`):
The direct output from the learned diffusion model.
-sample (`torch.Tensor`):
+sample (`torch.Tensor`, *optional*):
A current instance of a sample created by the diffusion process.
+noise (`torch.Tensor`, *optional*):
+    The noise tensor.
Returns:
`torch.Tensor`:
@@ -704,7 +740,10 @@ class DPMSolverMultistepInverseScheduler(SchedulerMixin, ConfigMixin):
"Passing `prev_timestep` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`",
)
-sigma_t, sigma_s = self.sigmas[self.step_index + 1], self.sigmas[self.step_index]
+sigma_t, sigma_s = (
+    self.sigmas[self.step_index + 1],
+    self.sigmas[self.step_index],
+)
alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma_t)
alpha_s, sigma_s = self._sigma_to_alpha_sigma_t(sigma_s)
lambda_t = torch.log(alpha_t) - torch.log(sigma_t)
@@ -736,7 +775,7 @@ class DPMSolverMultistepInverseScheduler(SchedulerMixin, ConfigMixin):
self,
model_output_list: List[torch.Tensor],
*args,
-sample: torch.Tensor = None,
+sample: Optional[torch.Tensor] = None,
noise: Optional[torch.Tensor] = None,
**kwargs,
) -> torch.Tensor:
@@ -746,7 +785,7 @@ class DPMSolverMultistepInverseScheduler(SchedulerMixin, ConfigMixin):
Args:
model_output_list (`List[torch.Tensor]`):
The direct outputs from learned diffusion model at current and latter timesteps.
-sample (`torch.Tensor`):
+sample (`torch.Tensor`, *optional*):
A current instance of a sample created by the diffusion process.
Returns:
@@ -860,7 +899,7 @@ class DPMSolverMultistepInverseScheduler(SchedulerMixin, ConfigMixin):
self,
model_output_list: List[torch.Tensor],
*args,
-sample: torch.Tensor = None,
+sample: Optional[torch.Tensor] = None,
noise: Optional[torch.Tensor] = None,
**kwargs,
) -> torch.Tensor:
@@ -870,8 +909,10 @@ class DPMSolverMultistepInverseScheduler(SchedulerMixin, ConfigMixin):
Args:
model_output_list (`List[torch.Tensor]`):
The direct outputs from learned diffusion model at current and latter timesteps.
-sample (`torch.Tensor`):
+sample (`torch.Tensor`, *optional*):
A current instance of a sample created by diffusion process.
+noise (`torch.Tensor`, *optional*):
+    The noise tensor.
Returns:
`torch.Tensor`:
@@ -951,7 +992,7 @@ class DPMSolverMultistepInverseScheduler(SchedulerMixin, ConfigMixin):
)
return x_t
-def _init_step_index(self, timestep):
+def _init_step_index(self, timestep: Union[int, torch.Tensor]):
if isinstance(timestep, torch.Tensor):
timestep = timestep.to(self.timesteps.device)
@@ -975,7 +1016,7 @@ class DPMSolverMultistepInverseScheduler(SchedulerMixin, ConfigMixin):
model_output: torch.Tensor,
timestep: Union[int, torch.Tensor],
sample: torch.Tensor,
-generator=None,
+generator: Optional[torch.Generator] = None,
variance_noise: Optional[torch.Tensor] = None,
return_dict: bool = True,
) -> Union[SchedulerOutput, Tuple]:
@@ -1027,7 +1068,10 @@ class DPMSolverMultistepInverseScheduler(SchedulerMixin, ConfigMixin):
if self.config.algorithm_type in ["sde-dpmsolver", "sde-dpmsolver++"] and variance_noise is None:
noise = randn_tensor(
-    model_output.shape, generator=generator, device=model_output.device, dtype=model_output.dtype
+    model_output.shape,
+    generator=generator,
+    device=model_output.device,
+    dtype=model_output.dtype,
)
elif self.config.algorithm_type in ["sde-dpmsolver", "sde-dpmsolver++"]:
noise = variance_noise
@@ -1074,6 +1118,21 @@ class DPMSolverMultistepInverseScheduler(SchedulerMixin, ConfigMixin):
noise: torch.Tensor,
timesteps: torch.IntTensor,
) -> torch.Tensor:
"""
Add noise to the clean `original_samples` using the scheduler's equivalent function.
Args:
original_samples (`torch.Tensor`):
The original samples to add noise to.
noise (`torch.Tensor`):
The noise tensor.
timesteps (`torch.IntTensor`):
The timesteps at which to add noise.
Returns:
`torch.Tensor`:
The noisy samples.
"""
# Make sure sigmas and timesteps have the same device and dtype as original_samples
sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype)
if original_samples.device.type == "mps" and torch.is_floating_point(timesteps):
@@ -1103,5 +1162,5 @@ class DPMSolverMultistepInverseScheduler(SchedulerMixin, ConfigMixin):
noisy_samples = alpha_t * original_samples + sigma_t * noise
return noisy_samples
-def __len__(self):
+def __len__(self) -> int:
return self.config.num_train_timesteps
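The `Literal` annotations above only document values the scheduler already accepted; a minimal sketch exercising a few of them (parameter values are illustrative, not recommendations):

from diffusers import DPMSolverMultistepInverseScheduler

scheduler = DPMSolverMultistepInverseScheduler(
    beta_schedule="scaled_linear",
    algorithm_type="dpmsolver++",
    solver_type="midpoint",
    timestep_spacing="linspace",
)
scheduler.set_timesteps(num_inference_steps=25)
print(len(scheduler))  # num_train_timesteps, via the newly annotated __len__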

View File

@@ -1120,7 +1120,9 @@ class DPMSolverSinglestepScheduler(SchedulerMixin, ConfigMixin):
# Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.index_for_timestep
def index_for_timestep(
-    self, timestep: Union[int, torch.Tensor], schedule_timesteps: Optional[torch.Tensor] = None
+    self,
+    timestep: Union[int, torch.Tensor],
+    schedule_timesteps: Optional[torch.Tensor] = None,
) -> int:
"""
Find the index for a given timestep in the schedule.

View File

@@ -662,7 +662,9 @@ class EDMDPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin):
# Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.index_for_timestep
def index_for_timestep(
-    self, timestep: Union[int, torch.Tensor], schedule_timesteps: Optional[torch.Tensor] = None
+    self,
+    timestep: Union[int, torch.Tensor],
+    schedule_timesteps: Optional[torch.Tensor] = None,
) -> int:
"""
Find the index for a given timestep in the schedule.

View File

@@ -1122,7 +1122,9 @@ class SASolverScheduler(SchedulerMixin, ConfigMixin):
# Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.index_for_timestep
def index_for_timestep(
-    self, timestep: Union[int, torch.Tensor], schedule_timesteps: Optional[torch.Tensor] = None
+    self,
+    timestep: Union[int, torch.Tensor],
+    schedule_timesteps: Optional[torch.Tensor] = None,
) -> int:
"""
Find the index for a given timestep in the schedule.

View File

@@ -1083,7 +1083,9 @@ class UniPCMultistepScheduler(SchedulerMixin, ConfigMixin):
# Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.index_for_timestep
def index_for_timestep(
-    self, timestep: Union[int, torch.Tensor], schedule_timesteps: Optional[torch.Tensor] = None
+    self,
+    timestep: Union[int, torch.Tensor],
+    schedule_timesteps: Optional[torch.Tensor] = None,
) -> int:
"""
Find the index for a given timestep in the schedule.

View File

@@ -16,7 +16,7 @@ import warnings
from typing import List, Optional, Tuple, Union
import numpy as np
-import PIL
+import PIL.Image
import torch
import torch.nn.functional as F
@@ -26,9 +26,11 @@ from .image_processor import VaeImageProcessor, is_valid_image, is_valid_image_i
class VideoProcessor(VaeImageProcessor):
r"""Simple video processor."""
-def preprocess_video(self, video, height: Optional[int] = None, width: Optional[int] = None) -> torch.Tensor:
+def preprocess_video(
+    self, video, height: Optional[int] = None, width: Optional[int] = None, **kwargs
+) -> torch.Tensor:
r"""
-Preprocesses input video(s).
+Preprocesses input video(s). Keyword arguments will be forwarded to `VaeImageProcessor.preprocess`.
Args:
video (`List[PIL.Image]`, `List[List[PIL.Image]]`, `torch.Tensor`, `np.array`, `List[torch.Tensor]`, `List[np.array]`):
@@ -50,6 +52,10 @@ class VideoProcessor(VaeImageProcessor):
width (`int`, *optional*, defaults to `None`):
The width in preprocessed frames of the video. If `None`, will use `get_default_height_width()` to get
the default width.
+
+Returns:
+    `torch.Tensor` of shape `(batch_size, num_channels, num_frames, height, width)`:
+        A 5D tensor holding the batched channels-first video(s).
"""
if isinstance(video, list) and isinstance(video[0], np.ndarray) and video[0].ndim == 5:
warnings.warn(
@@ -67,20 +73,47 @@ class VideoProcessor(VaeImageProcessor):
video = torch.cat(video, axis=0)
# ensure the input is a list of videos:
-# - if it is a batch of videos (5d torch.Tensor or np.ndarray), it is converted to a list of videos (a list of 4d torch.Tensor or np.ndarray)
-# - if it is a single video, it is converted to a list of one video.
+# - if it is a batched array of videos (5d torch.Tensor or np.ndarray), it is converted to a list of video
+#   arrays (a list of 4d torch.Tensor or np.ndarray). `VaeImageProcessor.preprocess` will then treat the first
+#   (frame) dim as a batch dim.
+# - if it is a single video, it is converted to a list of one video. (A single video is a list of images or a
+#   single imagelist.)
+# - if it is a list of imagelists, it will be kept as is (already a list of videos).
+# - if it is a single image, it is expanded to a single frame video and then to a list of one video. The
+#   expansion will depend on the image type:
+#   - PIL.Image.Image --> one element list of PIL.Image.Image
+#   - 3D np.ndarray --> interpret as (H, W, C), expand to (F=1, H, W, C)
+#   - 3D torch.Tensor --> interpret as (C, H, W), expand to (F=1, C, H, W)
if isinstance(video, (np.ndarray, torch.Tensor)) and video.ndim == 5:
video = list(video)
elif isinstance(video, list) and is_valid_image(video[0]) or is_valid_image_imagelist(video):
video = [video]
elif isinstance(video, list) and is_valid_image_imagelist(video[0]):
video = video
+elif is_valid_image(video):
+    if isinstance(video, PIL.Image.Image):
+        video = [video]
+    elif isinstance(video, np.ndarray):
+        if video.ndim == 2:
+            video = np.expand_dims(video, axis=-1)  # Unsqueeze channel dim in last axis
+        if video.ndim == 3:
+            video = np.expand_dims(video, axis=0)
+        else:
+            raise ValueError(f"Input numpy.ndarray is expected to have 2 or 3 dims but got {video.ndim} dims")
+    elif isinstance(video, torch.Tensor):
+        if video.ndim == 2:
+            video = torch.unsqueeze(video, dim=0)  # Unsqueeze channel dim in first dim
+        if video.ndim == 3:
+            video = torch.unsqueeze(video, dim=0)
+        else:
+            raise ValueError(f"Input torch.Tensor is expected to have 2 or 3 dims but got {video.ndim} dims")
+    video = [video]
else:
raise ValueError(
"Input is in incorrect format. Currently, we only support numpy.ndarray, torch.Tensor, PIL.Image.Image"
)
-video = torch.stack([self.preprocess(img, height=height, width=width) for img in video], dim=0)
+video = torch.stack([self.preprocess(img, height=height, width=width, **kwargs) for img in video], dim=0)
# move the number of channels before the number of frames.
video = video.permute(0, 2, 1, 3, 4)
@@ -88,10 +121,11 @@ class VideoProcessor(VaeImageProcessor):
return video
def postprocess_video(
-    self, video: torch.Tensor, output_type: str = "np"
+    self, video: torch.Tensor, output_type: str = "np", **kwargs
) -> Union[np.ndarray, torch.Tensor, List[PIL.Image.Image]]:
r"""
-Converts a video tensor to a list of frames for export.
+Converts a video tensor to a list of frames for export. Keyword arguments will be forwarded to
+`VaeImageProcessor.postprocess`.
Args:
video (`torch.Tensor`): The video as a tensor.
@@ -101,7 +135,7 @@ class VideoProcessor(VaeImageProcessor):
outputs = []
for batch_idx in range(batch_size):
batch_vid = video[batch_idx].permute(1, 0, 2, 3)
-batch_output = self.postprocess(batch_vid, output_type)
+batch_output = self.postprocess(batch_vid, output_type, **kwargs)
outputs.append(batch_output)
if output_type == "np":
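
Taken together, the two VideoProcessor changes mean a single image is now promoted to a one-frame video and extra keyword arguments flow through to `VaeImageProcessor.preprocess`/`postprocess`. A minimal sketch (`resize_mode` is assumed from the image-processor API, not shown in this diff):

import numpy as np
import PIL.Image
from diffusers.video_processor import VideoProcessor

processor = VideoProcessor()

# A single PIL image becomes a one-frame video.
image = PIL.Image.new("RGB", (64, 64))
video = processor.preprocess_video(image, height=32, width=32)
print(video.shape)  # expected (1, 3, 1, 32, 32): (batch, channels, frames, H, W)

# A 3D array is read as (H, W, C) and expanded to (F=1, H, W, C);
# kwargs such as resize_mode are forwarded to VaeImageProcessor.preprocess.
frame = np.zeros((64, 64, 3), dtype=np.float32)
video = processor.preprocess_video(frame, height=32, width=32, resize_mode="default")
print(video.shape)  # same 5D layout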