Compare commits


2 Commits

Author     SHA1        Message                                          Date
sayakpaul  71bf49250b  12.9                                             2026-01-28 20:51:29 +05:30
sayakpaul  fb10f4889e  uniform run times and wheels for pytorch cuda.   2026-01-28 18:23:47 +05:30
6 changed files with 41 additions and 94 deletions

View File

@@ -1,4 +1,4 @@
FROM nvidia/cuda:12.9.0-runtime-ubuntu20.04
FROM nvidia/cuda:12.9.1-runtime-ubuntu22.04
LABEL maintainer="Hugging Face"
LABEL repository="diffusers"
@@ -36,7 +36,8 @@ ENV PATH="$VIRTUAL_ENV/bin:$PATH"
RUN uv pip install --no-cache-dir \
torch \
torchvision \
torchaudio
torchaudio \
--index-url https://download.pytorch.org/whl/cu129
# Install compatible versions of numba/llvmlite for Python 3.10+
RUN uv pip install --no-cache-dir \

View File

@@ -1,4 +1,4 @@
FROM nvidia/cuda:12.9.0-runtime-ubuntu20.04
FROM nvidia/cuda:12.9.1-runtime-ubuntu22.04
LABEL maintainer="Hugging Face"
LABEL repository="diffusers"
@@ -36,7 +36,8 @@ ENV PATH="$VIRTUAL_ENV/bin:$PATH"
RUN uv pip install --no-cache-dir \
torch \
torchvision \
torchaudio
torchaudio \
--index-url https://download.pytorch.org/whl/cu129
# Install compatible versions of numba/llvmlite for Python 3.10+
RUN uv pip install --no-cache-dir \

View File

@@ -83,6 +83,25 @@ Refer to this [table](https://github.com/huggingface/diffusers/pull/10009#issue-
> [!TIP]
> The FP8 post-training quantization schemes in torchao are effective for GPUs with compute capability of at least 8.9 (RTX-4090, Hopper, etc.). FP8 often provides the best speed, memory, and quality trade-off when generating images and videos. We recommend combining FP8 and torch.compile if your GPU is compatible.
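As a rough, hedged sketch of that combination (not part of this diff; the checkpoint id and the exact FP8 `quant_type` string are illustrative and should be checked against the supported-types table below):
```py
import torch
from diffusers import DiffusionPipeline, FluxTransformer2DModel, TorchAoConfig

# Illustrative FP8 weight-only torchao scheme; verify the exact string against the
# supported quantization types before relying on it.
quant_config = TorchAoConfig("float8wo_e4m3")
transformer = FluxTransformer2DModel.from_pretrained(
    "black-forest-labs/FLUX.1-dev",  # assumed checkpoint, for illustration only
    subfolder="transformer",
    quantization_config=quant_config,
    torch_dtype=torch.bfloat16,
)
pipeline = DiffusionPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-dev",
    transformer=transformer,
    torch_dtype=torch.bfloat16,
).to("cuda")

# Compile the quantized transformer, as the tip above suggests
pipeline.transformer = torch.compile(pipeline.transformer, fullgraph=True)
```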
## autoquant
torchao provides [autoquant](https://docs.pytorch.org/ao/stable/generated/torchao.quantization.autoquant.html#torchao.quantization.autoquant), an automatic quantization API. Autoquantization chooses the best quantization strategy by comparing the performance of each strategy on the chosen input types and shapes. At the moment, Diffusers only supports this for individual models.
```py
import torch
from diffusers import DiffusionPipeline
from torchao.quantization import autoquant
# Load the pipeline
pipeline = DiffusionPipeline.from_pretrained(
"black-forest-labs/FLUX.1-schnell",
torch_dtype=torch.bfloat16,
device_map="cuda"
)
# Swap in the autoquantized transformer so the pipeline actually uses it
pipeline.transformer = autoquant(pipeline.transformer)
```
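Since autoquant picks kernels by benchmarking candidates on the shapes it actually sees, the pipeline should be run at least once so the choice can settle; a minimal hedged follow-up (the prompt and step count are arbitrary):
```py
# Run the pipeline once so autoquant can benchmark candidate kernels on real input shapes
image = pipeline("an astronaut riding a horse on the moon", num_inference_steps=4).images[0]
```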
## Supported quantization types
torchao supports weight-only quantization, as well as combined weight and dynamic-activation quantization, for int8, float3-float8, and uint1-uint7.
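As a hedged illustration (not from this diff), the weight-only and dynamic-activation variants are selected through different `quant_type` strings; `int8wo` and `int8dq` are two commonly documented ones:
```py
from diffusers import TorchAoConfig

# int8 weight-only: only the weights are quantized
weight_only_config = TorchAoConfig("int8wo")

# int8 weights plus dynamic int8 activation quantization
dynamic_activation_config = TorchAoConfig("int8dq")
```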

View File

@@ -623,7 +623,7 @@ class TorchAoConfig(QuantizationConfigMixin):
"""
if is_torchao_available():
# TODO(aryan): Support sparsify
# TODO(aryan): Support autoquant and sparsify
from torchao.quantization import (
float8_dynamic_activation_float8_weight,
float8_static_activation_float8_weight,

View File

@@ -344,6 +344,7 @@ class TorchAoHfQuantizer(DiffusersQuantizer):
from torchao.core.config import AOBaseConfig
quant_type = self.quantization_config.quant_type
# The autoquant case is handled by the string-based implementation below in map_to_target_dtype
if isinstance(quant_type, AOBaseConfig):
# Extract size digit using fuzzy match on the class name
config_name = quant_type.__class__.__name__

View File

@@ -105,6 +105,7 @@ def rescale_zero_terminal_snr(alphas_cumprod):
"""
Rescales betas to have zero terminal SNR, based on https://huggingface.co/papers/2305.08891 (Algorithm 1).
Args:
betas (`torch.Tensor`):
the betas that the scheduler is being initialized with.
@@ -174,14 +175,11 @@ class CogVideoXDPMScheduler(SchedulerMixin, ConfigMixin):
The threshold value for dynamic thresholding. Valid only when `thresholding=True`.
timestep_spacing (`str`, defaults to `"leading"`):
The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and
Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information. Choose from
`leading`, `linspace` or `trailing`.
Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information.
rescale_betas_zero_snr (`bool`, defaults to `False`):
Whether to rescale the betas to have zero terminal SNR. This enables the model to generate very bright and
dark samples instead of limiting it to samples with medium brightness. Loosely related to
[`--offset_noise`](https://github.com/huggingface/diffusers/blob/74fd735eb073eb1d774b1ab4154a0876eb82f055/examples/dreambooth/train_dreambooth.py#L506).
snr_shift_scale (`float`, defaults to 3.0):
Shift scale for SNR.
"""
_compatibles = [e.name for e in KarrasDiffusionSchedulers]
@@ -193,15 +191,15 @@ class CogVideoXDPMScheduler(SchedulerMixin, ConfigMixin):
num_train_timesteps: int = 1000,
beta_start: float = 0.00085,
beta_end: float = 0.0120,
beta_schedule: Literal["linear", "scaled_linear", "squaredcos_cap_v2"] = "scaled_linear",
beta_schedule: str = "scaled_linear",
trained_betas: Optional[Union[np.ndarray, List[float]]] = None,
clip_sample: bool = True,
set_alpha_to_one: bool = True,
steps_offset: int = 0,
prediction_type: Literal["epsilon", "sample", "v_prediction"] = "epsilon",
prediction_type: str = "epsilon",
clip_sample_range: float = 1.0,
sample_max_value: float = 1.0,
timestep_spacing: Literal["leading", "linspace", "trailing"] = "leading",
timestep_spacing: str = "leading",
rescale_betas_zero_snr: bool = False,
snr_shift_scale: float = 3.0,
):
@@ -211,15 +209,7 @@ class CogVideoXDPMScheduler(SchedulerMixin, ConfigMixin):
self.betas = torch.linspace(beta_start, beta_end, num_train_timesteps, dtype=torch.float32)
elif beta_schedule == "scaled_linear":
# this schedule is very specific to the latent diffusion model.
self.betas = (
torch.linspace(
beta_start**0.5,
beta_end**0.5,
num_train_timesteps,
dtype=torch.float64,
)
** 2
)
self.betas = torch.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=torch.float64) ** 2
elif beta_schedule == "squaredcos_cap_v2":
# Glide cosine schedule
self.betas = betas_for_alpha_bar(num_train_timesteps)
@@ -276,20 +266,13 @@ class CogVideoXDPMScheduler(SchedulerMixin, ConfigMixin):
"""
return sample
def set_timesteps(
self,
num_inference_steps: int,
device: Optional[Union[str, torch.device]] = None,
):
def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.device] = None):
"""
Sets the discrete timesteps used for the diffusion chain (to be run before inference).
Args:
num_inference_steps (`int`):
The number of diffusion steps used when generating samples with a pre-trained model.
device (`str` or `torch.device`, *optional*):
The device to which the timesteps should be moved to. If `None` (the default), the timesteps are not
moved.
"""
if num_inference_steps > self.config.num_train_timesteps:
@@ -328,27 +311,7 @@ class CogVideoXDPMScheduler(SchedulerMixin, ConfigMixin):
self.timesteps = torch.from_numpy(timesteps).to(device)
def get_variables(
self,
alpha_prod_t: torch.Tensor,
alpha_prod_t_prev: torch.Tensor,
alpha_prod_t_back: Optional[torch.Tensor] = None,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], torch.Tensor, torch.Tensor]:
"""
Compute the variables used for DPM-Solver++ (2M) referencing the original implementation.
Args:
alpha_prod_t (`torch.Tensor`):
The cumulative product of alphas at the current timestep.
alpha_prod_t_prev (`torch.Tensor`):
The cumulative product of alphas at the previous timestep.
alpha_prod_t_back (`torch.Tensor`, *optional*):
The cumulative product of alphas at the timestep before the previous timestep.
Returns:
`tuple`:
A tuple containing the variables `h`, `r`, `lamb`, `lamb_next`.
"""
def get_variables(self, alpha_prod_t, alpha_prod_t_prev, alpha_prod_t_back=None):
lamb = ((alpha_prod_t / (1 - alpha_prod_t)) ** 0.5).log()
lamb_next = ((alpha_prod_t_prev / (1 - alpha_prod_t_prev)) ** 0.5).log()
h = lamb_next - lamb
@@ -361,36 +324,7 @@ class CogVideoXDPMScheduler(SchedulerMixin, ConfigMixin):
else:
return h, None, lamb, lamb_next
def get_mult(
self,
h: torch.Tensor,
r: Optional[torch.Tensor],
alpha_prod_t: torch.Tensor,
alpha_prod_t_prev: torch.Tensor,
alpha_prod_t_back: Optional[torch.Tensor] = None,
) -> Union[
Tuple[torch.Tensor, torch.Tensor],
Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor],
]:
"""
Compute the multipliers for the previous sample and the predicted original sample.
Args:
h (`torch.Tensor`):
The log-SNR difference.
r (`torch.Tensor`):
The ratio of log-SNR differences.
alpha_prod_t (`torch.Tensor`):
The cumulative product of alphas at the current timestep.
alpha_prod_t_prev (`torch.Tensor`):
The cumulative product of alphas at the previous timestep.
alpha_prod_t_back (`torch.Tensor`, *optional*):
The cumulative product of alphas at the timestep before the previous timestep.
Returns:
`tuple`:
A tuple containing the multipliers.
"""
def get_mult(self, h, r, alpha_prod_t, alpha_prod_t_prev, alpha_prod_t_back):
mult1 = ((1 - alpha_prod_t_prev) / (1 - alpha_prod_t)) ** 0.5 * (-h).exp()
mult2 = (-2 * h).expm1() * alpha_prod_t_prev**0.5
@@ -404,13 +338,13 @@ class CogVideoXDPMScheduler(SchedulerMixin, ConfigMixin):
def step(
self,
model_output: torch.Tensor,
old_pred_original_sample: Optional[torch.Tensor],
old_pred_original_sample: torch.Tensor,
timestep: int,
timestep_back: int,
sample: torch.Tensor,
eta: float = 0.0,
use_clipped_model_output: bool = False,
generator: Optional[torch.Generator] = None,
generator=None,
variance_noise: Optional[torch.Tensor] = None,
return_dict: bool = False,
) -> Union[DDIMSchedulerOutput, Tuple]:
@@ -421,12 +355,8 @@ class CogVideoXDPMScheduler(SchedulerMixin, ConfigMixin):
Args:
model_output (`torch.Tensor`):
The direct output from learned diffusion model.
old_pred_original_sample (`torch.Tensor`):
The predicted original sample from the previous timestep.
timestep (`int`):
timestep (`float`):
The current discrete timestep in the diffusion chain.
timestep_back (`int`):
The timestep to look back to.
sample (`torch.Tensor`):
A current instance of a sample created by the diffusion process.
eta (`float`):
@@ -506,12 +436,7 @@ class CogVideoXDPMScheduler(SchedulerMixin, ConfigMixin):
return prev_sample, pred_original_sample
else:
denoised_d = mult[2] * pred_original_sample - mult[3] * old_pred_original_sample
noise = randn_tensor(
sample.shape,
generator=generator,
device=sample.device,
dtype=sample.dtype,
)
noise = randn_tensor(sample.shape, generator=generator, device=sample.device, dtype=sample.dtype)
x_advanced = mult[0] * sample - mult[1] * denoised_d + mult_noise * noise
prev_sample = x_advanced
@@ -599,5 +524,5 @@ class CogVideoXDPMScheduler(SchedulerMixin, ConfigMixin):
velocity = sqrt_alpha_prod * noise - sqrt_one_minus_alpha_prod * sample
return velocity
def __len__(self) -> int:
def __len__(self):
return self.config.num_train_timesteps
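
The scheduler changes above mostly drop type hints and docstrings without touching behavior; as a quick hedged sanity sketch (not part of this diff), the simplified signatures still take the same values:
```py
from diffusers import CogVideoXDPMScheduler

# Instantiate with arguments shown in the diff, overriding a couple of the defaults
scheduler = CogVideoXDPMScheduler(
    beta_schedule="scaled_linear",
    prediction_type="v_prediction",
    timestep_spacing="trailing",
    snr_shift_scale=3.0,
)

# set_timesteps still accepts an optional device, even though its annotation was simplified
scheduler.set_timesteps(num_inference_steps=50, device="cpu")
print(scheduler.timesteps[:5])
```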