Mirror of https://github.com/huggingface/diffusers.git (synced 2026-01-31 07:55:01 +08:00)

Compare commits: ltx2-impro...main (5 commits)
| Author | SHA1 | Date |
|---|---|---|
|  | ec6b2bcccb |  |
|  | 6a1904eb06 |  |
|  | f5b6b6625a |  |
|  | 1be2f7e8c5 |  |
|  | 314cfddf3a |  |
```diff
@@ -1,4 +1,4 @@
-FROM nvidia/cuda:12.9.0-runtime-ubuntu20.04
+FROM nvidia/cuda:12.9.1-runtime-ubuntu24.04
 LABEL maintainer="Hugging Face"
 LABEL repository="diffusers"

@@ -36,7 +36,8 @@ ENV PATH="$VIRTUAL_ENV/bin:$PATH"
 RUN uv pip install --no-cache-dir \
     torch \
     torchvision \
-    torchaudio
+    torchaudio \
+    --index-url https://download.pytorch.org/whl/cu129

 # Install compatible versions of numba/llvmlite for Python 3.10+
 RUN uv pip install --no-cache-dir \
```
```diff
@@ -1,4 +1,4 @@
-FROM nvidia/cuda:12.9.0-runtime-ubuntu20.04
+FROM nvidia/cuda:12.9.1-runtime-ubuntu24.04
 LABEL maintainer="Hugging Face"
 LABEL repository="diffusers"

@@ -36,7 +36,8 @@ ENV PATH="$VIRTUAL_ENV/bin:$PATH"
 RUN uv pip install --no-cache-dir \
     torch \
     torchvision \
-    torchaudio
+    torchaudio \
+    --index-url https://download.pytorch.org/whl/cu129

 # Install compatible versions of numba/llvmlite for Python 3.10+
 RUN uv pip install --no-cache-dir \
```
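The Dockerfile changes above move to a CUDA 12.9.1 / Ubuntu 24.04 base image and pin the PyTorch install to the cu129 wheel index. A minimal sanity check one might run inside the rebuilt image to confirm the CUDA-enabled wheels were actually picked up; the exact version strings depend on the image and on a GPU being visible at runtime:

```python
import torch

# Quick check that the wheels came from the cu129 index and that CUDA is usable.
# Output values are illustrative; they depend on the built image and the host GPU.
print(torch.__version__)          # expect a "+cu129"-style build string
print(torch.version.cuda)         # CUDA version the wheel was built against
print(torch.cuda.is_available())  # True only when the container can see a GPU
```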
```diff
@@ -106,6 +106,8 @@ video, audio = pipe(
     output_type="np",
     return_dict=False,
 )
+video = (video * 255).round().astype("uint8")
+video = torch.from_numpy(video)

 encode_video(
     video[0],
```
```diff
@@ -183,6 +185,8 @@ video, audio = pipe(
     output_type="np",
     return_dict=False,
 )
+video = (video * 255).round().astype("uint8")
+video = torch.from_numpy(video)

 encode_video(
     video[0],
```
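The two added lines convert the pipeline's `output_type="np"` result, a float array with values in [0, 1], into the uint8 tensor that `encode_video` now expects. A small self-contained sketch of that conversion; the array here is random stand-in data, not a real pipeline output:

```python
import numpy as np
import torch

# Stand-in for a pipeline output with output_type="np":
# float values in [0, 1], shape [batch, frames, height, width, channels].
video = np.random.rand(1, 8, 64, 64, 3).astype(np.float32)

# Scale to [0, 255] and round *before* casting; a bare astype("uint8")
# would truncate and bias pixel values downward.
video = (video * 255).round().astype("uint8")
video = torch.from_numpy(video)

print(video.shape, video.dtype)  # torch.Size([1, 8, 64, 64, 3]) torch.uint8
```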
```diff
@@ -35,8 +35,8 @@ from . import BaseDiffusersCLICommand
 def conversion_command_factory(args: Namespace):
     if args.use_auth_token:
         warnings.warn(
-            "The `--use_auth_token` flag is deprecated and will be removed in a future version. Authentication is now"
-            " handled automatically if user is logged in."
+            "The `--use_auth_token` flag is deprecated and will be removed in a future version."
+            "Authentication is now handled automatically if the user is logged in."
         )
     return FP16SafetensorsCommand(args.ckpt_id, args.fp16, args.use_safetensors)
```
```diff
@@ -92,8 +92,8 @@ class FP16SafetensorsCommand(BaseDiffusersCLICommand):
         pipeline_class = getattr(import_module("diffusers"), pipeline_class_name)
         self.logger.info(f"Pipeline class imported: {pipeline_class_name}.")

-        # Load the appropriate pipeline. We could have use `DiffusionPipeline`
-        # here, but just to avoid any rough edge cases.
+        # Load the appropriate pipeline. We could have used `DiffusionPipeline`
+        # here, but just to avoid potential edge cases.
         pipeline = pipeline_class.from_pretrained(
             self.ckpt_id, torch_dtype=torch.float16 if self.fp16 else torch.float32
         )
```
```diff
@@ -407,8 +407,8 @@ class GlmImagePipeline(DiffusionPipeline):

         if len(source_grids) > 0:
             prior_token_image_embed = self.vision_language_encoder.get_image_features(
-                inputs["pixel_values"], source_grids, return_dict=False
-            )
+                inputs["pixel_values"], source_grids
+            ).pooler_output
             prior_token_image_embed = torch.cat(prior_token_image_embed, dim=0)
             prior_token_image_ids_d32 = self.vision_language_encoder.get_image_tokens(
                 prior_token_image_embed, source_grids
```
```diff
@@ -13,14 +13,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-from collections.abc import Generator, Iterator
 from fractions import Fraction
-from typing import List, Optional, Tuple, Union
+from typing import Optional

 import numpy as np
-import PIL.Image
 import torch
-from tqdm import tqdm

 from ...utils import is_av_available
```
```diff
@@ -105,52 +101,11 @@ def _write_audio(


 def encode_video(
-    video: Union[List[PIL.Image.Image], np.ndarray, torch.Tensor, Iterator[torch.Tensor]],
-    fps: int,
-    audio: Optional[torch.Tensor],
-    audio_sample_rate: Optional[int],
-    output_path: str,
-    video_chunks_number: int = 1,
+    video: torch.Tensor, fps: int, audio: Optional[torch.Tensor], audio_sample_rate: Optional[int], output_path: str
 ) -> None:
     """
     Encodes a video with audio using the PyAV library. Based on code from the original LTX-2 repo:
     https://github.com/Lightricks/LTX-2/blob/4f410820b198e05074a1e92de793e3b59e9ab5a0/packages/ltx-pipelines/src/ltx_pipelines/utils/media_io.py#L182
-
-    Args:
-        video (`List[PIL.Image.Image]` or `np.ndarray` or `torch.Tensor`):
-            A video tensor of shape [frames, height, width, channels] with integer pixel values in [0, 255]. If the
-            input is a `np.ndarray`, it is expected to be a float array with values in [0, 1] (which is what pipelines
-            usually return with `output_type="np"`).
-        fps (`int`)
-            The frames per second (FPS) of the encoded video.
-        audio (`torch.Tensor`, *optional*):
-            An audio waveform of shape [audio_channels, samples].
-        audio_sample_rate: (`int`, *optional*):
-            The sampling rate of the audio waveform. For LTX 2, this is typically 24000 (24 kHz).
-        output_path (`str`):
-            The path to save the encoded video to.
-        video_chunks_number (`int`, *optional*, defaults to `1`):
-            The number of chunks to split the video into for encoding. Each chunk will be encoded separately. The
-            number of chunks to use often depends on the tiling config for the video VAE.
     """
-    if isinstance(video, list) and isinstance(video[0], PIL.Image.Image):
-        # Pipeline output_type="pil"
-        video_frames = [np.array(frame) for frame in video]
-        video = np.stack(video_frames, axis=0)
-        video = torch.from_numpy(video)
-    elif isinstance(video, np.ndarray):
-        # Pipeline output_type="np"
-        is_denormalized = np.logical_and(np.zeros_like(video) <= video, video <= np.ones_like(video))
-        if np.all(is_denormalized):
-            video = (video * 255).round().astype("uint8")
-        video = torch.from_numpy(video)
-
-    if isinstance(video, torch.Tensor):
-        video = iter([video])
-
-    first_chunk = next(video)
-
-    _, height, width, _ = first_chunk.shape
+    video_np = video.cpu().numpy()
+
+    _, height, width, _ = video_np.shape

     container = av.open(output_path, mode="w")
     stream = container.add_stream("libx264", rate=int(fps))
```
```diff
@@ -164,18 +119,10 @@ def encode_video(

     audio_stream = _prepare_audio_stream(container, audio_sample_rate)

-    def all_tiles(
-        first_chunk: torch.Tensor, tiles_generator: Generator[Tuple[torch.Tensor, int], None, None]
-    ) -> Generator[Tuple[torch.Tensor, int], None, None]:
-        yield first_chunk
-        yield from tiles_generator
-
-    for video_chunk in tqdm(all_tiles(first_chunk, video), total=video_chunks_number):
-        video_chunk_cpu = video_chunk.to("cpu").numpy()
-        for frame_array in video_chunk_cpu:
-            frame = av.VideoFrame.from_ndarray(frame_array, format="rgb24")
-            for packet in stream.encode(frame):
-                container.mux(packet)
+    for frame_array in video_np:
+        frame = av.VideoFrame.from_ndarray(frame_array, format="rgb24")
+        for packet in stream.encode(frame):
+            container.mux(packet)

     # Flush encoder
     for packet in stream.encode():
```
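The simplified body follows the standard PyAV encode/mux/flush pattern. A minimal standalone sketch of that pattern, assuming an existing uint8 array `frames` of shape [num_frames, height, width, 3]; the audio muxing the real function also performs is omitted here:

```python
import av
import numpy as np

# Assumed stand-in input: 24 RGB frames of 64x64 noise.
frames = np.random.randint(0, 256, size=(24, 64, 64, 3), dtype=np.uint8)
fps = 24

container = av.open("example.mp4", mode="w")
stream = container.add_stream("libx264", rate=fps)
stream.width = frames.shape[2]
stream.height = frames.shape[1]
stream.pix_fmt = "yuv420p"

for frame_array in frames:
    frame = av.VideoFrame.from_ndarray(frame_array, format="rgb24")
    for packet in stream.encode(frame):  # the encoder may buffer, so packets can lag frames
        container.mux(packet)

for packet in stream.encode():           # flush whatever the encoder still buffers
    container.mux(packet)
container.close()
```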
```diff
@@ -69,6 +69,8 @@ EXAMPLE_DOC_STRING = """
         ...     output_type="np",
         ...     return_dict=False,
         ... )
+        >>> video = (video * 255).round().astype("uint8")
+        >>> video = torch.from_numpy(video)

         >>> encode_video(
         ...     video[0],
```
```diff
@@ -75,6 +75,8 @@ EXAMPLE_DOC_STRING = """
         ...     output_type="np",
         ...     return_dict=False,
         ... )
+        >>> video = (video * 255).round().astype("uint8")
+        >>> video = torch.from_numpy(video)

         >>> encode_video(
         ...     video[0],
```
```diff
@@ -76,6 +76,8 @@ EXAMPLE_DOC_STRING = """
         ...     output_type="np",
         ...     return_dict=False,
         ... )[0]
+        >>> video = (video * 255).round().astype("uint8")
+        >>> video = torch.from_numpy(video)

         >>> encode_video(
         ...     video[0],
```
```diff
@@ -496,8 +496,13 @@ class WanPipeline(DiffusionPipeline, WanLoraLoaderMixin):
         num_frames = num_frames // self.vae_scale_factor_temporal * self.vae_scale_factor_temporal + 1
         num_frames = max(num_frames, 1)

-        h_multiple_of = self.vae_scale_factor_spatial * self.transformer.config.patch_size[1]
-        w_multiple_of = self.vae_scale_factor_spatial * self.transformer.config.patch_size[2]
+        patch_size = (
+            self.transformer.config.patch_size
+            if self.transformer is not None
+            else self.transformer_2.config.patch_size
+        )
+        h_multiple_of = self.vae_scale_factor_spatial * patch_size[1]
+        w_multiple_of = self.vae_scale_factor_spatial * patch_size[2]
         calc_height = height // h_multiple_of * h_multiple_of
         calc_width = width // w_multiple_of * w_multiple_of
         if height != calc_height or width != calc_width:
```
```diff
@@ -637,8 +637,13 @@ class WanImageToVideoPipeline(DiffusionPipeline, WanLoraLoaderMixin):
         num_frames = num_frames // self.vae_scale_factor_temporal * self.vae_scale_factor_temporal + 1
         num_frames = max(num_frames, 1)

-        h_multiple_of = self.vae_scale_factor_spatial * self.transformer.config.patch_size[1]
-        w_multiple_of = self.vae_scale_factor_spatial * self.transformer.config.patch_size[2]
+        patch_size = (
+            self.transformer.config.patch_size
+            if self.transformer is not None
+            else self.transformer_2.config.patch_size
+        )
+        h_multiple_of = self.vae_scale_factor_spatial * patch_size[1]
+        w_multiple_of = self.vae_scale_factor_spatial * patch_size[2]
         calc_height = height // h_multiple_of * h_multiple_of
         calc_width = width // w_multiple_of * w_multiple_of
         if height != calc_height or width != calc_width:
```
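Both Wan pipelines snap the requested height and width down to a multiple of `vae_scale_factor_spatial * patch_size`, now reading the patch size from whichever transformer is loaded. A small sketch of that rounding with assumed example values; the real factors come from the pipeline's VAE and transformer configs:

```python
def snap_down(value: int, multiple_of: int) -> int:
    # Mirrors `height // h_multiple_of * h_multiple_of`: round down to the nearest multiple.
    return value // multiple_of * multiple_of

# Assumed example values, not read from a real config.
vae_scale_factor_spatial = 8
patch_size = (1, 2, 2)  # (temporal, height, width) patching

h_multiple_of = vae_scale_factor_spatial * patch_size[1]  # 16
w_multiple_of = vae_scale_factor_spatial * patch_size[2]  # 16

print(snap_down(721, h_multiple_of), snap_down(1281, w_multiple_of))  # 720 1280
```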
```diff
@@ -227,7 +227,7 @@ _cosmos_guardrail_available, _cosmos_guardrail_version = _is_package_available("
 _sageattention_available, _sageattention_version = _is_package_available("sageattention")
 _flash_attn_available, _flash_attn_version = _is_package_available("flash_attn")
 _flash_attn_3_available, _flash_attn_3_version = _is_package_available("flash_attn_3")
-_aiter_available, _aiter_version = _is_package_available("aiter")
+_aiter_available, _aiter_version = _is_package_available("aiter", get_dist_name=True)
 _kornia_available, _kornia_version = _is_package_available("kornia")
 _nvidia_modelopt_available, _nvidia_modelopt_version = _is_package_available("modelopt", get_dist_name=True)
 _av_available, _av_version = _is_package_available("av")
```
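The `get_dist_name=True` flag, already used for `modelopt` above, suggests the version is resolved from the installed distribution's metadata rather than from the imported module. A hedged illustration of that distinction using only the standard library; `_is_package_available` itself is a diffusers-internal helper and may work differently:

```python
from importlib.metadata import PackageNotFoundError, version

def dist_version(dist_name: str):
    # Look up a version from installed-distribution metadata; this works even when
    # the pip distribution name differs from the importable module name, or when
    # the module does not expose a __version__ attribute.
    try:
        return version(dist_name)
    except PackageNotFoundError:
        return None

# "aiter" is used here only as an example lookup key; the actual distribution
# name on a given system may differ, in which case this simply returns None.
print(dist_version("aiter"))
```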