Compare commits

..

5 Commits

Author     SHA1        Message                                                     Date
Daniel Gu  7354055077  make style and make quality                                 2026-01-30 07:52:18 +01:00
Daniel Gu  cd60b3d151  Address review comments                                     2026-01-30 07:49:24 +01:00
Daniel Gu  857735f15d  Fix comment                                                 2026-01-30 02:26:36 +01:00
Daniel Gu  2e18d2c51a  Update examples to use improved encode_video function       2026-01-30 02:10:51 +01:00
Daniel Gu  d5d2910654  Support different pipeline outputs for LTX 2 encode_video   2026-01-29 09:37:06 +01:00
12 changed files with 76 additions and 45 deletions

View File

@@ -1,4 +1,4 @@
-FROM nvidia/cuda:12.9.1-runtime-ubuntu24.04
+FROM nvidia/cuda:12.9.0-runtime-ubuntu20.04
 LABEL maintainer="Hugging Face"
 LABEL repository="diffusers"
@@ -36,8 +36,7 @@ ENV PATH="$VIRTUAL_ENV/bin:$PATH"
 RUN uv pip install --no-cache-dir \
     torch \
    torchvision \
-    torchaudio \
-    --index-url https://download.pytorch.org/whl/cu129
+    torchaudio
 # Install compatible versions of numba/llvmlite for Python 3.10+
 RUN uv pip install --no-cache-dir \

View File

@@ -1,4 +1,4 @@
-FROM nvidia/cuda:12.9.1-runtime-ubuntu24.04
+FROM nvidia/cuda:12.9.0-runtime-ubuntu20.04
 LABEL maintainer="Hugging Face"
 LABEL repository="diffusers"
@@ -36,8 +36,7 @@ ENV PATH="$VIRTUAL_ENV/bin:$PATH"
 RUN uv pip install --no-cache-dir \
    torch \
    torchvision \
-    torchaudio \
-    --index-url https://download.pytorch.org/whl/cu129
+    torchaudio
 # Install compatible versions of numba/llvmlite for Python 3.10+
 RUN uv pip install --no-cache-dir \

View File

@@ -106,8 +106,6 @@ video, audio = pipe(
     output_type="np",
     return_dict=False,
 )
-video = (video * 255).round().astype("uint8")
-video = torch.from_numpy(video)
 encode_video(
     video[0],
@@ -185,8 +183,6 @@ video, audio = pipe(
     output_type="np",
     return_dict=False,
 )
-video = (video * 255).round().astype("uint8")
-video = torch.from_numpy(video)
 encode_video(
     video[0],
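
With the improved helper, the pipeline's float NumPy output can go straight into encode_video; the uint8 conversion deleted above now happens inside the function. A minimal sketch of the updated call, assuming a loaded LTX 2 pipeline bound to `pipe` and `encode_video` imported as in these docs (prompt, fps, and output path are placeholders, not values from this diff):

video, audio = pipe(
    prompt="...",  # placeholder prompt
    output_type="np",
    return_dict=False,
)
# encode_video detects a float np.ndarray with values in [0, 1] and scales it
# to uint8 [0, 255] internally, so the output can be passed through as-is.
encode_video(
    video[0],
    fps=24,  # assumed frame rate
    audio=audio[0] if audio is not None else None,  # indexing assumed to mirror the video output
    audio_sample_rate=24000,  # 24 kHz, per the new docstring
    output_path="output.mp4",
)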

View File

@@ -35,8 +35,8 @@ from . import BaseDiffusersCLICommand
 def conversion_command_factory(args: Namespace):
     if args.use_auth_token:
         warnings.warn(
-            "The `--use_auth_token` flag is deprecated and will be removed in a future version."
-            "Authentication is now handled automatically if the user is logged in."
+            "The `--use_auth_token` flag is deprecated and will be removed in a future version. Authentication is now"
+            " handled automatically if user is logged in."
         )
     return FP16SafetensorsCommand(args.ckpt_id, args.fp16, args.use_safetensors)
@@ -92,8 +92,8 @@ class FP16SafetensorsCommand(BaseDiffusersCLICommand):
         pipeline_class = getattr(import_module("diffusers"), pipeline_class_name)
         self.logger.info(f"Pipeline class imported: {pipeline_class_name}.")
-        # Load the appropriate pipeline. We could have used `DiffusionPipeline`
-        # here, but just to avoid potential edge cases.
+        # Load the appropriate pipeline. We could have use `DiffusionPipeline`
+        # here, but just to avoid any rough edge cases.
         pipeline = pipeline_class.from_pretrained(
             self.ckpt_id, torch_dtype=torch.float16 if self.fp16 else torch.float32
         )

View File

@@ -407,8 +407,8 @@ class GlmImagePipeline(DiffusionPipeline):
         if len(source_grids) > 0:
             prior_token_image_embed = self.vision_language_encoder.get_image_features(
-                inputs["pixel_values"], source_grids
-            ).pooler_output
+                inputs["pixel_values"], source_grids, return_dict=False
+            )
+            prior_token_image_embed = torch.cat(prior_token_image_embed, dim=0)
             prior_token_image_ids_d32 = self.vision_language_encoder.get_image_tokens(
                 prior_token_image_embed, source_grids

View File

@@ -13,10 +13,14 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from collections.abc import Generator, Iterator
 from fractions import Fraction
-from typing import Optional
+from typing import List, Optional, Tuple, Union
 
+import numpy as np
+import PIL.Image
 import torch
+from tqdm import tqdm
 
 from ...utils import is_av_available
@@ -101,11 +105,52 @@ def _write_audio(
 def encode_video(
-    video: torch.Tensor, fps: int, audio: Optional[torch.Tensor], audio_sample_rate: Optional[int], output_path: str
+    video: Union[List[PIL.Image.Image], np.ndarray, torch.Tensor, Iterator[torch.Tensor]],
+    fps: int,
+    audio: Optional[torch.Tensor],
+    audio_sample_rate: Optional[int],
+    output_path: str,
+    video_chunks_number: int = 1,
 ) -> None:
-    video_np = video.cpu().numpy()
+    """
+    Encodes a video with audio using the PyAV library. Based on code from the original LTX-2 repo:
+    https://github.com/Lightricks/LTX-2/blob/4f410820b198e05074a1e92de793e3b59e9ab5a0/packages/ltx-pipelines/src/ltx_pipelines/utils/media_io.py#L182
 
-    _, height, width, _ = video_np.shape
+    Args:
+        video (`List[PIL.Image.Image]` or `np.ndarray` or `torch.Tensor`):
+            A video tensor of shape [frames, height, width, channels] with integer pixel values in [0, 255]. If the
+            input is a `np.ndarray`, it is expected to be a float array with values in [0, 1] (which is what pipelines
+            usually return with `output_type="np"`).
+        fps (`int`):
+            The frames per second (FPS) of the encoded video.
+        audio (`torch.Tensor`, *optional*):
+            An audio waveform of shape [audio_channels, samples].
+        audio_sample_rate (`int`, *optional*):
+            The sampling rate of the audio waveform. For LTX 2, this is typically 24000 (24 kHz).
+        output_path (`str`):
+            The path to save the encoded video to.
+        video_chunks_number (`int`, *optional*, defaults to `1`):
+            The number of chunks to split the video into for encoding. Each chunk will be encoded separately. The
+            number of chunks to use often depends on the tiling config for the video VAE.
+    """
+    if isinstance(video, list) and isinstance(video[0], PIL.Image.Image):
+        # Pipeline output_type="pil"
+        video_frames = [np.array(frame) for frame in video]
+        video = np.stack(video_frames, axis=0)
+        video = torch.from_numpy(video)
+    elif isinstance(video, np.ndarray):
+        # Pipeline output_type="np"
+        is_denormalized = np.logical_and(np.zeros_like(video) <= video, video <= np.ones_like(video))
+        if np.all(is_denormalized):
+            video = (video * 255).round().astype("uint8")
+        video = torch.from_numpy(video)
+    if isinstance(video, torch.Tensor):
+        video = iter([video])
+
+    first_chunk = next(video)
+    _, height, width, _ = first_chunk.shape
 
     container = av.open(output_path, mode="w")
     stream = container.add_stream("libx264", rate=int(fps))
@@ -119,10 +164,18 @@ def encode_video(
         audio_stream = _prepare_audio_stream(container, audio_sample_rate)
 
-    for frame_array in video_np:
-        frame = av.VideoFrame.from_ndarray(frame_array, format="rgb24")
-        for packet in stream.encode(frame):
-            container.mux(packet)
+    def all_tiles(
+        first_chunk: torch.Tensor, tiles_generator: Generator[Tuple[torch.Tensor, int], None, None]
+    ) -> Generator[Tuple[torch.Tensor, int], None, None]:
+        yield first_chunk
+        yield from tiles_generator
+
+    for video_chunk in tqdm(all_tiles(first_chunk, video), total=video_chunks_number):
+        video_chunk_cpu = video_chunk.to("cpu").numpy()
+        for frame_array in video_chunk_cpu:
+            frame = av.VideoFrame.from_ndarray(frame_array, format="rgb24")
+            for packet in stream.encode(frame):
+                container.mux(packet)
 
     # Flush encoder
     for packet in stream.encode():
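
The refactored loop funnels every accepted input type through one path: the input is normalized to an iterator of [frames, height, width, channels] uint8 tensors, and the first chunk is peeked to size the stream. A rough usage sketch under that reading (sizes, fps, and file names are illustrative; `encode_video` is assumed to be imported from this module):

import numpy as np
import PIL.Image
import torch

# output_type="pil": a list of PIL images is stacked and converted internally.
pil_frames = [PIL.Image.new("RGB", (64, 64)) for _ in range(16)]
encode_video(pil_frames, fps=24, audio=None, audio_sample_rate=None, output_path="from_pil.mp4")

# A uint8 tensor of shape [frames, height, width, channels] is wrapped in a
# single-chunk iterator before encoding.
frames = torch.randint(0, 256, (16, 64, 64, 3), dtype=torch.uint8)
encode_video(frames, fps=24, audio=None, audio_sample_rate=None, output_path="from_tensor.mp4")

# An iterator of tensor chunks (e.g. from a tiled VAE decode) is consumed lazily;
# video_chunks_number only sets the tqdm progress-bar total.
chunks = (torch.randint(0, 256, (4, 64, 64, 3), dtype=torch.uint8) for _ in range(4))
encode_video(chunks, fps=24, audio=None, audio_sample_rate=None, output_path="from_chunks.mp4", video_chunks_number=4)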

View File

@@ -69,8 +69,6 @@ EXAMPLE_DOC_STRING = """
 ...     output_type="np",
 ...     return_dict=False,
 ... )
->>> video = (video * 255).round().astype("uint8")
->>> video = torch.from_numpy(video)
 >>> encode_video(
 ...     video[0],

View File

@@ -75,8 +75,6 @@ EXAMPLE_DOC_STRING = """
 ...     output_type="np",
 ...     return_dict=False,
 ... )
->>> video = (video * 255).round().astype("uint8")
->>> video = torch.from_numpy(video)
 >>> encode_video(
 ...     video[0],

View File

@@ -76,8 +76,6 @@ EXAMPLE_DOC_STRING = """
 ...     output_type="np",
 ...     return_dict=False,
 ... )[0]
->>> video = (video * 255).round().astype("uint8")
->>> video = torch.from_numpy(video)
 >>> encode_video(
 ...     video[0],

View File

@@ -496,13 +496,8 @@ class WanPipeline(DiffusionPipeline, WanLoraLoaderMixin):
         num_frames = num_frames // self.vae_scale_factor_temporal * self.vae_scale_factor_temporal + 1
         num_frames = max(num_frames, 1)
 
-        patch_size = (
-            self.transformer.config.patch_size
-            if self.transformer is not None
-            else self.transformer_2.config.patch_size
-        )
-        h_multiple_of = self.vae_scale_factor_spatial * patch_size[1]
-        w_multiple_of = self.vae_scale_factor_spatial * patch_size[2]
+        h_multiple_of = self.vae_scale_factor_spatial * self.transformer.config.patch_size[1]
+        w_multiple_of = self.vae_scale_factor_spatial * self.transformer.config.patch_size[2]
         calc_height = height // h_multiple_of * h_multiple_of
         calc_width = width // w_multiple_of * w_multiple_of
 
         if height != calc_height or width != calc_width:
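
One side of this hunk falls back to `self.transformer_2.config.patch_size` when `self.transformer` is `None`; the other reads `self.transformer.config.patch_size` unconditionally. The snapping arithmetic itself is unchanged either way. A small worked example, with a spatial VAE factor of 8 and a patch size of (1, 2, 2) assumed purely for illustration:

vae_scale_factor_spatial = 8  # assumed value for illustration
patch_size = (1, 2, 2)        # assumed (temporal, height, width) patch size
h_multiple_of = vae_scale_factor_spatial * patch_size[1]  # 16
w_multiple_of = vae_scale_factor_spatial * patch_size[2]  # 16
height, width = 481, 833
calc_height = height // h_multiple_of * h_multiple_of  # 480
calc_width = width // w_multiple_of * w_multiple_of    # 832
# 481x833 is snapped down to the nearest multiples, 480x832, which is what
# triggers the resize branch guarded by the final `if` above.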

View File

@@ -637,13 +637,8 @@ class WanImageToVideoPipeline(DiffusionPipeline, WanLoraLoaderMixin):
         num_frames = num_frames // self.vae_scale_factor_temporal * self.vae_scale_factor_temporal + 1
         num_frames = max(num_frames, 1)
 
-        patch_size = (
-            self.transformer.config.patch_size
-            if self.transformer is not None
-            else self.transformer_2.config.patch_size
-        )
-        h_multiple_of = self.vae_scale_factor_spatial * patch_size[1]
-        w_multiple_of = self.vae_scale_factor_spatial * patch_size[2]
+        h_multiple_of = self.vae_scale_factor_spatial * self.transformer.config.patch_size[1]
+        w_multiple_of = self.vae_scale_factor_spatial * self.transformer.config.patch_size[2]
         calc_height = height // h_multiple_of * h_multiple_of
         calc_width = width // w_multiple_of * w_multiple_of
 
         if height != calc_height or width != calc_width:

View File

@@ -227,7 +227,7 @@ _cosmos_guardrail_available, _cosmos_guardrail_version = _is_package_available("
 _sageattention_available, _sageattention_version = _is_package_available("sageattention")
 _flash_attn_available, _flash_attn_version = _is_package_available("flash_attn")
 _flash_attn_3_available, _flash_attn_3_version = _is_package_available("flash_attn_3")
-_aiter_available, _aiter_version = _is_package_available("aiter", get_dist_name=True)
+_aiter_available, _aiter_version = _is_package_available("aiter")
 _kornia_available, _kornia_version = _is_package_available("kornia")
 _nvidia_modelopt_available, _nvidia_modelopt_version = _is_package_available("modelopt", get_dist_name=True)
 _av_available, _av_version = _is_package_available("av")
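
For context, `get_dist_name=True` is used when a package's import name differs from its PyPI distribution name (e.g. `modelopt` is distributed as `nvidia-modelopt`), so its version must be looked up under the distribution name. `_is_package_available` is diffusers-internal and its exact implementation is not shown in this compare; a rough standard-library sketch of the distinction it handles:

import importlib.util
from importlib.metadata import PackageNotFoundError, version
from typing import Optional, Tuple

def is_package_available(import_name: str, dist_name: Optional[str] = None) -> Tuple[bool, str]:
    # Importability is checked against the module name...
    found = importlib.util.find_spec(import_name) is not None
    try:
        # ...but the installed version is registered under the distribution
        # name, which may differ from the import name.
        pkg_version = version(dist_name or import_name)
    except PackageNotFoundError:
        pkg_version = "N/A"
    return found, pkg_version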