Compare commits

..

5 Commits

Author SHA1 Message Date
Mikko Lauri
ec6b2bcccb Fix aiter availability check (#13059)
Update import_utils.py
2026-01-30 19:24:05 +05:30
Jared Wen
6a1904eb06 [bug fix] GLM-Image fit new get_image_features API (#13052)
change get_image_features API

Signed-off-by: JaredforReal <w13431838023@gmail.com>
Co-authored-by: YiYi Xu <yixu310@gmail.com>
2026-01-29 16:16:42 -10:00
Sayak Paul
f5b6b6625a [wan] fix wan 2.2 when either of the transformers isn't present. (#13055)
fix wan 2.2 when either of the transformers isn't present.
2026-01-29 08:45:24 -10:00
Olexandr88
1be2f7e8c5 docs: fix grammar in fp16_safetensors CLI warning (#13040)
* docs: fix grammar in fp16_safetensors CLI warning

* Apply style fixes

---------

Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
2026-01-29 21:33:09 +05:30
Sayak Paul
314cfddf3a [ci] uniform run times and wheels for pytorch cuda. (#13047)
* uniform run times and wheels for pytorch cuda.

* 12.9

* change to 24.04.

* change to 24.04.
2026-01-29 19:22:30 +05:30
12 changed files with 45 additions and 76 deletions

View File

@@ -1,4 +1,4 @@
FROM nvidia/cuda:12.9.0-runtime-ubuntu20.04
FROM nvidia/cuda:12.9.1-runtime-ubuntu24.04
LABEL maintainer="Hugging Face"
LABEL repository="diffusers"
@@ -36,7 +36,8 @@ ENV PATH="$VIRTUAL_ENV/bin:$PATH"
RUN uv pip install --no-cache-dir \
torch \
torchvision \
torchaudio
torchaudio \
--index-url https://download.pytorch.org/whl/cu129
# Install compatible versions of numba/llvmlite for Python 3.10+
RUN uv pip install --no-cache-dir \

View File

@@ -1,4 +1,4 @@
FROM nvidia/cuda:12.9.0-runtime-ubuntu20.04
FROM nvidia/cuda:12.9.1-runtime-ubuntu24.04
LABEL maintainer="Hugging Face"
LABEL repository="diffusers"
@@ -36,7 +36,8 @@ ENV PATH="$VIRTUAL_ENV/bin:$PATH"
RUN uv pip install --no-cache-dir \
torch \
torchvision \
torchaudio
torchaudio \
--index-url https://download.pytorch.org/whl/cu129
# Install compatible versions of numba/llvmlite for Python 3.10+
RUN uv pip install --no-cache-dir \

View File

@@ -106,6 +106,8 @@ video, audio = pipe(
output_type="np",
return_dict=False,
)
video = (video * 255).round().astype("uint8")
video = torch.from_numpy(video)
encode_video(
video[0],
@@ -183,6 +185,8 @@ video, audio = pipe(
output_type="np",
return_dict=False,
)
video = (video * 255).round().astype("uint8")
video = torch.from_numpy(video)
encode_video(
video[0],

View File

@@ -35,8 +35,8 @@ from . import BaseDiffusersCLICommand
def conversion_command_factory(args: Namespace):
if args.use_auth_token:
warnings.warn(
"The `--use_auth_token` flag is deprecated and will be removed in a future version. Authentication is now"
" handled automatically if user is logged in."
"The `--use_auth_token` flag is deprecated and will be removed in a future version. "
"Authentication is now handled automatically if the user is logged in."
)
return FP16SafetensorsCommand(args.ckpt_id, args.fp16, args.use_safetensors)
@@ -92,8 +92,8 @@ class FP16SafetensorsCommand(BaseDiffusersCLICommand):
pipeline_class = getattr(import_module("diffusers"), pipeline_class_name)
self.logger.info(f"Pipeline class imported: {pipeline_class_name}.")
# Load the appropriate pipeline. We could have use `DiffusionPipeline`
# here, but just to avoid any rough edge cases.
# Load the appropriate pipeline. We could have used `DiffusionPipeline`
# here, but just to avoid potential edge cases.
pipeline = pipeline_class.from_pretrained(
self.ckpt_id, torch_dtype=torch.float16 if self.fp16 else torch.float32
)

View File

@@ -407,8 +407,8 @@ class GlmImagePipeline(DiffusionPipeline):
if len(source_grids) > 0:
prior_token_image_embed = self.vision_language_encoder.get_image_features(
inputs["pixel_values"], source_grids, return_dict=False
)
inputs["pixel_values"], source_grids
).pooler_output
prior_token_image_embed = torch.cat(prior_token_image_embed, dim=0)
prior_token_image_ids_d32 = self.vision_language_encoder.get_image_tokens(
prior_token_image_embed, source_grids

View File

@@ -13,14 +13,10 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from collections.abc import Generator, Iterator
from fractions import Fraction
from typing import List, Optional, Tuple, Union
from typing import Optional
import numpy as np
import PIL.Image
import torch
from tqdm import tqdm
from ...utils import is_av_available
@@ -105,52 +101,11 @@ def _write_audio(
def encode_video(
video: Union[List[PIL.Image.Image], np.ndarray, torch.Tensor, Iterator[torch.Tensor]],
fps: int,
audio: Optional[torch.Tensor],
audio_sample_rate: Optional[int],
output_path: str,
video_chunks_number: int = 1,
video: torch.Tensor, fps: int, audio: Optional[torch.Tensor], audio_sample_rate: Optional[int], output_path: str
) -> None:
"""
Encodes a video with audio using the PyAV library. Based on code from the original LTX-2 repo:
https://github.com/Lightricks/LTX-2/blob/4f410820b198e05074a1e92de793e3b59e9ab5a0/packages/ltx-pipelines/src/ltx_pipelines/utils/media_io.py#L182
video_np = video.cpu().numpy()
Args:
video (`List[PIL.Image.Image]` or `np.ndarray` or `torch.Tensor`):
A video tensor of shape [frames, height, width, channels] with integer pixel values in [0, 255]. If the
input is a `np.ndarray`, it is expected to be a float array with values in [0, 1] (which is what pipelines
usually return with `output_type="np"`).
fps (`int`)
The frames per second (FPS) of the encoded video.
audio (`torch.Tensor`, *optional*):
An audio waveform of shape [audio_channels, samples].
audio_sample_rate: (`int`, *optional*):
The sampling rate of the audio waveform. For LTX 2, this is typically 24000 (24 kHz).
output_path (`str`):
The path to save the encoded video to.
video_chunks_number (`int`, *optional*, defaults to `1`):
The number of chunks to split the video into for encoding. Each chunk will be encoded separately. The
number of chunks to use often depends on the tiling config for the video VAE.
"""
if isinstance(video, list) and isinstance(video[0], PIL.Image.Image):
# Pipeline output_type="pil"
video_frames = [np.array(frame) for frame in video]
video = np.stack(video_frames, axis=0)
video = torch.from_numpy(video)
elif isinstance(video, np.ndarray):
# Pipeline output_type="np"
is_denormalized = np.logical_and(np.zeros_like(video) <= video, video <= np.ones_like(video))
if np.all(is_denormalized):
video = (video * 255).round().astype("uint8")
video = torch.from_numpy(video)
if isinstance(video, torch.Tensor):
video = iter([video])
first_chunk = next(video)
_, height, width, _ = first_chunk.shape
_, height, width, _ = video_np.shape
container = av.open(output_path, mode="w")
stream = container.add_stream("libx264", rate=int(fps))
@@ -164,18 +119,10 @@ def encode_video(
audio_stream = _prepare_audio_stream(container, audio_sample_rate)
def all_tiles(
first_chunk: torch.Tensor, tiles_generator: Generator[Tuple[torch.Tensor, int], None, None]
) -> Generator[Tuple[torch.Tensor, int], None, None]:
yield first_chunk
yield from tiles_generator
for video_chunk in tqdm(all_tiles(first_chunk, video), total=video_chunks_number):
video_chunk_cpu = video_chunk.to("cpu").numpy()
for frame_array in video_chunk_cpu:
frame = av.VideoFrame.from_ndarray(frame_array, format="rgb24")
for packet in stream.encode(frame):
container.mux(packet)
for frame_array in video_np:
frame = av.VideoFrame.from_ndarray(frame_array, format="rgb24")
for packet in stream.encode(frame):
container.mux(packet)
# Flush encoder
for packet in stream.encode():

View File

@@ -69,6 +69,8 @@ EXAMPLE_DOC_STRING = """
... output_type="np",
... return_dict=False,
... )
>>> video = (video * 255).round().astype("uint8")
>>> video = torch.from_numpy(video)
>>> encode_video(
... video[0],

View File

@@ -75,6 +75,8 @@ EXAMPLE_DOC_STRING = """
... output_type="np",
... return_dict=False,
... )
>>> video = (video * 255).round().astype("uint8")
>>> video = torch.from_numpy(video)
>>> encode_video(
... video[0],

View File

@@ -76,6 +76,8 @@ EXAMPLE_DOC_STRING = """
... output_type="np",
... return_dict=False,
... )[0]
>>> video = (video * 255).round().astype("uint8")
>>> video = torch.from_numpy(video)
>>> encode_video(
... video[0],

View File

@@ -496,8 +496,13 @@ class WanPipeline(DiffusionPipeline, WanLoraLoaderMixin):
num_frames = num_frames // self.vae_scale_factor_temporal * self.vae_scale_factor_temporal + 1
num_frames = max(num_frames, 1)
h_multiple_of = self.vae_scale_factor_spatial * self.transformer.config.patch_size[1]
w_multiple_of = self.vae_scale_factor_spatial * self.transformer.config.patch_size[2]
patch_size = (
self.transformer.config.patch_size
if self.transformer is not None
else self.transformer_2.config.patch_size
)
h_multiple_of = self.vae_scale_factor_spatial * patch_size[1]
w_multiple_of = self.vae_scale_factor_spatial * patch_size[2]
calc_height = height // h_multiple_of * h_multiple_of
calc_width = width // w_multiple_of * w_multiple_of
if height != calc_height or width != calc_width:

View File

@@ -637,8 +637,13 @@ class WanImageToVideoPipeline(DiffusionPipeline, WanLoraLoaderMixin):
num_frames = num_frames // self.vae_scale_factor_temporal * self.vae_scale_factor_temporal + 1
num_frames = max(num_frames, 1)
h_multiple_of = self.vae_scale_factor_spatial * self.transformer.config.patch_size[1]
w_multiple_of = self.vae_scale_factor_spatial * self.transformer.config.patch_size[2]
patch_size = (
self.transformer.config.patch_size
if self.transformer is not None
else self.transformer_2.config.patch_size
)
h_multiple_of = self.vae_scale_factor_spatial * patch_size[1]
w_multiple_of = self.vae_scale_factor_spatial * patch_size[2]
calc_height = height // h_multiple_of * h_multiple_of
calc_width = width // w_multiple_of * w_multiple_of
if height != calc_height or width != calc_width:

View File

@@ -227,7 +227,7 @@ _cosmos_guardrail_available, _cosmos_guardrail_version = _is_package_available("
_sageattention_available, _sageattention_version = _is_package_available("sageattention")
_flash_attn_available, _flash_attn_version = _is_package_available("flash_attn")
_flash_attn_3_available, _flash_attn_3_version = _is_package_available("flash_attn_3")
_aiter_available, _aiter_version = _is_package_available("aiter")
_aiter_available, _aiter_version = _is_package_available("aiter", get_dist_name=True)
_kornia_available, _kornia_version = _is_package_available("kornia")
_nvidia_modelopt_available, _nvidia_modelopt_version = _is_package_available("modelopt", get_dist_name=True)
_av_available, _av_version = _is_package_available("av")