Compare commits


3 Commits

| Author | SHA1 | Message | Date |
|:---|:---|:---|:---|
| Sayak Paul | cc0335154f | Merge branch 'main' into hidream-non-diffusers-lora | 2025-05-09 13:55:40 +05:30 |
| sayakpaul | e1968390c1 | make fix-copies | 2025-05-09 11:30:08 +05:30 |
| sayakpaul | 3c84a3c108 | support non-diffusers hidream loras | 2025-05-09 11:05:06 +05:30 |
30 changed files with 156 additions and 739 deletions

View File

@@ -457,8 +457,6 @@
title: Flux
- local: api/pipelines/control_flux_inpaint
title: FluxControlInpaint
- local: api/pipelines/framepack
title: Framepack
- local: api/pipelines/hidream
title: HiDream-I1
- local: api/pipelines/hunyuandit

View File

@@ -21,22 +21,6 @@ from diffusers import HiDreamImageTransformer2DModel
transformer = HiDreamImageTransformer2DModel.from_pretrained("HiDream-ai/HiDream-I1-Full", subfolder="transformer", torch_dtype=torch.bfloat16)
```
## Loading GGUF quantized checkpoints for HiDream-I1
GGUF checkpoints for the `HiDreamImageTransformer2DModel` can be loaded using [`~FromOriginalModelMixin.from_single_file`].
```python
import torch
from diffusers import GGUFQuantizationConfig, HiDreamImageTransformer2DModel
ckpt_path = "https://huggingface.co/city96/HiDream-I1-Dev-gguf/blob/main/hidream-i1-dev-Q2_K.gguf"
transformer = HiDreamImageTransformer2DModel.from_single_file(
ckpt_path,
quantization_config=GGUFQuantizationConfig(compute_dtype=torch.bfloat16),
torch_dtype=torch.bfloat16
)
```
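The quantized transformer can then be plugged into the full pipeline. Below is a minimal sketch, not an official example: the Llama repo id, the `tokenizer_4`/`text_encoder_4` handling (taken from the pipeline docstring), and the prompt are assumptions for illustration.
```python
import torch
from transformers import LlamaForCausalLM, PreTrainedTokenizerFast
from diffusers import HiDreamImagePipeline

# Assumption: the fourth tokenizer/text encoder are loaded separately from a Llama 3.1 Instruct
# checkpoint, mirroring the HiDreamImagePipeline docstring; the remaining components come from
# the base HiDream-I1 repository.
tokenizer_4 = PreTrainedTokenizerFast.from_pretrained("meta-llama/Meta-Llama-3.1-8B-Instruct")
text_encoder_4 = LlamaForCausalLM.from_pretrained(
    "meta-llama/Meta-Llama-3.1-8B-Instruct", output_hidden_states=True, torch_dtype=torch.bfloat16
)

pipe = HiDreamImagePipeline.from_pretrained(
    "HiDream-ai/HiDream-I1-Dev",
    transformer=transformer,  # the GGUF-quantized transformer loaded above
    tokenizer_4=tokenizer_4,
    text_encoder_4=text_encoder_4,
    torch_dtype=torch.bfloat16,
)
pipe.enable_model_cpu_offload()

image = pipe("a cat holding a sign that says hello", num_inference_steps=28).images[0]
image.save("hidream.png")
```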
## HiDreamImageTransformer2DModel
[[autodoc]] HiDreamImageTransformer2DModel

View File

@@ -1,209 +0,0 @@
<!-- Copyright 2025 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License. -->
# Framepack
<div class="flex flex-wrap space-x-1">
<img alt="LoRA" src="https://img.shields.io/badge/LoRA-d8b4fe?style=flat"/>
</div>
[Packing Input Frame Context in Next-Frame Prediction Models for Video Generation](https://arxiv.org/abs/2504.12626) by Lvmin Zhang and Maneesh Agrawala.
*We present a neural network structure, FramePack, to train next-frame (or next-frame-section) prediction models for video generation. The FramePack compresses input frames to make the transformer context length a fixed number regardless of the video length. As a result, we are able to process a large number of frames using video diffusion with computation bottleneck similar to image diffusion. This also makes the training video batch sizes significantly higher (batch sizes become comparable to image diffusion training). We also propose an anti-drifting sampling method that generates frames in inverted temporal order with early-established endpoints to avoid exposure bias (error accumulation over iterations). Finally, we show that existing video diffusion models can be finetuned with FramePack, and their visual quality may be improved because the next-frame prediction supports more balanced diffusion schedulers with less extreme flow shift timesteps.*
<Tip>
Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines.
</Tip>
## Available models
| Model name | Description |
|:---|:---|
| [`lllyasviel/FramePackI2V_HY`](https://huggingface.co/lllyasviel/FramePackI2V_HY) | Trained with the "inverted anti-drifting" strategy as described in the paper. Inference requires setting `sampling_type="inverted_anti_drifting"` when running the pipeline. |
| [`lllyasviel/FramePack_F1_I2V_HY_20250503`](https://huggingface.co/lllyasviel/FramePack_F1_I2V_HY_20250503) | Trained with a novel anti-drifting strategy but inference is performed in the "vanilla" strategy as described in the paper. Inference requires setting `sampling_type="vanilla"` when running the pipeline. |
## Usage
Refer to the pipeline documentation for basic usage examples. The following section contains examples of offloading, different sampling methods, quantization, and more.
### First and last frame to video
The following example shows how to use Framepack with start and end image controls, using the model trained with the inverted anti-drifting sampling strategy.
```python
import torch
from diffusers import HunyuanVideoFramepackPipeline, HunyuanVideoFramepackTransformer3DModel
from diffusers.utils import export_to_video, load_image
from transformers import SiglipImageProcessor, SiglipVisionModel
transformer = HunyuanVideoFramepackTransformer3DModel.from_pretrained(
"lllyasviel/FramePackI2V_HY", torch_dtype=torch.bfloat16
)
feature_extractor = SiglipImageProcessor.from_pretrained(
"lllyasviel/flux_redux_bfl", subfolder="feature_extractor"
)
image_encoder = SiglipVisionModel.from_pretrained(
"lllyasviel/flux_redux_bfl", subfolder="image_encoder", torch_dtype=torch.float16
)
pipe = HunyuanVideoFramepackPipeline.from_pretrained(
"hunyuanvideo-community/HunyuanVideo",
transformer=transformer,
feature_extractor=feature_extractor,
image_encoder=image_encoder,
torch_dtype=torch.float16,
)
# Enable memory optimizations
pipe.enable_model_cpu_offload()
pipe.vae.enable_tiling()
prompt = "CG animation style, a small blue bird takes off from the ground, flapping its wings. The bird's feathers are delicate, with a unique pattern on its chest. The background shows a blue sky with white clouds under bright sunshine. The camera follows the bird upward, capturing its flight and the vastness of the sky from a close-up, low-angle perspective."
first_image = load_image(
"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/flf2v_input_first_frame.png"
)
last_image = load_image(
"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/flf2v_input_last_frame.png"
)
output = pipe(
image=first_image,
last_image=last_image,
prompt=prompt,
height=512,
width=512,
num_frames=91,
num_inference_steps=30,
guidance_scale=9.0,
generator=torch.Generator().manual_seed(0),
sampling_type="inverted_anti_drifting",
).frames[0]
export_to_video(output, "output.mp4", fps=30)
```
### Vanilla sampling
The following example shows how to use Framepack with the F1 model, which was trained with vanilla sampling but with a new regulation approach for anti-drifting.
```python
import torch
from diffusers import HunyuanVideoFramepackPipeline, HunyuanVideoFramepackTransformer3DModel
from diffusers.utils import export_to_video, load_image
from transformers import SiglipImageProcessor, SiglipVisionModel
transformer = HunyuanVideoFramepackTransformer3DModel.from_pretrained(
"lllyasviel/FramePack_F1_I2V_HY_20250503", torch_dtype=torch.bfloat16
)
feature_extractor = SiglipImageProcessor.from_pretrained(
"lllyasviel/flux_redux_bfl", subfolder="feature_extractor"
)
image_encoder = SiglipVisionModel.from_pretrained(
"lllyasviel/flux_redux_bfl", subfolder="image_encoder", torch_dtype=torch.float16
)
pipe = HunyuanVideoFramepackPipeline.from_pretrained(
"hunyuanvideo-community/HunyuanVideo",
transformer=transformer,
feature_extractor=feature_extractor,
image_encoder=image_encoder,
torch_dtype=torch.float16,
)
# Enable memory optimizations
pipe.enable_model_cpu_offload()
pipe.vae.enable_tiling()
image = load_image(
"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/penguin.png"
)
output = pipe(
image=image,
prompt="A penguin dancing in the snow",
height=832,
width=480,
num_frames=91,
num_inference_steps=30,
guidance_scale=9.0,
generator=torch.Generator().manual_seed(0),
sampling_type="vanilla",
).frames[0]
export_to_video(output, "output.mp4", fps=30)
```
### Group offloading
Group offloading ([`~hooks.apply_group_offloading`]) provides aggressive memory optimizations by offloading internal parts of any model to the CPU, potentially with no additional overhead to generation time. If you have very little VRAM, this approach may be suitable depending on how much CPU RAM is available.
```python
import torch
from diffusers import HunyuanVideoFramepackPipeline, HunyuanVideoFramepackTransformer3DModel
from diffusers.hooks import apply_group_offloading
from diffusers.utils import export_to_video, load_image
from transformers import SiglipImageProcessor, SiglipVisionModel
transformer = HunyuanVideoFramepackTransformer3DModel.from_pretrained(
"lllyasviel/FramePack_F1_I2V_HY_20250503", torch_dtype=torch.bfloat16
)
feature_extractor = SiglipImageProcessor.from_pretrained(
"lllyasviel/flux_redux_bfl", subfolder="feature_extractor"
)
image_encoder = SiglipVisionModel.from_pretrained(
"lllyasviel/flux_redux_bfl", subfolder="image_encoder", torch_dtype=torch.float16
)
pipe = HunyuanVideoFramepackPipeline.from_pretrained(
"hunyuanvideo-community/HunyuanVideo",
transformer=transformer,
feature_extractor=feature_extractor,
image_encoder=image_encoder,
torch_dtype=torch.float16,
)
# Enable group offloading
onload_device = torch.device("cuda")
offload_device = torch.device("cpu")
list(map(
lambda x: apply_group_offloading(x, onload_device, offload_device, offload_type="leaf_level", use_stream=True, low_cpu_mem_usage=True),
[pipe.text_encoder, pipe.text_encoder_2, pipe.transformer]
))
pipe.image_encoder.to(onload_device)
pipe.vae.to(onload_device)
pipe.vae.enable_tiling()
image = load_image(
"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/penguin.png"
)
output = pipe(
image=image,
prompt="A penguin dancing in the snow",
height=832,
width=480,
num_frames=91,
num_inference_steps=30,
guidance_scale=9.0,
generator=torch.Generator().manual_seed(0),
sampling_type="vanilla",
).frames[0]
print(f"Max memory: {torch.cuda.max_memory_allocated() / 1024**3:.3f} GB")
export_to_video(output, "output.mp4", fps=30)
```
## HunyuanVideoFramepackPipeline
[[autodoc]] HunyuanVideoFramepackPipeline
- all
- __call__
## HunyuanVideoPipelineOutput
[[autodoc]] pipelines.hunyuan_video.pipeline_output.HunyuanVideoPipelineOutput

View File

@@ -52,6 +52,7 @@ The following models are available for the image-to-video pipeline:
| [`Skywork/SkyReels-V1-Hunyuan-I2V`](https://huggingface.co/Skywork/SkyReels-V1-Hunyuan-I2V) | Skywork's custom finetune of HunyuanVideo (de-distilled). Performs best at `97x544x960` resolution with `guidance_scale=1.0`, `true_cfg_scale=6.0` and a negative prompt. |
| [`hunyuanvideo-community/HunyuanVideo-I2V-33ch`](https://huggingface.co/hunyuanvideo-community/HunyuanVideo-I2V) | Tencent's official HunyuanVideo 33-channel I2V model. Performs best at resolutions of 480, 720, 960, 1280. A higher `shift` value when initializing the scheduler is recommended (good values are between 7 and 20). |
| [`hunyuanvideo-community/HunyuanVideo-I2V`](https://huggingface.co/hunyuanvideo-community/HunyuanVideo-I2V) | Tencent's official HunyuanVideo 16-channel I2V model. Performs best at resolutions of 480, 720, 960, 1280. A higher `shift` value when initializing the scheduler is recommended (good values are between 7 and 20). |
| [`lllyasviel/FramePackI2V_HY`](https://huggingface.co/lllyasviel/FramePackI2V_HY) | lllyasviel's paper introducing a new technique for long-context video generation called [Framepack](https://arxiv.org/abs/2504.12626). |
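The table above recommends a higher flow `shift` when initializing the scheduler. A minimal sketch of overriding it, assuming the diffusers-format checkpoint is used with `HunyuanVideoImageToVideoPipeline` and a `FlowMatchEulerDiscreteScheduler` (neither is shown in this excerpt):
```python
import torch
from diffusers import FlowMatchEulerDiscreteScheduler, HunyuanVideoImageToVideoPipeline

# Assumption: the I2V checkpoints listed above pair with HunyuanVideoImageToVideoPipeline and a
# flow-match scheduler exposing a `shift` config value.
pipe = HunyuanVideoImageToVideoPipeline.from_pretrained(
    "hunyuanvideo-community/HunyuanVideo-I2V", torch_dtype=torch.float16
)
# Re-create the scheduler with a higher flow shift; the table suggests values between 7 and 20.
pipe.scheduler = FlowMatchEulerDiscreteScheduler.from_config(pipe.scheduler.config, shift=7.0)
```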
## Quantization

View File

@@ -31,7 +31,6 @@ from .single_file_utils import (
convert_autoencoder_dc_checkpoint_to_diffusers,
convert_controlnet_checkpoint,
convert_flux_transformer_checkpoint_to_diffusers,
convert_hidream_transformer_to_diffusers,
convert_hunyuan_video_transformer_to_diffusers,
convert_ldm_unet_checkpoint,
convert_ldm_vae_checkpoint,
@@ -134,10 +133,6 @@ SINGLE_FILE_LOADABLE_CLASSES = {
"checkpoint_mapping_fn": convert_wan_vae_to_diffusers,
"default_subfolder": "vae",
},
"HiDreamImageTransformer2DModel": {
"checkpoint_mapping_fn": convert_hidream_transformer_to_diffusers,
"default_subfolder": "transformer",
},
}

View File

@@ -126,7 +126,6 @@ CHECKPOINT_KEY_NAMES = {
],
"wan": ["model.diffusion_model.head.modulation", "head.modulation"],
"wan_vae": "decoder.middle.0.residual.0.gamma",
"hidream": "double_stream_blocks.0.block.adaLN_modulation.1.bias",
}
DIFFUSERS_DEFAULT_PIPELINE_PATHS = {
@@ -191,7 +190,6 @@ DIFFUSERS_DEFAULT_PIPELINE_PATHS = {
"wan-t2v-1.3B": {"pretrained_model_name_or_path": "Wan-AI/Wan2.1-T2V-1.3B-Diffusers"},
"wan-t2v-14B": {"pretrained_model_name_or_path": "Wan-AI/Wan2.1-T2V-14B-Diffusers"},
"wan-i2v-14B": {"pretrained_model_name_or_path": "Wan-AI/Wan2.1-I2V-14B-480P-Diffusers"},
"hidream": {"pretrained_model_name_or_path": "HiDream-ai/HiDream-I1-Dev"},
}
# Use to configure model sample size when original config is provided
@@ -703,8 +701,6 @@ def infer_diffusers_model_type(checkpoint):
elif CHECKPOINT_KEY_NAMES["wan_vae"] in checkpoint:
# All Wan models use the same VAE so we can use the same default model repo to fetch the config
model_type = "wan-t2v-14B"
elif CHECKPOINT_KEY_NAMES["hidream"] in checkpoint:
model_type = "hidream"
else:
model_type = "v1"
@@ -3297,12 +3293,3 @@ def convert_wan_vae_to_diffusers(checkpoint, **kwargs):
converted_state_dict[key] = value
return converted_state_dict
def convert_hidream_transformer_to_diffusers(checkpoint, **kwargs):
keys = list(checkpoint.keys())
for k in keys:
if "model.diffusion_model." in k:
checkpoint[k.replace("model.diffusion_model.", "")] = checkpoint.pop(k)
return checkpoint

View File

@@ -5,7 +5,7 @@ import torch.nn as nn
import torch.nn.functional as F
from ...configuration_utils import ConfigMixin, register_to_config
from ...loaders import FromOriginalModelMixin, PeftAdapterMixin
from ...loaders import PeftAdapterMixin
from ...models.modeling_outputs import Transformer2DModelOutput
from ...models.modeling_utils import ModelMixin
from ...utils import USE_PEFT_BACKEND, deprecate, logging, scale_lora_layers, unscale_lora_layers
@@ -602,7 +602,7 @@ class HiDreamBlock(nn.Module):
)
class HiDreamImageTransformer2DModel(ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginalModelMixin):
class HiDreamImageTransformer2DModel(ModelMixin, ConfigMixin, PeftAdapterMixin):
_supports_gradient_checkpointing = True
_no_split_modules = ["HiDreamImageTransformerBlock", "HiDreamImageSingleTransformerBlock"]

View File

@@ -152,19 +152,9 @@ class HunyuanVideoFramepackTransformer3DModel(
# 1. Latent and condition embedders
self.x_embedder = HunyuanVideoPatchEmbed((patch_size_t, patch_size, patch_size), in_channels, inner_dim)
# Framepack history projection embedder
self.clean_x_embedder = None
if has_clean_x_embedder:
self.clean_x_embedder = HunyuanVideoHistoryPatchEmbed(in_channels, inner_dim)
self.context_embedder = HunyuanVideoTokenRefiner(
text_embed_dim, num_attention_heads, attention_head_dim, num_layers=num_refiner_layers
)
# Framepack image-conditioning embedder
self.image_projection = FramepackClipVisionProjection(image_proj_dim, inner_dim) if has_image_proj else None
self.time_text_embed = HunyuanVideoConditionEmbedding(
inner_dim, pooled_projection_dim, guidance_embeds, image_condition_type
)
@@ -196,7 +186,14 @@ class HunyuanVideoFramepackTransformer3DModel(
self.norm_out = AdaLayerNormContinuous(inner_dim, inner_dim, elementwise_affine=False, eps=1e-6)
self.proj_out = nn.Linear(inner_dim, patch_size_t * patch_size * patch_size * out_channels)
self.gradient_checkpointing = False
# Framepack specific modules
self.image_projection = FramepackClipVisionProjection(image_proj_dim, inner_dim) if has_image_proj else None
self.clean_x_embedder = None
if has_clean_x_embedder:
self.clean_x_embedder = HunyuanVideoHistoryPatchEmbed(in_channels, inner_dim)
self.use_gradient_checkpointing = False
def forward(
self,

View File

@@ -36,11 +36,11 @@ EXAMPLE_DOC_STRING = """
Examples:
```py
>>> import torch
>>> from transformers import AutoTokenizer, LlamaForCausalLM
>>> from diffusers import HiDreamImagePipeline
>>> from transformers import PreTrainedTokenizerFast, LlamaForCausalLM
>>> from diffusers import UniPCMultistepScheduler, HiDreamImagePipeline
>>> tokenizer_4 = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3.1-8B-Instruct")
>>> tokenizer_4 = PreTrainedTokenizerFast.from_pretrained("meta-llama/Meta-Llama-3.1-8B-Instruct")
>>> text_encoder_4 = LlamaForCausalLM.from_pretrained(
... "meta-llama/Meta-Llama-3.1-8B-Instruct",
... output_hidden_states=True,

View File

@@ -14,7 +14,6 @@
import inspect
import math
from enum import Enum
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
import numpy as np
@@ -92,7 +91,6 @@ EXAMPLE_DOC_STRING = """
... num_inference_steps=30,
... guidance_scale=9.0,
... generator=torch.Generator().manual_seed(0),
... sampling_type="inverted_anti_drifting",
... ).frames[0]
>>> export_to_video(output, "output.mp4", fps=30)
```
@@ -140,7 +138,6 @@ EXAMPLE_DOC_STRING = """
... num_inference_steps=30,
... guidance_scale=9.0,
... generator=torch.Generator().manual_seed(0),
... sampling_type="inverted_anti_drifting",
... ).frames[0]
>>> export_to_video(output, "output.mp4", fps=30)
```
@@ -235,11 +232,6 @@ def retrieve_timesteps(
return timesteps, num_inference_steps
class FramepackSamplingType(str, Enum):
VANILLA = "vanilla"
INVERTED_ANTI_DRIFTING = "inverted_anti_drifting"
class HunyuanVideoFramepackPipeline(DiffusionPipeline, HunyuanVideoLoraLoaderMixin):
r"""
Pipeline for text-to-video generation using HunyuanVideo.
@@ -463,11 +455,6 @@ class HunyuanVideoFramepackPipeline(DiffusionPipeline, HunyuanVideoLoraLoaderMix
prompt_embeds=None,
callback_on_step_end_tensor_inputs=None,
prompt_template=None,
image=None,
image_latents=None,
last_image=None,
last_image_latents=None,
sampling_type=None,
):
if height % 16 != 0 or width % 16 != 0:
raise ValueError(f"`height` and `width` have to be divisible by 16 but are {height} and {width}.")
@@ -506,21 +493,6 @@ class HunyuanVideoFramepackPipeline(DiffusionPipeline, HunyuanVideoLoraLoaderMix
f"`prompt_template` has to contain a key `template` but only found {prompt_template.keys()}"
)
sampling_types = [x.value for x in FramepackSamplingType.__members__.values()]
if sampling_type not in sampling_types:
raise ValueError(f"`sampling_type` has to be one of '{sampling_types}' but is '{sampling_type}'")
if image is not None and image_latents is not None:
raise ValueError("Only one of `image` or `image_latents` can be passed.")
if last_image is not None and last_image_latents is not None:
raise ValueError("Only one of `last_image` or `last_image_latents` can be passed.")
if sampling_type != FramepackSamplingType.INVERTED_ANTI_DRIFTING and (
last_image is not None or last_image_latents is not None
):
raise ValueError(
'Only `"inverted_anti_drifting"` inference type supports `last_image` or `last_image_latents`.'
)
def prepare_latents(
self,
batch_size: int = 1,
@@ -651,7 +623,6 @@ class HunyuanVideoFramepackPipeline(DiffusionPipeline, HunyuanVideoLoraLoaderMix
callback_on_step_end_tensor_inputs: List[str] = ["latents"],
prompt_template: Dict[str, Any] = DEFAULT_PROMPT_TEMPLATE,
max_sequence_length: int = 256,
sampling_type: FramepackSamplingType = FramepackSamplingType.INVERTED_ANTI_DRIFTING,
):
r"""
The call function to the pipeline for generation.
@@ -764,11 +735,6 @@ class HunyuanVideoFramepackPipeline(DiffusionPipeline, HunyuanVideoLoraLoaderMix
prompt_embeds,
callback_on_step_end_tensor_inputs,
prompt_template,
image,
image_latents,
last_image,
last_image_latents,
sampling_type,
)
has_neg_prompt = negative_prompt is not None or (
@@ -840,6 +806,18 @@ class HunyuanVideoFramepackPipeline(DiffusionPipeline, HunyuanVideoLoraLoaderMix
num_channels_latents = self.transformer.config.in_channels
window_num_frames = (latent_window_size - 1) * self.vae_scale_factor_temporal + 1
num_latent_sections = max(1, (num_frames + window_num_frames - 1) // window_num_frames)
# Specific to the released checkpoint: https://huggingface.co/lllyasviel/FramePackI2V_HY
# TODO: find a more generic way in future if there are more checkpoints
history_sizes = [1, 2, 16]
history_latents = torch.zeros(
batch_size,
num_channels_latents,
sum(history_sizes),
height // self.vae_scale_factor_spatial,
width // self.vae_scale_factor_spatial,
device=device,
dtype=torch.float32,
)
history_video = None
total_generated_latent_frames = 0
@@ -851,92 +829,38 @@ class HunyuanVideoFramepackPipeline(DiffusionPipeline, HunyuanVideoLoraLoaderMix
last_image, dtype=torch.float32, device=device, generator=generator
)
# Specific to the released checkpoints:
# - https://huggingface.co/lllyasviel/FramePackI2V_HY
# - https://huggingface.co/lllyasviel/FramePack_F1_I2V_HY_20250503
# TODO: find a more generic way in future if there are more checkpoints
if sampling_type == FramepackSamplingType.INVERTED_ANTI_DRIFTING:
history_sizes = [1, 2, 16]
history_latents = torch.zeros(
batch_size,
num_channels_latents,
sum(history_sizes),
height // self.vae_scale_factor_spatial,
width // self.vae_scale_factor_spatial,
device=device,
dtype=torch.float32,
)
elif sampling_type == FramepackSamplingType.VANILLA:
history_sizes = [16, 2, 1]
history_latents = torch.zeros(
batch_size,
num_channels_latents,
sum(history_sizes),
height // self.vae_scale_factor_spatial,
width // self.vae_scale_factor_spatial,
device=device,
dtype=torch.float32,
)
history_latents = torch.cat([history_latents, image_latents], dim=2)
total_generated_latent_frames += 1
else:
assert False
latent_paddings = list(reversed(range(num_latent_sections)))
if num_latent_sections > 4:
latent_paddings = [3] + [2] * (num_latent_sections - 3) + [1, 0]
# 6. Prepare guidance condition
guidance = torch.tensor([guidance_scale] * batch_size, dtype=transformer_dtype, device=device) * 1000.0
# 7. Denoising loop
for k in range(num_latent_sections):
if sampling_type == FramepackSamplingType.INVERTED_ANTI_DRIFTING:
latent_paddings = list(reversed(range(num_latent_sections)))
if num_latent_sections > 4:
latent_paddings = [3] + [2] * (num_latent_sections - 3) + [1, 0]
is_first_section = k == 0
is_last_section = k == num_latent_sections - 1
latent_padding_size = latent_paddings[k] * latent_window_size
is_first_section = k == 0
is_last_section = k == num_latent_sections - 1
latent_padding_size = latent_paddings[k] * latent_window_size
indices = torch.arange(0, sum([1, latent_padding_size, latent_window_size, *history_sizes]))
(
indices_prefix,
indices_padding,
indices_latents,
indices_postfix,
indices_latents_history_2x,
indices_latents_history_4x,
) = indices.split([1, latent_padding_size, latent_window_size, *history_sizes], dim=0)
# Inverted anti-drifting sampling: Figure 2(c) in the paper
indices_clean_latents = torch.cat([indices_prefix, indices_postfix], dim=0)
indices = torch.arange(0, sum([1, latent_padding_size, latent_window_size, *history_sizes]))
(
indices_prefix,
indices_padding,
indices_latents,
indices_latents_history_1x,
indices_latents_history_2x,
indices_latents_history_4x,
) = indices.split([1, latent_padding_size, latent_window_size, *history_sizes], dim=0)
# Inverted anti-drifting sampling: Figure 2(c) in the paper
indices_clean_latents = torch.cat([indices_prefix, indices_latents_history_1x], dim=0)
latents_prefix = image_latents
latents_history_1x, latents_history_2x, latents_history_4x = history_latents[
:, :, : sum(history_sizes)
].split(history_sizes, dim=2)
if last_image is not None and is_first_section:
latents_history_1x = last_image_latents
latents_clean = torch.cat([latents_prefix, latents_history_1x], dim=2)
elif sampling_type == FramepackSamplingType.VANILLA:
indices = torch.arange(0, sum([1, *history_sizes, latent_window_size]))
(
indices_prefix,
indices_latents_history_4x,
indices_latents_history_2x,
indices_latents_history_1x,
indices_latents,
) = indices.split([1, *history_sizes, latent_window_size], dim=0)
indices_clean_latents = torch.cat([indices_prefix, indices_latents_history_1x], dim=0)
latents_prefix = image_latents
latents_history_4x, latents_history_2x, latents_history_1x = history_latents[
:, :, -sum(history_sizes) :
].split(history_sizes, dim=2)
latents_clean = torch.cat([latents_prefix, latents_history_1x], dim=2)
else:
assert False
latents_prefix = image_latents
latents_postfix, latents_history_2x, latents_history_4x = history_latents[
:, :, : sum(history_sizes)
].split(history_sizes, dim=2)
if last_image is not None and is_first_section:
latents_postfix = last_image_latents
latents_clean = torch.cat([latents_prefix, latents_postfix], dim=2)
latents = self.prepare_latents(
batch_size,
@@ -1036,26 +960,13 @@ class HunyuanVideoFramepackPipeline(DiffusionPipeline, HunyuanVideoLoraLoaderMix
if XLA_AVAILABLE:
xm.mark_step()
if sampling_type == FramepackSamplingType.INVERTED_ANTI_DRIFTING:
if is_last_section:
latents = torch.cat([image_latents, latents], dim=2)
total_generated_latent_frames += latents.shape[2]
history_latents = torch.cat([latents, history_latents], dim=2)
real_history_latents = history_latents[:, :, :total_generated_latent_frames]
section_latent_frames = (
(latent_window_size * 2 + 1) if is_last_section else (latent_window_size * 2)
)
index_slice = (slice(None), slice(None), slice(0, section_latent_frames))
if is_last_section:
latents = torch.cat([image_latents, latents], dim=2)
elif sampling_type == FramepackSamplingType.VANILLA:
total_generated_latent_frames += latents.shape[2]
history_latents = torch.cat([history_latents, latents], dim=2)
real_history_latents = history_latents[:, :, -total_generated_latent_frames:]
section_latent_frames = latent_window_size * 2
index_slice = (slice(None), slice(None), slice(-section_latent_frames, None))
total_generated_latent_frames += latents.shape[2]
history_latents = torch.cat([latents, history_latents], dim=2)
else:
assert False
real_history_latents = history_latents[:, :, :total_generated_latent_frames]
if history_video is None:
if not output_type == "latent":
@@ -1065,18 +976,16 @@ class HunyuanVideoFramepackPipeline(DiffusionPipeline, HunyuanVideoLoraLoaderMix
history_video = [real_history_latents]
else:
if not output_type == "latent":
section_latent_frames = (
(latent_window_size * 2 + 1) if is_last_section else (latent_window_size * 2)
)
overlapped_frames = (latent_window_size - 1) * self.vae_scale_factor_temporal + 1
current_latents = (
real_history_latents[index_slice].to(vae_dtype) / self.vae.config.scaling_factor
real_history_latents[:, :, :section_latent_frames].to(vae_dtype)
/ self.vae.config.scaling_factor
)
current_video = self.vae.decode(current_latents, return_dict=False)[0]
if sampling_type == FramepackSamplingType.INVERTED_ANTI_DRIFTING:
history_video = self._soft_append(current_video, history_video, overlapped_frames)
elif sampling_type == FramepackSamplingType.VANILLA:
history_video = self._soft_append(history_video, current_video, overlapped_frames)
else:
assert False
history_video = self._soft_append(current_video, history_video, overlapped_frames)
else:
history_video.append(real_history_latents)

View File

@@ -789,7 +789,6 @@ class LTXPipeline(DiffusionPipeline, FromSingleFileMixin, LTXVideoLoraLoaderMixi
]
latents = (1 - decode_noise_scale) * latents + decode_noise_scale * noise
latents = latents.to(self.vae.dtype)
video = self.vae.decode(latents, timestep, return_dict=False)[0]
video = self.video_processor.postprocess_video(video, output_type=output_type)

View File

@@ -1,116 +0,0 @@
# Copyright 2024 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import torch
from diffusers import HunyuanVideoFramepackTransformer3DModel
from diffusers.utils.testing_utils import (
enable_full_determinism,
torch_device,
)
from ..test_modeling_common import ModelTesterMixin
enable_full_determinism()
class HunyuanVideoTransformer3DTests(ModelTesterMixin, unittest.TestCase):
model_class = HunyuanVideoFramepackTransformer3DModel
main_input_name = "hidden_states"
uses_custom_attn_processor = True
model_split_percents = [0.5, 0.7, 0.9]
@property
def dummy_input(self):
batch_size = 1
num_channels = 4
num_frames = 3
height = 4
width = 4
text_encoder_embedding_dim = 16
image_encoder_embedding_dim = 16
pooled_projection_dim = 8
sequence_length = 12
hidden_states = torch.randn((batch_size, num_channels, num_frames, height, width)).to(torch_device)
encoder_hidden_states = torch.randn((batch_size, sequence_length, text_encoder_embedding_dim)).to(torch_device)
pooled_projections = torch.randn((batch_size, pooled_projection_dim)).to(torch_device)
encoder_attention_mask = torch.ones((batch_size, sequence_length)).to(torch_device)
image_embeds = torch.randn((batch_size, sequence_length, image_encoder_embedding_dim)).to(torch_device)
indices_latents = torch.ones((3,)).to(torch_device)
latents_clean = torch.randn((batch_size, num_channels, num_frames - 1, height, width)).to(torch_device)
indices_latents_clean = torch.ones((num_frames - 1,)).to(torch_device)
latents_history_2x = torch.randn((batch_size, num_channels, num_frames - 1, height, width)).to(torch_device)
indices_latents_history_2x = torch.ones((num_frames - 1,)).to(torch_device)
latents_history_4x = torch.randn((batch_size, num_channels, (num_frames - 1) * 4, height, width)).to(
torch_device
)
indices_latents_history_4x = torch.ones(((num_frames - 1) * 4,)).to(torch_device)
timestep = torch.randint(0, 1000, size=(batch_size,)).to(torch_device)
guidance = torch.randint(0, 1000, size=(batch_size,)).to(torch_device)
return {
"hidden_states": hidden_states,
"timestep": timestep,
"encoder_hidden_states": encoder_hidden_states,
"pooled_projections": pooled_projections,
"encoder_attention_mask": encoder_attention_mask,
"guidance": guidance,
"image_embeds": image_embeds,
"indices_latents": indices_latents,
"latents_clean": latents_clean,
"indices_latents_clean": indices_latents_clean,
"latents_history_2x": latents_history_2x,
"indices_latents_history_2x": indices_latents_history_2x,
"latents_history_4x": latents_history_4x,
"indices_latents_history_4x": indices_latents_history_4x,
}
@property
def input_shape(self):
return (4, 3, 4, 4)
@property
def output_shape(self):
return (4, 3, 4, 4)
def prepare_init_args_and_inputs_for_common(self):
init_dict = {
"in_channels": 4,
"out_channels": 4,
"num_attention_heads": 2,
"attention_head_dim": 10,
"num_layers": 1,
"num_single_layers": 1,
"num_refiner_layers": 1,
"patch_size": 2,
"patch_size_t": 1,
"guidance_embeds": True,
"text_embed_dim": 16,
"pooled_projection_dim": 8,
"rope_axes_dim": (2, 4, 4),
"image_condition_type": None,
"has_image_proj": True,
"image_proj_dim": 16,
"has_clean_x_embedder": True,
}
inputs_dict = self.dummy_input
return init_dict, inputs_dict
def test_gradient_checkpointing_is_applied(self):
expected_set = {"HunyuanVideoFramepackTransformer3DModel"}
super().test_gradient_checkpointing_is_applied(expected_set=expected_set)

View File

@@ -20,13 +20,13 @@ import torch
from diffusers import LTXVideoTransformer3DModel
from diffusers.utils.testing_utils import enable_full_determinism, torch_device
from ..test_modeling_common import ModelTesterMixin, TorchCompileTesterMixin
from ..test_modeling_common import ModelTesterMixin
enable_full_determinism()
class LTXTransformerTests(ModelTesterMixin, TorchCompileTesterMixin, unittest.TestCase):
class LTXTransformerTests(ModelTesterMixin, unittest.TestCase):
model_class = LTXVideoTransformer3DModel
main_input_name = "hidden_states"
uses_custom_attn_processor = True

View File

@@ -24,10 +24,9 @@ from transformers import AutoTokenizer, T5EncoderModel
from diffusers import AutoencoderKLCogVideoX, ConsisIDPipeline, ConsisIDTransformer3DModel, DDIMScheduler
from diffusers.utils import load_image
from diffusers.utils.testing_utils import (
backend_empty_cache,
enable_full_determinism,
numpy_cosine_similarity_distance,
require_torch_accelerator,
require_torch_gpu,
slow,
torch_device,
)
@@ -317,19 +316,19 @@ class ConsisIDPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
@slow
@require_torch_accelerator
@require_torch_gpu
class ConsisIDPipelineIntegrationTests(unittest.TestCase):
prompt = "A painting of a squirrel eating a burger."
def setUp(self):
super().setUp()
gc.collect()
backend_empty_cache(torch_device)
torch.cuda.empty_cache()
def tearDown(self):
super().tearDown()
gc.collect()
backend_empty_cache(torch_device)
torch.cuda.empty_cache()
def test_consisid(self):
generator = torch.Generator("cpu").manual_seed(0)
@@ -339,8 +338,8 @@ class ConsisIDPipelineIntegrationTests(unittest.TestCase):
prompt = self.prompt
image = load_image("https://github.com/PKU-YuanGroup/ConsisID/blob/main/asserts/example_images/2.png?raw=true")
id_vit_hidden = [torch.ones([1, 577, 1024])] * 5
id_cond = torch.ones(1, 1280)
id_vit_hidden = [torch.ones([1, 2, 2])] * 1
id_cond = torch.ones(1, 2)
videos = pipe(
image=image,
@@ -358,5 +357,5 @@ class ConsisIDPipelineIntegrationTests(unittest.TestCase):
video = videos[0]
expected_video = torch.randn(1, 16, 480, 720, 3).numpy()
max_diff = numpy_cosine_similarity_distance(video.cpu(), expected_video)
max_diff = numpy_cosine_similarity_distance(video, expected_video)
assert max_diff < 1e-3, f"Max diff is too high. got {video}"

View File

@@ -20,14 +20,7 @@ import numpy as np
import torch
from diffusers import DanceDiffusionPipeline, IPNDMScheduler, UNet1DModel
from diffusers.utils.testing_utils import (
backend_empty_cache,
enable_full_determinism,
nightly,
require_torch_accelerator,
skip_mps,
torch_device,
)
from diffusers.utils.testing_utils import enable_full_determinism, nightly, require_torch_gpu, skip_mps, torch_device
from ..pipeline_params import UNCONDITIONAL_AUDIO_GENERATION_BATCH_PARAMS, UNCONDITIONAL_AUDIO_GENERATION_PARAMS
from ..test_pipelines_common import PipelineTesterMixin
@@ -123,19 +116,19 @@ class DanceDiffusionPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
@nightly
@require_torch_accelerator
@require_torch_gpu
class PipelineIntegrationTests(unittest.TestCase):
def setUp(self):
# clean up the VRAM before each test
super().setUp()
gc.collect()
backend_empty_cache(torch_device)
torch.cuda.empty_cache()
def tearDown(self):
# clean up the VRAM after each test
super().tearDown()
gc.collect()
backend_empty_cache(torch_device)
torch.cuda.empty_cache()
def test_dance_diffusion(self):
device = torch_device

View File

@@ -21,15 +21,7 @@ import torch
from diffusers import AutoencoderKL, DDIMScheduler, DiTPipeline, DiTTransformer2DModel, DPMSolverMultistepScheduler
from diffusers.utils import is_xformers_available
from diffusers.utils.testing_utils import (
backend_empty_cache,
enable_full_determinism,
load_numpy,
nightly,
numpy_cosine_similarity_distance,
require_torch_accelerator,
torch_device,
)
from diffusers.utils.testing_utils import enable_full_determinism, load_numpy, nightly, require_torch_gpu, torch_device
from ..pipeline_params import (
CLASS_CONDITIONED_IMAGE_GENERATION_BATCH_PARAMS,
@@ -115,23 +107,23 @@ class DiTPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
@nightly
@require_torch_accelerator
@require_torch_gpu
class DiTPipelineIntegrationTests(unittest.TestCase):
def setUp(self):
super().setUp()
gc.collect()
backend_empty_cache(torch_device)
torch.cuda.empty_cache()
def tearDown(self):
super().tearDown()
gc.collect()
backend_empty_cache(torch_device)
torch.cuda.empty_cache()
def test_dit_256(self):
generator = torch.manual_seed(0)
pipe = DiTPipeline.from_pretrained("facebook/DiT-XL-2-256")
pipe.to(torch_device)
pipe.to("cuda")
words = ["vase", "umbrella", "white shark", "white wolf"]
ids = pipe.get_label_ids(words)
@@ -147,7 +139,7 @@ class DiTPipelineIntegrationTests(unittest.TestCase):
def test_dit_512(self):
pipe = DiTPipeline.from_pretrained("facebook/DiT-XL-2-512")
pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
pipe.to(torch_device)
pipe.to("cuda")
words = ["vase", "umbrella"]
ids = pipe.get_label_ids(words)
@@ -160,7 +152,4 @@ class DiTPipelineIntegrationTests(unittest.TestCase):
f"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/dit/{word}_512.npy"
)
expected_slice = expected_image.flatten()
output_slice = image.flatten()
assert numpy_cosine_similarity_distance(expected_slice, output_slice) < 1e-2
assert np.abs((expected_image - image).max()) < 1e-1

View File

@@ -27,10 +27,9 @@ from diffusers import (
FlowMatchEulerDiscreteScheduler,
)
from diffusers.utils.testing_utils import (
backend_empty_cache,
enable_full_determinism,
numpy_cosine_similarity_distance,
require_torch_accelerator,
require_torch_gpu,
slow,
torch_device,
)
@@ -257,19 +256,19 @@ class EasyAnimatePipelineFastTests(PipelineTesterMixin, unittest.TestCase):
@slow
@require_torch_accelerator
@require_torch_gpu
class EasyAnimatePipelineIntegrationTests(unittest.TestCase):
prompt = "A painting of a squirrel eating a burger."
def setUp(self):
super().setUp()
gc.collect()
backend_empty_cache(torch_device)
torch.cuda.empty_cache()
def tearDown(self):
super().tearDown()
gc.collect()
backend_empty_cache(torch_device)
torch.cuda.empty_cache()
def test_EasyAnimate(self):
generator = torch.Generator("cpu").manual_seed(0)

View File

@@ -28,15 +28,13 @@ from diffusers import (
VQModel,
)
from diffusers.utils.testing_utils import (
backend_empty_cache,
enable_full_determinism,
floats_tensor,
load_image,
load_numpy,
nightly,
numpy_cosine_similarity_distance,
require_torch_accelerator,
torch_device,
require_torch_gpu,
)
from ..test_pipelines_common import PipelineTesterMixin
@@ -228,19 +226,19 @@ class KandinskyV22ControlnetPipelineFastTests(PipelineTesterMixin, unittest.Test
@nightly
@require_torch_accelerator
@require_torch_gpu
class KandinskyV22ControlnetPipelineIntegrationTests(unittest.TestCase):
def setUp(self):
# clean up the VRAM before each test
super().setUp()
gc.collect()
backend_empty_cache(torch_device)
torch.cuda.empty_cache()
def tearDown(self):
# clean up the VRAM after each test
super().tearDown()
gc.collect()
backend_empty_cache(torch_device)
torch.cuda.empty_cache()
def test_kandinsky_controlnet(self):
expected_image = load_numpy(

View File

@@ -29,15 +29,13 @@ from diffusers import (
VQModel,
)
from diffusers.utils.testing_utils import (
backend_empty_cache,
enable_full_determinism,
floats_tensor,
load_image,
load_numpy,
nightly,
numpy_cosine_similarity_distance,
require_torch_accelerator,
torch_device,
require_torch_gpu,
)
from ..test_pipelines_common import PipelineTesterMixin
@@ -235,19 +233,19 @@ class KandinskyV22ControlnetImg2ImgPipelineFastTests(PipelineTesterMixin, unitte
@nightly
@require_torch_accelerator
@require_torch_gpu
class KandinskyV22ControlnetImg2ImgPipelineIntegrationTests(unittest.TestCase):
def setUp(self):
# clean up the VRAM before each test
super().setUp()
gc.collect()
backend_empty_cache(torch_device)
torch.cuda.empty_cache()
def tearDown(self):
# clean up the VRAM after each test
super().tearDown()
gc.collect()
backend_empty_cache(torch_device)
torch.cuda.empty_cache()
def test_kandinsky_controlnet_img2img(self):
expected_image = load_numpy(
@@ -311,4 +309,4 @@ class KandinskyV22ControlnetImg2ImgPipelineIntegrationTests(unittest.TestCase):
assert image.shape == (512, 512, 3)
max_diff = numpy_cosine_similarity_distance(expected_image.flatten(), image.flatten())
assert max_diff < 5e-4
assert max_diff < 1e-4

View File

@@ -22,11 +22,10 @@ from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
from diffusers import AutoencoderKL, DDIMScheduler, LDMTextToImagePipeline, UNet2DConditionModel
from diffusers.utils.testing_utils import (
backend_empty_cache,
enable_full_determinism,
load_numpy,
nightly,
require_torch_accelerator,
require_torch_gpu,
torch_device,
)
@@ -137,17 +136,17 @@ class LDMTextToImagePipelineFastTests(PipelineTesterMixin, unittest.TestCase):
@nightly
@require_torch_accelerator
@require_torch_gpu
class LDMTextToImagePipelineSlowTests(unittest.TestCase):
def setUp(self):
super().setUp()
gc.collect()
backend_empty_cache(torch_device)
torch.cuda.empty_cache()
def tearDown(self):
super().tearDown()
gc.collect()
backend_empty_cache(torch_device)
torch.cuda.empty_cache()
def get_inputs(self, device, dtype=torch.float32, seed=0):
generator = torch.manual_seed(seed)
@@ -178,17 +177,17 @@ class LDMTextToImagePipelineSlowTests(unittest.TestCase):
@nightly
@require_torch_accelerator
@require_torch_gpu
class LDMTextToImagePipelineNightlyTests(unittest.TestCase):
def setUp(self):
super().setUp()
gc.collect()
backend_empty_cache(torch_device)
torch.cuda.empty_cache()
def tearDown(self):
super().tearDown()
gc.collect()
backend_empty_cache(torch_device)
torch.cuda.empty_cache()
def get_inputs(self, device, dtype=torch.float32, seed=0):
generator = torch.manual_seed(seed)

View File

@@ -27,8 +27,8 @@ from diffusers.utils.testing_utils import (
enable_full_determinism,
nightly,
numpy_cosine_similarity_distance,
require_big_accelerator,
require_torch_accelerator,
require_big_gpu_with_torch_cuda,
require_torch_gpu,
torch_device,
)
@@ -266,9 +266,9 @@ class MochiPipelineFastTests(PipelineTesterMixin, FasterCacheTesterMixin, unitte
@nightly
@require_torch_accelerator
@require_big_accelerator
@pytest.mark.big_accelerator
@require_torch_gpu
@require_big_gpu_with_torch_cuda
@pytest.mark.big_gpu_with_torch_cuda
class MochiPipelineIntegrationTests(unittest.TestCase):
prompt = "A painting of a squirrel eating a burger."
@@ -302,5 +302,5 @@ class MochiPipelineIntegrationTests(unittest.TestCase):
video = videos[0]
expected_video = torch.randn(1, 19, 480, 848, 3).numpy()
max_diff = numpy_cosine_similarity_distance(video.cpu(), expected_video)
max_diff = numpy_cosine_similarity_distance(video, expected_video)
assert max_diff < 1e-3, f"Max diff is too high. got {video}"

View File

@@ -39,13 +39,7 @@ from diffusers import (
UNet2DConditionModel,
)
from diffusers.utils import is_xformers_available
from diffusers.utils.testing_utils import (
backend_empty_cache,
enable_full_determinism,
nightly,
require_torch_accelerator,
torch_device,
)
from diffusers.utils.testing_utils import enable_full_determinism, nightly, require_torch_gpu, torch_device
from ..pipeline_params import TEXT_TO_AUDIO_BATCH_PARAMS, TEXT_TO_AUDIO_PARAMS
from ..test_pipelines_common import PipelineTesterMixin
@@ -414,17 +408,17 @@ class MusicLDMPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
@nightly
@require_torch_accelerator
@require_torch_gpu
class MusicLDMPipelineNightlyTests(unittest.TestCase):
def setUp(self):
super().setUp()
gc.collect()
backend_empty_cache(torch_device)
torch.cuda.empty_cache()
def tearDown(self):
super().tearDown()
gc.collect()
backend_empty_cache(torch_device)
torch.cuda.empty_cache()
def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0):
generator = torch.Generator(device=generator_device).manual_seed(seed)

View File

@@ -7,10 +7,8 @@ from transformers import AutoTokenizer
from diffusers import AutoencoderKL, FlowMatchEulerDiscreteScheduler, OmniGenPipeline, OmniGenTransformer2DModel
from diffusers.utils.testing_utils import (
Expectations,
backend_empty_cache,
numpy_cosine_similarity_distance,
require_torch_accelerator,
require_torch_gpu,
slow,
torch_device,
)
@@ -89,7 +87,7 @@ class OmniGenPipelineFastTests(unittest.TestCase, PipelineTesterMixin):
@slow
@require_torch_accelerator
@require_torch_gpu
class OmniGenPipelineSlowTests(unittest.TestCase):
pipeline_class = OmniGenPipeline
repo_id = "shitao/OmniGen-v1-diffusers"
@@ -97,12 +95,12 @@ class OmniGenPipelineSlowTests(unittest.TestCase):
def setUp(self):
super().setUp()
gc.collect()
backend_empty_cache(torch_device)
torch.cuda.empty_cache()
def tearDown(self):
super().tearDown()
gc.collect()
backend_empty_cache(torch_device)
torch.cuda.empty_cache()
def get_inputs(self, device, seed=0):
if str(device).startswith("mps"):
@@ -127,56 +125,21 @@ class OmniGenPipelineSlowTests(unittest.TestCase):
image = pipe(**inputs).images[0]
image_slice = image[0, :10, :10]
expected_slices = Expectations(
{
("xpu", 3): np.array(
[
[0.05859375, 0.05859375, 0.04492188],
[0.04882812, 0.04101562, 0.03320312],
[0.04882812, 0.04296875, 0.03125],
[0.04296875, 0.0390625, 0.03320312],
[0.04296875, 0.03710938, 0.03125],
[0.04492188, 0.0390625, 0.03320312],
[0.04296875, 0.03710938, 0.03125],
[0.04101562, 0.03710938, 0.02734375],
[0.04101562, 0.03515625, 0.02734375],
[0.04101562, 0.03515625, 0.02929688],
],
dtype=np.float32,
),
("cuda", 7): np.array(
[
[0.1783447, 0.16772744, 0.14339337],
[0.17066911, 0.15521264, 0.13757327],
[0.17072496, 0.15531206, 0.13524258],
[0.16746324, 0.1564025, 0.13794944],
[0.16490817, 0.15258026, 0.13697758],
[0.16971767, 0.15826806, 0.13928896],
[0.16782972, 0.15547255, 0.13783783],
[0.16464645, 0.15281534, 0.13522372],
[0.16535294, 0.15301755, 0.13526791],
[0.16365296, 0.15092957, 0.13443318],
],
dtype=np.float32,
),
("cuda", 8): np.array(
[
[0.0546875, 0.05664062, 0.04296875],
[0.046875, 0.04101562, 0.03320312],
[0.05078125, 0.04296875, 0.03125],
[0.04296875, 0.04101562, 0.03320312],
[0.0390625, 0.03710938, 0.02929688],
[0.04296875, 0.03710938, 0.03125],
[0.0390625, 0.03710938, 0.02929688],
[0.0390625, 0.03710938, 0.02734375],
[0.0390625, 0.03320312, 0.02734375],
[0.0390625, 0.03320312, 0.02734375],
],
dtype=np.float32,
),
}
expected_slice = np.array(
[
[0.1783447, 0.16772744, 0.14339337],
[0.17066911, 0.15521264, 0.13757327],
[0.17072496, 0.15531206, 0.13524258],
[0.16746324, 0.1564025, 0.13794944],
[0.16490817, 0.15258026, 0.13697758],
[0.16971767, 0.15826806, 0.13928896],
[0.16782972, 0.15547255, 0.13783783],
[0.16464645, 0.15281534, 0.13522372],
[0.16535294, 0.15301755, 0.13526791],
[0.16365296, 0.15092957, 0.13443318],
],
dtype=np.float32,
)
expected_slice = expected_slices.get_expectation()
max_diff = numpy_cosine_similarity_distance(expected_slice.flatten(), image_slice.flatten())

View File

@@ -25,12 +25,11 @@ from transformers import CLIPImageProcessor, CLIPVisionConfig
from diffusers import AutoencoderKL, PaintByExamplePipeline, PNDMScheduler, UNet2DConditionModel
from diffusers.pipelines.paint_by_example import PaintByExampleImageEncoder
from diffusers.utils.testing_utils import (
backend_empty_cache,
enable_full_determinism,
floats_tensor,
load_image,
nightly,
require_torch_accelerator,
require_torch_gpu,
torch_device,
)
@@ -175,19 +174,19 @@ class PaintByExamplePipelineFastTests(PipelineTesterMixin, unittest.TestCase):
@nightly
@require_torch_accelerator
@require_torch_gpu
class PaintByExamplePipelineIntegrationTests(unittest.TestCase):
def setUp(self):
# clean up the VRAM before each test
super().setUp()
gc.collect()
backend_empty_cache(torch_device)
torch.cuda.empty_cache()
def tearDown(self):
# clean up the VRAM after each test
super().tearDown()
gc.collect()
backend_empty_cache(torch_device)
torch.cuda.empty_cache()
def test_paint_by_example(self):
# make sure here that pndm scheduler skips prk

View File

@@ -21,13 +21,7 @@ from transformers import CLIPTextConfig, CLIPTextModelWithProjection, CLIPTokeni
from diffusers import HeunDiscreteScheduler, PriorTransformer, ShapEPipeline
from diffusers.pipelines.shap_e import ShapERenderer
from diffusers.utils.testing_utils import (
backend_empty_cache,
load_numpy,
nightly,
require_torch_accelerator,
torch_device,
)
from diffusers.utils.testing_utils import load_numpy, nightly, require_torch_gpu, torch_device
from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference
@@ -228,19 +222,19 @@ class ShapEPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
@nightly
@require_torch_accelerator
@require_torch_gpu
class ShapEPipelineIntegrationTests(unittest.TestCase):
def setUp(self):
# clean up the VRAM before each test
super().setUp()
gc.collect()
backend_empty_cache(torch_device)
torch.cuda.empty_cache()
def tearDown(self):
# clean up the VRAM after each test
super().tearDown()
gc.collect()
backend_empty_cache(torch_device)
torch.cuda.empty_cache()
def test_shap_e(self):
expected_image = load_numpy(

View File

@@ -23,12 +23,11 @@ from transformers import CLIPImageProcessor, CLIPVisionConfig, CLIPVisionModel
from diffusers import HeunDiscreteScheduler, PriorTransformer, ShapEImg2ImgPipeline
from diffusers.pipelines.shap_e import ShapERenderer
from diffusers.utils.testing_utils import (
backend_empty_cache,
floats_tensor,
load_image,
load_numpy,
nightly,
require_torch_accelerator,
require_torch_gpu,
torch_device,
)
@@ -251,19 +250,19 @@ class ShapEImg2ImgPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
@nightly
@require_torch_accelerator
@require_torch_gpu
class ShapEImg2ImgPipelineIntegrationTests(unittest.TestCase):
def setUp(self):
# clean up the VRAM before each test
super().setUp()
gc.collect()
backend_empty_cache(torch_device)
torch.cuda.empty_cache()
def tearDown(self):
# clean up the VRAM after each test
super().tearDown()
gc.collect()
backend_empty_cache(torch_device)
torch.cuda.empty_cache()
def test_shap_e_img2img(self):
input_image = load_image(

View File

@@ -32,14 +32,7 @@ from diffusers import (
StableAudioProjectionModel,
)
from diffusers.utils import is_xformers_available
from diffusers.utils.testing_utils import (
Expectations,
backend_empty_cache,
enable_full_determinism,
nightly,
require_torch_accelerator,
torch_device,
)
from diffusers.utils.testing_utils import enable_full_determinism, nightly, require_torch_gpu, torch_device
from ..pipeline_params import TEXT_TO_AUDIO_BATCH_PARAMS
from ..test_pipelines_common import PipelineTesterMixin
@@ -426,17 +419,17 @@ class StableAudioPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
@nightly
@require_torch_accelerator
@require_torch_gpu
class StableAudioPipelineIntegrationTests(unittest.TestCase):
def setUp(self):
super().setUp()
gc.collect()
backend_empty_cache(torch_device)
torch.cuda.empty_cache()
def tearDown(self):
super().tearDown()
gc.collect()
backend_empty_cache(torch_device)
torch.cuda.empty_cache()
def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0):
generator = torch.Generator(device=generator_device).manual_seed(seed)
@@ -466,15 +459,9 @@ class StableAudioPipelineIntegrationTests(unittest.TestCase):
# check the portion of the generated audio with the largest dynamic range (reduces flakiness)
audio_slice = audio[0, 447590:447600]
# fmt: off
expected_slices = Expectations(
{
("xpu", 3): np.array([-0.0285, 0.1083, 0.1863, 0.3165, 0.5312, 0.6971, 0.6958, 0.6177, 0.5598, 0.5048]),
("cuda", 7): np.array([-0.0278, 0.1096, 0.1877, 0.3178, 0.5329, 0.6990, 0.6972, 0.6186, 0.5608, 0.5060]),
("cuda", 8): np.array([-0.0285, 0.1082, 0.1862, 0.3163, 0.5306, 0.6964, 0.6953, 0.6172, 0.5593, 0.5044]),
}
expected_slice = np.array(
[-0.0278, 0.1096, 0.1877, 0.3178, 0.5329, 0.6990, 0.6972, 0.6186, 0.5608, 0.5060]
)
# fmt: on
expected_slice = expected_slices.get_expectation()
# fmt: on
max_diff = np.abs(expected_slice - audio_slice.detach().cpu().numpy()).max()
assert max_diff < 1.5e-3

View File

@@ -389,7 +389,7 @@ class BnB4BitBasicTests(Base4bitTests):
class BnB4BitTrainingTests(Base4bitTests):
def setUp(self):
gc.collect()
backend_empty_cache(torch_device)
torch.cuda.empty_cache()
nf4_config = BitsAndBytesConfig(
load_in_4bit=True,
@@ -657,7 +657,7 @@ class SlowBnb4BitTests(Base4bitTests):
class SlowBnb4BitFluxTests(Base4bitTests):
def setUp(self) -> None:
gc.collect()
backend_empty_cache(torch_device)
torch.cuda.empty_cache()
model_id = "hf-internal-testing/flux.1-dev-nf4-pkg"
t5_4bit = T5EncoderModel.from_pretrained(model_id, subfolder="text_encoder_2")
@@ -674,7 +674,7 @@ class SlowBnb4BitFluxTests(Base4bitTests):
del self.pipeline_4bit
gc.collect()
backend_empty_cache(torch_device)
torch.cuda.empty_cache()
def test_quality(self):
# keep the resolution and max tokens to a lower number for faster execution.
@@ -722,7 +722,7 @@ class SlowBnb4BitFluxTests(Base4bitTests):
class SlowBnb4BitFluxControlWithLoraTests(Base4bitTests):
def setUp(self) -> None:
gc.collect()
backend_empty_cache(torch_device)
torch.cuda.empty_cache()
self.pipeline_4bit = FluxControlPipeline.from_pretrained("eramth/flux-4bit", torch_dtype=torch.float16)
self.pipeline_4bit.enable_model_cpu_offload()
@@ -731,7 +731,7 @@ class SlowBnb4BitFluxControlWithLoraTests(Base4bitTests):
del self.pipeline_4bit
gc.collect()
backend_empty_cache(torch_device)
torch.cuda.empty_cache()
def test_lora_loading(self):
self.pipeline_4bit.load_lora_weights("black-forest-labs/FLUX.1-Canny-dev-lora")

View File

@@ -12,7 +12,6 @@ from diffusers import (
FluxPipeline,
FluxTransformer2DModel,
GGUFQuantizationConfig,
HiDreamImageTransformer2DModel,
SD3Transformer2DModel,
StableDiffusion3Pipeline,
)
@@ -550,30 +549,3 @@ class FluxControlLoRAGGUFTests(unittest.TestCase):
max_diff = numpy_cosine_similarity_distance(expected_slice, out_slice)
self.assertTrue(max_diff < 1e-3)
class HiDreamGGUFSingleFileTests(GGUFSingleFileTesterMixin, unittest.TestCase):
ckpt_path = "https://huggingface.co/city96/HiDream-I1-Dev-gguf/blob/main/hidream-i1-dev-Q2_K.gguf"
torch_dtype = torch.bfloat16
model_cls = HiDreamImageTransformer2DModel
expected_memory_use_in_gb = 8
def get_dummy_inputs(self):
return {
"hidden_states": torch.randn((1, 16, 128, 128), generator=torch.Generator("cpu").manual_seed(0)).to(
torch_device, self.torch_dtype
),
"encoder_hidden_states_t5": torch.randn(
(1, 128, 4096),
generator=torch.Generator("cpu").manual_seed(0),
).to(torch_device, self.torch_dtype),
"encoder_hidden_states_llama3": torch.randn(
(32, 1, 128, 4096),
generator=torch.Generator("cpu").manual_seed(0),
).to(torch_device, self.torch_dtype),
"pooled_embeds": torch.randn(
(1, 2048),
generator=torch.Generator("cpu").manual_seed(0),
).to(torch_device, self.torch_dtype),
"timesteps": torch.tensor([1]).to(torch_device, self.torch_dtype),
}

View File

@@ -34,24 +34,13 @@ try:
print("Torch version:", torch.__version__)
print("Cuda available:", torch.cuda.is_available())
print("Cuda version:", torch.version.cuda)
print("CuDNN version:", torch.backends.cudnn.version())
print("Number of GPUs available:", torch.cuda.device_count())
if torch.cuda.is_available():
print("Cuda version:", torch.version.cuda)
print("CuDNN version:", torch.backends.cudnn.version())
print("Number of GPUs available:", torch.cuda.device_count())
device_properties = torch.cuda.get_device_properties(0)
total_memory = device_properties.total_memory / (1024**3)
print(f"CUDA memory: {total_memory} GB")
print("XPU available:", hasattr(torch, "xpu") and torch.xpu.is_available())
if hasattr(torch, "xpu") and torch.xpu.is_available():
print("XPU model:", torch.xpu.get_device_properties(0).name)
print("XPU compiler version:", torch.version.xpu)
print("Number of XPUs available:", torch.xpu.device_count())
device_properties = torch.xpu.get_device_properties(0)
total_memory = device_properties.total_memory / (1024**3)
print(f"XPU memory: {total_memory} GB")
except ImportError:
print("Torch version:", None)