mirror of https://github.com/huggingface/diffusers.git
synced 2026-02-07 03:15:16 +08:00
Compare commits
3 Commits
hidream-si...hidream-no
| Author | SHA1 | Date |
|---|---|---|
|  | cc0335154f |  |
|  | e1968390c1 |  |
|  | 3c84a3c108 |  |
@@ -457,8 +457,6 @@
       title: Flux
     - local: api/pipelines/control_flux_inpaint
       title: FluxControlInpaint
-    - local: api/pipelines/framepack
-      title: Framepack
     - local: api/pipelines/hidream
       title: HiDream-I1
     - local: api/pipelines/hunyuandit
@@ -21,22 +21,6 @@ from diffusers import HiDreamImageTransformer2DModel

 transformer = HiDreamImageTransformer2DModel.from_pretrained("HiDream-ai/HiDream-I1-Full", subfolder="transformer", torch_dtype=torch.bfloat16)
 ```

-## Loading GGUF quantized checkpoints for HiDream-I1
-
-GGUF checkpoints for the `HiDreamImageTransformer2DModel` can be loaded using `~FromOriginalModelMixin.from_single_file`:
-
-```python
-import torch
-from diffusers import GGUFQuantizationConfig, HiDreamImageTransformer2DModel
-
-ckpt_path = "https://huggingface.co/city96/HiDream-I1-Dev-gguf/blob/main/hidream-i1-dev-Q2_K.gguf"
-transformer = HiDreamImageTransformer2DModel.from_single_file(
-    ckpt_path,
-    quantization_config=GGUFQuantizationConfig(compute_dtype=torch.bfloat16),
-    torch_dtype=torch.bfloat16
-)
-```
-
 ## HiDreamImageTransformer2DModel

 [[autodoc]] HiDreamImageTransformer2DModel
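For context on the section removed above: the GGUF-quantized transformer would then be passed to `HiDreamImagePipeline` like any other component. A minimal sketch, not taken from the docs; it assumes the Llama text encoder is supplied separately, following the pipeline docstring shown further down in this diff:

```python
import torch
from transformers import LlamaForCausalLM, PreTrainedTokenizerFast
from diffusers import HiDreamImagePipeline

# `transformer` is the GGUF-quantized model from the removed snippet above.
tokenizer_4 = PreTrainedTokenizerFast.from_pretrained("meta-llama/Meta-Llama-3.1-8B-Instruct")
text_encoder_4 = LlamaForCausalLM.from_pretrained(
    "meta-llama/Meta-Llama-3.1-8B-Instruct", output_hidden_states=True, torch_dtype=torch.bfloat16
)
pipe = HiDreamImagePipeline.from_pretrained(
    "HiDream-ai/HiDream-I1-Full",
    transformer=transformer,
    tokenizer_4=tokenizer_4,
    text_encoder_4=text_encoder_4,
    torch_dtype=torch.bfloat16,
)
pipe.enable_model_cpu_offload()  # optional; keeps VRAM usage down
```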
@@ -1,209 +0,0 @@
<!-- Copyright 2025 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License. -->
# Framepack

<div class="flex flex-wrap space-x-1">
  <img alt="LoRA" src="https://img.shields.io/badge/LoRA-d8b4fe?style=flat"/>
</div>

[Packing Input Frame Context in Next-Frame Prediction Models for Video Generation](https://arxiv.org/abs/2504.12626) by Lvmin Zhang and Maneesh Agrawala.

*We present a neural network structure, FramePack, to train next-frame (or next-frame-section) prediction models for video generation. The FramePack compresses input frames to make the transformer context length a fixed number regardless of the video length. As a result, we are able to process a large number of frames using video diffusion with computation bottleneck similar to image diffusion. This also makes the training video batch sizes significantly higher (batch sizes become comparable to image diffusion training). We also propose an anti-drifting sampling method that generates frames in inverted temporal order with early-established endpoints to avoid exposure bias (error accumulation over iterations). Finally, we show that existing video diffusion models can be finetuned with FramePack, and their visual quality may be improved because the next-frame prediction supports more balanced diffusion schedulers with less extreme flow shift timesteps.*

<Tip>

Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines.

</Tip>

## Available models

| Model name | Description |
|:---|:---|
| [`lllyasviel/FramePackI2V_HY`](https://huggingface.co/lllyasviel/FramePackI2V_HY) | Trained with the "inverted anti-drifting" strategy as described in the paper. Inference requires setting `sampling_type="inverted_anti_drifting"` when running the pipeline. |
| [`lllyasviel/FramePack_F1_I2V_HY_20250503`](https://huggingface.co/lllyasviel/FramePack_F1_I2V_HY_20250503) | Trained with a novel anti-drifting strategy, but inference is performed with the "vanilla" strategy as described in the paper. Inference requires setting `sampling_type="vanilla"` when running the pipeline. |
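In practice the `sampling_type` flag from the table is simply forwarded to the pipeline call. A minimal sketch (it assumes a loaded `HunyuanVideoFramepackPipeline` named `pipe` and an input `image`; complete setups follow in the Usage section below):

```python
output = pipe(
    image=image,
    prompt="A penguin dancing in the snow",
    num_frames=91,
    sampling_type="vanilla",  # or "inverted_anti_drifting" for lllyasviel/FramePackI2V_HY
).frames[0]
```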

## Usage

Refer to the pipeline documentation for basic usage examples. The following section contains examples of offloading, different sampling methods, quantization, and more.

### First and last frame to video

The following example shows how to use Framepack with start and end image controls, using the inverted anti-drifting sampling model.
```python
import torch
from diffusers import HunyuanVideoFramepackPipeline, HunyuanVideoFramepackTransformer3DModel
from diffusers.utils import export_to_video, load_image
from transformers import SiglipImageProcessor, SiglipVisionModel

transformer = HunyuanVideoFramepackTransformer3DModel.from_pretrained(
    "lllyasviel/FramePackI2V_HY", torch_dtype=torch.bfloat16
)
feature_extractor = SiglipImageProcessor.from_pretrained(
    "lllyasviel/flux_redux_bfl", subfolder="feature_extractor"
)
image_encoder = SiglipVisionModel.from_pretrained(
    "lllyasviel/flux_redux_bfl", subfolder="image_encoder", torch_dtype=torch.float16
)
pipe = HunyuanVideoFramepackPipeline.from_pretrained(
    "hunyuanvideo-community/HunyuanVideo",
    transformer=transformer,
    feature_extractor=feature_extractor,
    image_encoder=image_encoder,
    torch_dtype=torch.float16,
)

# Enable memory optimizations
pipe.enable_model_cpu_offload()
pipe.vae.enable_tiling()

prompt = "CG animation style, a small blue bird takes off from the ground, flapping its wings. The bird's feathers are delicate, with a unique pattern on its chest. The background shows a blue sky with white clouds under bright sunshine. The camera follows the bird upward, capturing its flight and the vastness of the sky from a close-up, low-angle perspective."
first_image = load_image(
    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/flf2v_input_first_frame.png"
)
last_image = load_image(
    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/flf2v_input_last_frame.png"
)
output = pipe(
    image=first_image,
    last_image=last_image,
    prompt=prompt,
    height=512,
    width=512,
    num_frames=91,
    num_inference_steps=30,
    guidance_scale=9.0,
    generator=torch.Generator().manual_seed(0),
    sampling_type="inverted_anti_drifting",
).frames[0]
export_to_video(output, "output.mp4", fps=30)
```

### Vanilla sampling

The following example shows how to use Framepack with the F1 model, which was trained with vanilla sampling and a new regulation approach for anti-drifting.
```python
import torch
from diffusers import HunyuanVideoFramepackPipeline, HunyuanVideoFramepackTransformer3DModel
from diffusers.utils import export_to_video, load_image
from transformers import SiglipImageProcessor, SiglipVisionModel

transformer = HunyuanVideoFramepackTransformer3DModel.from_pretrained(
    "lllyasviel/FramePack_F1_I2V_HY_20250503", torch_dtype=torch.bfloat16
)
feature_extractor = SiglipImageProcessor.from_pretrained(
    "lllyasviel/flux_redux_bfl", subfolder="feature_extractor"
)
image_encoder = SiglipVisionModel.from_pretrained(
    "lllyasviel/flux_redux_bfl", subfolder="image_encoder", torch_dtype=torch.float16
)
pipe = HunyuanVideoFramepackPipeline.from_pretrained(
    "hunyuanvideo-community/HunyuanVideo",
    transformer=transformer,
    feature_extractor=feature_extractor,
    image_encoder=image_encoder,
    torch_dtype=torch.float16,
)

# Enable memory optimizations
pipe.enable_model_cpu_offload()
pipe.vae.enable_tiling()

image = load_image(
    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/penguin.png"
)
output = pipe(
    image=image,
    prompt="A penguin dancing in the snow",
    height=832,
    width=480,
    num_frames=91,
    num_inference_steps=30,
    guidance_scale=9.0,
    generator=torch.Generator().manual_seed(0),
    sampling_type="vanilla",
).frames[0]
export_to_video(output, "output.mp4", fps=30)
```

### Group offloading

Group offloading ([`~hooks.apply_group_offloading`]) provides aggressive memory optimizations for offloading internal parts of any model to the CPU, with possibly no additional overhead to generation time. If you have very low VRAM available, this approach may be suitable for you depending on the amount of CPU RAM available.
```python
import torch
from diffusers import HunyuanVideoFramepackPipeline, HunyuanVideoFramepackTransformer3DModel
from diffusers.hooks import apply_group_offloading
from diffusers.utils import export_to_video, load_image
from transformers import SiglipImageProcessor, SiglipVisionModel

transformer = HunyuanVideoFramepackTransformer3DModel.from_pretrained(
    "lllyasviel/FramePack_F1_I2V_HY_20250503", torch_dtype=torch.bfloat16
)
feature_extractor = SiglipImageProcessor.from_pretrained(
    "lllyasviel/flux_redux_bfl", subfolder="feature_extractor"
)
image_encoder = SiglipVisionModel.from_pretrained(
    "lllyasviel/flux_redux_bfl", subfolder="image_encoder", torch_dtype=torch.float16
)
pipe = HunyuanVideoFramepackPipeline.from_pretrained(
    "hunyuanvideo-community/HunyuanVideo",
    transformer=transformer,
    feature_extractor=feature_extractor,
    image_encoder=image_encoder,
    torch_dtype=torch.float16,
)

# Enable group offloading
onload_device = torch.device("cuda")
offload_device = torch.device("cpu")
list(map(
    lambda x: apply_group_offloading(x, onload_device, offload_device, offload_type="leaf_level", use_stream=True, low_cpu_mem_usage=True),
    [pipe.text_encoder, pipe.text_encoder_2, pipe.transformer]
))
pipe.image_encoder.to(onload_device)
pipe.vae.to(onload_device)
pipe.vae.enable_tiling()

image = load_image(
    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/penguin.png"
)
output = pipe(
    image=image,
    prompt="A penguin dancing in the snow",
    height=832,
    width=480,
    num_frames=91,
    num_inference_steps=30,
    guidance_scale=9.0,
    generator=torch.Generator().manual_seed(0),
    sampling_type="vanilla",
).frames[0]
print(f"Max memory: {torch.cuda.max_memory_allocated() / 1024**3:.3f} GB")
export_to_video(output, "output.mp4", fps=30)
```

## HunyuanVideoFramepackPipeline

[[autodoc]] HunyuanVideoFramepackPipeline
  - all
  - __call__

## HunyuanVideoPipelineOutput

[[autodoc]] pipelines.hunyuan_video.pipeline_output.HunyuanVideoPipelineOutput
@@ -52,6 +52,7 @@ The following models are available for the image-to-video pipeline:
 | [`Skywork/SkyReels-V1-Hunyuan-I2V`](https://huggingface.co/Skywork/SkyReels-V1-Hunyuan-I2V) | Skywork's custom finetune of HunyuanVideo (de-distilled). Performs best at `97x544x960` resolution with `guidance_scale=1.0`, `true_cfg_scale=6.0`, and a negative prompt. |
 | [`hunyuanvideo-community/HunyuanVideo-I2V-33ch`](https://huggingface.co/hunyuanvideo-community/HunyuanVideo-I2V) | Tencent's official HunyuanVideo 33-channel I2V model. Performs best at resolutions of 480, 720, 960, 1280. A higher `shift` value when initializing the scheduler is recommended (good values are between 7 and 20). |
 | [`hunyuanvideo-community/HunyuanVideo-I2V`](https://huggingface.co/hunyuanvideo-community/HunyuanVideo-I2V) | Tencent's official HunyuanVideo 16-channel I2V model. Performs best at resolutions of 480, 720, 960, 1280. A higher `shift` value when initializing the scheduler is recommended (good values are between 7 and 20). |
+| [`lllyasviel/FramePackI2V_HY`](https://huggingface.co/lllyasviel/FramePackI2V_HY) | lllyasviel's paper introducing a new technique for long-context video generation called [Framepack](https://arxiv.org/abs/2504.12626). |
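The table's `shift` recommendation has no accompanying snippet in this diff; as a rough illustration (assuming the I2V pipeline ships a `FlowMatchEulerDiscreteScheduler`, whose config accepts a `shift` override), raising it would look like this:

```python
import torch
from diffusers import FlowMatchEulerDiscreteScheduler, HunyuanVideoImageToVideoPipeline

pipe = HunyuanVideoImageToVideoPipeline.from_pretrained(
    "hunyuanvideo-community/HunyuanVideo-I2V", torch_dtype=torch.float16
)
# Rebuild the scheduler with a higher flow shift (good values: 7-20).
pipe.scheduler = FlowMatchEulerDiscreteScheduler.from_config(pipe.scheduler.config, shift=7.0)
```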

## Quantization
@@ -31,7 +31,6 @@ from .single_file_utils import (
     convert_autoencoder_dc_checkpoint_to_diffusers,
     convert_controlnet_checkpoint,
     convert_flux_transformer_checkpoint_to_diffusers,
-    convert_hidream_transformer_to_diffusers,
     convert_hunyuan_video_transformer_to_diffusers,
     convert_ldm_unet_checkpoint,
     convert_ldm_vae_checkpoint,
@@ -134,10 +133,6 @@ SINGLE_FILE_LOADABLE_CLASSES = {
         "checkpoint_mapping_fn": convert_wan_vae_to_diffusers,
         "default_subfolder": "vae",
     },
-    "HiDreamImageTransformer2DModel": {
-        "checkpoint_mapping_fn": convert_hidream_transformer_to_diffusers,
-        "default_subfolder": "transformer",
-    },
 }
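The effect of the removed registry entry: `from_single_file` dispatches through this mapping, so the lookup no longer resolves for HiDream. A simplified sketch of the dispatch (not the exact implementation):

```python
# With the "HiDreamImageTransformer2DModel" entry gone, single-file loading
# can no longer find a mapping function for HiDream checkpoints.
entry = SINGLE_FILE_LOADABLE_CLASSES.get("HiDreamImageTransformer2DModel")
mapping_fn = entry["checkpoint_mapping_fn"] if entry is not None else None
```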
@@ -126,7 +126,6 @@ CHECKPOINT_KEY_NAMES = {
     ],
     "wan": ["model.diffusion_model.head.modulation", "head.modulation"],
     "wan_vae": "decoder.middle.0.residual.0.gamma",
-    "hidream": "double_stream_blocks.0.block.adaLN_modulation.1.bias",
 }

 DIFFUSERS_DEFAULT_PIPELINE_PATHS = {
@@ -191,7 +190,6 @@ DIFFUSERS_DEFAULT_PIPELINE_PATHS = {
     "wan-t2v-1.3B": {"pretrained_model_name_or_path": "Wan-AI/Wan2.1-T2V-1.3B-Diffusers"},
     "wan-t2v-14B": {"pretrained_model_name_or_path": "Wan-AI/Wan2.1-T2V-14B-Diffusers"},
     "wan-i2v-14B": {"pretrained_model_name_or_path": "Wan-AI/Wan2.1-I2V-14B-480P-Diffusers"},
-    "hidream": {"pretrained_model_name_or_path": "HiDream-ai/HiDream-I1-Dev"},
 }

 # Use to configure model sample size when original config is provided
@@ -703,8 +701,6 @@ def infer_diffusers_model_type(checkpoint):
     elif CHECKPOINT_KEY_NAMES["wan_vae"] in checkpoint:
         # All Wan models use the same VAE so we can use the same default model repo to fetch the config
         model_type = "wan-t2v-14B"
-    elif CHECKPOINT_KEY_NAMES["hidream"] in checkpoint:
-        model_type = "hidream"
     else:
         model_type = "v1"
@@ -3297,12 +3293,3 @@ def convert_wan_vae_to_diffusers(checkpoint, **kwargs):
         converted_state_dict[key] = value

     return converted_state_dict
-
-
-def convert_hidream_transformer_to_diffusers(checkpoint, **kwargs):
-    keys = list(checkpoint.keys())
-    for k in keys:
-        if "model.diffusion_model." in k:
-            checkpoint[k.replace("model.diffusion_model.", "")] = checkpoint.pop(k)
-
-    return checkpoint
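For reference, the deleted converter only strips the `model.diffusion_model.` prefix from original checkpoint keys. A toy invocation (placeholder tensor values) behaves like this:

```python
# Toy illustration of the deleted key remapping (values are placeholders).
state_dict = {"model.diffusion_model.x_embedder.weight": 0, "other.key": 1}
converted = convert_hidream_transformer_to_diffusers(state_dict)
assert sorted(converted) == ["other.key", "x_embedder.weight"]
```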
@@ -5,7 +5,7 @@ import torch.nn as nn
 import torch.nn.functional as F

 from ...configuration_utils import ConfigMixin, register_to_config
-from ...loaders import FromOriginalModelMixin, PeftAdapterMixin
+from ...loaders import PeftAdapterMixin
 from ...models.modeling_outputs import Transformer2DModelOutput
 from ...models.modeling_utils import ModelMixin
 from ...utils import USE_PEFT_BACKEND, deprecate, logging, scale_lora_layers, unscale_lora_layers
@@ -602,7 +602,7 @@ class HiDreamBlock(nn.Module):
         )


-class HiDreamImageTransformer2DModel(ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginalModelMixin):
+class HiDreamImageTransformer2DModel(ModelMixin, ConfigMixin, PeftAdapterMixin):
     _supports_gradient_checkpointing = True
     _no_split_modules = ["HiDreamImageTransformerBlock", "HiDreamImageSingleTransformerBlock"]
@@ -152,19 +152,9 @@ class HunyuanVideoFramepackTransformer3DModel(

         # 1. Latent and condition embedders
         self.x_embedder = HunyuanVideoPatchEmbed((patch_size_t, patch_size, patch_size), in_channels, inner_dim)

-        # Framepack history projection embedder
-        self.clean_x_embedder = None
-        if has_clean_x_embedder:
-            self.clean_x_embedder = HunyuanVideoHistoryPatchEmbed(in_channels, inner_dim)
-
         self.context_embedder = HunyuanVideoTokenRefiner(
             text_embed_dim, num_attention_heads, attention_head_dim, num_layers=num_refiner_layers
         )

-        # Framepack image-conditioning embedder
-        self.image_projection = FramepackClipVisionProjection(image_proj_dim, inner_dim) if has_image_proj else None
-
         self.time_text_embed = HunyuanVideoConditionEmbedding(
             inner_dim, pooled_projection_dim, guidance_embeds, image_condition_type
         )
@@ -196,7 +186,14 @@
         self.norm_out = AdaLayerNormContinuous(inner_dim, inner_dim, elementwise_affine=False, eps=1e-6)
         self.proj_out = nn.Linear(inner_dim, patch_size_t * patch_size * patch_size * out_channels)

-        self.gradient_checkpointing = False
+        # Framepack specific modules
+        self.image_projection = FramepackClipVisionProjection(image_proj_dim, inner_dim) if has_image_proj else None
+
+        self.clean_x_embedder = None
+        if has_clean_x_embedder:
+            self.clean_x_embedder = HunyuanVideoHistoryPatchEmbed(in_channels, inner_dim)
+
+        self.use_gradient_checkpointing = False

     def forward(
         self,
@@ -36,11 +36,11 @@ EXAMPLE_DOC_STRING = """
     Examples:
         ```py
         >>> import torch
-        >>> from transformers import AutoTokenizer, LlamaForCausalLM
-        >>> from diffusers import HiDreamImagePipeline
+        >>> from transformers import PreTrainedTokenizerFast, LlamaForCausalLM
+        >>> from diffusers import UniPCMultistepScheduler, HiDreamImagePipeline


-        >>> tokenizer_4 = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3.1-8B-Instruct")
+        >>> tokenizer_4 = PreTrainedTokenizerFast.from_pretrained("meta-llama/Meta-Llama-3.1-8B-Instruct")
         >>> text_encoder_4 = LlamaForCausalLM.from_pretrained(
         ...     "meta-llama/Meta-Llama-3.1-8B-Instruct",
         ...     output_hidden_states=True,
@@ -14,7 +14,6 @@

 import inspect
 import math
-from enum import Enum
 from typing import Any, Callable, Dict, List, Optional, Tuple, Union

 import numpy as np
@@ -92,7 +91,6 @@ EXAMPLE_DOC_STRING = """
         ...     num_inference_steps=30,
         ...     guidance_scale=9.0,
         ...     generator=torch.Generator().manual_seed(0),
-        ...     sampling_type="inverted_anti_drifting",
         ... ).frames[0]
         >>> export_to_video(output, "output.mp4", fps=30)
         ```
@@ -140,7 +138,6 @@ EXAMPLE_DOC_STRING = """
         ...     num_inference_steps=30,
         ...     guidance_scale=9.0,
         ...     generator=torch.Generator().manual_seed(0),
-        ...     sampling_type="inverted_anti_drifting",
         ... ).frames[0]
         >>> export_to_video(output, "output.mp4", fps=30)
         ```
@@ -235,11 +232,6 @@ def retrieve_timesteps(
     return timesteps, num_inference_steps


-class FramepackSamplingType(str, Enum):
-    VANILLA = "vanilla"
-    INVERTED_ANTI_DRIFTING = "inverted_anti_drifting"
-
-
 class HunyuanVideoFramepackPipeline(DiffusionPipeline, HunyuanVideoLoraLoaderMixin):
     r"""
     Pipeline for text-to-video generation using HunyuanVideo.
@@ -463,11 +455,6 @@ class HunyuanVideoFramepackPipeline(DiffusionPipeline, HunyuanVideoLoraLoaderMixin):
         prompt_embeds=None,
         callback_on_step_end_tensor_inputs=None,
         prompt_template=None,
-        image=None,
-        image_latents=None,
-        last_image=None,
-        last_image_latents=None,
-        sampling_type=None,
     ):
         if height % 16 != 0 or width % 16 != 0:
             raise ValueError(f"`height` and `width` have to be divisible by 16 but are {height} and {width}.")
@@ -506,21 +493,6 @@ class HunyuanVideoFramepackPipeline(DiffusionPipeline, HunyuanVideoLoraLoaderMixin):
                 f"`prompt_template` has to contain a key `template` but only found {prompt_template.keys()}"
             )

-        sampling_types = [x.value for x in FramepackSamplingType.__members__.values()]
-        if sampling_type not in sampling_types:
-            raise ValueError(f"`sampling_type` has to be one of '{sampling_types}' but is '{sampling_type}'")
-
-        if image is not None and image_latents is not None:
-            raise ValueError("Only one of `image` or `image_latents` can be passed.")
-        if last_image is not None and last_image_latents is not None:
-            raise ValueError("Only one of `last_image` or `last_image_latents` can be passed.")
-        if sampling_type != FramepackSamplingType.INVERTED_ANTI_DRIFTING and (
-            last_image is not None or last_image_latents is not None
-        ):
-            raise ValueError(
-                'Only `"inverted_anti_drifting"` inference type supports `last_image` or `last_image_latents`.'
-            )
-
     def prepare_latents(
         self,
         batch_size: int = 1,
@@ -651,7 +623,6 @@ class HunyuanVideoFramepackPipeline(DiffusionPipeline, HunyuanVideoLoraLoaderMixin):
         callback_on_step_end_tensor_inputs: List[str] = ["latents"],
         prompt_template: Dict[str, Any] = DEFAULT_PROMPT_TEMPLATE,
         max_sequence_length: int = 256,
-        sampling_type: FramepackSamplingType = FramepackSamplingType.INVERTED_ANTI_DRIFTING,
     ):
         r"""
         The call function to the pipeline for generation.
@@ -764,11 +735,6 @@ class HunyuanVideoFramepackPipeline(DiffusionPipeline, HunyuanVideoLoraLoaderMixin):
             prompt_embeds,
             callback_on_step_end_tensor_inputs,
             prompt_template,
-            image,
-            image_latents,
-            last_image,
-            last_image_latents,
-            sampling_type,
         )

         has_neg_prompt = negative_prompt is not None or (
@@ -840,6 +806,18 @@ class HunyuanVideoFramepackPipeline(DiffusionPipeline, HunyuanVideoLoraLoaderMixin):
         num_channels_latents = self.transformer.config.in_channels
         window_num_frames = (latent_window_size - 1) * self.vae_scale_factor_temporal + 1
         num_latent_sections = max(1, (num_frames + window_num_frames - 1) // window_num_frames)
+        # Specific to the released checkpoint: https://huggingface.co/lllyasviel/FramePackI2V_HY
+        # TODO: find a more generic way in future if there are more checkpoints
+        history_sizes = [1, 2, 16]
+        history_latents = torch.zeros(
+            batch_size,
+            num_channels_latents,
+            sum(history_sizes),
+            height // self.vae_scale_factor_spatial,
+            width // self.vae_scale_factor_spatial,
+            device=device,
+            dtype=torch.float32,
+        )
         history_video = None
         total_generated_latent_frames = 0
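To make the buffer added above concrete, here is a standalone shape check with hypothetical sizes (512x512 output, 16 latent channels, spatial VAE factor 8):

```python
import torch

batch_size, num_channels_latents = 1, 16
height, width, vae_scale_factor_spatial = 512, 512, 8
history_sizes = [1, 2, 16]  # 1x, 2x, and 4x history sections
history_latents = torch.zeros(
    batch_size,
    num_channels_latents,
    sum(history_sizes),
    height // vae_scale_factor_spatial,
    width // vae_scale_factor_spatial,
)
print(history_latents.shape)  # torch.Size([1, 16, 19, 64, 64])
```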
@@ -851,92 +829,38 @@ class HunyuanVideoFramepackPipeline(DiffusionPipeline, HunyuanVideoLoraLoaderMixin):
                 last_image, dtype=torch.float32, device=device, generator=generator
             )

-        # Specific to the released checkpoints:
-        # - https://huggingface.co/lllyasviel/FramePackI2V_HY
-        # - https://huggingface.co/lllyasviel/FramePack_F1_I2V_HY_20250503
-        # TODO: find a more generic way in future if there are more checkpoints
-        if sampling_type == FramepackSamplingType.INVERTED_ANTI_DRIFTING:
-            history_sizes = [1, 2, 16]
-            history_latents = torch.zeros(
-                batch_size,
-                num_channels_latents,
-                sum(history_sizes),
-                height // self.vae_scale_factor_spatial,
-                width // self.vae_scale_factor_spatial,
-                device=device,
-                dtype=torch.float32,
-            )
-
-        elif sampling_type == FramepackSamplingType.VANILLA:
-            history_sizes = [16, 2, 1]
-            history_latents = torch.zeros(
-                batch_size,
-                num_channels_latents,
-                sum(history_sizes),
-                height // self.vae_scale_factor_spatial,
-                width // self.vae_scale_factor_spatial,
-                device=device,
-                dtype=torch.float32,
-            )
-            history_latents = torch.cat([history_latents, image_latents], dim=2)
-            total_generated_latent_frames += 1
-
-        else:
-            assert False
+        latent_paddings = list(reversed(range(num_latent_sections)))
+        if num_latent_sections > 4:
+            latent_paddings = [3] + [2] * (num_latent_sections - 3) + [1, 0]

         # 6. Prepare guidance condition
         guidance = torch.tensor([guidance_scale] * batch_size, dtype=transformer_dtype, device=device) * 1000.0

         # 7. Denoising loop
         for k in range(num_latent_sections):
-            if sampling_type == FramepackSamplingType.INVERTED_ANTI_DRIFTING:
-                latent_paddings = list(reversed(range(num_latent_sections)))
-                if num_latent_sections > 4:
-                    latent_paddings = [3] + [2] * (num_latent_sections - 3) + [1, 0]
-                is_first_section = k == 0
-                is_last_section = k == num_latent_sections - 1
-                latent_padding_size = latent_paddings[k] * latent_window_size
+            is_first_section = k == 0
+            is_last_section = k == num_latent_sections - 1
+            latent_padding_size = latent_paddings[k] * latent_window_size

+            indices = torch.arange(0, sum([1, latent_padding_size, latent_window_size, *history_sizes]))
+            (
+                indices_prefix,
+                indices_padding,
+                indices_latents,
+                indices_postfix,
+                indices_latents_history_2x,
+                indices_latents_history_4x,
+            ) = indices.split([1, latent_padding_size, latent_window_size, *history_sizes], dim=0)
+            # Inverted anti-drifting sampling: Figure 2(c) in the paper
+            indices_clean_latents = torch.cat([indices_prefix, indices_postfix], dim=0)

-                indices = torch.arange(0, sum([1, latent_padding_size, latent_window_size, *history_sizes]))
-                (
-                    indices_prefix,
-                    indices_padding,
-                    indices_latents,
-                    indices_latents_history_1x,
-                    indices_latents_history_2x,
-                    indices_latents_history_4x,
-                ) = indices.split([1, latent_padding_size, latent_window_size, *history_sizes], dim=0)
-                # Inverted anti-drifting sampling: Figure 2(c) in the paper
-                indices_clean_latents = torch.cat([indices_prefix, indices_latents_history_1x], dim=0)
-
-                latents_prefix = image_latents
-                latents_history_1x, latents_history_2x, latents_history_4x = history_latents[
-                    :, :, : sum(history_sizes)
-                ].split(history_sizes, dim=2)
-                if last_image is not None and is_first_section:
-                    latents_history_1x = last_image_latents
-                latents_clean = torch.cat([latents_prefix, latents_history_1x], dim=2)
-
-            elif sampling_type == FramepackSamplingType.VANILLA:
-                indices = torch.arange(0, sum([1, *history_sizes, latent_window_size]))
-                (
-                    indices_prefix,
-                    indices_latents_history_4x,
-                    indices_latents_history_2x,
-                    indices_latents_history_1x,
-                    indices_latents,
-                ) = indices.split([1, *history_sizes, latent_window_size], dim=0)
-                indices_clean_latents = torch.cat([indices_prefix, indices_latents_history_1x], dim=0)
-
-                latents_prefix = image_latents
-                latents_history_4x, latents_history_2x, latents_history_1x = history_latents[
-                    :, :, -sum(history_sizes) :
-                ].split(history_sizes, dim=2)
-                latents_clean = torch.cat([latents_prefix, latents_history_1x], dim=2)
-
-            else:
-                assert False
+            latents_prefix = image_latents
+            latents_postfix, latents_history_2x, latents_history_4x = history_latents[
+                :, :, : sum(history_sizes)
+            ].split(history_sizes, dim=2)
+            if last_image is not None and is_first_section:
+                latents_postfix = last_image_latents
+            latents_clean = torch.cat([latents_prefix, latents_postfix], dim=2)

             latents = self.prepare_latents(
                 batch_size,
@@ -1036,26 +960,13 @@ class HunyuanVideoFramepackPipeline(DiffusionPipeline, HunyuanVideoLoraLoaderMixin):
                 if XLA_AVAILABLE:
                     xm.mark_step()

-            if sampling_type == FramepackSamplingType.INVERTED_ANTI_DRIFTING:
-                if is_last_section:
-                    latents = torch.cat([image_latents, latents], dim=2)
-                total_generated_latent_frames += latents.shape[2]
-                history_latents = torch.cat([latents, history_latents], dim=2)
-                real_history_latents = history_latents[:, :, :total_generated_latent_frames]
-                section_latent_frames = (
-                    (latent_window_size * 2 + 1) if is_last_section else (latent_window_size * 2)
-                )
-                index_slice = (slice(None), slice(None), slice(0, section_latent_frames))
+            if is_last_section:
+                latents = torch.cat([image_latents, latents], dim=2)

-            elif sampling_type == FramepackSamplingType.VANILLA:
-                total_generated_latent_frames += latents.shape[2]
-                history_latents = torch.cat([history_latents, latents], dim=2)
-                real_history_latents = history_latents[:, :, -total_generated_latent_frames:]
-                section_latent_frames = latent_window_size * 2
-                index_slice = (slice(None), slice(None), slice(-section_latent_frames, None))
+            total_generated_latent_frames += latents.shape[2]
+            history_latents = torch.cat([latents, history_latents], dim=2)

-            else:
-                assert False
+            real_history_latents = history_latents[:, :, :total_generated_latent_frames]

             if history_video is None:
                 if not output_type == "latent":
@@ -1065,18 +976,16 @@ class HunyuanVideoFramepackPipeline(DiffusionPipeline, HunyuanVideoLoraLoaderMixin):
                     history_video = [real_history_latents]
             else:
                 if not output_type == "latent":
+                    section_latent_frames = (
+                        (latent_window_size * 2 + 1) if is_last_section else (latent_window_size * 2)
+                    )
                     overlapped_frames = (latent_window_size - 1) * self.vae_scale_factor_temporal + 1
                     current_latents = (
-                        real_history_latents[index_slice].to(vae_dtype) / self.vae.config.scaling_factor
+                        real_history_latents[:, :, :section_latent_frames].to(vae_dtype)
+                        / self.vae.config.scaling_factor
                     )
                     current_video = self.vae.decode(current_latents, return_dict=False)[0]

-                    if sampling_type == FramepackSamplingType.INVERTED_ANTI_DRIFTING:
-                        history_video = self._soft_append(current_video, history_video, overlapped_frames)
-                    elif sampling_type == FramepackSamplingType.VANILLA:
-                        history_video = self._soft_append(history_video, current_video, overlapped_frames)
-                    else:
-                        assert False
+                    history_video = self._soft_append(current_video, history_video, overlapped_frames)
                 else:
                     history_video.append(real_history_latents)
@@ -789,7 +789,6 @@ class LTXPipeline(DiffusionPipeline, FromSingleFileMixin, LTXVideoLoraLoaderMixin):
             ]
             latents = (1 - decode_noise_scale) * latents + decode_noise_scale * noise

-            latents = latents.to(self.vae.dtype)
             video = self.vae.decode(latents, timestep, return_dict=False)[0]
         video = self.video_processor.postprocess_video(video, output_type=output_type)
@@ -1,116 +0,0 @@
# Copyright 2024 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import unittest

import torch

from diffusers import HunyuanVideoFramepackTransformer3DModel
from diffusers.utils.testing_utils import (
    enable_full_determinism,
    torch_device,
)

from ..test_modeling_common import ModelTesterMixin


enable_full_determinism()


class HunyuanVideoTransformer3DTests(ModelTesterMixin, unittest.TestCase):
    model_class = HunyuanVideoFramepackTransformer3DModel
    main_input_name = "hidden_states"
    uses_custom_attn_processor = True
    model_split_percents = [0.5, 0.7, 0.9]

    @property
    def dummy_input(self):
        batch_size = 1
        num_channels = 4
        num_frames = 3
        height = 4
        width = 4
        text_encoder_embedding_dim = 16
        image_encoder_embedding_dim = 16
        pooled_projection_dim = 8
        sequence_length = 12

        hidden_states = torch.randn((batch_size, num_channels, num_frames, height, width)).to(torch_device)
        encoder_hidden_states = torch.randn((batch_size, sequence_length, text_encoder_embedding_dim)).to(torch_device)
        pooled_projections = torch.randn((batch_size, pooled_projection_dim)).to(torch_device)
        encoder_attention_mask = torch.ones((batch_size, sequence_length)).to(torch_device)
        image_embeds = torch.randn((batch_size, sequence_length, image_encoder_embedding_dim)).to(torch_device)
        indices_latents = torch.ones((3,)).to(torch_device)
        latents_clean = torch.randn((batch_size, num_channels, num_frames - 1, height, width)).to(torch_device)
        indices_latents_clean = torch.ones((num_frames - 1,)).to(torch_device)
        latents_history_2x = torch.randn((batch_size, num_channels, num_frames - 1, height, width)).to(torch_device)
        indices_latents_history_2x = torch.ones((num_frames - 1,)).to(torch_device)
        latents_history_4x = torch.randn((batch_size, num_channels, (num_frames - 1) * 4, height, width)).to(
            torch_device
        )
        indices_latents_history_4x = torch.ones(((num_frames - 1) * 4,)).to(torch_device)
        timestep = torch.randint(0, 1000, size=(batch_size,)).to(torch_device)
        guidance = torch.randint(0, 1000, size=(batch_size,)).to(torch_device)

        return {
            "hidden_states": hidden_states,
            "timestep": timestep,
            "encoder_hidden_states": encoder_hidden_states,
            "pooled_projections": pooled_projections,
            "encoder_attention_mask": encoder_attention_mask,
            "guidance": guidance,
            "image_embeds": image_embeds,
            "indices_latents": indices_latents,
            "latents_clean": latents_clean,
            "indices_latents_clean": indices_latents_clean,
            "latents_history_2x": latents_history_2x,
            "indices_latents_history_2x": indices_latents_history_2x,
            "latents_history_4x": latents_history_4x,
            "indices_latents_history_4x": indices_latents_history_4x,
        }

    @property
    def input_shape(self):
        return (4, 3, 4, 4)

    @property
    def output_shape(self):
        return (4, 3, 4, 4)

    def prepare_init_args_and_inputs_for_common(self):
        init_dict = {
            "in_channels": 4,
            "out_channels": 4,
            "num_attention_heads": 2,
            "attention_head_dim": 10,
            "num_layers": 1,
            "num_single_layers": 1,
            "num_refiner_layers": 1,
            "patch_size": 2,
            "patch_size_t": 1,
            "guidance_embeds": True,
            "text_embed_dim": 16,
            "pooled_projection_dim": 8,
            "rope_axes_dim": (2, 4, 4),
            "image_condition_type": None,
            "has_image_proj": True,
            "image_proj_dim": 16,
            "has_clean_x_embedder": True,
        }
        inputs_dict = self.dummy_input
        return init_dict, inputs_dict

    def test_gradient_checkpointing_is_applied(self):
        expected_set = {"HunyuanVideoFramepackTransformer3DModel"}
        super().test_gradient_checkpointing_is_applied(expected_set=expected_set)
@@ -20,13 +20,13 @@ import torch
 from diffusers import LTXVideoTransformer3DModel
 from diffusers.utils.testing_utils import enable_full_determinism, torch_device

-from ..test_modeling_common import ModelTesterMixin, TorchCompileTesterMixin
+from ..test_modeling_common import ModelTesterMixin


 enable_full_determinism()


-class LTXTransformerTests(ModelTesterMixin, TorchCompileTesterMixin, unittest.TestCase):
+class LTXTransformerTests(ModelTesterMixin, unittest.TestCase):
     model_class = LTXVideoTransformer3DModel
     main_input_name = "hidden_states"
     uses_custom_attn_processor = True
@@ -24,10 +24,9 @@ from transformers import AutoTokenizer, T5EncoderModel
 from diffusers import AutoencoderKLCogVideoX, ConsisIDPipeline, ConsisIDTransformer3DModel, DDIMScheduler
 from diffusers.utils import load_image
 from diffusers.utils.testing_utils import (
-    backend_empty_cache,
     enable_full_determinism,
     numpy_cosine_similarity_distance,
-    require_torch_accelerator,
+    require_torch_gpu,
     slow,
     torch_device,
 )
@@ -317,19 +316,19 @@ class ConsisIDPipelineFastTests(PipelineTesterMixin, unittest.TestCase):


 @slow
-@require_torch_accelerator
+@require_torch_gpu
 class ConsisIDPipelineIntegrationTests(unittest.TestCase):
     prompt = "A painting of a squirrel eating a burger."

     def setUp(self):
         super().setUp()
         gc.collect()
-        backend_empty_cache(torch_device)
+        torch.cuda.empty_cache()

     def tearDown(self):
         super().tearDown()
         gc.collect()
-        backend_empty_cache(torch_device)
+        torch.cuda.empty_cache()

     def test_consisid(self):
         generator = torch.Generator("cpu").manual_seed(0)
@@ -339,8 +338,8 @@ class ConsisIDPipelineIntegrationTests(unittest.TestCase):

         prompt = self.prompt
         image = load_image("https://github.com/PKU-YuanGroup/ConsisID/blob/main/asserts/example_images/2.png?raw=true")
-        id_vit_hidden = [torch.ones([1, 577, 1024])] * 5
-        id_cond = torch.ones(1, 1280)
+        id_vit_hidden = [torch.ones([1, 2, 2])] * 1
+        id_cond = torch.ones(1, 2)

         videos = pipe(
             image=image,
@@ -358,5 +357,5 @@ class ConsisIDPipelineIntegrationTests(unittest.TestCase):
         video = videos[0]
         expected_video = torch.randn(1, 16, 480, 720, 3).numpy()

-        max_diff = numpy_cosine_similarity_distance(video.cpu(), expected_video)
+        max_diff = numpy_cosine_similarity_distance(video, expected_video)
         assert max_diff < 1e-3, f"Max diff is too high. got {video}"
@@ -20,14 +20,7 @@ import numpy as np
 import torch

 from diffusers import DanceDiffusionPipeline, IPNDMScheduler, UNet1DModel
-from diffusers.utils.testing_utils import (
-    backend_empty_cache,
-    enable_full_determinism,
-    nightly,
-    require_torch_accelerator,
-    skip_mps,
-    torch_device,
-)
+from diffusers.utils.testing_utils import enable_full_determinism, nightly, require_torch_gpu, skip_mps, torch_device

 from ..pipeline_params import UNCONDITIONAL_AUDIO_GENERATION_BATCH_PARAMS, UNCONDITIONAL_AUDIO_GENERATION_PARAMS
 from ..test_pipelines_common import PipelineTesterMixin
@@ -123,19 +116,19 @@ class DanceDiffusionPipelineFastTests(PipelineTesterMixin, unittest.TestCase):


 @nightly
-@require_torch_accelerator
+@require_torch_gpu
 class PipelineIntegrationTests(unittest.TestCase):
     def setUp(self):
         # clean up the VRAM before each test
         super().setUp()
         gc.collect()
-        backend_empty_cache(torch_device)
+        torch.cuda.empty_cache()

     def tearDown(self):
         # clean up the VRAM after each test
         super().tearDown()
         gc.collect()
-        backend_empty_cache(torch_device)
+        torch.cuda.empty_cache()

     def test_dance_diffusion(self):
         device = torch_device
@@ -21,15 +21,7 @@ import torch

 from diffusers import AutoencoderKL, DDIMScheduler, DiTPipeline, DiTTransformer2DModel, DPMSolverMultistepScheduler
 from diffusers.utils import is_xformers_available
-from diffusers.utils.testing_utils import (
-    backend_empty_cache,
-    enable_full_determinism,
-    load_numpy,
-    nightly,
-    numpy_cosine_similarity_distance,
-    require_torch_accelerator,
-    torch_device,
-)
+from diffusers.utils.testing_utils import enable_full_determinism, load_numpy, nightly, require_torch_gpu, torch_device

 from ..pipeline_params import (
     CLASS_CONDITIONED_IMAGE_GENERATION_BATCH_PARAMS,
@@ -115,23 +107,23 @@ class DiTPipelineFastTests(PipelineTesterMixin, unittest.TestCase):


 @nightly
-@require_torch_accelerator
+@require_torch_gpu
 class DiTPipelineIntegrationTests(unittest.TestCase):
     def setUp(self):
         super().setUp()
         gc.collect()
-        backend_empty_cache(torch_device)
+        torch.cuda.empty_cache()

     def tearDown(self):
         super().tearDown()
         gc.collect()
-        backend_empty_cache(torch_device)
+        torch.cuda.empty_cache()

     def test_dit_256(self):
         generator = torch.manual_seed(0)

         pipe = DiTPipeline.from_pretrained("facebook/DiT-XL-2-256")
-        pipe.to(torch_device)
+        pipe.to("cuda")

         words = ["vase", "umbrella", "white shark", "white wolf"]
         ids = pipe.get_label_ids(words)
@@ -147,7 +139,7 @@ class DiTPipelineIntegrationTests(unittest.TestCase):
     def test_dit_512(self):
         pipe = DiTPipeline.from_pretrained("facebook/DiT-XL-2-512")
         pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
-        pipe.to(torch_device)
+        pipe.to("cuda")

         words = ["vase", "umbrella"]
         ids = pipe.get_label_ids(words)
@@ -160,7 +152,4 @@ class DiTPipelineIntegrationTests(unittest.TestCase):
             f"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/dit/{word}_512.npy"
         )

-        expected_slice = expected_image.flatten()
-        output_slice = image.flatten()
-
-        assert numpy_cosine_similarity_distance(expected_slice, output_slice) < 1e-2
+        assert np.abs((expected_image - image).max()) < 1e-1
@@ -27,10 +27,9 @@ from diffusers import (
     FlowMatchEulerDiscreteScheduler,
 )
 from diffusers.utils.testing_utils import (
-    backend_empty_cache,
     enable_full_determinism,
     numpy_cosine_similarity_distance,
-    require_torch_accelerator,
+    require_torch_gpu,
     slow,
     torch_device,
 )
@@ -257,19 +256,19 @@ class EasyAnimatePipelineFastTests(PipelineTesterMixin, unittest.TestCase):


 @slow
-@require_torch_accelerator
+@require_torch_gpu
 class EasyAnimatePipelineIntegrationTests(unittest.TestCase):
     prompt = "A painting of a squirrel eating a burger."

     def setUp(self):
         super().setUp()
         gc.collect()
-        backend_empty_cache(torch_device)
+        torch.cuda.empty_cache()

     def tearDown(self):
         super().tearDown()
         gc.collect()
-        backend_empty_cache(torch_device)
+        torch.cuda.empty_cache()

     def test_EasyAnimate(self):
         generator = torch.Generator("cpu").manual_seed(0)
@@ -28,15 +28,13 @@ from diffusers import (
     VQModel,
 )
 from diffusers.utils.testing_utils import (
-    backend_empty_cache,
     enable_full_determinism,
     floats_tensor,
     load_image,
     load_numpy,
     nightly,
     numpy_cosine_similarity_distance,
-    require_torch_accelerator,
     torch_device,
+    require_torch_gpu,
 )

 from ..test_pipelines_common import PipelineTesterMixin
@@ -228,19 +226,19 @@ class KandinskyV22ControlnetPipelineFastTests(PipelineTesterMixin, unittest.TestCase):


 @nightly
-@require_torch_accelerator
+@require_torch_gpu
 class KandinskyV22ControlnetPipelineIntegrationTests(unittest.TestCase):
     def setUp(self):
         # clean up the VRAM before each test
         super().setUp()
         gc.collect()
-        backend_empty_cache(torch_device)
+        torch.cuda.empty_cache()

     def tearDown(self):
         # clean up the VRAM after each test
         super().tearDown()
         gc.collect()
-        backend_empty_cache(torch_device)
+        torch.cuda.empty_cache()

     def test_kandinsky_controlnet(self):
         expected_image = load_numpy(
@@ -29,15 +29,13 @@ from diffusers import (
     VQModel,
 )
 from diffusers.utils.testing_utils import (
-    backend_empty_cache,
     enable_full_determinism,
     floats_tensor,
     load_image,
     load_numpy,
     nightly,
     numpy_cosine_similarity_distance,
-    require_torch_accelerator,
     torch_device,
+    require_torch_gpu,
 )

 from ..test_pipelines_common import PipelineTesterMixin
@@ -235,19 +233,19 @@ class KandinskyV22ControlnetImg2ImgPipelineFastTests(PipelineTesterMixin, unittest.TestCase):


 @nightly
-@require_torch_accelerator
+@require_torch_gpu
 class KandinskyV22ControlnetImg2ImgPipelineIntegrationTests(unittest.TestCase):
     def setUp(self):
         # clean up the VRAM before each test
         super().setUp()
         gc.collect()
-        backend_empty_cache(torch_device)
+        torch.cuda.empty_cache()

     def tearDown(self):
         # clean up the VRAM after each test
         super().tearDown()
         gc.collect()
-        backend_empty_cache(torch_device)
+        torch.cuda.empty_cache()

     def test_kandinsky_controlnet_img2img(self):
         expected_image = load_numpy(
@@ -311,4 +309,4 @@ class KandinskyV22ControlnetImg2ImgPipelineIntegrationTests(unittest.TestCase):
         assert image.shape == (512, 512, 3)

         max_diff = numpy_cosine_similarity_distance(expected_image.flatten(), image.flatten())
-        assert max_diff < 5e-4
+        assert max_diff < 1e-4
@@ -22,11 +22,10 @@ from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer

 from diffusers import AutoencoderKL, DDIMScheduler, LDMTextToImagePipeline, UNet2DConditionModel
 from diffusers.utils.testing_utils import (
-    backend_empty_cache,
     enable_full_determinism,
     load_numpy,
     nightly,
-    require_torch_accelerator,
+    require_torch_gpu,
     torch_device,
 )
@@ -137,17 +136,17 @@ class LDMTextToImagePipelineFastTests(PipelineTesterMixin, unittest.TestCase):


 @nightly
-@require_torch_accelerator
+@require_torch_gpu
 class LDMTextToImagePipelineSlowTests(unittest.TestCase):
     def setUp(self):
         super().setUp()
         gc.collect()
-        backend_empty_cache(torch_device)
+        torch.cuda.empty_cache()

     def tearDown(self):
         super().tearDown()
         gc.collect()
-        backend_empty_cache(torch_device)
+        torch.cuda.empty_cache()

     def get_inputs(self, device, dtype=torch.float32, seed=0):
         generator = torch.manual_seed(seed)
@@ -178,17 +177,17 @@ class LDMTextToImagePipelineSlowTests(unittest.TestCase):


 @nightly
-@require_torch_accelerator
+@require_torch_gpu
 class LDMTextToImagePipelineNightlyTests(unittest.TestCase):
     def setUp(self):
         super().setUp()
         gc.collect()
-        backend_empty_cache(torch_device)
+        torch.cuda.empty_cache()

     def tearDown(self):
         super().tearDown()
         gc.collect()
-        backend_empty_cache(torch_device)
+        torch.cuda.empty_cache()

     def get_inputs(self, device, dtype=torch.float32, seed=0):
         generator = torch.manual_seed(seed)
@@ -27,8 +27,8 @@ from diffusers.utils.testing_utils import (
     enable_full_determinism,
     nightly,
     numpy_cosine_similarity_distance,
-    require_big_accelerator,
-    require_torch_accelerator,
+    require_big_gpu_with_torch_cuda,
+    require_torch_gpu,
     torch_device,
 )
@@ -266,9 +266,9 @@ class MochiPipelineFastTests(PipelineTesterMixin, FasterCacheTesterMixin, unittest.TestCase):


 @nightly
-@require_torch_accelerator
-@require_big_accelerator
-@pytest.mark.big_accelerator
+@require_torch_gpu
+@require_big_gpu_with_torch_cuda
+@pytest.mark.big_gpu_with_torch_cuda
 class MochiPipelineIntegrationTests(unittest.TestCase):
     prompt = "A painting of a squirrel eating a burger."
@@ -302,5 +302,5 @@ class MochiPipelineIntegrationTests(unittest.TestCase):
         video = videos[0]
         expected_video = torch.randn(1, 19, 480, 848, 3).numpy()

-        max_diff = numpy_cosine_similarity_distance(video.cpu(), expected_video)
+        max_diff = numpy_cosine_similarity_distance(video, expected_video)
         assert max_diff < 1e-3, f"Max diff is too high. got {video}"
@@ -39,13 +39,7 @@ from diffusers import (
     UNet2DConditionModel,
 )
 from diffusers.utils import is_xformers_available
-from diffusers.utils.testing_utils import (
-    backend_empty_cache,
-    enable_full_determinism,
-    nightly,
-    require_torch_accelerator,
-    torch_device,
-)
+from diffusers.utils.testing_utils import enable_full_determinism, nightly, require_torch_gpu, torch_device

 from ..pipeline_params import TEXT_TO_AUDIO_BATCH_PARAMS, TEXT_TO_AUDIO_PARAMS
 from ..test_pipelines_common import PipelineTesterMixin
@@ -414,17 +408,17 @@ class MusicLDMPipelineFastTests(PipelineTesterMixin, unittest.TestCase):


 @nightly
-@require_torch_accelerator
+@require_torch_gpu
 class MusicLDMPipelineNightlyTests(unittest.TestCase):
     def setUp(self):
         super().setUp()
         gc.collect()
-        backend_empty_cache(torch_device)
+        torch.cuda.empty_cache()

     def tearDown(self):
         super().tearDown()
         gc.collect()
-        backend_empty_cache(torch_device)
+        torch.cuda.empty_cache()

     def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0):
         generator = torch.Generator(device=generator_device).manual_seed(seed)
@@ -7,10 +7,8 @@ from transformers import AutoTokenizer

 from diffusers import AutoencoderKL, FlowMatchEulerDiscreteScheduler, OmniGenPipeline, OmniGenTransformer2DModel
 from diffusers.utils.testing_utils import (
-    Expectations,
-    backend_empty_cache,
     numpy_cosine_similarity_distance,
-    require_torch_accelerator,
+    require_torch_gpu,
     slow,
     torch_device,
 )
@@ -89,7 +87,7 @@ class OmniGenPipelineFastTests(unittest.TestCase, PipelineTesterMixin):


 @slow
-@require_torch_accelerator
+@require_torch_gpu
 class OmniGenPipelineSlowTests(unittest.TestCase):
     pipeline_class = OmniGenPipeline
     repo_id = "shitao/OmniGen-v1-diffusers"
@@ -97,12 +95,12 @@ class OmniGenPipelineSlowTests(unittest.TestCase):
     def setUp(self):
         super().setUp()
         gc.collect()
-        backend_empty_cache(torch_device)
+        torch.cuda.empty_cache()

     def tearDown(self):
         super().tearDown()
         gc.collect()
-        backend_empty_cache(torch_device)
+        torch.cuda.empty_cache()

     def get_inputs(self, device, seed=0):
         if str(device).startswith("mps"):
@@ -127,56 +125,21 @@ class OmniGenPipelineSlowTests(unittest.TestCase):
         image = pipe(**inputs).images[0]
         image_slice = image[0, :10, :10]

-        expected_slices = Expectations(
-            {
-                ("xpu", 3): np.array(
-                    [
-                        [0.05859375, 0.05859375, 0.04492188],
-                        [0.04882812, 0.04101562, 0.03320312],
-                        [0.04882812, 0.04296875, 0.03125],
-                        [0.04296875, 0.0390625, 0.03320312],
-                        [0.04296875, 0.03710938, 0.03125],
-                        [0.04492188, 0.0390625, 0.03320312],
-                        [0.04296875, 0.03710938, 0.03125],
-                        [0.04101562, 0.03710938, 0.02734375],
-                        [0.04101562, 0.03515625, 0.02734375],
-                        [0.04101562, 0.03515625, 0.02929688],
-                    ],
-                    dtype=np.float32,
-                ),
-                ("cuda", 7): np.array(
-                    [
-                        [0.1783447, 0.16772744, 0.14339337],
-                        [0.17066911, 0.15521264, 0.13757327],
-                        [0.17072496, 0.15531206, 0.13524258],
-                        [0.16746324, 0.1564025, 0.13794944],
-                        [0.16490817, 0.15258026, 0.13697758],
-                        [0.16971767, 0.15826806, 0.13928896],
-                        [0.16782972, 0.15547255, 0.13783783],
-                        [0.16464645, 0.15281534, 0.13522372],
-                        [0.16535294, 0.15301755, 0.13526791],
-                        [0.16365296, 0.15092957, 0.13443318],
-                    ],
-                    dtype=np.float32,
-                ),
-                ("cuda", 8): np.array(
-                    [
-                        [0.0546875, 0.05664062, 0.04296875],
-                        [0.046875, 0.04101562, 0.03320312],
-                        [0.05078125, 0.04296875, 0.03125],
-                        [0.04296875, 0.04101562, 0.03320312],
-                        [0.0390625, 0.03710938, 0.02929688],
-                        [0.04296875, 0.03710938, 0.03125],
-                        [0.0390625, 0.03710938, 0.02929688],
-                        [0.0390625, 0.03710938, 0.02734375],
-                        [0.0390625, 0.03320312, 0.02734375],
-                        [0.0390625, 0.03320312, 0.02734375],
-                    ],
-                    dtype=np.float32,
-                ),
-            }
-        )
-        expected_slice = expected_slices.get_expectation()
+        expected_slice = np.array(
+            [
+                [0.1783447, 0.16772744, 0.14339337],
+                [0.17066911, 0.15521264, 0.13757327],
+                [0.17072496, 0.15531206, 0.13524258],
+                [0.16746324, 0.1564025, 0.13794944],
+                [0.16490817, 0.15258026, 0.13697758],
+                [0.16971767, 0.15826806, 0.13928896],
+                [0.16782972, 0.15547255, 0.13783783],
+                [0.16464645, 0.15281534, 0.13522372],
+                [0.16535294, 0.15301755, 0.13526791],
+                [0.16365296, 0.15092957, 0.13443318],
+            ],
+            dtype=np.float32,
+        )

         max_diff = numpy_cosine_similarity_distance(expected_slice.flatten(), image_slice.flatten())
@@ -25,12 +25,11 @@ from transformers import CLIPImageProcessor, CLIPVisionConfig
 from diffusers import AutoencoderKL, PaintByExamplePipeline, PNDMScheduler, UNet2DConditionModel
 from diffusers.pipelines.paint_by_example import PaintByExampleImageEncoder
 from diffusers.utils.testing_utils import (
-    backend_empty_cache,
     enable_full_determinism,
     floats_tensor,
     load_image,
     nightly,
-    require_torch_accelerator,
+    require_torch_gpu,
     torch_device,
 )
@@ -175,19 +174,19 @@ class PaintByExamplePipelineFastTests(PipelineTesterMixin, unittest.TestCase):


 @nightly
-@require_torch_accelerator
+@require_torch_gpu
 class PaintByExamplePipelineIntegrationTests(unittest.TestCase):
     def setUp(self):
         # clean up the VRAM before each test
         super().setUp()
         gc.collect()
-        backend_empty_cache(torch_device)
+        torch.cuda.empty_cache()

     def tearDown(self):
         # clean up the VRAM after each test
         super().tearDown()
         gc.collect()
-        backend_empty_cache(torch_device)
+        torch.cuda.empty_cache()

     def test_paint_by_example(self):
         # make sure here that pndm scheduler skips prk
@@ -21,13 +21,7 @@ from transformers import CLIPTextConfig, CLIPTextModelWithProjection, CLIPTokenizer

 from diffusers import HeunDiscreteScheduler, PriorTransformer, ShapEPipeline
 from diffusers.pipelines.shap_e import ShapERenderer
-from diffusers.utils.testing_utils import (
-    backend_empty_cache,
-    load_numpy,
-    nightly,
-    require_torch_accelerator,
-    torch_device,
-)
+from diffusers.utils.testing_utils import load_numpy, nightly, require_torch_gpu, torch_device

 from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference
@@ -228,19 +222,19 @@ class ShapEPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
|
||||
|
||||
|
||||
@nightly
|
||||
@require_torch_accelerator
|
||||
@require_torch_gpu
|
||||
class ShapEPipelineIntegrationTests(unittest.TestCase):
|
||||
def setUp(self):
|
||||
# clean up the VRAM before each test
|
||||
super().setUp()
|
||||
gc.collect()
|
||||
backend_empty_cache(torch_device)
|
||||
torch.cuda.empty_cache()
|
||||
|
||||
def tearDown(self):
|
||||
# clean up the VRAM after each test
|
||||
super().tearDown()
|
||||
gc.collect()
|
||||
backend_empty_cache(torch_device)
|
||||
torch.cuda.empty_cache()
|
||||
|
||||
def test_shap_e(self):
|
||||
expected_image = load_numpy(
|
||||
|
||||
@@ -23,12 +23,11 @@ from transformers import CLIPImageProcessor, CLIPVisionConfig, CLIPVisionModel
from diffusers import HeunDiscreteScheduler, PriorTransformer, ShapEImg2ImgPipeline
from diffusers.pipelines.shap_e import ShapERenderer
from diffusers.utils.testing_utils import (
    backend_empty_cache,
    floats_tensor,
    load_image,
    load_numpy,
    nightly,
    require_torch_accelerator,
    require_torch_gpu,
    torch_device,
)

@@ -251,19 +250,19 @@ class ShapEImg2ImgPipelineFastTests(PipelineTesterMixin, unittest.TestCase):


@nightly
@require_torch_accelerator
@require_torch_gpu
class ShapEImg2ImgPipelineIntegrationTests(unittest.TestCase):
    def setUp(self):
        # clean up the VRAM before each test
        super().setUp()
        gc.collect()
        backend_empty_cache(torch_device)
        torch.cuda.empty_cache()

    def tearDown(self):
        # clean up the VRAM after each test
        super().tearDown()
        gc.collect()
        backend_empty_cache(torch_device)
        torch.cuda.empty_cache()

    def test_shap_e_img2img(self):
        input_image = load_image(
@@ -32,14 +32,7 @@ from diffusers import (
    StableAudioProjectionModel,
)
from diffusers.utils import is_xformers_available
from diffusers.utils.testing_utils import (
    Expectations,
    backend_empty_cache,
    enable_full_determinism,
    nightly,
    require_torch_accelerator,
    torch_device,
)
from diffusers.utils.testing_utils import enable_full_determinism, nightly, require_torch_gpu, torch_device

from ..pipeline_params import TEXT_TO_AUDIO_BATCH_PARAMS
from ..test_pipelines_common import PipelineTesterMixin
@@ -426,17 +419,17 @@ class StableAudioPipelineFastTests(PipelineTesterMixin, unittest.TestCase):


@nightly
@require_torch_accelerator
@require_torch_gpu
class StableAudioPipelineIntegrationTests(unittest.TestCase):
    def setUp(self):
        super().setUp()
        gc.collect()
        backend_empty_cache(torch_device)
        torch.cuda.empty_cache()

    def tearDown(self):
        super().tearDown()
        gc.collect()
        backend_empty_cache(torch_device)
        torch.cuda.empty_cache()

    def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0):
        generator = torch.Generator(device=generator_device).manual_seed(seed)
@@ -466,15 +459,9 @@ class StableAudioPipelineIntegrationTests(unittest.TestCase):
        # check the portion of the generated audio with the largest dynamic range (reduces flakiness)
        audio_slice = audio[0, 447590:447600]
        # fmt: off
        expected_slices = Expectations(
            {
                ("xpu", 3): np.array([-0.0285, 0.1083, 0.1863, 0.3165, 0.5312, 0.6971, 0.6958, 0.6177, 0.5598, 0.5048]),
                ("cuda", 7): np.array([-0.0278, 0.1096, 0.1877, 0.3178, 0.5329, 0.6990, 0.6972, 0.6186, 0.5608, 0.5060]),
                ("cuda", 8): np.array([-0.0285, 0.1082, 0.1862, 0.3163, 0.5306, 0.6964, 0.6953, 0.6172, 0.5593, 0.5044]),
            }
        expected_slice = np.array(
            [-0.0278, 0.1096, 0.1877, 0.3178, 0.5329, 0.6990, 0.6972, 0.6186, 0.5608, 0.5060]
        )
        # fmt: on

        expected_slice = expected_slices.get_expectation()
        # fmt: on
        max_diff = np.abs(expected_slice - audio_slice.detach().cpu().numpy()).max()
        assert max_diff < 1.5e-3
@@ -389,7 +389,7 @@ class BnB4BitBasicTests(Base4bitTests):
class BnB4BitTrainingTests(Base4bitTests):
    def setUp(self):
        gc.collect()
        backend_empty_cache(torch_device)
        torch.cuda.empty_cache()

        nf4_config = BitsAndBytesConfig(
            load_in_4bit=True,
@@ -657,7 +657,7 @@ class SlowBnb4BitTests(Base4bitTests):
class SlowBnb4BitFluxTests(Base4bitTests):
    def setUp(self) -> None:
        gc.collect()
        backend_empty_cache(torch_device)
        torch.cuda.empty_cache()

        model_id = "hf-internal-testing/flux.1-dev-nf4-pkg"
        t5_4bit = T5EncoderModel.from_pretrained(model_id, subfolder="text_encoder_2")
@@ -674,7 +674,7 @@ class SlowBnb4BitFluxTests(Base4bitTests):
        del self.pipeline_4bit

        gc.collect()
        backend_empty_cache(torch_device)
        torch.cuda.empty_cache()

    def test_quality(self):
        # keep the resolution and max tokens to a lower number for faster execution.
@@ -722,7 +722,7 @@ class SlowBnb4BitFluxTests(Base4bitTests):
class SlowBnb4BitFluxControlWithLoraTests(Base4bitTests):
    def setUp(self) -> None:
        gc.collect()
        backend_empty_cache(torch_device)
        torch.cuda.empty_cache()

        self.pipeline_4bit = FluxControlPipeline.from_pretrained("eramth/flux-4bit", torch_dtype=torch.float16)
        self.pipeline_4bit.enable_model_cpu_offload()
@@ -731,7 +731,7 @@ class SlowBnb4BitFluxControlWithLoraTests(Base4bitTests):
        del self.pipeline_4bit

        gc.collect()
        backend_empty_cache(torch_device)
        torch.cuda.empty_cache()

    def test_lora_loading(self):
        self.pipeline_4bit.load_lora_weights("black-forest-labs/FLUX.1-Canny-dev-lora")
@@ -12,7 +12,6 @@ from diffusers import (
    FluxPipeline,
    FluxTransformer2DModel,
    GGUFQuantizationConfig,
    HiDreamImageTransformer2DModel,
    SD3Transformer2DModel,
    StableDiffusion3Pipeline,
)
@@ -550,30 +549,3 @@ class FluxControlLoRAGGUFTests(unittest.TestCase):

        max_diff = numpy_cosine_similarity_distance(expected_slice, out_slice)
        self.assertTrue(max_diff < 1e-3)

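`numpy_cosine_similarity_distance` compares slices by direction rather than element-wise difference, which tolerates small uniform scaling differences across hardware. A sketch of that metric under its natural definition (1 minus cosine similarity); the real helper's exact edge-case handling is assumed, not confirmed:

```python
# Sketch of a cosine-similarity distance between flattened slices: 0.0 for
# identical direction, approaching 2.0 for opposite vectors. The actual
# diffusers.utils.testing_utils helper is assumed to behave similarly.
import numpy as np


def cosine_similarity_distance(a: np.ndarray, b: np.ndarray) -> float:
    a, b = a.flatten().astype(np.float64), b.flatten().astype(np.float64)
    cos = np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-12)
    return float(1.0 - cos)
```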
class HiDreamGGUFSingleFileTests(GGUFSingleFileTesterMixin, unittest.TestCase):
    ckpt_path = "https://huggingface.co/city96/HiDream-I1-Dev-gguf/blob/main/hidream-i1-dev-Q2_K.gguf"
    torch_dtype = torch.bfloat16
    model_cls = HiDreamImageTransformer2DModel
    expected_memory_use_in_gb = 8

    def get_dummy_inputs(self):
        return {
            "hidden_states": torch.randn((1, 16, 128, 128), generator=torch.Generator("cpu").manual_seed(0)).to(
                torch_device, self.torch_dtype
            ),
            "encoder_hidden_states_t5": torch.randn(
                (1, 128, 4096),
                generator=torch.Generator("cpu").manual_seed(0),
            ).to(torch_device, self.torch_dtype),
            "encoder_hidden_states_llama3": torch.randn(
                (32, 1, 128, 4096),
                generator=torch.Generator("cpu").manual_seed(0),
            ).to(torch_device, self.torch_dtype),
            "pooled_embeds": torch.randn(
                (1, 2048),
                generator=torch.Generator("cpu").manual_seed(0),
            ).to(torch_device, self.torch_dtype),
            "timesteps": torch.tensor([1]).to(torch_device, self.torch_dtype),
        }
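`get_dummy_inputs` above rebuilds every tensor from a fresh CPU `torch.Generator` seeded with 0, so the dummy batch is bit-identical across runs before it is moved to the accelerator. The repeated pattern can be condensed into a small helper; `seeded_randn` is a hypothetical name for illustration, not diffusers API:

```python
# Hypothetical helper condensing the seeded-tensor pattern used above:
# sample on CPU with a fixed seed for reproducibility, then cast and move.
import torch


def seeded_randn(shape, device, dtype, seed: int = 0) -> torch.Tensor:
    generator = torch.Generator("cpu").manual_seed(seed)
    return torch.randn(shape, generator=generator).to(device, dtype)


# e.g. the Llama-3 embedding stand-in from the removed test:
# encoder_hidden_states_llama3 = seeded_randn((32, 1, 128, 4096), "cuda", torch.bfloat16)
```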
@@ -34,24 +34,13 @@ try:

    print("Torch version:", torch.__version__)
    print("Cuda available:", torch.cuda.is_available())
    print("Cuda version:", torch.version.cuda)
    print("CuDNN version:", torch.backends.cudnn.version())
    print("Number of GPUs available:", torch.cuda.device_count())
    if torch.cuda.is_available():
        print("Cuda version:", torch.version.cuda)
        print("CuDNN version:", torch.backends.cudnn.version())
        print("Number of GPUs available:", torch.cuda.device_count())
        device_properties = torch.cuda.get_device_properties(0)
        total_memory = device_properties.total_memory / (1024**3)
        print(f"CUDA memory: {total_memory} GB")

    print("XPU available:", hasattr(torch, "xpu") and torch.xpu.is_available())
    if hasattr(torch, "xpu") and torch.xpu.is_available():
        print("XPU model:", torch.xpu.get_device_properties(0).name)
        print("XPU compiler version:", torch.version.xpu)
        print("Number of XPUs available:", torch.xpu.device_count())
        device_properties = torch.xpu.get_device_properties(0)
        total_memory = device_properties.total_memory / (1024**3)
        print(f"XPU memory: {total_memory} GB")

except ImportError:
    print("Torch version:", None)
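The new guards above only probe CUDA and XPU attributes when the corresponding runtime is actually present, so the environment printout no longer raises on CPU-only machines. The same checks can be folded into one backend-agnostic summary; this condensed form is illustrative, not the script's actual code:

```python
# Illustrative condensation of the guarded introspection above; not part of
# the diffusers environment script itself.
import torch


def describe_accelerator() -> str:
    if torch.cuda.is_available():
        props = torch.cuda.get_device_properties(0)
        return f"cuda: {props.name}, {props.total_memory / 1024**3:.1f} GB, {torch.cuda.device_count()} device(s)"
    if hasattr(torch, "xpu") and torch.xpu.is_available():
        props = torch.xpu.get_device_properties(0)
        return f"xpu: {props.name}, {props.total_memory / 1024**3:.1f} GB, {torch.xpu.device_count()} device(s)"
    return "cpu only"


print(describe_accelerator())
```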