Compare commits


20 Commits

Author SHA1 Message Date
Sayak Paul
51a855c8c6 Merge branch 'main' into layerwise-upcasting 2024-08-19 14:22:12 +05:30
Dhruv Nair
940b8e0358 [CI] Multiple Slow Test fixes. (#9198)
* update

* update

* update

* update
2024-08-19 13:31:09 +05:30
Dhruv Nair
b2add10d13 Update is_safetensors_compatible check (#8991)
* update

* update

* update

* update

* update
2024-08-19 11:35:22 +05:30
Wenlong Wu
815d882217 Add loading text inversion (#9130) 2024-08-19 09:26:27 +05:30
Sayak Paul
c64fa22c08 Merge branch 'main' into layerwise-upcasting 2024-08-19 09:13:18 +05:30
M Saqlain
ba4348d9a7 [Tests] Improve transformers model test suite coverage - Lumina (#8987)
* Added test suite for lumina

* Fixed failing tests

* Improved code quality

* Added function docstrings

* Improved formatting
2024-08-19 08:29:03 +05:30
townwish4git
d25eb5d385 fix(sd3): fix deletion of text_encoders etc (#8951)
Co-authored-by: Sayak Paul <spsayakpaul@gmail.com>
Co-authored-by: YiYi Xu <yixu310@gmail.com>
2024-08-18 15:37:40 -10:00
Tolga Cangöz
7ef8a46523 [Docs] Fix CPU offloading usage (#9207)
* chore: Fix cpu offloading usage

* Trim trailing white space

* docs: update Kolors model link in kolors.md
2024-08-18 13:12:12 -10:00
Sayak Paul
f848febacd feat: allow sharding for auraflow. (#8853) 2024-08-18 08:47:26 +05:30
Beinsezii
b38255006a Add Lumina T2I Auto Pipe Mapping (#8962) 2024-08-16 23:14:17 -10:00
Jianqi Pan
cba548d8a3 fix(pipeline): k sampler sigmas device (#9189)
If Karras is not enabled, a device inconsistency error will occur. This is due to the fact that sigmas were not moved to the specified device.
2024-08-16 22:43:42 -10:00
Álvaro Somoza
db829a4be4 [IP Adapter] Fix object has no attribute with image encoder (#9194)
* fix

* apply suggestion
2024-08-17 02:00:04 -04:00
Sayak Paul
0d1a1f875a Merge branch 'main' into layerwise-upcasting 2024-08-16 14:15:15 +05:30
Sayak Paul
e780c05cc3 [Chore] add set_default_attn_processor to pixart. (#9196)
add set_default_attn_processor to pixart.
2024-08-16 13:07:06 +05:30
Sayak Paul
f1fa1235e4 Merge branch 'main' into layerwise-upcasting 2024-08-16 09:48:53 +05:30
Sayak Paul
9b411e5ff3 Merge branch 'main' into layerwise-upcasting 2024-08-15 10:34:40 +05:30
Dhruv Nair
b366b22191 update 2024-08-14 14:50:18 +02:00
Dhruv Nair
1fdae85f49 update 2024-08-14 14:19:20 +02:00
Dhruv Nair
6b9fd0905e update 2024-08-14 08:22:43 +02:00
Dhruv Nair
be55fa631f update 2024-08-13 14:11:47 +02:00
44 changed files with 538 additions and 309 deletions

View File

@@ -290,118 +290,64 @@ jobs:
pip install slack_sdk tabulate
python utils/log_reports.py >> $GITHUB_STEP_SUMMARY
# M1 runner currently not well supported
# TODO: (Dhruv) add these back when we setup better testing for Apple Silicon
# run_nightly_tests_apple_m1:
# name: Nightly PyTorch MPS tests on MacOS
# runs-on: [ self-hosted, apple-m1 ]
# if: github.event_name == 'schedule'
#
# steps:
# - name: Checkout diffusers
# uses: actions/checkout@v3
# with:
# fetch-depth: 2
#
# - name: Clean checkout
# shell: arch -arch arm64 bash {0}
# run: |
# git clean -fxd
# - name: Setup miniconda
# uses: ./.github/actions/setup-miniconda
# with:
# python-version: 3.9
#
# - name: Install dependencies
# shell: arch -arch arm64 bash {0}
# run: |
# ${CONDA_RUN} python -m pip install --upgrade pip uv
# ${CONDA_RUN} python -m uv pip install -e [quality,test]
# ${CONDA_RUN} python -m uv pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cpu
# ${CONDA_RUN} python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate
# ${CONDA_RUN} python -m uv pip install pytest-reportlog
# - name: Environment
# shell: arch -arch arm64 bash {0}
# run: |
# ${CONDA_RUN} python utils/print_env.py
# - name: Run nightly PyTorch tests on M1 (MPS)
# shell: arch -arch arm64 bash {0}
# env:
# HF_HOME: /System/Volumes/Data/mnt/cache
# HF_TOKEN: ${{ secrets.HF_TOKEN }}
# run: |
# ${CONDA_RUN} python -m pytest -n 1 -s -v --make-reports=tests_torch_mps \
# --report-log=tests_torch_mps.log \
# tests/
# - name: Failure short reports
# if: ${{ failure() }}
# run: cat reports/tests_torch_mps_failures_short.txt
#
# - name: Test suite reports artifacts
# if: ${{ always() }}
# uses: actions/upload-artifact@v2
# with:
# name: torch_mps_test_reports
# path: reports
#
# - name: Generate Report and Notify Channel
# if: always()
# run: |
# pip install slack_sdk tabulate
# python utils/log_reports.py >> $GITHUB_STEP_SUMMARY
# run_nightly_tests_apple_m1:
# name: Nightly PyTorch MPS tests on MacOS
# runs-on: [ self-hosted, apple-m1 ]
# if: github.event_name == 'schedule'
#
# steps:
# - name: Checkout diffusers
# uses: actions/checkout@v3
# with:
# fetch-depth: 2
#
# - name: Clean checkout
# shell: arch -arch arm64 bash {0}
# run: |
# git clean -fxd
# - name: Setup miniconda
# uses: ./.github/actions/setup-miniconda
# with:
# python-version: 3.9
#
# - name: Install dependencies
# shell: arch -arch arm64 bash {0}
# run: |
# ${CONDA_RUN} python -m pip install --upgrade pip uv
# ${CONDA_RUN} python -m uv pip install -e [quality,test]
# ${CONDA_RUN} python -m uv pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cpu
# ${CONDA_RUN} python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate
# ${CONDA_RUN} python -m uv pip install pytest-reportlog
# - name: Environment
# shell: arch -arch arm64 bash {0}
# run: |
# ${CONDA_RUN} python utils/print_env.py
# - name: Run nightly PyTorch tests on M1 (MPS)
# shell: arch -arch arm64 bash {0}
# env:
# HF_HOME: /System/Volumes/Data/mnt/cache
# HF_TOKEN: ${{ secrets.HF_TOKEN }}
# run: |
# ${CONDA_RUN} python -m pytest -n 1 -s -v --make-reports=tests_torch_mps \
# --report-log=tests_torch_mps.log \
# tests/
# - name: Failure short reports
# if: ${{ failure() }}
# run: cat reports/tests_torch_mps_failures_short.txt
#
# - name: Test suite reports artifacts
# if: ${{ always() }}
# uses: actions/upload-artifact@v2
# with:
# name: torch_mps_test_reports
# path: reports
#
# - name: Generate Report and Notify Channel
# if: always()
# run: |
# pip install slack_sdk tabulate
# python utils/log_reports.py >> $GITHUB_STEP_SUMMARY
run_nightly_tests_apple_m1:
name: Nightly PyTorch MPS tests on MacOS
runs-on: [ self-hosted, apple-m1 ]
if: github.event_name == 'schedule'
steps:
- name: Checkout diffusers
uses: actions/checkout@v3
with:
fetch-depth: 2
- name: Clean checkout
shell: arch -arch arm64 bash {0}
run: |
git clean -fxd
- name: Setup miniconda
uses: ./.github/actions/setup-miniconda
with:
python-version: 3.9
- name: Install dependencies
shell: arch -arch arm64 bash {0}
run: |
${CONDA_RUN} python -m pip install --upgrade pip uv
${CONDA_RUN} python -m uv pip install -e [quality,test]
${CONDA_RUN} python -m uv pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cpu
${CONDA_RUN} python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate
${CONDA_RUN} python -m uv pip install pytest-reportlog
- name: Environment
shell: arch -arch arm64 bash {0}
run: |
${CONDA_RUN} python utils/print_env.py
- name: Run nightly PyTorch tests on M1 (MPS)
shell: arch -arch arm64 bash {0}
env:
HF_HOME: /System/Volumes/Data/mnt/cache
HF_TOKEN: ${{ secrets.HF_TOKEN }}
run: |
${CONDA_RUN} python -m pytest -n 1 -s -v --make-reports=tests_torch_mps \
--report-log=tests_torch_mps.log \
tests/
- name: Failure short reports
if: ${{ failure() }}
run: cat reports/tests_torch_mps_failures_short.txt
- name: Test suite reports artifacts
if: ${{ always() }}
uses: actions/upload-artifact@v2
with:
name: torch_mps_test_reports
path: reports
- name: Generate Report and Notify Channel
if: always()
run: |
pip install slack_sdk tabulate
python utils/log_reports.py >> $GITHUB_STEP_SUMMARY

View File

@@ -22,7 +22,7 @@ The abstract from the paper is:
*We present ControlNet, a neural network architecture to add spatial conditioning controls to large, pretrained text-to-image diffusion models. ControlNet locks the production-ready large diffusion models, and reuses their deep and robust encoding layers pretrained with billions of images as a strong backbone to learn a diverse set of conditional controls. The neural architecture is connected with "zero convolutions" (zero-initialized convolution layers) that progressively grow the parameters from zero and ensure that no harmful noise could affect the finetuning. We test various conditioning controls, eg, edges, depth, segmentation, human pose, etc, with Stable Diffusion, using single or multiple conditions, with or without prompts. We show that the training of ControlNets is robust with small (<50k) and large (>1m) datasets. Extensive results show that ControlNet may facilitate wider applications to control image diffusion models.*
This controlnet code is mainly implemented by [The InstantX Team](https://huggingface.co/InstantX). The inpainting-related code was developed by [The Alimama Creative Team](https://huggingface.co/alimama-creative). You can find pre-trained checkpoints for SD3-ControlNet in the table below:
This controlnet code is mainly implemented by [The InstantX Team](https://huggingface.co/InstantX). The inpainting-related code was developed by [The Alimama Creative Team](https://huggingface.co/alimama-creative). You can find pre-trained checkpoints for SD3-ControlNet in the table below:
| ControlNet type | Developer | Link |

View File

@@ -14,7 +14,7 @@ specific language governing permissions and limitations under the License.
![](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/kolors/kolors_header_collage.png)
Kolors is a large-scale text-to-image generation model based on latent diffusion, developed by [the Kuaishou Kolors team](kwai-kolors@kuaishou.com). Trained on billions of text-image pairs, Kolors exhibits significant advantages over both open-source and closed-source models in visual quality, complex semantic accuracy, and text rendering for both Chinese and English characters. Furthermore, Kolors supports both Chinese and English inputs, demonstrating strong performance in understanding and generating Chinese-specific content. For more details, please refer to this [technical report](https://github.com/Kwai-Kolors/Kolors/blob/master/imgs/Kolors_paper.pdf).
Kolors is a large-scale text-to-image generation model based on latent diffusion, developed by [the Kuaishou Kolors team](https://github.com/Kwai-Kolors/Kolors). Trained on billions of text-image pairs, Kolors exhibits significant advantages over both open-source and closed-source models in visual quality, complex semantic accuracy, and text rendering for both Chinese and English characters. Furthermore, Kolors supports both Chinese and English inputs, demonstrating strong performance in understanding and generating Chinese-specific content. For more details, please refer to this [technical report](https://github.com/Kwai-Kolors/Kolors/blob/master/imgs/Kolors_paper.pdf).
The abstract from the technical report is:
@@ -74,7 +74,7 @@ image_encoder = CLIPVisionModelWithProjection.from_pretrained(
pipe = KolorsPipeline.from_pretrained(
"Kwai-Kolors/Kolors-diffusers", image_encoder=image_encoder, torch_dtype=torch.float16, variant="fp16"
).to("cuda")
)
pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config, use_karras_sigmas=True)
pipe.load_ip_adapter(

View File

@@ -20,7 +20,7 @@ The abstract from the paper is:
*Recent studies have demonstrated that diffusion models are capable of generating high-quality samples, but their quality heavily depends on sampling guidance techniques, such as classifier guidance (CG) and classifier-free guidance (CFG). These techniques are often not applicable in unconditional generation or in various downstream tasks such as image restoration. In this paper, we propose a novel sampling guidance, called Perturbed-Attention Guidance (PAG), which improves diffusion sample quality across both unconditional and conditional settings, achieving this without requiring additional training or the integration of external modules. PAG is designed to progressively enhance the structure of samples throughout the denoising process. It involves generating intermediate samples with degraded structure by substituting selected self-attention maps in diffusion U-Net with an identity matrix, by considering the self-attention mechanisms' ability to capture structural information, and guiding the denoising process away from these degraded samples. In both ADM and Stable Diffusion, PAG surprisingly improves sample quality in conditional and even unconditional scenarios. Moreover, PAG significantly improves the baseline performance in various downstream tasks where existing guidances such as CG or CFG cannot be fully utilized, including ControlNet with empty prompts and image restoration such as inpainting and deblurring.*
PAG can be used by specifying the `pag_applied_layers` as a parameter when instantiating a PAG pipeline. It can be a single string or a list of strings. Each string can be a unique layer identifier or a regular expression to identify one or more layers.
PAG can be used by specifying the `pag_applied_layers` as a parameter when instantiating a PAG pipeline. It can be a single string or a list of strings. Each string can be a unique layer identifier or a regular expression to identify one or more layers.
- Full identifier as a normal string: `down_blocks.2.attentions.0.transformer_blocks.0.attn1.processor`
- Full identifier as a RegEx: `down_blocks.2.(attentions|motion_modules).0.transformer_blocks.0.attn1.processor`
@@ -46,7 +46,7 @@ Since RegEx is supported as a way for matching layer identifiers, it is crucial
## KolorsPAGPipeline
[[autodoc]] KolorsPAGPipeline
- all
- __call__
- __call__
## StableDiffusionPAGPipeline
[[autodoc]] StableDiffusionPAGPipeline

View File

@@ -3,17 +3,17 @@
[DreamBooth](https://arxiv.org/abs/2208.12242) is a method to personalize text2image models like stable diffusion given just a few (3~5) images of a subject.
The `train_dreambooth_flux.py` script shows how to implement the training procedure and adapt it for [FLUX.1 [dev]](https://blackforestlabs.ai/announcing-black-forest-labs/). We also provide a LoRA implementation in the `train_dreambooth_lora_flux.py` script.
> [!NOTE]
> [!NOTE]
> **Memory consumption**
>
> Flux can be quite expensive to run on consumer hardware devices and as a result finetuning it comes with high memory requirements -
>
> Flux can be quite expensive to run on consumer hardware devices and as a result finetuning it comes with high memory requirements -
> a LoRA with a rank of 16 (w/ all components trained) can exceed 40GB of VRAM for training.
> For more tips & guidance on training on a resource-constrained device please visit [`@bghira`'s guide](https://github.com/bghira/SimpleTuner/blob/main/documentation/quickstart/FLUX.md)
> For more tips & guidance on training on a resource-constrained device please visit [`@bghira`'s guide](https://github.com/bghira/SimpleTuner/blob/main/documentation/quickstart/FLUX.md)
> [!NOTE]
> **Gated model**
>
>
> As the model is gated, before using it with diffusers you first need to go to the [FLUX.1 [dev] Hugging Face page](https://huggingface.co/black-forest-labs/FLUX.1-dev), fill in the form and accept the gate. Once you are in, you need to log in so that your system knows you've accepted the gate. Use the command below to log in:
```bash
@@ -163,7 +163,7 @@ To do so, just specify `--train_text_encoder` while launching training. Please k
> [!NOTE]
> FLUX.1 has 2 text encoders (CLIP L/14 and T5-v1.1-XXL).
By enabling `--train_text_encoder`, fine-tuning of the **CLIP encoder** is performed.
By enabling `--train_text_encoder`, fine-tuning of the **CLIP encoder** is performed.
> At the moment, T5 fine-tuning is not supported and weights remain frozen when text encoder training is enabled.
To perform DreamBooth LoRA with text-encoder training, run:

View File

@@ -1454,7 +1454,7 @@ def main(args):
)
# Clear the memory here
if not args.train_text_encoder and train_dataset.custom_instance_prompts:
if not args.train_text_encoder and not train_dataset.custom_instance_prompts:
del tokenizers, text_encoders
# Explicitly delete the objects as well, otherwise only the lists are deleted and the original references remain, preventing garbage collection
del text_encoder_one, text_encoder_two, text_encoder_three
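The comment in this hunk leans on a general Python behaviour that is easy to miss: deleting the list that holds the encoders does not free them while other names still reference them. A minimal, script-independent illustration (toy names, not taken from the training script):

```python
import gc

import torch

big = torch.zeros(1024, 1024)  # stands in for a text encoder's weights
text_encoders = [big]          # the list built earlier in the script

del text_encoders              # only the list object goes away; `big` still holds a reference
del big                        # now the last reference is dropped and the memory can be reclaimed
gc.collect()
```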

View File

@@ -109,6 +109,9 @@ import torch
model_id = "path-to-your-trained-model"
pipe = StableDiffusionPipeline.from_pretrained(model_id,torch_dtype=torch.float16).to("cuda")
repo_id_embeds = "path-to-your-learned-embeds"
pipe.load_textual_inversion(repo_id_embeds)
prompt = "A <cat-toy> backpack"
image = pipe(prompt, num_inference_steps=50, guidance_scale=7.5).images[0]

View File

@@ -222,7 +222,11 @@ class IPAdapterMixin:
# create feature extractor if it has not been registered to the pipeline yet
if hasattr(self, "feature_extractor") and getattr(self, "feature_extractor", None) is None:
clip_image_size = self.image_encoder.config.image_size
# FaceID IP adapters don't need the image encoder, so it's not present; in this case we default to 224
default_clip_size = 224
clip_image_size = (
self.image_encoder.config.image_size if self.image_encoder is not None else default_clip_size
)
feature_extractor = CLIPImageProcessor(size=clip_image_size, crop_size=clip_image_size)
self.register_modules(feature_extractor=feature_extractor)

View File

@@ -449,7 +449,7 @@ class BasicTransformerBlock(nn.Module):
norm_hidden_states = self.norm1(hidden_states, added_cond_kwargs["pooled_text_emb"])
elif self.norm_type == "ada_norm_single":
shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = (
self.scale_shift_table[None] + timestep.reshape(batch_size, 6, -1)
self.scale_shift_table[None].to(timestep.dtype) + timestep.reshape(batch_size, 6, -1)
).chunk(6, dim=1)
norm_hidden_states = self.norm1(hidden_states)
norm_hidden_states = norm_hidden_states * (1 + scale_msa) + shift_msa

View File

@@ -60,6 +60,8 @@ class AsymmetricAutoencoderKL(ModelMixin, ConfigMixin):
Synthesis with Latent Diffusion Models](https://arxiv.org/abs/2112.10752) paper.
"""
_always_upcast_modules = ["MaskConditionDecoder"]
@register_to_config
def __init__(
self,

View File

@@ -70,6 +70,7 @@ class AutoencoderKL(ModelMixin, ConfigMixin, FromOriginalModelMixin):
_supports_gradient_checkpointing = True
_no_split_modules = ["BasicTransformerBlock", "ResnetBlock2D"]
_always_upcast_modules = ["Decoder"]
@register_to_config
def __init__(

View File

@@ -192,6 +192,7 @@ class AutoencoderKLTemporalDecoder(ModelMixin, ConfigMixin):
"""
_supports_gradient_checkpointing = True
_always_upcast_modules = ["TemporalDecoder"]
@register_to_config
def __init__(

View File

@@ -317,6 +317,7 @@ class AutoencoderOobleck(ModelMixin, ConfigMixin):
"""
_supports_gradient_checkpointing = False
_always_upcast_modules = ["OobleckEncoder", "OobleckDecoder"]
@register_to_config
def __init__(

View File

@@ -330,7 +330,7 @@ class ConsistencyDecoderVAE(ModelMixin, ConfigMixin):
Union[DecoderOutput, Tuple[torch.Tensor]]: The decoded output.
"""
z = (z * self.config.scaling_factor - self.means) / self.stds
z = (z * self.config.scaling_factor - self.means.to(z.dtype)) / self.stds.to(z.dtype)
scale_factor = 2 ** (len(self.config.block_out_channels) - 1)
z = F.interpolate(z, mode="nearest", scale_factor=scale_factor)

View File

@@ -71,6 +71,8 @@ class VQModel(ModelMixin, ConfigMixin):
Type of normalization layer to use. Can be one of `"group"` or `"spatial"`.
"""
_always_upcast_modules = ["Decoder", "VectorQuantizer"]
@register_to_config
def __init__(
self,

View File

@@ -263,6 +263,80 @@ class ModelMixin(torch.nn.Module, PushToHubMixin):
"""
self.set_use_memory_efficient_attention_xformers(False)
def enable_layerwise_upcasting(self, upcast_dtype=None):
r"""
Enable layerwise dynamic upcasting. This allows a model to be loaded onto the GPU in a low-memory dtype, e.g.
torch.float8_e4m3fn, while inference runs in a dtype supported by the GPU, by upcasting the
individual modules in the model to the appropriate dtype right before the forward pass.
The module is then moved back to the low-memory dtype after the forward pass.
"""
upcast_dtype = upcast_dtype or torch.float32
original_dtype = self.dtype
def upcast_dtype_hook_fn(module, *args, **kwargs):
module = module.to(upcast_dtype)
def cast_to_original_dtype_hook_fn(module, *args, **kwargs):
module = module.to(original_dtype)
def fn_recursive_upcast(module):
"""In certain cases modules will apply casting internally or reference the dtype of internal blocks.
e.g.
```
class MyModel(nn.Module):
def forward(self, x):
dtype = next(iter(self.blocks.parameters())).dtype
x = self.blocks(x) + torch.ones(x.size()).to(dtype)
return x
```
Layerwise upcasting will not work here, since the internal blocks remain in the low memory dtype until
their `forward` method is called. We need to add the upcast hook on the entire module in order for the
operation to work.
The `_always_upcast_modules` class attribute is a list of modules within the model that we must upcast
entirely, rather than layerwise.
"""
if hasattr(self, "_always_upcast_modules") and module.__class__.__name__ in self._always_upcast_modules:
# Upcast entire module and exit recursion
module.register_forward_pre_hook(upcast_dtype_hook_fn)
module.register_forward_hook(cast_to_original_dtype_hook_fn)
return
has_children = list(module.children())
if not has_children:
module.register_forward_pre_hook(upcast_dtype_hook_fn)
module.register_forward_hook(cast_to_original_dtype_hook_fn)
for child in module.children():
fn_recursive_upcast(child)
for module in self.children():
fn_recursive_upcast(module)
def disable_layerwise_upcasting(self):
def fn_recursive_upcast(module):
if hasattr(self, "_always_upcast_modules") and module.__class__.__name__ in self._always_upcast_modules:
module._forward_pre_hooks = OrderedDict()
module._forward_hooks = OrderedDict()
return
has_children = list(module.children())
if not has_children:
module._forward_pre_hooks = OrderedDict()
module._forward_hooks = OrderedDict()
for child in module.children():
fn_recursive_upcast(child)
for module in self.children():
fn_recursive_upcast(module)
def save_pretrained(
self,
save_directory: Union[str, os.PathLike],
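For orientation, a minimal usage sketch of the hooks added above. The checkpoint and model class are illustrative assumptions; the only API taken from this branch is `enable_layerwise_upcasting` / `disable_layerwise_upcasting` (see also `test_layerwise_upcasting` further down):

```python
import torch

from diffusers import UNet2DConditionModel

# Illustrative checkpoint; any ModelMixin subclass on this branch would do.
unet = UNet2DConditionModel.from_pretrained(
    "runwayml/stable-diffusion-v1-5", subfolder="unet"
)

# Store the weights in a low-memory dtype and move the model to the GPU,
# mirroring what the new test does.
unet.to(torch.float8_e4m3fn).to("cuda")

# Register the forward pre-hooks / forward hooks: each leaf module (or any module
# listed in `_always_upcast_modules`) is upcast to `upcast_dtype` right before its
# forward pass and cast back to the original storage dtype afterwards.
unet.enable_layerwise_upcasting(upcast_dtype=torch.float32)

# ... run the denoising loop as usual ...

# Remove the hooks when layerwise upcasting is no longer wanted.
unet.disable_layerwise_upcasting()
```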

View File

@@ -274,7 +274,9 @@ class AuraFlowTransformer2DModel(ModelMixin, ConfigMixin):
pos_embed_max_size (`int`, defaults to 4096): Maximum positions to embed from the image latents.
"""
_no_split_modules = ["AuraFlowJointTransformerBlock", "AuraFlowSingleTransformerBlock", "AuraFlowPatchEmbed"]
_supports_gradient_checkpointing = True
_always_upcast_modules = ["AuraFlowPatchEmbed"]
@register_to_config
def __init__(
@@ -456,11 +458,15 @@ class AuraFlowTransformer2DModel(ModelMixin, ConfigMixin):
# Apply patch embedding, timestep embedding, and project the caption embeddings.
hidden_states = self.pos_embed(hidden_states) # takes care of adding positional embeddings too.
temb = self.time_step_embed(timestep).to(dtype=next(self.parameters()).dtype)
temb = self.time_step_embed(timestep).to(dtype=hidden_states.dtype)
temb = self.time_step_proj(temb)
encoder_hidden_states = self.context_embedder(encoder_hidden_states)
encoder_hidden_states = torch.cat(
[self.register_tokens.repeat(encoder_hidden_states.size(0), 1, 1), encoder_hidden_states], dim=1
[
self.register_tokens.to(encoder_hidden_states.dtype).repeat(encoder_hidden_states.size(0), 1, 1),
encoder_hidden_states,
],
dim=1,
)
# MMDiT blocks.

View File

@@ -65,6 +65,7 @@ class DiTTransformer2DModel(ModelMixin, ConfigMixin):
"""
_supports_gradient_checkpointing = True
_always_upcast_modules = ["PatchEmbed"]
@register_to_config
def __init__(

View File

@@ -244,6 +244,8 @@ class HunyuanDiT2DModel(ModelMixin, ConfigMixin):
Whether or not to use style condition and image meta size. True for version <=1.1, False for version >= 1.2
"""
_always_upcast_modules = ["HunyuanDiTAttentionPool"]
@register_to_config
def __init__(
self,
@@ -484,7 +486,9 @@ class HunyuanDiT2DModel(ModelMixin, ConfigMixin):
text_embedding_mask = torch.cat([text_embedding_mask, text_embedding_mask_t5], dim=-1)
text_embedding_mask = text_embedding_mask.unsqueeze(2).bool()
encoder_hidden_states = torch.where(text_embedding_mask, encoder_hidden_states, self.text_embedding_padding)
encoder_hidden_states = torch.where(
text_embedding_mask, encoder_hidden_states, self.text_embedding_padding.to(encoder_hidden_states.dtype)
)
skips = []
for layer, block in enumerate(self.blocks):

View File

@@ -64,6 +64,7 @@ class LatteTransformer3DModel(ModelMixin, ConfigMixin):
video_length (`int`, *optional*):
The number of frames in the video-like data.
"""
_always_upcast_modules = ["PatchEmbed"]
@register_to_config
def __init__(
@@ -301,7 +302,9 @@ class LatteTransformer3DModel(ModelMixin, ConfigMixin):
hidden_states = hidden_states.reshape(-1, hidden_states.shape[-2], hidden_states.shape[-1])
embedded_timestep = embedded_timestep.repeat_interleave(num_frame, dim=0).view(-1, embedded_timestep.shape[-1])
shift, scale = (self.scale_shift_table[None] + embedded_timestep[:, None]).chunk(2, dim=1)
shift, scale = (self.scale_shift_table[None].to(embedded_timestep.dtype) + embedded_timestep[:, None]).chunk(
2, dim=1
)
hidden_states = self.norm_out(hidden_states)
# Modulation
hidden_states = hidden_states * (1 + scale) + shift

View File

@@ -19,7 +19,7 @@ from torch import nn
from ...configuration_utils import ConfigMixin, register_to_config
from ...utils import is_torch_version, logging
from ..attention import BasicTransformerBlock
from ..attention_processor import Attention, AttentionProcessor, FusedAttnProcessor2_0
from ..attention_processor import Attention, AttentionProcessor, AttnProcessor, FusedAttnProcessor2_0
from ..embeddings import PatchEmbed, PixArtAlphaTextProjection
from ..modeling_outputs import Transformer2DModelOutput
from ..modeling_utils import ModelMixin
@@ -79,6 +79,7 @@ class PixArtTransformer2DModel(ModelMixin, ConfigMixin):
_supports_gradient_checkpointing = True
_no_split_modules = ["BasicTransformerBlock", "PatchEmbed"]
_always_upcast_modules = ["PatchEmbed"]
@register_to_config
def __init__(
@@ -247,6 +248,14 @@ class PixArtTransformer2DModel(ModelMixin, ConfigMixin):
for name, module in self.named_children():
fn_recursive_attn_processor(name, module, processor)
def set_default_attn_processor(self):
"""
Disables custom attention processors and sets the default attention implementation.
Safe to just use `AttnProcessor()` as PixArt doesn't have any exotic attention processors in the default model.
"""
self.set_attn_processor(AttnProcessor())
# Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.fuse_qkv_projections
def fuse_qkv_projections(self):
"""
@@ -414,7 +423,8 @@ class PixArtTransformer2DModel(ModelMixin, ConfigMixin):
# 3. Output
shift, scale = (
self.scale_shift_table[None] + embedded_timestep[:, None].to(self.scale_shift_table.device)
self.scale_shift_table[None].to(embedded_timestep.dtype)
+ embedded_timestep[:, None].to(self.scale_shift_table.device)
).chunk(2, dim=1)
hidden_states = self.norm_out(hidden_states)
# Modulation

View File

@@ -289,7 +289,7 @@ class PriorTransformer(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin, Pef
# timesteps does not contain any weights and will always return f32 tensors
# but time_embedding might be fp16, so we need to cast here.
timesteps_projected = timesteps_projected.to(dtype=self.dtype)
timesteps_projected = timesteps_projected.to(dtype=hidden_states.dtype)
time_embeddings = self.time_embedding(timesteps_projected)
if self.embedding_proj_norm is not None:
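The same pattern (casting to the activation's dtype instead of `self.dtype`) appears in the UNet hunks below. A toy sketch of why it matters once layerwise upcasting is in play (stand-in tensors, not the actual module code):

```python
import torch

# Weights stored in a low-memory dtype, activations arriving already upcast
# because the layerwise-upcasting hooks have run on the calling module.
params = torch.zeros(4, dtype=torch.float8_e4m3fn)  # self.dtype would report float8 here
sample = torch.randn(4, dtype=torch.float32)        # activation flowing through forward
t_emb = torch.randn(4)                               # timestep projection, always produced in float32

# Old: t_emb.to(dtype=self.dtype) would downcast to float8 and break the ops that follow.
# New: follow the dtype of the tensor that actually reaches the layer.
t_emb = t_emb.to(dtype=sample.dtype)
assert t_emb.dtype == sample.dtype
```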

View File

@@ -54,6 +54,7 @@ class SD3Transformer2DModel(ModelMixin, ConfigMixin, PeftAdapterMixin, FromOrigi
"""
_supports_gradient_checkpointing = True
_always_upcast_modules = ["PatchEmbed"]
@register_to_config
def __init__(

View File

@@ -283,7 +283,7 @@ class UNet2DModel(ModelMixin, ConfigMixin):
# timesteps does not contain any weights and will always return f32 tensors
# but time_embedding might actually be running in fp16. so we need to cast here.
# there might be better ways to encapsulate this.
t_emb = t_emb.to(dtype=self.dtype)
t_emb = t_emb.to(dtype=sample.dtype)
emb = self.time_embedding(t_emb)
if self.class_embedding is not None:

View File

@@ -641,7 +641,7 @@ class UNet3DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin)
# timesteps does not contain any weights and will always return f32 tensors
# but time_embedding might actually be running in fp16. so we need to cast here.
# there might be better ways to encapsulate this.
t_emb = t_emb.to(dtype=self.dtype)
t_emb = t_emb.to(dtype=sample.dtype)
emb = self.time_embedding(t_emb, timestep_cond)
emb = emb.repeat_interleave(repeats=num_frames, dim=0)

View File

@@ -590,7 +590,7 @@ class I2VGenXLUNet(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin):
# timesteps does not contain any weights and will always return f32 tensors
# but time_embedding might actually be running in fp16. so we need to cast here.
# there might be better ways to encapsulate this.
t_emb = t_emb.to(dtype=self.dtype)
t_emb = t_emb.to(dtype=sample.dtype)
t_emb = self.time_embedding(t_emb, timestep_cond)
# 2. FPS

View File

@@ -2152,7 +2152,7 @@ class UNetMotionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin, Peft
# timesteps does not contain any weights and will always return f32 tensors
# but time_embedding might actually be running in fp16. so we need to cast here.
# there might be better ways to encapsulate this.
t_emb = t_emb.to(dtype=self.dtype)
t_emb = t_emb.to(dtype=sample.dtype)
emb = self.time_embedding(t_emb, timestep_cond)
aug_emb = None

View File

@@ -49,6 +49,7 @@ from .kandinsky2_2 import (
)
from .kandinsky3 import Kandinsky3Img2ImgPipeline, Kandinsky3Pipeline
from .latent_consistency_models import LatentConsistencyModelImg2ImgPipeline, LatentConsistencyModelPipeline
from .lumina import LuminaText2ImgPipeline
from .pag import (
HunyuanDiTPAGPipeline,
PixArtSigmaPAGPipeline,
@@ -106,6 +107,7 @@ AUTO_TEXT2IMAGE_PIPELINES_MAPPING = OrderedDict(
("pixart-sigma-pag", PixArtSigmaPAGPipeline),
("auraflow", AuraFlowPipeline),
("flux", FluxPipeline),
("lumina", LuminaText2ImgPipeline),
]
)

View File

@@ -56,7 +56,7 @@ EXAMPLE_DOC_STRING = """
>>> from diffusers.utils import export_to_gif
>>> # You can replace the checkpoint id with "maxin-cn/Latte-1" too.
>>> pipe = LattePipeline.from_pretrained("maxin-cn/Latte-1", torch_dtype=torch.float16).to("cuda")
>>> pipe = LattePipeline.from_pretrained("maxin-cn/Latte-1", torch_dtype=torch.float16)
>>> # Enable memory optimizations.
>>> pipe.enable_model_cpu_offload()

View File

@@ -54,7 +54,7 @@ EXAMPLE_DOC_STRING = """
>>> pipe = LuminaText2ImgPipeline.from_pretrained(
... "Alpha-VLLM/Lumina-Next-SFT-diffusers", torch_dtype=torch.bfloat16
... ).cuda()
... )
>>> # Enable memory optimizations.
>>> pipe.enable_model_cpu_offload()

View File

@@ -89,49 +89,44 @@ for library in LOADABLE_CLASSES:
ALL_IMPORTABLE_CLASSES.update(LOADABLE_CLASSES[library])
def is_safetensors_compatible(filenames, variant=None, passed_components=None) -> bool:
def is_safetensors_compatible(filenames, passed_components=None) -> bool:
"""
Checking for safetensors compatibility:
- By default, all models are saved with the default pytorch serialization, so we use the list of default pytorch
files to know which safetensors files are needed.
- The model is safetensors compatible only if there is a matching safetensors file for every default pytorch file.
- The model is safetensors compatible only if there is a safetensors file for each model component present in
filenames.
Converting default pytorch serialized filenames to safetensors serialized filenames:
- For models from the diffusers library, just replace the ".bin" extension with ".safetensors"
- For models from the transformers library, the filename changes from "pytorch_model" to "model", and the ".bin"
extension is replaced with ".safetensors"
"""
pt_filenames = []
sf_filenames = set()
passed_components = passed_components or []
# extract all components of the pipeline and their associated files
components = {}
for filename in filenames:
_, extension = os.path.splitext(filename)
if len(filename.split("/")) == 2 and filename.split("/")[0] in passed_components:
if not len(filename.split("/")) == 2:
continue
if extension == ".bin":
pt_filenames.append(os.path.normpath(filename))
elif extension == ".safetensors":
sf_filenames.add(os.path.normpath(filename))
component, component_filename = filename.split("/")
if component in passed_components:
continue
for filename in pt_filenames:
# filename = 'foo/bar/baz.bam' -> path = 'foo/bar', filename = 'baz', extension = '.bam'
path, filename = os.path.split(filename)
filename, extension = os.path.splitext(filename)
components.setdefault(component, [])
components[component].append(component_filename)
if filename.startswith("pytorch_model"):
filename = filename.replace("pytorch_model", "model")
else:
filename = filename
# iterate over all files of a component
# check if safetensor files exist for that component
# if variant is provided check if the variant of the safetensors exists
for component, component_filenames in components.items():
matches = []
for component_filename in component_filenames:
filename, extension = os.path.splitext(component_filename)
expected_sf_filename = os.path.normpath(os.path.join(path, filename))
expected_sf_filename = f"{expected_sf_filename}.safetensors"
if expected_sf_filename not in sf_filenames:
logger.warning(f"{expected_sf_filename} not found")
match_exists = extension == ".safetensors"
matches.append(match_exists)
if not any(matches):
return False
return True
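To make the reworked check concrete, a small example of how it treats a file list. The behaviour mirrors the updated tests further down; the filenames are illustrative and the import path is assumed:

```python
from diffusers.pipelines.pipeline_loading_utils import is_safetensors_compatible  # assumed import path

# Each component (top-level folder) must provide at least one .safetensors file;
# variants no longer matter, and components passed via `passed_components` are skipped.
filenames = [
    "unet/diffusion_pytorch_model.fp16.safetensors",  # unet: safetensors present
    "text_encoder/pytorch_model.bin",                 # text_encoder: only a .bin file
    "vae/diffusion_pytorch_model.safetensors",        # vae: safetensors present
]

print(is_safetensors_compatible(filenames))                                      # False (text_encoder)
print(is_safetensors_compatible(filenames, passed_components=["text_encoder"]))  # True
```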

View File

@@ -1416,18 +1416,14 @@ class DiffusionPipeline(ConfigMixin, PushToHubMixin):
if (
use_safetensors
and not allow_pickle
and not is_safetensors_compatible(
model_filenames, variant=variant, passed_components=passed_components
)
and not is_safetensors_compatible(model_filenames, passed_components=passed_components)
):
raise EnvironmentError(
f"Could not find the necessary `safetensors` weights in {model_filenames} (variant={variant})"
)
if from_flax:
ignore_patterns = ["*.bin", "*.safetensors", "*.onnx", "*.pb"]
elif use_safetensors and is_safetensors_compatible(
model_filenames, variant=variant, passed_components=passed_components
):
elif use_safetensors and is_safetensors_compatible(model_filenames, passed_components=passed_components):
ignore_patterns = ["*.bin", "*.msgpack"]
use_onnx = use_onnx if use_onnx is not None else pipeline_class._is_onnx

View File

@@ -602,9 +602,9 @@ class StableDiffusionKDiffusionPipeline(
sigma_min: float = self.k_diffusion_model.sigmas[0].item()
sigma_max: float = self.k_diffusion_model.sigmas[-1].item()
sigmas = get_sigmas_karras(n=num_inference_steps, sigma_min=sigma_min, sigma_max=sigma_max)
sigmas = sigmas.to(device)
else:
sigmas = self.scheduler.sigmas
sigmas = sigmas.to(device)
sigmas = sigmas.to(prompt_embeds.dtype)
# 6. Prepare latent variables
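Read together with the commit message above (fix(pipeline): k sampler sigmas device), the intent of this hunk is to move the device cast out of the Karras-only branch so it runs in both cases. A toy re-statement of the corrected flow under that reading; the helper and flag come from k-diffusion and the pipeline's call signature, and the function wrapper is purely illustrative:

```python
import torch
from k_diffusion.sampling import get_sigmas_karras  # helper already used by this pipeline


def prepare_sigmas(scheduler_sigmas: torch.Tensor, use_karras_sigmas: bool,
                   num_inference_steps: int, device: str, dtype: torch.dtype) -> torch.Tensor:
    """Build sigmas in either branch, then move them to the target device unconditionally."""
    if use_karras_sigmas:
        sigma_min = scheduler_sigmas[0].item()
        sigma_max = scheduler_sigmas[-1].item()
        sigmas = get_sigmas_karras(n=num_inference_steps, sigma_min=sigma_min, sigma_max=sigma_max)
    else:
        sigmas = scheduler_sigmas
    # The device cast now runs for both branches, which is the actual fix.
    sigmas = sigmas.to(device)
    return sigmas.to(dtype)
```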

View File

@@ -32,7 +32,7 @@ from utils import PeftLoraLoaderMixinTests # noqa: E402
@require_peft_backend
class SD3LoRATests(unittest.TestCase, PeftLoraLoaderMixinTests):
pipeline_class = StableDiffusion3Pipeline
scheduler_cls = FlowMatchEulerDiscreteScheduler()
scheduler_cls = FlowMatchEulerDiscreteScheduler
scheduler_kwargs = {}
uses_flow_matching = True
transformer_kwargs = {
@@ -80,8 +80,7 @@ class SD3LoRATests(unittest.TestCase, PeftLoraLoaderMixinTests):
Related PR: https://github.com/huggingface/diffusers/pull/8584
"""
components = self.get_dummy_components()
pipe = self.pipeline_class(**components)
pipe = self.pipeline_class(**components[0])
pipe = pipe.to(torch_device)
pipe.set_progress_bar_config(disable=None)

View File

@@ -124,71 +124,6 @@ class LoraSDXLIntegrationTests(unittest.TestCase):
gc.collect()
torch.cuda.empty_cache()
def test_sdxl_0_9_lora_one(self):
generator = torch.Generator().manual_seed(0)
pipe = StableDiffusionXLPipeline.from_pretrained("stabilityai/stable-diffusion-xl-base-0.9")
lora_model_id = "hf-internal-testing/sdxl-0.9-daiton-lora"
lora_filename = "daiton-xl-lora-test.safetensors"
pipe.load_lora_weights(lora_model_id, weight_name=lora_filename)
pipe.enable_model_cpu_offload()
images = pipe(
"masterpiece, best quality, mountain", output_type="np", generator=generator, num_inference_steps=2
).images
images = images[0, -3:, -3:, -1].flatten()
expected = np.array([0.3838, 0.3482, 0.3588, 0.3162, 0.319, 0.3369, 0.338, 0.3366, 0.3213])
max_diff = numpy_cosine_similarity_distance(expected, images)
assert max_diff < 1e-3
pipe.unload_lora_weights()
release_memory(pipe)
def test_sdxl_0_9_lora_two(self):
generator = torch.Generator().manual_seed(0)
pipe = StableDiffusionXLPipeline.from_pretrained("stabilityai/stable-diffusion-xl-base-0.9")
lora_model_id = "hf-internal-testing/sdxl-0.9-costumes-lora"
lora_filename = "saijo.safetensors"
pipe.load_lora_weights(lora_model_id, weight_name=lora_filename)
pipe.enable_model_cpu_offload()
images = pipe(
"masterpiece, best quality, mountain", output_type="np", generator=generator, num_inference_steps=2
).images
images = images[0, -3:, -3:, -1].flatten()
expected = np.array([0.3137, 0.3269, 0.3355, 0.255, 0.2577, 0.2563, 0.2679, 0.2758, 0.2626])
max_diff = numpy_cosine_similarity_distance(expected, images)
assert max_diff < 1e-3
pipe.unload_lora_weights()
release_memory(pipe)
def test_sdxl_0_9_lora_three(self):
generator = torch.Generator().manual_seed(0)
pipe = StableDiffusionXLPipeline.from_pretrained("stabilityai/stable-diffusion-xl-base-0.9")
lora_model_id = "hf-internal-testing/sdxl-0.9-kamepan-lora"
lora_filename = "kame_sdxl_v2-000020-16rank.safetensors"
pipe.load_lora_weights(lora_model_id, weight_name=lora_filename)
pipe.enable_model_cpu_offload()
images = pipe(
"masterpiece, best quality, mountain", output_type="np", generator=generator, num_inference_steps=2
).images
images = images[0, -3:, -3:, -1].flatten()
expected = np.array([0.4015, 0.3761, 0.3616, 0.3745, 0.3462, 0.3337, 0.3564, 0.3649, 0.3468])
max_diff = numpy_cosine_similarity_distance(expected, images)
assert max_diff < 5e-3
pipe.unload_lora_weights()
release_memory(pipe)
def test_sdxl_1_0_lora(self):
generator = torch.Generator("cpu").manual_seed(0)

View File

@@ -43,6 +43,8 @@ from diffusers.utils import SAFE_WEIGHTS_INDEX_NAME, is_torch_npu_available, is_
from diffusers.utils.hub_utils import _add_variant
from diffusers.utils.testing_utils import (
CaptureLogger,
disable_full_determinism,
enable_full_determinism,
get_python_version,
is_torch_compile,
require_torch_2,
@@ -984,6 +986,49 @@ class ModelTesterMixin:
new_output = new_model(**inputs_dict)
self.assertTrue(torch.allclose(base_output[0], new_output[0], atol=1e-5))
@require_torch_gpu
def test_layerwise_upcasting(self):
disable_full_determinism()
torch.cuda.empty_cache()
torch.cuda.reset_max_memory_cached()
torch.cuda.reset_max_memory_allocated()
torch.cuda.reset_peak_memory_stats()
torch.manual_seed(0)
config, inputs_dict = self.prepare_init_args_and_inputs_for_common()
model = self.model_class(**config).eval()
model.to(torch_device)
model(**inputs_dict)
base_max_memory = torch.cuda.max_memory_allocated()
# Remove model
model.to("cpu")
del model
torch.cuda.empty_cache()
torch.cuda.reset_max_memory_cached()
torch.cuda.reset_max_memory_allocated()
torch.cuda.reset_peak_memory_stats()
low_memory_dtype = torch.float8_e4m3fn
upcast_dtype = torch.float32
config, inputs_dict = self.prepare_init_args_and_inputs_for_common()
torch.manual_seed(0)
low_mem_model = self.model_class(**config).eval()
low_mem_model.to(low_memory_dtype)
low_mem_model.to(torch_device)
layerwise_max_memory = torch.cuda.max_memory_allocated()
low_mem_model.enable_layerwise_upcasting(upcast_dtype)
low_mem_model(**inputs_dict)
assert layerwise_max_memory < base_max_memory
enable_full_determinism()
@is_staging_test
class ModelPushToHubTester(unittest.TestCase):

View File

@@ -26,9 +26,11 @@ from ..test_modeling_common import ModelTesterMixin
enable_full_determinism()
class SD3TransformerTests(ModelTesterMixin, unittest.TestCase):
class AuraFlowTransformerTests(ModelTesterMixin, unittest.TestCase):
model_class = AuraFlowTransformer2DModel
main_input_name = "hidden_states"
# We override the items here because the transformer under consideration is small.
model_split_percents = [0.7, 0.6, 0.6]
@property
def dummy_input(self):
@@ -71,3 +73,7 @@ class SD3TransformerTests(ModelTesterMixin, unittest.TestCase):
}
inputs_dict = self.dummy_input
return init_dict, inputs_dict
@unittest.skip("AuraFlowTransformer2DModel uses its own dedicated attention processor. This test does not apply")
def test_set_attn_processor_for_determinism(self):
pass

View File

@@ -0,0 +1,111 @@
# coding=utf-8
# Copyright 2024 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import torch
from diffusers import LuminaNextDiT2DModel
from diffusers.utils.testing_utils import (
enable_full_determinism,
torch_device,
)
from ..test_modeling_common import ModelTesterMixin
enable_full_determinism()
class LuminaNextDiT2DModelTransformerTests(ModelTesterMixin, unittest.TestCase):
model_class = LuminaNextDiT2DModel
main_input_name = "hidden_states"
@property
def dummy_input(self):
"""
Args:
None
Returns:
Dict: Dictionary of dummy input tensors
"""
batch_size = 2 # N
num_channels = 4 # C
height = width = 16 # H, W
embedding_dim = 32 # D
sequence_length = 16 # L
hidden_states = torch.randn((batch_size, num_channels, height, width)).to(torch_device)
encoder_hidden_states = torch.randn((batch_size, sequence_length, embedding_dim)).to(torch_device)
timestep = torch.rand(size=(batch_size,)).to(torch_device)
encoder_mask = torch.randn(size=(batch_size, sequence_length)).to(torch_device)
image_rotary_emb = torch.randn((384, 384, 4)).to(torch_device)
return {
"hidden_states": hidden_states,
"encoder_hidden_states": encoder_hidden_states,
"timestep": timestep,
"encoder_mask": encoder_mask,
"image_rotary_emb": image_rotary_emb,
"cross_attention_kwargs": {},
}
@property
def input_shape(self):
"""
Args:
None
Returns:
Tuple: (int, int, int)
"""
return (4, 16, 16)
@property
def output_shape(self):
"""
Args:
None
Returns:
Tuple: (int, int, int)
"""
return (4, 16, 16)
def prepare_init_args_and_inputs_for_common(self):
"""
Args:
None
Returns:
Tuple: (Dict, Dict)
"""
init_dict = {
"sample_size": 16,
"patch_size": 2,
"in_channels": 4,
"hidden_size": 24,
"num_layers": 2,
"num_attention_heads": 3,
"num_kv_heads": 1,
"multiple_of": 16,
"ffn_dim_multiplier": None,
"norm_eps": 1e-5,
"learn_sigma": False,
"qk_norm": True,
"cross_attention_dim": 32,
"scaling_factor": 1.0,
}
inputs_dict = self.dummy_input
return init_dict, inputs_dict

View File

@@ -76,3 +76,7 @@ class SD3TransformerTests(ModelTesterMixin, unittest.TestCase):
}
inputs_dict = self.dummy_input
return init_dict, inputs_dict
@unittest.skip("SD3Transformer2DModel uses a dedicated attention processor. This test doesn't apply")
def test_set_attn_processor_for_determinism(self):
pass

View File

@@ -163,3 +163,7 @@ class AuraFlowPipelineFastTests(unittest.TestCase, PipelineTesterMixin):
assert np.allclose(
original_image_slice, image_slice_disabled, atol=1e-2, rtol=1e-2
), "Original outputs should match when fused QKV projections are disabled."
@unittest.skip("xformers attention processor does not exist for AuraFlow")
def test_xformers_attention_forwardGenerator_pass(self):
pass

View File

@@ -119,6 +119,10 @@ class LuminaText2ImgPipelinePipelineFastTests(unittest.TestCase, PipelineTesterM
max_diff = np.abs(output_with_prompt - output_with_embeds).max()
assert max_diff < 1e-4
@unittest.skip("xformers attention processor does not exist for Lumina")
def test_xformers_attention_forwardGenerator_pass(self):
pass
@slow
@require_torch_gpu

View File

@@ -68,25 +68,21 @@ class IsSafetensorsCompatibleTests(unittest.TestCase):
"unet/diffusion_pytorch_model.fp16.bin",
"unet/diffusion_pytorch_model.fp16.safetensors",
]
variant = "fp16"
self.assertTrue(is_safetensors_compatible(filenames, variant=variant))
self.assertTrue(is_safetensors_compatible(filenames))
def test_diffusers_model_is_compatible_variant(self):
filenames = [
"unet/diffusion_pytorch_model.fp16.bin",
"unet/diffusion_pytorch_model.fp16.safetensors",
]
variant = "fp16"
self.assertTrue(is_safetensors_compatible(filenames, variant=variant))
self.assertTrue(is_safetensors_compatible(filenames))
def test_diffusers_model_is_compatible_variant_partial(self):
# pass variant but use the non-variant filenames
def test_diffusers_model_is_compatible_variant_mixed(self):
filenames = [
"unet/diffusion_pytorch_model.bin",
"unet/diffusion_pytorch_model.safetensors",
"unet/diffusion_pytorch_model.fp16.safetensors",
]
variant = "fp16"
self.assertTrue(is_safetensors_compatible(filenames, variant=variant))
self.assertTrue(is_safetensors_compatible(filenames))
def test_diffusers_model_is_not_compatible_variant(self):
filenames = [
@@ -99,25 +95,14 @@ class IsSafetensorsCompatibleTests(unittest.TestCase):
"unet/diffusion_pytorch_model.fp16.bin",
# Removed: 'unet/diffusion_pytorch_model.fp16.safetensors',
]
variant = "fp16"
self.assertFalse(is_safetensors_compatible(filenames, variant=variant))
self.assertFalse(is_safetensors_compatible(filenames))
def test_transformer_model_is_compatible_variant(self):
filenames = [
"text_encoder/pytorch_model.fp16.bin",
"text_encoder/model.fp16.safetensors",
]
variant = "fp16"
self.assertTrue(is_safetensors_compatible(filenames, variant=variant))
def test_transformer_model_is_compatible_variant_partial(self):
# pass variant but use the non-variant filenames
filenames = [
"text_encoder/pytorch_model.bin",
"text_encoder/model.safetensors",
]
variant = "fp16"
self.assertTrue(is_safetensors_compatible(filenames, variant=variant))
self.assertTrue(is_safetensors_compatible(filenames))
def test_transformer_model_is_not_compatible_variant(self):
filenames = [
@@ -126,9 +111,45 @@ class IsSafetensorsCompatibleTests(unittest.TestCase):
"vae/diffusion_pytorch_model.fp16.bin",
"vae/diffusion_pytorch_model.fp16.safetensors",
"text_encoder/pytorch_model.fp16.bin",
# 'text_encoder/model.fp16.safetensors',
"unet/diffusion_pytorch_model.fp16.bin",
"unet/diffusion_pytorch_model.fp16.safetensors",
]
variant = "fp16"
self.assertFalse(is_safetensors_compatible(filenames, variant=variant))
self.assertFalse(is_safetensors_compatible(filenames))
def test_transformers_is_compatible_sharded(self):
filenames = [
"text_encoder/pytorch_model.bin",
"text_encoder/model-00001-of-00002.safetensors",
"text_encoder/model-00002-of-00002.safetensors",
]
self.assertTrue(is_safetensors_compatible(filenames))
def test_transformers_is_compatible_variant_sharded(self):
filenames = [
"text_encoder/pytorch_model.bin",
"text_encoder/model.fp16-00001-of-00002.safetensors",
"text_encoder/model.fp16-00001-of-00002.safetensors",
]
self.assertTrue(is_safetensors_compatible(filenames))
def test_diffusers_is_compatible_sharded(self):
filenames = [
"unet/diffusion_pytorch_model.bin",
"unet/diffusion_pytorch_model-00001-of-00002.safetensors",
"unet/diffusion_pytorch_model-00002-of-00002.safetensors",
]
self.assertTrue(is_safetensors_compatible(filenames))
def test_diffusers_is_compatible_variant_sharded(self):
filenames = [
"unet/diffusion_pytorch_model.bin",
"unet/diffusion_pytorch_model.fp16-00001-of-00002.safetensors",
"unet/diffusion_pytorch_model.fp16-00001-of-00002.safetensors",
]
self.assertTrue(is_safetensors_compatible(filenames))
def test_diffusers_is_compatible_only_variants(self):
filenames = [
"unet/diffusion_pytorch_model.fp16.safetensors",
]
self.assertTrue(is_safetensors_compatible(filenames))

View File

@@ -551,37 +551,94 @@ class DownloadTests(unittest.TestCase):
assert sum(f.endswith(this_format) and not f.endswith(f"{variant}{this_format}") for f in files) == 3
assert not any(f.endswith(other_format) for f in files)
def test_download_broken_variant(self):
for use_safetensors in [False, True]:
# text encoder is missing no variant and "no_ema" variant weights, so the following can't work
for variant in [None, "no_ema"]:
with self.assertRaises(OSError) as error_context:
with tempfile.TemporaryDirectory() as tmpdirname:
tmpdirname = StableDiffusionPipeline.from_pretrained(
"hf-internal-testing/stable-diffusion-broken-variants",
cache_dir=tmpdirname,
variant=variant,
use_safetensors=use_safetensors,
)
def test_download_safetensors_only_variant_exists_for_model(self):
variant = None
use_safetensors = True
assert "Error no file name" in str(error_context.exception)
# text encoder has fp16 variants so we can load it
with tempfile.TemporaryDirectory() as tmpdirname:
tmpdirname = StableDiffusionPipeline.download(
# text encoder is missing no variant weights, so the following can't work
with tempfile.TemporaryDirectory() as tmpdirname:
with self.assertRaises(OSError) as error_context:
tmpdirname = StableDiffusionPipeline.from_pretrained(
"hf-internal-testing/stable-diffusion-broken-variants",
use_safetensors=use_safetensors,
cache_dir=tmpdirname,
variant="fp16",
variant=variant,
use_safetensors=use_safetensors,
)
assert "Error no file name" in str(error_context.exception)
# text encoder has fp16 variants so we can load it
with tempfile.TemporaryDirectory() as tmpdirname:
tmpdirname = StableDiffusionPipeline.download(
"hf-internal-testing/stable-diffusion-broken-variants",
use_safetensors=use_safetensors,
cache_dir=tmpdirname,
variant="fp16",
)
all_root_files = [t[-1] for t in os.walk(tmpdirname)]
files = [item for sublist in all_root_files for item in sublist]
# None of the downloaded files should be a non-variant file even if we have some here:
# https://huggingface.co/hf-internal-testing/stable-diffusion-broken-variants/tree/main/unet
assert len(files) == 15, f"We should only download 15 files, not {len(files)}"
def test_download_bin_only_variant_exists_for_model(self):
variant = None
use_safetensors = False
# text encoder is missing Non-variant weights, so the following can't work
with tempfile.TemporaryDirectory() as tmpdirname:
with self.assertRaises(OSError) as error_context:
tmpdirname = StableDiffusionPipeline.from_pretrained(
"hf-internal-testing/stable-diffusion-broken-variants",
cache_dir=tmpdirname,
variant=variant,
use_safetensors=use_safetensors,
)
assert "Error no file name" in str(error_context.exception)
# text encoder has fp16 variants so we can load it
with tempfile.TemporaryDirectory() as tmpdirname:
tmpdirname = StableDiffusionPipeline.download(
"hf-internal-testing/stable-diffusion-broken-variants",
use_safetensors=use_safetensors,
cache_dir=tmpdirname,
variant="fp16",
)
all_root_files = [t[-1] for t in os.walk(tmpdirname)]
files = [item for sublist in all_root_files for item in sublist]
# None of the downloaded files should be a non-variant file even if we have some here:
# https://huggingface.co/hf-internal-testing/stable-diffusion-broken-variants/tree/main/unet
assert len(files) == 15, f"We should only download 15 files, not {len(files)}"
def test_download_safetensors_variant_does_not_exist_for_model(self):
variant = "no_ema"
use_safetensors = True
# text encoder is missing no_ema variant weights, so the following can't work
with tempfile.TemporaryDirectory() as tmpdirname:
with self.assertRaises(OSError) as error_context:
tmpdirname = StableDiffusionPipeline.from_pretrained(
"hf-internal-testing/stable-diffusion-broken-variants",
cache_dir=tmpdirname,
variant=variant,
use_safetensors=use_safetensors,
)
all_root_files = [t[-1] for t in os.walk(tmpdirname)]
files = [item for sublist in all_root_files for item in sublist]
assert "Error no file name" in str(error_context.exception)
# None of the downloaded files should be a non-variant file even if we have some here:
# https://huggingface.co/hf-internal-testing/stable-diffusion-broken-variants/tree/main/unet
assert len(files) == 15, f"We should only download 15 files, not {len(files)}"
# only unet has "no_ema" variant
def test_download_bin_variant_does_not_exist_for_model(self):
variant = "no_ema"
use_safetensors = False
# text encoder is missing no_ema variant weights, so the following can't work
with tempfile.TemporaryDirectory() as tmpdirname:
with self.assertRaises(OSError) as error_context:
tmpdirname = StableDiffusionPipeline.from_pretrained(
"hf-internal-testing/stable-diffusion-broken-variants",
cache_dir=tmpdirname,
variant=variant,
use_safetensors=use_safetensors,
)
assert "Error no file name" in str(error_context.exception)
def test_local_save_load_index(self):
prompt = "hello"

View File

@@ -20,12 +20,7 @@ import numpy as np
import torch
from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
from diffusers import (
AutoencoderKL,
DDIMScheduler,
TextToVideoSDPipeline,
UNet3DConditionModel,
)
from diffusers import AutoencoderKL, DDIMScheduler, TextToVideoSDPipeline, UNet3DConditionModel
from diffusers.utils import is_xformers_available
from diffusers.utils.testing_utils import (
enable_full_determinism,
@@ -64,7 +59,7 @@ class TextToVideoSDPipelineFastTests(PipelineTesterMixin, SDFunctionTesterMixin,
def get_dummy_components(self):
torch.manual_seed(0)
unet = UNet3DConditionModel(
block_out_channels=(4, 8),
block_out_channels=(8, 8),
layers_per_block=1,
sample_size=32,
in_channels=4,
@@ -134,10 +129,7 @@ class TextToVideoSDPipelineFastTests(PipelineTesterMixin, SDFunctionTesterMixin,
return inputs
def test_dict_tuple_outputs_equivalent(self):
expected_slice = None
if torch_device == "cpu":
expected_slice = np.array([0.4903, 0.5649, 0.5504, 0.5179, 0.4821, 0.5466, 0.4131, 0.5052, 0.5077])
return super().test_dict_tuple_outputs_equivalent(expected_slice=expected_slice)
return super().test_dict_tuple_outputs_equivalent()
def test_text_to_video_default_case(self):
device = "cpu" # ensure determinism for the device-dependent torch.Generator
@@ -151,9 +143,8 @@ class TextToVideoSDPipelineFastTests(PipelineTesterMixin, SDFunctionTesterMixin,
frames = sd_pipe(**inputs).frames
image_slice = frames[0][0][-3:, -3:, -1]
assert frames[0][0].shape == (32, 32, 3)
expected_slice = np.array([0.7537, 0.1752, 0.6157, 0.5508, 0.4240, 0.4110, 0.4838, 0.5648, 0.5094])
expected_slice = np.array([0.8093, 0.2751, 0.6976, 0.5927, 0.4616, 0.4336, 0.5094, 0.5683, 0.4796])
assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2