Mirror of https://github.com/huggingface/diffusers.git, synced 2026-02-10 12:55:19 +08:00

Compare commits (23 commits): fix-model-…...max-parall… (branch names truncated)
| Author | SHA1 | Date |
|---|---|---|
|  | ca6d41de0d |  |
|  | 61e962d7d0 |  |
|  | 7492690505 |  |
|  | decd6758f3 |  |
|  | 0d23645bd1 |  |
|  | 7fa3e5b0f6 |  |
|  | 49b959b540 |  |
|  | 58237364b1 |  |
|  | 3e35628873 |  |
|  | 6a479588db |  |
|  | fa489eaed6 |  |
|  | 0d7c479023 |  |
|  | ce97d7e19b |  |
|  | 44ba90caff |  |
|  | 3c85a57297 |  |
|  | 03ca11318e |  |
|  | 3ffa7b46e5 |  |
|  | c1b2a89e34 |  |
|  | 435d37ce5a |  |
|  | 5915c2985d |  |
|  | 21a7ff12a7 |  |
|  | 8909ab4b19 |  |
|  | c1edb03c37 |  |
.github/workflows/nightly_tests.yml (vendored): 50 changes
```diff
@@ -19,7 +19,7 @@ env:
 jobs:
   setup_torch_cuda_pipeline_matrix:
     name: Setup Torch Pipelines Matrix
-    runs-on: ubuntu-latest
+    runs-on: diffusers/diffusers-pytorch-cpu
     outputs:
       pipeline_test_matrix: ${{ steps.fetch_pipeline_matrix.outputs.pipeline_test_matrix }}
     steps:
@@ -67,19 +67,19 @@ jobs:
           fetch-depth: 2
       - name: NVIDIA-SMI
         run: nvidia-smi
 
       - name: Install dependencies
         run: |
           python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
           python -m uv pip install -e [quality,test]
           python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate.git
           python -m uv pip install pytest-reportlog
 
       - name: Environment
         run: |
           python utils/print_env.py
 
       - name: Nightly PyTorch CUDA checkpoint (pipelines) tests
         env:
           HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
           # https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
@@ -88,9 +88,9 @@ jobs:
           python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
             -s -v -k "not Flax and not Onnx" \
             --make-reports=tests_pipeline_${{ matrix.module }}_cuda \
             --report-log=tests_pipeline_${{ matrix.module }}_cuda.log \
             tests/pipelines/${{ matrix.module }}
 
       - name: Failure short reports
         if: ${{ failure() }}
         run: |
@@ -103,7 +103,7 @@ jobs:
         with:
           name: pipeline_${{ matrix.module }}_test_reports
           path: reports
 
       - name: Generate Report and Notify Channel
         if: always()
         run: |
@@ -112,7 +112,7 @@ jobs:
 
   run_nightly_tests_for_other_torch_modules:
     name: Torch Non-Pipelines CUDA Nightly Tests
-    runs-on: docker-gpu
+    runs-on: [single-gpu, nvidia-gpu, t4, ci]
     container:
       image: diffusers/diffusers-pytorch-cuda
       options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ --gpus 0
@@ -139,7 +139,7 @@ jobs:
         run: python utils/print_env.py
 
       - name: Run nightly PyTorch CUDA tests for non-pipeline modules
         if: ${{ matrix.module != 'examples'}}
         env:
           HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
           # https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
@@ -148,7 +148,7 @@ jobs:
           python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
             -s -v -k "not Flax and not Onnx" \
             --make-reports=tests_torch_${{ matrix.module }}_cuda \
             --report-log=tests_torch_${{ matrix.module }}_cuda.log \
             tests/${{ matrix.module }}
 
       - name: Run nightly example tests with Torch
@@ -161,13 +161,13 @@ jobs:
           python -m uv pip install peft@git+https://github.com/huggingface/peft.git
           python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
             -s -v --make-reports=examples_torch_cuda \
             --report-log=examples_torch_cuda.log \
             examples/
 
       - name: Failure short reports
         if: ${{ failure() }}
         run: |
           cat reports/tests_torch_${{ matrix.module }}_cuda_stats.txt
           cat reports/tests_torch_${{ matrix.module }}_cuda_failures_short.txt
 
       - name: Test suite reports artifacts
@@ -185,7 +185,7 @@ jobs:
 
   run_lora_nightly_tests:
     name: Nightly LoRA Tests with PEFT and TORCH
-    runs-on: docker-gpu
+    runs-on: [single-gpu, nvidia-gpu, t4, ci]
     container:
       image: diffusers/diffusers-pytorch-cuda
       options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ --gpus 0
@@ -218,13 +218,13 @@ jobs:
           python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
             -s -v -k "not Flax and not Onnx" \
             --make-reports=tests_torch_lora_cuda \
             --report-log=tests_torch_lora_cuda.log \
             tests/lora
 
       - name: Failure short reports
         if: ${{ failure() }}
         run: |
           cat reports/tests_torch_lora_cuda_stats.txt
           cat reports/tests_torch_lora_cuda_failures_short.txt
 
       - name: Test suite reports artifacts
@@ -239,12 +239,12 @@ jobs:
         run: |
           pip install slack_sdk tabulate
           python scripts/log_reports.py >> $GITHUB_STEP_SUMMARY
 
   run_flax_tpu_tests:
     name: Nightly Flax TPU Tests
     runs-on: docker-tpu
     if: github.event_name == 'schedule'
 
     container:
       image: diffusers/diffusers-flax-tpu
       options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ --privileged
@@ -274,7 +274,7 @@ jobs:
           python -m pytest -n 0 \
             -s -v -k "Flax" \
             --make-reports=tests_flax_tpu \
             --report-log=tests_flax_tpu.log \
             tests/
 
       - name: Failure short reports
@@ -298,11 +298,11 @@ jobs:
 
   run_nightly_onnx_tests:
     name: Nightly ONNXRuntime CUDA tests on Ubuntu
-    runs-on: docker-gpu
+    runs-on: [single-gpu, nvidia-gpu, t4, ci]
     container:
       image: diffusers/diffusers-onnxruntime-cuda
       options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/
 
     steps:
       - name: Checkout diffusers
         uses: actions/checkout@v3
@@ -321,7 +321,7 @@ jobs:
 
       - name: Environment
         run: python utils/print_env.py
 
       - name: Run nightly ONNXRuntime CUDA tests
         env:
           HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
@@ -329,7 +329,7 @@ jobs:
           python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
             -s -v -k "Onnx" \
             --make-reports=tests_onnx_cuda \
             --report-log=tests_onnx_cuda.log \
             tests/
 
       - name: Failure short reports
@@ -344,7 +344,7 @@ jobs:
         with:
           name: ${{ matrix.config.report }}_test_reports
           path: reports
 
       - name: Generate Report and Notify Channel
         if: always()
         run: |
```
.github/workflows/pr_test_fetcher.yml (vendored): 6 changes
```diff
@@ -15,7 +15,7 @@ concurrency:
 jobs:
   setup_pr_tests:
     name: Setup PR Tests
-    runs-on: docker-cpu
+    runs-on: [ self-hosted, intel-cpu, 8-cpu, ci ]
     container:
       image: diffusers/diffusers-pytorch-cpu
       options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/
@@ -73,7 +73,7 @@ jobs:
       max-parallel: 2
       matrix:
         modules: ${{ fromJson(needs.setup_pr_tests.outputs.matrix) }}
-    runs-on: docker-cpu
+    runs-on: [ self-hosted, intel-cpu, 8-cpu, ci ]
     container:
       image: diffusers/diffusers-pytorch-cpu
       options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/
@@ -123,7 +123,7 @@ jobs:
         config:
           - name: Hub tests for models, schedulers, and pipelines
             framework: hub_tests_pytorch
-            runner: docker-cpu
+            runner: [ self-hosted, intel-cpu, 8-cpu, ci ]
             image: diffusers/diffusers-pytorch-cpu
             report: torch_hub
 
```
.github/workflows/push_tests.yml (vendored): 44 changes
```diff
@@ -21,7 +21,9 @@ env:
 jobs:
   setup_torch_cuda_pipeline_matrix:
     name: Setup Torch Pipelines CUDA Slow Tests Matrix
-    runs-on: ubuntu-latest
+    runs-on: [ self-hosted, intel-cpu, 8-cpu, ci ]
+    container:
+      image: diffusers/diffusers-pytorch-cpu
     outputs:
       pipeline_test_matrix: ${{ steps.fetch_pipeline_matrix.outputs.pipeline_test_matrix }}
     steps:
@@ -29,14 +31,13 @@ jobs:
         uses: actions/checkout@v3
         with:
           fetch-depth: 2
-      - name: Set up Python
-        uses: actions/setup-python@v4
-        with:
-          python-version: "3.8"
       - name: Install dependencies
         run: |
-          pip install -e .
-          pip install huggingface_hub
+          python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
+          python -m uv pip install -e [quality,test]
+      - name: Environment
+        run: |
+          python utils/print_env.py
       - name: Fetch Pipeline Matrix
         id: fetch_pipeline_matrix
         run: |
@@ -55,12 +56,13 @@ jobs:
     needs: setup_torch_cuda_pipeline_matrix
     strategy:
       fail-fast: false
+      max-parallel: 8
       matrix:
         module: ${{ fromJson(needs.setup_torch_cuda_pipeline_matrix.outputs.pipeline_test_matrix) }}
     runs-on: [single-gpu, nvidia-gpu, t4, ci]
     container:
       image: diffusers/diffusers-pytorch-cuda
-      options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ --gpus 0 --privileged
+      options: --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface/diffusers:/mnt/cache/ --gpus 0 --privileged
     steps:
       - name: Checkout diffusers
         uses: actions/checkout@v3
@@ -114,10 +116,10 @@ jobs:
 
   torch_cuda_tests:
     name: Torch CUDA Tests
-    runs-on: docker-gpu
+    runs-on: [single-gpu, nvidia-gpu, t4, ci]
     container:
       image: diffusers/diffusers-pytorch-cuda
-      options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ --gpus 0
+      options: --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface/diffusers:/mnt/cache/ --gpus 0
     defaults:
       run:
         shell: bash
@@ -166,10 +168,10 @@ jobs:
 
   peft_cuda_tests:
     name: PEFT CUDA Tests
-    runs-on: docker-gpu
+    runs-on: [single-gpu, nvidia-gpu, t4, ci]
     container:
       image: diffusers/diffusers-pytorch-cuda
-      options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ --gpus 0
+      options: --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface/diffusers:/mnt/cache/ --gpus 0
     defaults:
       run:
         shell: bash
@@ -219,7 +221,7 @@ jobs:
     runs-on: docker-tpu
     container:
       image: diffusers/diffusers-flax-tpu
-      options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ --privileged
+      options: --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ --privileged
     defaults:
       run:
         shell: bash
@@ -263,10 +265,10 @@ jobs:
 
   onnx_cuda_tests:
     name: ONNX CUDA Tests
-    runs-on: docker-gpu
+    runs-on: [single-gpu, nvidia-gpu, t4, ci]
     container:
       image: diffusers/diffusers-onnxruntime-cuda
-      options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ --gpus 0
+      options: --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ --gpus 0
     defaults:
       run:
         shell: bash
@@ -311,11 +313,11 @@ jobs:
   run_torch_compile_tests:
     name: PyTorch Compile CUDA tests
 
-    runs-on: docker-gpu
+    runs-on: [single-gpu, nvidia-gpu, t4, ci]
 
     container:
       image: diffusers/diffusers-pytorch-compile-cuda
-      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/
+      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
 
     steps:
       - name: Checkout diffusers
@@ -352,11 +354,11 @@ jobs:
   run_xformers_tests:
     name: PyTorch xformers CUDA tests
 
-    runs-on: docker-gpu
+    runs-on: [single-gpu, nvidia-gpu, t4, ci]
 
     container:
       image: diffusers/diffusers-pytorch-xformers-cuda
-      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/
+      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
 
     steps:
       - name: Checkout diffusers
@@ -393,11 +395,11 @@ jobs:
   run_examples_tests:
     name: Examples PyTorch CUDA tests on Ubuntu
 
-    runs-on: docker-gpu
+    runs-on: [single-gpu, nvidia-gpu, t4, ci]
 
     container:
       image: diffusers/diffusers-pytorch-cuda
-      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/
+      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
 
     steps:
       - name: Checkout diffusers
```
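Both the nightly and push workflows gate their pipeline jobs on a `Fetch Pipeline Matrix` step whose output feeds `strategy.matrix.module`. The sketch below is a rough, hypothetical illustration of such a step (the script name, module list, and discovery logic are assumptions, not the repository's actual `utils` script): it emits a JSON list to `$GITHUB_OUTPUT` so downstream jobs can read it as `steps.fetch_pipeline_matrix.outputs.pipeline_test_matrix`.

```python
# Hypothetical sketch of a "Fetch Pipeline Matrix" step: print a JSON list of
# test modules and append it to GITHUB_OUTPUT so a later job can consume it via
# ${{ steps.fetch_pipeline_matrix.outputs.pipeline_test_matrix }}.
import json
import os


def main() -> None:
    # In the real workflow the module list is discovered from tests/pipelines/;
    # these names are placeholders.
    modules = ["stable_diffusion", "stable_diffusion_xl", "controlnet"]
    matrix = json.dumps(modules)

    output_file = os.environ.get("GITHUB_OUTPUT")
    if output_file:  # only set when running inside GitHub Actions
        with open(output_file, "a") as f:
            f.write(f"pipeline_test_matrix={matrix}\n")
    print(matrix)


if __name__ == "__main__":
    main()
```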
docs/source/en/_toctree.yml

```diff
@@ -81,16 +81,14 @@
       title: ControlNet
     - local: using-diffusers/t2i_adapter
       title: T2I-Adapter
+    - local: using-diffusers/inference_with_lcm
+      title: Latent Consistency Model
     - local: using-diffusers/textual_inversion_inference
       title: Textual inversion
     - local: using-diffusers/shap-e
       title: Shap-E
     - local: using-diffusers/diffedit
       title: DiffEdit
-    - local: using-diffusers/inference_with_lcm_lora
-      title: Latent Consistency Model-LoRA
-    - local: using-diffusers/inference_with_lcm
-      title: Latent Consistency Model
     - local: using-diffusers/inference_with_tcd_lora
       title: Trajectory Consistency Distillation-LoRA
     - local: using-diffusers/svd
@@ -141,8 +139,6 @@
 - sections:
   - local: optimization/fp16
     title: Speed up inference
-  - local: using-diffusers/distilled_sd
-    title: Distilled Stable Diffusion inference
   - local: optimization/memory
     title: Reduce memory usage
   - local: optimization/torch2.0
```
docs/source/en/api/attnprocessor.md

```diff
@@ -55,3 +55,6 @@ An attention processor is a class for applying different types of attention mechanisms.
 
 ## XFormersAttnProcessor
 [[autodoc]] models.attention_processor.XFormersAttnProcessor
+
+## AttnProcessorNPU
+[[autodoc]] models.attention_processor.AttnProcessorNPU
```
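The hunk above only adds an autodoc stub for `AttnProcessorNPU`. For context, processor classes from `diffusers.models.attention_processor` are swapped onto a model with `set_attn_processor`; the sketch below is illustrative rather than part of this change (the checkpoint and processor choice are assumptions), using `AttnProcessor2_0` as a stand-in, with `AttnProcessorNPU` being the analogous choice on Ascend NPU hardware.

```python
# Illustrative sketch (not part of the diff): swap the attention processor on a
# pipeline's UNet. AttnProcessor2_0 (PyTorch 2.0 scaled dot product attention)
# is used as a stand-in here.
import torch
from diffusers import StableDiffusionPipeline
from diffusers.models.attention_processor import AttnProcessor2_0

pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
).to("cuda")

# Replace the processor on every attention module in the UNet.
pipe.unet.set_attn_processor(AttnProcessor2_0())

image = pipe("a photo of an astronaut riding a horse on mars").images[0]
```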
docs/source/en/optimization/fp16.md

````diff
@@ -12,27 +12,23 @@ specific language governing permissions and limitations under the License.
 
 # Speed up inference
 
-There are several ways to optimize 🤗 Diffusers for inference speed. As a general rule of thumb, we recommend using either [xFormers](xformers) or `torch.nn.functional.scaled_dot_product_attention` in PyTorch 2.0 for their memory-efficient attention.
+There are several ways to optimize Diffusers for inference speed, such as reducing the computational burden by lowering the data precision or using a lightweight distilled model. There are also memory-efficient attention implementations, [xFormers](xformers) and [scaled dot product attention](https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html) in PyTorch 2.0, that reduce memory usage, which also indirectly speeds up inference. Different speed optimizations can be stacked together to get the fastest inference times.
 
-<Tip>
-
-In many cases, optimizing for speed or memory leads to improved performance in the other, so you should try to optimize for both whenever you can. This guide focuses on inference speed, but you can learn more about preserving memory in the [Reduce memory usage](memory) guide.
-
-</Tip>
-
-The results below are obtained from generating a single 512x512 image from the prompt `a photo of an astronaut riding a horse on mars` with 50 DDIM steps on a Nvidia Titan RTX, demonstrating the speed-up you can expect.
-
-|                            | latency | speed-up |
-| ---------------- | ------- | ------- |
-| original                   | 9.50s   | x1       |
-| fp16                       | 3.61s   | x2.63    |
-| channels last              | 3.30s   | x2.88    |
-| traced UNet                | 3.21s   | x2.96    |
-| memory efficient attention | 2.63s   | x3.61    |
-
-## Use TensorFloat-32
-
-On Ampere and later CUDA devices, matrix multiplications and convolutions can use the [TensorFloat-32 (TF32)](https://blogs.nvidia.com/blog/2020/05/14/tensorfloat-32-precision-format/) mode for faster, but slightly less accurate computations. By default, PyTorch enables TF32 mode for convolutions but not matrix multiplications. Unless your network requires full float32 precision, we recommend enabling TF32 for matrix multiplications. It can significantly speeds up computations with typically negligible loss in numerical accuracy.
+> [!TIP]
+> Optimizing for inference speed or reduced memory usage can lead to improved performance in the other category, so you should try to optimize for both whenever you can. This guide focuses on inference speed, but you can learn more about lowering memory usage in the [Reduce memory usage](memory) guide.
+
+The inference times below are obtained from generating a single 512x512 image from the prompt "a photo of an astronaut riding a horse on mars" with 50 DDIM steps on an NVIDIA A100.
+
+| setup    | latency | speed-up |
+|----------|---------|----------|
+| baseline | 5.27s   | x1       |
+| tf32     | 4.14s   | x1.27    |
+| fp16     | 3.51s   | x1.50    |
+| combined | 3.41s   | x1.54    |
+
+## TensorFloat-32
+
+On Ampere and later CUDA devices, matrix multiplications and convolutions can use the [TensorFloat-32 (tf32)](https://blogs.nvidia.com/blog/2020/05/14/tensorfloat-32-precision-format/) mode for faster, but slightly less accurate computations. By default, PyTorch enables tf32 mode for convolutions but not matrix multiplications. Unless your network requires full float32 precision, we recommend enabling tf32 for matrix multiplications. It can significantly speed up computations with typically negligible loss in numerical accuracy.
 
 ```python
 import torch
@@ -40,11 +36,11 @@ import torch
 torch.backends.cuda.matmul.allow_tf32 = True
 ```
 
-You can learn more about TF32 in the [Mixed precision training](https://huggingface.co/docs/transformers/en/perf_train_gpu_one#tf32) guide.
+Learn more about tf32 in the [Mixed precision training](https://huggingface.co/docs/transformers/en/perf_train_gpu_one#tf32) guide.
 
 ## Half-precision weights
 
-To save GPU memory and get more speed, try loading and running the model weights directly in half-precision or float16:
+To save GPU memory and get more speed, set `torch_dtype=torch.float16` to load and run the model weights directly with half-precision weights.
 
 ```Python
 import torch
@@ -56,19 +52,76 @@ pipe = DiffusionPipeline.from_pretrained(
     use_safetensors=True,
 )
 pipe = pipe.to("cuda")
 
-prompt = "a photo of an astronaut riding a horse on mars"
-image = pipe(prompt).images[0]
 ```
 
-<Tip warning={true}>
-
-Don't use [`torch.autocast`](https://pytorch.org/docs/stable/amp.html#torch.autocast) in any of the pipelines as it can lead to black images and is always slower than pure float16 precision.
-
-</Tip>
+> [!WARNING]
+> Don't use [torch.autocast](https://pytorch.org/docs/stable/amp.html#torch.autocast) in any of the pipelines as it can lead to black images and is always slower than pure float16 precision.
 
 ## Distilled model
 
-You could also use a distilled Stable Diffusion model and autoencoder to speed up inference. During distillation, many of the UNet's residual and attention blocks are shed to reduce the model size. The distilled model is faster and uses less memory while generating images of comparable quality to the full Stable Diffusion model.
+You could also use a distilled Stable Diffusion model and autoencoder to speed up inference. During distillation, many of the UNet's residual and attention blocks are shed to reduce the model size by 51% and improve latency on CPU/GPU by 43%. The distilled model is faster and uses less memory while generating images of comparable quality to the full Stable Diffusion model.
 
-Learn more about in the [Distilled Stable Diffusion inference](../using-diffusers/distilled_sd) guide!
+> [!TIP]
+> Read the [Open-sourcing Knowledge Distillation Code and Weights of SD-Small and SD-Tiny](https://huggingface.co/blog/sd_distillation) blog post to learn more about how knowledge distillation training works to produce a faster, smaller, and cheaper generative model.
+
+The inference times below are obtained from generating 4 images from the prompt "a photo of an astronaut riding a horse on mars" with 25 PNDM steps on a NVIDIA A100. Each generation is repeated 3 times with the distilled Stable Diffusion v1.4 model by [Nota AI](https://hf.co/nota-ai).
+
+| setup                        | latency | speed-up |
+|------------------------------|---------|----------|
+| baseline                     | 6.37s   | x1       |
+| distilled                    | 4.18s   | x1.52    |
+| distilled + tiny autoencoder | 3.83s   | x1.66    |
+
+Let's load the distilled Stable Diffusion model and compare it against the original Stable Diffusion model.
+
+```py
+from diffusers import StableDiffusionPipeline
+import torch
+
+distilled = StableDiffusionPipeline.from_pretrained(
+    "nota-ai/bk-sdm-small", torch_dtype=torch.float16, use_safetensors=True,
+).to("cuda")
+prompt = "a golden vase with different flowers"
+generator = torch.manual_seed(2023)
+image = distilled("a golden vase with different flowers", num_inference_steps=25, generator=generator).images[0]
+image
+```
+
+<div class="flex gap-4">
+  <div>
+    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/original_sd.png"/>
+    <figcaption class="mt-2 text-center text-sm text-gray-500">original Stable Diffusion</figcaption>
+  </div>
+  <div>
+    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/distilled_sd.png"/>
+    <figcaption class="mt-2 text-center text-sm text-gray-500">distilled Stable Diffusion</figcaption>
+  </div>
+</div>
+
+### Tiny AutoEncoder
+
+To speed inference up even more, replace the autoencoder with a [distilled version](https://huggingface.co/sayakpaul/taesdxl-diffusers) of it.
+
+```py
+import torch
+from diffusers import AutoencoderTiny, StableDiffusionPipeline
+
+distilled = StableDiffusionPipeline.from_pretrained(
+    "nota-ai/bk-sdm-small", torch_dtype=torch.float16, use_safetensors=True,
+).to("cuda")
+distilled.vae = AutoencoderTiny.from_pretrained(
+    "sayakpaul/taesd-diffusers", torch_dtype=torch.float16, use_safetensors=True,
+).to("cuda")
+
+prompt = "a golden vase with different flowers"
+generator = torch.manual_seed(2023)
+image = distilled("a golden vase with different flowers", num_inference_steps=25, generator=generator).images[0]
+image
+```
+
+<div class="flex justify-center">
+  <div>
+    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/distilled_sd_vae.png" />
+    <figcaption class="mt-2 text-center text-sm text-gray-500">distilled Stable Diffusion + Tiny AutoEncoder</figcaption>
+  </div>
+</div>
````
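The updated guide benchmarks tf32, fp16, and a "combined" setup. A minimal sketch of stacking the two settings from the guide could look like the following; the checkpoint ID and prompt here are illustrative, and measured latencies depend on the GPU.

```python
# Minimal sketch of the "combined" row: tf32 matmuls plus fp16 weights.
import torch
from diffusers import DiffusionPipeline

torch.backends.cuda.matmul.allow_tf32 = True  # TensorFloat-32 for matrix multiplications

pipe = DiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
    torch_dtype=torch.float16,  # half-precision weights
    use_safetensors=True,
).to("cuda")

image = pipe(
    "a photo of an astronaut riding a horse on mars", num_inference_steps=50
).images[0]
```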
docs/source/en/using-diffusers/distilled_sd.md (deleted)

````diff
@@ -1,133 +0,0 @@
-<!--Copyright 2024 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
--->
-
-# Distilled Stable Diffusion inference
-
-[[open-in-colab]]
-
-Stable Diffusion inference can be a computationally intensive process because it must iteratively denoise the latents to generate an image. To reduce the computational burden, you can use a *distilled* version of the Stable Diffusion model from [Nota AI](https://huggingface.co/nota-ai). The distilled version of their Stable Diffusion model eliminates some of the residual and attention blocks from the UNet, reducing the model size by 51% and improving latency on CPU/GPU by 43%.
-
-<Tip>
-
-Read this [blog post](https://huggingface.co/blog/sd_distillation) to learn more about how knowledge distillation training works to produce a faster, smaller, and cheaper generative model.
-
-</Tip>
-
-Let's load the distilled Stable Diffusion model and compare it against the original Stable Diffusion model:
-
-```py
-from diffusers import StableDiffusionPipeline
-import torch
-
-distilled = StableDiffusionPipeline.from_pretrained(
-    "nota-ai/bk-sdm-small", torch_dtype=torch.float16, use_safetensors=True,
-).to("cuda")
-
-original = StableDiffusionPipeline.from_pretrained(
-    "CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16, use_safetensors=True,
-).to("cuda")
-```
-
-Given a prompt, get the inference time for the original model:
-
-```py
-import time
-
-seed = 2023
-generator = torch.manual_seed(seed)
-
-NUM_ITERS_TO_RUN = 3
-NUM_INFERENCE_STEPS = 25
-NUM_IMAGES_PER_PROMPT = 4
-
-prompt = "a golden vase with different flowers"
-
-start = time.time_ns()
-for _ in range(NUM_ITERS_TO_RUN):
-    images = original(
-        prompt,
-        num_inference_steps=NUM_INFERENCE_STEPS,
-        generator=generator,
-        num_images_per_prompt=NUM_IMAGES_PER_PROMPT
-    ).images
-end = time.time_ns()
-original_sd = f"{(end - start) / 1e6:.1f}"
-
-print(f"Execution time -- {original_sd} ms\n")
-"Execution time -- 45781.5 ms"
-```
-
-Time the distilled model inference:
-
-```py
-start = time.time_ns()
-for _ in range(NUM_ITERS_TO_RUN):
-    images = distilled(
-        prompt,
-        num_inference_steps=NUM_INFERENCE_STEPS,
-        generator=generator,
-        num_images_per_prompt=NUM_IMAGES_PER_PROMPT
-    ).images
-end = time.time_ns()
-
-distilled_sd = f"{(end - start) / 1e6:.1f}"
-print(f"Execution time -- {distilled_sd} ms\n")
-"Execution time -- 29884.2 ms"
-```
-
-<div class="flex gap-4">
-  <div>
-    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/original_sd.png"/>
-    <figcaption class="mt-2 text-center text-sm text-gray-500">original Stable Diffusion (45781.5 ms)</figcaption>
-  </div>
-  <div>
-    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/distilled_sd.png"/>
-    <figcaption class="mt-2 text-center text-sm text-gray-500">distilled Stable Diffusion (29884.2 ms)</figcaption>
-  </div>
-</div>
-
-## Tiny AutoEncoder
-
-To speed inference up even more, use a tiny distilled version of the [Stable Diffusion VAE](https://huggingface.co/sayakpaul/taesdxl-diffusers) to denoise the latents into images. Replace the VAE in the distilled Stable Diffusion model with the tiny VAE:
-
-```py
-from diffusers import AutoencoderTiny
-
-distilled.vae = AutoencoderTiny.from_pretrained(
-    "sayakpaul/taesd-diffusers", torch_dtype=torch.float16, use_safetensors=True,
-).to("cuda")
-```
-
-Time the distilled model and distilled VAE inference:
-
-```py
-start = time.time_ns()
-for _ in range(NUM_ITERS_TO_RUN):
-    images = distilled(
-        prompt,
-        num_inference_steps=NUM_INFERENCE_STEPS,
-        generator=generator,
-        num_images_per_prompt=NUM_IMAGES_PER_PROMPT
-    ).images
-end = time.time_ns()
-
-distilled_tiny_sd = f"{(end - start) / 1e6:.1f}"
-print(f"Execution time -- {distilled_tiny_sd} ms\n")
-"Execution time -- 27165.7 ms"
-```
-
-<div class="flex justify-center">
-  <div>
-    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/distilled_sd_vae.png" />
-    <figcaption class="mt-2 text-center text-sm text-gray-500">distilled Stable Diffusion + Tiny AutoEncoder (27165.7 ms)</figcaption>
-  </div>
-</div>
````
@@ -10,29 +10,30 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
|
|||||||
specific language governing permissions and limitations under the License.
|
specific language governing permissions and limitations under the License.
|
||||||
-->
|
-->
|
||||||
|
|
||||||
[[open-in-colab]]
|
|
||||||
|
|
||||||
# Latent Consistency Model
|
# Latent Consistency Model
|
||||||
|
|
||||||
Latent Consistency Models (LCM) enable quality image generation in typically 2-4 steps making it possible to use diffusion models in almost real-time settings.
|
[[open-in-colab]]
|
||||||
|
|
||||||
From the [official website](https://latent-consistency-models.github.io/):
|
[Latent Consistency Models (LCMs)](https://hf.co/papers/2310.04378) enable fast high-quality image generation by directly predicting the reverse diffusion process in the latent rather than pixel space. In other words, LCMs try to predict the noiseless image from the noisy image in contrast to typical diffusion models that iteratively remove noise from the noisy image. By avoiding the iterative sampling process, LCMs are able to generate high-quality images in 2-4 steps instead of 20-30 steps.
|
||||||
|
|
||||||
> LCMs can be distilled from any pre-trained Stable Diffusion (SD) in only 4,000 training steps (~32 A100 GPU Hours) for generating high quality 768 x 768 resolution images in 2~4 steps or even one step, significantly accelerating text-to-image generation. We employ LCM to distill the Dreamshaper-V7 version of SD in just 4,000 training iterations.
|
LCMs are distilled from pretrained models which requires ~32 hours of A100 compute. To speed this up, [LCM-LoRAs](https://hf.co/papers/2311.05556) train a [LoRA adapter](https://huggingface.co/docs/peft/conceptual_guides/adapter#low-rank-adaptation-lora) which have much fewer parameters to train compared to the full model. The LCM-LoRA can be plugged into a diffusion model once it has been trained.
|
||||||
|
|
||||||
For a more technical overview of LCMs, refer to [the paper](https://huggingface.co/papers/2310.04378).
|
This guide will show you how to use LCMs and LCM-LoRAs for fast inference on tasks and how to use them with other adapters like ControlNet or T2I-Adapter.
|
||||||
|
|
||||||
LCM distilled models are available for [stable-diffusion-v1-5](https://huggingface.co/runwayml/stable-diffusion-v1-5), [stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0), and the [SSD-1B](https://huggingface.co/segmind/SSD-1B) model. All the checkpoints can be found in this [collection](https://huggingface.co/collections/latent-consistency/latent-consistency-models-weights-654ce61a95edd6dffccef6a8).
|
> [!TIP]
|
||||||
|
> LCMs and LCM-LoRAs are available for Stable Diffusion v1.5, Stable Diffusion XL, and the SSD-1B model. You can find their checkpoints on the [Latent Consistency](https://hf.co/collections/latent-consistency/latent-consistency-models-weights-654ce61a95edd6dffccef6a8) Collections.
|
||||||
This guide shows how to perform inference with LCMs for
|
|
||||||
- text-to-image
|
|
||||||
- image-to-image
|
|
||||||
- combined with style LoRAs
|
|
||||||
- ControlNet/T2I-Adapter
|
|
||||||
|
|
||||||
## Text-to-image
|
## Text-to-image
|
||||||
|
|
||||||
You'll use the [`StableDiffusionXLPipeline`] pipeline with the [`LCMScheduler`] and then load the LCM-LoRA. Together with the LCM-LoRA and the scheduler, the pipeline enables a fast inference workflow, overcoming the slow iterative nature of diffusion models.
|
<hfoptions id="lcm-text2img">
|
||||||
|
<hfoption id="LCM">
|
||||||
|
|
||||||
|
To use LCMs, you need to load the LCM checkpoint for your supported model into [`UNet2DConditionModel`] and replace the scheduler with the [`LCMScheduler`]. Then you can use the pipeline as usual, and pass a text prompt to generate an image in just 4 steps.
|
||||||
|
|
||||||
|
A couple of notes to keep in mind when using LCMs are:
|
||||||
|
|
||||||
|
* Typically, batch size is doubled inside the pipeline for classifier-free guidance. But LCM applies guidance with guidance embeddings and doesn't need to double the batch size, which leads to faster inference. The downside is that negative prompts don't work with LCM because they don't have any effect on the denoising process.
|
||||||
|
* The ideal range for `guidance_scale` is [3., 13.] because that is what the UNet was trained with. However, disabling `guidance_scale` with a value of 1.0 is also effective in most cases.
|
||||||
|
|
||||||
```python
|
```python
|
||||||
from diffusers import StableDiffusionXLPipeline, UNet2DConditionModel, LCMScheduler
|
from diffusers import StableDiffusionXLPipeline, UNet2DConditionModel, LCMScheduler
|
||||||
@@ -49,31 +50,69 @@ pipe = StableDiffusionXLPipeline.from_pretrained(
|
|||||||
pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config)
|
pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config)
|
||||||
|
|
||||||
prompt = "Self-portrait oil painting, a beautiful cyborg with golden hair, 8k"
|
prompt = "Self-portrait oil painting, a beautiful cyborg with golden hair, 8k"
|
||||||
|
|
||||||
generator = torch.manual_seed(0)
|
generator = torch.manual_seed(0)
|
||||||
image = pipe(
|
image = pipe(
|
||||||
prompt=prompt, num_inference_steps=4, generator=generator, guidance_scale=8.0
|
prompt=prompt, num_inference_steps=4, generator=generator, guidance_scale=8.0
|
||||||
).images[0]
|
).images[0]
|
||||||
|
image
|
||||||
```
|
```
|
||||||
|
|
||||||

|
<div class="flex justify-center">
|
||||||
|
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/lcm/lcm_full_sdxl_t2i.png"/>
|
||||||
|
</div>
|
||||||
|
|
||||||
Notice that we use only 4 steps for generation which is way less than what's typically used for standard SDXL.
|
</hfoption>
|
||||||
|
<hfoption id="LCM-LoRA">
|
||||||
|
|
||||||
Some details to keep in mind:
|
To use LCM-LoRAs, you need to replace the scheduler with the [`LCMScheduler`] and load the LCM-LoRA weights with the [`~loaders.LoraLoaderMixin.load_lora_weights`] method. Then you can use the pipeline as usual, and pass a text prompt to generate an image in just 4 steps.
|
||||||
|
|
||||||
* To perform classifier-free guidance, batch size is usually doubled inside the pipeline. LCM, however, applies guidance using guidance embeddings, so the batch size does not have to be doubled in this case. This leads to a faster inference time, with the drawback that negative prompts don't have any effect on the denoising process.
|
A couple of notes to keep in mind when using LCM-LoRAs are:
|
||||||
* The UNet was trained using the [3., 13.] guidance scale range. So, that is the ideal range for `guidance_scale`. However, disabling `guidance_scale` using a value of 1.0 is also effective in most cases.
|
|
||||||
|
|
||||||
|
* Typically, batch size is doubled inside the pipeline for classifier-free guidance. But LCM applies guidance with guidance embeddings and doesn't need to double the batch size, which leads to faster inference. The downside is that negative prompts don't work with LCM because they don't have any effect on the denoising process.
|
||||||
|
* You could use guidance with LCM-LoRAs, but it is very sensitive to high `guidance_scale` values and can lead to artifacts in the generated image. The best values we've found are between [1.0, 2.0].
|
||||||
|
* Replace [stabilityai/stable-diffusion-xl-base-1.0](https://hf.co/stabilityai/stable-diffusion-xl-base-1.0) with any finetuned model. For example, try using the [animagine-xl](https://huggingface.co/Linaqruf/animagine-xl) checkpoint to generate anime images with SDXL.
|
||||||
|
|
||||||
|
```py
|
||||||
|
import torch
|
||||||
|
from diffusers import DiffusionPipeline, LCMScheduler
|
||||||
|
|
||||||
|
pipe = DiffusionPipeline.from_pretrained(
|
||||||
|
"stabilityai/stable-diffusion-xl-base-1.0",
|
||||||
|
variant="fp16",
|
||||||
|
torch_dtype=torch.float16
|
||||||
|
).to("cuda")
|
||||||
|
pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config)
|
||||||
|
pipe.load_lora_weights("latent-consistency/lcm-lora-sdxl")
|
||||||
|
|
||||||
|
prompt = "Self-portrait oil painting, a beautiful cyborg with golden hair, 8k"
|
||||||
|
generator = torch.manual_seed(42)
|
||||||
|
image = pipe(
|
||||||
|
prompt=prompt, num_inference_steps=4, generator=generator, guidance_scale=1.0
|
||||||
|
).images[0]
|
||||||
|
image
|
||||||
|
```
|
||||||
|
|
||||||
|
<div class="flex justify-center">
|
||||||
|
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/lcm/lcm_sdxl_t2i.png"/>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
</hfoption>
|
||||||
|
</hfoptions>
|
||||||
|
|
||||||
## Image-to-image
|
## Image-to-image
|
||||||
|
|
||||||
LCMs can be applied to image-to-image tasks too. For this example, we'll use the [LCM_Dreamshaper_v7](https://huggingface.co/SimianLuo/LCM_Dreamshaper_v7) model, but the same steps can be applied to other LCM models as well.
|
<hfoptions id="lcm-img2img">
|
||||||
|
<hfoption id="LCM">
|
||||||
|
|
||||||
|
To use LCMs for image-to-image, you need to load the LCM checkpoint for your supported model into [`UNet2DConditionModel`] and replace the scheduler with the [`LCMScheduler`]. Then you can use the pipeline as usual, and pass a text prompt and initial image to generate an image in just 4 steps.
|
||||||
|
|
||||||
|
> [!TIP]
|
||||||
|
> Experiment with different values for `num_inference_steps`, `strength`, and `guidance_scale` to get the best results.
|
||||||
|
|
||||||
```python
|
```python
|
||||||
import torch
|
import torch
|
||||||
from diffusers import AutoPipelineForImage2Image, UNet2DConditionModel, LCMScheduler
|
from diffusers import AutoPipelineForImage2Image, UNet2DConditionModel, LCMScheduler
|
||||||
from diffusers.utils import make_image_grid, load_image
|
from diffusers.utils import load_image
|
||||||
|
|
||||||
unet = UNet2DConditionModel.from_pretrained(
|
unet = UNet2DConditionModel.from_pretrained(
|
||||||
"SimianLuo/LCM_Dreamshaper_v7",
|
"SimianLuo/LCM_Dreamshaper_v7",
|
||||||
@@ -89,12 +128,8 @@ pipe = AutoPipelineForImage2Image.from_pretrained(
|
|||||||
).to("cuda")
|
).to("cuda")
|
||||||
pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config)
|
pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config)
|
||||||
|
|
||||||
# prepare image
|
init_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/img2img-init.png")
|
||||||
url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/img2img-init.png"
|
|
||||||
init_image = load_image(url)
|
|
||||||
prompt = "Astronauts in a jungle, cold color palette, muted colors, detailed, 8k"
|
prompt = "Astronauts in a jungle, cold color palette, muted colors, detailed, 8k"
|
||||||
|
|
||||||
# pass prompt and image to pipeline
|
|
||||||
generator = torch.manual_seed(0)
|
generator = torch.manual_seed(0)
|
||||||
image = pipe(
|
image = pipe(
|
||||||
prompt,
|
prompt,
|
||||||
@@ -104,22 +139,130 @@ image = pipe(
|
|||||||
strength=0.5,
|
strength=0.5,
|
||||||
generator=generator
|
generator=generator
|
||||||
).images[0]
|
).images[0]
|
||||||
make_image_grid([init_image, image], rows=1, cols=2)
|
image
|
||||||
```
|
```
|
||||||
|
|
||||||

|
<div class="flex gap-4">
|
||||||
|
<div>
|
||||||
|
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/img2img-init.png"/>
|
||||||
|
<figcaption class="mt-2 text-center text-sm text-gray-500">initial image</figcaption>
|
||||||
|
</div>
|
||||||
|
<div>
|
||||||
|
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/lcm-img2img.png"/>
|
||||||
|
<figcaption class="mt-2 text-center text-sm text-gray-500">generated image</figcaption>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
</hfoption>
|
||||||
|
<hfoption id="LCM-LoRA">
|
||||||
|
|
||||||
<Tip>
|
To use LCM-LoRAs for image-to-image, you need to replace the scheduler with the [`LCMScheduler`] and load the LCM-LoRA weights with the [`~loaders.LoraLoaderMixin.load_lora_weights`] method. Then you can use the pipeline as usual, and pass a text prompt and initial image to generate an image in just 4 steps.
|
||||||
|
|
||||||
You can get different results based on your prompt and the image you provide. To get the best results, we recommend trying different values for `num_inference_steps`, `strength`, and `guidance_scale` parameters and choose the best one.
|
> [!TIP]
|
||||||
|
> Experiment with different values for `num_inference_steps`, `strength`, and `guidance_scale` to get the best results.
|
||||||
|
|
||||||
</Tip>
|
```py
|
||||||
|
import torch
|
||||||
|
from diffusers import AutoPipelineForImage2Image, LCMScheduler
|
||||||
|
from diffusers.utils import make_image_grid, load_image
|
||||||
|
|
||||||
|
pipe = AutoPipelineForImage2Image.from_pretrained(
|
||||||
|
"Lykon/dreamshaper-7",
|
||||||
|
torch_dtype=torch.float16,
|
||||||
|
variant="fp16",
|
||||||
|
).to("cuda")
|
||||||
|
|
||||||
## Combine with style LoRAs
|
pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config)
|
||||||
|
|
||||||
LCMs can be used with other styled LoRAs to generate styled-images in very few steps (4-8). In the following example, we'll use the [papercut LoRA](TheLastBen/Papercut_SDXL).
|
pipe.load_lora_weights("latent-consistency/lcm-lora-sdv1-5")
|
||||||
|
|
||||||
|
init_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/img2img-init.png")
|
||||||
|
prompt = "Astronauts in a jungle, cold color palette, muted colors, detailed, 8k"
|
||||||
|
|
||||||
|
generator = torch.manual_seed(0)
|
||||||
|
image = pipe(
|
||||||
|
prompt,
|
||||||
|
image=init_image,
|
||||||
|
num_inference_steps=4,
|
||||||
|
guidance_scale=1,
|
||||||
|
strength=0.6,
|
||||||
|
generator=generator
|
||||||
|
).images[0]
|
||||||
|
image
|
||||||
|
```
|
||||||
|
|
||||||
|
<div class="flex gap-4">
|
||||||
|
<div>
|
||||||
|
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/img2img-init.png"/>
|
||||||
|
<figcaption class="mt-2 text-center text-sm text-gray-500">initial image</figcaption>
|
||||||
|
</div>
|
||||||
|
<div>
|
||||||
|
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/lcm-lora-img2img.png"/>
|
||||||
|
<figcaption class="mt-2 text-center text-sm text-gray-500">generated image</figcaption>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
</hfoption>
|
||||||
|
</hfoptions>
|
||||||
|
|
||||||
|
## Inpainting
|
||||||
|
|
||||||
|
To use LCM-LoRAs for inpainting, you need to replace the scheduler with the [`LCMScheduler`] and load the LCM-LoRA weights with the [`~loaders.LoraLoaderMixin.load_lora_weights`] method. Then you can use the pipeline as usual, and pass a text prompt, initial image, and mask image to generate an image in just 4 steps.
|
||||||
|
|
||||||
|
```py
|
||||||
|
import torch
|
||||||
|
from diffusers import AutoPipelineForInpainting, LCMScheduler
|
||||||
|
from diffusers.utils import load_image, make_image_grid
|
||||||
|
|
||||||
|
pipe = AutoPipelineForInpainting.from_pretrained(
|
||||||
|
"runwayml/stable-diffusion-inpainting",
|
||||||
|
torch_dtype=torch.float16,
|
||||||
|
variant="fp16",
|
||||||
|
).to("cuda")
|
||||||
|
|
||||||
|
pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config)
|
||||||
|
|
||||||
|
pipe.load_lora_weights("latent-consistency/lcm-lora-sdv1-5")
|
||||||
|
|
||||||
|
init_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint.png")
|
||||||
|
mask_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint_mask.png")
|
||||||
|
|
||||||
|
prompt = "concept art digital painting of an elven castle, inspired by lord of the rings, highly detailed, 8k"
|
||||||
|
generator = torch.manual_seed(0)
|
||||||
|
image = pipe(
|
||||||
|
prompt=prompt,
|
||||||
|
image=init_image,
|
||||||
|
mask_image=mask_image,
|
||||||
|
generator=generator,
|
||||||
|
num_inference_steps=4,
|
||||||
|
guidance_scale=4,
|
||||||
|
).images[0]
|
||||||
|
image
|
||||||
|
```
|
||||||
|
|
||||||
|
<div class="flex gap-4">
|
||||||
|
<div>
|
||||||
|
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint.png"/>
|
||||||
|
<figcaption class="mt-2 text-center text-sm text-gray-500">initial image</figcaption>
|
||||||
|
</div>
|
||||||
|
<div>
|
||||||
|
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/lcm-lora-inpaint.png"/>
|
||||||
|
<figcaption class="mt-2 text-center text-sm text-gray-500">generated image</figcaption>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
## Adapters
|
||||||
|
|
||||||
|
LCMs are compatible with adapters like LoRA, ControlNet, T2I-Adapter, and AnimateDiff. You can bring the speed of LCMs to these adapters to generate images in a certain style or condition the model on another input like a canny image.
|
||||||
|
|
||||||
|
### LoRA
|
||||||
|
|
||||||
|
[LoRA](../using-diffusers/loading_adapters#lora) adapters can be rapidly finetuned to learn a new style from just a few images and plugged into a pretrained model to generate images in that style.
|
||||||
|
|
||||||
|
<hfoptions id="lcm-lora">
|
||||||
|
<hfoption id="LCM">
|
||||||
|
|
||||||
|
Load the LCM checkpoint for your supported model into [`UNet2DConditionModel`] and replace the scheduler with the [`LCMScheduler`]. Then you can use the [`~loaders.LoraLoaderMixin.load_lora_weights`] method to load the LoRA weights into the LCM and generate a styled image in a few steps.
|
||||||
|
|
||||||

```python
from diffusers import StableDiffusionXLPipeline, UNet2DConditionModel, LCMScheduler
@@ -134,11 +277,9 @@ pipe = StableDiffusionXLPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0", unet=unet, torch_dtype=torch.float16, variant="fp16",
).to("cuda")
pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config)

pipe.load_lora_weights("TheLastBen/Papercut_SDXL", weight_name="papercut.safetensors", adapter_name="papercut")

prompt = "papercut, a cute fox"

generator = torch.manual_seed(0)
image = pipe(
    prompt=prompt, num_inference_steps=4, generator=generator, guidance_scale=8.0
@@ -146,15 +287,58 @@ image = pipe(
image
```

<div class="flex justify-center">
    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/lcm/lcm_full_sdx_lora_mix.png"/>
</div>

</hfoption>
<hfoption id="LCM-LoRA">

Replace the scheduler with the [`LCMScheduler`]. Then you can use the [`~loaders.LoraLoaderMixin.load_lora_weights`] method to load the LCM-LoRA weights and the style LoRA you want to use. Combine both LoRA adapters with the [`~loaders.UNet2DConditionLoadersMixin.set_adapters`] method and generate a styled image in a few steps.

```py
import torch
from diffusers import DiffusionPipeline, LCMScheduler

pipe = DiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",
    variant="fp16",
    torch_dtype=torch.float16
).to("cuda")

pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config)

pipe.load_lora_weights("latent-consistency/lcm-lora-sdxl", adapter_name="lcm")
pipe.load_lora_weights("TheLastBen/Papercut_SDXL", weight_name="papercut.safetensors", adapter_name="papercut")

pipe.set_adapters(["lcm", "papercut"], adapter_weights=[1.0, 0.8])

prompt = "papercut, a cute fox"
generator = torch.manual_seed(0)
image = pipe(prompt, num_inference_steps=4, guidance_scale=1, generator=generator).images[0]
image
```

<div class="flex justify-center">
    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/lcm/lcm_sdx_lora_mix.png"/>
</div>

</hfoption>
</hfoptions>

### ControlNet

[ControlNet](./controlnet) adapters can be trained on a variety of inputs like canny edge, pose estimation, or depth. The ControlNet can be inserted into the pipeline to provide additional conditioning and control to the model for more accurate generation.

You can find additional ControlNet models trained on other inputs in [lllyasviel's](https://hf.co/lllyasviel) repository.

<hfoptions id="lcm-controlnet">
<hfoption id="LCM">

Load a ControlNet model trained on canny images and pass it to the [`ControlNetModel`]. Then you can load an LCM model into [`StableDiffusionControlNetPipeline`] and replace the scheduler with the [`LCMScheduler`]. Now pass the canny image to the pipeline and generate an image.

> [!TIP]
> Experiment with different values for `num_inference_steps`, `controlnet_conditioning_scale`, `cross_attention_kwargs`, and `guidance_scale` to get the best results.

```python
import torch
@@ -186,8 +370,6 @@ pipe = StableDiffusionControlNetPipeline.from_pretrained(
    torch_dtype=torch.float16,
    safety_checker=None,
).to("cuda")

pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config)

generator = torch.manual_seed(0)
@@ -200,16 +382,84 @@ image = pipe(
make_image_grid([canny_image, image], rows=1, cols=2)
```

<div class="flex justify-center">
    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/lcm/lcm_full_sdv1-5_controlnet.png"/>
</div>

</hfoption>
<hfoption id="LCM-LoRA">

Load a ControlNet model trained on canny images and pass it to the [`ControlNetModel`]. Then you can load a Stable Diffusion v1.5 model into [`StableDiffusionControlNetPipeline`] and replace the scheduler with the [`LCMScheduler`]. Use the [`~loaders.LoraLoaderMixin.load_lora_weights`] method to load the LCM-LoRA weights, and pass the canny image to the pipeline and generate an image.

> [!TIP]
> Experiment with different values for `num_inference_steps`, `controlnet_conditioning_scale`, `cross_attention_kwargs`, and `guidance_scale` to get the best results.

```py
import torch
import cv2
import numpy as np
from PIL import Image

from diffusers import StableDiffusionControlNetPipeline, ControlNetModel, LCMScheduler
from diffusers.utils import load_image

image = load_image(
    "https://hf.co/datasets/huggingface/documentation-images/resolve/main/diffusers/input_image_vermeer.png"
).resize((512, 512))

image = np.array(image)

low_threshold = 100
high_threshold = 200

image = cv2.Canny(image, low_threshold, high_threshold)
image = image[:, :, None]
image = np.concatenate([image, image, image], axis=2)
canny_image = Image.fromarray(image)

controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny", torch_dtype=torch.float16)
pipe = StableDiffusionControlNetPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
    controlnet=controlnet,
    torch_dtype=torch.float16,
    safety_checker=None,
    variant="fp16"
).to("cuda")

pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config)

pipe.load_lora_weights("latent-consistency/lcm-lora-sdv1-5")

generator = torch.manual_seed(0)
image = pipe(
    "the mona lisa",
    image=canny_image,
    num_inference_steps=4,
    guidance_scale=1.5,
    controlnet_conditioning_scale=0.8,
    cross_attention_kwargs={"scale": 1},
    generator=generator,
).images[0]
image
```

<div class="flex justify-center">
    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/lcm/lcm_sdv1-5_controlnet.png"/>
</div>

</hfoption>
</hfoptions>

### T2I-Adapter

[T2I-Adapter](./t2i_adapter) is an even more lightweight adapter than ControlNet that provides an additional input to condition a pretrained model with. It is faster than ControlNet but the results may be slightly worse.

You can find additional T2I-Adapter checkpoints trained on other inputs in [TencentArc's](https://hf.co/TencentARC) repository.

<hfoptions id="lcm-t2i">
<hfoption id="LCM">

Load a T2IAdapter trained on canny images and pass it to the [`StableDiffusionXLAdapterPipeline`]. Then load an LCM checkpoint into [`UNet2DConditionModel`] and replace the scheduler with the [`LCMScheduler`]. Now pass the canny image to the pipeline and generate an image.

```python
import torch
@@ -220,10 +470,9 @@ from PIL import Image
from diffusers import StableDiffusionXLAdapterPipeline, UNet2DConditionModel, T2IAdapter, LCMScheduler
from diffusers.utils import load_image, make_image_grid

# detect the canny map in low resolution to avoid high-frequency details
image = load_image(
    "https://hf.co/datasets/huggingface/documentation-images/resolve/main/diffusers/input_image_vermeer.png"
).resize((384, 384))

image = np.array(image)
@@ -236,7 +485,6 @@ image = image[:, :, None]
image = np.concatenate([image, image, image], axis=2)
canny_image = Image.fromarray(image).resize((1024, 1216))

adapter = T2IAdapter.from_pretrained("TencentARC/t2i-adapter-canny-sdxl-1.0", torch_dtype=torch.float16, variant="fp16").to("cuda")

unet = UNet2DConditionModel.from_pretrained(
@@ -254,7 +502,7 @@ pipe = StableDiffusionXLAdapterPipeline.from_pretrained(

pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config)

prompt = "the mona lisa, 4k picture, high quality"
negative_prompt = "extra digit, fewer digits, cropped, worst quality, low quality, glitch, deformed, mutated, ugly, disfigured"

generator = torch.manual_seed(0)
@@ -268,7 +516,116 @@ image = pipe(
    adapter_conditioning_factor=1,
    generator=generator,
).images[0]
```

<div class="flex justify-center">
    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/lcm-t2i.png"/>
</div>

</hfoption>
<hfoption id="LCM-LoRA">

Load a T2IAdapter trained on canny images and pass it to the [`StableDiffusionXLAdapterPipeline`]. Replace the scheduler with the [`LCMScheduler`], and use the [`~loaders.LoraLoaderMixin.load_lora_weights`] method to load the LCM-LoRA weights. Pass the canny image to the pipeline and generate an image.

```py
import torch
import cv2
import numpy as np
from PIL import Image

from diffusers import StableDiffusionXLAdapterPipeline, UNet2DConditionModel, T2IAdapter, LCMScheduler
from diffusers.utils import load_image, make_image_grid

# detect the canny map in low resolution to avoid high-frequency details
image = load_image(
    "https://hf.co/datasets/huggingface/documentation-images/resolve/main/diffusers/input_image_vermeer.png"
).resize((384, 384))

image = np.array(image)

low_threshold = 100
high_threshold = 200

image = cv2.Canny(image, low_threshold, high_threshold)
image = image[:, :, None]
image = np.concatenate([image, image, image], axis=2)
canny_image = Image.fromarray(image).resize((1024, 1024))

adapter = T2IAdapter.from_pretrained("TencentARC/t2i-adapter-canny-sdxl-1.0", torch_dtype=torch.float16, variant="fp16").to("cuda")

pipe = StableDiffusionXLAdapterPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",
    adapter=adapter,
    torch_dtype=torch.float16,
    variant="fp16",
).to("cuda")

pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config)

pipe.load_lora_weights("latent-consistency/lcm-lora-sdxl")

prompt = "the mona lisa, 4k picture, high quality"
negative_prompt = "extra digit, fewer digits, cropped, worst quality, low quality, glitch, deformed, mutated, ugly, disfigured"

generator = torch.manual_seed(0)
image = pipe(
    prompt=prompt,
    negative_prompt=negative_prompt,
    image=canny_image,
    num_inference_steps=4,
    guidance_scale=1.5,
    adapter_conditioning_scale=0.8,
    adapter_conditioning_factor=1,
    generator=generator,
).images[0]
```

<div class="flex justify-center">
    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/lcm-lora-t2i.png"/>
</div>

</hfoption>
</hfoptions>

### AnimateDiff

[AnimateDiff](../api/pipelines/animatediff) is an adapter that adds motion to an image. It can be used with most Stable Diffusion models, effectively turning them into "video generation" models. Generating good results with a video model usually requires generating multiple frames (16-24), which can be very slow with a regular Stable Diffusion model. LCM-LoRA can speed up this process by only taking 4-8 steps for each frame.

Load an [`AnimateDiffPipeline`] and pass a [`MotionAdapter`] to it. Then replace the scheduler with the [`LCMScheduler`], and combine both LoRA adapters with the [`~loaders.UNet2DConditionLoadersMixin.set_adapters`] method. Now you can pass a prompt to the pipeline and generate an animated image.

```py
import torch
from diffusers import MotionAdapter, AnimateDiffPipeline, DDIMScheduler, LCMScheduler
from diffusers.utils import export_to_gif

adapter = MotionAdapter.from_pretrained("guoyww/animatediff-motion-adapter-v1-5")
pipe = AnimateDiffPipeline.from_pretrained(
    "frankjoshua/toonyou_beta6",
    motion_adapter=adapter,
).to("cuda")

# set scheduler
pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config)

# load LCM-LoRA
pipe.load_lora_weights("latent-consistency/lcm-lora-sdv1-5", adapter_name="lcm")
pipe.load_lora_weights("guoyww/animatediff-motion-lora-zoom-in", weight_name="diffusion_pytorch_model.safetensors", adapter_name="motion-lora")

pipe.set_adapters(["lcm", "motion-lora"], adapter_weights=[0.55, 1.2])

prompt = "best quality, masterpiece, 1girl, looking at viewer, blurry background, upper body, contemporary, dress"
generator = torch.manual_seed(0)
frames = pipe(
    prompt=prompt,
    num_inference_steps=5,
    guidance_scale=1.25,
    cross_attention_kwargs={"scale": 1},
    num_frames=24,
    generator=generator
).frames[0]
export_to_gif(frames, "animation.gif")
```

<div class="flex justify-center">
    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/lcm-lora-animatediff.gif"/>
</div>
@@ -1,422 +0,0 @@

<!--Copyright 2024 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
-->

[[open-in-colab]]

# Performing inference with LCM-LoRA

Latent Consistency Models (LCM) enable quality image generation in typically 2-4 steps, making it possible to use diffusion models in almost real-time settings.

From the [official website](https://latent-consistency-models.github.io/):

> LCMs can be distilled from any pre-trained Stable Diffusion (SD) in only 4,000 training steps (~32 A100 GPU Hours) for generating high quality 768 x 768 resolution images in 2~4 steps or even one step, significantly accelerating text-to-image generation. We employ LCM to distill the Dreamshaper-V7 version of SD in just 4,000 training iterations.

For a more technical overview of LCMs, refer to [the paper](https://huggingface.co/papers/2310.04378).

However, each model needs to be distilled separately for latent consistency distillation. The core idea with LCM-LoRA is to train just a few adapter layers, the adapter being LoRA in this case. This way, we don't have to train the full model and can keep the number of trainable parameters manageable. The resulting LoRAs can then be applied to any fine-tuned version of the model without distilling them separately. Additionally, the LoRAs can be applied to image-to-image, ControlNet/T2I-Adapter, inpainting, AnimateDiff etc. The LCM-LoRA can also be combined with other LoRAs to generate styled images in very few steps (4-8).

LCM-LoRAs are available for [stable-diffusion-v1-5](https://huggingface.co/runwayml/stable-diffusion-v1-5), [stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0), and the [SSD-1B](https://huggingface.co/segmind/SSD-1B) model. All the checkpoints can be found in this [collection](https://huggingface.co/collections/latent-consistency/latent-consistency-models-loras-654cdd24e111e16f0865fba6).

For more details about LCM-LoRA, refer to [the technical report](https://huggingface.co/papers/2311.05556).

This guide shows how to perform inference with LCM-LoRAs for
- text-to-image
- image-to-image
- combined with styled LoRAs
- ControlNet/T2I-Adapter
- inpainting
- AnimateDiff

Before going through this guide, we'll take a look at the general workflow for performing inference with LCM-LoRAs. LCM-LoRAs are similar to other Stable Diffusion LoRAs, so they can be used with any [`DiffusionPipeline`] that supports LoRAs.

- Load the task-specific pipeline and model.
- Set the scheduler to [`LCMScheduler`].
- Load the LCM-LoRA weights for the model.
- Set the `guidance_scale` between `[1.0, 2.0]` and the `num_inference_steps` between `[4, 8]`.
- Perform inference with the pipeline with the usual parameters.

Let's look at how we can perform inference with LCM-LoRAs for different tasks.

First, make sure you have [peft](https://github.com/huggingface/peft) installed, for better LoRA support.

```bash
pip install -U peft
```
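
Putting the workflow above together, here is a minimal sketch using the SDXL base model and its LCM-LoRA from the collection linked earlier; the prompt is only a placeholder, and the task-specific sections below apply exactly the same pattern with different pipelines and inputs.

```python
import torch
from diffusers import DiffusionPipeline, LCMScheduler

# 1. Load the task-specific pipeline and model.
pipe = DiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0", variant="fp16", torch_dtype=torch.float16
).to("cuda")

# 2. Set the scheduler to LCMScheduler.
pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config)

# 3. Load the LCM-LoRA weights for the model.
pipe.load_lora_weights("latent-consistency/lcm-lora-sdxl")

# 4. Keep guidance_scale in [1.0, 2.0] and num_inference_steps in [4, 8].
image = pipe("a photo of a corgi wearing a party hat", num_inference_steps=4, guidance_scale=1.0).images[0]
```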
## Text-to-image

You'll use the [`StableDiffusionXLPipeline`] with the [`LCMScheduler`] and then load the LCM-LoRA. Together with the LCM-LoRA and the scheduler, the pipeline enables a fast inference workflow, overcoming the slow iterative nature of diffusion models.

```python
import torch
from diffusers import DiffusionPipeline, LCMScheduler

pipe = DiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",
    variant="fp16",
    torch_dtype=torch.float16
).to("cuda")

# set scheduler
pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config)

# load LCM-LoRA
pipe.load_lora_weights("latent-consistency/lcm-lora-sdxl")

prompt = "Self-portrait oil painting, a beautiful cyborg with golden hair, 8k"

generator = torch.manual_seed(42)
image = pipe(
    prompt=prompt, num_inference_steps=4, generator=generator, guidance_scale=1.0
).images[0]
```


Notice that we use only 4 steps for generation, which is far fewer than what's typically used for standard SDXL.

<Tip>

You may have noticed that we set `guidance_scale=1.0`, which disables classifier-free guidance. This is because the LCM-LoRA is trained with guidance, so the batch size does not have to be doubled in this case. This leads to a faster inference time, with the drawback that negative prompts don't have any effect on the denoising process.

You can also use guidance with LCM-LoRA, but due to the nature of training the model is very sensitive to the `guidance_scale` values; high values can lead to artifacts in the generated images. In our experiments, we found that the best values are in the range of [1.0, 2.0].

</Tip>

### Inference with a fine-tuned model

As mentioned above, the LCM-LoRA can be applied to any fine-tuned version of the model without having to distill it separately. Let's look at how we can perform inference with a fine-tuned model. In this example, we'll use the [animagine-xl](https://huggingface.co/Linaqruf/animagine-xl) model, which is a fine-tuned version of the SDXL model for generating anime.

```python
import torch
from diffusers import DiffusionPipeline, LCMScheduler

pipe = DiffusionPipeline.from_pretrained(
    "Linaqruf/animagine-xl",
    variant="fp16",
    torch_dtype=torch.float16
).to("cuda")

# set scheduler
pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config)

# load LCM-LoRA
pipe.load_lora_weights("latent-consistency/lcm-lora-sdxl")

prompt = "face focus, cute, masterpiece, best quality, 1girl, green hair, sweater, looking at viewer, upper body, beanie, outdoors, night, turtleneck"

generator = torch.manual_seed(0)
image = pipe(
    prompt=prompt, num_inference_steps=4, generator=generator, guidance_scale=1.0
).images[0]
```

## Image-to-image

LCM-LoRA can be applied to image-to-image tasks too. Let's look at how we can perform image-to-image generation with LCMs. For this example we'll use the [dreamshaper-7](https://huggingface.co/Lykon/dreamshaper-7) model and the LCM-LoRA for `stable-diffusion-v1-5`.

```python
import torch
from diffusers import AutoPipelineForImage2Image, LCMScheduler
from diffusers.utils import make_image_grid, load_image

pipe = AutoPipelineForImage2Image.from_pretrained(
    "Lykon/dreamshaper-7",
    torch_dtype=torch.float16,
    variant="fp16",
).to("cuda")

# set scheduler
pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config)

# load LCM-LoRA
pipe.load_lora_weights("latent-consistency/lcm-lora-sdv1-5")

# prepare image
url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/img2img-init.png"
init_image = load_image(url)
prompt = "Astronauts in a jungle, cold color palette, muted colors, detailed, 8k"

# pass prompt and image to pipeline
generator = torch.manual_seed(0)
image = pipe(
    prompt,
    image=init_image,
    num_inference_steps=4,
    guidance_scale=1,
    strength=0.6,
    generator=generator
).images[0]
make_image_grid([init_image, image], rows=1, cols=2)
```


<Tip>

You can get different results based on your prompt and the image you provide. To get the best results, we recommend trying different values for the `num_inference_steps`, `strength`, and `guidance_scale` parameters and choosing the best one.

</Tip>
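
One simple way to explore those parameters is a small sweep, reusing `pipe`, `prompt`, and `init_image` from the example above; the `strength` values here are only a starting point, not tuned recommendations.

```python
# Compare how much of the original image is preserved at different strengths,
# keeping the seed fixed so only `strength` changes between runs.
images = []
for strength in [0.4, 0.5, 0.6, 0.7]:
    generator = torch.manual_seed(0)
    images.append(
        pipe(
            prompt,
            image=init_image,
            num_inference_steps=4,
            guidance_scale=1,
            strength=strength,
            generator=generator,
        ).images[0]
    )
make_image_grid([init_image, *images], rows=1, cols=5)
```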
## Combine with styled LoRAs

LCM-LoRA can be combined with other LoRAs to generate styled images in very few steps (4-8). In the following example, we'll use the LCM-LoRA with the [papercut LoRA](TheLastBen/Papercut_SDXL). To learn more about how to combine LoRAs, refer to [this guide](https://huggingface.co/docs/diffusers/tutorials/using_peft_for_inference#combine-multiple-adapters).

```python
import torch
from diffusers import DiffusionPipeline, LCMScheduler

pipe = DiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",
    variant="fp16",
    torch_dtype=torch.float16
).to("cuda")

# set scheduler
pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config)

# load LoRAs
pipe.load_lora_weights("latent-consistency/lcm-lora-sdxl", adapter_name="lcm")
pipe.load_lora_weights("TheLastBen/Papercut_SDXL", weight_name="papercut.safetensors", adapter_name="papercut")

# Combine LoRAs
pipe.set_adapters(["lcm", "papercut"], adapter_weights=[1.0, 0.8])

prompt = "papercut, a cute fox"
generator = torch.manual_seed(0)
image = pipe(prompt, num_inference_steps=4, guidance_scale=1, generator=generator).images[0]
image
```

## ControlNet/T2I-Adapter

Let's look at how we can perform inference with ControlNet/T2I-Adapter and LCM-LoRA.

### ControlNet

For this example, we'll use the SD-v1-5 model and the LCM-LoRA for SD-v1-5 with canny ControlNet.

```python
import torch
import cv2
import numpy as np
from PIL import Image

from diffusers import StableDiffusionControlNetPipeline, ControlNetModel, LCMScheduler
from diffusers.utils import load_image, make_image_grid

image = load_image(
    "https://hf.co/datasets/huggingface/documentation-images/resolve/main/diffusers/input_image_vermeer.png"
).resize((512, 512))

image = np.array(image)

low_threshold = 100
high_threshold = 200

image = cv2.Canny(image, low_threshold, high_threshold)
image = image[:, :, None]
image = np.concatenate([image, image, image], axis=2)
canny_image = Image.fromarray(image)

controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny", torch_dtype=torch.float16)
pipe = StableDiffusionControlNetPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
    controlnet=controlnet,
    torch_dtype=torch.float16,
    safety_checker=None,
    variant="fp16"
).to("cuda")

# set scheduler
pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config)

# load LCM-LoRA
pipe.load_lora_weights("latent-consistency/lcm-lora-sdv1-5")

generator = torch.manual_seed(0)
image = pipe(
    "the mona lisa",
    image=canny_image,
    num_inference_steps=4,
    guidance_scale=1.5,
    controlnet_conditioning_scale=0.8,
    cross_attention_kwargs={"scale": 1},
    generator=generator,
).images[0]
make_image_grid([canny_image, image], rows=1, cols=2)
```


<Tip>

The inference parameters in this example might not work for all examples, so we recommend trying different values for the `num_inference_steps`, `guidance_scale`, `controlnet_conditioning_scale`, and `cross_attention_kwargs` parameters and choosing the best one.

</Tip>

### T2I-Adapter

This example shows how to use the LCM-LoRA with the [Canny T2I-Adapter](TencentARC/t2i-adapter-canny-sdxl-1.0) and SDXL.

```python
import torch
import cv2
import numpy as np
from PIL import Image

from diffusers import StableDiffusionXLAdapterPipeline, T2IAdapter, LCMScheduler
from diffusers.utils import load_image, make_image_grid

# Prepare image
# Detect the canny map in low resolution to avoid high-frequency details
image = load_image(
    "https://huggingface.co/Adapter/t2iadapter/resolve/main/figs_SDXLV1.0/org_canny.jpg"
).resize((384, 384))

image = np.array(image)

low_threshold = 100
high_threshold = 200

image = cv2.Canny(image, low_threshold, high_threshold)
image = image[:, :, None]
image = np.concatenate([image, image, image], axis=2)
canny_image = Image.fromarray(image).resize((1024, 1024))

# load adapter
adapter = T2IAdapter.from_pretrained("TencentARC/t2i-adapter-canny-sdxl-1.0", torch_dtype=torch.float16, variant="fp16").to("cuda")

pipe = StableDiffusionXLAdapterPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",
    adapter=adapter,
    torch_dtype=torch.float16,
    variant="fp16",
).to("cuda")

# set scheduler
pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config)

# load LCM-LoRA
pipe.load_lora_weights("latent-consistency/lcm-lora-sdxl")

prompt = "Mystical fairy in real, magic, 4k picture, high quality"
negative_prompt = "extra digit, fewer digits, cropped, worst quality, low quality, glitch, deformed, mutated, ugly, disfigured"

generator = torch.manual_seed(0)
image = pipe(
    prompt=prompt,
    negative_prompt=negative_prompt,
    image=canny_image,
    num_inference_steps=4,
    guidance_scale=1.5,
    adapter_conditioning_scale=0.8,
    adapter_conditioning_factor=1,
    generator=generator,
).images[0]
make_image_grid([canny_image, image], rows=1, cols=2)
```

## Inpainting

LCM-LoRA can be used for inpainting as well.

```python
import torch
from diffusers import AutoPipelineForInpainting, LCMScheduler
from diffusers.utils import load_image, make_image_grid

pipe = AutoPipelineForInpainting.from_pretrained(
    "runwayml/stable-diffusion-inpainting",
    torch_dtype=torch.float16,
    variant="fp16",
).to("cuda")

# set scheduler
pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config)

# load LCM-LoRA
pipe.load_lora_weights("latent-consistency/lcm-lora-sdv1-5")

# load base and mask image
init_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint.png")
mask_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint_mask.png")

prompt = "concept art digital painting of an elven castle, inspired by lord of the rings, highly detailed, 8k"
generator = torch.manual_seed(0)
image = pipe(
    prompt=prompt,
    image=init_image,
    mask_image=mask_image,
    generator=generator,
    num_inference_steps=4,
    guidance_scale=4,
).images[0]
make_image_grid([init_image, mask_image, image], rows=1, cols=3)
```

## AnimateDiff

[`AnimateDiff`] allows you to animate images using Stable Diffusion models. To get good results, we need to generate multiple frames (16-24), and doing this with standard SD models can be very slow. LCM-LoRA can be used to speed up the process significantly, as you just need to do 4-8 steps for each frame. Let's look at how we can perform animation with LCM-LoRA and AnimateDiff.

```python
import torch
from diffusers import MotionAdapter, AnimateDiffPipeline, DDIMScheduler, LCMScheduler
from diffusers.utils import export_to_gif

adapter = MotionAdapter.from_pretrained("diffusers/animatediff-motion-adapter-v1-5")
pipe = AnimateDiffPipeline.from_pretrained(
    "frankjoshua/toonyou_beta6",
    motion_adapter=adapter,
).to("cuda")

# set scheduler
pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config)

# load LCM-LoRA
pipe.load_lora_weights("latent-consistency/lcm-lora-sdv1-5", adapter_name="lcm")
pipe.load_lora_weights("guoyww/animatediff-motion-lora-zoom-in", weight_name="diffusion_pytorch_model.safetensors", adapter_name="motion-lora")

pipe.set_adapters(["lcm", "motion-lora"], adapter_weights=[0.55, 1.2])

prompt = "best quality, masterpiece, 1girl, looking at viewer, blurry background, upper body, contemporary, dress"
generator = torch.manual_seed(0)
frames = pipe(
    prompt=prompt,
    num_inference_steps=5,
    guidance_scale=1.25,
    cross_attention_kwargs={"scale": 1},
    num_frames=24,
    generator=generator
).frames[0]
export_to_gif(frames, "animation.gif")
```

@@ -1304,7 +1304,11 @@ class DemoFusionSDXLPipeline(
             if isinstance(component, torch.nn.Module):
                 if hasattr(component, "_hf_hook"):
                     is_model_cpu_offload = isinstance(getattr(component, "_hf_hook"), CpuOffload)
-                    is_sequential_cpu_offload = isinstance(getattr(component, "_hf_hook"), AlignDevicesHook)
+                    is_sequential_cpu_offload = (
+                        isinstance(getattr(component, "_hf_hook"), AlignDevicesHook)
+                        or hasattr(component._hf_hook, "hooks")
+                        and isinstance(component._hf_hook.hooks[0], AlignDevicesHook)
+                    )
             logger.info(
                 "Accelerate hooks detected. Since you have called `load_lora_weights()`, the previous hooks will be first removed. Then the LoRA parameters will be loaded and the hooks will be applied again."
             )
@@ -32,7 +32,7 @@ import torch.utils.checkpoint
 import transformers
 from accelerate import Accelerator
 from accelerate.logging import get_logger
-from accelerate.utils import ProjectConfiguration, set_seed
+from accelerate.utils import DistributedType, ProjectConfiguration, set_seed
 from datasets import load_dataset
 from huggingface_hub import create_repo, upload_folder
 from packaging import version
@@ -53,7 +53,7 @@ from diffusers import (
 from diffusers.optimization import get_scheduler
 from diffusers.utils import check_min_version, is_wandb_available, make_image_grid
 from diffusers.utils.hub_utils import load_or_create_model_card, populate_model_card
-from diffusers.utils.import_utils import is_xformers_available
+from diffusers.utils.import_utils import is_torch_npu_available, is_xformers_available
 from diffusers.utils.torch_utils import is_compiled_module


@@ -64,6 +64,8 @@ if is_wandb_available():
 check_min_version("0.28.0.dev0")

 logger = get_logger(__name__)
+if is_torch_npu_available():
+    torch.npu.config.allow_internal_format = False


 def log_validation(vae, unet, controlnet, args, accelerator, weight_dtype, step, is_final_validation=False):
@@ -471,6 +473,9 @@ def parse_args(input_args=None):
     parser.add_argument(
         "--enable_xformers_memory_efficient_attention", action="store_true", help="Whether or not to use xformers."
     )
+    parser.add_argument(
+        "--enable_npu_flash_attention", action="store_true", help="Whether or not to use npu flash attention."
+    )
     parser.add_argument(
         "--set_grads_to_none",
         action="store_true",
@@ -936,6 +941,13 @@ def main(args):
     text_encoder_two.requires_grad_(False)
     controlnet.train()

+    if args.enable_npu_flash_attention:
+        if is_torch_npu_available():
+            logger.info("npu flash attention enabled.")
+            unet.enable_npu_flash_attention()
+        else:
+            raise ValueError("npu flash attention requires torch_npu extensions and is supported only on npu devices.")
+
     if args.enable_xformers_memory_efficient_attention:
         if is_xformers_available():
             import xformers
@@ -1235,7 +1247,8 @@ def main(args):
                 progress_bar.update(1)
                 global_step += 1

-                if accelerator.is_main_process:
+                # DeepSpeed requires saving weights on every device; saving weights only on the main process would cause issues.
+                if accelerator.distributed_type == DistributedType.DEEPSPEED or accelerator.is_main_process:
                     if global_step % args.checkpointing_steps == 0:
                         # _before_ saving state, check if this save would set us over the `checkpoints_total_limit`
                         if args.checkpoints_total_limit is not None:
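
The checkpointing change in the hunk above replaces the plain `is_main_process` guard because DeepSpeed shards model and optimizer state across ranks, so every process has to take part in saving. Stripped down to its essentials, the pattern looks roughly like the following sketch; the `save_checkpoint` helper and its arguments are hypothetical, not code from the training script itself.

```python
from accelerate import Accelerator
from accelerate.utils import DistributedType

accelerator = Accelerator()


def save_checkpoint(global_step, checkpointing_steps, output_dir):
    # Under DeepSpeed every rank must call save_state(); on other backends
    # only the main process writes the checkpoint.
    if accelerator.distributed_type == DistributedType.DEEPSPEED or accelerator.is_main_process:
        if global_step % checkpointing_steps == 0:
            accelerator.save_state(f"{output_dir}/checkpoint-{global_step}")
```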
@@ -32,7 +32,7 @@ import torch.utils.checkpoint
 import transformers
 from accelerate import Accelerator
 from accelerate.logging import get_logger
-from accelerate.utils import DistributedDataParallelKwargs, ProjectConfiguration, set_seed
+from accelerate.utils import DistributedDataParallelKwargs, DistributedType, ProjectConfiguration, set_seed
 from datasets import load_dataset
 from huggingface_hub import create_repo, upload_folder
 from packaging import version
@@ -60,7 +60,7 @@ from diffusers.utils import (
     is_wandb_available,
 )
 from diffusers.utils.hub_utils import load_or_create_model_card, populate_model_card
-from diffusers.utils.import_utils import is_xformers_available
+from diffusers.utils.import_utils import is_torch_npu_available, is_xformers_available
 from diffusers.utils.torch_utils import is_compiled_module


@@ -68,6 +68,8 @@ from diffusers.utils.torch_utils import is_compiled_module
 check_min_version("0.28.0.dev0")

 logger = get_logger(__name__)
+if is_torch_npu_available():
+    torch.npu.config.allow_internal_format = False


 def save_model_card(
@@ -419,6 +421,9 @@ def parse_args(input_args=None):
     parser.add_argument(
         "--enable_xformers_memory_efficient_attention", action="store_true", help="Whether or not to use xformers."
     )
+    parser.add_argument(
+        "--enable_npu_flash_attention", action="store_true", help="Whether or not to use npu flash attention."
+    )
     parser.add_argument("--noise_offset", type=float, default=0, help="The scale of noise offset.")
     parser.add_argument(
         "--rank",
@@ -623,6 +628,13 @@ def main(args):
     text_encoder_one.to(accelerator.device, dtype=weight_dtype)
     text_encoder_two.to(accelerator.device, dtype=weight_dtype)

+    if args.enable_npu_flash_attention:
+        if is_torch_npu_available():
+            logger.info("npu flash attention enabled.")
+            unet.enable_npu_flash_attention()
+        else:
+            raise ValueError("npu flash attention requires torch_npu extensions and is supported only on npu devices.")
+
     if args.enable_xformers_memory_efficient_attention:
         if is_xformers_available():
             import xformers
@@ -1149,7 +1161,8 @@ def main(args):
                     accelerator.log({"train_loss": train_loss}, step=global_step)
                     train_loss = 0.0

-                if accelerator.is_main_process:
+                # DeepSpeed requires saving weights on every device; saving weights only on the main process would cause issues.
+                if accelerator.distributed_type == DistributedType.DEEPSPEED or accelerator.is_main_process:
                     if global_step % args.checkpointing_steps == 0:
                         # _before_ saving state, check if this save would set us over the `checkpoints_total_limit`
                         if args.checkpoints_total_limit is not None:
@@ -310,9 +310,9 @@ class ConfigMixin:
             force_download (`bool`, *optional*, defaults to `False`):
                 Whether or not to force the (re-)download of the model weights and configuration files, overriding the
                 cached versions if they exist.
-            resume_download (`bool`, *optional*, defaults to `False`):
-                Whether or not to resume downloading the model weights and configuration files. If set to `False`, any
-                incompletely downloaded files are deleted.
+            resume_download:
+                Deprecated and ignored. All downloads are now resumed by default when possible. Will be removed in v1
+                of Diffusers.
             proxies (`Dict[str, str]`, *optional*):
                 A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128',
                 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
@@ -341,7 +341,7 @@ class ConfigMixin:
         """
         cache_dir = kwargs.pop("cache_dir", None)
         force_download = kwargs.pop("force_download", False)
-        resume_download = kwargs.pop("resume_download", False)
+        resume_download = kwargs.pop("resume_download", None)
         proxies = kwargs.pop("proxies", None)
         token = kwargs.pop("token", None)
         local_files_only = kwargs.pop("local_files_only", False)
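
For callers, the practical effect of this deprecation is that `resume_download` can simply be dropped from `from_pretrained`-style calls; passing it still works but is ignored. A sketch of the before/after call sites follows — the scheduler class and checkpoint here are only illustrative examples, not part of the change itself.

```python
from diffusers import DDIMScheduler

# Before: explicitly requesting resumable downloads.
scheduler = DDIMScheduler.from_pretrained(
    "runwayml/stable-diffusion-v1-5", subfolder="scheduler", resume_download=True
)

# After: the argument is deprecated and ignored; interrupted downloads
# are resumed automatically whenever possible.
scheduler = DDIMScheduler.from_pretrained("runwayml/stable-diffusion-v1-5", subfolder="scheduler")
```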
@@ -50,9 +50,9 @@ class FromOriginalVAEMixin:
             cache_dir (`Union[str, os.PathLike]`, *optional*):
                 Path to a directory where a downloaded pretrained model configuration is cached if the standard cache
                 is not used.
-            resume_download (`bool`, *optional*, defaults to `False`):
-                Whether or not to resume downloading the model weights and configuration files. If set to `False`, any
-                incompletely downloaded files are deleted.
+            resume_download:
+                Deprecated and ignored. All downloads are now resumed by default when possible. Will be removed in v1
+                of Diffusers.
             proxies (`Dict[str, str]`, *optional*):
                 A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128',
                 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
@@ -99,7 +99,7 @@ class FromOriginalVAEMixin:

         original_config_file = kwargs.pop("original_config_file", None)
         config_file = kwargs.pop("config_file", None)
-        resume_download = kwargs.pop("resume_download", False)
+        resume_download = kwargs.pop("resume_download", None)
         force_download = kwargs.pop("force_download", False)
         proxies = kwargs.pop("proxies", None)
         token = kwargs.pop("token", None)

@@ -50,9 +50,9 @@ class FromOriginalControlNetMixin:
             cache_dir (`Union[str, os.PathLike]`, *optional*):
                 Path to a directory where a downloaded pretrained model configuration is cached if the standard cache
                 is not used.
-            resume_download (`bool`, *optional*, defaults to `False`):
-                Whether or not to resume downloading the model weights and configuration files. If set to `False`, any
-                incompletely downloaded files are deleted.
+            resume_download:
+                Deprecated and ignored. All downloads are now resumed by default when possible. Will be removed in v1
+                of Diffusers.
             proxies (`Dict[str, str]`, *optional*):
                 A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128',
                 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
@@ -89,7 +89,7 @@ class FromOriginalControlNetMixin:
         """
         original_config_file = kwargs.pop("original_config_file", None)
         config_file = kwargs.pop("config_file", None)
-        resume_download = kwargs.pop("resume_download", False)
+        resume_download = kwargs.pop("resume_download", None)
         force_download = kwargs.pop("force_download", False)
         proxies = kwargs.pop("proxies", None)
         token = kwargs.pop("token", None)

@@ -90,9 +90,9 @@ class IPAdapterMixin:
             force_download (`bool`, *optional*, defaults to `False`):
                 Whether or not to force the (re-)download of the model weights and configuration files, overriding the
                 cached versions if they exist.
-            resume_download (`bool`, *optional*, defaults to `False`):
-                Whether or not to resume downloading the model weights and configuration files. If set to `False`, any
-                incompletely downloaded files are deleted.
+            resume_download:
+                Deprecated and ignored. All downloads are now resumed by default when possible. Will be removed in v1
+                of Diffusers.
             proxies (`Dict[str, str]`, *optional*):
                 A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128',
                 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
@@ -135,7 +135,7 @@ class IPAdapterMixin:
         # Load the main state dict first.
         cache_dir = kwargs.pop("cache_dir", None)
         force_download = kwargs.pop("force_download", False)
-        resume_download = kwargs.pop("resume_download", False)
+        resume_download = kwargs.pop("resume_download", None)
         proxies = kwargs.pop("proxies", None)
         local_files_only = kwargs.pop("local_files_only", None)
         token = kwargs.pop("token", None)
@@ -176,9 +176,9 @@ class LoraLoaderMixin:
|
|||||||
force_download (`bool`, *optional*, defaults to `False`):
|
force_download (`bool`, *optional*, defaults to `False`):
|
||||||
Whether or not to force the (re-)download of the model weights and configuration files, overriding the
|
Whether or not to force the (re-)download of the model weights and configuration files, overriding the
|
||||||
cached versions if they exist.
|
cached versions if they exist.
|
||||||
resume_download (`bool`, *optional*, defaults to `False`):
|
resume_download:
|
||||||
Whether or not to resume downloading the model weights and configuration files. If set to `False`, any
|
Deprecated and ignored. All downloads are now resumed by default when possible. Will be removed in v1
|
||||||
incompletely downloaded files are deleted.
|
of Diffusers.
|
||||||
proxies (`Dict[str, str]`, *optional*):
|
proxies (`Dict[str, str]`, *optional*):
|
||||||
A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128',
|
A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128',
|
||||||
'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
|
'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
|
||||||
@@ -208,7 +208,7 @@ class LoraLoaderMixin:
         # UNet and text encoder or both.
         cache_dir = kwargs.pop("cache_dir", None)
         force_download = kwargs.pop("force_download", False)
-        resume_download = kwargs.pop("resume_download", False)
+        resume_download = kwargs.pop("resume_download", None)
         proxies = kwargs.pop("proxies", None)
         local_files_only = kwargs.pop("local_files_only", None)
         token = kwargs.pop("token", None)
@@ -369,7 +369,11 @@ class LoraLoaderMixin:
                 if not is_model_cpu_offload:
                     is_model_cpu_offload = isinstance(component._hf_hook, CpuOffload)
                 if not is_sequential_cpu_offload:
-                    is_sequential_cpu_offload = isinstance(component._hf_hook, AlignDevicesHook)
+                    is_sequential_cpu_offload = (
+                        isinstance(component._hf_hook, AlignDevicesHook)
+                        or hasattr(component._hf_hook, "hooks")
+                        and isinstance(component._hf_hook.hooks[0], AlignDevicesHook)
+                    )
 
             logger.info(
                 "Accelerate hooks detected. Since you have called `load_lora_weights()`, the previous hooks will be first removed. Then the LoRA parameters will be loaded and the hooks will be applied again."
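The widened check treats a component as sequentially offloaded not only when its accelerate hook is an `AlignDevicesHook` directly, but also when it is a composite hook whose first sub-hook is one. A standalone sketch of the same test (the helper name is hypothetical, not part of the library):

```python
from accelerate.hooks import AlignDevicesHook


def _is_sequentially_offloaded(module) -> bool:
    # Same condition as the diff above: either the hook itself is an AlignDevicesHook,
    # or it is a composite hook exposing a `hooks` list whose first entry is one.
    hook = getattr(module, "_hf_hook", None)
    if hook is None:
        return False
    return isinstance(hook, AlignDevicesHook) or (
        hasattr(hook, "hooks") and isinstance(hook.hooks[0], AlignDevicesHook)
    )
```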
@@ -177,9 +177,9 @@ class FromSingleFileMixin:
     cache_dir (`Union[str, os.PathLike]`, *optional*):
         Path to a directory where a downloaded pretrained model configuration is cached if the standard cache
         is not used.
-    resume_download (`bool`, *optional*, defaults to `False`):
-        Whether or not to resume downloading the model weights and configuration files. If set to `False`, any
-        incompletely downloaded files are deleted.
+    resume_download:
+        Deprecated and ignored. All downloads are now resumed by default when possible. Will be removed in v1
+        of Diffusers.
     proxies (`Dict[str, str]`, *optional*):
         A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128',
         'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
@@ -244,7 +244,7 @@ class FromSingleFileMixin:
         ```
         """
         original_config_file = kwargs.pop("original_config_file", None)
-        resume_download = kwargs.pop("resume_download", False)
+        resume_download = kwargs.pop("resume_download", None)
         force_download = kwargs.pop("force_download", False)
         proxies = kwargs.pop("proxies", None)
         token = kwargs.pop("token", None)
@@ -305,7 +305,7 @@ def fetch_ldm_config_and_checkpoint(
     pretrained_model_link_or_path,
     class_name,
     original_config_file=None,
-    resume_download=False,
+    resume_download=None,
     force_download=False,
     proxies=None,
     token=None,
@@ -38,7 +38,7 @@ TEXT_INVERSION_NAME_SAFE = "learned_embeds.safetensors"
 def load_textual_inversion_state_dicts(pretrained_model_name_or_paths, **kwargs):
     cache_dir = kwargs.pop("cache_dir", None)
     force_download = kwargs.pop("force_download", False)
-    resume_download = kwargs.pop("resume_download", False)
+    resume_download = kwargs.pop("resume_download", None)
     proxies = kwargs.pop("proxies", None)
     local_files_only = kwargs.pop("local_files_only", None)
     token = kwargs.pop("token", None)
@@ -308,9 +308,9 @@ class TextualInversionLoaderMixin:
     force_download (`bool`, *optional*, defaults to `False`):
         Whether or not to force the (re-)download of the model weights and configuration files, overriding the
         cached versions if they exist.
-    resume_download (`bool`, *optional*, defaults to `False`):
-        Whether or not to resume downloading the model weights and configuration files. If set to `False`, any
-        incompletely downloaded files are deleted.
+    resume_download:
+        Deprecated and ignored. All downloads are now resumed by default when possible. Will be removed in v1
+        of Diffusers.
     proxies (`Dict[str, str]`, *optional*):
         A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128',
         'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
@@ -423,7 +423,11 @@ class TextualInversionLoaderMixin:
             if isinstance(component, nn.Module):
                 if hasattr(component, "_hf_hook"):
                     is_model_cpu_offload = isinstance(getattr(component, "_hf_hook"), CpuOffload)
-                    is_sequential_cpu_offload = isinstance(getattr(component, "_hf_hook"), AlignDevicesHook)
+                    is_sequential_cpu_offload = (
+                        isinstance(getattr(component, "_hf_hook"), AlignDevicesHook)
+                        or hasattr(component._hf_hook, "hooks")
+                        and isinstance(component._hf_hook.hooks[0], AlignDevicesHook)
+                    )
                     logger.info(
                         "Accelerate hooks detected. Since you have called `load_textual_inversion()`, the previous hooks will be first removed. Then the textual inversion parameters will be loaded and the hooks will be applied again."
                     )
@@ -103,9 +103,9 @@ class UNet2DConditionLoadersMixin:
     force_download (`bool`, *optional*, defaults to `False`):
         Whether or not to force the (re-)download of the model weights and configuration files, overriding the
         cached versions if they exist.
-    resume_download (`bool`, *optional*, defaults to `False`):
-        Whether or not to resume downloading the model weights and configuration files. If set to `False`, any
-        incompletely downloaded files are deleted.
+    resume_download:
+        Deprecated and ignored. All downloads are now resumed by default when possible. Will be removed in v1
+        of Diffusers.
     proxies (`Dict[str, str]`, *optional*):
         A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128',
         'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
@@ -149,7 +149,7 @@ class UNet2DConditionLoadersMixin:
 
         cache_dir = kwargs.pop("cache_dir", None)
         force_download = kwargs.pop("force_download", False)
-        resume_download = kwargs.pop("resume_download", False)
+        resume_download = kwargs.pop("resume_download", None)
         proxies = kwargs.pop("proxies", None)
         local_files_only = kwargs.pop("local_files_only", None)
         token = kwargs.pop("token", None)
@@ -359,7 +359,11 @@ class UNet2DConditionLoadersMixin:
             for _, component in _pipeline.components.items():
                 if isinstance(component, nn.Module) and hasattr(component, "_hf_hook"):
                     is_model_cpu_offload = isinstance(getattr(component, "_hf_hook"), CpuOffload)
-                    is_sequential_cpu_offload = isinstance(getattr(component, "_hf_hook"), AlignDevicesHook)
+                    is_sequential_cpu_offload = (
+                        isinstance(getattr(component, "_hf_hook"), AlignDevicesHook)
+                        or hasattr(component._hf_hook, "hooks")
+                        and isinstance(component._hf_hook.hooks[0], AlignDevicesHook)
+                    )
 
                     logger.info(
                         "Accelerate hooks detected. Since you have called `load_lora_weights()`, the previous hooks will be first removed. Then the LoRA parameters will be loaded and the hooks will be applied again."
@@ -1086,9 +1090,9 @@ class FromOriginalUNetMixin:
     cache_dir (`Union[str, os.PathLike]`, *optional*):
         Path to a directory where a downloaded pretrained model configuration is cached if the standard cache
         is not used.
-    resume_download (`bool`, *optional*, defaults to `False`):
-        Whether or not to resume downloading the model weights and configuration files. If set to `False`, any
-        incompletely downloaded files are deleted.
+    resume_download:
+        Deprecated and ignored. All downloads are now resumed by default when possible. Will be removed in v1
+        of Diffusers.
     proxies (`Dict[str, str]`, *optional*):
         A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128',
         'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
@@ -1110,7 +1114,7 @@ class FromOriginalUNetMixin:
             raise ValueError("FromOriginalUNetMixin is currently only compatible with StableCascadeUNet")
 
         config = kwargs.pop("config", None)
-        resume_download = kwargs.pop("resume_download", False)
+        resume_download = kwargs.pop("resume_download", None)
         force_download = kwargs.pop("force_download", False)
         proxies = kwargs.pop("proxies", None)
         token = kwargs.pop("token", None)
@@ -18,8 +18,12 @@ import torch.nn.functional as F
 from torch import nn
 
 from ..utils import deprecate
+from ..utils.import_utils import is_torch_npu_available
 
 
+if is_torch_npu_available():
+    import torch_npu
+
 ACTIVATION_FUNCTIONS = {
     "swish": nn.SiLU(),
     "silu": nn.SiLU(),
@@ -98,9 +102,13 @@ class GEGLU(nn.Module):
         if len(args) > 0 or kwargs.get("scale", None) is not None:
             deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`."
             deprecate("scale", "1.0.0", deprecation_message)
-        hidden_states, gate = self.proj(hidden_states).chunk(2, dim=-1)
-        return hidden_states * self.gelu(gate)
+        hidden_states = self.proj(hidden_states)
+        if is_torch_npu_available():
+            # using torch_npu.npu_geglu can run faster and save memory on NPU.
+            return torch_npu.npu_geglu(hidden_states, dim=-1, approximate=1)[0]
+        else:
+            hidden_states, gate = hidden_states.chunk(2, dim=-1)
+            return hidden_states * self.gelu(gate)
 
 
 class ApproximateGELU(nn.Module):
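The GEGLU change projects first and only then dispatches: on Ascend NPUs the fused `torch_npu.npu_geglu` kernel performs the chunk-and-gate step, otherwise the module falls back to the original chunk plus GELU product. A minimal sketch of the same fallback pattern, assuming `torch_npu` and the `npu_geglu` call exactly as used in the diff above:

```python
import torch
import torch.nn as nn

try:
    import torch_npu  # only present on Ascend NPU installs
    _HAS_NPU = True
except ImportError:
    _HAS_NPU = False


class TinyGEGLU(nn.Module):
    """Minimal GEGLU: project to twice the output width, then gate half of it with GELU."""

    def __init__(self, dim_in: int, dim_out: int):
        super().__init__()
        self.proj = nn.Linear(dim_in, dim_out * 2)
        self.gelu = nn.GELU()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.proj(x)
        if _HAS_NPU:
            # fused kernel; signature taken from the diff above
            return torch_npu.npu_geglu(x, dim=-1, approximate=1)[0]
        hidden, gate = x.chunk(2, dim=-1)
        return hidden * self.gelu(gate)
```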
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import inspect
+import math
 from importlib import import_module
 from typing import Callable, List, Optional, Union
 
@@ -21,13 +22,15 @@ from torch import nn
 
 from ..image_processor import IPAdapterMaskProcessor
 from ..utils import deprecate, logging
-from ..utils.import_utils import is_xformers_available
+from ..utils.import_utils import is_torch_npu_available, is_xformers_available
 from ..utils.torch_utils import maybe_allow_in_graph
 from .lora import LoRALinearLayer
 
 
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
 
+if is_torch_npu_available():
+    import torch_npu
 
 if is_xformers_available():
     import xformers
@@ -209,6 +212,23 @@ class Attention(nn.Module):
         )
         self.set_processor(processor)
 
+    def set_use_npu_flash_attention(self, use_npu_flash_attention: bool) -> None:
+        r"""
+        Set whether to use npu flash attention from `torch_npu` or not.
+
+        """
+        if use_npu_flash_attention:
+            processor = AttnProcessorNPU()
+        else:
+            # set attention processor
+            # We use the AttnProcessor2_0 by default when torch 2.x is used which uses
+            # torch.nn.functional.scaled_dot_product_attention for native Flash/memory_efficient_attention
+            # but only if it has the default `scale` argument. TODO remove scale_qk check when we move to torch 2.1
+            processor = (
+                AttnProcessor2_0() if hasattr(F, "scaled_dot_product_attention") and self.scale_qk else AttnProcessor()
+            )
+        self.set_processor(processor)
+
     def set_use_memory_efficient_attention_xformers(
         self, use_memory_efficient_attention_xformers: bool, attention_op: Optional[Callable] = None
     ) -> None:
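The new toggle mirrors `set_use_memory_efficient_attention_xformers`: it only swaps the processor object on the block. A hedged usage sketch (the constructor arguments are illustrative, and `AttnProcessorNPU()` raises `ImportError` without a `torch_npu` install):

```python
from diffusers.models.attention_processor import Attention, AttnProcessorNPU

# Illustrative constructor arguments; the toggle itself only replaces the processor.
attn = Attention(query_dim=64, heads=8, dim_head=8)

attn.set_use_npu_flash_attention(True)   # installs AttnProcessorNPU (requires torch_npu)
assert isinstance(attn.processor, AttnProcessorNPU)

attn.set_use_npu_flash_attention(False)  # restores the default SDPA/legacy processor
```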
@@ -1207,6 +1227,116 @@ class XFormersAttnProcessor:
         return hidden_states
 
 
+class AttnProcessorNPU:
+
+    r"""
+    Processor for implementing flash attention using torch_npu. Torch_npu supports only fp16 and bf16 data types. If
+    fp32 is used, F.scaled_dot_product_attention will be used for computation, but the acceleration effect on NPU is
+    not significant.
+
+    """
+
+    def __init__(self):
+        if not is_torch_npu_available():
+            raise ImportError("AttnProcessorNPU requires torch_npu extensions and is supported only on npu devices.")
+
+    def __call__(
+        self,
+        attn: Attention,
+        hidden_states: torch.FloatTensor,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        temb: Optional[torch.FloatTensor] = None,
+        *args,
+        **kwargs,
+    ) -> torch.FloatTensor:
+        if len(args) > 0 or kwargs.get("scale", None) is not None:
+            deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`."
+            deprecate("scale", "1.0.0", deprecation_message)
+
+        residual = hidden_states
+        if attn.spatial_norm is not None:
+            hidden_states = attn.spatial_norm(hidden_states, temb)
+
+        input_ndim = hidden_states.ndim
+
+        if input_ndim == 4:
+            batch_size, channel, height, width = hidden_states.shape
+            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
+
+        batch_size, sequence_length, _ = (
+            hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
+        )
+
+        if attention_mask is not None:
+            attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
+            # scaled_dot_product_attention expects attention_mask shape to be
+            # (batch, heads, source_length, target_length)
+            attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])
+
+        if attn.group_norm is not None:
+            hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
+
+        query = attn.to_q(hidden_states)
+
+        if encoder_hidden_states is None:
+            encoder_hidden_states = hidden_states
+        elif attn.norm_cross:
+            encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
+
+        key = attn.to_k(encoder_hidden_states)
+        value = attn.to_v(encoder_hidden_states)
+
+        inner_dim = key.shape[-1]
+        head_dim = inner_dim // attn.heads
+
+        query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+
+        key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+
+        # the output of sdp = (batch, num_heads, seq_len, head_dim)
+        if query.dtype in (torch.float16, torch.bfloat16):
+            hidden_states = torch_npu.npu_fusion_attention(
+                query,
+                key,
+                value,
+                attn.heads,
+                input_layout="BNSD",
+                pse=None,
+                atten_mask=attention_mask,
+                scale=1.0 / math.sqrt(query.shape[-1]),
+                pre_tockens=65536,
+                next_tockens=65536,
+                keep_prob=1.0,
+                sync=False,
+                inner_precise=0,
+            )[0]
+        else:
+            # TODO: add support for attn.scale when we move to Torch 2.1
+            hidden_states = F.scaled_dot_product_attention(
+                query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
+            )
+
+        hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
+        hidden_states = hidden_states.to(query.dtype)
+
+        # linear proj
+        hidden_states = attn.to_out[0](hidden_states)
+        # dropout
+        hidden_states = attn.to_out[1](hidden_states)
+
+        if input_ndim == 4:
+            hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
+
+        if attn.residual_connection:
+            hidden_states = hidden_states + residual
+
+        hidden_states = hidden_states / attn.rescale_output_factor
+
+        return hidden_states
+
+
 class AttnProcessor2_0:
     r"""
     Processor for implementing scaled dot-product attention (enabled by default if you're using PyTorch 2.0).
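Note the dtype gate inside the processor: only fp16/bf16 activations take the fused `npu_fusion_attention` path, while fp32 silently falls back to `F.scaled_dot_product_attention`. A hedged sketch of wiring it into a UNet (the checkpoint id is a placeholder, and an Ascend `torch_npu` install is assumed):

```python
import torch
from diffusers import StableDiffusionPipeline
from diffusers.models.attention_processor import AttnProcessorNPU

# Placeholder checkpoint id; loading in fp16 matters, otherwise the processor
# falls back to F.scaled_dot_product_attention and the NPU kernel is never used.
pipe = StableDiffusionPipeline.from_pretrained("some/checkpoint", torch_dtype=torch.float16)
pipe.to("npu")  # assumes the torch_npu device backend is installed

pipe.unet.set_attn_processor(AttnProcessorNPU())
```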
@@ -245,9 +245,9 @@ class FlaxModelMixin(PushToHubMixin):
     force_download (`bool`, *optional*, defaults to `False`):
         Whether or not to force the (re-)download of the model weights and configuration files, overriding the
         cached versions if they exist.
-    resume_download (`bool`, *optional*, defaults to `False`):
-        Whether or not to resume downloading the model weights and configuration files. If set to `False`, any
-        incompletely downloaded files are deleted.
+    resume_download:
+        Deprecated and ignored. All downloads are now resumed by default when possible. Will be removed in v1
+        of Diffusers.
     proxies (`Dict[str, str]`, *optional*):
         A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128',
         'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
@@ -296,7 +296,7 @@ class FlaxModelMixin(PushToHubMixin):
         cache_dir = kwargs.pop("cache_dir", None)
         force_download = kwargs.pop("force_download", False)
         from_pt = kwargs.pop("from_pt", False)
-        resume_download = kwargs.pop("resume_download", False)
+        resume_download = kwargs.pop("resume_download", None)
         proxies = kwargs.pop("proxies", None)
         local_files_only = kwargs.pop("local_files_only", False)
         token = kwargs.pop("token", None)
@@ -272,6 +272,36 @@ class ModelMixin(torch.nn.Module, PushToHubMixin):
         if self._supports_gradient_checkpointing:
             self.apply(partial(self._set_gradient_checkpointing, value=False))
 
+    def set_use_npu_flash_attention(self, valid: bool) -> None:
+        r"""
+        Set the switch for the npu flash attention.
+        """
+
+        def fn_recursive_set_npu_flash_attention(module: torch.nn.Module):
+            if hasattr(module, "set_use_npu_flash_attention"):
+                module.set_use_npu_flash_attention(valid)
+
+            for child in module.children():
+                fn_recursive_set_npu_flash_attention(child)
+
+        for module in self.children():
+            if isinstance(module, torch.nn.Module):
+                fn_recursive_set_npu_flash_attention(module)
+
+    def enable_npu_flash_attention(self) -> None:
+        r"""
+        Enable npu flash attention from torch_npu
+
+        """
+        self.set_use_npu_flash_attention(True)
+
+    def disable_npu_flash_attention(self) -> None:
+        r"""
+        disable npu flash attention from torch_npu
+
+        """
+        self.set_use_npu_flash_attention(False)
+
     def set_use_memory_efficient_attention_xformers(
         self, valid: bool, attention_op: Optional[Callable] = None
     ) -> None:
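Because these helpers live on `ModelMixin`, every model class inherits them and the toggle walks all children, flipping each `Attention` block it finds. A hedged usage sketch (the model id is a placeholder):

```python
from diffusers import UNet2DConditionModel

# Placeholder id; any ModelMixin subclass exposes the same pair of methods.
unet = UNet2DConditionModel.from_pretrained("some/checkpoint", subfolder="unet")

unet.enable_npu_flash_attention()   # recursively calls set_use_npu_flash_attention(True)
# ... run inference on the NPU ...
unet.disable_npu_flash_attention()  # restores the default attention processors
```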
@@ -446,9 +476,9 @@ class ModelMixin(torch.nn.Module, PushToHubMixin):
     force_download (`bool`, *optional*, defaults to `False`):
         Whether or not to force the (re-)download of the model weights and configuration files, overriding the
         cached versions if they exist.
-    resume_download (`bool`, *optional*, defaults to `False`):
-        Whether or not to resume downloading the model weights and configuration files. If set to `False`, any
-        incompletely downloaded files are deleted.
+    resume_download:
+        Deprecated and ignored. All downloads are now resumed by default when possible. Will be removed in v1
+        of Diffusers.
     proxies (`Dict[str, str]`, *optional*):
         A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128',
         'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
@@ -530,7 +560,7 @@ class ModelMixin(torch.nn.Module, PushToHubMixin):
         ignore_mismatched_sizes = kwargs.pop("ignore_mismatched_sizes", False)
         force_download = kwargs.pop("force_download", False)
         from_flax = kwargs.pop("from_flax", False)
-        resume_download = kwargs.pop("resume_download", False)
+        resume_download = kwargs.pop("resume_download", None)
         proxies = kwargs.pop("proxies", None)
         output_loading_info = kwargs.pop("output_loading_info", False)
         local_files_only = kwargs.pop("local_files_only", None)
@@ -234,9 +234,9 @@ class AutoPipelineForText2Image(ConfigMixin):
     cache_dir (`Union[str, os.PathLike]`, *optional*):
         Path to a directory where a downloaded pretrained model configuration is cached if the standard cache
         is not used.
-    resume_download (`bool`, *optional*, defaults to `False`):
-        Whether or not to resume downloading the model weights and configuration files. If set to `False`, any
-        incompletely downloaded files are deleted.
+    resume_download:
+        Deprecated and ignored. All downloads are now resumed by default when possible. Will be removed in v1
+        of Diffusers.
     proxies (`Dict[str, str]`, *optional*):
         A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128',
         'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
@@ -311,7 +311,7 @@ class AutoPipelineForText2Image(ConfigMixin):
         """
         cache_dir = kwargs.pop("cache_dir", None)
         force_download = kwargs.pop("force_download", False)
-        resume_download = kwargs.pop("resume_download", False)
+        resume_download = kwargs.pop("resume_download", None)
         proxies = kwargs.pop("proxies", None)
         token = kwargs.pop("token", None)
         local_files_only = kwargs.pop("local_files_only", False)
@@ -507,9 +507,9 @@ class AutoPipelineForImage2Image(ConfigMixin):
     cache_dir (`Union[str, os.PathLike]`, *optional*):
         Path to a directory where a downloaded pretrained model configuration is cached if the standard cache
         is not used.
-    resume_download (`bool`, *optional*, defaults to `False`):
-        Whether or not to resume downloading the model weights and configuration files. If set to `False`, any
-        incompletely downloaded files are deleted.
+    resume_download:
+        Deprecated and ignored. All downloads are now resumed by default when possible. Will be removed in v1
+        of Diffusers.
     proxies (`Dict[str, str]`, *optional*):
         A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128',
         'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
@@ -584,7 +584,7 @@ class AutoPipelineForImage2Image(ConfigMixin):
         """
         cache_dir = kwargs.pop("cache_dir", None)
         force_download = kwargs.pop("force_download", False)
-        resume_download = kwargs.pop("resume_download", False)
+        resume_download = kwargs.pop("resume_download", None)
         proxies = kwargs.pop("proxies", None)
         token = kwargs.pop("token", None)
         local_files_only = kwargs.pop("local_files_only", False)
@@ -783,9 +783,9 @@ class AutoPipelineForInpainting(ConfigMixin):
     cache_dir (`Union[str, os.PathLike]`, *optional*):
         Path to a directory where a downloaded pretrained model configuration is cached if the standard cache
         is not used.
-    resume_download (`bool`, *optional*, defaults to `False`):
-        Whether or not to resume downloading the model weights and configuration files. If set to `False`, any
-        incompletely downloaded files are deleted.
+    resume_download:
+        Deprecated and ignored. All downloads are now resumed by default when possible. Will be removed in v1
+        of Diffusers.
     proxies (`Dict[str, str]`, *optional*):
         A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128',
         'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
@@ -860,7 +860,7 @@ class AutoPipelineForInpainting(ConfigMixin):
         """
         cache_dir = kwargs.pop("cache_dir", None)
         force_download = kwargs.pop("force_download", False)
-        resume_download = kwargs.pop("resume_download", False)
+        resume_download = kwargs.pop("resume_download", None)
         proxies = kwargs.pop("proxies", None)
         token = kwargs.pop("token", None)
         local_files_only = kwargs.pop("local_files_only", False)
@@ -227,6 +227,9 @@ class DiTPipeline(DiffusionPipeline):
         if output_type == "pil":
             samples = self.numpy_to_pil(samples)
 
+        # Offload all models
+        self.maybe_free_model_hooks()
+
         if not return_dict:
             return (samples,)
 
@@ -254,9 +254,9 @@ class FlaxDiffusionPipeline(ConfigMixin, PushToHubMixin):
     force_download (`bool`, *optional*, defaults to `False`):
         Whether or not to force the (re-)download of the model weights and configuration files, overriding the
         cached versions if they exist.
-    resume_download (`bool`, *optional*, defaults to `False`):
-        Whether or not to resume downloading the model weights and configuration files. If set to `False`, any
-        incompletely downloaded files are deleted.
+    resume_download:
+        Deprecated and ignored. All downloads are now resumed by default when possible. Will be removed in v1
+        of Diffusers.
     proxies (`Dict[str, str]`, *optional*):
         A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128',
         'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
@@ -316,7 +316,7 @@ class FlaxDiffusionPipeline(ConfigMixin, PushToHubMixin):
         ```
         """
         cache_dir = kwargs.pop("cache_dir", None)
-        resume_download = kwargs.pop("resume_download", False)
+        resume_download = kwargs.pop("resume_download", None)
         proxies = kwargs.pop("proxies", None)
         local_files_only = kwargs.pop("local_files_only", False)
         token = kwargs.pop("token", None)
@@ -435,7 +435,7 @@ def _load_empty_model(
         return_unused_kwargs=True,
         return_commit_hash=True,
         force_download=kwargs.pop("force_download", False),
-        resume_download=kwargs.pop("resume_download", False),
+        resume_download=kwargs.pop("resume_download", None),
         proxies=kwargs.pop("proxies", None),
         local_files_only=kwargs.pop("local_files_only", False),
         token=kwargs.pop("token", None),
@@ -454,7 +454,7 @@ def _load_empty_model(
         cached_folder,
         subfolder=name,
         force_download=kwargs.pop("force_download", False),
-        resume_download=kwargs.pop("resume_download", False),
+        resume_download=kwargs.pop("resume_download", None),
         proxies=kwargs.pop("proxies", None),
         local_files_only=kwargs.pop("local_files_only", False),
         token=kwargs.pop("token", None),
@@ -376,7 +376,11 @@ class DiffusionPipeline(ConfigMixin, PushToHubMixin):
             if not is_accelerate_available() or is_accelerate_version("<", "0.14.0"):
                 return False
 
-            return hasattr(module, "_hf_hook") and isinstance(module._hf_hook, accelerate.hooks.AlignDevicesHook)
+            return hasattr(module, "_hf_hook") and (
+                isinstance(module._hf_hook, accelerate.hooks.AlignDevicesHook)
+                or hasattr(module._hf_hook, "hooks")
+                and isinstance(module._hf_hook.hooks[0], accelerate.hooks.AlignDevicesHook)
+            )
 
         def module_is_offloaded(module):
             if not is_accelerate_available() or is_accelerate_version("<", "0.17.0.dev0"):
@@ -529,9 +533,9 @@ class DiffusionPipeline(ConfigMixin, PushToHubMixin):
     cache_dir (`Union[str, os.PathLike]`, *optional*):
         Path to a directory where a downloaded pretrained model configuration is cached if the standard cache
         is not used.
-    resume_download (`bool`, *optional*, defaults to `False`):
-        Whether or not to resume downloading the model weights and configuration files. If set to `False`, any
-        incompletely downloaded files are deleted.
+    resume_download:
+        Deprecated and ignored. All downloads are now resumed by default when possible. Will be removed in v1
+        of Diffusers.
     proxies (`Dict[str, str]`, *optional*):
         A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128',
         'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
@@ -621,7 +625,7 @@ class DiffusionPipeline(ConfigMixin, PushToHubMixin):
         ```
         """
         cache_dir = kwargs.pop("cache_dir", None)
-        resume_download = kwargs.pop("resume_download", False)
+        resume_download = kwargs.pop("resume_download", None)
         force_download = kwargs.pop("force_download", False)
         proxies = kwargs.pop("proxies", None)
         local_files_only = kwargs.pop("local_files_only", None)
@@ -1005,8 +1009,7 @@ class DiffusionPipeline(ConfigMixin, PushToHubMixin):
         """
         for _, model in self.components.items():
             if isinstance(model, torch.nn.Module) and hasattr(model, "_hf_hook"):
-                is_sequential_cpu_offload = isinstance(getattr(model, "_hf_hook"), accelerate.hooks.AlignDevicesHook)
-                accelerate.hooks.remove_hook_from_module(model, recurse=is_sequential_cpu_offload)
+                accelerate.hooks.remove_hook_from_module(model, recurse=True)
         self._all_hooks = []
 
     def enable_model_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[torch.device, str] = "cuda"):
@@ -1213,9 +1216,9 @@ class DiffusionPipeline(ConfigMixin, PushToHubMixin):
     force_download (`bool`, *optional*, defaults to `False`):
         Whether or not to force the (re-)download of the model weights and configuration files, overriding the
         cached versions if they exist.
-    resume_download (`bool`, *optional*, defaults to `False`):
-        Whether or not to resume downloading the model weights and configuration files. If set to `False`, any
-        incompletely downloaded files are deleted.
+    resume_download:
+        Deprecated and ignored. All downloads are now resumed by default when possible. Will be removed in v1
+        of Diffusers.
     proxies (`Dict[str, str]`, *optional*):
         A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128',
         'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
@@ -1268,7 +1271,7 @@ class DiffusionPipeline(ConfigMixin, PushToHubMixin):
 
         """
         cache_dir = kwargs.pop("cache_dir", None)
-        resume_download = kwargs.pop("resume_download", False)
+        resume_download = kwargs.pop("resume_download", None)
         force_download = kwargs.pop("force_download", False)
         proxies = kwargs.pop("proxies", None)
         local_files_only = kwargs.pop("local_files_only", None)
@@ -557,7 +557,7 @@ def convert_ldm_unet_checkpoint(
                 paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
             )
 
-            output_block_list = {k: sorted(v) for k, v in output_block_list.items()}
+            output_block_list = {k: sorted(v) for k, v in sorted(output_block_list.items())}
             if ["conv.bias", "conv.weight"] in output_block_list.values():
                 index = list(output_block_list.values()).index(["conv.bias", "conv.weight"])
                 new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.weight"] = unet_state_dict[
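Sorting the items matters because the next lines do a positional `.index()` lookup over `output_block_list.values()`, which otherwise depends on the order in which blocks were first encountered. A quick illustration with toy keys and values (not taken from a real checkpoint):

```python
# Toy illustration: the same mapping built in two different insertion orders.
a = {2: ["conv.bias", "conv.weight"], 0: ["norm.weight"]}
b = {0: ["norm.weight"], 2: ["conv.bias", "conv.weight"]}

# Positional lookup over .values() depends on insertion order...
print(list(a.values()).index(["conv.bias", "conv.weight"]))  # 0
print(list(b.values()).index(["conv.bias", "conv.weight"]))  # 1

# ...so normalizing with sorted(items) gives a stable, key-ordered view.
a_sorted = {k: sorted(v) for k, v in sorted(a.items())}
b_sorted = {k: sorted(v) for k, v in sorted(b.items())}
assert list(a_sorted.values()) == list(b_sorted.values())
```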
@@ -172,6 +172,7 @@ class StableDiffusionInstructPix2PixPipeline(
         prompt_embeds: Optional[torch.FloatTensor] = None,
         negative_prompt_embeds: Optional[torch.FloatTensor] = None,
         ip_adapter_image: Optional[PipelineImageInput] = None,
+        ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None,
         output_type: Optional[str] = "pil",
         return_dict: bool = True,
         callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
@@ -296,6 +297,8 @@ class StableDiffusionInstructPix2PixPipeline(
             negative_prompt,
             prompt_embeds,
             negative_prompt_embeds,
+            ip_adapter_image,
+            ip_adapter_image_embeds,
             callback_on_step_end_tensor_inputs,
         )
         self._guidance_scale = guidance_scale
@@ -303,14 +306,6 @@ class StableDiffusionInstructPix2PixPipeline(
 
         device = self._execution_device
 
-        if ip_adapter_image is not None:
-            output_hidden_state = False if isinstance(self.unet.encoder_hid_proj, ImageProjection) else True
-            image_embeds, negative_image_embeds = self.encode_image(
-                ip_adapter_image, device, num_images_per_prompt, output_hidden_state
-            )
-            if self.do_classifier_free_guidance:
-                image_embeds = torch.cat([image_embeds, negative_image_embeds, negative_image_embeds])
-
         if image is None:
             raise ValueError("`image` input cannot be undefined.")
 
@@ -335,6 +330,14 @@ class StableDiffusionInstructPix2PixPipeline(
             negative_prompt_embeds=negative_prompt_embeds,
         )
 
+        if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
+            image_embeds = self.prepare_ip_adapter_image_embeds(
+                ip_adapter_image,
+                ip_adapter_image_embeds,
+                device,
+                batch_size * num_images_per_prompt,
+                self.do_classifier_free_guidance,
+            )
         # 3. Preprocess image
         image = self.image_processor.preprocess(image)
 
@@ -635,6 +638,65 @@ class StableDiffusionInstructPix2PixPipeline(
 
         return image_embeds, uncond_image_embeds
 
+    def prepare_ip_adapter_image_embeds(
+        self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance
+    ):
+        if ip_adapter_image_embeds is None:
+            if not isinstance(ip_adapter_image, list):
+                ip_adapter_image = [ip_adapter_image]
+
+            if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers):
+                raise ValueError(
+                    f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters."
+                )
+
+            image_embeds = []
+            for single_ip_adapter_image, image_proj_layer in zip(
+                ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers
+            ):
+                output_hidden_state = not isinstance(image_proj_layer, ImageProjection)
+                single_image_embeds, single_negative_image_embeds = self.encode_image(
+                    single_ip_adapter_image, device, 1, output_hidden_state
+                )
+                single_image_embeds = torch.stack([single_image_embeds] * num_images_per_prompt, dim=0)
+                single_negative_image_embeds = torch.stack(
+                    [single_negative_image_embeds] * num_images_per_prompt, dim=0
+                )
+
+                if do_classifier_free_guidance:
+                    single_image_embeds = torch.cat(
+                        [single_image_embeds, single_negative_image_embeds, single_negative_image_embeds]
+                    )
+                    single_image_embeds = single_image_embeds.to(device)
+
+                image_embeds.append(single_image_embeds)
+        else:
+            repeat_dims = [1]
+            image_embeds = []
+            for single_image_embeds in ip_adapter_image_embeds:
+                if do_classifier_free_guidance:
+                    (
+                        single_image_embeds,
+                        single_negative_image_embeds,
+                        single_negative_image_embeds,
+                    ) = single_image_embeds.chunk(3)
+                    single_image_embeds = single_image_embeds.repeat(
+                        num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
+                    )
+                    single_negative_image_embeds = single_negative_image_embeds.repeat(
+                        num_images_per_prompt, *(repeat_dims * len(single_negative_image_embeds.shape[1:]))
+                    )
+                    single_image_embeds = torch.cat(
+                        [single_image_embeds, single_negative_image_embeds, single_negative_image_embeds]
+                    )
+                else:
+                    single_image_embeds = single_image_embeds.repeat(
+                        num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
+                    )
+                image_embeds.append(single_image_embeds)
+
+        return image_embeds
+
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
     def run_safety_checker(self, image, device, dtype):
         if self.safety_checker is None:
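With the new argument, the IP-Adapter image embeddings can be computed once and reused across calls instead of re-encoding the image every time. A hedged sketch, assuming placeholder checkpoint and adapter ids and that `style_image` and `input_image` are existing PIL images with an image encoder available:

```python
from diffusers import StableDiffusionInstructPix2PixPipeline

# Placeholder ids; any checkpoint with a loaded IP-Adapter follows the same pattern.
pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained("some/instruct-pix2pix-checkpoint")
pipe.load_ip_adapter("some/ip-adapter-repo", subfolder="models", weight_name="ip_adapter.safetensors")

# Precompute the embeddings once; with CFG enabled, the negative embeds are folded in.
embeds = pipe.prepare_ip_adapter_image_embeds(
    ip_adapter_image=style_image,
    ip_adapter_image_embeds=None,
    device=pipe.device,
    num_images_per_prompt=1,
    do_classifier_free_guidance=True,
)

# Reuse the cached embeddings on subsequent calls.
out = pipe(prompt="make it snowy", image=input_image, ip_adapter_image_embeds=embeds)
```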
@@ -687,6 +749,8 @@ class StableDiffusionInstructPix2PixPipeline(
         negative_prompt=None,
         prompt_embeds=None,
         negative_prompt_embeds=None,
+        ip_adapter_image=None,
+        ip_adapter_image_embeds=None,
         callback_on_step_end_tensor_inputs=None,
     ):
         if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0):
@@ -728,6 +792,21 @@ class StableDiffusionInstructPix2PixPipeline(
                 f" {negative_prompt_embeds.shape}."
             )
 
+        if ip_adapter_image is not None and ip_adapter_image_embeds is not None:
+            raise ValueError(
+                "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined."
+            )
+
+        if ip_adapter_image_embeds is not None:
+            if not isinstance(ip_adapter_image_embeds, list):
+                raise ValueError(
+                    f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}"
+                )
+            elif ip_adapter_image_embeds[0].ndim not in [3, 4]:
+                raise ValueError(
+                    f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D"
+                )
+
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
     def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
         shape = (
@@ -436,7 +436,6 @@ class StableDiffusionXLInstructPix2PixPipeline(
             extra_step_kwargs["generator"] = generator
         return extra_step_kwargs
 
-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_instruct_pix2pix.StableDiffusionInstructPix2PixPipeline.check_inputs
     def check_inputs(
         self,
         prompt,
@@ -112,9 +112,9 @@ class SchedulerMixin(PushToHubMixin):
     force_download (`bool`, *optional*, defaults to `False`):
         Whether or not to force the (re-)download of the model weights and configuration files, overriding the
         cached versions if they exist.
-    resume_download (`bool`, *optional*, defaults to `False`):
-        Whether or not to resume downloading the model weights and configuration files. If set to `False`, any
-        incompletely downloaded files are deleted.
+    resume_download:
+        Deprecated and ignored. All downloads are now resumed by default when possible. Will be removed in v1
+        of Diffusers.
     proxies (`Dict[str, str]`, *optional*):
         A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128',
         'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
@@ -102,9 +102,9 @@ class FlaxSchedulerMixin(PushToHubMixin):
     force_download (`bool`, *optional*, defaults to `False`):
         Whether or not to force the (re-)download of the model weights and configuration files, overriding the
         cached versions if they exist.
-    resume_download (`bool`, *optional*, defaults to `False`):
-        Whether or not to delete incompletely received files. Will attempt to resume the download if such a
-        file exists.
+    resume_download:
+        Deprecated and ignored. All downloads are now resumed by default when possible. Will be removed in v1
+        of Diffusers.
     proxies (`Dict[str, str]`, *optional*):
         A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
         'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
@@ -201,7 +201,7 @@ def get_cached_module_file(
     module_file: str,
     cache_dir: Optional[Union[str, os.PathLike]] = None,
     force_download: bool = False,
-    resume_download: bool = False,
+    resume_download: Optional[bool] = None,
     proxies: Optional[Dict[str, str]] = None,
     token: Optional[Union[bool, str]] = None,
    revision: Optional[str] = None,
@@ -228,9 +228,9 @@ def get_cached_module_file(
|
|||||||
cache should not be used.
|
cache should not be used.
|
||||||
force_download (`bool`, *optional*, defaults to `False`):
|
force_download (`bool`, *optional*, defaults to `False`):
|
||||||
Whether or not to force to (re-)download the configuration files and override the cached versions if they
|
Whether or not to force to (re-)download the configuration files and override the cached versions if they
|
||||||
exist.
|
exist. resume_download:
|
||||||
resume_download (`bool`, *optional*, defaults to `False`):
|
Deprecated and ignored. All downloads are now resumed by default when possible. Will be removed in v1
|
||||||
Whether or not to delete incompletely received file. Attempts to resume the download if such a file exists.
|
of Diffusers.
|
||||||
proxies (`Dict[str, str]`, *optional*):
|
proxies (`Dict[str, str]`, *optional*):
|
||||||
A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
|
A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
|
||||||
'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request.
|
'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request.
|
||||||
@@ -380,7 +380,7 @@ def get_class_from_dynamic_module(
|
|||||||
class_name: Optional[str] = None,
|
class_name: Optional[str] = None,
|
||||||
cache_dir: Optional[Union[str, os.PathLike]] = None,
|
cache_dir: Optional[Union[str, os.PathLike]] = None,
|
||||||
force_download: bool = False,
|
force_download: bool = False,
|
||||||
resume_download: bool = False,
|
resume_download: Optional[bool] = None,
|
||||||
proxies: Optional[Dict[str, str]] = None,
|
proxies: Optional[Dict[str, str]] = None,
|
||||||
token: Optional[Union[bool, str]] = None,
|
token: Optional[Union[bool, str]] = None,
|
||||||
revision: Optional[str] = None,
|
revision: Optional[str] = None,
|
||||||
@@ -417,8 +417,9 @@ def get_class_from_dynamic_module(
|
|||||||
force_download (`bool`, *optional*, defaults to `False`):
|
force_download (`bool`, *optional*, defaults to `False`):
|
||||||
Whether or not to force to (re-)download the configuration files and override the cached versions if they
|
Whether or not to force to (re-)download the configuration files and override the cached versions if they
|
||||||
exist.
|
exist.
|
||||||
resume_download (`bool`, *optional*, defaults to `False`):
|
resume_download:
|
||||||
Whether or not to delete incompletely received file. Attempts to resume the download if such a file exists.
|
Deprecated and ignored. All downloads are now resumed by default when possible. Will be removed in v1 of
|
||||||
|
Diffusers.
|
||||||
proxies (`Dict[str, str]`, *optional*):
|
proxies (`Dict[str, str]`, *optional*):
|
||||||
A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
|
A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
|
||||||
'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request.
|
'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request.
|
||||||
|
|||||||
@@ -283,7 +283,7 @@ def _get_model_file(
|
|||||||
cache_dir: Optional[str] = None,
|
cache_dir: Optional[str] = None,
|
||||||
force_download: bool = False,
|
force_download: bool = False,
|
||||||
proxies: Optional[Dict] = None,
|
proxies: Optional[Dict] = None,
|
||||||
resume_download: bool = False,
|
resume_download: Optional[bool] = None,
|
||||||
local_files_only: bool = False,
|
local_files_only: bool = False,
|
||||||
token: Optional[str] = None,
|
token: Optional[str] = None,
|
||||||
user_agent: Optional[Union[Dict, str]] = None,
|
user_agent: Optional[Union[Dict, str]] = None,
|
||||||
@@ -30,9 +30,14 @@ from huggingface_hub.utils import is_jinja_available
 from requests.exceptions import HTTPError

 from diffusers.models import UNet2DConditionModel
-from diffusers.models.attention_processor import AttnProcessor, AttnProcessor2_0, XFormersAttnProcessor
+from diffusers.models.attention_processor import (
+    AttnProcessor,
+    AttnProcessor2_0,
+    AttnProcessorNPU,
+    XFormersAttnProcessor,
+)
 from diffusers.training_utils import EMAModel
-from diffusers.utils import is_xformers_available, logging
+from diffusers.utils import is_torch_npu_available, is_xformers_available, logging
 from diffusers.utils.testing_utils import (
     CaptureLogger,
     get_python_version,
@@ -300,6 +305,53 @@ class ModelTesterMixin:

         assert str(error.exception) == f"'{type(model).__name__}' object has no attribute 'does_not_exist'"

+    @unittest.skipIf(
+        torch_device != "npu" or not is_torch_npu_available(),
+        reason="torch npu flash attention is only available with NPU and `torch_npu` installed",
+    )
+    def test_set_torch_npu_flash_attn_processor_determinism(self):
+        torch.use_deterministic_algorithms(False)
+        if self.forward_requires_fresh_args:
+            model = self.model_class(**self.init_dict)
+        else:
+            init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+            model = self.model_class(**init_dict)
+        model.to(torch_device)
+
+        if not hasattr(model, "set_attn_processor"):
+            # If not has `set_attn_processor`, skip test
+            return
+
+        model.set_default_attn_processor()
+        assert all(type(proc) == AttnProcessorNPU for proc in model.attn_processors.values())
+        with torch.no_grad():
+            if self.forward_requires_fresh_args:
+                output = model(**self.inputs_dict(0))[0]
+            else:
+                output = model(**inputs_dict)[0]
+
+        model.enable_npu_flash_attention()
+        assert all(type(proc) == AttnProcessorNPU for proc in model.attn_processors.values())
+        with torch.no_grad():
+            if self.forward_requires_fresh_args:
+                output_2 = model(**self.inputs_dict(0))[0]
+            else:
+                output_2 = model(**inputs_dict)[0]
+
+        model.set_attn_processor(AttnProcessorNPU())
+        assert all(type(proc) == AttnProcessorNPU for proc in model.attn_processors.values())
+        with torch.no_grad():
+            if self.forward_requires_fresh_args:
+                output_3 = model(**self.inputs_dict(0))[0]
+            else:
+                output_3 = model(**inputs_dict)[0]
+
+        torch.use_deterministic_algorithms(True)
+
+        assert torch.allclose(output, output_2, atol=self.base_precision)
+        assert torch.allclose(output, output_3, atol=self.base_precision)
+        assert torch.allclose(output_2, output_3, atol=self.base_precision)
+
     @unittest.skipIf(
         torch_device != "cuda" or not is_xformers_available(),
         reason="XFormers attention is only available with CUDA and `xformers` installed",
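The new test above exercises the NPU flash-attention path end to end. A rough usage sketch, assuming an Ascend NPU host with `torch_npu` installed (the checkpoint id is illustrative, not prescribed by the diff):

from diffusers import UNet2DConditionModel
from diffusers.models.attention_processor import AttnProcessorNPU

# example checkpoint; any UNet2DConditionModel works
unet = UNet2DConditionModel.from_pretrained("runwayml/stable-diffusion-v1-5", subfolder="unet").to("npu")

# either install the NPU attention processor explicitly ...
unet.set_attn_processor(AttnProcessorNPU())
# ... or use the convenience toggle the test calls
unet.enable_npu_flash_attention()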
@@ -691,6 +743,9 @@ class ModelTesterMixin:
     def test_cpu_offload(self):
         config, inputs_dict = self.prepare_init_args_and_inputs_for_common()
         model = self.model_class(**config).eval()
+        if model._no_split_modules is None:
+            return
+
         model = model.to(torch_device)

         torch.manual_seed(0)
@@ -718,6 +773,9 @@ class ModelTesterMixin:
     def test_disk_offload_without_safetensors(self):
         config, inputs_dict = self.prepare_init_args_and_inputs_for_common()
         model = self.model_class(**config).eval()
+        if model._no_split_modules is None:
+            return
+
         model = model.to(torch_device)

         torch.manual_seed(0)
@@ -728,12 +786,12 @@ class ModelTesterMixin:
             model.cpu().save_pretrained(tmp_dir, safe_serialization=False)

             with self.assertRaises(ValueError):
-                max_size = int(self.model_split_percents[1] * model_size)
+                max_size = int(self.model_split_percents[0] * model_size)
                 max_memory = {0: max_size, "cpu": max_size}
                 # This errors out because it's missing an offload folder
                 new_model = self.model_class.from_pretrained(tmp_dir, device_map="auto", max_memory=max_memory)

-            max_size = int(self.model_split_percents[1] * model_size)
+            max_size = int(self.model_split_percents[0] * model_size)
             max_memory = {0: max_size, "cpu": max_size}
             new_model = self.model_class.from_pretrained(
                 tmp_dir, device_map="auto", max_memory=max_memory, offload_folder=tmp_dir
@@ -749,6 +807,9 @@ class ModelTesterMixin:
     def test_disk_offload_with_safetensors(self):
         config, inputs_dict = self.prepare_init_args_and_inputs_for_common()
         model = self.model_class(**config).eval()
+        if model._no_split_modules is None:
+            return
+
         model = model.to(torch_device)

         torch.manual_seed(0)
@@ -758,7 +819,7 @@ class ModelTesterMixin:
         with tempfile.TemporaryDirectory() as tmp_dir:
             model.cpu().save_pretrained(tmp_dir)

-            max_size = int(self.model_split_percents[1] * model_size)
+            max_size = int(self.model_split_percents[0] * model_size)
             max_memory = {0: max_size, "cpu": max_size}
             new_model = self.model_class.from_pretrained(
                 tmp_dir, device_map="auto", offload_folder=tmp_dir, max_memory=max_memory
@@ -774,6 +835,9 @@ class ModelTesterMixin:
     def test_model_parallelism(self):
         config, inputs_dict = self.prepare_init_args_and_inputs_for_common()
         model = self.model_class(**config).eval()
+        if model._no_split_modules is None:
+            return
+
         model = model.to(torch_device)

         torch.manual_seed(0)
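The `if model._no_split_modules is None: return` guard added above skips the offload and parallelism tests for models that cannot be sharded; for models that can, the tests mirror the user-facing loading pattern. A hedged sketch of that pattern (memory limits and paths are placeholders):

from diffusers import UNet2DConditionModel

# requires `accelerate`; weights that do not fit in `max_memory` spill to `offload_folder`
unet = UNet2DConditionModel.from_pretrained(
    "runwayml/stable-diffusion-v1-5",       # example checkpoint
    subfolder="unet",
    device_map="auto",
    max_memory={0: "2GiB", "cpu": "4GiB"},  # illustrative limits
    offload_folder="./offload",
)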
@@ -66,16 +66,17 @@ class AudioLDMPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
     def get_dummy_components(self):
         torch.manual_seed(0)
         unet = UNet2DConditionModel(
-            block_out_channels=(32, 64),
-            layers_per_block=2,
+            block_out_channels=(8, 16),
+            layers_per_block=1,
+            norm_num_groups=8,
             sample_size=32,
             in_channels=4,
             out_channels=4,
             down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
             up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
-            cross_attention_dim=(32, 64),
+            cross_attention_dim=(8, 16),
             class_embed_type="simple_projection",
-            projection_class_embeddings_input_dim=32,
+            projection_class_embeddings_input_dim=8,
             class_embeddings_concat=True,
         )
         scheduler = DDIMScheduler(
@@ -87,9 +88,10 @@ class AudioLDMPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
         )
         torch.manual_seed(0)
         vae = AutoencoderKL(
-            block_out_channels=[32, 64],
+            block_out_channels=[8, 16],
             in_channels=1,
             out_channels=1,
+            norm_num_groups=8,
             down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
             up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
             latent_channels=4,
@@ -98,14 +100,14 @@ class AudioLDMPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
         text_encoder_config = ClapTextConfig(
             bos_token_id=0,
             eos_token_id=2,
-            hidden_size=32,
+            hidden_size=8,
             intermediate_size=37,
             layer_norm_eps=1e-05,
-            num_attention_heads=4,
-            num_hidden_layers=5,
+            num_attention_heads=1,
+            num_hidden_layers=1,
             pad_token_id=1,
             vocab_size=1000,
-            projection_dim=32,
+            projection_dim=8,
         )
         text_encoder = ClapTextModelWithProjection(text_encoder_config)
         tokenizer = RobertaTokenizer.from_pretrained("hf-internal-testing/tiny-random-roberta", model_max_length=77)
@@ -64,9 +64,9 @@ class BlipDiffusionPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
         torch.manual_seed(0)
         text_encoder_config = CLIPTextConfig(
             vocab_size=1000,
-            hidden_size=16,
-            intermediate_size=16,
-            projection_dim=16,
+            hidden_size=8,
+            intermediate_size=8,
+            projection_dim=8,
             num_hidden_layers=1,
             num_attention_heads=1,
             max_position_embeddings=77,
@@ -78,17 +78,17 @@ class BlipDiffusionPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
             out_channels=4,
             down_block_types=("DownEncoderBlock2D",),
             up_block_types=("UpDecoderBlock2D",),
-            block_out_channels=(32,),
+            block_out_channels=(8,),
+            norm_num_groups=8,
             layers_per_block=1,
             act_fn="silu",
             latent_channels=4,
-            norm_num_groups=16,
-            sample_size=16,
+            sample_size=8,
         )

         blip_vision_config = {
-            "hidden_size": 16,
-            "intermediate_size": 16,
+            "hidden_size": 8,
+            "intermediate_size": 8,
             "num_hidden_layers": 1,
             "num_attention_heads": 1,
             "image_size": 224,
@@ -98,32 +98,32 @@ class BlipDiffusionPipelineFastTests(PipelineTesterMixin, unittest.TestCase):

         blip_qformer_config = {
             "vocab_size": 1000,
-            "hidden_size": 16,
+            "hidden_size": 8,
             "num_hidden_layers": 1,
             "num_attention_heads": 1,
-            "intermediate_size": 16,
+            "intermediate_size": 8,
             "max_position_embeddings": 512,
             "cross_attention_frequency": 1,
-            "encoder_hidden_size": 16,
+            "encoder_hidden_size": 8,
         }
         qformer_config = Blip2Config(
             vision_config=blip_vision_config,
             qformer_config=blip_qformer_config,
-            num_query_tokens=16,
+            num_query_tokens=8,
             tokenizer="hf-internal-testing/tiny-random-bert",
         )
         qformer = Blip2QFormerModel(qformer_config)

         unet = UNet2DConditionModel(
-            block_out_channels=(16, 32),
-            norm_num_groups=16,
+            block_out_channels=(8, 16),
+            norm_num_groups=8,
             layers_per_block=1,
             sample_size=16,
             in_channels=4,
             out_channels=4,
             down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
             up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
-            cross_attention_dim=16,
+            cross_attention_dim=8,
         )
         tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")

@@ -189,7 +189,9 @@ class BlipDiffusionPipelineFastTests(PipelineTesterMixin, unittest.TestCase):

         assert image.shape == (1, 16, 16, 4)

-        expected_slice = np.array([0.7096, 0.5900, 0.6703, 0.4032, 0.7766, 0.3629, 0.5447, 0.4149, 0.8172])
+        expected_slice = np.array(
+            [0.5329548, 0.8372512, 0.33269387, 0.82096875, 0.43657133, 0.3783, 0.5953028, 0.51934963, 0.42142007]
+        )

         assert (
             np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
@@ -324,10 +324,6 @@ class PixArtAlphaPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
     def test_inference_batch_single_identical(self):
         self._test_inference_batch_single_identical(expected_max_diff=1e-3)

-    # PixArt transformer model does not work with sequential offload so skip it for now
-    def test_sequential_offload_forward_pass_twice(self):
-        pass
-

 @slow
 @require_torch_gpu
@@ -308,10 +308,6 @@ class PixArtSigmaPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
     def test_inference_batch_single_identical(self):
         self._test_inference_batch_single_identical(expected_max_diff=1e-3)

-    # PixArt transformer model does not work with sequential offload so skip it for now
-    def test_sequential_offload_forward_pass_twice(self):
-        pass
-

 @slow
 @require_torch_gpu
@@ -1257,8 +1257,8 @@ class StableDiffusionPipelineCkptTests(unittest.TestCase):

     def test_download_from_hub(self):
         ckpt_paths = [
-            "https://huggingface.co/runwayml/stable-diffusion-v1-5/blob/main/v1-5-pruned-emaonly.ckpt",
-            "https://huggingface.co/WarriorMama777/OrangeMixs/blob/main/Models/AbyssOrangeMix/AbyssOrangeMix_base.ckpt",
+            "https://huggingface.co/runwayml/stable-diffusion-v1-5/blob/main/v1-5-pruned-emaonly.safetensors",
+            "https://huggingface.co/WarriorMama777/OrangeMixs/blob/main/Models/AbyssOrangeMix/AbyssOrangeMix.safetensors",
         ]

         for ckpt_path in ckpt_paths:
@@ -1271,7 +1271,7 @@ class StableDiffusionPipelineCkptTests(unittest.TestCase):
         assert image_out.shape == (512, 512, 3)

     def test_download_local(self):
-        ckpt_filename = hf_hub_download("runwayml/stable-diffusion-v1-5", filename="v1-5-pruned-emaonly.ckpt")
+        ckpt_filename = hf_hub_download("runwayml/stable-diffusion-v1-5", filename="v1-5-pruned-emaonly.safetensors")
         config_filename = hf_hub_download("runwayml/stable-diffusion-v1-5", filename="v1-inference.yaml")

         pipe = StableDiffusionPipeline.from_single_file(
@@ -1285,7 +1285,7 @@ class StableDiffusionPipelineCkptTests(unittest.TestCase):
         assert image_out.shape == (512, 512, 3)

     def test_download_ckpt_diff_format_is_same(self):
-        ckpt_path = "https://huggingface.co/runwayml/stable-diffusion-v1-5/blob/main/v1-5-pruned-emaonly.ckpt"
+        ckpt_path = "https://huggingface.co/runwayml/stable-diffusion-v1-5/blob/main/v1-5-pruned-emaonly.safetensors"

         sf_pipe = StableDiffusionPipeline.from_single_file(ckpt_path)
         sf_pipe.scheduler = DDIMScheduler.from_config(sf_pipe.scheduler.config)
@@ -1310,7 +1310,7 @@ class StableDiffusionPipelineCkptTests(unittest.TestCase):
     def test_single_file_component_configs(self):
         pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")

-        ckpt_path = "https://huggingface.co/runwayml/stable-diffusion-v1-5/blob/main/v1-5-pruned-emaonly.ckpt"
+        ckpt_path = "https://huggingface.co/runwayml/stable-diffusion-v1-5/blob/main/v1-5-pruned-emaonly.safetensors"
         single_file_pipe = StableDiffusionPipeline.from_single_file(ckpt_path, load_safety_checker=True)

         for param_name, param_value in single_file_pipe.text_encoder.config.to_dict().items():
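The four test updates above only swap the `.ckpt` URLs for `.safetensors` ones; the single-file loading path itself is unchanged. A small sketch of that path, reusing the same URL the tests point at:

from diffusers import StableDiffusionPipeline

ckpt_path = "https://huggingface.co/runwayml/stable-diffusion-v1-5/blob/main/v1-5-pruned-emaonly.safetensors"
pipe = StableDiffusionPipeline.from_single_file(ckpt_path)
image = pipe("a fantasy landscape", num_inference_steps=2).images[0]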
@@ -1360,6 +1360,8 @@ class PipelineTesterMixin:
         reason="CPU offload is only available with CUDA and `accelerate v0.14.0` or higher",
     )
     def test_sequential_cpu_offload_forward_pass(self, expected_max_diff=1e-4):
+        import accelerate
+
         components = self.get_dummy_components()
         pipe = self.pipeline_class(**components)
         for component in pipe.components.values():
@@ -1373,6 +1375,7 @@ class PipelineTesterMixin:
         output_without_offload = pipe(**inputs)[0]

         pipe.enable_sequential_cpu_offload()
+        assert pipe._execution_device.type == pipe._offload_device.type

         inputs = self.get_dummy_inputs(generator_device)
         output_with_offload = pipe(**inputs)[0]
@@ -1380,11 +1383,48 @@ class PipelineTesterMixin:
         max_diff = np.abs(to_np(output_with_offload) - to_np(output_without_offload)).max()
         self.assertLess(max_diff, expected_max_diff, "CPU offloading should not affect the inference results")

+        # make sure all `torch.nn.Module` components (except those in `self._exclude_from_cpu_offload`) are offloaded correctly
+        offloaded_modules = {
+            k: v
+            for k, v in pipe.components.items()
+            if isinstance(v, torch.nn.Module) and k not in pipe._exclude_from_cpu_offload
+        }
+        # 1. all offloaded modules should be saved to cpu and moved to meta device
+        self.assertTrue(
+            all(v.device.type == "meta" for v in offloaded_modules.values()),
+            f"Not offloaded: {[k for k, v in offloaded_modules.items() if v.device.type != 'meta']}",
+        )
+        # 2. all offloaded modules should have hook installed
+        self.assertTrue(
+            all(hasattr(v, "_hf_hook") for k, v in offloaded_modules.items()),
+            f"No hook attached: {[k for k, v in offloaded_modules.items() if not hasattr(v, '_hf_hook')]}",
+        )
+        # 3. all offloaded modules should have correct hooks installed, should be either one of these two
+        # - `AlignDevicesHook`
+        # - a SequentialHook` that contains `AlignDevicesHook`
+        offloaded_modules_with_incorrect_hooks = {}
+        for k, v in offloaded_modules.items():
+            if hasattr(v, "_hf_hook"):
+                if isinstance(v._hf_hook, accelerate.hooks.SequentialHook):
+                    # if it is a `SequentialHook`, we loop through its `hooks` attribute to check if it only contains `AlignDevicesHook`
+                    for hook in v._hf_hook.hooks:
+                        if not isinstance(hook, accelerate.hooks.AlignDevicesHook):
+                            offloaded_modules_with_incorrect_hooks[k] = type(v._hf_hook.hooks[0])
+                elif not isinstance(v._hf_hook, accelerate.hooks.AlignDevicesHook):
+                    offloaded_modules_with_incorrect_hooks[k] = type(v._hf_hook)
+
+        self.assertTrue(
+            len(offloaded_modules_with_incorrect_hooks) == 0,
+            f"Not installed correct hook: {offloaded_modules_with_incorrect_hooks}",
+        )
+
     @unittest.skipIf(
         torch_device != "cuda" or not is_accelerate_available() or is_accelerate_version("<", "0.17.0"),
         reason="CPU offload is only available with CUDA and `accelerate v0.17.0` or higher",
     )
     def test_model_cpu_offload_forward_pass(self, expected_max_diff=2e-4):
+        import accelerate
+
         generator_device = "cpu"
         components = self.get_dummy_components()
         pipe = self.pipeline_class(**components)
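The added assertions pin down what offloading should leave behind: every offloaded `torch.nn.Module` component carries an accelerate hook (`AlignDevicesHook`, possibly wrapped in a `SequentialHook`, for sequential offload; `CpuOffload` for model offload, as in the next hunk). A rough way to inspect this by hand, mirroring the test logic (needs CUDA and a recent `accelerate`; the model id is only an example):

import accelerate
import torch
from diffusers import StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
pipe.enable_model_cpu_offload()

for name, module in pipe.components.items():
    if isinstance(module, torch.nn.Module) and name not in pipe._exclude_from_cpu_offload:
        hook = getattr(module, "_hf_hook", None)
        # with model offload every component should carry a `CpuOffload` hook
        print(name, type(hook).__name__, isinstance(hook, accelerate.hooks.CpuOffload))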
@@ -1400,19 +1440,39 @@ class PipelineTesterMixin:
         output_without_offload = pipe(**inputs)[0]

         pipe.enable_model_cpu_offload()
+        assert pipe._execution_device.type == pipe._offload_device.type

         inputs = self.get_dummy_inputs(generator_device)
         output_with_offload = pipe(**inputs)[0]

         max_diff = np.abs(to_np(output_with_offload) - to_np(output_without_offload)).max()
         self.assertLess(max_diff, expected_max_diff, "CPU offloading should not affect the inference results")
-        offloaded_modules = [
-            v
-            for k, v in pipe.components.items()
-            if isinstance(v, torch.nn.Module) and k not in pipe._exclude_from_cpu_offload
-        ]
-        (
-            self.assertTrue(all(v.device.type == "cpu" for v in offloaded_modules)),
-            f"Not offloaded: {[v for v in offloaded_modules if v.device.type != 'cpu']}",
+
+        # make sure all `torch.nn.Module` components (except those in `self._exclude_from_cpu_offload`) are offloaded correctly
+        offloaded_modules = {
+            k: v
+            for k, v in pipe.components.items()
+            if isinstance(v, torch.nn.Module) and k not in pipe._exclude_from_cpu_offload
+        }
+        # 1. check if all offloaded modules are saved to cpu
+        self.assertTrue(
+            all(v.device.type == "cpu" for v in offloaded_modules.values()),
+            f"Not offloaded: {[k for k, v in offloaded_modules.items() if v.device.type != 'cpu']}",
+        )
+        # 2. check if all offloaded modules have hooks installed
+        self.assertTrue(
+            all(hasattr(v, "_hf_hook") for k, v in offloaded_modules.items()),
+            f"No hook attached: {[k for k, v in offloaded_modules.items() if not hasattr(v, '_hf_hook')]}",
+        )
+        # 3. check if all offloaded modules have correct type of hooks installed, should be `CpuOffload`
+        offloaded_modules_with_incorrect_hooks = {}
+        for k, v in offloaded_modules.items():
+            if hasattr(v, "_hf_hook") and not isinstance(v._hf_hook, accelerate.hooks.CpuOffload):
+                offloaded_modules_with_incorrect_hooks[k] = type(v._hf_hook)
+
+        self.assertTrue(
+            len(offloaded_modules_with_incorrect_hooks) == 0,
+            f"Not installed correct hook: {offloaded_modules_with_incorrect_hooks}",
         )

     @unittest.skipIf(
@@ -1444,16 +1504,24 @@ class PipelineTesterMixin:
         self.assertLess(
             max_diff, expected_max_diff, "running CPU offloading 2nd time should not affect the inference results"
         )

+        # make sure all `torch.nn.Module` components (except those in `self._exclude_from_cpu_offload`) are offloaded correctly
         offloaded_modules = {
             k: v
             for k, v in pipe.components.items()
             if isinstance(v, torch.nn.Module) and k not in pipe._exclude_from_cpu_offload
         }
+        # 1. check if all offloaded modules are saved to cpu
         self.assertTrue(
             all(v.device.type == "cpu" for v in offloaded_modules.values()),
             f"Not offloaded: {[k for k, v in offloaded_modules.items() if v.device.type != 'cpu']}",
         )
+        # 2. check if all offloaded modules have hooks installed
+        self.assertTrue(
+            all(hasattr(v, "_hf_hook") for k, v in offloaded_modules.items()),
+            f"No hook attached: {[k for k, v in offloaded_modules.items() if not hasattr(v, '_hf_hook')]}",
+        )
+        # 3. check if all offloaded modules have correct type of hooks installed, should be `CpuOffload`
         offloaded_modules_with_incorrect_hooks = {}
         for k, v in offloaded_modules.items():
             if hasattr(v, "_hf_hook") and not isinstance(v._hf_hook, accelerate.hooks.CpuOffload):
@@ -1493,19 +1561,36 @@ class PipelineTesterMixin:
         self.assertLess(
             max_diff, expected_max_diff, "running sequential offloading second time should have the inference results"
         )

+        # make sure all `torch.nn.Module` components (except those in `self._exclude_from_cpu_offload`) are offloaded correctly
         offloaded_modules = {
             k: v
             for k, v in pipe.components.items()
             if isinstance(v, torch.nn.Module) and k not in pipe._exclude_from_cpu_offload
         }
+        # 1. check if all offloaded modules are moved to meta device
         self.assertTrue(
             all(v.device.type == "meta" for v in offloaded_modules.values()),
             f"Not offloaded: {[k for k, v in offloaded_modules.items() if v.device.type != 'meta']}",
         )
+        # 2. check if all offloaded modules have hook installed
+        self.assertTrue(
+            all(hasattr(v, "_hf_hook") for k, v in offloaded_modules.items()),
+            f"No hook attached: {[k for k, v in offloaded_modules.items() if not hasattr(v, '_hf_hook')]}",
+        )
+        # 3. check if all offloaded modules have correct hooks installed, should be either one of these two
+        # - `AlignDevicesHook`
+        # - a SequentialHook` that contains `AlignDevicesHook`
         offloaded_modules_with_incorrect_hooks = {}
         for k, v in offloaded_modules.items():
-            if hasattr(v, "_hf_hook") and not isinstance(v._hf_hook, accelerate.hooks.AlignDevicesHook):
-                offloaded_modules_with_incorrect_hooks[k] = type(v._hf_hook)
+            if hasattr(v, "_hf_hook"):
+                if isinstance(v._hf_hook, accelerate.hooks.SequentialHook):
+                    # if it is a `SequentialHook`, we loop through its `hooks` attribute to check if it only contains `AlignDevicesHook`
+                    for hook in v._hf_hook.hooks:
+                        if not isinstance(hook, accelerate.hooks.AlignDevicesHook):
+                            offloaded_modules_with_incorrect_hooks[k] = type(v._hf_hook.hooks[0])
+                elif not isinstance(v._hf_hook, accelerate.hooks.AlignDevicesHook):
+                    offloaded_modules_with_incorrect_hooks[k] = type(v._hf_hook)

         self.assertTrue(
             len(offloaded_modules_with_incorrect_hooks) == 0,