Release: v0.19.0

2025-12-25 22:04:41 +08:00 · 2023-07-26 21:03:45 +02:00
350 changed files with 5187 additions and 36180 deletions
--- a/.github/workflows/pr_tests.yml
+++ b/.github/workflows/pr_tests.yml
@@ -113,60 +113,3 @@ jobs:
      with:
        name: pr_${{ matrix.config.report }}_test_reports
        path: reports
-
-  run_staging_tests:
-    strategy:
-      fail-fast: false
-      matrix:
-        config:
-          - name: Hub tests for models, schedulers, and pipelines
-            framework: hub_tests_pytorch
-            runner: docker-cpu
-            image: diffusers/diffusers-pytorch-cpu
-            report: torch_hub
-
-    name: ${{ matrix.config.name }}
-
-    runs-on: ${{ matrix.config.runner }}
-
-    container:
-      image: ${{ matrix.config.image }}
-      options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/
-
-    defaults:
-      run:
-        shell: bash
-
-    steps:
-    - name: Checkout diffusers
-      uses: actions/checkout@v3
-      with:
-        fetch-depth: 2
-
-    - name: Install dependencies
-      run: |
-        apt-get update && apt-get install libsndfile1-dev libgl1 -y
-        python -m pip install -e .[quality,test]
-
-    - name: Environment
-      run: |
-        python utils/print_env.py
-
-    - name: Run Hub tests for models, schedulers, and pipelines on a staging env
-      if: ${{ matrix.config.framework == 'hub_tests_pytorch' }}
-      run: |
-        HUGGINGFACE_CO_STAGING=true python -m pytest \
-          -m "is_staging_test" \
-          --make-reports=tests_${{ matrix.config.report }} \
-          tests
-
-    - name: Failure short reports
-      if: ${{ failure() }}
-      run: cat reports/tests_${{ matrix.config.report }}_failures_short.txt
-
-    - name: Test suite reports artifacts
-      if: ${{ always() }}
-      uses: actions/upload-artifact@v2
-      with:
-        name: pr_${{ matrix.config.report }}_test_reports
-        path: reports
--- a/2
+++ b/2
@@ -78,7 +78,7 @@ test:
 # Run tests for examples

 test-examples:
-	python -m pytest -n auto --dist=loadfile -s -v ./examples/
+	python -m pytest -n auto --dist=loadfile -s -v ./examples/pytorch/


 # Release stuff
--- a/PHILOSOPHY.md
+++ b/PHILOSOPHY.md
@@ -90,7 +90,7 @@ The following design principles are followed:
 - To integrate new model checkpoints whose general architecture can be classified as an architecture that already exists in Diffusers, the existing model architecture shall be adapted to make it work with the new checkpoint. One should only create a new file if the model architecture is fundamentally different.
 - Models should be designed to be easily extendable to future changes. This can be achieved by limiting public function arguments, configuration arguments, and "foreseeing" future changes, *e.g.* it is usually better to add `string` "...type" arguments that can easily be extended to new future types instead of boolean `is_..._type` arguments. Only the minimum amount of changes shall be made to existing architectures to make a new model checkpoint work.
 - The model design is a difficult trade-off between keeping code readable and concise and supporting many model checkpoints. For most parts of the modeling code, classes shall be adapted for new model checkpoints, while there are some exceptions where it is preferred to add new classes to make sure the code is kept concise and
-readable longterm, such as [UNet blocks](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/unet_2d_blocks.py) and [Attention processors](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+readable longterm, such as [UNet blocks](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/unet_2d_blocks.py) and [Attention processors](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py).

 ### Schedulers

--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
 <p align="center">
    <br>
-    <img src="https://raw.githubusercontent.com/huggingface/diffusers/main/docs/source/en/imgs/diffusers_library.jpg" width="400"/>
+    <img src="https://github.com/huggingface/diffusers/blob/main/docs/source/en/imgs/diffusers_library.jpg" width="400"/>
    <br>
 <p>
 <p align="center">
--- a/docs/source/en/_toctree.yml
+++ b/docs/source/en/_toctree.yml
@@ -13,8 +13,6 @@
    title: Overview
  - local: using-diffusers/write_own_pipeline
    title: Understanding models and schedulers
-  - local: tutorials/autopipeline
-    title: AutoPipeline
  - local: tutorials/basic_training
    title: Train a diffusion model
  title: Tutorials
@@ -32,22 +30,20 @@
      title: Load safetensors
    - local: using-diffusers/other-formats
      title: Load different Stable Diffusion formats
-    - local: using-diffusers/push_to_hub
-      title: Push files to the Hub
    title: Loading & Hub
  - sections:
+    - local: using-diffusers/pipeline_overview
+      title: Overview
    - local: using-diffusers/unconditional_image_generation
      title: Unconditional image generation
    - local: using-diffusers/conditional_image_generation
-      title: Text-to-image
+      title: Text-to-image generation
    - local: using-diffusers/img2img
-      title: Image-to-image
+      title: Text-guided image-to-image
    - local: using-diffusers/inpaint
-      title: Inpainting
+      title: Text-guided image-inpainting
    - local: using-diffusers/depth2img
-      title: Depth-to-image
-    title: Tasks
-  - sections:
+      title: Text-guided depth-to-image
    - local: using-diffusers/textual_inversion_inference
      title: Textual inversion
    - local: training/distributed_inference
@@ -56,22 +52,16 @@
      title: Improve image quality with deterministic generation
    - local: using-diffusers/control_brightness
      title: Control image brightness
-    - local: using-diffusers/weighted_prompts
-      title: Prompt weighting
-    title: Techniques
-  - sections:
-    - local: using-diffusers/pipeline_overview
-      title: Overview
-    - local: using-diffusers/sdxl
-      title: Stable Diffusion XL
-    - local: using-diffusers/distilled_sd
-      title: Distilled Stable Diffusion inference
    - local: using-diffusers/reproducibility
      title: Create reproducible pipelines
    - local: using-diffusers/custom_pipeline_examples
      title: Community pipelines
    - local: using-diffusers/contribute_pipeline
      title: How to contribute a community pipeline
+    - local: using-diffusers/stable_diffusion_jax_how_to
+      title: Stable Diffusion in JAX/Flax
+    - local: using-diffusers/weighted_prompts
+      title: Weighting Prompts
    title: Pipelines for Inference
  - sections:
    - local: training/overview
@@ -109,8 +99,6 @@
    title: Memory and Speed
  - local: optimization/torch2.0
    title: Torch2.0 support
-  - local: using-diffusers/stable_diffusion_jax_how_to
-    title: Stable Diffusion in JAX/Flax
  - local: optimization/xformers
    title: xFormers
  - local: optimization/onnx
@@ -174,8 +162,6 @@
      title: AutoencoderKL
    - local: api/models/asymmetricautoencoderkl
      title: AsymmetricAutoencoderKL
-    - local: api/models/autoencoder_tiny
-      title: Tiny AutoEncoder
    - local: api/models/transformer2d
      title: Transformer2D
    - local: api/models/transformer_temporal
@@ -196,16 +182,12 @@
      title: Audio Diffusion
    - local: api/pipelines/audioldm
      title: AudioLDM
-    - local: api/pipelines/audioldm2
-      title: AudioLDM 2
    - local: api/pipelines/auto_pipeline
      title: AutoPipeline
    - local: api/pipelines/consistency_models
      title: Consistency Models
    - local: api/pipelines/controlnet
      title: ControlNet
-    - local: api/pipelines/controlnet_sdxl
-      title: ControlNet with Stable Diffusion XL
    - local: api/pipelines/cycle_diffusion
      title: Cycle Diffusion
    - local: api/pipelines/dance_diffusion
@@ -230,8 +212,6 @@
      title: Latent Diffusion
    - local: api/pipelines/panorama
      title: MultiDiffusion
-    - local: api/pipelines/musicldm
-      title: MusicLDM
    - local: api/pipelines/paint_by_example
      title: PaintByExample
    - local: api/pipelines/paradigms
@@ -279,8 +259,6 @@
        title: LDM3D Text-to-(RGB, Depth)
      - local: api/pipelines/stable_diffusion/adapter
        title: Stable Diffusion T2I-adapter
-      - local: api/pipelines/stable_diffusion/gligen
-        title: GLIGEN (Grounded Language-to-Image Generation)
      title: Stable Diffusion
    - local: api/pipelines/stable_unclip
      title: Stable unCLIP
@@ -309,49 +287,49 @@
    - local: api/schedulers/overview
      title: Overview
    - local: api/schedulers/cm_stochastic_iterative
-      title: CMStochasticIterativeScheduler
-    - local: api/schedulers/ddim_inverse
-      title: DDIMInverseScheduler
+      title: Consistency Model Multistep Scheduler
    - local: api/schedulers/ddim
-      title: DDIMScheduler
+      title: DDIM
+    - local: api/schedulers/ddim_inverse
+      title: DDIMInverse
    - local: api/schedulers/ddpm
-      title: DDPMScheduler
+      title: DDPM
    - local: api/schedulers/deis
-      title: DEISMultistepScheduler
-    - local: api/schedulers/multistep_dpm_solver_inverse
-      title: DPMSolverMultistepInverse
-    - local: api/schedulers/multistep_dpm_solver
-      title: DPMSolverMultistepScheduler
+      title: DEIS
+    - local: api/schedulers/dpm_discrete
+      title: DPM Discrete Scheduler
+    - local: api/schedulers/dpm_discrete_ancestral
+      title: DPM Discrete Scheduler with ancestral sampling
    - local: api/schedulers/dpm_sde
      title: DPMSolverSDEScheduler
-    - local: api/schedulers/singlestep_dpm_solver
-      title: DPMSolverSinglestepScheduler
    - local: api/schedulers/euler_ancestral
-      title: EulerAncestralDiscreteScheduler
+      title: Euler Ancestral Scheduler
    - local: api/schedulers/euler
-      title: EulerDiscreteScheduler
+      title: Euler Scheduler
    - local: api/schedulers/heun
-      title: HeunDiscreteScheduler
+      title: Heun Scheduler
+    - local: api/schedulers/multistep_dpm_solver_inverse
+      title: Inverse Multistep DPM-Solver
    - local: api/schedulers/ipndm
-      title: IPNDMScheduler
-    - local: api/schedulers/stochastic_karras_ve
-      title: KarrasVeScheduler
-    - local: api/schedulers/dpm_discrete_ancestral
-      title: KDPM2AncestralDiscreteScheduler
-    - local: api/schedulers/dpm_discrete
-      title: KDPM2DiscreteScheduler
+      title: IPNDM
    - local: api/schedulers/lms_discrete
-      title: LMSDiscreteScheduler
+      title: Linear Multistep
+    - local: api/schedulers/multistep_dpm_solver
+      title: Multistep DPM-Solver
    - local: api/schedulers/pndm
-      title: PNDMScheduler
+      title: PNDM
    - local: api/schedulers/repaint
-      title: RePaintScheduler
-    - local: api/schedulers/score_sde_ve
-      title: ScoreSdeVeScheduler
-    - local: api/schedulers/score_sde_vp
-      title: ScoreSdeVpScheduler
+      title: RePaint Scheduler
+    - local: api/schedulers/singlestep_dpm_solver
+      title: Singlestep DPM-Solver
+    - local: api/schedulers/stochastic_karras_ve
+      title: Stochastic Karras VE
    - local: api/schedulers/unipc
      title: UniPCMultistepScheduler
+    - local: api/schedulers/score_sde_ve
+      title: VE-SDE
+    - local: api/schedulers/score_sde_vp
+      title: VP-SDE
    - local: api/schedulers/vq_diffusion
      title: VQDiffusionScheduler
    title: Schedulers
--- a/docs/source/en/api/models/autoencoder_tiny.md
+++ b/docs/source/en/api/models/autoencoder_tiny.md
@@ -1,45 +0,0 @@
-# Tiny AutoEncoder
-
-Tiny AutoEncoder for Stable Diffusion (TAESD) was introduced in [madebyollin/taesd](https://github.com/madebyollin/taesd) by Ollin Boer Bohan. It is a tiny distilled version of Stable Diffusion's VAE that can quickly decode the latents in a [`StableDiffusionPipeline`] or [`StableDiffusionXLPipeline`] almost instantly. 
-
-To use with Stable Diffusion v-2.1:
-
-```python
-import torch
-from diffusers import DiffusionPipeline, AutoencoderTiny
-
-pipe = DiffusionPipeline.from_pretrained(
-    "stabilityai/stable-diffusion-2-1-base", torch_dtype=torch.float16
-)
-pipe.vae = AutoencoderTiny.from_pretrained("madebyollin/taesd", torch_dtype=torch.float16)
-pipe = pipe.to("cuda")
-
-prompt = "slice of delicious New York-style berry cheesecake"
-image = pipe(prompt, num_inference_steps=25).images[0]
-image.save("cheesecake.png")
-```
-
-To use with Stable Diffusion XL 1.0
-
-```python
-import torch
-from diffusers import DiffusionPipeline, AutoencoderTiny
-
-pipe = DiffusionPipeline.from_pretrained(
-    "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
-)
-pipe.vae = AutoencoderTiny.from_pretrained("madebyollin/taesdxl", torch_dtype=torch.float16)
-pipe = pipe.to("cuda")
-
-prompt = "slice of delicious New York-style berry cheesecake"
-image = pipe(prompt, num_inference_steps=25).images[0]
-image.save("cheesecake_sdxl.png")
-```
-
-## AutoencoderTiny
-
-[[autodoc]] AutoencoderTiny
-
-## AutoencoderTinyOutput
-
-[[autodoc]] models.autoencoder_tiny.AutoencoderTinyOutput
--- a/docs/source/en/api/models/overview.md
+++ b/docs/source/en/api/models/overview.md
@@ -9,8 +9,4 @@ All models are built from the base [`ModelMixin`] class which is a [`torch.nn.mo

 ## FlaxModelMixin

-[[autodoc]] FlaxModelMixin
-
-## PushToHubMixin
-
-[[autodoc]] utils.PushToHubMixin
+[[autodoc]] FlaxModelMixin
--- a/docs/source/en/api/pipelines/audioldm.md
+++ b/docs/source/en/api/pipelines/audioldm.md
@@ -46,5 +46,6 @@ Make sure to check out the Schedulers [guide](/using-diffusers/schedulers) to le
 	- all
 	- __call__

-## AudioPipelineOutput
-[[autodoc]] pipelines.AudioPipelineOutput
+## StableDiffusionPipelineOutput
+
+[[autodoc]] pipelines.stable_diffusion.StableDiffusionPipelineOutput
--- a/docs/source/en/api/pipelines/audioldm2.md
+++ b/docs/source/en/api/pipelines/audioldm2.md
@@ -1,93 +0,0 @@
-<!--Copyright 2023 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-->
-
-# AudioLDM 2
-
-AudioLDM 2 was proposed in [AudioLDM 2: Learning Holistic Audio Generation with Self-supervised Pretraining](https://arxiv.org/abs/2308.05734) 
-by Haohe Liu et al. AudioLDM 2 takes a text prompt as input and predicts the corresponding audio. It can generate 
-text-conditional sound effects, human speech and music.
-
-Inspired by [Stable Diffusion](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/overview), AudioLDM 2
-is a text-to-audio _latent diffusion model (LDM)_ that learns continuous audio representations from text embeddings. Two 
-text encoder models are used to compute the text embeddings from a prompt input: the text-branch of [CLAP](https://huggingface.co/docs/transformers/main/en/model_doc/clap)
-and the encoder of [Flan-T5](https://huggingface.co/docs/transformers/main/en/model_doc/flan-t5). These text embeddings 
-are then projected to a shared embedding space by an [AudioLDM2ProjectionModel](https://huggingface.co/docs/diffusers/main/api/pipelines/audioldm2#diffusers.AudioLDM2ProjectionModel). 
-A [GPT2](https://huggingface.co/docs/transformers/main/en/model_doc/gpt2) _language model (LM)_ is used to auto-regressively 
-predict eight new embedding vectors, conditional on the projected CLAP and Flan-T5 embeddings. The generated embedding 
-vectors and Flan-T5 text embeddings are used as cross-attention conditioning in the LDM. The [UNet](https://huggingface.co/docs/diffusers/main/en/api/pipelines/audioldm2#diffusers.AudioLDM2UNet2DConditionModel) 
-of AudioLDM 2 is unique in the sense that it takes **two** cross-attention embeddings, as opposed to one cross-attention 
-conditioning, as in most other LDMs.
-
-The abstract of the paper is the following:
-
-*Although audio generation shares commonalities across different types of audio, such as speech, music, and sound effects, designing models for each type requires careful consideration of specific objectives and biases that can significantly differ from those of other types. To bring us closer to a unified perspective of audio generation, this paper proposes a framework that utilizes the same learning method for speech, music, and sound effect generation. Our framework introduces a general representation of audio, called language of audio (LOA). Any audio can be translated into LOA based on AudioMAE, a self-supervised pre-trained representation learning model. In the generation process, we translate any modalities into LOA by using a GPT-2 model, and we perform self-supervised audio generation learning with a latent diffusion model conditioned on LOA. The proposed framework naturally brings advantages such as in-context learning abilities and reusable self-supervised pretrained AudioMAE and latent diffusion models. Experiments on the major benchmarks of text-to-audio, text-to-music, and text-to-speech demonstrate new state-of-the-art or competitive performance to previous approaches.*
-
-This pipeline was contributed by [sanchit-gandhi](https://huggingface.co/sanchit-gandhi). The original codebase can be 
-found at [haoheliu/audioldm2](https://github.com/haoheliu/audioldm2). 
-
-## Tips
-
-### Choosing a checkpoint
-
-AudioLDM2 comes in three variants. Two of these checkpoints are applicable to the general task of text-to-audio 
-generation. The third checkpoint is trained exclusively on text-to-music generation.
-
-All checkpoints share the same model size for the text encoders and VAE. They differ in the size and depth of the UNet. 
-See table below for details on the three checkpoints:
-
-| Checkpoint                                                      | Task          | UNet Model Size | Total Model Size | Training Data / h |
-|-----------------------------------------------------------------|---------------|-----------------|------------------|-------------------|
-| [audioldm2](https://huggingface.co/cvssp/audioldm2)             | Text-to-audio | 350M            | 1.1B             | 1150k             |
-| [audioldm2-large](https://huggingface.co/cvssp/audioldm2-large) | Text-to-audio | 750M            | 1.5B             | 1150k             |
-| [audioldm2-music](https://huggingface.co/cvssp/audioldm2-music) | Text-to-music | 350M            | 1.1B             | 665k              |
-
-### Constructing a prompt
-
-* Descriptive prompt inputs work best: use adjectives to describe the sound (e.g. "high quality" or "clear") and make the prompt context specific (e.g. "water stream in a forest" instead of "stream").
-* It's best to use general terms like "cat" or "dog" instead of specific names or abstract objects the model may not be familiar with.
-* Using a **negative prompt** can significantly improve the quality of the generated waveform, by guiding the generation away from terms that correspond to poor quality audio. Try using a negative prompt of "Low quality." 
-
-### Controlling inference
-
-* The _quality_ of the predicted audio sample can be controlled by the `num_inference_steps` argument; higher steps give higher quality audio at the expense of slower inference.
-* The _length_ of the predicted audio sample can be controlled by varying the `audio_length_in_s` argument.
-
-### Evaluating generated waveforms:
-
-* The quality of the generated waveforms can vary significantly based on the seed. Try generating with different seeds until you find a satisfactory generation
-* Multiple waveforms can be generated in one go: set `num_waveforms_per_prompt` to a value greater than 1. Automatic scoring will be performed between the generated waveforms and prompt text, and the audios ranked from best to worst accordingly.
-
-The following example demonstrates how to construct good music generation using the aforementioned tips: [example](https://huggingface.co/docs/diffusers/main/en/api/pipelines/audioldm2#diffusers.AudioLDM2Pipeline.__call__.example).
-
-<Tip>
-
-Make sure to check out the Schedulers [guide](/using-diffusers/schedulers) to learn how to explore the tradeoff between 
-scheduler speed and quality, and see the [reuse components across pipelines](/using-diffusers/loading#reuse-components-across-pipelines) 
-section to learn how to efficiently load the same components into multiple pipelines.
-
-</Tip>
-
-## AudioLDM2Pipeline
-[[autodoc]] AudioLDM2Pipeline
-	- all
-	- __call__
-
-## AudioLDM2ProjectionModel
-[[autodoc]] AudioLDM2ProjectionModel
-	- forward
-
-## AudioLDM2UNet2DConditionModel
-[[autodoc]] AudioLDM2UNet2DConditionModel
-	- forward
-
-## AudioPipelineOutput
-[[autodoc]] pipelines.AudioPipelineOutput
--- a/docs/source/en/api/pipelines/auto_pipeline.md
+++ b/docs/source/en/api/pipelines/auto_pipeline.md
@@ -12,41 +12,35 @@ specific language governing permissions and limitations under the License.

 # AutoPipeline

-`AutoPipeline` is designed to:
+In many cases, one checkpoint can be used for multiple tasks. For example, you may be able to use the same checkpoint for Text-to-Image, Image-to-Image, and Inpainting. However, you'll need to know the pipeline class names linked to your checkpoint. 

-1. make it easy for you to load a checkpoint for a task without knowing the specific pipeline class to use
-2. use multiple pipelines in your workflow
+AutoPipeline is designed to make it easy for you to use multiple pipelines in your workflow. We currently provide 3 AutoPipeline classes to perform three different tasks, i.e. [`AutoPipelineForText2Image`], [`AutoPipelineForImage2Image`], and [`AutoPipelineForInpainting`]. You'll need to choose the AutoPipeline class based on the task you want to perform and use it to automatically retrieve the relevant pipeline given the name/path to the pre-trained weights. 

-Based on the task, the `AutoPipeline` class automatically retrieves the relevant pipeline given the name or path to the pretrained weights with the `from_pretrained()` method.
+For example, to perform Image-to-Image with the SD1.5 checkpoint, you can do

-To seamlessly switch between tasks with the same checkpoint without reallocating additional memory, use the `from_pipe()` method to transfer the components from the original pipeline to the new one.
+```python
+from diffusers import AutoPipelineForImage2Image

-```py
-from diffusers import AutoPipelineForText2Image
-import torch
-
-pipeline = AutoPipelineForText2Image.from_pretrained(
-    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16, use_safetensors=True
-).to("cuda")
-prompt = "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k"
-
-image = pipeline(prompt, num_inference_steps=25).images[0]
+pipe_i2i = AutoPipelineForImage2Image.from_pretrained("runwayml/stable-diffusion-v1-5")
 ```

-<Tip>
+It will also help you switch between tasks seamlessly using the same checkpoint without reallocating additional memory. For example, to re-use the Image-to-Image pipeline we just created for inpainting, you can do 

-Check out the [AutoPipeline](/tutorials/autopipeline) tutorial to learn how to use this API!
+```python
+from diffusers import AutoPipelineForInpainting

-</Tip>
+pipe_inpaint = AutoPipelineForInpainting.from_pipe(pipe_i2i)
+```
+All the components will be transferred to the inpainting pipeline with zero cost.

-`AutoPipeline` supports text-to-image, image-to-image, and inpainting for the following diffusion models:

- [Stable Diffusion](./stable_diffusion)
- [ControlNet](./api/pipelines/controlnet)
- [Stable Diffusion XL (SDXL)](./stable_diffusion/stable_diffusion_xl)
- [DeepFloyd IF](./if) 
- [Kandinsky](./kandinsky)
- [Kandinsky 2.2](./kandinsky#kandinsky-22)
+Currently, AutoPipeline supports the Text-to-Image, Image-to-Image, and Inpainting tasks for the following diffusion models:
+- [Stable Diffusion](./stable_diffusion)
+- [Stable Diffusion Controlnet](./api/pipelines/controlnet)
+- [Stable Diffusion XL](./stable_diffusion/stable_diffusion_xl)
+- [IF](./if) 
+- [Kandinsky](./kandinsky)
+- [Kandinsky 2.2](./kandinsky#kandinsky-22)


 ## AutoPipelineForText2Image
--- a/docs/source/en/api/pipelines/controlnet_sdxl.md
+++ b/docs/source/en/api/pipelines/controlnet_sdxl.md
@@ -1,162 +0,0 @@
-<!--Copyright 2023 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-->
-
-# ControlNet with Stable Diffusion XL
-
-[Adding Conditional Control to Text-to-Image Diffusion Models](https://huggingface.co/papers/2302.05543) by Lvmin Zhang and Maneesh Agrawala.
-
-Using a pretrained model, we can provide control images (for example, a depth map) to control Stable Diffusion text-to-image generation so that it follows the structure of the depth image and fills in the details.
-
-The abstract from the paper is:
-
-*We present a neural network structure, ControlNet, to control pretrained large diffusion models to support additional input conditions. The ControlNet learns task-specific conditions in an end-to-end way, and the learning is robust even when the training dataset is small (< 50k). Moreover, training a ControlNet is as fast as fine-tuning a diffusion model, and the model can be trained on a personal devices. Alternatively, if powerful computation clusters are available, the model can scale to large amounts (millions to billions) of data. We report that large diffusion models like Stable Diffusion can be augmented with ControlNets to enable conditional inputs like edge maps, segmentation maps, keypoints, etc. This may enrich the methods to control large diffusion models and further facilitate related applications.*
-
-We provide support using ControlNets with [Stable Diffusion XL](./stable_diffusion/stable_diffusion_xl.md) (SDXL). 
-
-You can find numerous SDXL ControlNet checkpoints from [this link](https://huggingface.co/models?other=stable-diffusion-xl&other=controlnet). There are some smaller ControlNet checkpoints too:
-
-* [controlnet-canny-sdxl-1.0-small](https://huggingface.co/diffusers/controlnet-canny-sdxl-1.0-small)
-* [controlnet-canny-sdxl-1.0-mid](https://huggingface.co/diffusers/controlnet-canny-sdxl-1.0-mid)
-* [controlnet-depth-sdxl-1.0-small](https://huggingface.co/diffusers/controlnet-depth-sdxl-1.0-small)
-* [controlnet-depth-sdxl-1.0-mid](https://huggingface.co/diffusers/controlnet-depth-sdxl-1.0-mid)
-
-We also encourage you to train custom ControlNets; we provide a [training script](https://github.com/huggingface/diffusers/blob/main/examples/controlnet/README_sdxl.md) for this.
-
-You can find some results below:
-
-<img src="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/sd_xl/sdxl_controlnet_canny_grid.png" width=600/>
-
-🚨 At the time of this writing, many of these SDXL ControlNet checkpoints are experimental and there is a lot of room for improvement. We encourage our users to provide feedback. 🚨
-
-## MultiControlNet
-
-You can compose multiple ControlNet conditionings from different image inputs to create a *MultiControlNet*. To get better results, it is often helpful to:
-
-1. mask conditionings such that they don't overlap (for example, mask the area of a canny image where the pose conditioning is located)
-2. experiment with the [`controlnet_conditioning_scale`](https://huggingface.co/docs/diffusers/main/en/api/pipelines/controlnet#diffusers.StableDiffusionControlNetPipeline.__call__.controlnet_conditioning_scale) parameter to determine how much weight to assign to each conditioning input
-
-In this example, you'll combine a canny image and a human pose estimation image to generate a new image.
-
-Prepare the canny image conditioning:
-
-```py
-from diffusers.utils import load_image
-from PIL import Image
-import numpy as np 
-import cv2
-
-canny_image = load_image(
-    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/landscape.png"
-)
-canny_image = np.array(canny_image)
-
-low_threshold = 100
-high_threshold = 200
-
-canny_image = cv2.Canny(canny_image, low_threshold, high_threshold)
-
-# zero out middle columns of image where pose will be overlayed
-zero_start = canny_image.shape[1] // 4
-zero_end = zero_start + canny_image.shape[1] // 2
-canny_image[:, zero_start:zero_end] = 0
-
-canny_image = canny_image[:, :, None]
-canny_image = np.concatenate([canny_image, canny_image, canny_image], axis=2)
-canny_image = Image.fromarray(canny_image).resize((1024, 1024))
-```
-
-<div class="flex gap-4">
-  <div>
-    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/landscape.png"/>
-    <figcaption class="mt-2 text-center text-sm text-gray-500">original image</figcaption>
-  </div>
-  <div>
-    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/blog/controlnet/landscape_canny_masked.png"/>
-    <figcaption class="mt-2 text-center text-sm text-gray-500">canny image</figcaption>
-  </div>
-</div>
-
-Prepare the human pose estimation conditioning:
-
-```py
-from controlnet_aux import OpenposeDetector
-from diffusers.utils import load_image
-
-openpose = OpenposeDetector.from_pretrained("lllyasviel/ControlNet")
-
-openpose_image = load_image(
-    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/person.png"
-)
-openpose_image = openpose(openpose_image).resize((1024, 1024))
-```
-
-<div class="flex gap-4">
-  <div>
-    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/person.png"/>
-    <figcaption class="mt-2 text-center text-sm text-gray-500">original image</figcaption>
-  </div>
-  <div>
-    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/blog/controlnet/person_pose.png"/>
-    <figcaption class="mt-2 text-center text-sm text-gray-500">human pose image</figcaption>
-  </div>
-</div>
-
-Load a list of ControlNet models that correspond to each conditioning, and pass them to the [`StableDiffusionXLControlNetPipeline`]. Use the faster [`UniPCMultistepScheduler`] and nable model offloading to reduce memory usage.
-
-```py
-from diffusers import StableDiffusionXLControlNetPipeline, ControlNetModel, AutoencoderKL, UniPCMultistepScheduler
-import torch
-
-controlnets = [
-    ControlNetModel.from_pretrained(
-        "thibaud/controlnet-openpose-sdxl-1.0", torch_dtype=torch.float16, use_safetensors=True
-    ),
-    ControlNetModel.from_pretrained("diffusers/controlnet-canny-sdxl-1.0", torch_dtype=torch.float16, use_safetensors=True),
-]
-
-vae = AutoencoderKL.from_pretrained("madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16, use_safetensors=True)
-pipe = StableDiffusionXLControlNetPipeline.from_pretrained(
-    "stabilityai/stable-diffusion-xl-base-1.0", controlnet=controlnets, vae=vae, torch_dtype=torch.float16, use_safetensors=True
-)
-pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
-pipe.enable_model_cpu_offload()
-```
-
-Now you can pass your prompt (an optional negative prompt if you're using one), canny image, and pose image to the pipeline:
-
-```py
-prompt = "a giant standing in a fantasy landscape, best quality"
-negative_prompt = "monochrome, lowres, bad anatomy, worst quality, low quality"
-
-generator = torch.manual_seed(1)
-
-images = [openpose_image, canny_image]
-
-images = pipe(
-    prompt,
-    image=images,
-    num_inference_steps=25,
-    generator=generator,
-    negative_prompt=negative_prompt,
-    num_images_per_prompt=3,
-    controlnet_conditioning_scale=[1.0, 0.8],
-).images[0]
-```
-
-<div class="flex justify-center">
-	<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/multicontrolnet.png"/>
-</div>
-
-## StableDiffusionXLControlNetPipeline
-[[autodoc]] StableDiffusionXLControlNetPipeline
-	- all
-	- __call__
--- a/docs/source/en/api/pipelines/kandinsky.md
+++ b/docs/source/en/api/pipelines/kandinsky.md
@@ -105,30 +105,6 @@ One cheeseburger monster coming up! Enjoy!

 ![img](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/kandinsky-docs/cheeseburger.png)

-<Tip>
-
-We also provide an end-to-end Kandinsky pipeline [`KandinskyCombinedPipeline`], which combines both the prior pipeline and text-to-image pipeline, and lets you perform inference in a single step. You can create the combined pipeline with the [`~AutoPipelineForText2Image.from_pretrained`] method:
-
-```python
-from diffusers import AutoPipelineForText2Image
-import torch
-
-pipe = AutoPipelineForText2Image.from_pretrained(
-    "kandinsky-community/kandinsky-2-1", torch_dtype=torch.float16
-)
-pipe.enable_model_cpu_offload()
-```
-
-Under the hood, it will automatically load both [`KandinskyPriorPipeline`] and [`KandinskyPipeline`]. To generate images, you no longer need to call both pipelines and pass the outputs from one to another. You only need to call the combined pipeline once. You can set different `guidance_scale` and `num_inference_steps` for the prior pipeline with the `prior_guidance_scale` and `prior_num_inference_steps` arguments.
-
-```python
-prompt = "A alien cheeseburger creature eating itself, claymation, cinematic, moody lighting"
-negative_prompt = "low quality, bad quality"
-
-image = pipe(prompt=prompt, negative_prompt=negative_prompt, prior_guidance_scale=1.0, guidance_scale=4.0, height=768, width=768).images[0]
-```
-</Tip>
-
 The Kandinsky model works extremely well with creative prompts. Here is some of the amazing art that can be created using the exact same process but with different prompts.

 ```python
@@ -211,34 +187,6 @@ out.images[0].save("fantasy_land.png")
 ![img](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/kandinsky-docs/img2img_fantasyland.png)


-<Tip>
-
-You can also use the [`KandinskyImg2ImgCombinedPipeline`] for end-to-end image-to-image generation with Kandinsky 2.1
-
-```python
-from diffusers import AutoPipelineForImage2Image
-import torch
-import requests
-from io import BytesIO
-from PIL import Image
-import os
-
-pipe = AutoPipelineForImage2Image.from_pretrained("kandinsky-community/kandinsky-2-1", torch_dtype=torch.float16)
-pipe.enable_model_cpu_offload()
-
-prompt = "A fantasy landscape, Cinematic lighting"
-negative_prompt = "low quality, bad quality"
-
-url = "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/assets/stable-samples/img2img/sketch-mountains-input.jpg"
- 
-response = requests.get(url)
-original_image = Image.open(BytesIO(response.content)).convert("RGB")
-original_image.thumbnail((768, 768))
-
-image = pipe(prompt=prompt, image=original_image, strength=0.3).images[0]
-```
-</Tip>
-
 ### Text Guided Inpainting Generation

 You can use [`KandinskyInpaintPipeline`] to edit images. In this example, we will add a hat to the portrait of a cat.
@@ -283,33 +231,6 @@ image.save("cat_with_hat.png")
 ```
 ![img](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/kandinsky-docs/inpaint_cat_hat.png)

-<Tip>
-
-To use the [`KandinskyInpaintCombinedPipeline`] to perform end-to-end image inpainting generation, you can run the code below instead:
-
-```python
-from diffusers import AutoPipelineForInpainting
-
-pipe = AutoPipelineForInpainting.from_pretrained("kandinsky-community/kandinsky-2-1-inpaint", torch_dtype=torch.float16)
-pipe.enable_model_cpu_offload()
-image = pipe(prompt=prompt, image=original_image, mask_image=mask).images[0]
-```
-</Tip>
-
-🚨🚨🚨 __Breaking change for Kandinsky Mask Inpainting__ 🚨🚨🚨
-
-We introduced a breaking change for the Kandinsky inpainting pipeline in the following pull request: https://github.com/huggingface/diffusers/pull/4207. Previously we accepted a mask format where black pixels represent the masked-out area. This is inconsistent with all other pipelines in diffusers. We have changed the mask format in Kandinsky and now use white pixels instead.
-Please upgrade your inpainting code to follow the above. If you are using Kandinsky Inpaint in production, you now need to change the mask to:
-
-```python
-# For PIL input
-import PIL.ImageOps
-mask = PIL.ImageOps.invert(mask)
-
-# For PyTorch and Numpy input
-mask = 1 - mask
-```
-
 ### Interpolate 

 The [`KandinskyPriorPipeline`] also comes with a cool utility function that will allow you to interpolate the latent space of different images and texts super easily. Here is an example of how you can create an Impressionist-style portrait for your pet based on "The Starry Night". 
--- a/docs/source/en/api/pipelines/kandinsky_v22.md
+++ b/docs/source/en/api/pipelines/kandinsky_v22.md
@@ -11,22 +11,7 @@ specific language governing permissions and limitations under the License.

 The Kandinsky 2.2 release includes robust new text-to-image models that support text-to-image generation, image-to-image generation, image interpolation, and text-guided image inpainting. The general workflow to perform these tasks using Kandinsky 2.2 is the same as in Kandinsky 2.1. First, you will need to use a prior pipeline to generate image embeddings based on your text prompt, and then use one of the image decoding pipelines to generate the output image. The only difference is that in Kandinsky 2.2, all of the decoding pipelines no longer accept the `prompt` input, and the image generation process is conditioned with only `image_embeds` and `negative_image_embeds`.

-Same as with Kandinsky 2.1, the easiest way to perform text-to-image generation is to use the combined Kandinsky pipeline. This process is exactly the same as Kandinsky 2.1. All you need to do is to replace the Kandinsky 2.1 checkpoint with 2.2.
-
-```python
-from diffusers import AutoPipelineForText2Image
-import torch
-
-pipe = AutoPipelineForText2Image.from_pretrained("kandinsky-community/kandinsky-2-2-decoder", torch_dtype=torch.float16)
-pipe.enable_model_cpu_offload()
-
-prompt = "A alien cheeseburger creature eating itself, claymation, cinematic, moody lighting"
-negative_prompt = "low quality, bad quality"
-
-image = pipe(prompt=prompt, negative_prompt=negative_prompt, prior_guidance_scale =1.0, height=768, width=768).images[0]
-```
-
-Now, let's look at an example where we take separate steps to run the prior pipeline and text-to-image pipeline. This way, we can understand what's happening under the hood and how Kandinsky 2.2 differs from Kandinsky 2.1.
+Let's look at an example of how to perform text-to-image generation using Kandinsky 2.2.

 First, let's create the prior pipeline and text-to-image pipeline with Kandinsky 2.2 checkpoints.

--- a/docs/source/en/api/pipelines/musicldm.md
+++ b/docs/source/en/api/pipelines/musicldm.md
@@ -1,57 +0,0 @@
-<!--Copyright 2023 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-->
-
-# MusicLDM
-
-MusicLDM was proposed in [MusicLDM: Enhancing Novelty in Text-to-Music Generation Using Beat-Synchronous Mixup Strategies](https://huggingface.co/papers/2308.01546) by Ke Chen, Yusong Wu, Haohe Liu, Marianna Nezhurina, Taylor Berg-Kirkpatrick, Shlomo Dubnov.
-MusicLDM takes a text prompt as input and predicts the corresponding music sample. 
-
-Inspired by [Stable Diffusion](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/overview) and [AudioLDM](https://huggingface.co/docs/diffusers/api/pipelines/audioldm/overview),
-MusicLDM is a text-to-music _latent diffusion model (LDM)_ that learns continuous audio representations from [CLAP](https://huggingface.co/docs/transformers/main/model_doc/clap)
-latents.
-
-MusicLDM is trained on a corpus of 466 hours of music data. Beat-synchronous data augmentation strategies are applied to 
-the music samples, both in the time domain and in the latent space. Using beat-synchronous data augmentation strategies 
-encourages the model to interpolate between the training samples, but stay within the domain of the training data. The 
-result is generated music that is more diverse while staying faithful to the corresponding style.
-
-The abstract of the paper is the following:
-
-*In this paper, we present MusicLDM, a state-of-the-art text-to-music model that adapts Stable Diffusion and AudioLDM architectures to the music domain. We achieve this by retraining the contrastive language-audio pretraining model (CLAP) and the Hifi-GAN vocoder, as components of MusicLDM, on a collection of music data samples. Then, we leverage a beat tracking model and propose two different mixup strategies for data augmentation: beat-synchronous audio mixup and beat-synchronous latent mixup, to encourage the model to generate music more diverse while still staying faithful to the corresponding style.*
-
-This pipeline was contributed by [sanchit-gandhi](https://huggingface.co/sanchit-gandhi).
-
-## Tips
-
-When constructing a prompt, keep in mind:
-
-* Descriptive prompt inputs work best; use adjectives to describe the sound (for example, "high quality" or "clear") and make the prompt context specific where possible (e.g. "melodic techno with a fast beat and synths" works better than "techno").
-* Using a *negative prompt* can significantly improve the quality of the generated audio. Try using a negative prompt of "low quality, average quality".
-
-During inference:
-
-* The _quality_ of the generated audio sample can be controlled by the `num_inference_steps` argument; higher steps give higher quality audio at the expense of slower inference.
-* Multiple waveforms can be generated in one go: set `num_waveforms_per_prompt` to a value greater than 1 to enable. Automatic scoring will be performed between the generated waveforms and prompt text, and the audios ranked from best to worst accordingly.
-* The _length_ of the generated audio sample can be controlled by varying the `audio_length_in_s` argument.
-
-<Tip>
-
-Make sure to check out the Schedulers [guide](/using-diffusers/schedulers) to learn how to explore the tradeoff between 
-scheduler speed and quality, and see the [reuse components across pipelines](/using-diffusers/loading#reuse-components-across-pipelines) 
-section to learn how to efficiently load the same components into multiple pipelines.
-
-</Tip>
-
-## MusicLDMPipeline
-[[autodoc]] MusicLDMPipeline
-	- all
-	- __call__
--- a/docs/source/en/api/pipelines/overview.md
+++ b/docs/source/en/api/pipelines/overview.md
@@ -34,7 +34,3 @@ Pipelines do not offer any training functionality. You'll notice PyTorch's autog
 ## FlaxDiffusionPipeline

 [[autodoc]] pipelines.pipeline_flax_utils.FlaxDiffusionPipeline
-
-## PushToHubMixin
-
-[[autodoc]] utils.PushToHubMixin
--- a/docs/source/en/api/pipelines/stable_diffusion/adapter.md
+++ b/docs/source/en/api/pipelines/stable_diffusion/adapter.md
@@ -29,11 +29,10 @@ This model was contributed by the community contributor [HimariO](https://github
 | Pipeline | Tasks | Demo
 |---|---|:---:|
 | [StableDiffusionAdapterPipeline](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_adapter.py) | *Text-to-Image Generation with T2I-Adapter Conditioning* | -
-| [StableDiffusionXLAdapterPipeline](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_xl_adapter.py) | *Text-to-Image Generation with T2I-Adapter Conditioning on StableDiffusion-XL* | -

-## Usage example with the base model of StableDiffusion-1.4/1.5
+## Usage example

-In the following we give a simple example of how to use a *T2IAdapter* checkpoint with Diffusers for inference based on StableDiffusion-1.4/1.5.
+In the following we give a simple example of how to use a *T2IAdapter* checkpoint with Diffusers for inference.
 All adapters use the same pipeline.

 1. Images are first converted into the appropriate *control image* format.
@@ -70,7 +69,7 @@ Next, create the adapter pipeline
 import torch
 from diffusers import StableDiffusionAdapterPipeline, T2IAdapter

-adapter = T2IAdapter.from_pretrained("TencentARC/t2iadapter_color_sd14v1", torch_dtype=torch.float16)
+adapter = T2IAdapter.from_pretrained("TencentARC/t2iadapter_color_sd14v1")
 pipe = StableDiffusionAdapterPipeline.from_pretrained(
    "CompVis/stable-diffusion-v1-4",
    adapter=adapter,
@@ -94,62 +93,6 @@ out_image = pipe(

 ![img](https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/color_output.png)

-## Usage example with the base model of StableDiffusion-XL
-
-In the following we give a simple example of how to use a *T2IAdapter* checkpoint with Diffusers for inference based on StableDiffusion-XL.
-All adapters use the same pipeline.
-
- 1. Images are first downloaded into the appropriate *control image* format.
- 2. The *control image* and *prompt* are passed to the [`StableDiffusionXLAdapterPipeline`].
-
-Let's have a look at a simple example using the [Sketch Adapter](https://huggingface.co/Adapter/t2iadapter/tree/main/sketch_sdxl_1.0).
-
-```python
-from diffusers.utils import load_image
-
-sketch_image = load_image("https://huggingface.co/Adapter/t2iadapter/resolve/main/sketch.png").convert("L")
-```
-
-![img](https://huggingface.co/Adapter/t2iadapter/resolve/main/sketch.png)
-
-Then, create the adapter pipeline
-
-```py
-import torch
-from diffusers import (
-    T2IAdapter,
-    StableDiffusionXLAdapterPipeline,
-    DDPMScheduler
-)
-from diffusers.models.unet_2d_condition import UNet2DConditionModel
-
-model_id = "stabilityai/stable-diffusion-xl-base-1.0"
-adapter = T2IAdapter.from_pretrained("Adapter/t2iadapter", subfolder="sketch_sdxl_1.0",torch_dtype=torch.float16, adapter_type="full_adapter_xl")
-scheduler = DDPMScheduler.from_pretrained(model_id, subfolder="scheduler")
-
-pipe = StableDiffusionXLAdapterPipeline.from_pretrained(
-    model_id, adapter=adapter, safety_checker=None, torch_dtype=torch.float16, variant="fp16", scheduler=scheduler
-)
-
-pipe.to("cuda")
-```
-
-Finally, pass the prompt and control image to the pipeline
-
-```py
-# fix the random seed, so you will get the same result as the example
-generator = torch.Generator().manual_seed(42)
-
-sketch_image_out = pipe(
-    prompt="a photo of a dog in real world, high quality", 
-    negative_prompt="extra digit, fewer digits, cropped, worst quality, low quality", 
-    image=sketch_image, 
-    generator=generator, 
-    guidance_scale=7.5
-).images[0]
-```
-
-![img](https://huggingface.co/Adapter/t2iadapter/resolve/main/sketch_output.png)

 ## Available checkpoints

@@ -170,9 +113,6 @@ Non-diffusers checkpoints can be found under [TencentARC/T2I-Adapter](https://hu
 |[TencentARC/t2iadapter_depth_sd15v2](https://huggingface.co/TencentARC/t2iadapter_depth_sd15v2)||
 |[TencentARC/t2iadapter_sketch_sd15v2](https://huggingface.co/TencentARC/t2iadapter_sketch_sd15v2)||
 |[TencentARC/t2iadapter_zoedepth_sd15v1](https://huggingface.co/TencentARC/t2iadapter_zoedepth_sd15v1)||
-|[Adapter/t2iadapter, subfolder='sketch_sdxl_1.0'](https://huggingface.co/Adapter/t2iadapter/tree/main/sketch_sdxl_1.0)||
-|[Adapter/t2iadapter, subfolder='canny_sdxl_1.0'](https://huggingface.co/Adapter/t2iadapter/tree/main/canny_sdxl_1.0)||
-|[Adapter/t2iadapter, subfolder='openpose_sdxl_1.0'](https://huggingface.co/Adapter/t2iadapter/tree/main/openpose_sdxl_1.0)||

 ## Combining multiple adapters

@@ -245,14 +185,3 @@ However, T2I-Adapter performs slightly worse than ControlNet.
 	- disable_vae_slicing
 	- enable_xformers_memory_efficient_attention
 	- disable_xformers_memory_efficient_attention
-
-## StableDiffusionXLAdapterPipeline
-[[autodoc]] StableDiffusionXLAdapterPipeline
-	- all
-	- __call__
-	- enable_attention_slicing
-	- disable_attention_slicing
-	- enable_vae_slicing
-	- disable_vae_slicing
-	- enable_xformers_memory_efficient_attention
-	- disable_xformers_memory_efficient_attention
--- a/docs/source/en/api/pipelines/stable_diffusion/gligen.md
+++ b/docs/source/en/api/pipelines/stable_diffusion/gligen.md
@@ -1,46 +0,0 @@
-<!--Copyright 2023 The GLIGEN Authors and The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-->
-
-# GLIGEN (Grounded Language-to-Image Generation)
-
-The GLIGEN model was created by researchers and engineers from [University of Wisconsin-Madison, Columbia University, and Microsoft](https://github.com/gligen/GLIGEN). The [`StableDiffusionGLIGENPipeline`] can generate photorealistic images conditioned on grounding inputs. Along with text and bounding boxes, if input images are given, this pipeline can insert objects described by text at the region defined by bounding boxes. Otherwise, it'll generate an image described by the caption/prompt and insert objects described by text at the region defined by bounding boxes. It's trained on COCO2014D and COCO2014CD datasets, and the model uses a frozen CLIP ViT-L/14 text encoder to condition itself on grounding inputs.
-
-The abstract from the [paper](https://huggingface.co/papers/2301.07093) is:
-
-*Large-scale text-to-image diffusion models have made amazing advances. However, the status quo is to use text input alone, which can impede controllability. In this work, we propose GLIGEN, Grounded-Language-to-Image Generation, a novel approach that builds upon and extends the functionality of existing pre-trained text-to-image diffusion models by enabling them to also be conditioned on grounding inputs. To preserve the vast concept knowledge of the pre-trained model, we freeze all of its weights and inject the grounding information into new trainable layers via a gated mechanism. Our model achieves open-world grounded text2img generation with caption and bounding box condition inputs, and the grounding ability generalizes well to novel spatial configurations and concepts. GLIGEN’s zeroshot performance on COCO and LVIS outperforms existing supervised layout-to-image baselines by a large margin.*
-
-<Tip>
-
-Make sure to check out the Stable Diffusion [Tips](https://huggingface.co/docs/diffusers/en/api/pipelines/stable_diffusion/overview#tips) section to learn how to explore the tradeoff between scheduler speed and quality and how to reuse pipeline components efficiently!
-
-If you want to use one of the official checkpoints for a task, explore the [gligen](https://huggingface.co/gligen) Hub organizations!
-
-</Tip>
-
-This pipeline was contributed by [Nikhil Gajendrakumar](https://github.com/nikhil-masterful).
-
-## StableDiffusionGLIGENPipeline
-
-[[autodoc]] StableDiffusionGLIGENPipeline
-	- all
-	- __call__
-	- enable_vae_slicing
-	- disable_vae_slicing
-	- enable_vae_tiling
-	- disable_vae_tiling
-	- enable_model_cpu_offload
-	- prepare_latents
-	- enable_fuser
-
-## StableDiffusionPipelineOutput
-
-[[autodoc]] pipelines.stable_diffusion.StableDiffusionPipelineOutput
--- a/docs/source/en/api/pipelines/stable_diffusion/ldm3d_diffusion.md
+++ b/docs/source/en/api/pipelines/stable_diffusion/ldm3d_diffusion.md
@@ -30,8 +30,8 @@ Make sure to check out the Stable Diffusion [Tips](overview#tips) section to lea
 	- all
 	- __call__

-## LDM3DPipelineOutput
+## StableDiffusionPipelineOutput

-[[autodoc]] pipelines.stable_diffusion.pipeline_stable_diffusion_ldm3d.LDM3DPipelineOutput
+[[autodoc]] pipelines.stable_diffusion.StableDiffusionPipelineOutput
 	- all
 	- __call__
--- a/docs/source/en/api/pipelines/stable_diffusion/stable_diffusion_xl.md
+++ b/docs/source/en/api/pipelines/stable_diffusion/stable_diffusion_xl.md
@@ -10,29 +10,366 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
 specific language governing permissions and limitations under the License.
 -->

-# Stable Diffusion XL
+# Stable Diffusion XL

-Stable Diffusion XL (SDXL) was proposed in [SDXL: Improving Latent Diffusion Models for High-Resolution Image Synthesis](https://huggingface.co/papers/2307.01952) by Dustin Podell, Zion English, Kyle Lacey, Andreas Blattmann, Tim Dockhorn, Jonas Müller, Joe Penna, and Robin Rombach.
+Stable Diffusion XL was proposed in [SDXL: Improving Latent Diffusion Models for High-Resolution Image Synthesis](https://arxiv.org/abs/2307.01952) by Dustin Podell, Zion English, Kyle Lacey, Andreas Blattmann, Tim Dockhorn, Jonas Müller, Joe Penna, and Robin Rombach.

-The abstract from the paper is:
+The abstract of the paper is the following:

 *We present SDXL, a latent diffusion model for text-to-image synthesis. Compared to previous versions of Stable Diffusion, SDXL leverages a three times larger UNet backbone: The increase of model parameters is mainly due to more attention blocks and a larger cross-attention context as SDXL uses a second text encoder. We design multiple novel conditioning schemes and train SDXL on multiple aspect ratios. We also introduce a refinement model which is used to improve the visual fidelity of samples generated by SDXL using a post-hoc image-to-image technique. We demonstrate that SDXL shows drastically improved performance compared the previous versions of Stable Diffusion and achieves results competitive with those of black-box state-of-the-art image generators.*

 ## Tips

- SDXL works especially well with images between 768 and 1024.
- SDXL can pass a different prompt for each of the text encoders it was trained on. We can even pass different parts of the same prompt to the text encoders.
- SDXL output images can be improved by making use of a refiner model in an image-to-image setting.
- SDXL offers `negative_original_size`, `negative_crops_coords_top_left`, and `negative_target_size` to negatively condition the model on image resolution and cropping parameters.
+- Stable Diffusion XL works especially well with images between 768 and 1024.
+- Stable Diffusion XL can pass a different prompt for each of the text encoders it was trained on as shown below. We can even pass different parts of the same prompt to the text encoders.
+- Stable Diffusion XL output image can be improved by making use of a refiner as shown below.
+
+### Available checkpoints:
+
+- *Text-to-Image (1024x1024 resolution)*: [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0) with [`StableDiffusionXLPipeline`]
+- *Image-to-Image / Refiner (1024x1024 resolution)*: [stabilityai/stable-diffusion-xl-refiner-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-refiner-1.0) with [`StableDiffusionXLImg2ImgPipeline`]
+
+## Usage Example
+
+Before using SDXL make sure to have `transformers`, `accelerate`, `safetensors` and `invisible_watermark` installed. 
+You can install the libraries as follows:
+
+```
+pip install transformers
+pip install accelerate
+pip install safetensors
+pip install "invisible-watermark>=0.2.0"
+```
+
+### Text-to-Image
+
+You can use SDXL as follows for *text-to-image*:
+
+```py
+from diffusers import StableDiffusionXLPipeline
+import torch
+
+pipe = StableDiffusionXLPipeline.from_pretrained(
+    "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16, variant="fp16", use_safetensors=True
+)
+pipe.to("cuda")
+
+prompt = "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k"
+image = pipe(prompt=prompt).images[0]
+```
+
+### Image-to-image 
+
+You can use SDXL as follows for *image-to-image*:
+
+```py 
+import torch
+from diffusers import StableDiffusionXLImg2ImgPipeline
+from diffusers.utils import load_image
+
+pipe = StableDiffusionXLImg2ImgPipeline.from_pretrained(
+    "stabilityai/stable-diffusion-xl-refiner-1.0", torch_dtype=torch.float16, variant="fp16", use_safetensors=True
+)
+pipe = pipe.to("cuda")
+url = "https://huggingface.co/datasets/patrickvonplaten/images/resolve/main/aa_xl/000000009.png"
+
+init_image = load_image(url).convert("RGB")
+prompt = "a photo of an astronaut riding a horse on mars"
+image = pipe(prompt, image=init_image).images[0]
+```
+
+### Inpainting
+
+You can use SDXL as follows for *inpainting*
+
+```py 
+import torch
+from diffusers import StableDiffusionXLInpaintPipeline
+from diffusers.utils import load_image
+
+pipe = StableDiffusionXLInpaintPipeline.from_pretrained(
+    "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16, variant="fp16", use_safetensors=True
+)
+pipe.to("cuda")
+
+img_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo.png"
+mask_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo_mask.png"
+
+init_image = load_image(img_url).convert("RGB")
+mask_image = load_image(mask_url).convert("RGB")
+
+prompt = "A majestic tiger sitting on a bench"
+image = pipe(prompt=prompt, image=init_image, mask_image=mask_image, num_inference_steps=50, strength=0.80).images[0]
+```
+
+### Refining the image output
+
+In addition to the [base model checkpoint](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0), 
+StableDiffusion-XL also includes a [refiner checkpoint](https://huggingface.co/stabilityai/stable-diffusion-xl-refiner-1.0)
+that is specialized in denoising low-noise stage images to generate images of improved high-frequency quality.
+This refiner checkpoint can be used as a "second-step" pipeline after having run the base checkpoint to improve
+image quality.
+
+When using the refiner, one can easily 
+- 1.) employ the base model and refiner as an *Ensemble of Expert Denoisers* as first proposed in [eDiff-I](https://research.nvidia.com/labs/dir/eDiff-I/) or
+- 2.) simply run the refiner in [SDEdit](https://arxiv.org/abs/2108.01073) fashion after the base model.
+
+**Note**: The idea of using SD-XL base & refiner as an ensemble of experts was first brought forward by 
+a couple of community contributors who also helped shape the following `diffusers` implementation, namely:
+- [SytanSD](https://github.com/SytanSD)
+- [bghira](https://github.com/bghira)
+- [Birch-san](https://github.com/Birch-san)
+- [AmericanPresidentJimmyCarter](https://github.com/AmericanPresidentJimmyCarter)
+
+#### 1.) Ensemble of Expert Denoisers
+
+When using the base and refiner model as an ensemble of expert denoisers, the base model should serve as the
+expert for the high-noise diffusion stage and the refiner serves as the expert for the low-noise diffusion stage.
+
+The advantage of 1.) over 2.) is that it requires fewer overall denoising steps and therefore should be significantly
+faster. The drawback is that one cannot really inspect the output of the base model; it will still be heavily noised.
+
+To use the base model and refiner as an ensemble of expert denoisers, make sure to define the span
+of timesteps which should be run through the high-noise denoising stage (*i.e.* the base model) and the low-noise
+denoising stage (*i.e.* the refiner model) respectively. We can set the intervals using the [`denoising_end`](https://huggingface.co/docs/diffusers/main/en/api/pipelines/stable_diffusion/stable_diffusion_xl#diffusers.StableDiffusionXLPipeline.__call__.denoising_end) of the base model 
+and [`denoising_start`](https://huggingface.co/docs/diffusers/main/en/api/pipelines/stable_diffusion/stable_diffusion_xl#diffusers.StableDiffusionXLImg2ImgPipeline.__call__.denoising_start) of the refiner model.
+
+For both `denoising_end` and `denoising_start` a float value between 0 and 1 should be passed.
+When passed, the end and start of denoising will be defined by proportions of discrete timesteps as
+defined by the model schedule.
+Note that this will override `strength` if it is also declared, since the number of denoising steps
+is determined by the discrete timesteps the model was trained on and the declared fractional cutoff.
+
+Let's look at an example.
+First, we import the two pipelines. Since the text encoders and variational autoencoder are the same
+you don't have to load those again for the refiner.
+
+```py
+from diffusers import DiffusionPipeline
+import torch
+
+base = DiffusionPipeline.from_pretrained(
+    "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16, variant="fp16", use_safetensors=True
+)
+base.to("cuda")
+
+refiner = DiffusionPipeline.from_pretrained(
+    "stabilityai/stable-diffusion-xl-refiner-1.0",
+    text_encoder_2=base.text_encoder_2,
+    vae=base.vae,
+    torch_dtype=torch.float16,
+    use_safetensors=True,
+    variant="fp16",
+)
+refiner.to("cuda")
+```
+
+Now we define the number of inference steps and the point at which the model shall be run through the 
+high-noise denoising stage (*i.e.* the base model).
+
+```py
+n_steps = 40
+high_noise_frac = 0.8
+```
+
+Stable Diffusion XL base is trained on timesteps 0-999 and Stable Diffusion XL refiner is finetuned
+from the base model on low noise timesteps 0-199 inclusive, so we use the base model for the first
+800 timesteps (high noise) and the refiner for the last 200 timesteps (low noise). Hence, `high_noise_frac`
+is set to 0.8, so that all steps 200-999 (the first 80% of denoising timesteps) are performed by the
+base model and steps 0-199 (the last 20% of denoising timesteps) are performed by the refiner model.
+
+Remember, the denoising process starts at **high value** (high noise) timesteps and ends at
+**low value** (low noise) timesteps.
+
+Let's run the two pipelines now. Make sure to set `denoising_end` and
+`denoising_start` to the same values and keep `num_inference_steps` constant. Also remember that
+the output of the base model should be in latent space:
+
+```py
+prompt = "A majestic lion jumping from a big stone at night"
+
+image = base(
+    prompt=prompt,
+    num_inference_steps=n_steps,
+    denoising_end=high_noise_frac,
+    output_type="latent",
+).images
+image = refiner(
+    prompt=prompt,
+    num_inference_steps=n_steps,
+    denoising_start=high_noise_frac,
+    image=image,
+).images[0]
+```
+
+Let's have a look at the images
+
+| Original Image | Ensemble of Denoisers Experts |
+|---|---|
+| ![lion_base_timesteps](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/lion_base.png) | ![lion_refined_timesteps](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/lion_refined.png) |
+
+If we had just run the base model for the same 40 steps, the image would arguably have been less detailed (e.g. the lion's eyes and nose).

 <Tip>

-To learn how to use SDXL for various tasks, how to optimize performance, and other usage examples, take a look at the [Stable Diffusion XL](/using-diffusers/sdxl) guide.
-
-Check out the [Stability AI](https://huggingface.co/stabilityai) Hub organization for the official base and refiner model checkpoints! 
+The ensemble-of-experts method works well on all available schedulers!

 </Tip>

+#### 2.) Refining the image output from fully denoised base image
+
+In standard [`StableDiffusionImg2ImgPipeline`]-fashion, the fully-denoised image generated by the base model
+can be further improved using the [refiner checkpoint](https://huggingface.co/stabilityai/stable-diffusion-xl-refiner-1.0).
+
+For this, you simply run the refiner as a normal image-to-image pipeline after the "base" text-to-image 
+pipeline. You can leave the outputs of the base model in latent space.
+
+```py
+from diffusers import DiffusionPipeline
+import torch
+
+pipe = DiffusionPipeline.from_pretrained(
+    "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16, variant="fp16", use_safetensors=True
+)
+pipe.to("cuda")
+
+refiner = DiffusionPipeline.from_pretrained(
+    "stabilityai/stable-diffusion-xl-refiner-1.0",
+    text_encoder_2=pipe.text_encoder_2,
+    vae=pipe.vae,
+    torch_dtype=torch.float16,
+    use_safetensors=True,
+    variant="fp16",
+)
+refiner.to("cuda")
+
+prompt = "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k"
+
+image = pipe(prompt=prompt, output_type="latent").images[0]
+image = refiner(prompt=prompt, image=image[None, :]).images[0]
+```
+
+| Original Image | Refined Image |
+|---|---|
+| ![](https://huggingface.co/datasets/diffusers/docs-images/resolve/main/sd_xl/init_image.png) | ![](https://huggingface.co/datasets/diffusers/docs-images/resolve/main/sd_xl/refined_image.png) |
+
+<Tip>
+
+The refiner can also be used very well in an in-painting setting. To do so, just make
+  sure you use the [`StableDiffusionXLInpaintPipeline`] class as shown below.
+
+</Tip>
+
+To use the refiner for inpainting in the Ensemble of Expert Denoisers setting you can do the following:
+
+```py
+from diffusers import StableDiffusionXLInpaintPipeline
+from diffusers.utils import load_image
+
+pipe = StableDiffusionXLInpaintPipeline.from_pretrained(
+    "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16, variant="fp16", use_safetensors=True
+)
+pipe.to("cuda")
+
+refiner = StableDiffusionXLInpaintPipeline.from_pretrained(
+    "stabilityai/stable-diffusion-xl-refiner-1.0",
+    text_encoder_2=pipe.text_encoder_2,
+    vae=pipe.vae,
+    torch_dtype=torch.float16,
+    use_safetensors=True,
+    variant="fp16",
+)
+refiner.to("cuda")
+
+img_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo.png"
+mask_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo_mask.png"
+
+init_image = load_image(img_url).convert("RGB")
+mask_image = load_image(mask_url).convert("RGB")
+
+prompt = "A majestic tiger sitting on a bench"
+num_inference_steps = 75
+high_noise_frac = 0.7
+
+image = pipe(
+    prompt=prompt,
+    image=init_image,
+    mask_image=mask_image,
+    num_inference_steps=num_inference_steps,
+    denoising_end=high_noise_frac,
+    output_type="latent",
+).images
+image = refiner(
+    prompt=prompt,
+    image=image,
+    mask_image=mask_image,
+    num_inference_steps=num_inference_steps,
+    denoising_start=high_noise_frac,
+).images[0]
+```
+
+To use the refiner for inpainting in the standard SDE-style setting, simply remove `denoising_end` and `denoising_start` and choose a smaller
+number of inference steps for the refiner.
+
+### Loading single file checkpoints / original file format
+
+By making use of [`~diffusers.loaders.FromSingleFileMixin.from_single_file`] you can also load the 
+original file format into `diffusers`:
+
+```py
+from diffusers import StableDiffusionXLPipeline, StableDiffusionXLImg2ImgPipeline
+import torch
+
+pipe = StableDiffusionXLPipeline.from_single_file(
+    "./sd_xl_base_1.0.safetensors", torch_dtype=torch.float16, variant="fp16", use_safetensors=True
+)
+pipe.to("cuda")
+
+refiner = StableDiffusionXLImg2ImgPipeline.from_single_file(
+    "./sd_xl_refiner_1.0.safetensors", torch_dtype=torch.float16, use_safetensors=True, variant="fp16"
+)
+refiner.to("cuda")
+```
+
+### Memory optimization via model offloading 
+
+If you are seeing out-of-memory errors, we recommend making use of [`StableDiffusionXLPipeline.enable_model_cpu_offload`].
+
+```diff
+- pipe.to("cuda")
++ pipe.enable_model_cpu_offload()
+```
+
+and 
+
+```diff
+- refiner.to("cuda")
++ refiner.enable_model_cpu_offload()
+```
+
+### Speed-up inference with `torch.compile`
+
+You can speed up inference by making use of `torch.compile`. This should give you **ca.** 20% speed-up.
+
+```diff
++ pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)
++ refiner.unet = torch.compile(refiner.unet, mode="reduce-overhead", fullgraph=True)
+```
+
+### Running with `torch < 2.0`
+
+**Note** that if you want to run Stable Diffusion XL with `torch` < 2.0, please make sure to enable xformers 
+attention:
+
+```
+pip install xformers
+```
+
+```diff
+pipe.enable_xformers_memory_efficient_attention()
+refiner.enable_xformers_memory_efficient_attention()
+```
+
 ## StableDiffusionXLPipeline

 [[autodoc]] StableDiffusionXLPipeline
@@ -50,3 +387,25 @@ Check out the [Stability AI](https://huggingface.co/stabilityai) Hub organizatio
 [[autodoc]] StableDiffusionXLInpaintPipeline
 	- all
 	- __call__
+
+### Passing different prompts to each text-encoder
+
+Stable Diffusion XL was trained on two text encoders. The default behavior is to pass the same prompt to each. But it is possible to pass a different prompt for each text-encoder, as [some users](https://github.com/huggingface/diffusers/issues/4004#issuecomment-1627764201) noted that it can boost quality.
+To do so, you can pass `prompt_2` and `negative_prompt_2` in addition to `prompt` and `negative_prompt`. By doing that, you will pass the original prompts and negative prompts (as in `prompt` and `negative_prompt`) to `text_encoder` (in official SDXL 0.9/1.0 that is [OpenAI CLIP-ViT/L-14](https://huggingface.co/openai/clip-vit-large-patch14)),
+and `prompt_2` and `negative_prompt_2` to `text_encoder_2` (in official SDXL 0.9/1.0 that is [OpenCLIP-ViT/bigG-14](https://huggingface.co/laion/CLIP-ViT-bigG-14-laion2B-39B-b160k)).
+
+```py
+from diffusers import StableDiffusionXLPipeline
+import torch
+
+pipe = StableDiffusionXLPipeline.from_pretrained(
+    "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16, variant="fp16", use_safetensors=True
+)
+pipe.to("cuda")
+
+# prompt will be passed to OAI CLIP-ViT/L-14
+prompt = "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k"
+# prompt_2 will be passed to OpenCLIP-ViT/bigG-14
+prompt_2 = "monet painting"
+image = pipe(prompt=prompt, prompt_2=prompt_2).images[0]
+```
--- a/docs/source/en/api/pipelines/unidiffuser.md
+++ b/docs/source/en/api/pipelines/unidiffuser.md
@@ -20,12 +20,6 @@ The abstract from the [paper](https://arxiv.org/abs/2303.06555) is:

 You can find the original codebase at [thu-ml/unidiffuser](https://github.com/thu-ml/unidiffuser) and additional checkpoints at [thu-ml](https://huggingface.co/thu-ml).

-<Tip warning={true}>
-
-There is currently an issue on PyTorch 1.X where the output images are all black or the pixel values become `NaNs`. This issue can be mitigated by switching to PyTorch 2.X.
-
-</Tip>
-
 This pipeline was contributed by [dg845](https://github.com/dg845). ❤️

 ## Usage Examples
--- a/docs/source/en/api/schedulers/cm_stochastic_iterative.md
+++ b/docs/source/en/api/schedulers/cm_stochastic_iterative.md
@@ -1,15 +1,11 @@
-# CMStochasticIterativeScheduler
+# Consistency Model Multistep Scheduler

-[Consistency Models](https://huggingface.co/papers/2303.01469) by Yang Song, Prafulla Dhariwal, Mark Chen, and Ilya Sutskever introduced a multistep and onestep scheduler (Algorithm 1) that is capable of generating good samples in one or a small number of steps.
+## Overview

-The abstract from the paper is:
-
-*Diffusion models have made significant breakthroughs in image, audio, and video generation, but they depend on an iterative generation process that causes slow sampling speed and caps their potential for real-time applications. To overcome this limitation, we propose consistency models, a new family of generative models that achieve high sample quality without adversarial training. They support fast one-step generation by design, while still allowing for few-step sampling to trade compute for sample quality. They also support zero-shot data editing, like image inpainting, colorization, and super-resolution, without requiring explicit training on these tasks. Consistency models can be trained either as a way to distill pre-trained diffusion models, or as standalone generative models. Through extensive experiments, we demonstrate that they outperform existing distillation techniques for diffusion models in one- and few-step generation. For example, we achieve the new state-of-the-art FID of 3.55 on CIFAR-10 and 6.20 on ImageNet 64x64 for one-step generation. When trained as standalone generative models, consistency models also outperform single-step, non-adversarial generative models on standard benchmarks like CIFAR-10, ImageNet 64x64 and LSUN 256x256.*
-
-The original codebase can be found at [openai/consistency_models](https://github.com/openai/consistency_models).
+Multistep and onestep scheduler (Algorithm 1) introduced alongside consistency models in the paper [Consistency Models](https://arxiv.org/abs/2303.01469) by Yang Song, Prafulla Dhariwal, Mark Chen, and Ilya Sutskever.
+Based on the [original consistency models implementation](https://github.com/openai/consistency_models).
+Should generate good samples from [`ConsistencyModelPipeline`] in one or a small number of steps.

 ## CMStochasticIterativeScheduler
 [[autodoc]] CMStochasticIterativeScheduler

-## CMStochasticIterativeSchedulerOutput
-[[autodoc]] schedulers.scheduling_consistency_models.CMStochasticIterativeSchedulerOutput
--- a/docs/source/en/api/schedulers/ddim.md
+++ b/docs/source/en/api/schedulers/ddim.md
@@ -10,11 +10,13 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
 specific language governing permissions and limitations under the License.
 -->

-# DDIMScheduler
+# Denoising Diffusion Implicit Models (DDIM)

-[Denoising Diffusion Implicit Models](https://huggingface.co/papers/2010.02502) (DDIM) by Jiaming Song, Chenlin Meng and Stefano Ermon.
+## Overview

-The abstract from the paper is:
+[Denoising Diffusion Implicit Models](https://arxiv.org/abs/2010.02502) (DDIM) by Jiaming Song, Chenlin Meng and Stefano Ermon.
+
+The abstract of the paper is the following:

 *Denoising diffusion probabilistic models (DDPMs) have achieved high quality image generation without adversarial training, 
 yet they require simulating a Markov chain for many steps to produce a sample. 
@@ -24,43 +26,50 @@ We construct a class of non-Markovian diffusion processes that lead to the same
 We empirically demonstrate that DDIMs can produce high quality samples 10× to 50× faster in terms of wall-clock time compared to DDPMs, allow us to trade off 
 computation for sample quality, and can perform semantically meaningful image interpolation directly in the latent space.*

-The original codebase of this paper can be found at [ermongroup/ddim](https://github.com/ermongroup/ddim), and you can contact the author on [tsong.me](https://tsong.me/).
+The original codebase of this paper can be found here: [ermongroup/ddim](https://github.com/ermongroup/ddim).
+For questions, feel free to contact the author on [tsong.me](https://tsong.me/).

-## Tips
+### Experimental: "Common Diffusion Noise Schedules and Sample Steps are Flawed": 

-The paper [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) claims that a mismatch between the training and inference settings leads to suboptimal inference generation results for Stable Diffusion. To fix this, the authors propose:
+The paper **[Common Diffusion Noise Schedules and Sample Steps are Flawed](https://arxiv.org/abs/2305.08891)** 
+claims that a mismatch between the training and inference settings leads to suboptimal inference generation results for Stable Diffusion.

-<Tip warning={true}>
+The abstract reads as follows:

-🧪 This is an experimental feature!
-
-</Tip>
-
-1. rescale the noise schedule to enforce zero terminal signal-to-noise ratio (SNR)
+*We discover that common diffusion noise schedules do not enforce the last timestep to have zero signal-to-noise ratio (SNR),
+and some implementations of diffusion samplers do not start from the last timestep.
+Such designs are flawed and do not reflect the fact that the model is given pure Gaussian noise at inference, creating a discrepancy between training and inference.
+We show that the flawed design causes real problems in existing implementations. 
+In Stable Diffusion, it severely limits the model to only generate images with medium brightness and 
+prevents it from generating very bright and dark samples. We propose a few simple fixes: 
+- (1) rescale the noise schedule to enforce zero terminal SNR; 
+- (2) train the model with v prediction; 
+- (3) change the sampler to always start from the last timestep; 
+- (4) rescale classifier-free guidance to prevent over-exposure. 
+These simple changes ensure the diffusion process is congruent between training and inference and 
+allow the model to generate samples more faithful to the original data distribution.*

+You can apply all of these changes in `diffusers` when using [`DDIMScheduler`]:
+- (1) rescale the noise schedule to enforce zero terminal SNR; 
 ```py
 pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config, rescale_betas_zero_snr=True)
 ```
-
-2. train a model with `v_prediction` (add the following argument to the [train_text_to_image.py](https://github.com/huggingface/diffusers/blob/main/examples/text_to_image/train_text_to_image.py) or [train_text_to_image_lora.py](https://github.com/huggingface/diffusers/blob/main/examples/text_to_image/train_text_to_image_lora.py) scripts)
-
-```bash
--prediction_type="v_prediction"
-```
-
-3. change the sampler to always start from the last timestep
-
+- (2) train the model with v prediction; 
+Continue fine-tuning a checkpoint with [`train_text_to_image.py`](https://github.com/huggingface/diffusers/blob/main/examples/text_to_image/train_text_to_image.py) or [`train_text_to_image_lora.py`](https://github.com/huggingface/diffusers/blob/main/examples/text_to_image/train_text_to_image_lora.py)
+and `--prediction_type="v_prediction"`.
+- (3) change the sampler to always start from the last timestep; 
 ```py
 pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config, timestep_spacing="trailing")
 ```
-
-4. rescale classifier-free guidance to prevent over-exposure
-
+- (4) rescale classifier-free guidance to prevent over-exposure. 
 ```py
-image = pipeline(prompt, guidance_rescale=0.7).images[0]
+pipe(..., guidance_rescale=0.7)
 ```

-For example:
+An example is to use [this checkpoint](https://huggingface.co/ptx0/pseudo-journey-v2) 
+which has been fine-tuned using `"v_prediction"`.
+
+The checkpoint can then be run in inference as follows:

 ```py
 from diffusers import DiffusionPipeline, DDIMScheduler
@@ -77,6 +86,3 @@ image = pipeline(prompt, guidance_rescale=0.7).images[0]

 ## DDIMScheduler
 [[autodoc]] DDIMScheduler
-
-## DDIMSchedulerOutput
-[[autodoc]] schedulers.scheduling_ddim.DDIMSchedulerOutput
--- a/docs/source/en/api/schedulers/ddim_inverse.md
+++ b/docs/source/en/api/schedulers/ddim_inverse.md
@@ -10,10 +10,12 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
 specific language governing permissions and limitations under the License.
 -->

-# DDIMInverseScheduler
+# Inverse Denoising Diffusion Implicit Models (DDIMInverse)

-`DDIMInverseScheduler` is the inverted scheduler from [Denoising Diffusion Implicit Models](https://huggingface.co/papers/2010.02502) (DDIM) by Jiaming Song, Chenlin Meng and Stefano Ermon.
-The implementation is mostly based on the DDIM inversion definition from [Null-text Inversion for Editing Real Images using Guided Diffusion Models](https://huggingface.co/papers/2211.09794.pdf).
+## Overview
+
+This scheduler is the inverted scheduler of [Denoising Diffusion Implicit Models](https://arxiv.org/abs/2010.02502) (DDIM) by Jiaming Song, Chenlin Meng and Stefano Ermon.
+The implementation is mostly based on the DDIM inversion definition of [Null-text Inversion for Editing Real Images using Guided Diffusion Models](https://arxiv.org/pdf/2211.09794.pdf).

 ## DDIMInverseScheduler
 [[autodoc]] DDIMInverseScheduler
--- a/docs/source/en/api/schedulers/ddpm.md
+++ b/docs/source/en/api/schedulers/ddpm.md
@@ -10,16 +10,18 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
 specific language governing permissions and limitations under the License.
 -->

-# DDPMScheduler
+# Denoising Diffusion Probabilistic Models (DDPM)

-[Denoising Diffusion Probabilistic Models](https://huggingface.co/papers/2006.11239) (DDPM) by Jonathan Ho, Ajay Jain and Pieter Abbeel proposes a diffusion based model of the same name. In the context of the 🤗 Diffusers library, DDPM refers to the discrete denoising scheduler from the paper as well as the pipeline.
+## Overview

-The abstract from the paper is:
+[Denoising Diffusion Probabilistic Models](https://arxiv.org/abs/2006.11239) 
+ (DDPM) by Jonathan Ho, Ajay Jain and Pieter Abbeel proposes the diffusion based model of the same name, but in the context of the 🤗 Diffusers library, DDPM refers to the discrete denoising scheduler from the paper as well as the pipeline.

-*We present high quality image synthesis results using diffusion probabilistic models, a class of latent variable models inspired by considerations from nonequilibrium thermodynamics. Our best results are obtained by training on a weighted variational bound designed according to a novel connection between diffusion probabilistic models and denoising score matching with Langevin dynamics, and our models naturally admit a progressive lossy decompression scheme that can be interpreted as a generalization of autoregressive decoding. On the unconditional CIFAR10 dataset, we obtain an Inception score of 9.46 and a state-of-the-art FID score of 3.17. On 256x256 LSUN, we obtain sample quality similar to ProgressiveGAN.*
+The abstract of the paper is the following:
+
+We present high quality image synthesis results using diffusion probabilistic models, a class of latent variable models inspired by considerations from nonequilibrium thermodynamics. Our best results are obtained by training on a weighted variational bound designed according to a novel connection between diffusion probabilistic models and denoising score matching with Langevin dynamics, and our models naturally admit a progressive lossy decompression scheme that can be interpreted as a generalization of autoregressive decoding. On the unconditional CIFAR10 dataset, we obtain an Inception score of 9.46 and a state-of-the-art FID score of 3.17. On 256x256 LSUN, we obtain sample quality similar to ProgressiveGAN.
+
+The original paper can be found [here](https://arxiv.org/abs/2006.11239).

 ## DDPMScheduler
 [[autodoc]] DDPMScheduler
-
-## DDPMSchedulerOutput
-[[autodoc]] schedulers.scheduling_ddpm.DDPMSchedulerOutput
--- a/docs/source/en/api/schedulers/deis.md
+++ b/docs/source/en/api/schedulers/deis.md
@@ -10,27 +10,13 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
 specific language governing permissions and limitations under the License.
 -->

-# DEISMultistepScheduler
+# DEIS

-Diffusion Exponential Integrator Sampler (DEIS) is proposed in [Fast Sampling of Diffusion Models with Exponential Integrator](https://huggingface.co/papers/2204.13902) by Qinsheng Zhang and Yongxin Chen. `DEISMultistepScheduler` is a fast high order solver for diffusion ordinary differential equations (ODEs). 
+Fast Sampling of Diffusion Models with Exponential Integrator.

-This implementation modifies the polynomial fitting formula in log-rho space instead of the original linear `t` space in the DEIS paper. The modification enjoys closed-form coefficients for exponential multistep update instead of replying on the numerical solver.
+## Overview

-The abstract from the paper is:
-
-*The past few years have witnessed the great success of Diffusion models~(DMs) in generating high-fidelity samples in generative modeling tasks. A major limitation of the DM is its notoriously slow sampling procedure which normally requires hundreds to thousands of time discretization steps of the learned diffusion process to reach the desired accuracy. Our goal is to develop a fast sampling method for DMs with a much less number of steps while retaining high sample quality. To this end, we systematically analyze the sampling procedure in DMs and identify key factors that affect the sample quality, among which the method of discretization is most crucial. By carefully examining the learned diffusion process, we propose Diffusion Exponential Integrator Sampler~(DEIS). It is based on the Exponential Integrator designed for discretizing ordinary differential equations (ODEs) and leverages a semilinear structure of the learned diffusion process to reduce the discretization error. The proposed method can be applied to any DMs and can generate high-fidelity samples in as few as 10 steps. In our experiments, it takes about 3 minutes on one A6000 GPU to generate 50k images from CIFAR10. Moreover, by directly using pre-trained DMs, we achieve the state-of-art sampling performance when the number of score function evaluation~(NFE) is limited, e.g., 4.17 FID with 10 NFEs, 3.37 FID, and 9.74 IS with only 15 NFEs on CIFAR10. Code is available at [this https URL](https://github.com/qsh-zh/deis).*
-
-The original codebase can be found at [qsh-zh/deis](https://github.com/qsh-zh/deis).
-
-## Tips
-
-It is recommended to set `solver_order` to 2 or 3, while `solver_order=1` is equivalent to [`DDIMScheduler`].
-
-Dynamic thresholding from [Imagen](https://huggingface.co/papers/2205.11487) is supported, and for pixel-space
-diffusion models, you can set `thresholding=True` to use the dynamic thresholding.
+Original paper can be found [here](https://arxiv.org/abs/2204.13902). The original implementation can be found [here](https://github.com/qsh-zh/deis).

 ## DEISMultistepScheduler
 [[autodoc]] DEISMultistepScheduler
-
-## SchedulerOutput
-[[autodoc]] schedulers.scheduling_utils.SchedulerOutput
--- a/docs/source/en/api/schedulers/dpm_discrete.md
+++ b/docs/source/en/api/schedulers/dpm_discrete.md
@@ -10,14 +10,13 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
 specific language governing permissions and limitations under the License.
 -->

-# KDPM2DiscreteScheduler
+# DPM Discrete Scheduler inspired by the Karras et al. paper

-The `KDPM2DiscreteScheduler` is inspired by the [Elucidating the Design Space of Diffusion-Based Generative Models](https://huggingface.co/papers/2206.00364) paper, and the scheduler is ported from and created by [Katherine Crowson](https://github.com/crowsonkb/).
+## Overview

-The original codebase can be found at [crowsonkb/k-diffusion](https://github.com/crowsonkb/k-diffusion).
+Inspired by [Karras et al.](https://arxiv.org/abs/2206.00364). Scheduler ported from @crowsonkb's [k-diffusion](https://github.com/crowsonkb/k-diffusion) library.
+
+All credit for making this scheduler work goes to [Katherine Crowson](https://github.com/crowsonkb/).

 ## KDPM2DiscreteScheduler
-[[autodoc]] KDPM2DiscreteScheduler
-
-## SchedulerOutput
-[[autodoc]] schedulers.scheduling_utils.SchedulerOutput
+[[autodoc]] KDPM2DiscreteScheduler
--- a/docs/source/en/api/schedulers/dpm_discrete_ancestral.md
+++ b/docs/source/en/api/schedulers/dpm_discrete_ancestral.md
@@ -10,14 +10,13 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
 specific language governing permissions and limitations under the License.
 -->

-# KDPM2AncestralDiscreteScheduler
+# DPM Discrete Scheduler with ancestral sampling inspired by the Karras et al. paper

-The `KDPM2DiscreteScheduler` with ancestral sampling is inspired by the [Elucidating the Design Space of Diffusion-Based Generative Models](https://huggingface.co/papers/2206.00364) paper, and the scheduler is ported from and created by [Katherine Crowson](https://github.com/crowsonkb/).
+## Overview

-The original codebase can be found at [crowsonkb/k-diffusion](https://github.com/crowsonkb/k-diffusion).
+Inspired by [Karras et al.](https://arxiv.org/abs/2206.00364). Scheduler ported from @crowsonkb's [k-diffusion](https://github.com/crowsonkb/k-diffusion) library.
+
+All credit for making this scheduler work goes to [Katherine Crowson](https://github.com/crowsonkb/).

 ## KDPM2AncestralDiscreteScheduler
-[[autodoc]] KDPM2AncestralDiscreteScheduler
-
-## SchedulerOutput
-[[autodoc]] schedulers.scheduling_utils.SchedulerOutput
+[[autodoc]] KDPM2AncestralDiscreteScheduler
--- a/docs/source/en/api/schedulers/dpm_sde.md
+++ b/docs/source/en/api/schedulers/dpm_sde.md
@@ -10,12 +10,14 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
 specific language governing permissions and limitations under the License.
 -->

-# DPMSolverSDEScheduler
+# DPM Stochastic Scheduler inspired by the Karras et al. paper

-The `DPMSolverSDEScheduler` is inspired by the stochastic sampler from the [Elucidating the Design Space of Diffusion-Based Generative Models](https://huggingface.co/papers/2206.00364) paper, and the scheduler is ported from and created by [Katherine Crowson](https://github.com/crowsonkb/).
+## Overview
+
+Inspired by the stochastic sampler from [Karras et al.](https://arxiv.org/abs/2206.00364).
+Scheduler ported from @crowsonkb's [k-diffusion](https://github.com/crowsonkb/k-diffusion) library.
+
+All credit for making this scheduler work goes to [Katherine Crowson](https://github.com/crowsonkb/).

 ## DPMSolverSDEScheduler
-[[autodoc]] DPMSolverSDEScheduler
-
-## SchedulerOutput
-[[autodoc]] schedulers.scheduling_utils.SchedulerOutput
+[[autodoc]] DPMSolverSDEScheduler
--- a/docs/source/en/api/schedulers/euler.md
+++ b/docs/source/en/api/schedulers/euler.md
@@ -10,13 +10,12 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
 specific language governing permissions and limitations under the License.
 -->

-# EulerDiscreteScheduler
+# Euler scheduler

-The Euler scheduler (Algorithm 2) is from the [Elucidating the Design Space of Diffusion-Based Generative Models](https://huggingface.co/papers/2206.00364) paper by Karras et al. This is a fast scheduler which can often generate good outputs in 20-30 steps. The scheduler is based on the original [k-diffusion](https://github.com/crowsonkb/k-diffusion/blob/481677d114f6ea445aa009cf5bd7a9cdee909e47/k_diffusion/sampling.py#L51) implementation by [Katherine Crowson](https://github.com/crowsonkb/).
+## Overview

+Euler scheduler (Algorithm 2) from the paper [Elucidating the Design Space of Diffusion-Based Generative Models](https://arxiv.org/abs/2206.00364) by Karras et al. (2022). Based on the original [k-diffusion](https://github.com/crowsonkb/k-diffusion/blob/481677d114f6ea445aa009cf5bd7a9cdee909e47/k_diffusion/sampling.py#L51) implementation by Katherine Crowson.
+Fast scheduler which often generates good outputs with 20-30 steps.

 ## EulerDiscreteScheduler
-[[autodoc]] EulerDiscreteScheduler
-
-## EulerDiscreteSchedulerOutput
-[[autodoc]] schedulers.scheduling_euler_discrete.EulerDiscreteSchedulerOutput
+[[autodoc]] EulerDiscreteScheduler
--- a/docs/source/en/api/schedulers/euler_ancestral.md
+++ b/docs/source/en/api/schedulers/euler_ancestral.md
@@ -10,12 +10,12 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
 specific language governing permissions and limitations under the License.
 -->

-# EulerAncestralDiscreteScheduler
+# Euler Ancestral scheduler

-A scheduler that uses ancestral sampling with Euler method steps. This is a fast scheduler which can often generate good outputs in 20-30 steps. The scheduler is based on the original [k-diffusion](https://github.com/crowsonkb/k-diffusion/blob/481677d114f6ea445aa009cf5bd7a9cdee909e47/k_diffusion/sampling.py#L72) implementation by [Katherine Crowson](https://github.com/crowsonkb/).
+## Overview
+
+Ancestral sampling with Euler method steps. Based on the original [k-diffusion](https://github.com/crowsonkb/k-diffusion/blob/481677d114f6ea445aa009cf5bd7a9cdee909e47/k_diffusion/sampling.py#L72) implementation by Katherine Crowson.
+Fast scheduler which often generates good outputs with 20-30 steps.

 ## EulerAncestralDiscreteScheduler
 [[autodoc]] EulerAncestralDiscreteScheduler
-
-## EulerAncestralDiscreteSchedulerOutput
-[[autodoc]] schedulers.scheduling_euler_ancestral_discrete.EulerAncestralDiscreteSchedulerOutput
--- a/docs/source/en/api/schedulers/heun.md
+++ b/docs/source/en/api/schedulers/heun.md
@@ -10,12 +10,14 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
 specific language governing permissions and limitations under the License.
 -->

-# HeunDiscreteScheduler
+# Heun scheduler inspired by the Karras et al. paper

-The Heun scheduler (Algorithm 1) is from the [Elucidating the Design Space of Diffusion-Based Generative Models](https://huggingface.co/papers/2206.00364) paper by Karras et al. The scheduler is ported from the [k-diffusion](https://github.com/crowsonkb/k-diffusion) library and created by [Katherine Crowson](https://github.com/crowsonkb/).
+## Overview
+
+Algorithm 1 of [Karras et al.](https://arxiv.org/abs/2206.00364).
+Scheduler ported from @crowsonkb's [k-diffusion](https://github.com/crowsonkb/k-diffusion) library.
+
+All credit for making this scheduler work goes to [Katherine Crowson](https://github.com/crowsonkb/).

 ## HeunDiscreteScheduler
-[[autodoc]] HeunDiscreteScheduler
-
-## SchedulerOutput
-[[autodoc]] schedulers.scheduling_utils.SchedulerOutput
+[[autodoc]] HeunDiscreteScheduler
--- a/docs/source/en/api/schedulers/ipndm.md
+++ b/docs/source/en/api/schedulers/ipndm.md
@@ -10,12 +10,11 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
 specific language governing permissions and limitations under the License.
 -->

-# IPNDMScheduler
+# Improved Pseudo Numerical Methods for Diffusion Models (iPNDM)

-`IPNDMScheduler` is a fourth-order Improved Pseudo Linear Multistep scheduler. The original implementation can be found at [crowsonkb/v-diffusion-pytorch](https://github.com/crowsonkb/v-diffusion-pytorch/blob/987f8985e38208345c1959b0ea767a625831cc9b/diffusion/sampling.py#L296).
+## Overview
+
+Original implementation can be found [here](https://github.com/crowsonkb/v-diffusion-pytorch/blob/987f8985e38208345c1959b0ea767a625831cc9b/diffusion/sampling.py#L296).

 ## IPNDMScheduler
-[[autodoc]] IPNDMScheduler
-
-## SchedulerOutput
-[[autodoc]] schedulers.scheduling_utils.SchedulerOutput
+[[autodoc]] IPNDMScheduler
--- a/docs/source/en/api/schedulers/lms_discrete.md
+++ b/docs/source/en/api/schedulers/lms_discrete.md
@@ -10,12 +10,11 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
 specific language governing permissions and limitations under the License.
 -->

-# LMSDiscreteScheduler
+# Linear multistep scheduler for discrete beta schedules

-`LMSDiscreteScheduler` is a linear multistep scheduler for discrete beta schedules. The scheduler is ported from and created by [Katherine Crowson](https://github.com/crowsonkb/), and the original implementation can be found at [crowsonkb/k-diffusion](https://github.com/crowsonkb/k-diffusion/blob/481677d114f6ea445aa009cf5bd7a9cdee909e47/k_diffusion/sampling.py#L181).
+## Overview
+
+Original implementation can be found [here](https://github.com/crowsonkb/k-diffusion/blob/481677d114f6ea445aa009cf5bd7a9cdee909e47/k_diffusion/sampling.py#L181).

 ## LMSDiscreteScheduler
-[[autodoc]] LMSDiscreteScheduler
-
-## LMSDiscreteSchedulerOutput
-[[autodoc]] schedulers.scheduling_lms_discrete.LMSDiscreteSchedulerOutput
+[[autodoc]] LMSDiscreteScheduler
--- a/docs/source/en/api/schedulers/multistep_dpm_solver.md
+++ b/docs/source/en/api/schedulers/multistep_dpm_solver.md
@@ -10,26 +10,11 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
 specific language governing permissions and limitations under the License.
 -->

-# DPMSolverMultistepScheduler
+# Multistep DPM-Solver

-`DPMSolverMultistep` is a multistep scheduler from [DPM-Solver: A Fast ODE Solver for Diffusion Probabilistic Model Sampling in Around 10 Steps](https://huggingface.co/papers/2206.00927) and [DPM-Solver++: Fast Solver for Guided Sampling of Diffusion Probabilistic Models](https://huggingface.co/papers/2211.01095) by Cheng Lu, Yuhao Zhou, Fan Bao, Jianfei Chen, Chongxuan Li, and Jun Zhu.
+## Overview

-DPMSolver (and the improved version DPMSolver++) is a fast dedicated high-order solver for diffusion ODEs with convergence order guarantee. Empirically, DPMSolver sampling with only 20 steps can generate high-quality
-samples, and it can generate quite good samples even in 10 steps.
-
-## Tips
-
-It is recommended to set `solver_order` to 2 for guided sampling, and `solver_order=3` for unconditional sampling.
-
-Dynamic thresholding from Imagen (https://huggingface.co/papers/2205.11487) is supported, and for pixel-space
-diffusion models, you can set both `algorithm_type="dpmsolver++"` and `thresholding=True` to use the dynamic
-thresholding. This thresholding method is unsuitable for latent-space diffusion models such as
-Stable Diffusion.
-
-The SDE variant of DPMSolver and DPM-Solver++ is also supported, but only for the first and second-order solvers. This is a fast SDE solver for the reverse diffusion SDE. It is recommended to use the second-order `sde-dpmsolver++`.
+Original paper can be found [here](https://arxiv.org/abs/2206.00927) and the [improved version](https://arxiv.org/abs/2211.01095). The original implementation can be found [here](https://github.com/LuChengTHU/dpm-solver).

 ## DPMSolverMultistepScheduler
-[[autodoc]] DPMSolverMultistepScheduler
-
-## SchedulerOutput
-[[autodoc]] schedulers.scheduling_utils.SchedulerOutput
+[[autodoc]] DPMSolverMultistepScheduler
--- a/docs/source/en/api/schedulers/multistep_dpm_solver_inverse.md
+++ b/docs/source/en/api/schedulers/multistep_dpm_solver_inverse.md
@@ -10,21 +10,13 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
 specific language governing permissions and limitations under the License.
 -->

-# DPMSolverMultistepInverse
+# Inverse Multistep DPM-Solver (DPMSolverMultistepInverse)

-`DPMSolverMultistepInverse` is the inverted scheduler from [DPM-Solver: A Fast ODE Solver for Diffusion Probabilistic Model Sampling in Around 10 Steps](https://huggingface.co/papers/2206.00927) and [DPM-Solver++: Fast Solver for Guided Sampling of Diffusion Probabilistic Models](https://huggingface.co/papers/2211.01095) by Cheng Lu, Yuhao Zhou, Fan Bao, Jianfei Chen, Chongxuan Li, and Jun Zhu.
+## Overview

-The implementation is mostly based on the DDIM inversion definition of [Null-text Inversion for Editing Real Images using Guided Diffusion Models](https://huggingface.co/papers/2211.09794.pdf) and notebook implementation of the [`DiffEdit`] latent inversion from [Xiang-cd/DiffEdit-stable-diffusion](https://github.com/Xiang-cd/DiffEdit-stable-diffusion/blob/main/diffedit.ipynb).
-
-## Tips
-
-Dynamic thresholding from Imagen (https://huggingface.co/papers/2205.11487) is supported, and for pixel-space
-diffusion models, you can set both `algorithm_type="dpmsolver++"` and `thresholding=True` to use the dynamic
-thresholding. This thresholding method is unsuitable for latent-space diffusion models such as
-Stable Diffusion.
+This scheduler is the inverted scheduler of [DPM-Solver: A Fast ODE Solver for Diffusion Probabilistic Model Sampling in Around 10 Steps](https://arxiv.org/abs/2206.00927) and [DPM-Solver++: Fast Solver for Guided Sampling of Diffusion Probabilistic Models](https://arxiv.org/abs/2211.01095)
+by Cheng Lu, Yuhao Zhou, Fan Bao, Jianfei Chen, Chongxuan Li, and Jun Zhu.
+The implementation is mostly based on the DDIM inversion definition of [Null-text Inversion for Editing Real Images using Guided Diffusion Models](https://arxiv.org/pdf/2211.09794.pdf) and the ad-hoc notebook implementation for DiffEdit latent inversion [here](https://github.com/Xiang-cd/DiffEdit-stable-diffusion/blob/main/diffedit.ipynb).

 ## DPMSolverMultistepInverseScheduler
 [[autodoc]] DPMSolverMultistepInverseScheduler
-
-## SchedulerOutput
-[[autodoc]] schedulers.scheduling_utils.SchedulerOutput
--- a/docs/source/en/api/schedulers/overview.md
+++ b/docs/source/en/api/schedulers/overview.md
@@ -12,53 +12,81 @@ specific language governing permissions and limitations under the License.

 # Schedulers

-🤗 Diffusers provides many scheduler functions for the diffusion process. A scheduler takes a model's output (the sample which the diffusion process is iterating on) and a timestep to return a denoised sample. The timestep is important because it dictates where in the diffusion process the step is; data is generated by iterating forward *n* timesteps and inference occurs by propagating backward through the timesteps. Based on the timestep, a scheduler may be *discrete* in which case the timestep is an `int` or *continuous* in which case the timestep is a `float`.
+Diffusers contains multiple pre-built schedule functions for the diffusion process.

-Depending on the context, a scheduler defines how to iteratively add noise to an image or how to update a sample based on a model's output:
+## What is a scheduler?

- during *training*, a scheduler adds noise (there are different algorithms for how to add noise) to a sample to train a diffusion model
- during *inference*, a scheduler defines how to update a sample based on a pretrained model's output
+The schedule functions, denoted *Schedulers* in the library, take in the output of a trained model, a sample which the diffusion process is iterating on, and a timestep to return a denoised sample. That's why schedulers may also be called *Samplers* in other diffusion model implementations.

-Many schedulers are implemented from the [k-diffusion](https://github.com/crowsonkb/k-diffusion) library by [Katherine Crowson](https://github.com/crowsonkb/), and they're also widely used in A1111. To help you map the schedulers from k-diffusion and A1111 to the schedulers in 🤗 Diffusers, take a look at the table below:
+- Schedulers define the methodology for iteratively adding noise to an image or for updating a sample based on model outputs.
+    - adding noise in different manners represents the algorithmic processes to train a diffusion model by adding noise to images.
+    - for inference, the scheduler defines how to update a sample based on an output from a pretrained model.
+- Schedulers are often defined by a *noise schedule* and an *update rule* to solve the differential equation.

-| A1111/k-diffusion    | 🤗 Diffusers                         | Usage                                                                                                         |
-|---------------------|-------------------------------------|---------------------------------------------------------------------------------------------------------------|
-| DPM++ 2M            | [`DPMSolverMultistepScheduler`]     |                                                                                                               |
-| DPM++ 2M Karras     | [`DPMSolverMultistepScheduler`]     | init with `use_karras_sigmas=True`                                                                            |
-| DPM++ 2M SDE        | [`DPMSolverMultistepScheduler`]     | init with `algorithm_type="sde-dpmsolver++"`                                                                  |
-| DPM++ 2M SDE Karras | [`DPMSolverMultistepScheduler`]     | init with `use_karras_sigmas=True` and `algorithm_type="sde-dpmsolver++"`                                     |
-| DPM++ 2S a          | N/A                                 | very similar to  `DPMSolverSinglestepScheduler`                         |
-| DPM++ 2S a Karras   | N/A                                 | very similar to  `DPMSolverSinglestepScheduler(use_karras_sigmas=True, ...)` |
-| DPM++ SDE           | [`DPMSolverSinglestepScheduler`]    |                                                                                                               |
-| DPM++ SDE Karras    | [`DPMSolverSinglestepScheduler`]    | init with `use_karras_sigmas=True`                                                                            |
-| DPM2                | [`KDPM2DiscreteScheduler`]          |                                                                                                               |
-| DPM2 Karras         | [`KDPM2DiscreteScheduler`]          | init with `use_karras_sigmas=True`                                                                            |
-| DPM2 a              | [`KDPM2AncestralDiscreteScheduler`] |                                                                                                               |
-| DPM2 a Karras       | [`KDPM2AncestralDiscreteScheduler`] | init with `use_karras_sigmas=True`                                                                            |
-| DPM adaptive        | N/A                                 |                                                                                                               |
-| DPM fast            | N/A                                 |                                                                                                               |
-| Euler               | [`EulerDiscreteScheduler`]          |                                                                                                               |
-| Euler a             | [`EulerAncestralDiscreteScheduler`] |                                                                                                               |
-| Heun                | [`HeunDiscreteScheduler`]           |                                                                                                               |
-| LMS                 | [`LMSDiscreteScheduler`]            |                                                                                                               |
-| LMS Karras          | [`LMSDiscreteScheduler`]            | init with `use_karras_sigmas=True`                                                                            |
-| N/A                 | [`DEISMultistepScheduler`]          |                                                                                                               |
-| N/A                 | [`UniPCMultistepScheduler`]         |                                                                                                               |
+### Discrete versus continuous schedulers

-All schedulers are built from the base [`SchedulerMixin`] class which implements low level utilities shared by all schedulers.
+All schedulers take in a timestep to predict the updated version of the sample being diffused.
+The timesteps dictate where in the diffusion process the step is, where data is generated by iterating forward in time and inference is executed by propagating backwards through timesteps.
+Different algorithms use timesteps that can be discrete (accepting `int` inputs), such as the [`DDPMScheduler`] or [`PNDMScheduler`], or continuous (accepting `float` inputs), such as the score-based schedulers [`ScoreSdeVeScheduler`] or [`ScoreSdeVpScheduler`].

-## SchedulerMixin
+## Designing Re-usable schedulers
+
+The core design principle behind the schedule functions is to be model, system, and framework independent.
+This allows for rapid experimentation and cleaner abstractions in the code, where the model prediction is separated from the sample update.
+To this end, the design of schedulers is such that:
+
+- Schedulers can be used interchangeably between diffusion models in inference to find the preferred trade-off between speed and generation quality.
+- Schedulers are currently implemented in PyTorch by default, but are designed to be framework independent (partial Jax support currently exists).
+- Many diffusion pipelines, such as [`StableDiffusionPipeline`] and [`DiTPipeline`], can use any of [`KarrasDiffusionSchedulers`].
+
+## Schedulers Summary
+
+The following table summarizes all officially supported schedulers and their corresponding papers or reference implementations:
+
+| Scheduler | Paper |
+|---|---|
+| [ddim](./ddim) | [**Denoising Diffusion Implicit Models**](https://arxiv.org/abs/2010.02502) |
+| [ddim_inverse](./ddim_inverse) | [**Denoising Diffusion Implicit Models**](https://arxiv.org/abs/2010.02502) |
+| [ddpm](./ddpm) | [**Denoising Diffusion Probabilistic Models**](https://arxiv.org/abs/2006.11239) |
+| [deis](./deis) | [**DEISMultistepScheduler**](https://arxiv.org/abs/2204.13902) |
+| [singlestep_dpm_solver](./singlestep_dpm_solver) | [**Singlestep DPM-Solver**](https://arxiv.org/abs/2206.00927) |
+| [multistep_dpm_solver](./multistep_dpm_solver) | [**Multistep DPM-Solver**](https://arxiv.org/abs/2206.00927) |
+| [heun](./heun) | [**Heun scheduler inspired by Karras et al. paper**](https://arxiv.org/abs/2206.00364) |
+| [dpm_discrete](./dpm_discrete) | [**DPM Discrete Scheduler inspired by Karras et al. paper**](https://arxiv.org/abs/2206.00364) |
+| [dpm_discrete_ancestral](./dpm_discrete_ancestral) | [**DPM Discrete Scheduler with ancestral sampling inspired by Karras et al. paper**](https://arxiv.org/abs/2206.00364) |
+| [stochastic_karras_ve](./stochastic_karras_ve) | [**Variance exploding, stochastic sampling from Karras et al.**](https://arxiv.org/abs/2206.00364) |
+| [lms_discrete](./lms_discrete) | [**Linear multistep scheduler for discrete beta schedules**](https://arxiv.org/abs/2206.00364) |
+| [pndm](./pndm) | [**Pseudo numerical methods for diffusion models (PNDM)**](https://github.com/crowsonkb/k-diffusion/blob/481677d114f6ea445aa009cf5bd7a9cdee909e47/k_diffusion/sampling.py#L181) |
+| [score_sde_ve](./score_sde_ve) | [**Variance exploding stochastic differential equation (VE-SDE) scheduler**](https://arxiv.org/abs/2011.13456) |
+| [ipndm](./ipndm) | [**Improved pseudo numerical methods for diffusion models (iPNDM)**](https://github.com/crowsonkb/v-diffusion-pytorch/blob/987f8985e38208345c1959b0ea767a625831cc9b/diffusion/sampling.py#L296) |
+| [score_sde_vp](./score_sde_vp) | [**Variance preserving stochastic differential equation (VP-SDE) scheduler**](https://arxiv.org/abs/2011.13456) |
+| [euler](./euler) | [**Euler scheduler**](https://arxiv.org/abs/2206.00364) |
+| [euler_ancestral](./euler_ancestral) | [**Euler Ancestral scheduler**](https://github.com/crowsonkb/k-diffusion/blob/481677d114f6ea445aa009cf5bd7a9cdee909e47/k_diffusion/sampling.py#L72) |
+| [vq_diffusion](./vq_diffusion) | [**VQDiffusionScheduler**](https://arxiv.org/abs/2111.14822) |
+| [unipc](./unipc) | [**UniPCMultistepScheduler**](https://arxiv.org/abs/2302.04867) |
+| [repaint](./repaint) | [**RePaint scheduler**](https://arxiv.org/abs/2201.09865) |
+
+## API
+
+The core API for any new scheduler must follow a limited structure.
+- Schedulers should provide one or more `def step(...)` functions that should be called to update the generated sample iteratively.
+- Schedulers should provide a `set_timesteps(...)` method that configures the parameters of a schedule function for a specific inference task.
+- Schedulers should be framework-specific.
+
+The base class [`SchedulerMixin`] implements low level utilities used by multiple schedulers.
+
+### SchedulerMixin
 [[autodoc]] SchedulerMixin

-## SchedulerOutput
+### SchedulerOutput
+The class [`SchedulerOutput`] contains the outputs from any scheduler's `step(...)` call.
+
 [[autodoc]] schedulers.scheduling_utils.SchedulerOutput

-## KarrasDiffusionSchedulers
+### KarrasDiffusionSchedulers

-[`KarrasDiffusionSchedulers`] are a broad generalization of schedulers in 🤗 Diffusers. The schedulers in this class are distinguished at a high level by their noise sampling strategy, the type of network and scaling, the training strategy, and how the loss is weighed.
+`KarrasDiffusionSchedulers` encompasses the main generalization of schedulers in Diffusers. The schedulers in this class are distinguished, at a high level, by their noise sampling strategy; the type of network and scaling; and finally the training strategy or how the loss is weighed.

-The different schedulers in this class, depending on the ordinary differential equations (ODE) solver type, fall into the above taxonomy and provide a good abstraction for the design of the main schedulers implemented in 🤗 Diffusers. The schedulers in this class are given [here](https://github.com/huggingface/diffusers/blob/a69754bb879ed55b9b6dc9dd0b3cf4fa4124c765/src/diffusers/schedulers/scheduling_utils.py#L32).
+The different schedulers, depending on the type of ODE solver, fall into the above taxonomy and provide a good abstraction for the design of the main schedulers implemented in Diffusers. The schedulers in this class are given below:

-## PushToHubMixin
-
-[[autodoc]] utils.PushToHubMixin
+[[autodoc]] schedulers.scheduling_utils.KarrasDiffusionSchedulers
--- a/docs/source/en/api/schedulers/pndm.md
+++ b/docs/source/en/api/schedulers/pndm.md
@@ -10,12 +10,11 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
 specific language governing permissions and limitations under the License.
 -->

-# PNDMScheduler
+# Pseudo numerical methods for diffusion models (PNDM)

-`PNDMScheduler`, or pseudo numerical methods for diffusion models, uses more advanced ODE integration techniques like the Runge-Kutta and linear multi-step method. The original implementation can be found at [crowsonkb/k-diffusion](https://github.com/crowsonkb/k-diffusion/blob/481677d114f6ea445aa009cf5bd7a9cdee909e47/k_diffusion/sampling.py#L181).
+## Overview
+
+Original implementation can be found [here](https://github.com/crowsonkb/k-diffusion/blob/481677d114f6ea445aa009cf5bd7a9cdee909e47/k_diffusion/sampling.py#L181).

 ## PNDMScheduler
-[[autodoc]] PNDMScheduler
-
-## SchedulerOutput
-[[autodoc]] schedulers.scheduling_utils.SchedulerOutput
+[[autodoc]] PNDMScheduler
--- a/docs/source/en/api/schedulers/repaint.md
+++ b/docs/source/en/api/schedulers/repaint.md
@@ -10,18 +10,14 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
 specific language governing permissions and limitations under the License.
 -->

-# RePaintScheduler
+# RePaint scheduler

-`RePaintScheduler` is a DDPM-based inpainting scheduler for unsupervised inpainting with extreme masks. It is designed to be used with the [`RePaintPipeline`], and it is based on the paper [RePaint: Inpainting using Denoising Diffusion Probabilistic Models](https://huggingface.co/papers/2201.09865) by Andreas Lugmayr et al.
+## Overview

-The abstract from the paper is:
-
-*Free-form inpainting is the task of adding new content to an image in the regions specified by an arbitrary binary mask. Most existing approaches train for a certain distribution of masks, which limits their generalization capabilities to unseen mask types. Furthermore, training with pixel-wise and perceptual losses often leads to simple textural extensions towards the missing areas instead of semantically meaningful generation. In this work, we propose RePaint: A Denoising Diffusion Probabilistic Model (DDPM) based inpainting approach that is applicable to even extreme masks. We employ a pretrained unconditional DDPM as the generative prior. To condition the generation process, we only alter the reverse diffusion iterations by sampling the unmasked regions using the given image information. Since this technique does not modify or condition the original DDPM network itself, the model produces high-quality and diverse output images for any inpainting form. We validate our method for both faces and general-purpose image inpainting using standard and extreme masks. RePaint outperforms state-of-the-art Autoregressive, and GAN approaches for at least five out of six mask distributions. Github Repository: git.io/RePaint*.
-
-The original implementation can be found at [andreas128/RePaint](https://github.com/andreas128/).
+`RePaintScheduler` is a DDPM-based inpainting scheduler for unsupervised inpainting with extreme masks.
+It is intended for use with [`RePaintPipeline`].
+It is based on the paper [RePaint: Inpainting using Denoising Diffusion Probabilistic Models](https://arxiv.org/abs/2201.09865)
+and the [original implementation](https://github.com/andreas128/RePaint) by Andreas Lugmayr et al.

 ## RePaintScheduler
-[[autodoc]] RePaintScheduler
-
-## RePaintSchedulerOutput
-[[autodoc]] schedulers.scheduling_repaint.RePaintSchedulerOutput
+[[autodoc]] RePaintScheduler
--- a/docs/source/en/api/schedulers/score_sde_ve.md
+++ b/docs/source/en/api/schedulers/score_sde_ve.md
@@ -10,16 +10,11 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
 specific language governing permissions and limitations under the License.
 -->

-# ScoreSdeVeScheduler
+# Variance Exploding Stochastic Differential Equation (VE-SDE) scheduler

-`ScoreSdeVeScheduler` is a variance exploding stochastic differential equation (SDE) scheduler. It was introduced in the [Score-Based Generative Modeling through Stochastic Differential Equations](https://huggingface.co/papers/2011.13456) paper by Yang Song, Jascha Sohl-Dickstein, Diederik P. Kingma, Abhishek Kumar, Stefano Ermon, Ben Poole.
+## Overview

-The abstract from the paper is:
-
-*Creating noise from data is easy; creating data from noise is generative modeling. We present a stochastic differential equation (SDE) that smoothly transforms a complex data distribution to a known prior distribution by slowly injecting noise, and a corresponding reverse-time SDE that transforms the prior distribution back into the data distribution by slowly removing the noise. Crucially, the reverse-time SDE depends only on the time-dependent gradient field (a.k.a., score) of the perturbed data distribution. By leveraging advances in score-based generative modeling, we can accurately estimate these scores with neural networks, and use numerical SDE solvers to generate samples. We show that this framework encapsulates previous approaches in score-based generative modeling and diffusion probabilistic modeling, allowing for new sampling procedures and new modeling capabilities. In particular, we introduce a predictor-corrector framework to correct errors in the evolution of the discretized reverse-time SDE. We also derive an equivalent neural ODE that samples from the same distribution as the SDE, but additionally enables exact likelihood computation, and improved sampling efficiency. In addition, we provide a new way to solve inverse problems with score-based models, as demonstrated with experiments on class-conditional generation, image inpainting, and colorization. Combined with multiple architectural improvements, we achieve record-breaking performance for unconditional image generation on CIFAR-10 with an Inception score of 9.89 and FID of 2.20, a competitive likelihood of 2.99 bits/dim, and demonstrate high fidelity generation of 1024 x 1024 images for the first time from a score-based generative model*.
+Original paper can be found [here](https://arxiv.org/abs/2011.13456).

 ## ScoreSdeVeScheduler
 [[autodoc]] ScoreSdeVeScheduler
-
-## SdeVeOutput
-[[autodoc]] schedulers.scheduling_sde_ve.SdeVeOutput
--- a/docs/source/en/api/schedulers/score_sde_vp.md
+++ b/docs/source/en/api/schedulers/score_sde_vp.md
@@ -10,17 +10,15 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
 specific language governing permissions and limitations under the License.
 -->

-# ScoreSdeVpScheduler
+# Variance Preserving Stochastic Differential Equation (VP-SDE) scheduler

-`ScoreSdeVpScheduler` is a variance preserving stochastic differential equation (SDE) scheduler.  It was introduced in the [Score-Based Generative Modeling through Stochastic Differential Equations](https://huggingface.co/papers/2011.13456) paper by Yang Song, Jascha Sohl-Dickstein, Diederik P. Kingma, Abhishek Kumar, Stefano Ermon, Ben Poole.
+## Overview

-The abstract from the paper is:
-
-*Creating noise from data is easy; creating data from noise is generative modeling. We present a stochastic differential equation (SDE) that smoothly transforms a complex data distribution to a known prior distribution by slowly injecting noise, and a corresponding reverse-time SDE that transforms the prior distribution back into the data distribution by slowly removing the noise. Crucially, the reverse-time SDE depends only on the time-dependent gradient field (a.k.a., score) of the perturbed data distribution. By leveraging advances in score-based generative modeling, we can accurately estimate these scores with neural networks, and use numerical SDE solvers to generate samples. We show that this framework encapsulates previous approaches in score-based generative modeling and diffusion probabilistic modeling, allowing for new sampling procedures and new modeling capabilities. In particular, we introduce a predictor-corrector framework to correct errors in the evolution of the discretized reverse-time SDE. We also derive an equivalent neural ODE that samples from the same distribution as the SDE, but additionally enables exact likelihood computation, and improved sampling efficiency. In addition, we provide a new way to solve inverse problems with score-based models, as demonstrated with experiments on class-conditional generation, image inpainting, and colorization. Combined with multiple architectural improvements, we achieve record-breaking performance for unconditional image generation on CIFAR-10 with an Inception score of 9.89 and FID of 2.20, a competitive likelihood of 2.99 bits/dim, and demonstrate high fidelity generation of 1024 x 1024 images for the first time from a score-based generative model*.
+Original paper can be found [here](https://arxiv.org/abs/2011.13456).

 <Tip warning={true}>

-🚧 This scheduler is under construction!
+Score SDE-VP is under construction.

 </Tip>

--- a/docs/source/en/api/schedulers/singlestep_dpm_solver.md
+++ b/docs/source/en/api/schedulers/singlestep_dpm_solver.md
@@ -10,26 +10,11 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
 specific language governing permissions and limitations under the License.
 -->

-# DPMSolverSinglestepScheduler
+# Singlestep DPM-Solver

-`DPMSolverSinglestepScheduler` is a single step scheduler from [DPM-Solver: A Fast ODE Solver for Diffusion Probabilistic Model Sampling in Around 10 Steps](https://huggingface.co/papers/2206.00927) and [DPM-Solver++: Fast Solver for Guided Sampling of Diffusion Probabilistic Models](https://huggingface.co/papers/2211.01095) by Cheng Lu, Yuhao Zhou, Fan Bao, Jianfei Chen, Chongxuan Li, and Jun Zhu.
+## Overview

-DPMSolver (and the improved version DPMSolver++) is a fast dedicated high-order solver for diffusion ODEs with convergence order guarantee. Empirically, DPMSolver sampling with only 20 steps can generate high-quality
-samples, and it can generate quite good samples even in 10 steps.
-
-The original implementation can be found at [LuChengTHU/dpm-solver](https://github.com/LuChengTHU/dpm-solver).
-
-## Tips
-
-It is recommended to set `solver_order` to 2 for guided sampling, and `solver_order=3` for unconditional sampling.
-
-Dynamic thresholding from Imagen (https://huggingface.co/papers/2205.11487) is supported, and for pixel-space
-diffusion models, you can set both `algorithm_type="dpmsolver++"` and `thresholding=True` to use dynamic
-thresholding. This thresholding method is unsuitable for latent-space diffusion models such as
-Stable Diffusion.
+Original paper can be found [here](https://arxiv.org/abs/2206.00927) and the [improved version](https://arxiv.org/abs/2211.01095). The original implementation can be found [here](https://github.com/LuChengTHU/dpm-solver).

 ## DPMSolverSinglestepScheduler
-[[autodoc]] DPMSolverSinglestepScheduler
-
-## SchedulerOutput
-[[autodoc]] schedulers.scheduling_utils.SchedulerOutput
+[[autodoc]] DPMSolverSinglestepScheduler
--- a/docs/source/en/api/schedulers/stochastic_karras_ve.md
+++ b/docs/source/en/api/schedulers/stochastic_karras_ve.md
@@ -10,12 +10,11 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
 specific language governing permissions and limitations under the License.
 -->

-# KarrasVeScheduler
+# Variance exploding, stochastic sampling from Karras et al.

-`KarrasVeScheduler` is a stochastic sampler tailored to variance-expanding (VE) models. It is based on the [Elucidating the Design Space of Diffusion-Based Generative Models](https://huggingface.co/papers/2206.00364) and [Score-based generative modeling through stochastic differential equations](https://huggingface.co/papers/2011.13456) papers.
+## Overview
+
+Original paper can be found [here](https://arxiv.org/abs/2206.00364).

 ## KarrasVeScheduler
-[[autodoc]] KarrasVeScheduler
-
-## KarrasVeOutput
-[[autodoc]] schedulers.scheduling_karras_ve.KarrasVeOutput
+[[autodoc]] KarrasVeScheduler
--- a/docs/source/en/api/schedulers/unipc.md
+++ b/docs/source/en/api/schedulers/unipc.md
@@ -10,28 +10,15 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
 specific language governing permissions and limitations under the License.
 -->

-# UniPCMultistepScheduler
+# UniPC

-`UniPCMultistepScheduler` is a training-free framework designed for fast sampling of diffusion models. It was introduced in [UniPC: A Unified Predictor-Corrector Framework for Fast Sampling of Diffusion Models](https://huggingface.co/papers/2302.04867) by Wenliang Zhao, Lujia Bai, Yongming Rao, Jie Zhou, Jiwen Lu.
+## Overview

-It consists of a corrector (UniC) and a predictor (UniP) that share a unified analytical form and support arbitrary orders.
-UniPC is by design model-agnostic, supporting pixel-space/latent-space DPMs on unconditional/conditional sampling. It can also be applied to both noise prediction and data prediction models. The corrector UniC can be also applied after any off-the-shelf solvers to increase the order of accuracy.
+UniPC is a training-free framework designed for the fast sampling of diffusion models, which consists of a corrector (UniC) and a predictor (UniP) that share a unified analytical form and support arbitrary orders.

-The abstract from the paper is:
+For more details about the method, please refer to the [paper](https://arxiv.org/abs/2302.04867) and the [code](https://github.com/wl-zhao/UniPC).

-*Diffusion probabilistic models (DPMs) have demonstrated a very promising ability in high-resolution image synthesis. However, sampling from a pre-trained DPM usually requires hundreds of model evaluations, which is computationally expensive. Despite recent progress in designing high-order solvers for DPMs, there still exists room for further speedup, especially in extremely few steps (e.g., 5~10 steps). Inspired by the predictor-corrector for ODE solvers, we develop a unified corrector (UniC) that can be applied after any existing DPM sampler to increase the order of accuracy without extra model evaluations, and derive a unified predictor (UniP) that supports arbitrary order as a byproduct. Combining UniP and UniC, we propose a unified predictor-corrector framework called UniPC for the fast sampling of DPMs, which has a unified analytical form for any order and can significantly improve the sampling quality over previous methods. We evaluate our methods through extensive experiments including both unconditional and conditional sampling using pixel-space and latent-space DPMs. Our UniPC can achieve 3.87 FID on CIFAR10 (unconditional) and 7.51 FID on ImageNet 256times256 (conditional) with only 10 function evaluations. Code is available at https://github.com/wl-zhao/UniPC*.
-
-The original codebase can be found at [wl-zhao/UniPC](https://github.com/wl-zhao/UniPC).
-
-## Tips
-
-It is recommended to set `solver_order` to 2 for guide sampling, and `solver_order=3` for unconditional sampling.
-
-Dynamic thresholding from Imagen (https://huggingface.co/papers/2205.11487) is supported, and for pixel-space
-diffusion models, you can set both `predict_x0=True` and `thresholding=True` to use dynamic thresholding. This thresholding method is unsuitable for latent-space diffusion models such as Stable Diffusion.
+UniPC: A Unified Predictor-Corrector Framework for Fast Sampling of Diffusion Models.

 ## UniPCMultistepScheduler
 [[autodoc]] UniPCMultistepScheduler
-
-## SchedulerOutput
-[[autodoc]] schedulers.scheduling_utils.SchedulerOutput
--- a/docs/source/en/api/schedulers/vq_diffusion.md
+++ b/docs/source/en/api/schedulers/vq_diffusion.md
@@ -12,14 +12,9 @@ specific language governing permissions and limitations under the License.

 # VQDiffusionScheduler

-`VQDiffusionScheduler` converts the transformer model's output into a sample for the unnoised image at the previous diffusion timestep. It was introduced in [Vector Quantized Diffusion Model for Text-to-Image Synthesis](https://huggingface.co/papers/2111.14822) by Shuyang Gu, Dong Chen, Jianmin Bao, Fang Wen, Bo Zhang, Dongdong Chen, Lu Yuan, Baining Guo.
+## Overview

-The abstract from the paper is:
-
-*We present the vector quantized diffusion (VQ-Diffusion) model for text-to-image generation. This method is based on a vector quantized variational autoencoder (VQ-VAE) whose latent space is modeled by a conditional variant of the recently developed Denoising Diffusion Probabilistic Model (DDPM). We find that this latent-space method is well-suited for text-to-image generation tasks because it not only eliminates the unidirectional bias with existing methods but also allows us to incorporate a mask-and-replace diffusion strategy to avoid the accumulation of errors, which is a serious problem with existing methods. Our experiments show that the VQ-Diffusion produces significantly better text-to-image generation results when compared with conventional autoregressive (AR) models with similar numbers of parameters. Compared with previous GAN-based text-to-image methods, our VQ-Diffusion can handle more complex scenes and improve the synthesized image quality by a large margin. Finally, we show that the image generation computation in our method can be made highly efficient by reparameterization. With traditional AR methods, the text-to-image generation time increases linearly with the output image resolution and hence is quite time consuming even for normal size images. The VQ-Diffusion allows us to achieve a better trade-off between quality and speed. Our experiments indicate that the VQ-Diffusion model with the reparameterization is fifteen times faster than traditional AR methods while achieving a better image quality.*
+Original paper can be found [here](https://arxiv.org/abs/2111.14822).

 ## VQDiffusionScheduler
-[[autodoc]] VQDiffusionScheduler
-
-## VQDiffusionSchedulerOutput
-[[autodoc]] schedulers.scheduling_vq_diffusion.VQDiffusionSchedulerOutput
+[[autodoc]] VQDiffusionScheduler
--- a/docs/source/en/api/utilities.md
+++ b/docs/source/en/api/utilities.md
@@ -20,8 +20,4 @@ Utility and helper functions for working with 🤗 Diffusers.

 ## export_to_video

-[[autodoc]] utils.testing_utils.export_to_video
-
-## make_image_grid
-
-[[autodoc]] utils.pil_utils.make_image_grid
+[[autodoc]] utils.testing_utils.export_to_video
--- a/docs/source/en/conceptual/evaluation.md
+++ b/docs/source/en/conceptual/evaluation.md
@@ -334,7 +334,7 @@ image_processor = CLIPImageProcessor.from_pretrained(clip_id)
 image_encoder = CLIPVisionModelWithProjection.from_pretrained(clip_id).to(device)
 ```

-Notice that we are using a particular CLIP checkpoint, i.e., `openai/clip-vit-large-patch14`. This is because the Stable Diffusion pre-training was performed with this CLIP variant. For more details, refer to the [documentation](https://huggingface.co/docs/transformers/model_doc/clip).
+Notice that we are using a particular CLIP checkpoint, i.e., `openai/clip-vit-large-patch14`. This is because the Stable Diffusion pre-training was performed with this CLIP variant. For more details, refer to the [documentation](https://huggingface.co/docs/diffusers/main/en/api/pipelines/stable_diffusion/pix2pix#diffusers.StableDiffusionInstructPix2PixPipeline.text_encoder).

 Next, we prepare a PyTorch `nn.Module` to compute directional similarity:

--- a/docs/source/en/conceptual/philosophy.md
+++ b/docs/source/en/conceptual/philosophy.md
@@ -90,7 +90,7 @@ The following design principles are followed:
 - To integrate new model checkpoints whose general architecture can be classified as an architecture that already exists in Diffusers, the existing model architecture shall be adapted to make it work with the new checkpoint. One should only create a new file if the model architecture is fundamentally different.
 - Models should be designed to be easily extendable to future changes. This can be achieved by limiting public function arguments, configuration arguments, and "foreseeing" future changes, *e.g.* it is usually better to add `string` "...type" arguments that can easily be extended to new future types instead of boolean `is_..._type` arguments. Only the minimum amount of changes shall be made to existing architectures to make a new model checkpoint work.
 - The model design is a difficult trade-off between keeping code readable and concise and supporting many model checkpoints. For most parts of the modeling code, classes shall be adapted for new model checkpoints, while there are some exceptions where it is preferred to add new classes to make sure the code is kept concise and 
-readable longterm, such as [UNet blocks](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/unet_2d_blocks.py) and [Attention processors](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+readable longterm, such as [UNet blocks](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/unet_2d_blocks.py) and [Attention processors](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py).

 ### Schedulers

--- a/docs/source/en/optimization/fp16.md
+++ b/docs/source/en/optimization/fp16.md
@@ -51,7 +51,6 @@ from diffusers import DiffusionPipeline
 pipe = DiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
    torch_dtype=torch.float16,
-    use_safetensors=True,
 )
 pipe = pipe.to("cuda")

@@ -66,11 +65,42 @@ image = pipe(prompt).images[0]
  
 </Tip>

+## Sliced attention for additional memory savings
+
+For additional memory savings, you can use a sliced version of attention that performs the computation in steps instead of all at once.
+
+<Tip>
+  Attention slicing is useful even if a batch size of just 1 is used - as long
+  as the model uses more than one attention head. If there is more than one
+  attention head the *QK^T* attention matrix can be computed sequentially for
+  each head which can save a significant amount of memory.
+</Tip>
+
+To perform the attention computation sequentially over each head, you only need to invoke [`~DiffusionPipeline.enable_attention_slicing`] in your pipeline before inference, like here:
+
+```Python
+import torch
+from diffusers import DiffusionPipeline
+
+pipe = DiffusionPipeline.from_pretrained(
+    "runwayml/stable-diffusion-v1-5",
+    torch_dtype=torch.float16,
+)
+pipe = pipe.to("cuda")
+
+prompt = "a photo of an astronaut riding a horse on mars"
+pipe.enable_attention_slicing()
+image = pipe(prompt).images[0]
+```
+
+There's a small performance penalty of about 10% slower inference times, but this method allows you to use Stable Diffusion in as little as 3.2 GB of VRAM!
+
+
 ## Sliced VAE decode for larger batches

 To decode large batches of images with limited VRAM, or to enable batches with 32 images or more, you can use sliced VAE decode that decodes the batch latents one image at a time.

-You likely want to couple this with [`~StableDiffusionPipeline.enable_xformers_memory_efficient_attention`] to further minimize memory use.
+You likely want to couple this with [`~StableDiffusionPipeline.enable_attention_slicing`] or [`~StableDiffusionPipeline.enable_xformers_memory_efficient_attention`] to further minimize memory use.

 To perform the VAE decode one image at a time, invoke [`~StableDiffusionPipeline.enable_vae_slicing`] in your pipeline before inference. For example:

@@ -81,7 +111,6 @@ from diffusers import StableDiffusionPipeline
 pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
    torch_dtype=torch.float16,
-    use_safetensors=True,
 )
 pipe = pipe.to("cuda")

@@ -97,7 +126,7 @@ You may see a small performance boost in VAE decode on multi-image batches. Ther

 Tiled VAE processing makes it possible to work with large images on limited VRAM. For example, generating 4k images in 8GB of VRAM. Tiled VAE decoder splits the image into overlapping tiles, decodes the tiles, and blends the outputs to make the final image.

-You want to couple this with [`~StableDiffusionPipeline.enable_xformers_memory_efficient_attention`] to further minimize memory use.
+You want to couple this with [`~StableDiffusionPipeline.enable_attention_slicing`] or [`~StableDiffusionPipeline.enable_xformers_memory_efficient_attention`] to further minimize memory use.

 To use tiled VAE processing, invoke [`~StableDiffusionPipeline.enable_vae_tiling`] in your pipeline before inference. For example:

@@ -108,7 +137,6 @@ from diffusers import StableDiffusionPipeline, UniPCMultistepScheduler
 pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
    torch_dtype=torch.float16,
-    use_safetensors=True,
 )
 pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
 pipe = pipe.to("cuda")
@@ -136,7 +164,6 @@ from diffusers import StableDiffusionPipeline
 pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
    torch_dtype=torch.float16,
-    use_safetensors=True,
 )

 prompt = "a photo of an astronaut riding a horse on mars"
@@ -161,11 +188,11 @@ from diffusers import StableDiffusionPipeline
 pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
    torch_dtype=torch.float16,
-    use_safetensors=True,
 )

 prompt = "a photo of an astronaut riding a horse on mars"
 pipe.enable_sequential_cpu_offload()
+pipe.enable_attention_slicing(1)

 image = pipe(prompt).images[0]
 ```
@@ -194,7 +221,6 @@ from diffusers import StableDiffusionPipeline
 pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",  
    torch_dtype=torch.float16,
-    use_safetensors=True,
 )

 prompt = "a photo of an astronaut riding a horse on mars"
@@ -211,11 +237,11 @@ from diffusers import StableDiffusionPipeline
 pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
    torch_dtype=torch.float16,
-    use_safetensors=True,
 )

 prompt = "a photo of an astronaut riding a horse on mars"
 pipe.enable_model_cpu_offload()
+pipe.enable_attention_slicing(1)

 image = pipe(prompt).images[0]
 ```
@@ -274,7 +300,6 @@ def generate_inputs():
 pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
    torch_dtype=torch.float16,
-    use_safetensors=True,
 ).to("cuda")
 unet = pipe.unet
 unet.eval()
@@ -338,7 +363,6 @@ class UNet2DConditionOutput:
 pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
    torch_dtype=torch.float16,
-    use_safetensors=True,
 ).to("cuda")

 # use jitted unet
@@ -398,7 +422,6 @@ import torch
 pipe = DiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
    torch_dtype=torch.float16,
-    use_safetensors=True,
 ).to("cuda")

 pipe.enable_xformers_memory_efficient_attention()
--- a/docs/source/en/optimization/onnx.md
+++ b/docs/source/en/optimization/onnx.md
@@ -11,7 +11,7 @@ specific language governing permissions and limitations under the License.
 -->


-# How to use ONNX Runtime for inference
+# How to use the ONNX Runtime for inference

 🤗 [Optimum](https://github.com/huggingface/optimum) provides a Stable Diffusion pipeline compatible with ONNX Runtime. 

@@ -27,7 +27,7 @@ pip install optimum["onnxruntime"]

 ### Inference

-To load an ONNX model and run inference with ONNX Runtime, you need to replace [`StableDiffusionPipeline`] with `ORTStableDiffusionPipeline`. In case you want to load a PyTorch model and convert it to the ONNX format on-the-fly, you can set `export=True`.
+To load an ONNX model and run inference with the ONNX Runtime, you need to replace [`StableDiffusionPipeline`] with `ORTStableDiffusionPipeline`. In case you want to load a PyTorch model and convert it to the ONNX format on-the-fly, you can set `export=True`.

 ```python
 from optimum.onnxruntime import ORTStableDiffusionPipeline
@@ -86,13 +86,12 @@ optimum-cli export onnx --model stabilityai/stable-diffusion-xl-base-1.0 --task

 ### Inference

-Here is an example of how you can load a SDXL ONNX model from [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0) and run inference with ONNX Runtime :
+To load an ONNX model and run inference with ONNX Runtime, you need to replace `StableDiffusionXLPipeline` with `ORTStableDiffusionXLPipeline`:

 ```python
 from optimum.onnxruntime import ORTStableDiffusionXLPipeline

-model_id = "stabilityai/stable-diffusion-xl-base-1.0"
-pipeline = ORTStableDiffusionXLPipeline.from_pretrained(model_id)
+pipeline = ORTStableDiffusionXLPipeline.from_pretrained("sd_xl_onnx")
 prompt = "sailing ship in storm by Leonardo da Vinci"
 image = pipeline(prompt).images[0]
 ```
--- a/docs/source/en/optimization/open_vino.md
+++ b/docs/source/en/optimization/open_vino.md
@@ -85,13 +85,11 @@ You can find more examples in the optimum [documentation](https://huggingface.co

 ### Inference

-Here is an example of how you can load a SDXL OpenVINO model from [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0) and run inference with OpenVINO Runtime :
-
 ```python
 from optimum.intel import OVStableDiffusionXLPipeline

 model_id = "stabilityai/stable-diffusion-xl-base-1.0"
-pipeline = OVStableDiffusionXLPipeline.from_pretrained(model_id)
+pipeline = OVStableDiffusionXLPipeline.from_pretrained(model_id, export=True)
 prompt = "sailing ship in storm by Rembrandt"
 image = pipeline(prompt).images[0]
 ```
--- a/docs/source/en/optimization/torch2.0.md
+++ b/docs/source/en/optimization/torch2.0.md
@@ -39,7 +39,7 @@ pip install --upgrade torch diffusers
    import torch
    from diffusers import DiffusionPipeline

-    pipe = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16, use_safetensors=True)
+    pipe = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16)
    pipe = pipe.to("cuda")

    prompt = "a photo of an astronaut riding a horse on mars"
@@ -53,7 +53,7 @@ pip install --upgrade torch diffusers
    from diffusers import DiffusionPipeline
    + from diffusers.models.attention_processor import AttnProcessor2_0

-    pipe = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16, use_safetensors=True).to("cuda")
+    pipe = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16).to("cuda")
    + pipe.unet.set_attn_processor(AttnProcessor2_0())

    prompt = "a photo of an astronaut riding a horse on mars"
@@ -69,7 +69,7 @@ pip install --upgrade torch diffusers
    from diffusers import DiffusionPipeline
    from diffusers.models.attention_processor import AttnProcessor

-    pipe = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16, use_safetensors=True).to("cuda")
+    pipe = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16).to("cuda")
    pipe.unet.set_default_attn_processor()

    prompt = "a photo of an astronaut riding a horse on mars"
@@ -107,7 +107,7 @@ path = "runwayml/stable-diffusion-v1-5"

 run_compile = True  # Set True / False

-pipe = DiffusionPipeline.from_pretrained(path, torch_dtype=torch.float16, use_safetensors=True)
+pipe = DiffusionPipeline.from_pretrained(path, torch_dtype=torch.float16)
 pipe = pipe.to("cuda")
 pipe.unet.to(memory_format=torch.channels_last)

@@ -140,7 +140,7 @@ path = "runwayml/stable-diffusion-v1-5"

 run_compile = True  # Set True / False

-pipe = StableDiffusionImg2ImgPipeline.from_pretrained(path, torch_dtype=torch.float16, use_safetensors=True)
+pipe = StableDiffusionImg2ImgPipeline.from_pretrained(path, torch_dtype=torch.float16)
 pipe = pipe.to("cuda")
 pipe.unet.to(memory_format=torch.channels_last)

@@ -180,7 +180,7 @@ path = "runwayml/stable-diffusion-inpainting"

 run_compile = True  # Set True / False

-pipe = StableDiffusionInpaintPipeline.from_pretrained(path, torch_dtype=torch.float16, use_safetensors=True)
+pipe = StableDiffusionInpaintPipeline.from_pretrained(path, torch_dtype=torch.float16)
 pipe = pipe.to("cuda")
 pipe.unet.to(memory_format=torch.channels_last)

@@ -212,9 +212,9 @@ init_image = init_image.resize((512, 512))
 path = "runwayml/stable-diffusion-v1-5"

 run_compile = True  # Set True / False
-controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny", torch_dtype=torch.float16, use_safetensors=True)
+controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny", torch_dtype=torch.float16)
 pipe = StableDiffusionControlNetPipeline.from_pretrained(
-    path, controlnet=controlnet, torch_dtype=torch.float16, use_safetensors=True
+    path, controlnet=controlnet, torch_dtype=torch.float16
 )

 pipe = pipe.to("cuda")
@@ -240,11 +240,11 @@ import torch

 run_compile = True  # Set True / False

-pipe = DiffusionPipeline.from_pretrained("DeepFloyd/IF-I-M-v1.0", variant="fp16", text_encoder=None, torch_dtype=torch.float16, use_safetensors=True)
+pipe = DiffusionPipeline.from_pretrained("DeepFloyd/IF-I-M-v1.0", variant="fp16", text_encoder=None, torch_dtype=torch.float16)
 pipe.to("cuda")
-pipe_2 = DiffusionPipeline.from_pretrained("DeepFloyd/IF-II-M-v1.0", variant="fp16", text_encoder=None, torch_dtype=torch.float16, use_safetensors=True)
+pipe_2 = DiffusionPipeline.from_pretrained("DeepFloyd/IF-II-M-v1.0", variant="fp16", text_encoder=None, torch_dtype=torch.float16)
 pipe_2.to("cuda")
-pipe_3 = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-x4-upscaler", torch_dtype=torch.float16, use_safetensors=True)
+pipe_3 = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-x4-upscaler", torch_dtype=torch.float16)
 pipe_3.to("cuda")


--- a/docs/source/en/quicktour.md
+++ b/docs/source/en/quicktour.md
@@ -67,7 +67,7 @@ Load the model with the [`~DiffusionPipeline.from_pretrained`] method:
 ```python
 >>> from diffusers import DiffusionPipeline

->>> pipeline = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", use_safetensors=True)
+>>> pipeline = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
 ```

 The [`DiffusionPipeline`] downloads and caches all modeling, tokenization, and scheduling components. You'll see that the Stable Diffusion pipeline is composed of the [`UNet2DConditionModel`] and [`PNDMScheduler`] among other things:
@@ -130,7 +130,7 @@ You can also use the pipeline locally. The only difference is you need to downlo
 Then load the saved weights into the pipeline:

 ```python
->>> pipeline = DiffusionPipeline.from_pretrained("./stable-diffusion-v1-5", use_safetensors=True)
+>>> pipeline = DiffusionPipeline.from_pretrained("./stable-diffusion-v1-5")
 ```

 Now you can run the pipeline as you would in the section above.
@@ -142,7 +142,7 @@ Different schedulers come with different denoising speeds and quality trade-offs
 ```py
 >>> from diffusers import EulerDiscreteScheduler

->>> pipeline = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", use_safetensors=True)
+>>> pipeline = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
 >>> pipeline.scheduler = EulerDiscreteScheduler.from_config(pipeline.scheduler.config)
 ```

@@ -160,7 +160,7 @@ Models are initiated with the [`~ModelMixin.from_pretrained`] method which also
 >>> from diffusers import UNet2DModel

 >>> repo_id = "google/ddpm-cat-256"
->>> model = UNet2DModel.from_pretrained(repo_id, use_safetensors=True)
+>>> model = UNet2DModel.from_pretrained(repo_id)
 ```

 To access the model parameters, call `model.config`:
--- a/docs/source/en/stable_diffusion.md
+++ b/docs/source/en/stable_diffusion.md
@@ -26,7 +26,7 @@ Begin by loading the [`runwayml/stable-diffusion-v1-5`](https://huggingface.co/r
 from diffusers import DiffusionPipeline

 model_id = "runwayml/stable-diffusion-v1-5"
-pipeline = DiffusionPipeline.from_pretrained(model_id, use_safetensors=True)
+pipeline = DiffusionPipeline.from_pretrained(model_id)
 ```

 The example prompt you'll use is a portrait of an old warrior chief, but feel free to use your own prompt:
@@ -75,7 +75,7 @@ Let's start by loading the model in `float16` and generate an image:
 ```python
 import torch

-pipeline = DiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16, use_safetensors=True)
+pipeline = DiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16)
 pipeline = pipeline.to("cuda")
 generator = torch.Generator("cuda").manual_seed(0)
 image = pipeline(prompt, generator=generator).images[0]
@@ -152,13 +152,26 @@ def get_inputs(batch_size=1):
    return {"prompt": prompts, "generator": generator, "num_inference_steps": num_inference_steps}
 ```

+You'll also need a function that'll display each batch of images:
+
+```python
+from PIL import Image
+
+
+def image_grid(imgs, rows=2, cols=2):
+    w, h = imgs[0].size
+    grid = Image.new("RGB", size=(cols * w, rows * h))
+
+    for i, img in enumerate(imgs):
+        grid.paste(img, box=(i % cols * w, i // cols * h))
+    return grid
+```
+
 Start with `batch_size=4` and see how much memory you've consumed:

 ```python
-from diffusers.utils import make_image_grid 
-
 images = pipeline(**get_inputs(batch_size=4)).images
-make_image_grid(images, 2, 2)
+image_grid(images)
 ```

 Unless you have a GPU with more RAM, the code above probably returned an `OOM` error! Most of the memory is taken up by the cross-attention layers. Instead of running this operation in a batch, you can run it sequentially to save a significant amount of memory. All you have to do is configure the pipeline to use the [`~DiffusionPipeline.enable_attention_slicing`] function:
@@ -171,7 +184,7 @@ Now try increasing the `batch_size` to 8!

 ```python
 images = pipeline(**get_inputs(batch_size=8)).images
-make_image_grid(images, rows=2, cols=4)
+image_grid(images, rows=2, cols=4)
 ```

 <div class="flex justify-center">
@@ -200,7 +213,7 @@ from diffusers import AutoencoderKL
 vae = AutoencoderKL.from_pretrained("stabilityai/sd-vae-ft-mse", torch_dtype=torch.float16).to("cuda")
 pipeline.vae = vae
 images = pipeline(**get_inputs(batch_size=8)).images
-make_image_grid(images, rows=2, cols=4)
+image_grid(images, rows=2, cols=4)
 ```

 <div class="flex justify-center">
@@ -225,7 +238,7 @@ Generate a batch of images with the new prompt:

 ```python
 images = pipeline(**get_inputs(batch_size=8)).images
-make_image_grid(images, rows=2, cols=4)
+image_grid(images, rows=2, cols=4)
 ```

 <div class="flex justify-center">
@@ -244,7 +257,7 @@ prompts = [

 generator = [torch.Generator("cuda").manual_seed(1) for _ in range(len(prompts))]
 images = pipeline(prompt=prompts, generator=generator, num_inference_steps=25).images
-make_image_grid(images, 2, 2)
+image_grid(images)
 ```

 <div class="flex justify-center">
--- a/docs/source/en/training/adapt_a_model.md
+++ b/docs/source/en/training/adapt_a_model.md
@@ -11,7 +11,7 @@ A [`UNet2DConditionModel`] by default accepts 4 channels in the [input sample](h
 ```py
 from diffusers import StableDiffusionPipeline

-pipeline = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", use_safetensors=True)
+pipeline = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
 pipeline.unet.config["in_channels"]
 4
 ```
@@ -21,7 +21,7 @@ Inpainting requires 9 channels in the input sample. You can check this value in
 ```py
 from diffusers import StableDiffusionPipeline

-pipeline = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-inpainting", use_safetensors=True)
+pipeline = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-inpainting")
 pipeline.unet.config["in_channels"]
 9
 ```
@@ -35,12 +35,7 @@ from diffusers import UNet2DConditionModel

 model_id = "runwayml/stable-diffusion-v1-5"
 unet = UNet2DConditionModel.from_pretrained(
-    model_id,
-    subfolder="unet",
-    in_channels=9,
-    low_cpu_mem_usage=False,
-    ignore_mismatched_sizes=True,
-    use_safetensors=True,
+    model_id, subfolder="unet", in_channels=9, low_cpu_mem_usage=False, ignore_mismatched_sizes=True
 )
 ```

--- a/docs/source/en/training/controlnet.md
+++ b/docs/source/en/training/controlnet.md
@@ -265,7 +265,7 @@ distributed_type: DEEPSPEED

 See [documentation](https://huggingface.co/docs/accelerate/usage_guides/deepspeed) for more DeepSpeed configuration options.

-</Tip>
+</Tip>

 Changing the default Adam optimizer to DeepSpeed's Adam
 `deepspeed.ops.adam.DeepSpeedCPUAdam` gives a substantial speedup but
@@ -306,9 +306,9 @@ import torch
 base_model_path = "path to model"
 controlnet_path = "path to controlnet"

-controlnet = ControlNetModel.from_pretrained(controlnet_path, torch_dtype=torch.float16, use_safetensors=True)
+controlnet = ControlNetModel.from_pretrained(controlnet_path, torch_dtype=torch.float16)
 pipe = StableDiffusionControlNetPipeline.from_pretrained(
-    base_model_path, controlnet=controlnet, torch_dtype=torch.float16, use_safetensors=True
+    base_model_path, controlnet=controlnet, torch_dtype=torch.float16
 )

 # speed up diffusion process with faster scheduler and memory optimization
@@ -327,7 +327,3 @@ image = pipe(prompt, num_inference_steps=20, generator=generator, image=control_

 image.save("./output.png")
 ```
-
-## Stable Diffusion XL
-
-Training with [Stable Diffusion XL](https://huggingface.co/papers/2307.01952) is also supported via the `train_controlnet_sdxl.py` script. Please refer to the docs [here](https://github.com/huggingface/diffusers/blob/main/examples/controlnet/README_sdxl.md). 
--- a/docs/source/en/training/custom_diffusion.md
+++ b/docs/source/en/training/custom_diffusion.md
@@ -222,9 +222,7 @@ Once you have trained a model using the above command, you can run inference usi
 import torch
 from diffusers import DiffusionPipeline

-pipe = DiffusionPipeline.from_pretrained(
-    "CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16, use_safetensors=True
-).to("cuda")
+pipe = DiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16).to("cuda")
 pipe.unet.load_attn_procs("path-to-save-model", weight_name="pytorch_custom_diffusion_weights.bin")
 pipe.load_textual_inversion("path-to-save-model", weight_name="<new1>.bin")

@@ -248,7 +246,7 @@ model_id = "sayakpaul/custom-diffusion-cat"
 card = RepoCard.load(model_id)
 base_model_id = card.data.to_dict()["base_model"]

-pipe = DiffusionPipeline.from_pretrained(base_model_id, torch_dtype=torch.float16, use_safetensors=True).to("cuda")
+pipe = DiffusionPipeline.from_pretrained(base_model_id, torch_dtype=torch.float16).to("cuda")
 pipe.unet.load_attn_procs(model_id, weight_name="pytorch_custom_diffusion_weights.bin")
 pipe.load_textual_inversion(model_id, weight_name="<new1>.bin")

@@ -272,7 +270,7 @@ model_id = "sayakpaul/custom-diffusion-cat-wooden-pot"
 card = RepoCard.load(model_id)
 base_model_id = card.data.to_dict()["base_model"]

-pipe = DiffusionPipeline.from_pretrained(base_model_id, torch_dtype=torch.float16, use_safetensors=True).to("cuda")
+pipe = DiffusionPipeline.from_pretrained(base_model_id, torch_dtype=torch.float16).to("cuda")
 pipe.unet.load_attn_procs(model_id, weight_name="pytorch_custom_diffusion_weights.bin")
 pipe.load_textual_inversion(model_id, weight_name="<new1>.bin")
 pipe.load_textual_inversion(model_id, weight_name="<new2>.bin")
--- a/docs/source/en/training/distributed_inference.md
+++ b/docs/source/en/training/distributed_inference.md
@@ -16,9 +16,7 @@ Now use the [`~accelerate.PartialState.split_between_processes`] utility as a co
 from accelerate import PartialState
 from diffusers import DiffusionPipeline

-pipeline = DiffusionPipeline.from_pretrained(
-    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16, use_safetensors=True
-)
+pipeline = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16)
 distributed_state = PartialState()
 pipeline.to(distributed_state.device)

@@ -52,9 +50,7 @@ import torch.multiprocessing as mp

 from diffusers import DiffusionPipeline

-sd = DiffusionPipeline.from_pretrained(
-    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16, use_safetensors=True
-)
+sd = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16)
 ```

 You'll want to create a function to run inference; [`init_process_group`](https://pytorch.org/docs/stable/distributed.html?highlight=init_process_group#torch.distributed.init_process_group) handles creating a distributed environment with the type of backend to use, the `rank` of the current process, and the `world_size` or the number of processes participating. If you're running inference in parallel over 2 GPUs, then the `world_size` is 2.
--- a/docs/source/en/training/dreambooth.md
+++ b/docs/source/en/training/dreambooth.md
@@ -303,9 +303,7 @@ unet = UNet2DConditionModel.from_pretrained("/sddata/dreambooth/daruma-v2-1/chec
 # if you have trained with `--train_text_encoder` make sure to also load the text encoder
 text_encoder = CLIPTextModel.from_pretrained("/sddata/dreambooth/daruma-v2-1/checkpoint-100/text_encoder")

-pipeline = DiffusionPipeline.from_pretrained(
-    model_id, unet=unet, text_encoder=text_encoder, dtype=torch.float16, use_safetensors=True
-)
+pipeline = DiffusionPipeline.from_pretrained(model_id, unet=unet, text_encoder=text_encoder, torch_dtype=torch.float16)
 pipeline.to("cuda")

 # Perform inference, or save, or push to the hub
@@ -320,7 +318,7 @@ from diffusers import DiffusionPipeline

 # Load the pipeline with the same arguments (model, revision) that were used for training
 model_id = "CompVis/stable-diffusion-v1-4"
-pipeline = DiffusionPipeline.from_pretrained(model_id, use_safetensors=True)
+pipeline = DiffusionPipeline.from_pretrained(model_id)

 accelerator = Accelerator()

@@ -335,7 +333,6 @@ pipeline = DiffusionPipeline.from_pretrained(
    model_id,
    unet=accelerator.unwrap_model(unet),
    text_encoder=accelerator.unwrap_model(text_encoder),
-    use_safetensors=True,
 )

 # Perform inference, or save, or push to the hub
@@ -491,7 +488,7 @@ from diffusers import DiffusionPipeline
 import torch

 model_id = "path_to_saved_model"
-pipe = DiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16, use_safetensors=True).to("cuda")
+pipe = DiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16).to("cuda")

 prompt = "A photo of sks dog in a bucket"
 image = pipe(prompt, num_inference_steps=50, guidance_scale=7.5).images[0]
@@ -513,7 +510,7 @@ must also update the pipeline's scheduler config.
 ```py
 from diffusers import DiffusionPipeline

-pipe = DiffusionPipeline.from_pretrained("DeepFloyd/IF-I-XL-v1.0", use_safetensors=True)
+pipe = DiffusionPipeline.from_pretrained("DeepFloyd/IF-I-XL-v1.0")

 pipe.load_lora_weights("<lora weights path>")

@@ -707,4 +704,4 @@ accelerate launch train_dreambooth.py \

 ## Stable Diffusion XL

-We support fine-tuning of the UNet and text encoders shipped in [Stable Diffusion XL](https://huggingface.co/papers/2307.01952) with DreamBooth and LoRA via the `train_dreambooth_lora_sdxl.py` script. Please refer to the docs [here](https://github.com/huggingface/diffusers/blob/main/examples/dreambooth/README_sdxl.md). 
+We support fine-tuning of the UNet shipped in [Stable Diffusion XL](https://huggingface.co/papers/2307.01952) with DreamBooth and LoRA via the `train_dreambooth_lora_sdxl.py` script. Please refer to the docs [here](https://github.com/huggingface/diffusers/blob/main/examples/dreambooth/README_sdxl.md). 
--- a/docs/source/en/training/instructpix2pix.md
+++ b/docs/source/en/training/instructpix2pix.md
@@ -165,9 +165,7 @@ import torch
 from diffusers import StableDiffusionInstructPix2PixPipeline

 model_id = "your_model_id"  # <- replace this
-pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained(
-    model_id, torch_dtype=torch.float16, use_safetensors=True
-).to("cuda")
+pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained(model_id, torch_dtype=torch.float16).to("cuda")
 generator = torch.Generator("cuda").manual_seed(0)

 url = "https://huggingface.co/datasets/sayakpaul/sample-datasets/resolve/main/test_pix2pix_4.png"
@@ -214,4 +212,4 @@ If you're looking for some interesting ways to use the InstructPix2Pix training

 ## Stable Diffusion XL

-Training with [Stable Diffusion XL](https://huggingface.co/papers/2307.01952) is also supported via the `train_instruct_pix2pix_sdxl.py` script. Please refer to the docs [here](https://github.com/huggingface/diffusers/blob/main/examples/instruct_pix2pix/README_sdxl.md). 
+Training with [Stable Diffusion XL](https://huggingface.co/papers/2307.01952) is also supported via the `train_instruct_pix2pix_sdxl.py` script. Please refer to the docs [here](https://github.com/huggingface/diffusers/blob/main/examples/instruct_pix2pix/README_sdxl.md). 
--- a/docs/source/en/training/lora.md
+++ b/docs/source/en/training/lora.md
@@ -98,7 +98,7 @@ Now you can use the model for inference by loading the base model in the [`Stabl

 >>> model_base = "runwayml/stable-diffusion-v1-5"

->>> pipe = StableDiffusionPipeline.from_pretrained(model_base, torch_dtype=torch.float16, use_safetensors=True)
+>>> pipe = StableDiffusionPipeline.from_pretrained(model_base, torch_dtype=torch.float16)
 >>> pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
 ```

@@ -137,7 +137,7 @@ lora_model_id = "sayakpaul/sd-model-finetuned-lora-t4"
 card = RepoCard.load(lora_model_id)
 base_model_id = card.data.to_dict()["base_model"]

-pipe = StableDiffusionPipeline.from_pretrained(base_model_id, torch_dtype=torch.float16, use_safetensors=True)
+pipe = StableDiffusionPipeline.from_pretrained(base_model_id, torch_dtype=torch.float16)
 ...
 ```

@@ -211,7 +211,7 @@ Now you can use the model for inference by loading the base model in the [`Stabl

 >>> model_base = "runwayml/stable-diffusion-v1-5"

->>> pipe = StableDiffusionPipeline.from_pretrained(model_base, torch_dtype=torch.float16, use_safetensors=True)
+>>> pipe = StableDiffusionPipeline.from_pretrained(model_base, torch_dtype=torch.float16)
 ```

 Load the LoRA weights from your finetuned DreamBooth model *on top of the base model weights*, and then move the pipeline to a GPU for faster inference. When you merge the LoRA weights with the frozen pretrained model weights, you can optionally adjust how much of the weights to merge with the `scale` parameter:
@@ -251,7 +251,7 @@ lora_model_id = "sayakpaul/dreambooth-text-encoder-test"
 card = RepoCard.load(lora_model_id)
 base_model_id = card.data.to_dict()["base_model"]

-pipe = StableDiffusionPipeline.from_pretrained(base_model_id, torch_dtype=torch.float16, use_safetensors=True)
+pipe = StableDiffusionPipeline.from_pretrained(base_model_id, torch_dtype=torch.float16)
 pipe = pipe.to("cuda")
 pipe.load_lora_weights(lora_model_id)
 image = pipe("A picture of a sks dog in a bucket", num_inference_steps=25).images[0]
@@ -276,40 +276,20 @@ Note that the use of [`~diffusers.loaders.LoraLoaderMixin.load_lora_weights`] is

 * LoRA parameters that have separate identifiers for the UNet and the text encoder such as: [`"sayakpaul/dreambooth"`](https://huggingface.co/sayakpaul/dreambooth).

-<Tip>
-
-You can also provide a local directory path to [`~diffusers.loaders.LoraLoaderMixin.load_lora_weights`] as well as [`~diffusers.loaders.UNet2DConditionLoadersMixin.load_attn_procs`].
-
-</Tip>
-
-## Stable Diffusion XL
-
-We support fine-tuning with [Stable Diffusion XL](https://huggingface.co/papers/2307.01952). Please refer to the following docs:
-
-* [text_to_image/README_sdxl.md](https://github.com/huggingface/diffusers/blob/main/examples/text_to_image/README_sdxl.md)
-* [dreambooth/README_sdxl.md](https://github.com/huggingface/diffusers/blob/main/examples/dreambooth/README_sdxl.md)
+**Note** that it is possible to provide a local directory path to [`~diffusers.loaders.LoraLoaderMixin.load_lora_weights`] as well as [`~diffusers.loaders.UNet2DConditionLoadersMixin.load_attn_procs`]. To know about the supported inputs,
+refer to the respective docstrings.

 ## Unloading LoRA parameters

 You can call [`~diffusers.loaders.LoraLoaderMixin.unload_lora_weights`] on a pipeline to unload the LoRA parameters.

-## Fusing LoRA parameters
+## Supporting A1111 themed LoRA checkpoints from Diffusers

-You can call [`~diffusers.loaders.LoraLoaderMixin.fuse_lora`] on a pipeline to merge the LoRA parameters with the original parameters of the underlying model(s). This can lead to a potential speedup in the inference latency.
+This support was made possible because of our amazing contributors: [@takuma104](https://github.com/takuma104) and [@isidentical](https://github.com/isidentical).

-## Unfusing LoRA parameters
-
-To undo `fuse_lora`, call [`~diffusers.loaders.LoraLoaderMixin.unfuse_lora`] on a pipeline.
-
-## Supporting different LoRA checkpoints from Diffusers
-
-🤗 Diffusers supports loading checkpoints from popular LoRA trainers such as [Kohya](https://github.com/kohya-ss/sd-scripts/) and [TheLastBen](https://github.com/TheLastBen/fast-stable-diffusion). In this section, we outline the current API's details and limitations. 
-
-### Kohya
-
-This support was made possible because of the amazing contributors: [@takuma104](https://github.com/takuma104) and [@isidentical](https://github.com/isidentical).
-
-We support loading Kohya LoRA checkpoints using [`~diffusers.loaders.LoraLoaderMixin.load_lora_weights`]. In this section, we explain how to load such a checkpoint from [CivitAI](https://civitai.com/)
+To provide seamless interoperability with A1111 to our users, we support loading A1111 formatted
+LoRA checkpoints using [`~diffusers.loaders.LoraLoaderMixin.load_lora_weights`] in a limited capacity.
+In this section, we explain how to load an A1111 formatted LoRA checkpoint from [CivitAI](https://civitai.com/)
 in Diffusers and perform inference with it. 

 First, download a checkpoint. We'll use
@@ -327,7 +307,7 @@ import torch
 from diffusers import StableDiffusionPipeline, DPMSolverMultistepScheduler

 pipeline = StableDiffusionPipeline.from_pretrained(
-    "gsdf/Counterfeit-V2.5", torch_dtype=torch.float16, safety_checker=None, use_safetensors=True
+    "gsdf/Counterfeit-V2.5", torch_dtype=torch.float16, safety_checker=None
 ).to("cuda")
 pipeline.scheduler = DPMSolverMultistepScheduler.from_config(
    pipeline.scheduler.config, use_karras_sigmas=True
@@ -374,78 +354,4 @@ directly with [`~diffusers.loaders.LoraLoaderMixin.load_lora_weights`] like so:
 lora_model_id = "sayakpaul/civitai-light-shadow-lora"
 lora_filename = "light_and_shadow.safetensors"
 pipeline.load_lora_weights(lora_model_id, weight_name=lora_filename)
-```
-
-### Kohya + Stable Diffusion XL
-
-After the release of [Stable Diffusion XL](https://huggingface.co/papers/2307.01952), the community contributed some amazing LoRA checkpoints trained on top of it with the Kohya trainer.  
-
-Here are some example checkpoints we tried out:
-
-* SDXL 0.9:
-  * https://civitai.com/models/22279?modelVersionId=118556 
-  * https://civitai.com/models/104515/sdxlor30costumesrevue-starlight-saijoclaudine-lora 
-  * https://civitai.com/models/108448/daiton-sdxl-test 
-  * https://filebin.net/2ntfqqnapiu9q3zx/pixelbuildings128-v1.safetensors
-* SDXL 1.0:
-  * https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/blob/main/sd_xl_offset_example-lora_1.0.safetensors
-
-Here is an example of how to perform inference with these checkpoints in `diffusers`:
-
-```python
-from diffusers import DiffusionPipeline
-import torch 
-
-base_model_id = "stabilityai/stable-diffusion-xl-base-0.9"
-pipeline = DiffusionPipeline.from_pretrained(base_model_id, torch_dtype=torch.float16).to("cuda")
-pipeline.load_lora_weights(".", weight_name="Kamepan.safetensors")
-
-prompt = "anime screencap, glint, drawing, best quality, light smile, shy, a full body of a girl wearing wedding dress in the middle of the forest beneath the trees, fireflies, big eyes, 2d, cute, anime girl, waifu, cel shading, magical girl, vivid colors, (outline:1.1), manga anime artstyle, masterpiece, offical wallpaper, glint <lora:kame_sdxl_v2:1>"
-negative_prompt = "(deformed, bad quality, sketch, depth of field, blurry:1.1), grainy, bad anatomy, bad perspective, old, ugly, realistic, cartoon, disney, bad propotions"
-generator = torch.manual_seed(2947883060)
-num_inference_steps = 30
-guidance_scale = 7
-
-image = pipeline(
-    prompt=prompt, negative_prompt=negative_prompt, num_inference_steps=num_inference_steps,
-    generator=generator, guidance_scale=guidance_scale
-).images[0]
-image.save("Kamepan.png")
-```
-
-`Kamepan.safetensors` comes from https://civitai.com/models/22279?modelVersionId=118556 . 
-
-If you notice carefully, the inference UX is exactly identical to what we presented in the sections above. 
-
-Thanks to [@isidentical](https://github.com/isidentical) for helping us on integrating this feature.
-
-<Tip warning={true}>
-
-**Known limitations specific to the Kohya LoRAs**: 
-
-* When images don't look similar to other UIs, such as ComfyUI, it can be because of multiple reasons, as explained [here](https://github.com/huggingface/diffusers/pull/4287/#issuecomment-1655110736).
-* We don't fully support [LyCORIS checkpoints](https://github.com/KohakuBlueleaf/LyCORIS). To the best of our knowledge, our current `load_lora_weights()` should support LyCORIS checkpoints that have LoRA and LoCon modules but not the other ones, such as Hada, LoKR, etc. 
-
-</Tip>
-
-### TheLastBen
-
-Here is an example:
-
-```python
-from diffusers import DiffusionPipeline
-import torch
-
-pipeline_id = "Lykon/dreamshaper-xl-1-0"
-
-pipe = DiffusionPipeline.from_pretrained(pipeline_id, torch_dtype=torch.float16)
-pipe.enable_model_cpu_offload()
-
-lora_model_id = "TheLastBen/Papercut_SDXL"
-lora_filename = "papercut.safetensors"
-pipe.load_lora_weights(lora_model_id, weight_name=lora_filename)
-
-prompt = "papercut sonic"
-image = pipe(prompt=prompt, num_inference_steps=20, generator=torch.manual_seed(0)).images[0]
-image
-```
+```
--- a/docs/source/en/training/text2image.md
+++ b/docs/source/en/training/text2image.md
@@ -238,7 +238,7 @@ Now you can load the fine-tuned model for inference by passing the model path or
 from diffusers import StableDiffusionPipeline

 model_path = "path_to_saved_model"
-pipe = StableDiffusionPipeline.from_pretrained(model_path, torch_dtype=torch.float16, use_safetensors=True)
+pipe = StableDiffusionPipeline.from_pretrained(model_path, torch_dtype=torch.float16)
 pipe.to("cuda")

 image = pipe(prompt="yoda").images[0]
@@ -275,9 +275,3 @@ image.save("yoda-pokemon.png")
 ```
 </jax>
 </frameworkcontent>
-
-
-## Stable Diffusion XL
-
-* We support fine-tuning the UNet shipped in [Stable Diffusion XL](https://huggingface.co/papers/2307.01952) via the `train_text_to_image_sdxl.py` script. Please refer to the docs [here](https://github.com/huggingface/diffusers/blob/main/examples/text_to_image/README_sdxl.md). 
-* We also support fine-tuning of the UNet and Text Encoder shipped in [Stable Diffusion XL](https://huggingface.co/papers/2307.01952) with LoRA via the `train_text_to_image_lora_sdxl.py` script. Please refer to the docs [here](https://github.com/huggingface/diffusers/blob/main/examples/text_to_image/README_sdxl.md). 
--- a/docs/source/en/training/text_inversion.md
+++ b/docs/source/en/training/text_inversion.md
@@ -204,7 +204,7 @@ from diffusers import StableDiffusionPipeline
 import torch

 model_id = "runwayml/stable-diffusion-v1-5"
-pipe = StableDiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16, use_safetensors=True).to("cuda")
+pipe = StableDiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16).to("cuda")
 ```

 Next, we need to load the textual inversion embedding vector which can be done via the [`TextualInversionLoaderMixin.load_textual_inversion`]
--- a/docs/source/en/tutorials/autopipeline.md
+++ b/docs/source/en/tutorials/autopipeline.md
@@ -1,146 +0,0 @@
-# AutoPipeline
-
-🤗 Diffusers is able to complete many different tasks, and you can often reuse the same pretrained weights for multiple tasks such as text-to-image, image-to-image, and inpainting. If you're new to the library and diffusion models though, it may be difficult to know which pipeline to use for a task. For example, if you're using the [runwayml/stable-diffusion-v1-5](https://huggingface.co/runwayml/stable-diffusion-v1-5) checkpoint for text-to-image, you might not know that you could also use it for image-to-image and inpainting by loading the checkpoint with the [`StableDiffusionImg2ImgPipeline`] and [`StableDiffusionInpaintPipeline`] classes respectively.
-
-The `AutoPipeline` class is designed to simplify the variety of pipelines in 🤗 Diffusers. It is a generic, *task-first* pipeline that lets you focus on the task. The `AutoPipeline` automatically detects the correct pipeline class to use, which makes it easier to load a checkpoint for a task without knowing the specific pipeline class name.
-
-<Tip>
-
-Take a look at the [AutoPipeline](./pipelines/auto_pipeline) reference to see which tasks are supported. Currently, it supports text-to-image, image-to-image, and inpainting.
-
-</Tip>
-
-This tutorial shows you how to use an `AutoPipeline` to automatically infer the pipeline class to load for a specific task, given the pretrained weights.
-
-## Choose an AutoPipeline for your task
-
-Start by picking a checkpoint. For example, if you're interested in text-to-image with the [runwayml/stable-diffusion-v1-5](https://huggingface.co/runwayml/stable-diffusion-v1-5) checkpoint, use [`AutoPipelineForText2Image`]:
-
-```py
-from diffusers import AutoPipelineForText2Image
-import torch
-
-pipeline = AutoPipelineForText2Image.from_pretrained(
-    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16, use_safetensors=True
-).to("cuda")
-prompt = "peasant and dragon combat, wood cutting style, viking era, bevel with rune"
-
-image = pipeline(prompt, num_inference_steps=25).images[0]
-```
-
-<div class="flex justify-center">
-    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/autopipeline-text2img.png" alt="generated image of peasant fighting dragon in wood cutting style"/>
-</div>
-
-Under the hood, [`AutoPipelineForText2Image`]:
-
-1. automatically detects a `"stable-diffusion"` class from the [`model_index.json`](https://huggingface.co/runwayml/stable-diffusion-v1-5/blob/main/model_index.json) file
-2. loads the corresponding text-to-image [`StableDiffusionPipeline`] based on the `"stable-diffusion"` class name
-
-Likewise, for image-to-image, [`AutoPipelineForImage2Image`] detects a `"stable-diffusion"` checkpoint from the `model_index.json` file and it'll load the corresponding [`StableDiffusionImg2ImgPipeline`] behind the scenes. You can also pass any additional arguments specific to the pipeline class such as `strength`, which determines the amount of noise or variation added to an input image: 
-
-```py
-from diffusers import AutoPipelineForImage2Image
-
-pipeline = AutoPipelineForImage2Image.from_pretrained(
-    "runwayml/stable-diffusion-v1-5",
-    torch_dtype=torch.float16,
-    use_safetensors=True,
-).to("cuda")
-prompt = "a portrait of a dog wearing a pearl earring"
-
-url = "https://upload.wikimedia.org/wikipedia/commons/thumb/0/0f/1665_Girl_with_a_Pearl_Earring.jpg/800px-1665_Girl_with_a_Pearl_Earring.jpg"
-
-response = requests.get(url)
-image = Image.open(BytesIO(response.content)).convert("RGB")
-image.thumbnail((768, 768))
-
-image = pipeline(prompt, image, num_inference_steps=200, strength=0.75, guidance_scale=10.5).images[0]
-```
-
-<div class="flex justify-center">
-    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/autopipeline-img2img.png" alt="generated image of a vermeer portrait of a dog wearing a pearl earring"/>
-</div>
-
-And if you want to do inpainting, then [`AutoPipelineForInpainting`] loads the underlying [`StableDiffusionInpaintPipeline`] class in the same way:
-
-```py
-from diffusers import AutoPipelineForInpainting
-from diffusers.utils import load_image
-
-pipeline = AutoPipelineForInpainting.from_pretrained(
-    "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16, use_safetensors=True
-).to("cuda")
-
-img_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo.png"
-mask_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo_mask.png"
-
-init_image = load_image(img_url).convert("RGB")
-mask_image = load_image(mask_url).convert("RGB")
-
-prompt = "A majestic tiger sitting on a bench"
-image = pipeline(prompt, image=init_image, mask_image=mask_image, num_inference_steps=50, strength=0.80).images[0]
-```
-
-<div class="flex justify-center">
-    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/autopipeline-inpaint.png" alt="generated image of a tiger sitting on a bench"/>
-</div>
-
-If you try to load an unsupported checkpoint, it'll throw an error:
-
-```py
-from diffusers import AutoPipelineForImage2Image
-import torch
-
-pipeline = AutoPipelineForImage2Image.from_pretrained(
-    "openai/shap-e-img2img", torch_dtype=torch.float16, use_safetensors=True
-)
-"ValueError: AutoPipeline can't find a pipeline linked to ShapEImg2ImgPipeline for None"
-```
-
-## Use multiple pipelines
-
-For some workflows or if you're loading many pipelines, it is more memory-efficient to reuse the same components from a checkpoint instead of reloading them which would unnecessarily consume additional memory. For example, if you're using a checkpoint for text-to-image and you want to use it again for image-to-image, use the [`~AutoPipelineForImage2Image.from_pipe`] method. This method creates a new pipeline from the components of a previously loaded pipeline at no additional memory cost.
-
-The [`~AutoPipelineForImage2Image.from_pipe`] method detects the original pipeline class and maps it to the new pipeline class corresponding to the task you want to do. For example, if you load a `"stable-diffusion"` class pipeline for text-to-image:
-
-```py
-from diffusers import AutoPipelineForText2Image, AutoPipelineForImage2Image
-
-pipeline_text2img = AutoPipelineForText2Image.from_pretrained(
-    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16, use_safetensors=True
-)
-print(type(pipeline_text2img))
-"<class 'diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline'>"
-```
-
-Then [`~AutoPipelineForImage2Image.from_pipe`] maps the original `"stable-diffusion"` pipeline class to [`StableDiffusionImg2ImgPipeline`]:
-
-```py
-pipeline_img2img = AutoPipelineForImage2Image.from_pipe(pipeline_text2img)
-print(type(pipeline_img2img))
-"<class 'diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline'>"
-```
-
-If you passed an optional argument - like disabling the safety checker - to the original pipeline, this argument is also passed on to the new pipeline:
-
-```py
-from diffusers import AutoPipelineForText2Image, AutoPipelineForImage2Image
-
-pipeline_text2img = AutoPipelineForText2Image.from_pretrained(
-    "runwayml/stable-diffusion-v1-5",
-    torch_dtype=torch.float16,
-    use_safetensors=True,
-    requires_safety_checker=False,
-).to("cuda")
-
-pipeline_img2img = AutoPipelineForImage2Image.from_pipe(pipeline_text2img)
-print(pipeline_img2img.config.requires_safety_checker)
-"False"
-```
-
-You can overwrite any of the arguments and even configuration from the original pipeline if you want to change the behavior of the new pipeline. For example, to turn the safety checker back on and add the `strength` argument:
-
-```py
-pipeline_img2img = AutoPipelineForImage2Image.from_pipe(pipeline_text2img, requires_safety_checker=True, strength=0.3)
-```
--- a/docs/source/en/tutorials/basic_training.md
+++ b/docs/source/en/tutorials/basic_training.md
@@ -252,11 +252,18 @@ Then, you'll need a way to evaluate the model. For evaluation, you can use the [

 ```py
 >>> from diffusers import DDPMPipeline
->>> from diffusers.utils import make_image_grid
 >>> import math
 >>> import os


+>>> def make_grid(images, rows, cols):
+...     w, h = images[0].size
+...     grid = Image.new("RGB", size=(cols * w, rows * h))
+...     for i, image in enumerate(images):
+...         grid.paste(image, box=(i % cols * w, i // cols * h))
+...     return grid
+
+
 >>> def evaluate(config, epoch, pipeline):
 ...     # Sample some images from random noise (this is the backward diffusion process).
 ...     # The default pipeline output type is `List[PIL.Image]`
@@ -266,7 +273,7 @@ Then, you'll need a way to evaluate the model. For evaluation, you can use the [
 ...     ).images

 ...     # Make a grid out of the images
-...     image_grid = make_image_grid(images, rows=4, cols=4)
+...     image_grid = make_grid(images, rows=4, cols=4)

 ...     # Save the images
 ...     test_dir = os.path.join(config.output_dir, "samples")
--- a/docs/source/en/using-diffusers/conditional_image_generation.md
+++ b/docs/source/en/using-diffusers/conditional_image_generation.md
@@ -25,7 +25,7 @@ In this guide, you'll use [`DiffusionPipeline`] for text-to-image generation wit
 ```python
 >>> from diffusers import DiffusionPipeline

->>> generator = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", use_safetensors=True)
+>>> generator = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
 ```

 The [`DiffusionPipeline`] downloads and caches all modeling, tokenization, and scheduling components. 
--- a/docs/source/en/using-diffusers/contribute_pipeline.md
+++ b/docs/source/en/using-diffusers/contribute_pipeline.md
@@ -94,7 +94,7 @@ output = pipeline()
 But what's even better is you can load pre-existing weights into the pipeline if the pipeline structure is identical. For example, you can load the [`google/ddpm-cifar10-32`](https://huggingface.co/google/ddpm-cifar10-32) weights into the one-step pipeline:

 ```python
-pipeline = UnetSchedulerOneForwardPipeline.from_pretrained("google/ddpm-cifar10-32", use_safetensors=True)
+pipeline = UnetSchedulerOneForwardPipeline.from_pretrained("google/ddpm-cifar10-32")

 output = pipeline()
 ```
@@ -108,9 +108,7 @@ Once it is merged, anyone with `diffusers >= 0.4.0` installed can use this pipel
 ```python
 from diffusers import DiffusionPipeline

-pipe = DiffusionPipeline.from_pretrained(
-    "google/ddpm-cifar10-32", custom_pipeline="one_step_unet", use_safetensors=True
-)
+pipe = DiffusionPipeline.from_pretrained("google/ddpm-cifar10-32", custom_pipeline="one_step_unet")
 pipe()
 ```

@@ -119,9 +117,7 @@ Another way to share your community pipeline is to upload the `one_step_unet.py`
 ```python
 from diffusers import DiffusionPipeline

-pipeline = DiffusionPipeline.from_pretrained(
-    "google/ddpm-cifar10-32", custom_pipeline="stevhliu/one_step_unet", use_safetensors=True
-)
+pipeline = DiffusionPipeline.from_pretrained("google/ddpm-cifar10-32", custom_pipeline="stevhliu/one_step_unet")
 ```

 Take a look at the following table to compare the two sharing workflows to help you decide the best option for you:
@@ -165,7 +161,6 @@ pipeline = DiffusionPipeline.from_pretrained(
    feature_extractor=feature_extractor,
    scheduler=scheduler,
    torch_dtype=torch.float16,
-    use_safetensors=True,
 )
 ```

--- a/docs/source/en/using-diffusers/control_brightness.md
+++ b/docs/source/en/using-diffusers/control_brightness.md
@@ -24,7 +24,7 @@ Next, configure the following parameters in the [`DDIMScheduler`]:
 ```py
 >>> from diffusers import DiffusionPipeline, DDIMScheduler

->>> pipeline = DiffusionPipeline.from_pretrained("ptx0/pseudo-journey-v2", use_safetensors=True)
+>>> pipeline = DiffusionPipeline.from_pretrained("ptx0/pseudo-journey-v2")
 # switch the scheduler in the pipeline to use the DDIMScheduler

 >>> pipeline.scheduler = DDIMScheduler.from_config(
--- a/docs/source/en/using-diffusers/controlling_generation.md
+++ b/docs/source/en/using-diffusers/controlling_generation.md
@@ -40,8 +40,6 @@ Unless otherwise mentioned, these are techniques that work with existing models
 12. [Custom Diffusion](#custom-diffusion)
 13. [Model Editing](#model-editing)
 14. [DiffEdit](#diffedit)
-15. [T2I-Adapter](#t2i-adapter)
-16. [FABRIC](#fabric)

 For convenience, we provide a table to denote which methods are inference-only and which require fine-tuning/training.

@@ -62,21 +60,21 @@ For convenience, we provide a table to denote which methods are inference-only a
 |           [Model Editing](#model-editing)           |         ✅         |                   ❌                    |                                                                                                 |
 |                [DiffEdit](#diffedit)                |         ✅         |                   ❌                    |                                                                                                 |
 |             [T2I-Adapter](#t2i-adapter)             |         ✅         |                   ❌                    |                                                                                                 |
-|                [Fabric](#fabric)                    |         ✅         |                   ❌                    |                                                                                                 |
+
 ## Instruct Pix2Pix

 [Paper](https://arxiv.org/abs/2211.09800)

-[Instruct Pix2Pix](../api/pipelines/pix2pix) is fine-tuned from stable diffusion to support editing input images. It takes as inputs an image and a prompt describing an edit, and it outputs the edited image.
+[Instruct Pix2Pix](../api/pipelines/stable_diffusion/pix2pix) is fine-tuned from stable diffusion to support editing input images. It takes as inputs an image and a prompt describing an edit, and it outputs the edited image.
 Instruct Pix2Pix has been explicitly trained to work well with [InstructGPT](https://openai.com/blog/instruction-following/)-like prompts.

-See [here](../api/pipelines/pix2pix) for more information on how to use it.
+See [here](../api/pipelines/stable_diffusion/pix2pix) for more information on how to use it.

 ## Pix2Pix Zero

 [Paper](https://arxiv.org/abs/2302.03027)

-[Pix2Pix Zero](../api/pipelines/pix2pix_zero) allows modifying an image so that one concept or subject is translated to another one while preserving general image semantics.
+[Pix2Pix Zero](../api/pipelines/stable_diffusion/pix2pix_zero) allows modifying an image so that one concept or subject is translated to another one while preserving general image semantics.

 The denoising process is guided from one conceptual embedding towards another conceptual embedding. The intermediate latents are optimized during the denoising process to push the attention maps towards reference attention maps. The reference attention maps are from the denoising process of the input image and are used to encourage semantic preservation.

@@ -89,26 +87,26 @@ Pix2Pix Zero can be used both to edit synthetic images as well as real images.
 <Tip>

 Pix2Pix Zero is the first model that allows "zero-shot" image editing. This means that the model
-can edit an image in less than a minute on a consumer GPU as shown [here](../api/pipelines/pix2pix_zero#usage-example).
+can edit an image in less than a minute on a consumer GPU as shown [here](../api/pipelines/stable_diffusion/pix2pix_zero#usage-example).

 </Tip>

 As mentioned above, Pix2Pix Zero includes optimizing the latents (and not any of the UNet, VAE, or the text encoder) to steer the generation toward a specific concept. This means that the overall
 pipeline might require more memory than a standard [StableDiffusionPipeline](../api/pipelines/stable_diffusion/text2img).

-See [here](../api/pipelines/pix2pix_zero) for more information on how to use it.
+See [here](../api/pipelines/stable_diffusion/pix2pix_zero) for more information on how to use it.

 ## Attend and Excite

 [Paper](https://arxiv.org/abs/2301.13826)

-[Attend and Excite](../api/pipelines/attend_and_excite) allows subjects in the prompt to be faithfully represented in the final image.
+[Attend and Excite](../api/pipelines/stable_diffusion/attend_and_excite) allows subjects in the prompt to be faithfully represented in the final image.

 A set of token indices are given as input, corresponding to the subjects in the prompt that need to be present in the image. During denoising, each token index is guaranteed to have a minimum attention threshold for at least one patch of the image. The intermediate latents are iteratively optimized during the denoising process to strengthen the attention of the most neglected subject token until the attention threshold is passed for all subject tokens.

-Like Pix2Pix Zero, Attend and Excite also involves a mini optimization loop (leaving the pre-trained weights untouched) in its pipeline and can require more memory than the usual [StableDiffusionPipeline](../api/pipelines/stable_diffusion/text2img).
+Like Pix2Pix Zero, Attend and Excite also involves a mini optimization loop (leaving the pre-trained weights untouched) in its pipeline and can require more memory than the usual `StableDiffusionPipeline`.

-See [here](../api/pipelines/attend_and_excite) for more information on how to use it.
+See [here](../api/pipelines/stable_diffusion/attend_and_excite) for more information on how to use it.

 ## Semantic Guidance (SEGA)

@@ -126,11 +124,11 @@ See [here](../api/pipelines/semantic_stable_diffusion) for more information on h

 [Paper](https://arxiv.org/abs/2210.00939)

-[Self-attention Guidance](../api/pipelines/self_attention_guidance) improves the general quality of images.
+[Self-attention Guidance](../api/pipelines/stable_diffusion/self_attention_guidance) improves the general quality of images.

 SAG provides guidance from predictions not conditioned on high-frequency details to fully conditioned images. The high frequency details are extracted out of the UNet self-attention maps.

-See [here](../api/pipelines/self_attention_guidance) for more information on how to use it.
+See [here](../api/pipelines/stable_diffusion/self_attention_guidance) for more information on how to use it.

 ## Depth2Image

@@ -155,9 +153,9 @@ apply Pix2Pix Zero to any of the available Stable Diffusion models.
 [Paper](https://arxiv.org/abs/2302.08113)

 MultiDiffusion defines a new generation process over a pre-trained diffusion model. This process binds together multiple diffusion generation methods that can be readily applied to generate high quality and diverse images. Results adhere to user-provided controls, such as desired aspect ratio (e.g., panorama), and spatial guiding signals, ranging from tight segmentation masks to bounding boxes.
-[MultiDiffusion Panorama](../api/pipelines/panorama) allows to generate high-quality images at arbitrary aspect ratios (e.g., panoramas).
+[MultiDiffusion Panorama](../api/pipelines/stable_diffusion/panorama) allows you to generate high-quality images at arbitrary aspect ratios (e.g., panoramas).

-See [here](../api/pipelines/panorama) for more information on how to use it to generate panoramic images.
+See [here](../api/pipelines/stable_diffusion/panorama) for more information on how to use it to generate panoramic images.

 ## Fine-tuning your own models

@@ -207,20 +205,20 @@ For more details, check out our [official doc](../training/custom_diffusion).

 [Paper](https://arxiv.org/abs/2303.08084)

-The [text-to-image model editing pipeline](../api/pipelines/model_editing) helps you mitigate some of the incorrect implicit assumptions a pre-trained text-to-image
+The [text-to-image model editing pipeline](../api/pipelines/stable_diffusion/model_editing) helps you mitigate some of the incorrect implicit assumptions a pre-trained text-to-image
 diffusion model might make about the subjects present in the input prompt. For example, if you prompt Stable Diffusion to generate images for "A pack of roses", the roses in the generated images
 are more likely to be red. This pipeline helps you change that assumption.

-To know more details, check out the [official doc](../api/pipelines/model_editing).
+To know more details, check out the [official doc](../api/pipelines/stable_diffusion/model_editing).

 ## DiffEdit

 [Paper](https://arxiv.org/abs/2210.11427)

-[DiffEdit](../api/pipelines/diffedit) allows for semantic editing of input images along with
+[DiffEdit](../api/pipelines/stable_diffusion/diffedit) allows for semantic editing of input images along with
 input prompts while preserving the original input images as much as possible.

-To know more details, check out the [official doc](../api/pipelines/diffedit).
+To know more details, check out the [official doc](../api/pipelines/stable_diffusion/diffedit).

 ## T2I-Adapter

@@ -231,14 +229,3 @@ There are 8 canonical pre-trained adapters trained on different conditionings su
 depth maps, and semantic segmentations.

 See [here](../api/pipelines/stable_diffusion/adapter) for more information on how to use it.
-
-## Fabric
-
-[Paper](https://arxiv.org/abs/2307.10159)
-
-[Fabric](../api/pipelines/fabric) is a training-free
-approach applicable to a wide range of popular diffusion models, which exploits
-the self-attention layer present in the most widely used architectures to condition
-the diffusion process on a set of feedback images.
-
-To know more details, check out the [official doc](../api/pipelines/fabric).
--- a/docs/source/en/using-diffusers/custom_pipeline_examples.md
+++ b/docs/source/en/using-diffusers/custom_pipeline_examples.md
@@ -32,7 +32,7 @@ If a community doesn't work as expected, please open an issue and ping the autho
 To load a custom pipeline you just need to pass the `custom_pipeline` argument to `DiffusionPipeline`, as one of the files in `diffusers/examples/community`. Feel free to send a PR with your own pipelines, we will merge them quickly.
 ```py
 pipe = DiffusionPipeline.from_pretrained(
-    "CompVis/stable-diffusion-v1-4", custom_pipeline="filename_in_the_community_folder", use_safetensors=True
+    "CompVis/stable-diffusion-v1-4", custom_pipeline="filename_in_the_community_folder"
 )
 ```

@@ -61,7 +61,6 @@ guided_pipeline = DiffusionPipeline.from_pretrained(
    clip_model=clip_model,
    feature_extractor=feature_extractor,
    torch_dtype=torch.float16,
-    use_safetensors=True,
 )
 guided_pipeline.enable_attention_slicing()
 guided_pipeline = guided_pipeline.to("cuda")
@@ -118,7 +117,6 @@ pipe = DiffusionPipeline.from_pretrained(
    torch_dtype=torch.float16,
    safety_checker=None,  # Very important for videos...lots of false positives while interpolating
    custom_pipeline="interpolate_stable_diffusion",
-    use_safetensors=True,
 ).to("cuda")
 pipe.enable_attention_slicing()

@@ -161,7 +159,6 @@ pipe = DiffusionPipeline.from_pretrained(
    "CompVis/stable-diffusion-v1-4",
    custom_pipeline="stable_diffusion_mega",
    torch_dtype=torch.float16,
-    use_safetensors=True,
 )
 pipe.to("cuda")
 pipe.enable_attention_slicing()
@@ -206,7 +203,7 @@ from diffusers import DiffusionPipeline
 import torch

 pipe = DiffusionPipeline.from_pretrained(
-    "hakurei/waifu-diffusion", custom_pipeline="lpw_stable_diffusion", torch_dtype=torch.float16, use_safetensors=True
+    "hakurei/waifu-diffusion", custom_pipeline="lpw_stable_diffusion", torch_dtype=torch.float16
 )
 pipe = pipe.to("cuda")

@@ -227,7 +224,6 @@ pipe = DiffusionPipeline.from_pretrained(
    custom_pipeline="lpw_stable_diffusion_onnx",
    revision="onnx",
    provider="CUDAExecutionProvider",
-    use_safetensors=True,
 )

 prompt = "a photo of an astronaut riding a horse on mars, best quality"
@@ -271,8 +267,8 @@ diffuser_pipeline = DiffusionPipeline.from_pretrained(
    custom_pipeline="speech_to_image_diffusion",
    speech_model=model,
    speech_processor=processor,
+    
    torch_dtype=torch.float16,
-    use_safetensors=True,
 )

 diffuser_pipeline.enable_attention_slicing()
--- a/docs/source/en/using-diffusers/custom_pipeline_overview.md
+++ b/docs/source/en/using-diffusers/custom_pipeline_overview.md
@@ -30,7 +30,7 @@ To load any community pipeline on the Hub, pass the repository id of the communi
 from diffusers import DiffusionPipeline

 pipeline = DiffusionPipeline.from_pretrained(
-    "google/ddpm-cifar10-32", custom_pipeline="hf-internal-testing/diffusers-dummy-pipeline", use_safetensors=True
+    "google/ddpm-cifar10-32", custom_pipeline="hf-internal-testing/diffusers-dummy-pipeline"
 )
 ```

@@ -50,7 +50,6 @@ pipeline = DiffusionPipeline.from_pretrained(
    custom_pipeline="clip_guided_stable_diffusion",
    clip_model=clip_model,
    feature_extractor=feature_extractor,
-    use_safetensors=True,
 )
 ```

--- a/docs/source/en/using-diffusers/depth2img.md
+++ b/docs/source/en/using-diffusers/depth2img.md
@@ -28,7 +28,6 @@ from diffusers import StableDiffusionDepth2ImgPipeline
 pipe = StableDiffusionDepth2ImgPipeline.from_pretrained(
    "stabilityai/stable-diffusion-2-depth",
    torch_dtype=torch.float16,
-    use_safetensors=True,
 ).to("cuda")
 ```

--- a/docs/source/en/using-diffusers/distilled_sd.md
+++ b/docs/source/en/using-diffusers/distilled_sd.md
@@ -1,121 +0,0 @@
-# Distilled Stable Diffusion inference
-
-[[open-in-colab]]
-
-Stable Diffusion inference can be a computationally intensive process because it must iteratively denoise the latents to generate an image. To reduce the computational burden, you can use a *distilled* version of the Stable Diffusion model from [Nota AI](https://huggingface.co/nota-ai). The distilled version of their Stable Diffusion model eliminates some of the residual and attention blocks from the UNet, reducing the model size by 51% and improving latency on CPU/GPU by 43%.
-
-<Tip>
-
-Read this [blog post](https://huggingface.co/blog/sd_distillation) to learn more about how knowledge distillation training works to produce a faster, smaller, and cheaper generative model.
-
-</Tip>
-
-Let's load the distilled Stable Diffusion model and compare it against the original Stable Diffusion model:
-
-```py
-from diffusers import StableDiffusionPipeline
-import torch
-
-distilled = StableDiffusionPipeline.from_pretrained(
-    "nota-ai/bk-sdm-small", torch_dtype=torch.float16, use_safetensors=True,
-).to("cuda")
-
-original = StableDiffusionPipeline.from_pretrained(
-    "CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16, use_safetensors=True,
-).to("cuda")
-```
-
-Given a prompt, get the inference time for the original model:
-
-```py
-import time
-
-seed = 2023
-generator = torch.manual_seed(seed)
-
-NUM_ITERS_TO_RUN = 3
-NUM_INFERENCE_STEPS = 25
-NUM_IMAGES_PER_PROMPT = 4
-
-prompt = "a golden vase with different flowers"
-
-start = time.time_ns()
-for _ in range(NUM_ITERS_TO_RUN):
-    images = original(
-        prompt,
-        num_inference_steps=NUM_INFERENCE_STEPS,
-        generator=generator,
-        num_images_per_prompt=NUM_IMAGES_PER_PROMPT
-    ).images
-end = time.time_ns()
-original_sd = f"{(end - start) / 1e6:.1f}"
-
-print(f"Execution time -- {original_sd} ms\n")
-"Execution time -- 45781.5 ms"
-```
-
-Time the distilled model inference:
-
-```py
-start = time.time_ns()
-for _ in range(NUM_ITERS_TO_RUN):
-    images = distilled(
-        prompt,
-        num_inference_steps=NUM_INFERENCE_STEPS,
-        generator=generator,
-        num_images_per_prompt=NUM_IMAGES_PER_PROMPT
-    ).images
-end = time.time_ns()
-
-distilled_sd = f"{(end - start) / 1e6:.1f}"
-print(f"Execution time -- {distilled_sd} ms\n")
-"Execution time -- 29884.2 ms"
-```
-
-<div class="flex gap-4">
-  <div>
-    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/original_sd.png"/>
-    <figcaption class="mt-2 text-center text-sm text-gray-500">original Stable Diffusion (45781.5 ms)</figcaption>
-  </div>
-  <div>
-    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/distilled_sd.png"/>
-    <figcaption class="mt-2 text-center text-sm text-gray-500">distilled Stable Diffusion (29884.2 ms)</figcaption>
-  </div>
-</div>
-
-## Tiny AutoEncoder
-
-To speed inference up even more, use a tiny distilled version of the [Stable Diffusion VAE](https://huggingface.co/sayakpaul/taesdxl-diffusers) to denoise the latents into images. Replace the VAE in the distilled Stable Diffusion model with the tiny VAE:
-
-```py
-from diffusers import AutoencoderTiny
-
-distilled.vae = AutoencoderTiny.from_pretrained(
-    "sayakpaul/taesd-diffusers", torch_dtype=torch.float16, use_safetensors=True,
-).to("cuda")
-```
-
-Time the distilled model and distilled VAE inference:
-
-```py
-start = time.time_ns()
-for _ in range(NUM_ITERS_TO_RUN):
-    images = distilled(
-        prompt,
-        num_inference_steps=NUM_INFERENCE_STEPS,
-        generator=generator,
-        num_images_per_prompt=NUM_IMAGES_PER_PROMPT
-    ).images
-end = time.time_ns()
-
-distilled_tiny_sd = f"{(end - start) / 1e6:.1f}"
-print(f"Execution time -- {distilled_tiny_sd} ms\n")
-"Execution time -- 27165.7 ms"
-```
-
-<div class="flex justify-center">
-  <div>
-    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/distilled_sd_vae.png" />
-    <figcaption class="mt-2 text-center text-sm text-gray-500">distilled Stable Diffusion + Tiny AutoEncoder (27165.7 ms)</figcaption>
-  </div>
-</div>
--- a/docs/source/en/using-diffusers/img2img.md
+++ b/docs/source/en/using-diffusers/img2img.md
@@ -33,9 +33,9 @@ from io import BytesIO
 from diffusers import StableDiffusionImg2ImgPipeline

 device = "cuda"
-pipe = StableDiffusionImg2ImgPipeline.from_pretrained(
-    "nitrosocke/Ghibli-Diffusion", torch_dtype=torch.float16, use_safetensors=True
-).to(device)
+pipe = StableDiffusionImg2ImgPipeline.from_pretrained("nitrosocke/Ghibli-Diffusion", torch_dtype=torch.float16).to(
+    device
+)
 ```

 Download and preprocess an initial image so you can pass it to the pipeline:
--- a/docs/source/en/using-diffusers/inpaint.md
+++ b/docs/source/en/using-diffusers/inpaint.md
@@ -29,8 +29,6 @@ from diffusers import StableDiffusionInpaintPipeline
 pipeline = StableDiffusionInpaintPipeline.from_pretrained(
    "runwayml/stable-diffusion-inpainting",
    torch_dtype=torch.float16,
-    use_safetensors=True,
-    variant="fp16",
 )
 pipeline = pipeline.to("cuda")
 ```
--- a/docs/source/en/using-diffusers/loading.md
+++ b/docs/source/en/using-diffusers/loading.md
@@ -39,7 +39,7 @@ The [`DiffusionPipeline`] class is the simplest and most generic way to load any
 from diffusers import DiffusionPipeline

 repo_id = "runwayml/stable-diffusion-v1-5"
-pipe = DiffusionPipeline.from_pretrained(repo_id, use_safetensors=True)
+pipe = DiffusionPipeline.from_pretrained(repo_id)
 ```

 You can also load a checkpoint with it's specific pipeline class. The example above loaded a Stable Diffusion model; to get the same result, use the [`StableDiffusionPipeline`] class:
@@ -48,7 +48,7 @@ You can also load a checkpoint with it's specific pipeline class. The example ab
 from diffusers import StableDiffusionPipeline

 repo_id = "runwayml/stable-diffusion-v1-5"
-pipe = StableDiffusionPipeline.from_pretrained(repo_id, use_safetensors=True)
+pipe = StableDiffusionPipeline.from_pretrained(repo_id)
 ```

 A checkpoint (such as [`CompVis/stable-diffusion-v1-4`](https://huggingface.co/CompVis/stable-diffusion-v1-4) or [`runwayml/stable-diffusion-v1-5`](https://huggingface.co/runwayml/stable-diffusion-v1-5)) may also be used for more than one task, like text-to-image or image-to-image. To differentiate what task you want to use the checkpoint for, you have to load it directly with it's corresponding task-specific pipeline class:
@@ -65,7 +65,7 @@ pipe = StableDiffusionImg2ImgPipeline.from_pretrained(repo_id)
 To load a diffusion pipeline locally, use [`git-lfs`](https://git-lfs.github.com/) to manually download the checkpoint (in this case, [`runwayml/stable-diffusion-v1-5`](https://huggingface.co/runwayml/stable-diffusion-v1-5)) to your local disk. This creates a local folder, `./stable-diffusion-v1-5`, on your disk:

 ```bash
-git-lfs install
+git lfs install
 git clone https://huggingface.co/runwayml/stable-diffusion-v1-5
 ```

@@ -75,7 +75,7 @@ Then pass the local path to [`~DiffusionPipeline.from_pretrained`]:
 from diffusers import DiffusionPipeline

 repo_id = "./stable-diffusion-v1-5"
-stable_diffusion = DiffusionPipeline.from_pretrained(repo_id, use_safetensors=True)
+stable_diffusion = DiffusionPipeline.from_pretrained(repo_id)
 ```

 The [`~DiffusionPipeline.from_pretrained`] method won't download any files from the Hub when it detects a local path, but this also means it won't download and cache the latest changes to a checkpoint.
@@ -94,7 +94,7 @@ To find out which schedulers are compatible for customization, you can use the `
 from diffusers import DiffusionPipeline

 repo_id = "runwayml/stable-diffusion-v1-5"
-stable_diffusion = DiffusionPipeline.from_pretrained(repo_id, use_safetensors=True)
+stable_diffusion = DiffusionPipeline.from_pretrained(repo_id)
 stable_diffusion.scheduler.compatibles
 ```

@@ -109,7 +109,7 @@ repo_id = "runwayml/stable-diffusion-v1-5"

 scheduler = EulerDiscreteScheduler.from_pretrained(repo_id, subfolder="scheduler")

-stable_diffusion = DiffusionPipeline.from_pretrained(repo_id, scheduler=scheduler, use_safetensors=True)
+stable_diffusion = DiffusionPipeline.from_pretrained(repo_id, scheduler=scheduler)
 ```

 ### Safety checker
@@ -120,7 +120,7 @@ Diffusion models like Stable Diffusion can generate harmful content, which is wh
 from diffusers import DiffusionPipeline

 repo_id = "runwayml/stable-diffusion-v1-5"
-stable_diffusion = DiffusionPipeline.from_pretrained(repo_id, safety_checker=None, use_safetensors=True)
+stable_diffusion = DiffusionPipeline.from_pretrained(repo_id, safety_checker=None)
 ```

 ### Reuse components across pipelines
@@ -131,7 +131,7 @@ You can also reuse the same components in multiple pipelines to avoid loading th
 from diffusers import StableDiffusionPipeline, StableDiffusionImg2ImgPipeline

 model_id = "runwayml/stable-diffusion-v1-5"
-stable_diffusion_txt2img = StableDiffusionPipeline.from_pretrained(model_id, use_safetensors=True)
+stable_diffusion_txt2img = StableDiffusionPipeline.from_pretrained(model_id)

 components = stable_diffusion_txt2img.components
 ```
@@ -148,7 +148,7 @@ You can also pass the components individually to the pipeline if you want more f
 from diffusers import StableDiffusionPipeline, StableDiffusionImg2ImgPipeline

 model_id = "runwayml/stable-diffusion-v1-5"
-stable_diffusion_txt2img = StableDiffusionPipeline.from_pretrained(model_id, use_safetensors=True)
+stable_diffusion_txt2img = StableDiffusionPipeline.from_pretrained(model_id)
 stable_diffusion_img2img = StableDiffusionImg2ImgPipeline(
    vae=stable_diffusion_txt2img.vae,
    text_encoder=stable_diffusion_txt2img.text_encoder,
@@ -194,12 +194,10 @@ import torch

 # load fp16 variant
 stable_diffusion = DiffusionPipeline.from_pretrained(
-    "runwayml/stable-diffusion-v1-5", variant="fp16", torch_dtype=torch.float16, use_safetensors=True
+    "runwayml/stable-diffusion-v1-5", variant="fp16", torch_dtype=torch.float16
 )
 # load non_ema variant
-stable_diffusion = DiffusionPipeline.from_pretrained(
-    "runwayml/stable-diffusion-v1-5", variant="non_ema", use_safetensors=True
-)
+stable_diffusion = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", variant="non_ema")
 ```

 To save a checkpoint stored in a different floating point type or as a non-EMA variant, use the [`DiffusionPipeline.save_pretrained`] method and specify the `variant` argument. You should try and save a variant to the same folder as the original checkpoint, so you can load both from the same folder:
@@ -217,12 +215,10 @@ If you don't save the variant to an existing folder, you must specify the `varia

 ```python
 # 👎 this won't work
-stable_diffusion = DiffusionPipeline.from_pretrained(
-    "./stable-diffusion-v1-5", torch_dtype=torch.float16, use_safetensors=True
-)
+stable_diffusion = DiffusionPipeline.from_pretrained("./stable-diffusion-v1-5", torch_dtype=torch.float16)
 # 👍 this works
 stable_diffusion = DiffusionPipeline.from_pretrained(
-    "./stable-diffusion-v1-5", variant="fp16", torch_dtype=torch.float16, use_safetensors=True
+    "./stable-diffusion-v1-5", variant="fp16", torch_dtype=torch.float16
 )
 ```

@@ -237,7 +233,7 @@ load model variants, e.g.:
 ```python
 from diffusers import DiffusionPipeline

-pipe = DiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", revision="fp16", use_safetensors=True)
+pipe = DiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", revision="fp16")
 ```

 However, this behavior is now deprecated since the "revision" argument should (just as it's done in GitHub) better be used to load model checkpoints from a specific commit or branch in development.
@@ -263,7 +259,7 @@ Models can be loaded from a subfolder with the `subfolder` argument. For example
 from diffusers import UNet2DConditionModel

 repo_id = "runwayml/stable-diffusion-v1-5"
-model = UNet2DConditionModel.from_pretrained(repo_id, subfolder="unet", use_safetensors=True)
+model = UNet2DConditionModel.from_pretrained(repo_id, subfolder="unet")
 ```

 Or directly from a repository's [directory](https://huggingface.co/google/ddpm-cifar10-32/tree/main):
@@ -272,7 +268,7 @@ Or directly from a repository's [directory](https://huggingface.co/google/ddpm-c
 from diffusers import UNet2DModel

 repo_id = "google/ddpm-cifar10-32"
-model = UNet2DModel.from_pretrained(repo_id, use_safetensors=True)
+model = UNet2DModel.from_pretrained(repo_id)
 ```

 You can also load and save model variants by specifying the `variant` argument in [`ModelMixin.from_pretrained`] and [`ModelMixin.save_pretrained`]:
@@ -280,9 +276,7 @@ You can also load and save model variants by specifying the `variant` argument i
 ```python
 from diffusers import UNet2DConditionModel

-model = UNet2DConditionModel.from_pretrained(
-    "runwayml/stable-diffusion-v1-5", subfolder="unet", variant="non-ema", use_safetensors=True
-)
+model = UNet2DConditionModel.from_pretrained("runwayml/stable-diffusion-v1-5", subfolder="unet", variant="non-ema")
 model.save_pretrained("./local-unet", variant="non-ema")
 ```

@@ -316,7 +310,7 @@ euler = EulerDiscreteScheduler.from_pretrained(repo_id, subfolder="scheduler")
 dpm = DPMSolverMultistepScheduler.from_pretrained(repo_id, subfolder="scheduler")

 # replace `dpm` with any of `ddpm`, `ddim`, `pndm`, `lms`, `euler_anc`, `euler`
-pipeline = StableDiffusionPipeline.from_pretrained(repo_id, scheduler=dpm, use_safetensors=True)
+pipeline = StableDiffusionPipeline.from_pretrained(repo_id, scheduler=dpm)
 ```

 ## DiffusionPipeline explained
@@ -332,7 +326,7 @@ The pipelines underlying folder structure corresponds directly with their class
 from diffusers import DiffusionPipeline

 repo_id = "runwayml/stable-diffusion-v1-5"
-pipeline = DiffusionPipeline.from_pretrained(repo_id, use_safetensors=True)
+pipeline = DiffusionPipeline.from_pretrained(repo_id)
 print(pipeline)
 ```

@@ -466,4 +460,4 @@ Every pipeline expects a `model_index.json` file that tells the [`DiffusionPipel
    "AutoencoderKL"
  ]
 }
-```
+```
--- a/docs/source/en/using-diffusers/other-formats.md
+++ b/docs/source/en/using-diffusers/other-formats.md
@@ -111,9 +111,7 @@ If you prefer to run inference with code, click on the **Use in Diffusers** butt
 ```py
 from diffusers import DiffusionPipeline

-pipeline = DiffusionPipeline.from_pretrained(
-    "sayakpaul/textual-inversion-cat-kerascv_sd_diffusers_pipeline", use_safetensors=True
-)
+pipeline = DiffusionPipeline.from_pretrained("sayakpaul/textual-inversion-cat-kerascv_sd_diffusers_pipeline")
 ```

 Then you can generate an image like:
@@ -121,9 +119,7 @@ Then you can generate an image like:
 ```py
 from diffusers import DiffusionPipeline

-pipeline = DiffusionPipeline.from_pretrained(
-    "sayakpaul/textual-inversion-cat-kerascv_sd_diffusers_pipeline", use_safetensors=True
-)
+pipeline = DiffusionPipeline.from_pretrained("sayakpaul/textual-inversion-cat-kerascv_sd_diffusers_pipeline")
 pipeline.to("cuda")

 placeholder_token = "<my-funny-cat-token>"
@@ -175,12 +171,22 @@ images = pipeline(
 ).images
 ```

-Display the images:
+Finally, create a helper function to display the images:

 ```py
-from diffusers.utils import make_image_grid
+from PIL import Image

-make_image_grid(images, 2, 2)
+
+def image_grid(imgs, rows=2, cols=2):
+    w, h = imgs[0].size
+    grid = Image.new("RGB", size=(cols * w, rows * h))
+
+    for i, img in enumerate(imgs):
+        grid.paste(img, box=(i % cols * w, i // cols * h))
+    return grid
+
+
+image_grid(images)
 ```

 <div class="flex justify-center">
--- a/docs/source/en/using-diffusers/pipeline_overview.md
+++ b/docs/source/en/using-diffusers/pipeline_overview.md
@@ -12,6 +12,6 @@ specific language governing permissions and limitations under the License.

 # Overview

-A pipeline is an end-to-end class that provides a quick and easy way to use a diffusion system for inference by bundling independently trained models and schedulers together. Certain combinations of models and schedulers define specific pipeline types, like [`StableDiffusionXLPipeline`] or [`StableDiffusionControlNetPipeline`], with specific capabilities. All pipeline types inherit from the base [`DiffusionPipeline`] class; pass it any checkpoint, and it'll automatically detect the pipeline type and load the necessary components.
+A pipeline is an end-to-end class that provides a quick and easy way to use a diffusion system for inference by bundling independently trained models and schedulers together. Certain combinations of models and schedulers define specific pipeline types, like [`StableDiffusionPipeline`] or [`StableDiffusionControlNetPipeline`], with specific capabilities. All pipeline types inherit from the base [`DiffusionPipeline`] class; pass it any checkpoint, and it'll automatically detect the pipeline type and load the necessary components.

-This section introduces you to some of the more complex pipelines like Stable Diffusion XL, ControlNet, and DiffEdit, which require additional inputs. You'll also learn how to use a distilled version of the Stable Diffusion model to speed up inference, how to control randomness on your hardware when generating images, and how to create a community pipeline for a custom task like generating images from speech.
+This section introduces you to some of the tasks supported by our pipelines such as unconditional image generation and different techniques and variations of text-to-image generation. You'll also learn how to gain more control over the generation process by setting a seed for reproducibility and weighting prompts to adjust the influence certain words in the prompt have over the output. Finally, you'll see how you can create a community pipeline for a custom task like generating images from speech.
--- a/docs/source/en/using-diffusers/push_to_hub.md
+++ b/docs/source/en/using-diffusers/push_to_hub.md
@@ -1,171 +0,0 @@
-# Push files to the Hub
-
-[[open-in-colab]]
-
-🤗 Diffusers provides a [`~diffusers.utils.PushToHubMixin`] for uploading your model, scheduler, or pipeline to the Hub. It is an easy way to store your files on the Hub, and also allows you to share your work with others. Under the hood, the [`~diffusers.utils.PushToHubMixin`]:
-
-1. creates a repository on the Hub
-2. saves your model, scheduler, or pipeline files so they can be reloaded later
-3. uploads folder containing these files to the Hub
-
-This guide will show you how to use the [`~diffusers.utils.PushToHubMixin`] to upload your files to the Hub.
-
-You'll need to log in to your Hub account with your access [token](https://huggingface.co/settings/tokens) first:
-
-```py
-from huggingface_hub import notebook_login
-
-notebook_login()
-```
-
-## Models
-
-To push a model to the Hub, call [`~diffusers.utils.PushToHubMixin.push_to_hub`] and specfiy the repository id of the model to be stored on the Hub:
-
-```py
-from diffusers import ControlNetModel
-
-controlnet = ControlNetModel(
-    block_out_channels=(32, 64),
-    layers_per_block=2,
-    in_channels=4,
-    down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
-    cross_attention_dim=32,
-    conditioning_embedding_out_channels=(16, 32),
-)
-controlnet.push_to_hub("my-controlnet-model")
-```
-
-For model's, you can also specify the [*variant*](loading#checkpoint-variants) of the weights to push to the Hub. For example, to push `fp16` weights:
-
-```py
-controlnet.push_to_hub("my-controlnet-model", variant="fp16")
-```
-
-The [`~diffusers.utils.PushToHubMixin.push_to_hub`] function saves the model's `config.json` file and the weights are automatically saved in the `safetensors` format.
-
-Now you can reload the model from your repository on the Hub:
-
-```py
-model = ControlNetModel.from_pretrained("your-namespace/my-controlnet-model")
-```
-
-## Scheduler
-
-To push a scheduler to the Hub, call [`~diffusers.utils.PushToHubMixin.push_to_hub`] and specfiy the repository id of the scheduler to be stored on the Hub:
-
-```py
-from diffusers import DDIMScheduler
-
-scheduler = DDIMScheduler(
-    beta_start=0.00085,
-    beta_end=0.012,
-    beta_schedule="scaled_linear",
-    clip_sample=False,
-    set_alpha_to_one=False,
-)
-scheduler.push_to_hub("my-controlnet-scheduler")
-```
-
-The [`~diffusers.utils.PushToHubMixin.push_to_hub`] function saves the scheduler's `scheduler_config.json` file to the specified repository.
-
-Now you can reload the scheduler from your repository on the Hub:
-
-```py
-scheduler = DDIMScheduler.from_pretrained("your-namepsace/my-controlnet-scheduler")
-```
-
-## Pipeline
-
-You can also push an entire pipeline with all it's components to the Hub. For example, initialize the components of a [`StableDiffusionPipeline`] with the parameters you want:
-
-```py
-from diffusers import (
-    UNet2DConditionModel,
-    AutoencoderKL,
-    DDIMScheduler,
-    StableDiffusionPipeline,
-)
-from transformers import CLIPTextModel, CLIPTextConfig, CLIPTokenizer
-
-unet = UNet2DConditionModel(
-    block_out_channels=(32, 64),
-    layers_per_block=2,
-    sample_size=32,
-    in_channels=4,
-    out_channels=4,
-    down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
-    up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
-    cross_attention_dim=32,
-)
-
-scheduler = DDIMScheduler(
-    beta_start=0.00085,
-    beta_end=0.012,
-    beta_schedule="scaled_linear",
-    clip_sample=False,
-    set_alpha_to_one=False,
-)
-
-vae = AutoencoderKL(
-    block_out_channels=[32, 64],
-    in_channels=3,
-    out_channels=3,
-    down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
-    up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
-    latent_channels=4,
-)
-
-text_encoder_config = CLIPTextConfig(
-    bos_token_id=0,
-    eos_token_id=2,
-    hidden_size=32,
-    intermediate_size=37,
-    layer_norm_eps=1e-05,
-    num_attention_heads=4,
-    num_hidden_layers=5,
-    pad_token_id=1,
-    vocab_size=1000,
-)
-text_encoder = CLIPTextModel(text_encoder_config)
-tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
-```
-
-Pass all of the components to the [`StableDiffusionPipeline`] and call [`~diffusers.utils.PushToHubMixin.push_to_hub`] to push the pipeline to the Hub:
-
-```py
-components = {
-    "unet": unet,
-    "scheduler": scheduler,
-    "vae": vae,
-    "text_encoder": text_encoder,
-    "tokenizer": tokenizer,
-    "safety_checker": None,
-    "feature_extractor": None,
-}
-
-pipeline = StableDiffusionPipeline(**components)
-pipeline.push_to_hub("my-pipeline")
-```
-
-The [`~diffusers.utils.PushToHubMixin.push_to_hub`] function saves each component to a subfolder in the repository. Now you can reload the pipeline from your repository on the Hub:
-
-```py
-pipeline = StableDiffusionPipeline.from_pretrained("your-namespace/my-pipeline")
-```
-
-## Privacy
-
-Set `private=True` in the [`~diffusers.utils.PushToHubMixin.push_to_hub`] function to keep your model, scheduler, or pipeline files private:
-
-```py
-controlnet.push_to_hub("my-controlnet-model", private=True)
-```
-
-Private repositories are only visible to you, and other users won't be able to clone the repository and your repository won't appear in search results. Even if a user has the URL to your private repository, they'll receive a `404 - Repo not found error.`
-
-To load a model, scheduler, or pipeline from a private or gated repositories, set `use_auth_token=True`:
-
-```py
-model = ControlNet.from_pretrained("your-namespace/my-controlnet-model", use_auth_token=True)
-```
--- a/docs/source/en/using-diffusers/reproducibility.md
+++ b/docs/source/en/using-diffusers/reproducibility.md
@@ -40,7 +40,7 @@ import numpy as np
 model_id = "google/ddpm-cifar10-32"

 # load model and scheduler
-ddim = DDIMPipeline.from_pretrained(model_id, use_safetensors=True)
+ddim = DDIMPipeline.from_pretrained(model_id)

 # run pipeline for just two steps and return numpy tensor
 image = ddim(num_inference_steps=2, output_type="np").images
@@ -65,7 +65,7 @@ import numpy as np
 model_id = "google/ddpm-cifar10-32"

 # load model and scheduler
-ddim = DDIMPipeline.from_pretrained(model_id, use_safetensors=True)
+ddim = DDIMPipeline.from_pretrained(model_id)

 # create a generator for reproducibility
 generator = torch.Generator(device="cpu").manual_seed(0)
@@ -100,7 +100,7 @@ import numpy as np
 model_id = "google/ddpm-cifar10-32"

 # load model and scheduler
-ddim = DDIMPipeline.from_pretrained(model_id, use_safetensors=True)
+ddim = DDIMPipeline.from_pretrained(model_id)
 ddim.to("cuda")

 # create a generator for reproducibility
@@ -125,7 +125,7 @@ import numpy as np
 model_id = "google/ddpm-cifar10-32"

 # load model and scheduler
-ddim = DDIMPipeline.from_pretrained(model_id, use_safetensors=True)
+ddim = DDIMPipeline.from_pretrained(model_id)
 ddim.to("cuda")

 # create a generator for reproducibility; notice you don't place it on the GPU!
@@ -174,7 +174,7 @@ from diffusers import DDIMScheduler, StableDiffusionPipeline
 import numpy as np

 model_id = "runwayml/stable-diffusion-v1-5"
-pipe = StableDiffusionPipeline.from_pretrained(model_id, use_safetensors=True).to("cuda")
+pipe = StableDiffusionPipeline.from_pretrained(model_id).to("cuda")
 pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
 g = torch.Generator(device="cuda")

--- a/docs/source/en/using-diffusers/reusing_seeds.md
+++ b/docs/source/en/using-diffusers/reusing_seeds.md
@@ -27,9 +27,7 @@ Instantiate a pipeline with [`DiffusionPipeline.from_pretrained`] and place it o
 ```python
 >>> from diffusers import DiffusionPipeline

->>> pipe = DiffusionPipeline.from_pretrained(
-...     "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16, use_safetensors=True
-... )
+>>> pipe = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16)
 >>> pipe = pipe.to("cuda")
 ```

--- a/docs/source/en/using-diffusers/schedulers.md
+++ b/docs/source/en/using-diffusers/schedulers.md
@@ -39,9 +39,7 @@ import torch

 login()

-pipeline = DiffusionPipeline.from_pretrained(
-    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16, use_safetensors=True
-)
+pipeline = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16)
 ```

 Next, we move it to GPU:
--- a/docs/source/en/using-diffusers/sdxl.md
+++ b/docs/source/en/using-diffusers/sdxl.md
@@ -1,429 +0,0 @@
-# Stable Diffusion XL
-
-[[open-in-colab]]
-
-[Stable Diffusion XL](https://huggingface.co/papers/2307.01952) (SDXL) is a powerful text-to-image generation model that iterates on the previous Stable Diffusion models in three key ways:
-
-1. the UNet is 3x larger and SDXL combines a second text encoder (OpenCLIP ViT-bigG/14) with the original text encoder to significantly increase the number of parameters
-2. introduces size and crop-conditioning to preserve training data from being discarded and gain more control over how a generated image should be cropped
-3. introduces a two-stage model process; the *base* model (can also be run as a standalone model) generates an image as an input to the *refiner* model which adds additional high-quality details
-
-This guide will show you how to use SDXL for text-to-image, image-to-image, and inpainting.
-
-Before you begin, make sure you have the following libraries installed:
-
-```py
-# uncomment to install the necessary libraries in Colab
-#!pip install diffusers transformers accelerate safetensors omegaconf invisible-watermark>=0.2.0
-```
-
-<Tip warning={true}>
-
-We recommend installing the [invisible-watermark](https://pypi.org/project/invisible-watermark/) library to help identify images that are generated. If the invisible-watermark library is installed, it is used by default. To disable the watermarker:
-
-```py
-pipeline = StableDiffusionXLPipeline.from_pretrained(..., add_watermarker=False)
-```
-
-</Tip>
-
-## Load model checkpoints
-
-Model weights may be stored in separate subfolders on the Hub or locally, in which case, you should use the [`~StableDiffusionXLPipeline.from_pretrained`] method:
-
-```py
-from diffusers import StableDiffusionXLPipeline, StableDiffusionXLImg2ImgPipeline
-import torch
-
-pipeline = StableDiffusionXLPipeline.from_pretrained(
-    "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16, variant="fp16", use_safetensors=True
-).to("cuda")
-
-refiner = StableDiffusionXLImg2ImgPipeline.from_single_file(
-    "stabilityai/stable-diffusion-xl-refiner-1.0", torch_dtype=torch.float16, use_safetensors=True, variant="fp16"
-).to("cuda")
-```
-
-You can also use the [`~StableDiffusionXLPipeline.from_single_file`] method to load a model checkpoint stored in a single file format (`.ckpt` or `.safetensors`) from the Hub or locally:
-
-```py
-from diffusers import StableDiffusionXLPipeline, StableDiffusionXLImg2ImgPipeline
-import torch
-
-pipeline = StableDiffusionXLPipeline.from_single_file(
-    "https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/blob/main/sd_xl_base_1.0.safetensors", torch_dtype=torch.float16, variant="fp16", use_safetensors=True
-).to("cuda")
-
-refiner = StableDiffusionXLImg2ImgPipeline.from_single_file(
-    "https://huggingface.co/stabilityai/stable-diffusion-xl-refiner-1.0/blob/main/sd_xl_refiner_1.0.safetensors", torch_dtype=torch.float16, use_safetensors=True, variant="fp16"
-).to("cuda")
-```
-
-## Text-to-image
-
-For text-to-image, pass a text prompt:
-
-```py
-from diffusers import AutoPipelineForText2Image
-import torch
-
-pipeline_text2image = AutoPipelineForText2Image.from_pretrained(
-    "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16, variant="fp16", use_safetensors=True
-).to("cuda")
-
-prompt = "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k"
-image = pipeline(prompt=prompt).images[0]
-```
-
-<div class="flex justify-center">
-    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/sdxl-text2img.png" alt="generated image of an astronaut in a jungle"/>
-</div>
-
-## Image-to-image
-
-For image-to-image, SDXL works especially well with image sizes between 768x768 and 1024x1024. Pass an initial image, and a text prompt to condition the image with:
-
-```py
-from diffusers import AutoPipelineForImg2Img
-from diffusers.utils import load_image
-
-# use from_pipe to avoid consuming additional memory when loading a checkpoint
-pipeline = AutoPipelineForImage2Image.from_pipe(pipeline_text2image).to("cuda")
-url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/sdxl-img2img.png"
-
-init_image = load_image(url).convert("RGB")
-prompt = "a dog catching a frisbee in the jungle"
-image = pipeline(prompt, image=init_image, strength=0.8, guidance_scale=10.5).images[0]
-```
-
-<div class="flex justify-center">
-    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/sdxl-img2img.png" alt="generated image of a dog catching a frisbee in a jungle"/>
-</div>
-
-## Inpainting
-
-For inpainting, you'll need the original image and a mask of what you want to replace in the original image. Create a prompt to describe what you want to replace the masked area with.
-
-```py
-from diffusers import AutoPipelineForInpainting
-from diffusers.utils import load_image
-
-# use from_pipe to avoid consuming additional memory when loading a checkpoint
-pipeline = AutoPipelineForInpainting.from_pipe(pipeline_text2image).to("cuda")
-
-img_url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/sdxl-text2img.png"
-mask_url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/sdxl-inpaint-mask.png"
-
-init_image = load_image(img_url).convert("RGB")
-mask_image = load_image(mask_url).convert("RGB")
-
-prompt = "A deep sea diver floating"
-image = pipeline(prompt=prompt, image=init_image, mask_image=mask_image, strength=0.85, guidance_scale=12.5).images[0]
-```
-
-<div class="flex justify-center">
-    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/sdxl-inpaint.png" alt="generated image of a deep sea diver in a jungle"/>
-</div>
-
-## Refine image quality
-
-SDXL includes a [refiner model](https://huggingface.co/stabilityai/stable-diffusion-xl-refiner-1.0) specialized in denoising low-noise stage images to generate higher-quality images from the base model. There are two ways to use the refiner:
-
-1. use the base and refiner model together to produce a refined image
-2. use the base model to produce an image, and subsequently use the refiner model to add more details to the image (this is how SDXL is originally trained)
-
-### Base + refiner model
-
-When you use the base and refiner model together to generate an image, this is known as an ([*ensemble of expert denoisers*](https://research.nvidia.com/labs/dir/eDiff-I/)). The ensemble of expert denoisers approach requires less overall denoising steps versus passing the base model's output to the refiner model, so it should be significantly faster to run. However, you won't be able to inspect the base model's output because it still contains a large amount of noise.
-
-As an ensemble of expert denoisers, the base model serves as the expert during the high-noise diffusion stage and the refiner model serves as the expert during the low-noise diffusion stage. Load the base and refiner model:
-
-```py
-from diffusers import DiffusionPipeline
-import torch
-
-base = DiffusionPipeline.from_pretrained(
-    "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16, variant="fp16", use_safetensors=True
-).to("cuda")
-
-refiner = DiffusionPipeline.from_pretrained(
-    "stabilityai/stable-diffusion-xl-refiner-1.0",
-    text_encoder_2=base.text_encoder_2,
-    vae=base.vae,
-    torch_dtype=torch.float16,
-    use_safetensors=True,
-    variant="fp16",
-).to("cuda")
-```
-
-To use this approach, you need to define the number of timesteps for each model to run through their respective stages. For the base model, this is controlled by the [`denoising_end`](https://huggingface.co/docs/diffusers/main/en/api/pipelines/stable_diffusion/stable_diffusion_xl#diffusers.StableDiffusionXLPipeline.__call__.denoising_end) parameter and for the refiner model, it is controlled by the [`denoising_start`](https://huggingface.co/docs/diffusers/main/en/api/pipelines/stable_diffusion/stable_diffusion_xl#diffusers.StableDiffusionXLImg2ImgPipeline.__call__.denoising_start) parameter.
-
-<Tip>
-
-The `denoising_end` and `denoising_start` parameters should be a float between 0 and 1. These parameters are represented as a proportion of discrete timesteps as defined by the scheduler. If you're also using the `strength` parameter, it'll be ignored because the number of denoising steps is determined by the discrete timesteps the model is trained on and the declared fractional cutoff.
-
-</Tip>
-
-Let's set `denoising_end=0.8` so the base model performs the first 80% of denoising the **high-noise** timesteps and set `denoising_start=0.8` so the refiner model performs the last 20% of denoising the **low-noise** timesteps. The base model output should be in **latent** space instead of a PIL image.
-
-```py
-prompt = "A majestic lion jumping from a big stone at night"
-
-image = base(
-    prompt=prompt,
-    num_inference_steps=40,
-    denoising_end=0.8,
-    output_type="latent",
-).images
-image = refiner(
-    prompt=prompt,
-    num_inference_steps=40,
-    denoising_start=0.8,
-    image=image,
-).images[0]
-```
-
-<div class="flex gap-4">
-  <div>
-    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/lion_base.png" alt="generated image of a lion on a rock at night" />
-    <figcaption class="mt-2 text-center text-sm text-gray-500">base model</figcaption>
-  </div>
-  <div>
-    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/lion_refined.png" alt="generated image of a lion on a rock at night in higher quality" />
-    <figcaption class="mt-2 text-center text-sm text-gray-500">ensemble of expert denoisers</figcaption>
-  </div>
-</div>
-
-The refiner model can also be used for inpainting in the [`StableDiffusionXLInpaintPipeline`]:
-
-```py
-from diffusers import StableDiffusionXLInpaintPipeline
-from diffusers.utils import load_image
-
-base = StableDiffusionXLInpaintPipeline.from_pretrained(
-    "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16, variant="fp16", use_safetensors=True
-).to("cuda")
-
-refiner = StableDiffusionXLInpaintPipeline.from_pretrained(
-    "stabilityai/stable-diffusion-xl-refiner-1.0",
-    text_encoder_2=pipe.text_encoder_2,
-    vae=pipe.vae,
-    torch_dtype=torch.float16,
-    use_safetensors=True,
-    variant="fp16",
-).to("cuda")
-
-img_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo.png"
-mask_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo_mask.png"
-
-init_image = load_image(img_url).convert("RGB")
-mask_image = load_image(mask_url).convert("RGB")
-
-prompt = "A majestic tiger sitting on a bench"
-num_inference_steps = 75
-high_noise_frac = 0.7
-
-image = base(
-    prompt=prompt,
-    image=init_image,
-    mask_image=mask_image,
-    num_inference_steps=num_inference_steps,
-    denoising_end=high_noise_frac,
-    output_type="latent",
-).images
-image = refiner(
-    prompt=prompt,
-    image=image,
-    mask_image=mask_image,
-    num_inference_steps=num_inference_steps,
-    denoising_start=high_noise_frac,
-).images[0]
-```
-
-This ensemble of expert denoisers method works well for all available schedulers!
-
-### Base to refiner model
-
-SDXL gets a boost in image quality by using the refiner model to add additional high-quality details to the fully-denoised image from the base model, in an image-to-image setting.
-
-Load the base and refiner models:
-
-```py
-from diffusers import DiffusionPipeline
-import torch
-
-base = DiffusionPipeline.from_pretrained(
-    "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16, variant="fp16", use_safetensors=True
-).to("cuda")
-
-refiner = DiffusionPipeline.from_pretrained(
-    "stabilityai/stable-diffusion-xl-refiner-1.0",
-    text_encoder_2=pipe.text_encoder_2,
-    vae=pipe.vae,
-    torch_dtype=torch.float16,
-    use_safetensors=True,
-    variant="fp16",
-).to("cuda")
-```
-
-Generate an image from the base model, and set the model output to **latent** space:
-
-```py
-prompt = "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k"
-
-image = base(prompt=prompt, output_type="latent").images[0]
-```
-
-Pass the generated image to the refiner model:
-
-```py
-image = refiner(prompt=prompt, image=image[None, :]).images[0]
-```
-
-<div class="flex gap-4">
-  <div>
-    <img class="rounded-xl" src="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/sd_xl/init_image.png" alt="generated image of an astronaut riding a green horse on Mars" />
-    <figcaption class="mt-2 text-center text-sm text-gray-500">base model</figcaption>
-  </div>
-  <div>
-    <img class="rounded-xl" src="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/sd_xl/refined_image.png" alt="higher quality generated image of an astronaut riding a green horse on Mars" />
-    <figcaption class="mt-2 text-center text-sm text-gray-500">base model + refiner model</figcaption>
-  </div>
-</div>
-
-For inpainting, load the refiner model in the [`StableDiffusionXLInpaintPipeline`], remove the `denoising_end` and `denoising_start` parameters, and choose a smaller number of inference steps for the refiner.
-
-## Micro-conditioning
-
-SDXL training involves several additional conditioning techniques, which are referred to as *micro-conditioning*. These include original image size, target image size, and cropping parameters. The micro-conditionings can be used at inference time to create high-quality, centered images.
-
-<Tip>
-
-You can use both micro-conditioning and negative micro-conditioning parameters thanks to classifier-free guidance. They are available in the [`StableDiffusionXLPipeline`], [`StableDiffusionXLImg2ImgPipeline`], [`StableDiffusionXLInpaintPipeline`], and [`StableDiffusionXLControlNetPipeline`].
-
-</Tip>
-
-### Size conditioning
-
-There are two types of size conditioning:
-
-- [`original_size`](https://huggingface.co/docs/diffusers/main/en/api/pipelines/stable_diffusion/stable_diffusion_xl#diffusers.StableDiffusionXLPipeline.__call__.original_size) conditioning comes from upscaled images in the training batch (because it would be wasteful to discard the smaller images which make up almost 40% of the total training data). This way, SDXL learns that upscaling artifacts are not supposed to be present in high-resolution images. During inference, you can use `original_size` to indicate the original image resolution. Using the default value of `(1024, 1024)` produces higher-quality images that resemble the 1024x1024 images in the dataset. If you choose to use a lower resolution, such as `(256, 256)`, the model still generates 1024x1024 images, but they'll look like the low resolution images (simpler patterns, blurring) in the dataset.
-
-- [`target_size`](https://huggingface.co/docs/diffusers/main/en/api/pipelines/stable_diffusion/stable_diffusion_xl#diffusers.StableDiffusionXLPipeline.__call__.target_size) conditioning comes from finetuning SDXL to support different image aspect ratios. During inference, if you use the default value of `(1024, 1024)`, you'll get an image that resembles the composition of square images in the dataset. We recommend using the same value for `target_size` and `original_size`, but feel free to experiment with other options!
-
-🤗 Diffusers also lets you specify negative conditions about an image's size to steer generation away from certain image resolutions:
-
-```py
-from diffusers import StableDiffusionXLPipeline
-import torch
-
-pipe = StableDiffusionXLPipeline.from_pretrained(
-    "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16, variant="fp16", use_safetensors=True
-).to("cuda")
-
-prompt = "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k"
-image = pipe(
-    prompt=prompt,
-    negative_original_size=(512, 512),
-    negative_target_size=(1024, 1024),
-).images[0]
-```
-
-<div class="flex flex-col justify-center">
-  <img src="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/sd_xl/negative_conditions.png"/>
-  <figcaption class="text-center">Images negative conditioned on image resolutions of (128, 128), (256, 256), and (512, 512).</figcaption>
-</div>
-
-### Crop conditioning
-
-Images generated by previous Stable Diffusion models may sometimes appear to be cropped. This is because images are actually cropped during training so that all the images in a batch have the same size. By conditioning on crop coordinates, SDXL *learns* that no cropping - coordinates `(0, 0)` - usually correlates with centered subjects and complete faces (this is the default value in 🤗 Diffusers). You can experiment with different coordinates if you want to generate off-centered compositions!
-
-```py
-from diffusers import StableDiffusionXLPipeline
-import torch
-
-
-pipeline = StableDiffusionXLPipeline.from_pretrained(
-    "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16, variant="fp16", use_safetensors=True
-).to("cuda")
-
-prompt = "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k"
-image = pipeline(prompt=prompt, crops_coords_top_left=(256,0)).images[0]
-```
-
-<div class="flex justify-center">
-    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/sdxl-cropped.png" alt="generated image of an astronaut in a jungle, slightly cropped"/>
-</div>
-
-You can also specify negative cropping coordinates to steer generation away from certain cropping parameters:
-
-```py
-from diffusers import StableDiffusionXLPipeline
-import torch
-
-pipe = StableDiffusionXLPipeline.from_pretrained(
-    "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16, variant="fp16", use_safetensors=True
-).to("cuda")
-
-prompt = "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k"
-image = pipe(
-    prompt=prompt,
-    negative_original_size=(512, 512),
-    negative_crops_coords_top_left=(0, 0),
-    negative_target_size=(1024, 1024),
-).images[0]
-```
-
-## Use a different prompt for each text-encoder
-
-SDXL uses two text-encoders, so it is possible to pass a different prompt to each text-encoder, which can [improve quality](https://github.com/huggingface/diffusers/issues/4004#issuecomment-1627764201). Pass your original prompt to `prompt` and the second prompt to `prompt_2` (use `negative_prompt` and `negative_prompt_2` if you're using a negative prompts):
-
-```py
-from diffusers import StableDiffusionXLPipeline
-import torch
-
-pipeline = StableDiffusionXLPipeline.from_pretrained(
-    "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16, variant="fp16", use_safetensors=True
-).to("cuda")
-
-# prompt is passed to OAI CLIP-ViT/L-14
-prompt = "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k"
-# prompt_2 is passed to OpenCLIP-ViT/bigG-14
-prompt_2 = "Van Gogh painting"
-image = pipeline(prompt=prompt, prompt_2=prompt_2).images[0]
-```
-
-<div class="flex justify-center">
-    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/sdxl-double-prompt.png" alt="generated image of an astronaut in a jungle in the style of a van gogh painting"/>
-</div>
-
-## Optimizations
-
-SDXL is a large model, and you may need to optimize memory to get it to run on your hardware. Here are some tips to save memory and speed up inference.
-
-1. Offload the model to the CPU with [`~StableDiffusionXLPipeline.enable_model_cpu_offload`] for out-of-memory errors:
-
-```diff
-- base.to("cuda")
-- refiner.to("cuda")
-+ base.enable_model_cpu_offload
-+ refiner.enable_model_cpu_offload
-```
-
-2. Use `torch.compile` for ~20% speed-up (you need `torch>2.0`):
-
-```diff
-+ base.unet = torch.compile(base.unet, mode="reduce-overhead", fullgraph=True)
-+ refiner.unet = torch.compile(refiner.unet, mode="reduce-overhead", fullgraph=True)
-```
-
-3. Enable [xFormers](/optimization/xformers) to run SDXL if `torch<2.0`:
-
-```diff
-+ base.enable_xformers_memory_efficient_attention()
-+ refiner.enable_xformers_memory_efficient_attention()
-```
-
-## Other resources
-
-If you're interested in experimenting with a minimal version of the [`UNet2DConditionModel`] used in SDXL, take a look at the [minSDXL](https://github.com/cloneofsimo/minSDXL) implementation which is written in PyTorch and directly compatible with 🤗 Diffusers.
--- a/docs/source/en/using-diffusers/stable_diffusion_jax_how_to.md
+++ b/docs/source/en/using-diffusers/stable_diffusion_jax_how_to.md
@@ -153,10 +153,19 @@ images = pipeline.numpy_to_pil(images)

 ### Visualization

-```python
-from diffusers import make_image_grid
+Let's create a helper function to display images in a grid.

-make_image_grid(images, 2, 4)
+```python
+def image_grid(imgs, rows, cols):
+    w, h = imgs[0].size
+    grid = Image.new("RGB", size=(cols * w, rows * h))
+    for i, img in enumerate(imgs):
+        grid.paste(img, box=(i % cols * w, i // cols * h))
+    return grid
+```
+
+```python
+image_grid(images, 2, 4)
 ```

 ![img](https://huggingface.co/datasets/YiYiXu/test-doc-assets/resolve/main/stable_diffusion_jax_how_to_cell_38_output_0.jpeg)
@@ -189,7 +198,7 @@ images = pipeline(prompt_ids, p_params, rng, jit=True).images
 images = images.reshape((images.shape[0] * images.shape[1],) + images.shape[-3:])
 images = pipeline.numpy_to_pil(images)

-make_image_grid(images, 2, 4)
+image_grid(images, 2, 4)
 ```

 ![img](https://huggingface.co/datasets/YiYiXu/test-doc-assets/resolve/main/stable_diffusion_jax_how_to_cell_43_output_0.jpeg)
--- a/docs/source/en/using-diffusers/textual_inversion_inference.md
+++ b/docs/source/en/using-diffusers/textual_inversion_inference.md
@@ -14,7 +14,7 @@ from huggingface_hub import notebook_login
 notebook_login()
 ```

-Import the necessary libraries:
+Import the necessary libraries, and create a helper function to visualize the generated images:

 ```py
 import os
@@ -24,8 +24,19 @@ import PIL
 from PIL import Image

 from diffusers import StableDiffusionPipeline
-from diffusers.utils import make_image_grid
 from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
+
+
+def image_grid(imgs, rows, cols):
+    assert len(imgs) == rows * cols
+
+    w, h = imgs[0].size
+    grid = Image.new("RGB", size=(cols * w, rows * h))
+    grid_w, grid_h = grid.size
+
+    for i, img in enumerate(imgs):
+        grid.paste(img, box=(i % cols * w, i // cols * h))
+    return grid
 ```

 Pick a Stable Diffusion checkpoint and a pre-learned concept from the [Stable Diffusion Conceptualizer](https://huggingface.co/spaces/sd-concepts-library/stable-diffusion-conceptualizer):
@@ -38,9 +49,7 @@ repo_id_embeds = "sd-concepts-library/cat-toy"
 Now you can load a pipeline, and pass the pre-learned concept to it:

 ```py
-pipeline = StableDiffusionPipeline.from_pretrained(
-    pretrained_model_name_or_path, torch_dtype=torch.float16, use_safetensors=True
-).to("cuda")
+pipeline = StableDiffusionPipeline.from_pretrained(pretrained_model_name_or_path, torch_dtype=torch.float16).to("cuda")

 pipeline.load_textual_inversion(repo_id_embeds)
 ```
@@ -62,7 +71,7 @@ for _ in range(num_rows):
    images = pipe(prompt, num_images_per_prompt=num_samples, num_inference_steps=50, guidance_scale=7.5).images
    all_images.extend(images)

-grid = make_image_grid(all_images, num_samples, num_rows)
+grid = image_grid(all_images, num_samples, num_rows)
 grid
 ```

--- a/docs/source/en/using-diffusers/unconditional_image_generation.md
+++ b/docs/source/en/using-diffusers/unconditional_image_generation.md
@@ -32,7 +32,7 @@ In this guide, you'll use [`DiffusionPipeline`] for unconditional image generati
 ```python
 >>> from diffusers import DiffusionPipeline

->>> generator = DiffusionPipeline.from_pretrained("anton-l/ddpm-butterflies-128", use_safetensors=True)
+>>> generator = DiffusionPipeline.from_pretrained("anton-l/ddpm-butterflies-128")
 ```

 The [`DiffusionPipeline`] downloads and caches all modeling, tokenization, and scheduling components. 
--- a/docs/source/en/using-diffusers/using_safetensors.md
+++ b/docs/source/en/using-diffusers/using_safetensors.md
@@ -40,9 +40,7 @@ You can use the model with the new `.safetensors` weights by specifying the refe
 ```py
 from diffusers import DiffusionPipeline

-pipeline = DiffusionPipeline.from_pretrained(
-    "stabilityai/stable-diffusion-2-1", revision="refs/pr/22", use_safetensors=True
-)
+pipeline = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-1", revision="refs/pr/22")
 ```

 ## Why use safetensors?
@@ -57,7 +55,7 @@ There are several reasons for using safetensors:
 	```py
 from diffusers import StableDiffusionPipeline

- pipeline = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-1", use_safetensors=True)
+ pipeline = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-1")
 "Loaded in safetensors 0:00:02.033658"
 "Loaded in PyTorch 0:00:02.663379"
 	```
--- a/docs/source/en/using-diffusers/weighted_prompts.md
+++ b/docs/source/en/using-diffusers/weighted_prompts.md
@@ -10,36 +10,31 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
 specific language governing permissions and limitations under the License.
 -->

-# Prompt weighting
+# Weighting prompts

 [[open-in-colab]]

-Prompt weighting provides a way to emphasize or de-emphasize certain parts of a prompt, allowing for more control over the generated image. A prompt can include several concepts, which gets turned into contextualized text embeddings. The embeddings are used by the model to condition its cross-attention layers to generate an image (read the Stable Diffusion [blog post](https://huggingface.co/blog/stable_diffusion) to learn more about how it works).
+Text-guided diffusion models generate images based on a given text prompt. The text prompt
+can include multiple concepts that the model should generate, and it's often desirable to give
+certain parts of the prompt more or less weight than others.

-Prompt weighting works by increasing or decreasing the scale of the text embedding vector that corresponds to its concept in the prompt because you may not necessarily want the model to focus on all concepts equally. The easiest way to prepare the prompt-weighted embeddings is to use [Compel](https://github.com/damian0815/compel), a text prompt-weighting and blending library. Once you have the prompt-weighted embeddings, you can pass them to any pipeline that has a [`prompt_embeds`](https://huggingface.co/docs/diffusers/en/api/pipelines/stable_diffusion/text2img#diffusers.StableDiffusionPipeline.__call__.prompt_embeds) (and optionally [`negative_prompt_embeds`](https://huggingface.co/docs/diffusers/en/api/pipelines/stable_diffusion/text2img#diffusers.StableDiffusionPipeline.__call__.negative_prompt_embeds)) parameter, such as [`StableDiffusionPipeline`], [`StableDiffusionControlNetPipeline`], and [`StableDiffusionXLPipeline`].
+Diffusion models work by conditioning their cross-attention layers on contextualized text embeddings (see the [Stable Diffusion Guide](../stable-diffusion) for more information).
+Thus, a simple way to emphasize (or de-emphasize) certain parts of the prompt is to increase or reduce the scale of the text embedding vector that corresponds to the relevant part of the prompt.
+This is called "prompt-weighting" and has been a highly requested feature from the community (see [this issue](https://github.com/huggingface/diffusers/issues/2431)).

-<Tip>
+## How to do prompt-weighting in Diffusers

-If your favorite pipeline doesn't have a `prompt_embeds` parameter, please open an [issue](https://github.com/huggingface/diffusers/issues/new/choose) so we can add it!
+We believe the role of `diffusers` is to be a toolbox that provides essential features that enable other projects, such as [InvokeAI](https://github.com/invoke-ai/InvokeAI) or [diffuzers](https://github.com/abhishekkrthakur/diffuzers), to build powerful UIs. In order to support arbitrary methods to manipulate prompts, `diffusers` exposes a [`prompt_embeds`](https://huggingface.co/docs/diffusers/v0.14.0/en/api/pipelines/stable_diffusion/text2img#diffusers.StableDiffusionPipeline.__call__.prompt_embeds) function argument in many pipelines, such as [`StableDiffusionPipeline`], which lets you pass the "prompt-weighted"/scaled text embeddings directly to the pipeline.

-</Tip>
+The [compel library](https://github.com/damian0815/compel) provides an easy way to emphasize or de-emphasize portions of the prompt for you. We strongly recommend it instead of preparing the embeddings yourself.

-This guide will show you how to weight and blend your prompts with Compel in 🤗 Diffusers.
-
-Before you begin, make sure you have the latest version of Compel installed:
-
-```py
-# uncomment to install in Colab
-#!pip install compel --upgrade
-```
-
-For this guide, let's generate an image with the prompt `"a red cat playing with a ball"` using the [`StableDiffusionPipeline`]:
+Let's look at a simple example. Imagine you want to generate an image of `"a red cat playing with a ball"` as 
+follows:

 ```py
 from diffusers import StableDiffusionPipeline, UniPCMultistepScheduler
-import torch

-pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", use_safetensors=True)
+pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4")
 pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)

 prompt = "a red cat playing with a ball"
@@ -50,13 +45,19 @@ image = pipe(prompt, generator=generator, num_inference_steps=20).images[0]
 image
 ```

-<div class="flex justify-center">
-  <img class="rounded-xl" src="https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/compel/forest_0.png"/>
-</div>
+This gives you:

-## Weighting
+![img](https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/compel/forest_0.png)

-You'll notice there is no "ball" in the image! Let's use compel to upweight the concept of "ball" in the prompt. Create a [`Compel`](https://github.com/damian0815/compel/blob/main/doc/compel.md#compel-objects) object, and pass it a tokenizer and text encoder:
+As you can see, there is no "ball" in the image. Let's emphasize this part!
+
+For this we should install the `compel` library:
+
+```
+pip install compel
+```
+
+and then create a `Compel` object:

 ```py
 from compel import Compel
@@ -64,114 +65,40 @@ from compel import Compel
 compel_proc = Compel(tokenizer=pipe.tokenizer, text_encoder=pipe.text_encoder)
 ```

-compel uses `+` or `-` to increase or decrease the weight of a word in the prompt. To increase the weight of "ball":
-
-<Tip>
-
-`+` corresponds to the value `1.1`, `++` corresponds to `1.1^2`, and so on. Similarly, `-` corresponds to `0.9` and `--` corresponds to `0.9^2`. Feel free to experiment with adding more `+` or `-` in your prompt!
-
-</Tip>
+Now we emphasize the part "ball" with the `"++"` syntax:

 ```py
 prompt = "a red cat playing with a ball++"
 ```

-Pass the prompt to `compel_proc` to create the new prompt embeddings which are passed to the pipeline:
+and instead of passing this to the pipeline directly, we have to process it using `compel_proc`:

 ```py
 prompt_embeds = compel_proc(prompt)
-generator = torch.manual_seed(33)
+```

-image = pipe(prompt_embeds=prompt_embeds, generator=generator, num_inference_steps=20).images[0]
+Now we can pass `prompt_embeds` directly to the pipeline:
+
+```py
+generator = torch.Generator(device="cpu").manual_seed(33)
+
+image = pipe(prompt_embeds=prompt_embeds, generator=generator, num_inference_steps=20).images[0]
 image
 ```

-<div class="flex justify-center">
-  <img class="rounded-xl" src="https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/compel/forest_1.png"/>
-</div>
+We now get the following image which has a "ball"!

-To downweight parts of the prompt, use the `-` suffix:
+![img](https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/compel/forest_1.png)

-```py
-prompt = "a red------- cat playing with a ball"
-prompt_embeds = compel_proc(prompt)
+Similarly, you can de-emphasize parts of the sentence by adding the `--` suffix to words.
+Feel free to give it a try!

-generator = torch.manual_seed(33)
+If your favorite pipeline does not have a `prompt_embeds` input, please open an issue.
+The diffusers team tries to be as responsive as possible.
+
+Compel 1.1.6 adds a utility class to simplify using textual inversions. Instantiate a `DiffusersTextualInversionManager` and pass it to the `Compel` constructor:

-image = pipe(prompt_embeds=prompt_embeds, generator=generator, num_inference_steps=20).images[0]
-image
 ```
-
-<div class="flex justify-center">
-  <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/compel-neg.png"/>
-</div>
-
-You can even up or downweight multiple concepts in the same prompt:
-
-```py
-prompt = "a red cat++ playing with a ball----"
-prompt_embeds = compel_proc(prompt)
-
-generator = torch.manual_seed(33)
-
-image = pipe(prompt_embeds=prompt_embeds, generator=generator, num_inference_steps=20).images[0]
-image
-```
-
-<div class="flex justify-center">
-  <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/compel-pos-neg.png"/>
-</div>
-
-## Blending
-
-You can also create a weighted *blend* of prompts by adding `.blend()` to a list of prompts and passing it some weights. Your blend may not always produce the result you expect because it breaks some assumptions about how the text encoder functions, so just have fun and experiment with it!
-
-```py
-prompt_embeds = compel_proc('("a red cat playing with a ball", "jungle").blend(0.7, 0.8)')
-generator = torch.Generator(device="cuda").manual_seed(33)
-
-image = pipe(prompt_embeds=prompt_embeds, generator=generator, num_inference_steps=20).images[0]
-image
-```
-
-<div class="flex justify-center">
-  <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/compel-blend.png"/>
-</div>
-
-## Conjunction
-
-A conjunction diffuses each prompt independently and concatenates their results by their weighted sum. Add `.and()` to the end of a list of prompts to create a conjunction:
-  
-```py
-prompt_embeds = compel_proc('["a red cat", "playing with a", "ball"].and()')
-generator = torch.Generator(device="cuda").manual_seed(55)
-
-image = pipe(prompt_embeds=prompt_embeds, generator=generator, num_inference_steps=20).images[0]
-image
-```
-
-<div class="flex justify-center">
-  <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/compel-conj.png"/>
-</div>
-
-## Textual inversion
-
-[Textual inversion](../training/text_inversion) is a technique for learning a specific concept from some images which you can use to generate new images conditioned on that concept.
-
-Create a pipeline and use the [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] function to load the textual inversion embeddings (feel free to browse the [Stable Diffusion Conceptualizer](https://huggingface.co/spaces/sd-concepts-library/stable-diffusion-conceptualizer) for 100+ trained concepts):
-
-```py
-import torch
-from diffusers import StableDiffusionPipeline
-from compel import Compel, DiffusersTextualInversionManager
-
-pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16, use_safetensors=True, variant="fp16").to("cuda")
-pipe.load_textual_inversion("sd-concepts-library/midjourney-style")
-```
-
-Compel provides a `DiffusersTextualInversionManager` class to simplify prompt weighting with textual inversion. Instantiate `DiffusersTextualInversionManager` and pass it to the `Compel` class:
-
-```py
 textual_inversion_manager = DiffusersTextualInversionManager(pipe)
 compel = Compel(
    tokenizer=pipe.tokenizer,
@@ -179,87 +106,5 @@ compel = Compel(
    textual_inversion_manager=textual_inversion_manager)
 ```

-Incorporate the concept to condition a prompt with using the `<concept>` syntax:
-
-```py
-prompt_embeds = compel_proc('("A red cat++ playing with a ball <midjourney-style>")')
-
-image = pipe(prompt_embeds=prompt_embeds).images[0]
-image
-```
-
-<div class="flex justify-center">
-  <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/compel-text-inversion.png"/>
-</div>
-
-## DreamBooth
-
-[DreamBooth](../training/dreambooth) is a technique for generating contextualized images of a subject given just a few images of the subject to train on. It is similar to textual inversion, but DreamBooth trains the full model whereas textual inversion only fine-tunes the text embeddings. This means you should use [`~DiffusionPipeline.from_pretrained`] to load the DreamBooth model (feel free to browse the [Stable Diffusion Dreambooth Concepts Library](https://huggingface.co/sd-dreambooth-library) for 100+ trained models):
-
-```py
-import torch
-from diffusers import DiffusionPipeline, UniPCMultistepScheduler
-from compel import Compel
-
-pipe = DiffusionPipeline.from_pretrained("sd-dreambooth-library/dndcoverart-v1", torch_dtype=torch.float16).to("cuda")
-pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
-```
-
-Create a `Compel` class with a tokenizer and text encoder, and pass your prompt to it. Depending on the model you use, you'll need to incorporate the model's unique identifier into your prompt. For example, the `dndcoverart-v1` model uses the identifier `dndcoverart`:
-
-```py
-compel_proc = Compel(tokenizer=pipe.tokenizer, text_encoder=pipe.text_encoder)
-prompt_embeds = compel_proc('("magazine cover of a dndcoverart dragon, high quality, intricate details, larry elmore art style").and()')
-image = pipe(prompt_embeds=prompt_embeds).images[0]
-image
-```
-
-<div class="flex justify-center">
-  <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/compel-dreambooth.png"/>
-</div>
-
-## Stable Diffusion XL
-
-Stable Diffusion XL (SDXL) has two tokenizers and text encoders so it's usage is a bit different. To address this, you should pass both tokenizers and encoders to the `Compel` class:
-
-```py
-from compel import Compel, ReturnedEmbeddingsType
-from diffusers import DiffusionPipeline
-
-pipeline = DiffusionPipeline.from_pretrained(
-  "stabilityai/stable-diffusion-xl-base-1.0",
-  variant="fp16",
-  use_safetensors=True,
-  torch_dtype=torch.float16
-).to("cuda")
-
-compel = Compel(
-  tokenizer=[pipeline.tokenizer, pipeline.tokenizer_2] ,
-  text_encoder=[pipeline.text_encoder, pipeline.text_encoder_2],
-  returned_embeddings_type=ReturnedEmbeddingsType.PENULTIMATE_HIDDEN_STATES_NON_NORMALIZED,
-  requires_pooled=[False, True]
-)
-```
-
-This time, let's upweight "ball" by a factor of 1.5 for the first prompt, and downweight "ball" by 0.6 for the second prompt. The [`StableDiffusionXLPipeline`] also requires [`pooled_prompt_embeds`](https://huggingface.co/docs/diffusers/en/api/pipelines/stable_diffusion/stable_diffusion_xl#diffusers.StableDiffusionXLInpaintPipeline.__call__.pooled_prompt_embeds) (and optionally [`negative_pooled_prompt_embeds`](https://huggingface.co/docs/diffusers/en/api/pipelines/stable_diffusion/stable_diffusion_xl#diffusers.StableDiffusionXLInpaintPipeline.__call__.negative_pooled_prompt_embeds)) so you should pass those to the pipeline along with the conditioning tensors:
-
-```py
-# apply weights
-prompt = ["a red cat playing with a (ball)1.5", "a red cat playing with a (ball)0.6"]
-conditioning, pooled = compel(prompt)
-
-# generate image
-generator = [torch.Generator().manual_seed(33) for _ in range(len(prompt))]
-images = pipeline(prompt_embeds=conditioning, pooled_prompt_embeds=pooled, generator=generator, num_inference_steps=30).images
-```
-
-<div class="flex gap-4">
-  <div>
-    <img class="rounded-xl" src="https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/compel/sdxl_ball1.png"/>
-    <figcaption class="mt-2 text-center text-sm text-gray-500">"a red cat playing with a (ball)1.5"</figcaption>
-  </div>
-  <div>
-    <img class="rounded-xl" src="https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/compel/sdxl_ball2.png"/>
-    <figcaption class="mt-2 text-center text-sm text-gray-500">"a red cat playing with a (ball)0.6"</figcaption>
-  </div>
-</div>
+Also, please check out the documentation of the [compel](https://github.com/damian0815/compel) library for 
+more information.
--- a/docs/source/en/using-diffusers/write_own_pipeline.md
+++ b/docs/source/en/using-diffusers/write_own_pipeline.md
@@ -25,7 +25,7 @@ A pipeline is a quick and easy way to run a model for inference, requiring no mo
 ```py
 >>> from diffusers import DDPMPipeline

->>> ddpm = DDPMPipeline.from_pretrained("google/ddpm-cat-256", use_safetensors=True).to("cuda")
+>>> ddpm = DDPMPipeline.from_pretrained("google/ddpm-cat-256").to("cuda")
 >>> image = ddpm(num_inference_steps=25).images[0]
 >>> image
 ```
@@ -46,7 +46,7 @@ To recreate the pipeline with the model and scheduler separately, let's write ou
 >>> from diffusers import DDPMScheduler, UNet2DModel

 >>> scheduler = DDPMScheduler.from_pretrained("google/ddpm-cat-256")
->>> model = UNet2DModel.from_pretrained("google/ddpm-cat-256", use_safetensors=True).to("cuda")
+>>> model = UNet2DModel.from_pretrained("google/ddpm-cat-256").to("cuda")
 ```

 2. Set the number of timesteps to run the denoising process for:
@@ -94,9 +94,9 @@ This is the entire denoising process, and you can use this same pattern to write
 >>> from PIL import Image
 >>> import numpy as np

->>> image = (input / 2 + 0.5).clamp(0, 1).squeeze()
->>> image = (image.permute(1, 2, 0) * 255).round().to(torch.uint8).cpu().numpy()
->>> image = Image.fromarray(image)
+>>> image = (input / 2 + 0.5).clamp(0, 1)
+>>> image = image.cpu().permute(0, 2, 3, 1).numpy()[0]
+>>> image = Image.fromarray((image * 255).round().astype("uint8"))
 >>> image
 ```

@@ -124,14 +124,10 @@ Now that you know what you need for the Stable Diffusion pipeline, load all thes
 >>> from transformers import CLIPTextModel, CLIPTokenizer
 >>> from diffusers import AutoencoderKL, UNet2DConditionModel, PNDMScheduler

->>> vae = AutoencoderKL.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="vae", use_safetensors=True)
+>>> vae = AutoencoderKL.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="vae")
 >>> tokenizer = CLIPTokenizer.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="tokenizer")
->>> text_encoder = CLIPTextModel.from_pretrained(
-...     "CompVis/stable-diffusion-v1-4", subfolder="text_encoder", use_safetensors=True
-... )
->>> unet = UNet2DConditionModel.from_pretrained(
-...     "CompVis/stable-diffusion-v1-4", subfolder="unet", use_safetensors=True
-... )
+>>> text_encoder = CLIPTextModel.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="text_encoder")
+>>> unet = UNet2DConditionModel.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="unet")
 ```

 Instead of the default [`PNDMScheduler`], exchange it for the [`UniPCMultistepScheduler`] to see how easy it is to plug a different scheduler in:
@@ -271,11 +267,11 @@ with torch.no_grad():
 Lastly, convert the image to a `PIL.Image` to see your generated image!

 ```py
->>> image = (image / 2 + 0.5).clamp(0, 1).squeeze()
->>> image = (image.permute(1, 2, 0) * 255).to(torch.uint8).cpu().numpy()
+>>> image = (image / 2 + 0.5).clamp(0, 1)
+>>> image = image.detach().cpu().permute(0, 2, 3, 1).numpy()
 >>> images = (image * 255).round().astype("uint8")
->>> image = Image.fromarray(image)
->>> image
+>>> pil_images = [Image.fromarray(image) for image in images]
+>>> pil_images[0]
 ```

 <div class="flex justify-center">
--- a/docs/source/ko/training/dreambooth.md
+++ b/docs/source/ko/training/dreambooth.md
@@ -15,7 +15,8 @@ specific language governing permissions and limitations under the License.
 [DreamBooth](https://arxiv.org/abs/2208.12242)는 한 주제에 대한 적은 이미지(3~5개)만으로도 stable diffusion과 같이 text-to-image 모델을 개인화할 수 있는 방법입니다. 이를 통해 모델은 다양한 장면, 포즈 및 장면(뷰)에서 피사체에 대해 맥락화(contextualized)된 이미지를 생성할 수 있습니다.

 ![프로젝트 블로그에서의 DreamBooth 예시](https://dreambooth.github.io/DreamBooth_files/teaser_static.jpg)
-<small>에서의 Dreambooth 예시 <a href="https://dreambooth.github.io">project's blog.</a></small>
+<small><a href="https://dreambooth.github.io">project's blog.</a></small>
+<small><a href="https://dreambooth.github.io">프로젝트 블로그</a>에서의 Dreambooth 예시</small>


 이 가이드는 다양한 GPU, Flax 사양에 대해 [`CompVis/stable-diffusion-v1-4`](https://huggingface.co/CompVis/stable-diffusion-v1-4) 모델로 DreamBooth를 파인튜닝하는 방법을 보여줍니다. 더 깊이 파고들어 작동 방식을 확인하는 데 관심이 있는 경우, 이 가이드에 사용된 DreamBooth의 모든 학습 스크립트를 [여기](https://github.com/huggingface/diffusers/tree/main/examples/dreambooth)에서 찾을 수 있습니다.
@@ -471,4 +472,4 @@ image = pipe(prompt, num_inference_steps=50, guidance_scale=7.5).images[0]
 image.save("dog-bucket.png")
 ```

-[저장된 학습 체크포인트](#inference-from-a-saved-checkpoint)에서도 추론을 실행할 수도 있습니다.
+[저장된 학습 체크포인트](#inference-from-a-saved-checkpoint)에서도 추론을 실행할 수도 있습니다.
--- a/examples/community/README.md
+++ b/examples/community/README.md
@@ -39,11 +39,6 @@ If a community doesn't work as expected, please open an issue and ping the autho
 | CLIP Guided Images Mixing Stable Diffusion Pipeline | Сombine images using usual diffusion models. | [CLIP Guided Images Mixing Using Stable Diffusion](#clip-guided-images-mixing-with-stable-diffusion) | - | [Karachev Denis](https://github.com/TheDenk) |  
 | TensorRT Stable Diffusion Inpainting Pipeline                                                                                                    | Accelerates the Stable Diffusion Inpainting Pipeline using TensorRT                                                                                                                                                                                                                                                                                                                                                                                                                                      | [TensorRT Stable Diffusion Inpainting Pipeline](#tensorrt-inpainting-stable-diffusion-pipeline)      | - |              [Asfiya Baig](https://github.com/asfiyab-nvidia) |
 |   IADB Pipeline                                                                                                    | Implementation of [Iterative α-(de)Blending: a Minimalist Deterministic Diffusion Model](https://arxiv.org/abs/2305.03486)                                                                                                                                                                                                                                                                                                                                                                                                                                      | [IADB Pipeline](#iadb-pipeline)      | - |              [Thomas Chambon](https://github.com/tchambon) 
-|   Zero1to3 Pipeline                                                                                                    | Implementation of [Zero-1-to-3: Zero-shot One Image to 3D Object](https://arxiv.org/abs/2303.11328)                                                                                                                                                                                                                                                                                                                                                                                                                                      | [Zero1to3 Pipeline](#Zero1to3-pipeline)      | - |              [Xin Kong](https://github.com/kxhit) |
-Stable Diffusion XL Long Weighted Prompt Pipeline | A pipeline support unlimited length of prompt and negative prompt, use A1111 style of prompt weighting | [Stable Diffusion XL Long Weighted Prompt Pipeline](#stable-diffusion-xl-long-weighted-prompt-pipeline) | - | [Andrew Zhu](https://xhinker.medium.com/) | 
-FABRIC - Stable Diffusion with feedback Pipeline | pipeline supports feedback from liked and disliked images | [Stable Diffusion Fabric Pipline](#stable-diffusion-fabric-pipeline) | - | [Shauray Singh](https://shauray8.github.io/about_shauray/) | 
-sketch inpaint - Inpainting with non-inpaint Stable Diffusion | sketch inpaint much like in automatic1111 | [Masked Im2Im Stable Diffusion Pipeline](#stable-diffusion-masked-im2im) | - | [Anatoly Belikov](https://github.com/noskill) | 
-

 To load a custom pipeline you just need to pass the `custom_pipeline` argument to `DiffusionPipeline`, as one of the files in `diffusers/examples/community`. Feel free to send a PR with your own pipelines, we will merge them quickly.
 ```py
@@ -1532,44 +1527,6 @@ CLIP guided stable diffusion images mixing pipline allows to combine two images
 This approach is using (optional) CoCa model to avoid writing image description.  
 [More code examples](https://github.com/TheDenk/images_mixing)

-
-### Stable Diffusion XL Long Weighted Prompt Pipeline
-
-This SDXL pipeline support unlimted length prompt and negative prompt, compatible with A1111 prompt weighted style. 
-
-You can provide both `prompt` and `prompt_2`. if only one prompt is provided, `prompt_2` will be a copy of the provided `prompt`. Here is a sample code to use this pipeline. 
-
-```python
-from diffusers import DiffusionPipeline
-import torch
-
-pipe = DiffusionPipeline.from_pretrained(
-    "stabilityai/stable-diffusion-xl-base-1.0"
-    , torch_dtype       = torch.float16
-    , use_safetensors   = True
-    , variant           = "fp16"
-    , custom_pipeline   = "lpw_stable_diffusion_xl",
-)
-
-prompt = "photo of a cute (white) cat running on the grass"*20
-prompt2 = "chasing (birds:1.5)"*20
-prompt = f"{prompt},{prompt2}"
-neg_prompt = "blur, low quality, carton, animate"
-
-pipe.to("cuda")
-images = pipe(
-    prompt                  = prompt 
-    , negative_prompt       = neg_prompt 
-).images[0]
-
-pipe.to("cpu")
-torch.cuda.empty_cache()
-images
-```
-
-In the above code, the `prompt2` is appended to the `prompt`, which is more than 77 tokens. "birds" are showing up in the result. 
-![Stable Diffusion XL Long Weighted Prompt Pipeline sample](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/sdxl_long_weighted_prompt.png)
-
 ## Example Images Mixing (with CoCa)
 ```python
 import requests
@@ -1687,13 +1644,13 @@ from io import BytesIO
 from PIL import Image
 import torch
 from diffusers import PNDMScheduler
-from diffusers.pipelines.stable_diffusion import StableDiffusionInpaintPipeline
+from diffusers.pipelines.stable_diffusion import StableDiffusionImg2ImgPipeline

 # Use the PNDMScheduler scheduler here instead
 scheduler = PNDMScheduler.from_pretrained("stabilityai/stable-diffusion-2-inpainting", subfolder="scheduler")


-pipe = StableDiffusionInpaintPipeline.from_pretrained("stabilityai/stable-diffusion-2-inpainting",
+pipe = StableDiffusionImg2ImgPipeline.from_pretrained("stabilityai/stable-diffusion-2-inpainting",
    custom_pipeline="stable_diffusion_tensorrt_inpaint",
    revision='fp16',
    torch_dtype=torch.float16,
@@ -1810,253 +1767,3 @@ while True:
    loss.backward()
    optimizer.step()
 ```
-
-### Zero1to3 pipeline
-
-This pipeline is the implementation of the [Zero-1-to-3: Zero-shot One Image to 3D Object](https://arxiv.org/abs/2303.11328) paper.
-The original pytorch-lightning [repo](https://github.com/cvlab-columbia/zero123) and a diffusers [repo](https://github.com/kxhit/zero123-hf).
-
-The following code shows how to use the Zero1to3 pipeline to generate novel view synthesis images using a pretrained stable diffusion model.
-
-```python
-import os
-import torch
-from pipeline_zero1to3 import Zero1to3StableDiffusionPipeline
-from diffusers.utils import load_image
-
-model_id = "kxic/zero123-165000" # zero123-105000, zero123-165000, zero123-xl
-
-pipe = Zero1to3StableDiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16)
-
-pipe.enable_xformers_memory_efficient_attention()
-pipe.enable_vae_tiling()
-pipe.enable_attention_slicing()
-pipe = pipe.to("cuda")
-
-num_images_per_prompt = 4
-
-# test inference pipeline
-# x y z, Polar angle (vertical rotation in degrees) 	Azimuth angle (horizontal rotation in degrees) 	Zoom (relative distance from center)
-query_pose1 = [-75.0, 100.0, 0.0]
-query_pose2 = [-20.0, 125.0, 0.0]
-query_pose3 = [-55.0, 90.0, 0.0]
-
-# load image
-# H, W = (256, 256) # H, W = (512, 512)   # zero123 training is 256,256
-
-# for batch input
-input_image1 = load_image("./demo/4_blackarm.png") #load_image("https://cvlab-zero123-live.hf.space/file=/home/user/app/configs/4_blackarm.png")
-input_image2 = load_image("./demo/8_motor.png") #load_image("https://cvlab-zero123-live.hf.space/file=/home/user/app/configs/8_motor.png")
-input_image3 = load_image("./demo/7_london.png") #load_image("https://cvlab-zero123-live.hf.space/file=/home/user/app/configs/7_london.png")
-input_images = [input_image1, input_image2, input_image3]
-query_poses = [query_pose1, query_pose2, query_pose3]
-
-# # for single input
-# H, W = (256, 256)
-# input_images = [input_image2.resize((H, W), PIL.Image.NEAREST)]
-# query_poses = [query_pose2]
-
-
-# better do preprocessing
-from gradio_new import preprocess_image, create_carvekit_interface
-import numpy as np
-import PIL.Image as Image
-
-pre_images = []
-models = dict()
-print('Instantiating Carvekit HiInterface...')
-models['carvekit'] = create_carvekit_interface()
-if not isinstance(input_images, list):
-    input_images = [input_images]
-for raw_im in input_images:
-    input_im = preprocess_image(models, raw_im, True)
-    H, W = input_im.shape[:2]
-    pre_images.append(Image.fromarray((input_im * 255.0).astype(np.uint8)))
-input_images = pre_images
-
-# infer pipeline, in original zero123 num_inference_steps=76
-images = pipe(input_imgs=input_images, prompt_imgs=input_images, poses=query_poses, height=H, width=W,
-              guidance_scale=3.0, num_images_per_prompt=num_images_per_prompt, num_inference_steps=50).images
-
-
-# save imgs
-log_dir = "logs"
-os.makedirs(log_dir, exist_ok=True)
-bs = len(input_images)
-i = 0
-for obj in range(bs):
-    for idx in range(num_images_per_prompt):
-        images[i].save(os.path.join(log_dir,f"obj{obj}_{idx}.jpg"))
-        i += 1
-
-```
-
-### Stable Diffusion XL Reference
-
-This pipeline uses the Reference . Refer to the [stable_diffusion_reference](https://github.com/huggingface/diffusers/blob/main/examples/community/README.md#stable-diffusion-reference).
-
-
-```py
-import torch
-from PIL import Image
-from diffusers.utils import load_image
-from diffusers import DiffusionPipeline
-from diffusers.schedulers import UniPCMultistepScheduler
-input_image = load_image("https://hf.co/datasets/huggingface/documentation-images/resolve/main/diffusers/input_image_vermeer.png")
-
-# pipe = DiffusionPipeline.from_pretrained(
-#     "stabilityai/stable-diffusion-xl-base-1.0",
-#     custom_pipeline="stable_diffusion_xl_reference",
-#     torch_dtype=torch.float16,
-#     use_safetensors=True,
-#     variant="fp16").to('cuda:0')
-
-pipe = StableDiffusionXLReferencePipeline.from_pretrained(
-    "stabilityai/stable-diffusion-xl-base-1.0",
-    torch_dtype=torch.float16,
-    use_safetensors=True,
-    variant="fp16").to('cuda:0')
-
-pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
-
-result_img = pipe(ref_image=input_image,
-      prompt="1girl",
-      num_inference_steps=20,
-      reference_attn=True,
-      reference_adain=True).images[0]
-```
-
-Reference Image
-
-![reference_image](https://hf.co/datasets/huggingface/documentation-images/resolve/main/diffusers/input_image_vermeer.png)
-
-Output Image   
-
-`prompt: 1 girl`
-
-`reference_attn=True, reference_adain=True, num_inference_steps=20`
-![Output_image](https://github.com/zideliu/diffusers/assets/34944964/743848da-a215-48f9-ae39-b5e2ae49fb13)
-
-Reference Image
-![reference_image](https://github.com/huggingface/diffusers/assets/34944964/449bdab6-e744-4fb2-9620-d4068d9a741b)
-
-
-Output Image 
-
-`prompt: A dog`
-
-`reference_attn=True, reference_adain=False, num_inference_steps=20`
-![Output_image](https://github.com/huggingface/diffusers/assets/34944964/fff2f16f-6e91-434b-abcc-5259d866c31e)
-
-Reference Image
-![reference_image](https://github.com/huggingface/diffusers/assets/34944964/077ed4fe-2991-4b79-99a1-009f056227d1)
-
-Output Image
-
-`prompt: An astronaut riding a lion`
-
-`reference_attn=True, reference_adain=True, num_inference_steps=20`
-![output_image](https://github.com/huggingface/diffusers/assets/34944964/9b2f1aca-886f-49c3-89ec-d2031c8e3670)
-
-### Stable diffusion fabric pipeline
-
-FABRIC approach applicable to a wide range of popular diffusion models, which exploits
-the self-attention layer present in the most widely used architectures to condition
-the diffusion process on a set of feedback images.
-
-
-```python
-import requests
-import torch
-from PIL import Image
-from io import BytesIO
-
-from diffusers import Diffusionpipeline
-
-# load the pipeline
-# make sure you're logged in with `huggingface-cli login`
-model_id_or_path = "runwayml/stable-diffusion-v1-5"
-#can also be used with dreamlike-art/dreamlike-photoreal-2.0
-pipe = DiffusionPipeline.from_pretrained(model_id_or_path, torch_dtype=torch.float16, custom_pipeline="pipeline_fabric").to("cuda")
-
-# let's specify a prompt
-prompt = "An astronaut riding an elephant"
-negative_prompt = "lowres, cropped"
-
-# call the pipeline
-image = pipe(
-    prompt=prompt,
-    negative_prompt=negative_prompt,
-    num_inference_steps=20,
-    generator=torch.manual_seed(12)
-).images[0]
-
-image.save("horse_to_elephant.jpg")
-
-# let's try another example with feedback
-url = "https://raw.githubusercontent.com/ChenWu98/cycle-diffusion/main/data/dalle2/A%20black%20colored%20car.png"
-response = requests.get(url)
-init_image = Image.open(BytesIO(response.content)).convert("RGB")
-
-prompt = "photo, A blue colored car, fish eye"
-liked = [init_image]
-## same goes with disliked
-
-# call the pipeline
-torch.manual_seed(0)
-image = pipe(
-    prompt=prompt,
-    negative_prompt=negative_prompt,
-    liked = liked,
-    num_inference_steps=20,
-).images[0]
-
-image.save("black_to_blue.png")
-```
-
-*With enough feedbacks you can create very similar high quality images.*
-
-The original codebase can be found at [sd-fabric/fabric](https://github.com/sd-fabric/fabric), and available checkpoints are [dreamlike-art/dreamlike-photoreal-2.0](https://huggingface.co/dreamlike-art/dreamlike-photoreal-2.0), [runwayml/stable-diffusion-v1-5](https://huggingface.co/runwayml/stable-diffusion-v1-5), and [stabilityai/stable-diffusion-2-1](https://huggingface.co/stabilityai/stable-diffusion-2-1) (may give unexpected results).
-
-Let's have a look at the images (*512X512*)
-
-| Without Feedback            | With Feedback  (1st image)          |
-|---------------------|---------------------|
-| ![Image 1](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/fabric_wo_feedback.jpg) | ![Feedback Image 1](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/fabric_w_feedback.png) | 
-
-
-### Masked Im2Im Stable Diffusion Pipeline
-
-This pipeline reimplements sketch inpaint feature from A1111 for non-inpaint models. The following code reads two images, original and one with mask painted over it. It computes mask as a difference of two images and does the inpainting in the area defined by the mask.
-
-```python
-img = PIL.Image.open("./mech.png")
-# read image with mask painted over
-img_paint = PIL.Image.open("./mech_painted.png")
-neq = numpy.any(numpy.array(img) != numpy.array(img_paint), axis=-1)
-mask = neq / neq.max()
-
-pipeline = MaskedStableDiffusionImg2ImgPipeline.from_pretrained("frankjoshua/icbinpICantBelieveIts_v8")
-
-# works best with EulerAncestralDiscreteScheduler
-pipeline.scheduler = EulerAncestralDiscreteScheduler.from_config(pipeline.scheduler.config)
-generator = torch.Generator(device="cpu").manual_seed(4)
-
-prompt = "a man wearing a mask"
-result = pipeline(prompt=prompt, image=img_paint, mask=mask, strength=0.75,
-                  generator=generator)
-result.images[0].save("result.png")
-```
-
-original image mech.png
-
-<img src=https://github.com/noskill/diffusers/assets/733626/10ad972d-d655-43cb-8de1-039e3d79e849 width="25%" >
-
-image with mask mech_painted.png
-
-<img src=https://github.com/noskill/diffusers/assets/733626/c334466a-67fe-4377-9ff7-f46021b9c224 width="25%" >
-
-result:
-
-<img src=https://github.com/noskill/diffusers/assets/733626/23a0a71d-51db-471e-926a-107ac62512a8 width="25%" >
-
--- a/examples/community/checkpoint_merger.py
+++ b/examples/community/checkpoint_merger.py
@@ -2,8 +2,14 @@ import glob
 import os
 from typing import Dict, List, Union

-import safetensors.torch
 import torch
+
+from diffusers.utils import is_safetensors_available
+
+
+if is_safetensors_available():
+    import safetensors.torch
+
 from huggingface_hub import snapshot_download

 from diffusers import DiffusionPipeline, __version__
@@ -223,14 +229,14 @@ class CheckpointMergerPipeline(DiffusionPipeline):
                    update_theta_0 = getattr(module, "load_state_dict")
                    theta_1 = (
                        safetensors.torch.load_file(checkpoint_path_1)
-                        if (checkpoint_path_1.endswith(".safetensors"))
+                        if (is_safetensors_available() and checkpoint_path_1.endswith(".safetensors"))
                        else torch.load(checkpoint_path_1, map_location="cpu")
                    )
                    theta_2 = None
                    if checkpoint_path_2:
                        theta_2 = (
                            safetensors.torch.load_file(checkpoint_path_2)
-                            if (checkpoint_path_2.endswith(".safetensors"))
+                            if (is_safetensors_available() and checkpoint_path_2.endswith(".safetensors"))
                            else torch.load(checkpoint_path_2, map_location="cpu")
                        )

--- a/examples/community/clip_guided_images_mixing_stable_diffusion.py
+++ b/examples/community/clip_guided_images_mixing_stable_diffusion.py
@@ -408,7 +408,7 @@ class CLIPGuidedImagesMixingStableDiffusion(DiffusionPipeline):
        if accepts_generator:
            extra_step_kwargs["generator"] = generator

-        with self.progress_bar(total=num_inference_steps) as progress_bar:
+        with self.progress_bar(total=num_inference_steps):
            for i, t in enumerate(timesteps):
                # expand the latents if we are doing classifier free guidance
                latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
@@ -440,7 +440,6 @@ class CLIPGuidedImagesMixingStableDiffusion(DiffusionPipeline):
                # compute the previous noisy sample x_t -> x_t-1
                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample

-                progress_bar.update()
        # Hardcode 0.18215 because stable-diffusion-2-base has not self.vae.config.scaling_factor
        latents = 1 / 0.18215 * latents
        image = self.vae.decode(latents).sample
--- a/examples/community/composable_stable_diffusion.py
+++ b/examples/community/composable_stable_diffusion.py
@@ -423,7 +423,7 @@ class ComposableStableDiffusionPipeline(DiffusionPipeline):
            num_inference_steps (`int`, *optional*, defaults to 50):
                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                expense of slower inference.
-            guidance_scale (`float`, *optional*, defaults to 5.0):
+            guidance_scale (`float`, *optional*, defaults to 7.5):
                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
                `guidance_scale` is defined as `w` of equation 2. of [Imagen
                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
--- a/examples/community/lpw_stable_diffusion.py
+++ b/examples/community/lpw_stable_diffusion.py
@@ -967,7 +967,7 @@ class StableDiffusionLongPromptWeightingPipeline(
            cross_attention_kwargs (`dict`, *optional*):
                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
                `self.processor` in
-                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+                [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py).

        Returns:
            `None` if cancelled by `is_cancelled_callback`,
@@ -1202,7 +1202,7 @@ class StableDiffusionLongPromptWeightingPipeline(
            cross_attention_kwargs (`dict`, *optional*):
                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
                `self.processor` in
-                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+                [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py).

        Returns:
            `None` if cancelled by `is_cancelled_callback`,
@@ -1316,7 +1316,7 @@ class StableDiffusionLongPromptWeightingPipeline(
            cross_attention_kwargs (`dict`, *optional*):
                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
                `self.processor` in
-                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+                [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py).

        Returns:
            `None` if cancelled by `is_cancelled_callback`,
@@ -1437,7 +1437,7 @@ class StableDiffusionLongPromptWeightingPipeline(
            cross_attention_kwargs (`dict`, *optional*):
                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
                `self.processor` in
-                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+                [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py).

        Returns:
            `None` if cancelled by `is_cancelled_callback`,
--- a/examples/community/lpw_stable_diffusion_xl.py
+++ b/examples/community/lpw_stable_diffusion_xl.py
--- a/examples/community/masked_stable_diffusion_img2img.py
+++ b/examples/community/masked_stable_diffusion_img2img.py
@@ -1,261 +0,0 @@
-from typing import Any, Callable, Dict, List, Optional, Union
-
-import numpy as np
-import PIL
-import torch
-
-from diffusers import StableDiffusionImg2ImgPipeline
-from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
-
-
-class MaskedStableDiffusionImg2ImgPipeline(StableDiffusionImg2ImgPipeline):
-    debug_save = False
-
-    @torch.no_grad()
-    def __call__(
-        self,
-        prompt: Union[str, List[str]] = None,
-        image: Union[
-            torch.FloatTensor,
-            PIL.Image.Image,
-            np.ndarray,
-            List[torch.FloatTensor],
-            List[PIL.Image.Image],
-            List[np.ndarray],
-        ] = None,
-        strength: float = 0.8,
-        num_inference_steps: Optional[int] = 50,
-        guidance_scale: Optional[float] = 7.5,
-        negative_prompt: Optional[Union[str, List[str]]] = None,
-        num_images_per_prompt: Optional[int] = 1,
-        eta: Optional[float] = 0.0,
-        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
-        prompt_embeds: Optional[torch.FloatTensor] = None,
-        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
-        output_type: Optional[str] = "pil",
-        return_dict: bool = True,
-        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
-        callback_steps: int = 1,
-        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
-        mask: Union[
-            torch.FloatTensor,
-            PIL.Image.Image,
-            np.ndarray,
-            List[torch.FloatTensor],
-            List[PIL.Image.Image],
-            List[np.ndarray],
-        ] = None,
-    ):
-        r"""
-        The call function to the pipeline for generation.
-
-        Args:
-            prompt (`str` or `List[str]`, *optional*):
-                The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
-            image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
-                `Image` or tensor representing an image batch to be used as the starting point. Can also accept image
-                latents as `image`, but if passing latents directly it is not encoded again.
-            strength (`float`, *optional*, defaults to 0.8):
-                Indicates extent to transform the reference `image`. Must be between 0 and 1. `image` is used as a
-                starting point and more noise is added the higher the `strength`. The number of denoising steps depends
-                on the amount of noise initially added. When `strength` is 1, added noise is maximum and the denoising
-                process runs for the full number of iterations specified in `num_inference_steps`. A value of 1
-                essentially ignores `image`.
-            num_inference_steps (`int`, *optional*, defaults to 50):
-                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
-                expense of slower inference. This parameter is modulated by `strength`.
-            guidance_scale (`float`, *optional*, defaults to 7.5):
-                A higher guidance scale value encourages the model to generate images closely linked to the text
-                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
-            negative_prompt (`str` or `List[str]`, *optional*):
-                The prompt or prompts to guide what to not include in image generation. If not defined, you need to
-                pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
-            num_images_per_prompt (`int`, *optional*, defaults to 1):
-                The number of images to generate per prompt.
-            eta (`float`, *optional*, defaults to 0.0):
-                Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
-                to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
-            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
-                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
-                generation deterministic.
-            prompt_embeds (`torch.FloatTensor`, *optional*):
-                Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
-                provided, text embeddings are generated from the `prompt` input argument.
-            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
-                Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
-                not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
-            output_type (`str`, *optional*, defaults to `"pil"`):
-                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
-            return_dict (`bool`, *optional*, defaults to `True`):
-                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
-                plain tuple.
-            callback (`Callable`, *optional*):
-                A function that calls every `callback_steps` steps during inference. The function is called with the
-                following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
-            callback_steps (`int`, *optional*, defaults to 1):
-                The frequency at which the `callback` function is called. If not specified, the callback is called at
-                every step.
-            cross_attention_kwargs (`dict`, *optional*):
-                A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
-                [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
-            mask (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`, *optional*):
-                A mask with non-zero elements for the area to be inpainted. If not specified, no mask is applied.
-        Examples:
-
-        Returns:
-            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
-                If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned,
-                otherwise a `tuple` is returned where the first element is a list with the generated images and the
-                second element is a list of `bool`s indicating whether the corresponding generated image contains
-                "not-safe-for-work" (nsfw) content.
-        """
-        # code adapted from parent class StableDiffusionImg2ImgPipeline
-
-        # 0. Check inputs. Raise error if not correct
-        self.check_inputs(prompt, strength, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds)
-
-        # 1. Define call parameters
-        if prompt is not None and isinstance(prompt, str):
-            batch_size = 1
-        elif prompt is not None and isinstance(prompt, list):
-            batch_size = len(prompt)
-        else:
-            batch_size = prompt_embeds.shape[0]
-        device = self._execution_device
-        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
-        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
-        # corresponds to doing no classifier free guidance.
-        do_classifier_free_guidance = guidance_scale > 1.0
-
-        # 2. Encode input prompt
-        text_encoder_lora_scale = (
-            cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None
-        )
-        prompt_embeds = self._encode_prompt(
-            prompt,
-            device,
-            num_images_per_prompt,
-            do_classifier_free_guidance,
-            negative_prompt,
-            prompt_embeds=prompt_embeds,
-            negative_prompt_embeds=negative_prompt_embeds,
-            lora_scale=text_encoder_lora_scale,
-        )
-
-        # 3. Preprocess image
-        image = self.image_processor.preprocess(image)
-
-        # 4. set timesteps
-        self.scheduler.set_timesteps(num_inference_steps, device=device)
-        timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device)
-        latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
-
-        # 5. Prepare latent variables
-        # it is sampled from the latent distribution of the VAE
-        latents = self.prepare_latents(
-            image, latent_timestep, batch_size, num_images_per_prompt, prompt_embeds.dtype, device, generator
-        )
-
-        # mean of the latent distribution
-        init_latents = [
-            self.vae.encode(image.to(device=device, dtype=prompt_embeds.dtype)[i : i + 1]).latent_dist.mean
-            for i in range(batch_size)
-        ]
-        init_latents = torch.cat(init_latents, dim=0)
-
-        # 6. create latent mask
-        latent_mask = self._make_latent_mask(latents, mask)
-
-        # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
-        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
-
-        # 8. Denoising loop
-        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
-        with self.progress_bar(total=num_inference_steps) as progress_bar:
-            for i, t in enumerate(timesteps):
-                # expand the latents if we are doing classifier free guidance
-                latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
-                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
-
-                # predict the noise residual
-                noise_pred = self.unet(
-                    latent_model_input,
-                    t,
-                    encoder_hidden_states=prompt_embeds,
-                    cross_attention_kwargs=cross_attention_kwargs,
-                    return_dict=False,
-                )[0]
-
-                # perform guidance
-                if do_classifier_free_guidance:
-                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
-                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
-
-                if latent_mask is not None:
-                    latents = torch.lerp(init_latents * self.vae.config.scaling_factor, latents, latent_mask)
-                    noise_pred = torch.lerp(torch.zeros_like(noise_pred), noise_pred, latent_mask)
-
-                # compute the previous noisy sample x_t -> x_t-1
-                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
-
-                # call the callback, if provided
-                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
-                    progress_bar.update()
-                    if callback is not None and i % callback_steps == 0:
-                        callback(i, t, latents)
-
-        if not output_type == "latent":
-            scaled = latents / self.vae.config.scaling_factor
-            if latent_mask is not None:
-                # scaled = latents / self.vae.config.scaling_factor * latent_mask + init_latents * (1 - latent_mask)
-                scaled = torch.lerp(init_latents, scaled, latent_mask)
-            image = self.vae.decode(scaled, return_dict=False)[0]
-            if self.debug_save:
-                image_gen = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
-                image_gen = self.image_processor.postprocess(image_gen, output_type=output_type, do_denormalize=[True])
-                image_gen[0].save("from_latent.png")
-            image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
-        else:
-            image = latents
-            has_nsfw_concept = None
-
-        if has_nsfw_concept is None:
-            do_denormalize = [True] * image.shape[0]
-        else:
-            do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
-
-        image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
-
-        # Offload last model to CPU
-        if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
-            self.final_offload_hook.offload()
-
-        if not return_dict:
-            return (image, has_nsfw_concept)
-
-        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
-
-    def _make_latent_mask(self, latents, mask):
-        """Turn `mask` (or a list of masks) into a tensor with the dtype/device of `latents`, or `None` if unset."""
-        if mask is None:
-            # No mask: return None so the callers' `latent_mask is not None` checks work (was an unbound local).
-            return None
-        masks = mask if isinstance(mask, list) else [mask]
-        _, l_channels, l_height, l_width = latents.shape
-        latent_mask = []
-        for m in masks:
-            if not isinstance(m, PIL.Image.Image):
-                # array-like mask: add a trailing channel axis and scale 0..255 data to 0..1 before PIL conversion
-                if len(m.shape) == 2:
-                    m = m[..., np.newaxis]
-                if m.max() > 1:
-                    m = m / 255.0
-                m = self.image_processor.numpy_to_pil(m)[0]
-            if m.mode != "L":
-                m = m.convert("L")
-            resized = self.image_processor.resize(m, l_height, l_width)
-            if self.debug_save:
-                resized.save("latent_mask.png")
-            latent_mask.append(np.repeat(np.array(resized)[np.newaxis, :, :], l_channels, axis=0))
-        latent_mask = torch.as_tensor(np.stack(latent_mask)).to(latents)
-        return latent_mask / latent_mask.max()
--- a/examples/community/pipeline_fabric.py
+++ b/examples/community/pipeline_fabric.py
@@ -1,751 +0,0 @@
-# Copyright 2023 FABRIC authors and the HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from typing import List, Optional, Union
-
-import torch
-from packaging import version
-from PIL import Image
-from transformers import CLIPTextModel, CLIPTokenizer
-
-from diffusers import AutoencoderKL, UNet2DConditionModel
-from diffusers.configuration_utils import FrozenDict
-from diffusers.image_processor import VaeImageProcessor
-from diffusers.loaders import LoraLoaderMixin, TextualInversionLoaderMixin
-from diffusers.models.attention import BasicTransformerBlock
-from diffusers.models.attention_processor import LoRAAttnProcessor
-from diffusers.pipelines.pipeline_utils import DiffusionPipeline
-from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
-from diffusers.schedulers import EulerAncestralDiscreteScheduler, KarrasDiffusionSchedulers
-from diffusers.utils import (
-    deprecate,
-    logging,
-    randn_tensor,
-    replace_example_docstring,
-)
-
-
-logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
-
-EXAMPLE_DOC_STRING = """
-    Examples:
-        ```py
-        >>> from diffusers import DiffusionPipeline
-        >>> import torch
-
-        >>> model_id = "dreamlike-art/dreamlike-photoreal-2.0"
-        >>> pipe = DiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16, custom_pipeline="pipeline_fabric")
-        >>> pipe = pipe.to("cuda")
-        >>> prompt = "a giant standing in a fantasy landscape best quality"
-        >>> liked = []  # list of images for positive feedback
-        >>> disliked = []  # list of images for negative feedback
-        >>> image = pipe(prompt, num_images=4, liked=liked, disliked=disliked).images[0]
-        ```
-"""
-
-
-class FabricCrossAttnProcessor:
-    def __init__(self):
-        self.attntion_probs = None  # NOTE(review): name is misspelled and never assigned again in this class
-
-    def __call__(
-        self,
-        attn,
-        hidden_states,
-        encoder_hidden_states=None,
-        attention_mask=None,
-        weights=None,
-        lora_scale=1.0,
-    ):
-        batch_size, sequence_length, _ = (
-            hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
-        )
-        attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
-
-        if isinstance(attn.processor, LoRAAttnProcessor):
-            query = attn.to_q(hidden_states) + lora_scale * attn.processor.to_q_lora(hidden_states)
-        else:
-            query = attn.to_q(hidden_states)
-
-        if encoder_hidden_states is None:
-            encoder_hidden_states = hidden_states  # no encoder states given: keys/values come from hidden_states
-        elif attn.norm_cross:
-            encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
-
-        if isinstance(attn.processor, LoRAAttnProcessor):
-            key = attn.to_k(encoder_hidden_states) + lora_scale * attn.processor.to_k_lora(encoder_hidden_states)
-            value = attn.to_v(encoder_hidden_states) + lora_scale * attn.processor.to_v_lora(encoder_hidden_states)
-        else:
-            key = attn.to_k(encoder_hidden_states)
-            value = attn.to_v(encoder_hidden_states)
-
-        query = attn.head_to_batch_dim(query)
-        key = attn.head_to_batch_dim(key)
-        value = attn.head_to_batch_dim(value)
-
-        attention_probs = attn.get_attention_scores(query, key, attention_mask)
-
-        if weights is not None:
-            if weights.shape[0] != 1:
-                weights = weights.repeat_interleave(attn.heads, dim=0)  # repeat each row once per head
-            attention_probs = attention_probs * weights[:, None]
-            attention_probs = attention_probs / attention_probs.sum(dim=-1, keepdim=True)  # renormalize last dim
-
-        hidden_states = torch.bmm(attention_probs, value)
-        hidden_states = attn.batch_to_head_dim(hidden_states)
-
-        # output projection; adds the scaled LoRA term when the attached processor is a LoRAAttnProcessor
-        if isinstance(attn.processor, LoRAAttnProcessor):
-            hidden_states = attn.to_out[0](hidden_states) + lora_scale * attn.processor.to_out_lora(hidden_states)
-        else:
-            hidden_states = attn.to_out[0](hidden_states)
-        # dropout (second entry of attn.to_out)
-        hidden_states = attn.to_out[1](hidden_states)
-
-        return hidden_states
-
-
-class FabricPipeline(DiffusionPipeline):
-    r"""
-    Pipeline for text-to-image generation using Stable Diffusion and conditioning the results using feedback images.
-    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
-    implemented for all pipelines (downloading, saving, running on a particular device, etc.).
-
-    Args:
-        vae ([`AutoencoderKL`]):
-            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
-        text_encoder ([`~transformers.CLIPTextModel`]):
-            Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
-        tokenizer ([`~transformers.CLIPTokenizer`]):
-            A `CLIPTokenizer` to tokenize text.
-        unet ([`UNet2DConditionModel`]):
-            A `UNet2DConditionModel` to denoise the encoded image latents.
-        scheduler ([`EulerAncestralDiscreteScheduler`]):
-            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
-            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
-        safety_checker ([`StableDiffusionSafetyChecker`]):
-            Classification module that estimates whether generated images could be considered offensive or harmful.
-            Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details
-            about a model's potential harms.
-    """
-
-    def __init__(
-        self,
-        vae: AutoencoderKL,
-        text_encoder: CLIPTextModel,
-        tokenizer: CLIPTokenizer,
-        unet: UNet2DConditionModel,
-        scheduler: KarrasDiffusionSchedulers,
-        requires_safety_checker: bool = True,
-    ):
-        super().__init__()
-
-        is_unet_version_less_0_9_0 = hasattr(unet.config, "_diffusers_version") and version.parse(
-            version.parse(unet.config._diffusers_version).base_version
-        ) < version.parse("0.9.0.dev0")
-        is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64
-        if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64:
-            deprecation_message = (
-                "The configuration file of the unet has set the default `sample_size` to smaller than"
-                " 64 which seems highly unlikely. If your checkpoint is a fine-tuned version of any of the"
-                " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-"
-                " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5"
-                " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the"
-                " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`"
-                " in the config might lead to incorrect results in future versions. If you have downloaded this"
-                " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for"
-                " the `unet/config.json` file"
-            )
-
-            deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False)
-            new_config = dict(unet.config)
-            new_config["sample_size"] = 64
-            unet._internal_dict = FrozenDict(new_config)
-
-        self.register_modules(
-            unet=unet,
-            vae=vae,
-            text_encoder=text_encoder,
-            tokenizer=tokenizer,
-            scheduler=scheduler,
-        )
-        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
-        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
-
-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
-    def _encode_prompt(
-        self,
-        prompt,
-        device,
-        num_images_per_prompt,
-        do_classifier_free_guidance,
-        negative_prompt=None,
-        prompt_embeds: Optional[torch.FloatTensor] = None,
-        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
-        lora_scale: Optional[float] = None,
-    ):
-        r"""
-        Encodes the prompt into text encoder hidden states.
-
-        Args:
-             prompt (`str` or `List[str]`, *optional*):
-                prompt to be encoded
-            device: (`torch.device`):
-                torch device
-            num_images_per_prompt (`int`):
-                number of images that should be generated per prompt
-            do_classifier_free_guidance (`bool`):
-                whether to use classifier free guidance or not
-            negative_prompt (`str` or `List[str]`, *optional*):
-                The prompt or prompts not to guide the image generation. If not defined, one has to pass
-                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
-                less than `1`).
-            prompt_embeds (`torch.FloatTensor`, *optional*):
-                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
-                provided, text embeddings will be generated from `prompt` input argument.
-            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
-                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
-                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
-                argument.
-            lora_scale (`float`, *optional*):
-                A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
-        """
-        # set lora scale so that monkey patched LoRA
-        # function of text encoder can correctly access it
-        if lora_scale is not None and isinstance(self, LoraLoaderMixin):
-            self._lora_scale = lora_scale
-
-        if prompt is not None and isinstance(prompt, str):
-            batch_size = 1
-        elif prompt is not None and isinstance(prompt, list):
-            batch_size = len(prompt)
-        else:
-            batch_size = prompt_embeds.shape[0]
-
-        if prompt_embeds is None:
-            # textual inversion: procecss multi-vector tokens if necessary
-            if isinstance(self, TextualInversionLoaderMixin):
-                prompt = self.maybe_convert_prompt(prompt, self.tokenizer)
-
-            text_inputs = self.tokenizer(
-                prompt,
-                padding="max_length",
-                max_length=self.tokenizer.model_max_length,
-                truncation=True,
-                return_tensors="pt",
-            )
-            text_input_ids = text_inputs.input_ids
-            untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
-
-            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
-                text_input_ids, untruncated_ids
-            ):
-                removed_text = self.tokenizer.batch_decode(
-                    untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
-                )
-                logger.warning(
-                    "The following part of your input was truncated because CLIP can only handle sequences up to"
-                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
-                )
-
-            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
-                attention_mask = text_inputs.attention_mask.to(device)
-            else:
-                attention_mask = None
-
-            prompt_embeds = self.text_encoder(
-                text_input_ids.to(device),
-                attention_mask=attention_mask,
-            )
-            prompt_embeds = prompt_embeds[0]
-
-        if self.text_encoder is not None:
-            prompt_embeds_dtype = self.text_encoder.dtype
-        elif self.unet is not None:
-            prompt_embeds_dtype = self.unet.dtype
-        else:
-            prompt_embeds_dtype = prompt_embeds.dtype
-
-        prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
-
-        bs_embed, seq_len, _ = prompt_embeds.shape
-        # duplicate text embeddings for each generation per prompt, using mps friendly method
-        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
-        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
-
-        # get unconditional embeddings for classifier free guidance
-        if do_classifier_free_guidance and negative_prompt_embeds is None:
-            uncond_tokens: List[str]
-            if negative_prompt is None:
-                uncond_tokens = [""] * batch_size
-            elif prompt is not None and type(prompt) is not type(negative_prompt):
-                raise TypeError(
-                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
-                    f" {type(prompt)}."
-                )
-            elif isinstance(negative_prompt, str):
-                uncond_tokens = [negative_prompt]
-            elif batch_size != len(negative_prompt):
-                raise ValueError(
-                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
-                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
-                    " the batch size of `prompt`."
-                )
-            else:
-                uncond_tokens = negative_prompt
-
-            # textual inversion: procecss multi-vector tokens if necessary
-            if isinstance(self, TextualInversionLoaderMixin):
-                uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)
-
-            max_length = prompt_embeds.shape[1]
-            uncond_input = self.tokenizer(
-                uncond_tokens,
-                padding="max_length",
-                max_length=max_length,
-                truncation=True,
-                return_tensors="pt",
-            )
-
-            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
-                attention_mask = uncond_input.attention_mask.to(device)
-            else:
-                attention_mask = None
-
-            negative_prompt_embeds = self.text_encoder(
-                uncond_input.input_ids.to(device),
-                attention_mask=attention_mask,
-            )
-            negative_prompt_embeds = negative_prompt_embeds[0]
-
-        if do_classifier_free_guidance:
-            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
-            seq_len = negative_prompt_embeds.shape[1]
-
-            negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
-
-            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
-            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
-
-            # For classifier free guidance, we need to do two forward passes.
-            # Here we concatenate the unconditional and text embeddings into a single batch
-            # to avoid doing two forward passes
-            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
-
-        return prompt_embeds
-
-    def get_unet_hidden_states(self, z_all, t, prompt_embd):
-        cached_hidden_states = []
-        for module in self.unet.modules():
-            if isinstance(module, BasicTransformerBlock):
-
-                def new_forward(self, hidden_states, *args, **kwargs):
-                    cached_hidden_states.append(hidden_states.clone().detach().cpu())
-                    return self.old_forward(hidden_states, *args, **kwargs)
-
-                module.attn1.old_forward = module.attn1.forward
-                module.attn1.forward = new_forward.__get__(module.attn1)
-
-        # run forward pass to cache hidden states, output can be discarded
-        _ = self.unet(z_all, t, encoder_hidden_states=prompt_embd)
-
-        # restore original forward pass
-        for module in self.unet.modules():
-            if isinstance(module, BasicTransformerBlock):
-                module.attn1.forward = module.attn1.old_forward
-                del module.attn1.old_forward
-
-        return cached_hidden_states
-
-    def unet_forward_with_cached_hidden_states(
-        self,
-        z_all,
-        t,
-        prompt_embd,
-        cached_pos_hiddens: Optional[List[torch.Tensor]] = None,
-        cached_neg_hiddens: Optional[List[torch.Tensor]] = None,
-        pos_weights=(0.8, 0.8),
-        neg_weights=(0.5, 0.5),
-    ):
-        if cached_pos_hiddens is None and cached_neg_hiddens is None:
-            return self.unet(z_all, t, encoder_hidden_states=prompt_embd)
-
-        local_pos_weights = torch.linspace(*pos_weights, steps=len(self.unet.down_blocks) + 1)[:-1].tolist()
-        local_neg_weights = torch.linspace(*neg_weights, steps=len(self.unet.down_blocks) + 1)[:-1].tolist()
-        for block, pos_weight, neg_weight in zip(
-            self.unet.down_blocks + [self.unet.mid_block] + self.unet.up_blocks,
-            local_pos_weights + [pos_weights[1]] + local_pos_weights[::-1],
-            local_neg_weights + [neg_weights[1]] + local_neg_weights[::-1],
-        ):
-            for module in block.modules():
-                if isinstance(module, BasicTransformerBlock):
-
-                    def new_forward(
-                        self,
-                        hidden_states,
-                        pos_weight=pos_weight,
-                        neg_weight=neg_weight,
-                        **kwargs,
-                    ):
-                        cond_hiddens, uncond_hiddens = hidden_states.chunk(2, dim=0)
-                        batch_size, d_model = cond_hiddens.shape[:2]
-                        device, dtype = hidden_states.device, hidden_states.dtype
-
-                        weights = torch.ones(batch_size, d_model, device=device, dtype=dtype)
-                        out_pos = self.old_forward(hidden_states)
-                        out_neg = self.old_forward(hidden_states)
-
-                        if cached_pos_hiddens is not None:
-                            cached_pos_hs = cached_pos_hiddens.pop(0).to(hidden_states.device)
-                            cond_pos_hs = torch.cat([cond_hiddens, cached_pos_hs], dim=1)
-                            pos_weights = weights.clone().repeat(1, 1 + cached_pos_hs.shape[1] // d_model)
-                            pos_weights[:, d_model:] = pos_weight
-                            attn_with_weights = FabricCrossAttnProcessor()
-                            out_pos = attn_with_weights(
-                                self,
-                                cond_hiddens,
-                                encoder_hidden_states=cond_pos_hs,
-                                weights=pos_weights,
-                            )
-                        else:
-                            out_pos = self.old_forward(cond_hiddens)
-
-                        if cached_neg_hiddens is not None:
-                            cached_neg_hs = cached_neg_hiddens.pop(0).to(hidden_states.device)
-                            uncond_neg_hs = torch.cat([uncond_hiddens, cached_neg_hs], dim=1)
-                            neg_weights = weights.clone().repeat(1, 1 + cached_neg_hs.shape[1] // d_model)
-                            neg_weights[:, d_model:] = neg_weight
-                            attn_with_weights = FabricCrossAttnProcessor()
-                            out_neg = attn_with_weights(
-                                self,
-                                uncond_hiddens,
-                                encoder_hidden_states=uncond_neg_hs,
-                                weights=neg_weights,
-                            )
-                        else:
-                            out_neg = self.old_forward(uncond_hiddens)
-
-                        out = torch.cat([out_pos, out_neg], dim=0)
-                        return out
-
-                    module.attn1.old_forward = module.attn1.forward
-                    module.attn1.forward = new_forward.__get__(module.attn1)
-
-        out = self.unet(z_all, t, encoder_hidden_states=prompt_embd)
-
-        # restore original forward pass
-        for module in self.unet.modules():
-            if isinstance(module, BasicTransformerBlock):
-                module.attn1.forward = module.attn1.old_forward
-                del module.attn1.old_forward
-
-        return out
-
-    def preprocess_feedback_images(self, images, vae, dim, device, dtype, generator) -> torch.tensor:
-        images_t = [self.image_to_tensor(img, dim, dtype) for img in images]
-        images_t = torch.stack(images_t).to(device)
-        latents = vae.config.scaling_factor * vae.encode(images_t).latent_dist.sample(generator)
-
-        return torch.cat([latents], dim=0)
-
-    def check_inputs(
-        self,
-        prompt,
-        negative_prompt=None,
-        liked=None,
-        disliked=None,
-        height=None,
-        width=None,
-    ):
-        if prompt is None:
-            raise ValueError("Provide `prompt`. Cannot leave both `prompt` undefined.")
-        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
-            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
-
-        if negative_prompt is not None and (
-            not isinstance(negative_prompt, str) and not isinstance(negative_prompt, list)
-        ):
-            raise ValueError(f"`negative_prompt` has to be of type `str` or `list` but is {type(negative_prompt)}")
-
-        if liked is not None and not isinstance(liked, list):
-            raise ValueError(f"`liked` has to be of type `list` but is {type(liked)}")
-
-        if disliked is not None and not isinstance(disliked, list):
-            raise ValueError(f"`disliked` has to be of type `list` but is {type(disliked)}")
-
-        if height is not None and not isinstance(height, int):
-            raise ValueError(f"`height` has to be of type `int` but is {type(height)}")
-
-        if width is not None and not isinstance(width, int):
-            raise ValueError(f"`width` has to be of type `int` but is {type(width)}")
-
-    @torch.no_grad()
-    @replace_example_docstring(EXAMPLE_DOC_STRING)
-    def __call__(
-        self,
-        prompt: Optional[Union[str, List[str]]] = "",
-        negative_prompt: Optional[Union[str, List[str]]] = "lowres, bad anatomy, bad hands, cropped, worst quality",
-        liked: Optional[Union[List[str], List[Image.Image]]] = [],
-        disliked: Optional[Union[List[str], List[Image.Image]]] = [],
-        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
-        height: int = 512,
-        width: int = 512,
-        return_dict: bool = True,
-        num_images: int = 4,
-        guidance_scale: float = 7.0,
-        num_inference_steps: int = 20,
-        output_type: Optional[str] = "pil",
-        feedback_start_ratio: float = 0.33,
-        feedback_end_ratio: float = 0.66,
-        min_weight: float = 0.05,
-        max_weight: float = 0.8,
-        neg_scale: float = 0.5,
-        pos_bottleneck_scale: float = 1.0,
-        neg_bottleneck_scale: float = 1.0,
-        latents: Optional[torch.FloatTensor] = None,
-    ):
-        r"""
-        The call function to the pipeline for generation. Generate a trajectory of images with binary feedback. The
-        feedback can be given as a list of liked and disliked images.
-
-        Args:
-            prompt (`str` or `List[str]`, *optional*):
-                The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`
-                instead.
-            negative_prompt (`str` or `List[str]`, *optional*):
-                The prompt or prompts to guide what to not include in image generation. If not defined, you need to
-                pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
-            liked (`List[Image.Image]` or `List[str]`, *optional*):
-                Encourages images with liked features.
-            disliked (`List[Image.Image]` or `List[str]`, *optional*):
-                Discourages images with disliked features.
-            generator (`torch.Generator` or `List[torch.Generator]` or `int`, *optional*):
-                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) or an `int` to
-                make generation deterministic.
-            height (`int`, *optional*, defaults to 512):
-                Height of the generated image.
-            width (`int`, *optional*, defaults to 512):
-                Width of the generated image.
-            num_images (`int`, *optional*, defaults to 4):
-                The number of images to generate per prompt.
-            guidance_scale (`float`, *optional*, defaults to 7.0):
-                A higher guidance scale value encourages the model to generate images closely linked to the text
-                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
-            num_inference_steps (`int`, *optional*, defaults to 20):
-                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
-                expense of slower inference.
-            output_type (`str`, *optional*, defaults to `"pil"`):
-                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
-            return_dict (`bool`, *optional*, defaults to `True`):
-                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
-                plain tuple.
-            feedback_start_ratio (`float`, *optional*, defaults to `.33`):
-                Start point for providing feedback (between 0 and 1).
-            feedback_end_ratio (`float`, *optional*, defaults to `.66`):
-                End point for providing feedback (between 0 and 1).
-            min_weight (`float`, *optional*, defaults to `.05`):
-                Minimum weight for feedback.
-            max_weight (`float`, *optional*, defults tp `1.0`):
-                Maximum weight for feedback.
-            neg_scale (`float`, *optional*, defaults to `.5`):
-                Scale factor for negative feedback.
-
-        Examples:
-
-        Returns:
-            [`~pipelines.fabric.FabricPipelineOutput`] or `tuple`:
-                If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned,
-                otherwise a `tuple` is returned where the first element is a list with the generated images and the
-                second element is a list of `bool`s indicating whether the corresponding generated image contains
-                "not-safe-for-work" (nsfw) content.
-
-        """
-
-        self.check_inputs(prompt, negative_prompt, liked, disliked)
-
-        device = self._execution_device
-        dtype = self.unet.dtype
-
-        if isinstance(prompt, str) and prompt is not None:
-            batch_size = 1
-        elif isinstance(prompt, list) and prompt is not None:
-            batch_size = len(prompt)
-        else:
-            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
-
-        if isinstance(negative_prompt, str):
-            negative_prompt = negative_prompt
-        elif isinstance(negative_prompt, list):
-            negative_prompt = negative_prompt
-        else:
-            assert len(negative_prompt) == batch_size
-
-        shape = (
-            batch_size * num_images,
-            self.unet.config.in_channels,
-            height // self.vae_scale_factor,
-            width // self.vae_scale_factor,
-        )
-        latent_noise = randn_tensor(
-            shape,
-            device=device,
-            dtype=dtype,
-            generator=generator,
-        )
-
-        positive_latents = (
-            self.preprocess_feedback_images(liked, self.vae, (height, width), device, dtype, generator)
-            if liked and len(liked) > 0
-            else torch.tensor(
-                [],
-                device=device,
-                dtype=dtype,
-            )
-        )
-        negative_latents = (
-            self.preprocess_feedback_images(disliked, self.vae, (height, width), device, dtype, generator)
-            if disliked and len(disliked) > 0
-            else torch.tensor(
-                [],
-                device=device,
-                dtype=dtype,
-            )
-        )
-
-        do_classifier_free_guidance = guidance_scale > 0.1
-
-        (prompt_neg_embs, prompt_pos_embs) = self._encode_prompt(
-            prompt,
-            device,
-            num_images,
-            do_classifier_free_guidance,
-            negative_prompt,
-        ).split([num_images * batch_size, num_images * batch_size])
-
-        batched_prompt_embd = torch.cat([prompt_pos_embs, prompt_neg_embs], dim=0)
-
-        null_tokens = self.tokenizer(
-            [""],
-            return_tensors="pt",
-            max_length=self.tokenizer.model_max_length,
-            padding="max_length",
-            truncation=True,
-        )
-
-        if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
-            attention_mask = null_tokens.attention_mask.to(device)
-        else:
-            attention_mask = None
-
-        null_prompt_emb = self.text_encoder(
-            input_ids=null_tokens.input_ids.to(device),
-            attention_mask=attention_mask,
-        ).last_hidden_state
-
-        null_prompt_emb = null_prompt_emb.to(device=device, dtype=dtype)
-
-        self.scheduler.set_timesteps(num_inference_steps, device=device)
-        timesteps = self.scheduler.timesteps
-        latent_noise = latent_noise * self.scheduler.init_noise_sigma
-
-        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
-
-        ref_start_idx = round(len(timesteps) * feedback_start_ratio)
-        ref_end_idx = round(len(timesteps) * feedback_end_ratio)
-
-        with self.progress_bar(total=num_inference_steps) as pbar:
-            for i, t in enumerate(timesteps):
-                sigma = self.scheduler.sigma_t[t] if hasattr(self.scheduler, "sigma_t") else 0
-                if hasattr(self.scheduler, "sigmas"):
-                    sigma = self.scheduler.sigmas[i]
-
-                alpha_hat = 1 / (sigma**2 + 1)
-
-                z_single = self.scheduler.scale_model_input(latent_noise, t)
-                z_all = torch.cat([z_single] * 2, dim=0)
-                z_ref = torch.cat([positive_latents, negative_latents], dim=0)
-
-                if i >= ref_start_idx and i <= ref_end_idx:
-                    weight_factor = max_weight
-                else:
-                    weight_factor = min_weight
-
-                pos_ws = (weight_factor, weight_factor * pos_bottleneck_scale)
-                neg_ws = (weight_factor * neg_scale, weight_factor * neg_scale * neg_bottleneck_scale)
-
-                if z_ref.size(0) > 0 and weight_factor > 0:
-                    noise = torch.randn_like(z_ref)
-                    if isinstance(self.scheduler, EulerAncestralDiscreteScheduler):
-                        z_ref_noised = (alpha_hat**0.5 * z_ref + (1 - alpha_hat) ** 0.5 * noise).type(dtype)
-                    else:
-                        z_ref_noised = self.scheduler.add_noise(z_ref, noise, t)
-
-                    ref_prompt_embd = torch.cat(
-                        [null_prompt_emb] * (len(positive_latents) + len(negative_latents)), dim=0
-                    )
-                    cached_hidden_states = self.get_unet_hidden_states(z_ref_noised, t, ref_prompt_embd)
-
-                    n_pos, n_neg = positive_latents.shape[0], negative_latents.shape[0]
-                    cached_pos_hs, cached_neg_hs = [], []
-                    for hs in cached_hidden_states:
-                        cached_pos, cached_neg = hs.split([n_pos, n_neg], dim=0)
-                        cached_pos = cached_pos.view(1, -1, *cached_pos.shape[2:]).expand(num_images, -1, -1)
-                        cached_neg = cached_neg.view(1, -1, *cached_neg.shape[2:]).expand(num_images, -1, -1)
-                        cached_pos_hs.append(cached_pos)
-                        cached_neg_hs.append(cached_neg)
-
-                    if n_pos == 0:
-                        cached_pos_hs = None
-                    if n_neg == 0:
-                        cached_neg_hs = None
-                else:
-                    cached_pos_hs, cached_neg_hs = None, None
-                unet_out = self.unet_forward_with_cached_hidden_states(
-                    z_all,
-                    t,
-                    prompt_embd=batched_prompt_embd,
-                    cached_pos_hiddens=cached_pos_hs,
-                    cached_neg_hiddens=cached_neg_hs,
-                    pos_weights=pos_ws,
-                    neg_weights=neg_ws,
-                )[0]
-
-                noise_cond, noise_uncond = unet_out.chunk(2)
-                guidance = noise_cond - noise_uncond
-                noise_pred = noise_uncond + guidance_scale * guidance
-                latent_noise = self.scheduler.step(noise_pred, t, latent_noise)[0]
-
-                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
-                    pbar.update()
-
-        y = self.vae.decode(latent_noise / self.vae.config.scaling_factor, return_dict=False)[0]
-        imgs = self.image_processor.postprocess(
-            y,
-            output_type=output_type,
-        )
-
-        if not return_dict:
-            return imgs
-
-        return StableDiffusionPipelineOutput(imgs, False)
-
-    def image_to_tensor(self, image: Union[str, Image.Image], dim: tuple, dtype):
-        """
-        Convert latent PIL image to a torch tensor for further processing.
-        """
-        if isinstance(image, str):
-            image = Image.open(image)
-        if not image.mode == "RGB":
-            image = image.convert("RGB")
-        image = self.image_processor.preprocess(image, height=dim[0], width=dim[1])[0]
-        return image.type(dtype)
--- a/examples/community/pipeline_zero1to3.py
+++ b/examples/community/pipeline_zero1to3.py
@@ -1,890 +0,0 @@
-# A diffuser version implementation of Zero1to3 (https://github.com/cvlab-columbia/zero123), ICCV 2023
-# by Xin Kong
-
-import inspect
-from typing import Any, Callable, Dict, List, Optional, Union
-
-import kornia
-import numpy as np
-import PIL
-import torch
-from packaging import version
-from transformers import CLIPFeatureExtractor, CLIPVisionModelWithProjection
-
-# from ...configuration_utils import FrozenDict
-# from ...models import AutoencoderKL, UNet2DConditionModel
-# from ...schedulers import KarrasDiffusionSchedulers
-# from ...utils import (
-#     deprecate,
-#     is_accelerate_available,
-#     is_accelerate_version,
-#     logging,
-#     randn_tensor,
-#     replace_example_docstring,
-# )
-# from ..pipeline_utils import DiffusionPipeline
-# from . import StableDiffusionPipelineOutput
-# from .safety_checker import StableDiffusionSafetyChecker
-from diffusers import AutoencoderKL, DiffusionPipeline, UNet2DConditionModel
-from diffusers.configuration_utils import ConfigMixin, FrozenDict
-from diffusers.models.modeling_utils import ModelMixin
-from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput, StableDiffusionSafetyChecker
-from diffusers.schedulers import KarrasDiffusionSchedulers
-from diffusers.utils import (
-    deprecate,
-    is_accelerate_available,
-    is_accelerate_version,
-    logging,
-    randn_tensor,
-    replace_example_docstring,
-)
-
-
-logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
-# todo
-EXAMPLE_DOC_STRING = """
-    Examples:
-        ```py
-        >>> import torch
-        >>> from diffusers import StableDiffusionPipeline
-
-        >>> pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16)
-        >>> pipe = pipe.to("cuda")
-
-        >>> prompt = "a photo of an astronaut riding a horse on mars"
-        >>> image = pipe(prompt).images[0]
-        ```
-"""
-
-
-class CCProjection(ModelMixin, ConfigMixin):
-    def __init__(self, in_channel=772, out_channel=768):
-        super().__init__()
-        self.in_channel = in_channel
-        self.out_channel = out_channel
-        self.projection = torch.nn.Linear(in_channel, out_channel)
-
-    def forward(self, x):
-        return self.projection(x)
-
-
-class Zero1to3StableDiffusionPipeline(DiffusionPipeline):
-    r"""
-    Pipeline for single view conditioned novel view generation using Zero1to3.
-
-    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
-    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
-
-    Args:
-        vae ([`AutoencoderKL`]):
-            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
-        image_encoder ([`CLIPVisionModelWithProjection`]):
-            Frozen CLIP image-encoder. Stable Diffusion Image Variation uses the vision portion of
-            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPVisionModelWithProjection),
-            specifically the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
-        unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
-        scheduler ([`SchedulerMixin`]):
-            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
-            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
-        safety_checker ([`StableDiffusionSafetyChecker`]):
-            Classification module that estimates whether generated images could be considered offensive or harmful.
-            Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details.
-        feature_extractor ([`CLIPFeatureExtractor`]):
-            Model that extracts features from generated images to be used as inputs for the `safety_checker`.
-        cc_projection ([`CCProjection`]):
-            Projection layer to project the concated CLIP features and pose embeddings to the original CLIP feature size.
-    """
-    _optional_components = ["safety_checker", "feature_extractor"]
-
-    def __init__(
-        self,
-        vae: AutoencoderKL,
-        image_encoder: CLIPVisionModelWithProjection,
-        unet: UNet2DConditionModel,
-        scheduler: KarrasDiffusionSchedulers,
-        safety_checker: StableDiffusionSafetyChecker,
-        feature_extractor: CLIPFeatureExtractor,
-        cc_projection: CCProjection,
-        requires_safety_checker: bool = True,
-    ):
-        super().__init__()
-
-        if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1:
-            deprecation_message = (
-                f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
-                f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
-                "to update the config accordingly as leaving `steps_offset` might led to incorrect results"
-                " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
-                " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
-                " file"
-            )
-            deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False)
-            new_config = dict(scheduler.config)
-            new_config["steps_offset"] = 1
-            scheduler._internal_dict = FrozenDict(new_config)
-
-        if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True:
-            deprecation_message = (
-                f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`."
-                " `clip_sample` should be set to False in the configuration file. Please make sure to update the"
-                " config accordingly as not setting `clip_sample` in the config might lead to incorrect results in"
-                " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very"
-                " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file"
-            )
-            deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False)
-            new_config = dict(scheduler.config)
-            new_config["clip_sample"] = False
-            scheduler._internal_dict = FrozenDict(new_config)
-
-        if safety_checker is None and requires_safety_checker:
-            logger.warning(
-                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
-                " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
-                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
-                " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
-                " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
-                " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
-            )
-
-        if safety_checker is not None and feature_extractor is None:
-            raise ValueError(
-                "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
-                " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
-            )
-
-        is_unet_version_less_0_9_0 = hasattr(unet.config, "_diffusers_version") and version.parse(
-            version.parse(unet.config._diffusers_version).base_version
-        ) < version.parse("0.9.0.dev0")
-        is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64
-        if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64:
-            deprecation_message = (
-                "The configuration file of the unet has set the default `sample_size` to smaller than"
-                " 64 which seems highly unlikely. If your checkpoint is a fine-tuned version of any of the"
-                " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-"
-                " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5"
-                " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the"
-                " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`"
-                " in the config might lead to incorrect results in future versions. If you have downloaded this"
-                " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for"
-                " the `unet/config.json` file"
-            )
-            deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False)
-            new_config = dict(unet.config)
-            new_config["sample_size"] = 64
-            unet._internal_dict = FrozenDict(new_config)
-
-        self.register_modules(
-            vae=vae,
-            image_encoder=image_encoder,
-            unet=unet,
-            scheduler=scheduler,
-            safety_checker=safety_checker,
-            feature_extractor=feature_extractor,
-            cc_projection=cc_projection,
-        )
-        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
-        self.register_to_config(requires_safety_checker=requires_safety_checker)
-        # self.model_mode = None
-
-    def enable_vae_slicing(self):
-        r"""
-        Turn on sliced decoding for the pipeline's VAE.
-
-        Delegates to `self.vae.enable_slicing()`. With slicing enabled, the VAE splits the input tensor into slices
-        and decodes them over several steps, which saves memory and allows larger batch sizes.
-        """
-        vae = self.vae
-        vae.enable_slicing()
-
-    def disable_vae_slicing(self):
-        r"""
-        Turn off sliced decoding for the pipeline's VAE.
-
-        Delegates to `self.vae.disable_slicing()`. If `enable_vae_slicing` was called before, decoding goes back to
-        being computed in one step.
-        """
-        vae = self.vae
-        vae.disable_slicing()
-
-    def enable_vae_tiling(self):
-        r"""
-        Turn on tiled processing for the pipeline's VAE.
-
-        Delegates to `self.vae.enable_tiling()`. With tiling enabled, the VAE splits the input tensor into tiles and
-        runs decoding and encoding over several steps, which saves a large amount of memory and allows larger images
-        to be processed.
-        """
-        vae = self.vae
-        vae.enable_tiling()
-
-    def disable_vae_tiling(self):
-        r"""
-        Turn off tiled processing for the pipeline's VAE.
-
-        Delegates to `self.vae.disable_tiling()`. If `enable_vae_tiling` was called before, decoding goes back to
-        being computed in one step.
-        """
-        vae = self.vae
-        vae.disable_tiling()
-
-    def enable_sequential_cpu_offload(self, gpu_id=0):
-        r"""
-        Offloads all models to CPU using accelerate, significantly reducing memory usage. Each offloaded model is
-        passed to accelerate's `cpu_offload` with `cuda:{gpu_id}` as its execution device, so it is loaded to GPU only
-        when a specific submodule has its `forward` method called. Note that offloading happens on a submodule basis.
-        Memory savings are higher than with `enable_model_cpu_offload`, but performance is lower.
-
-        The offloaded models are `unet`, `image_encoder`, `vae` and `cc_projection`, plus `safety_checker` when one is
-        set. This pipeline registers no `text_encoder`, so none is offloaded.
-
-        Args:
-            gpu_id (`int`, *optional*, defaults to 0):
-                Index of the CUDA device used as execution device.
-
-        Raises:
-            ImportError: If accelerate is missing or older than v0.14.0.
-        """
-        if is_accelerate_available() and is_accelerate_version(">=", "0.14.0"):
-            from accelerate import cpu_offload
-        else:
-            raise ImportError("`enable_sequential_cpu_offload` requires `accelerate v0.14.0` or higher")
-
-        device = torch.device(f"cuda:{gpu_id}")
-
-        if self.device.type != "cpu":
-            self.to("cpu", silence_dtype_warnings=True)
-            torch.cuda.empty_cache()  # otherwise we don't see the memory savings (but they probably exist)
-
-        # Conditioning here comes from the image encoder and the pose projection; `self.text_encoder` is not a
-        # registered module of this pipeline, so referencing it would raise AttributeError.
-        for cpu_offloaded_model in [self.unet, self.image_encoder, self.vae, self.cc_projection]:
-            if cpu_offloaded_model is not None:
-                cpu_offload(cpu_offloaded_model, device)
-
-        if self.safety_checker is not None:
-            cpu_offload(self.safety_checker, execution_device=device, offload_buffers=True)
-
-    def enable_model_cpu_offload(self, gpu_id=0):
-        r"""
-        Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
-        to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward`
-        method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with
-        `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`.
-
-        The hooked models are `image_encoder`, `cc_projection`, `unet` and `vae`, plus `safety_checker` when one is
-        set. This pipeline registers no `text_encoder`, so none is hooked. The hook of the last model is stored in
-        `self.final_offload_hook` so that it can be offloaded manually.
-
-        Args:
-            gpu_id (`int`, *optional*, defaults to 0):
-                Index of the CUDA device used as execution device.
-
-        Raises:
-            ImportError: If accelerate is missing or older than v0.17.0.
-        """
-        if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"):
-            from accelerate import cpu_offload_with_hook
-        else:
-            raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.")
-
-        device = torch.device(f"cuda:{gpu_id}")
-
-        if self.device.type != "cpu":
-            self.to("cpu", silence_dtype_warnings=True)
-            torch.cuda.empty_cache()  # otherwise we don't see the memory savings (but they probably exist)
-
-        # Each hook is chained to the previous one through `prev_module_hook`. `self.text_encoder` is not a
-        # registered module of this pipeline, so the image encoder and pose projection are chained instead.
-        hook = None
-        for cpu_offloaded_model in [self.image_encoder, self.cc_projection, self.unet, self.vae]:
-            if cpu_offloaded_model is not None:
-                _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook)
-
-        if self.safety_checker is not None:
-            _, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook)
-
-        # We'll offload the last model manually.
-        self.final_offload_hook = hook
-
-    @property
-    def _execution_device(self):
-        r"""
-        Device on which the pipeline's models will be executed.
-
-        If the unet carries no accelerate hook (`_hf_hook`), this is `self.device`. Otherwise the unet's modules are
-        scanned in order and the `execution_device` of the first hook that has a non-`None` one is returned, wrapped
-        in `torch.device`. `self.device` is the fallback when no hook provides a device.
-        """
-        if hasattr(self.unet, "_hf_hook"):
-            for submodule in self.unet.modules():
-                if not hasattr(submodule, "_hf_hook"):
-                    continue
-                target = getattr(submodule._hf_hook, "execution_device", None)
-                if target is not None:
-                    return torch.device(target)
-        return self.device
-
-    def _encode_prompt(
-        self,
-        prompt,
-        device,
-        num_images_per_prompt,
-        do_classifier_free_guidance,
-        negative_prompt=None,
-        prompt_embeds: Optional[torch.FloatTensor] = None,
-        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
-    ):
-        r"""
-        Encodes the prompt into text encoder hidden states.
-
-        NOTE(review): this method reads `self.tokenizer` and `self.text_encoder`, but the `register_modules` call
-        earlier in this class registers neither of them. Confirm whether this method is still reachable.
-
-        Args:
-            prompt (`str` or `List[str]`, *optional*):
-                prompt to be encoded
-            device: (`torch.device`):
-                torch device
-            num_images_per_prompt (`int`):
-                number of images that should be generated per prompt
-            do_classifier_free_guidance (`bool`):
-                whether to use classifier free guidance or not
-            negative_prompt (`str` or `List[str]`, *optional*):
-                The prompt or prompts not to guide the image generation. If not defined, one has to pass
-                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if
-                `guidance_scale` is less than `1`).
-            prompt_embeds (`torch.FloatTensor`, *optional*):
-                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
-                provided, text embeddings will be generated from `prompt` input argument.
-            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
-                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
-                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
-                argument.
-
-        Returns:
-            `torch.FloatTensor`: the prompt embeddings, repeated `num_images_per_prompt` times per prompt. When
-            `do_classifier_free_guidance` is set, the negative embeddings are concatenated in front of them.
-
-        Raises:
-            TypeError: If `negative_prompt` is given and its type differs from the type of `prompt`.
-            ValueError: If `negative_prompt` is a list whose length differs from the prompt batch size.
-        """
-        # The batch size comes from `prompt` when it is a str or list; otherwise `prompt_embeds` must be provided.
-        if prompt is not None and isinstance(prompt, str):
-            batch_size = 1
-        elif prompt is not None and isinstance(prompt, list):
-            batch_size = len(prompt)
-        else:
-            batch_size = prompt_embeds.shape[0]
-
-        if prompt_embeds is None:
-            text_inputs = self.tokenizer(
-                prompt,
-                padding="max_length",
-                max_length=self.tokenizer.model_max_length,
-                truncation=True,
-                return_tensors="pt",
-            )
-            text_input_ids = text_inputs.input_ids
-            # Tokenize a second time without truncation, only to detect and report text that was cut off.
-            untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
-
-            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
-                text_input_ids, untruncated_ids
-            ):
-                removed_text = self.tokenizer.batch_decode(
-                    untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
-                )
-                logger.warning(
-                    "The following part of your input was truncated because CLIP can only handle sequences up to"
-                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
-                )
-
-            # The attention mask is passed only when the text encoder config sets `use_attention_mask`.
-            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
-                attention_mask = text_inputs.attention_mask.to(device)
-            else:
-                attention_mask = None
-
-            prompt_embeds = self.text_encoder(
-                text_input_ids.to(device),
-                attention_mask=attention_mask,
-            )
-            # Only the first element of the text encoder output is used.
-            prompt_embeds = prompt_embeds[0]
-
-        prompt_embeds = prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
-
-        bs_embed, seq_len, _ = prompt_embeds.shape
-        # duplicate text embeddings for each generation per prompt, using mps friendly method
-        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
-        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
-
-        # get unconditional embeddings for classifier free guidance
-        if do_classifier_free_guidance and negative_prompt_embeds is None:
-            uncond_tokens: List[str]
-            if negative_prompt is None:
-                # No negative prompt: use one empty string per prompt.
-                uncond_tokens = [""] * batch_size
-            elif type(prompt) is not type(negative_prompt):
-                raise TypeError(
-                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
-                    f" {type(prompt)}."
-                )
-            elif isinstance(negative_prompt, str):
-                uncond_tokens = [negative_prompt]
-            elif batch_size != len(negative_prompt):
-                raise ValueError(
-                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
-                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
-                    " the batch size of `prompt`."
-                )
-            else:
-                uncond_tokens = negative_prompt
-
-            # Pad the negative tokens to the sequence length of the positive embeddings.
-            max_length = prompt_embeds.shape[1]
-            uncond_input = self.tokenizer(
-                uncond_tokens,
-                padding="max_length",
-                max_length=max_length,
-                truncation=True,
-                return_tensors="pt",
-            )
-
-            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
-                attention_mask = uncond_input.attention_mask.to(device)
-            else:
-                attention_mask = None
-
-            negative_prompt_embeds = self.text_encoder(
-                uncond_input.input_ids.to(device),
-                attention_mask=attention_mask,
-            )
-            negative_prompt_embeds = negative_prompt_embeds[0]
-
-        if do_classifier_free_guidance:
-            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
-            seq_len = negative_prompt_embeds.shape[1]
-
-            negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
-
-            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
-            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
-
-            # For classifier free guidance, we need to do two forward passes.
-            # Here we concatenate the unconditional and text embeddings into a single batch
-            # to avoid doing two forward passes
-            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
-
-        return prompt_embeds
-
-    def CLIP_preprocess(self, x):
-        r"""
-        Resize and normalize an image batch for the CLIP image encoder, following OpenAI's preprocessing.
-
-        The input is resized to 224x224 with bicubic interpolation (computed in float32, then cast back to the input
-        dtype), shifted from [-1, 1] to [0, 1], and normalized with the CLIP mean and standard deviation.
-
-        Args:
-            x: Image batch with values in [-1, 1].
-
-        Raises:
-            ValueError: If `x` is a tensor with values outside [-1, 1].
-        """
-        input_dtype = x.dtype
-        # following openai's implementation
-        # TODO HF OpenAI CLIP preprocessing issue https://github.com/huggingface/transformers/issues/22505#issuecomment-1650170741
-        # follow openai preprocessing to keep exact same, input tensor [-1, 1], otherwise the preprocessing will be different, https://github.com/huggingface/transformers/pull/22608
-        if isinstance(x, torch.Tensor) and (x.min() < -1.0 or x.max() > 1.0):
-            raise ValueError("Expected input tensor to have values in the range [-1, 1]")
-        resized = kornia.geometry.resize(
-            x.to(torch.float32), (224, 224), interpolation="bicubic", align_corners=True, antialias=False
-        )
-        resized = resized.to(dtype=input_dtype)
-        rescaled = (resized + 1.0) / 2.0
-        # renormalize according to clip
-        clip_mean = torch.Tensor([0.48145466, 0.4578275, 0.40821073])
-        clip_std = torch.Tensor([0.26862954, 0.26130258, 0.27577711])
-        return kornia.enhance.normalize(rescaled, clip_mean, clip_std)
-
-    # from image_variation
-    def _encode_image(self, image, device, num_images_per_prompt, do_classifier_free_guidance):
-        r"""
-        Encode the conditioning image(s) into CLIP image embeddings.
-
-        Tensor input must already be in [-1, 1]; a single `(3, H, W)` tensor is given a batch axis. PIL images (or a
-        list of PIL images / numpy arrays) are stacked, moved to channels-first and rescaled with `/ 127.5 - 1.0`.
-        The batch is then run through `self.CLIP_preprocess` and `self.image_encoder`.
-
-        Args:
-            image: `torch.Tensor`, `PIL.Image.Image` or list.
-            device: Device the image is moved to before encoding.
-            num_images_per_prompt: How many times each embedding is repeated.
-            do_classifier_free_guidance: If set, all-zero embeddings are concatenated in front of the result.
-
-        Returns:
-            The image embeddings with a singleton sequence axis inserted at position 1.
-
-        Raises:
-            ValueError: If `image` has an unsupported type, or is a tensor with values outside [-1, 1].
-        """
-        encoder_dtype = next(self.image_encoder.parameters()).dtype
-        if not isinstance(image, (torch.Tensor, PIL.Image.Image, list)):
-            raise ValueError(
-                f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is {type(image)}"
-            )
-
-        if isinstance(image, torch.Tensor):
-            # a lone image gets a leading batch axis
-            if image.ndim == 3:
-                assert image.shape[0] == 3, "Image outside a batch should be of shape (3, H, W)"
-                image = image.unsqueeze(0)
-
-            assert image.ndim == 4, "Image must have 4 dimensions"
-
-            # tensors are expected to be normalized to [-1, 1] already
-            if image.min() < -1 or image.max() > 1:
-                raise ValueError("Image should be in [-1, 1] range")
-        else:
-            # stack PIL / numpy input into one array, then convert it to a [-1, 1] tensor
-            frames = [image] if isinstance(image, (PIL.Image.Image, np.ndarray)) else image
-
-            if isinstance(frames, list) and isinstance(frames[0], PIL.Image.Image):
-                frames = np.concatenate([np.array(pic.convert("RGB"))[None, :] for pic in frames], axis=0)
-            elif isinstance(frames, list) and isinstance(frames[0], np.ndarray):
-                frames = np.concatenate([arr[None, :] for arr in frames], axis=0)
-
-            image = torch.from_numpy(frames.transpose(0, 3, 1, 2)).to(dtype=torch.float32) / 127.5 - 1.0
-
-        image = image.to(device=device, dtype=encoder_dtype)
-        image = self.CLIP_preprocess(image)
-
-        embeds = self.image_encoder(image).image_embeds.to(dtype=encoder_dtype).unsqueeze(1)
-
-        # duplicate image embeddings for each generation per prompt, using mps friendly method
-        batch, tokens, _ = embeds.shape
-        embeds = embeds.repeat(1, num_images_per_prompt, 1).view(batch * num_images_per_prompt, tokens, -1)
-
-        if do_classifier_free_guidance:
-            # The unconditional half is all zeros; both halves share one batch so a single forward pass
-            # covers them.
-            embeds = torch.cat([torch.zeros_like(embeds), embeds])
-
-        return embeds
-
-    def _encode_pose(self, pose, device, num_images_per_prompt, do_classifier_free_guidance):
-        r"""
-        Turn pose input into pose embeddings matching the dtype of `self.cc_projection`.
-
-        A tensor `pose` is used as is, with a sequence axis inserted at position 1. Otherwise `pose` is a list of
-        three numbers (or a list of such lists), and each row `(a, b, c)` becomes the 4-vector
-        `[deg2rad(a), sin(deg2rad(b)), cos(deg2rad(b)), c]`.
-
-        Args:
-            pose: `torch.Tensor`, `List[float]` or `List[List[float]]`.
-            device: Device the embeddings are moved to.
-            num_images_per_prompt: How many times each embedding is repeated.
-            do_classifier_free_guidance: If set, all-zero embeddings are concatenated in front of the result.
-
-        Returns:
-            Pose embeddings with a singleton sequence axis at position 1.
-        """
-        proj_dtype = next(self.cc_projection.parameters()).dtype
-        if isinstance(pose, torch.Tensor):
-            embeds = pose.unsqueeze(1).to(device=device, dtype=proj_dtype)
-        else:
-            # a single pose is wrapped so that the tensor always has a batch axis
-            rows = pose if isinstance(pose[0], list) else [pose]
-            pose = torch.Tensor(rows)
-            col0 = pose[:, 0].unsqueeze(1)
-            col1 = pose[:, 1].unsqueeze(1)
-            col2 = pose[:, 2].unsqueeze(1)
-            angle = torch.deg2rad(col1)
-            features = torch.cat([torch.deg2rad(col0), torch.sin(angle), torch.cos(angle), col2], dim=-1)
-            embeds = features.unsqueeze(1).to(device=device, dtype=proj_dtype)  # B, 1, 4
-        # duplicate pose embeddings for each generation per prompt, using mps friendly method
-        batch, tokens, _ = embeds.shape
-        embeds = embeds.repeat(1, num_images_per_prompt, 1).view(batch * num_images_per_prompt, tokens, -1)
-        if not do_classifier_free_guidance:
-            return embeds
-        # The unconditional half is all zeros; both halves share one batch so a single forward pass covers them.
-        return torch.cat([torch.zeros_like(embeds), embeds])
-
-    def _encode_image_with_pose(self, image, pose, device, num_images_per_prompt, do_classifier_free_guidance):
-        r"""
-        Build the conditioning embeddings from an image and a pose.
-
-        The image and pose embeddings are computed without classifier-free guidance, concatenated along the last
-        axis, and passed through `self.cc_projection`. When `do_classifier_free_guidance` is set, all-zero
-        embeddings are concatenated in front of the projected ones.
-        """
-        image_part = self._encode_image(image, device, num_images_per_prompt, False)
-        pose_part = self._encode_pose(pose, device, num_images_per_prompt, False)
-        projected = self.cc_projection(torch.cat([image_part, pose_part], dim=-1))
-        if not do_classifier_free_guidance:
-            return projected
-        # follow 0123, add negative prompt, after projection
-        return torch.cat([torch.zeros_like(projected), projected])
-
-    def run_safety_checker(self, image, device, dtype):
-        r"""
-        Run the safety checker on `image`, if the pipeline has one.
-
-        Returns:
-            A tuple `(image, has_nsfw_concept)`. Without a safety checker, `image` is returned unchanged and
-            `has_nsfw_concept` is `None`; otherwise both values come from the safety checker.
-        """
-        if self.safety_checker is None:
-            return image, None
-        checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(device)
-        clip_input = checker_input.pixel_values.to(dtype)
-        image, has_nsfw_concept = self.safety_checker(images=image, clip_input=clip_input)
-        return image, has_nsfw_concept
-
-    def decode_latents(self, latents):
-        r"""
-        Decode latents into images with the VAE.
-
-        The latents are multiplied by `1 / self.vae.config.scaling_factor` and decoded. The result is mapped with
-        `image / 2 + 0.5`, clamped to [0, 1], and returned as a float32 numpy array with the channel axis moved last.
-        """
-        inverse_scale = 1 / self.vae.config.scaling_factor
-        decoded = self.vae.decode(inverse_scale * latents).sample
-        decoded = (decoded / 2 + 0.5).clamp(0, 1)
-        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
-        channels_last = decoded.cpu().permute(0, 2, 3, 1)
-        return channels_last.float().numpy()
-
-    def prepare_extra_step_kwargs(self, generator, eta):
-        r"""
-        Collect the optional keyword arguments that `self.scheduler.step` accepts.
-
-        Not all schedulers have the same `step` signature, so `eta` and `generator` are only included when `step`
-        declares a parameter of that name. eta (η) is only used with the DDIMScheduler and corresponds to η in the
-        DDIM paper (https://arxiv.org/abs/2010.02502); it should be between [0, 1].
-
-        Returns:
-            `dict` with a subset of the keys `"eta"` and `"generator"`.
-        """
-        step_params = inspect.signature(self.scheduler.step).parameters
-        candidates = {"eta": eta, "generator": generator}
-        return {name: value for name, value in candidates.items() if name in step_params}
-
-    def check_inputs(self, image, height, width, callback_steps):
-        r"""
-        Validate the arguments of a pipeline call.
-
-        Raises:
-            ValueError: If `image` is not a `torch.Tensor`, `PIL.Image.Image` or list, if `height` or `width` is
-                not divisible by 8, or if `callback_steps` is not a positive integer.
-        """
-        image_type_ok = (
-            isinstance(image, torch.Tensor) or isinstance(image, PIL.Image.Image) or isinstance(image, list)
-        )
-        if not image_type_ok:
-            raise ValueError(
-                "`image` has to be of type `torch.FloatTensor` or `PIL.Image.Image` or `List[PIL.Image.Image]` but is"
-                f" {type(image)}"
-            )
-
-        if height % 8 != 0 or width % 8 != 0:
-            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
-
-        # `None` is rejected as well: a positive int is mandatory
-        if callback_steps is None or not isinstance(callback_steps, int) or callback_steps <= 0:
-            raise ValueError(
-                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
-                f" {type(callback_steps)}."
-            )
-
-    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
-        r"""
-        Return the initial latents, scaled by `self.scheduler.init_noise_sigma`.
-
-        When `latents` is `None`, new latents of shape `(batch_size, num_channels_latents,
-        height // self.vae_scale_factor, width // self.vae_scale_factor)` are drawn with `randn_tensor`; otherwise
-        the given `latents` are moved to `device`.
-
-        Raises:
-            ValueError: If `generator` is a list whose length differs from `batch_size`.
-        """
-        latent_height = height // self.vae_scale_factor
-        latent_width = width // self.vae_scale_factor
-        latent_shape = (batch_size, num_channels_latents, latent_height, latent_width)
-        if isinstance(generator, list) and len(generator) != batch_size:
-            raise ValueError(
-                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
-                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
-            )
-
-        if latents is not None:
-            latents = latents.to(device)
-        else:
-            latents = randn_tensor(latent_shape, generator=generator, device=device, dtype=dtype)
-
-        # scale the initial noise by the standard deviation required by the scheduler
-        return latents * self.scheduler.init_noise_sigma
-
-    def prepare_img_latents(self, image, batch_size, dtype, device, generator=None, do_classifier_free_guidance=False):
-        r"""
-        Encode the conditioning image(s) into VAE latents.
-
-        Tensor input must already be in [-1, 1]; a single `(3, H, W)` tensor is given a batch axis. PIL images (or a
-        list of PIL images / numpy arrays) are stacked, moved to channels-first and rescaled with `/ 127.5 - 1.0`.
-        The latents are the mode of the VAE's latent distribution and are not multiplied by the VAE scaling factor.
-
-        Args:
-            image: `torch.Tensor`, `PIL.Image.Image` or list.
-            batch_size: Effective batch size. If it exceeds the number of encoded images, each latent is repeated
-                `batch_size // num_images` times.
-            dtype: dtype of the image and of the returned latents.
-            device: Device of the image and of the returned latents.
-            generator: Only validated. The mode of the latent distribution is deterministic, so no random numbers
-                are drawn.
-            do_classifier_free_guidance: If set, all-zero latents are concatenated in front of the result.
-
-        Returns:
-            The image latents on `device` with dtype `dtype`.
-
-        Raises:
-            ValueError: If `image` has an unsupported type, is a tensor with values outside [-1, 1], or if
-                `generator` is a list whose length differs from `batch_size`.
-        """
-        if not isinstance(image, (torch.Tensor, PIL.Image.Image, list)):
-            raise ValueError(
-                f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is {type(image)}"
-            )
-
-        if isinstance(image, torch.Tensor):
-            # Batch single image
-            if image.ndim == 3:
-                assert image.shape[0] == 3, "Image outside a batch should be of shape (3, H, W)"
-                image = image.unsqueeze(0)
-
-            assert image.ndim == 4, "Image must have 4 dimensions"
-
-            # Check image is in [-1, 1]
-            if image.min() < -1 or image.max() > 1:
-                raise ValueError("Image should be in [-1, 1] range")
-        else:
-            # preprocess image
-            if isinstance(image, (PIL.Image.Image, np.ndarray)):
-                image = [image]
-
-            if isinstance(image, list) and isinstance(image[0], PIL.Image.Image):
-                image = [np.array(i.convert("RGB"))[None, :] for i in image]
-                image = np.concatenate(image, axis=0)
-            elif isinstance(image, list) and isinstance(image[0], np.ndarray):
-                image = np.concatenate([i[None, :] for i in image], axis=0)
-
-            image = image.transpose(0, 3, 1, 2)
-            image = torch.from_numpy(image).to(dtype=torch.float32) / 127.5 - 1.0
-
-        image = image.to(device=device, dtype=dtype)
-
-        if isinstance(generator, list) and len(generator) != batch_size:
-            raise ValueError(
-                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
-                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
-            )
-
-        # `latent_dist.mode()` takes no generator argument and is deterministic, so the whole batch is encoded in
-        # one call whether `generator` is a single generator, a list, or None. Passing `generator[i]` to `mode`
-        # raised a TypeError, and looping over `range(batch_size)` could slice past the end of the image batch.
-        init_latents = self.vae.encode(image).latent_dist.mode()
-
-        # init_latents = self.vae.config.scaling_factor * init_latents  # todo in original zero123's inference gradio_new.py, model.encode_first_stage() is not scaled by scaling_factor
-        if batch_size > init_latents.shape[0]:
-            num_images_per_prompt = batch_size // init_latents.shape[0]
-            # duplicate image latents for each generation per prompt, using mps friendly method
-            bs_embed, emb_c, emb_h, emb_w = init_latents.shape
-            init_latents = init_latents.unsqueeze(1)
-            init_latents = init_latents.repeat(1, num_images_per_prompt, 1, 1, 1)
-            init_latents = init_latents.view(bs_embed * num_images_per_prompt, emb_c, emb_h, emb_w)
-
-        # The unconditional half uses all-zero latents instead of a second copy of the image latents.
-        init_latents = (
-            torch.cat([torch.zeros_like(init_latents), init_latents]) if do_classifier_free_guidance else init_latents
-        )
-
-        init_latents = init_latents.to(device=device, dtype=dtype)
-        return init_latents
-
-    # def load_cc_projection(self, pretrained_weights=None):
-    #     self.cc_projection = torch.nn.Linear(772, 768)
-    #     torch.nn.init.eye_(list(self.cc_projection.parameters())[0][:768, :768])
-    #     torch.nn.init.zeros_(list(self.cc_projection.parameters())[1])
-    #     if pretrained_weights is not None:
-    #         self.cc_projection.load_state_dict(pretrained_weights)
-
-    @torch.no_grad()
-    @replace_example_docstring(EXAMPLE_DOC_STRING)
-    def __call__(
-        self,
-        input_imgs: Union[torch.FloatTensor, PIL.Image.Image] = None,
-        prompt_imgs: Union[torch.FloatTensor, PIL.Image.Image] = None,
-        poses: Union[List[float], List[List[float]]] = None,
-        torch_dtype=torch.float32,
-        height: Optional[int] = None,
-        width: Optional[int] = None,
-        num_inference_steps: int = 50,
-        guidance_scale: float = 3.0,
-        negative_prompt: Optional[Union[str, List[str]]] = None,
-        num_images_per_prompt: Optional[int] = 1,
-        eta: float = 0.0,
-        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
-        latents: Optional[torch.FloatTensor] = None,
-        prompt_embeds: Optional[torch.FloatTensor] = None,
-        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
-        output_type: Optional[str] = "pil",
-        return_dict: bool = True,
-        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
-        callback_steps: int = 1,
-        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
-        controlnet_conditioning_scale: float = 1.0,
-    ):
-        r"""
-        Function invoked when calling the pipeline for generation.
-
-        Args:
-            input_imgs (`PIL` or `List[PIL]`, *optional*):
-                The single input image for each 3D object
-            prompt_imgs (`PIL` or `List[PIL]`, *optional*):
-                Same as input_imgs, but will be used later as an image prompt condition, encoded by CLIP feature
-            height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
-                The height in pixels of the generated image.
-            width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
-                The width in pixels of the generated image.
-            num_inference_steps (`int`, *optional*, defaults to 50):
-                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
-                expense of slower inference.
-            guidance_scale (`float`, *optional*, defaults to 3.0):
-                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
-                `guidance_scale` is defined as `w` of equation 2. of [Imagen
-                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
-                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
-                usually at the expense of lower image quality.
-            negative_prompt (`str` or `List[str]`, *optional*):
-                The prompt or prompts not to guide the image generation. If not defined, one has to pass
-                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if
-                `guidance_scale` is less than `1`).
-            num_images_per_prompt (`int`, *optional*, defaults to 1):
-                The number of images to generate per prompt.
-            eta (`float`, *optional*, defaults to 0.0):
-                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
-                [`schedulers.DDIMScheduler`], will be ignored for others.
-            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
-                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
-                to make generation deterministic.
-            latents (`torch.FloatTensor`, *optional*):
-                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
-                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
-                tensor will be generated by sampling using the supplied random `generator`.
-            prompt_embeds (`torch.FloatTensor`, *optional*):
-                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
-                provided, text embeddings will be generated from `prompt` input argument.
-            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
-                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
-                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
-                argument.
-            output_type (`str`, *optional*, defaults to `"pil"`):
-                The output format of the generate image. Choose between
-                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
-            return_dict (`bool`, *optional*, defaults to `True`):
-                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
-                plain tuple.
-            callback (`Callable`, *optional*):
-                A function that will be called every `callback_steps` steps during inference. The function will be
-                called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
-            callback_steps (`int`, *optional*, defaults to 1):
-                The frequency at which the `callback` function will be called. If not specified, the callback will be
-                called at every step.
-            cross_attention_kwargs (`dict`, *optional*):
-                A kwargs dictionary that if specified is passed along to the `AttnProcessor` as defined under
-                `self.processor` in
-                [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py).
-
-        Examples:
-
-        Returns:
-            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
-            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
-            When returning a tuple, the first element is a list with the generated images, and the second element is a
-            list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
-            (nsfw) content, according to the `safety_checker`.
-        """
-        # 0. Default height and width to unet
-        height = height or self.unet.config.sample_size * self.vae_scale_factor
-        width = width or self.unet.config.sample_size * self.vae_scale_factor
-
-        # 1. Check inputs. Raise error if not correct
-        # input_image = hint_imgs
-        self.check_inputs(input_imgs, height, width, callback_steps)
-
-        # 2. Define call parameters
-        if isinstance(input_imgs, PIL.Image.Image):
-            batch_size = 1
-        elif isinstance(input_imgs, list):
-            batch_size = len(input_imgs)
-        else:
-            batch_size = input_imgs.shape[0]
-        device = self._execution_device
-        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
-        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
-        # corresponds to doing no classifier free guidance.
-        do_classifier_free_guidance = guidance_scale > 1.0
-
-        # 3. Encode input image with pose as prompt
-        prompt_embeds = self._encode_image_with_pose(
-            prompt_imgs, poses, device, num_images_per_prompt, do_classifier_free_guidance
-        )
-
-        # 4. Prepare timesteps
-        self.scheduler.set_timesteps(num_inference_steps, device=device)
-        timesteps = self.scheduler.timesteps
-
-        # 5. Prepare latent variables
-        latents = self.prepare_latents(
-            batch_size * num_images_per_prompt,
-            4,
-            height,
-            width,
-            prompt_embeds.dtype,
-            device,
-            generator,
-            latents,
-        )
-
-        # 6. Prepare image latents
-        img_latents = self.prepare_img_latents(
-            input_imgs,
-            batch_size * num_images_per_prompt,
-            prompt_embeds.dtype,
-            device,
-            generator,
-            do_classifier_free_guidance,
-        )
-
-        # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
-        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
-
-        # 7. Denoising loop
-        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
-        with self.progress_bar(total=num_inference_steps) as progress_bar:
-            for i, t in enumerate(timesteps):
-                # expand the latents if we are doing classifier free guidance
-                latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
-                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
-                latent_model_input = torch.cat([latent_model_input, img_latents], dim=1)
-
-                # predict the noise residual
-                noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=prompt_embeds).sample
-
-                # perform guidance
-                if do_classifier_free_guidance:
-                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
-                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
-
-                # compute the previous noisy sample x_t -> x_t-1
-                # latents = self.scheduler.step(noise_pred.to(dtype=torch.float32), t, latents.to(dtype=torch.float32)).prev_sample.to(prompt_embeds.dtype)
-                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
-
-                # call the callback, if provided
-                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
-                    progress_bar.update()
-                    if callback is not None and i % callback_steps == 0:
-                        callback(i, t, latents)
-
-        # 8. Post-processing
-        has_nsfw_concept = None
-        if output_type == "latent":
-            image = latents
-        elif output_type == "pil":
-            # 8. Post-processing
-            image = self.decode_latents(latents)
-            # 10. Convert to PIL
-            image = self.numpy_to_pil(image)
-        else:
-            # 8. Post-processing
-            image = self.decode_latents(latents)
-
-        # Offload last model to CPU
-        if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
-            self.final_offload_hook.offload()
-
-        if not return_dict:
-            return (image, has_nsfw_concept)
-
-        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
--- a/examples/community/run_onnx_controlnet.py
+++ b/examples/community/run_onnx_controlnet.py
@@ -1,909 +0,0 @@
-import argparse
-import inspect
-import os
-import time
-import warnings
-from typing import Any, Callable, Dict, List, Optional, Union
-
-import numpy as np
-import PIL.Image
-import torch
-from PIL import Image
-from transformers import CLIPTokenizer
-
-from diffusers import OnnxRuntimeModel, StableDiffusionImg2ImgPipeline, UniPCMultistepScheduler
-from diffusers.image_processor import VaeImageProcessor
-from diffusers.pipelines.pipeline_utils import DiffusionPipeline
-from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
-from diffusers.schedulers import KarrasDiffusionSchedulers
-from diffusers.utils import (
-    deprecate,
-    logging,
-    randn_tensor,
-    replace_example_docstring,
-)
-
-
-# Module-level logger obtained through the `diffusers.utils.logging` helper.
-logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
-
-
-# Usage example that `replace_example_docstring` splices into the docstring of the
-# pipeline's `__call__` (see the decorator on that method).
-# NOTE(review): the example below instantiates the torch
-# `StableDiffusionControlNetImg2ImgPipeline` with a `ControlNetModel`, not the ONNX
-# pipeline class defined in this file -- confirm whether that is intentional.
-EXAMPLE_DOC_STRING = """
-    Examples:
-        ```py
-        >>> # !pip install opencv-python transformers accelerate
-        >>> from diffusers import StableDiffusionControlNetImg2ImgPipeline, ControlNetModel, UniPCMultistepScheduler
-        >>> from diffusers.utils import load_image
-        >>> import numpy as np
-        >>> import torch
-
-        >>> import cv2
-        >>> from PIL import Image
-
-        >>> # download an image
-        >>> image = load_image(
-        ...     "https://hf.co/datasets/huggingface/documentation-images/resolve/main/diffusers/input_image_vermeer.png"
-        ... )
-        >>> np_image = np.array(image)
-
-        >>> # get canny image
-        >>> np_image = cv2.Canny(np_image, 100, 200)
-        >>> np_image = np_image[:, :, None]
-        >>> np_image = np.concatenate([np_image, np_image, np_image], axis=2)
-        >>> canny_image = Image.fromarray(np_image)
-
-        >>> # load control net and stable diffusion v1-5
-        >>> controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny", torch_dtype=torch.float16)
-        >>> pipe = StableDiffusionControlNetImg2ImgPipeline.from_pretrained(
-        ...     "runwayml/stable-diffusion-v1-5", controlnet=controlnet, torch_dtype=torch.float16
-        ... )
-
-        >>> # speed up diffusion process with faster scheduler and memory optimization
-        >>> pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
-        >>> pipe.enable_model_cpu_offload()
-
-        >>> # generate image
-        >>> generator = torch.manual_seed(0)
-        >>> image = pipe(
-        ...     "futuristic-looking woman",
-        ...     num_inference_steps=20,
-        ...     generator=generator,
-        ...     image=image,
-        ...     control_image=canny_image,
-        ... ).images[0]
-        ```
-"""
-
-
-def prepare_image(image):
-    if isinstance(image, torch.Tensor):
-        # Batch single image
-        if image.ndim == 3:
-            image = image.unsqueeze(0)
-
-        image = image.to(dtype=torch.float32)
-    else:
-        # preprocess image
-        if isinstance(image, (PIL.Image.Image, np.ndarray)):
-            image = [image]
-
-        if isinstance(image, list) and isinstance(image[0], PIL.Image.Image):
-            image = [np.array(i.convert("RGB"))[None, :] for i in image]
-            image = np.concatenate(image, axis=0)
-        elif isinstance(image, list) and isinstance(image[0], np.ndarray):
-            image = np.concatenate([i[None, :] for i in image], axis=0)
-
-        image = image.transpose(0, 3, 1, 2)
-        image = torch.from_numpy(image).to(dtype=torch.float32) / 127.5 - 1.0
-
-    return image
-
-
-# Image-to-image ControlNet pipeline whose model components are `OnnxRuntimeModel` instances.
-class OnnxStableDiffusionControlNetImg2ImgPipeline(DiffusionPipeline):
-    # Component annotations; the actual objects are attached in `__init__` via `register_modules`.
-    # ONNX model used by `prepare_latents` to encode the input image.
-    vae_encoder: OnnxRuntimeModel
-    # ONNX VAE decoder -- presumably used to turn final latents into images; TODO confirm in `__call__`.
-    vae_decoder: OnnxRuntimeModel
-    # ONNX text encoder called by `_encode_prompt` with int32 token ids.
-    text_encoder: OnnxRuntimeModel
-    # Tokenizer used by `_encode_prompt` (numpy tensors are requested from it).
-    tokenizer: CLIPTokenizer
-    # ONNX denoising model.
-    unet: OnnxRuntimeModel
-    # Scheduler providing `timesteps`, `order`, `add_noise` and `step`.
-    scheduler: KarrasDiffusionSchedulers
-
-    def __init__(
-        self,
-        vae_encoder: OnnxRuntimeModel,
-        vae_decoder: OnnxRuntimeModel,
-        text_encoder: OnnxRuntimeModel,
-        tokenizer: CLIPTokenizer,
-        unet: OnnxRuntimeModel,
-        scheduler: KarrasDiffusionSchedulers,
-    ):
-        super().__init__()
-
-        self.register_modules(
-            vae_encoder=vae_encoder,
-            vae_decoder=vae_decoder,
-            text_encoder=text_encoder,
-            tokenizer=tokenizer,
-            unet=unet,
-            scheduler=scheduler,
-        )
-        self.vae_scale_factor = 2 ** (4 - 1)
-        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True)
-        self.control_image_processor = VaeImageProcessor(
-            vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True, do_normalize=False
-        )
-
-    def _encode_prompt(
-        self,
-        prompt: Union[str, List[str]],
-        num_images_per_prompt: Optional[int],
-        do_classifier_free_guidance: bool,
-        negative_prompt: Optional[str],
-        prompt_embeds: Optional[np.ndarray] = None,
-        negative_prompt_embeds: Optional[np.ndarray] = None,
-    ):
-        r"""
-        Encodes the prompt into text encoder hidden states.
-
-        Args:
-            prompt (`str` or `List[str]`):
-                prompt to be encoded
-            num_images_per_prompt (`int`):
-                number of images that should be generated per prompt
-            do_classifier_free_guidance (`bool`):
-                whether to use classifier free guidance or not
-            negative_prompt (`str` or `List[str]`):
-                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
-                if `guidance_scale` is less than `1`).
-            prompt_embeds (`np.ndarray`, *optional*):
-                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
-                provided, text embeddings will be generated from `prompt` input argument.
-            negative_prompt_embeds (`np.ndarray`, *optional*):
-                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
-                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
-                argument.
-        """
-        if prompt is not None and isinstance(prompt, str):
-            batch_size = 1
-        elif prompt is not None and isinstance(prompt, list):
-            batch_size = len(prompt)
-        else:
-            batch_size = prompt_embeds.shape[0]
-
-        if prompt_embeds is None:
-            # get prompt text embeddings
-            text_inputs = self.tokenizer(
-                prompt,
-                padding="max_length",
-                max_length=self.tokenizer.model_max_length,
-                truncation=True,
-                return_tensors="np",
-            )
-            text_input_ids = text_inputs.input_ids
-            untruncated_ids = self.tokenizer(prompt, padding="max_length", return_tensors="np").input_ids
-
-            if not np.array_equal(text_input_ids, untruncated_ids):
-                removed_text = self.tokenizer.batch_decode(
-                    untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
-                )
-                logger.warning(
-                    "The following part of your input was truncated because CLIP can only handle sequences up to"
-                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
-                )
-
-            prompt_embeds = self.text_encoder(input_ids=text_input_ids.astype(np.int32))[0]
-
-        prompt_embeds = np.repeat(prompt_embeds, num_images_per_prompt, axis=0)
-
-        # get unconditional embeddings for classifier free guidance
-        if do_classifier_free_guidance and negative_prompt_embeds is None:
-            uncond_tokens: List[str]
-            if negative_prompt is None:
-                uncond_tokens = [""] * batch_size
-            elif type(prompt) is not type(negative_prompt):
-                raise TypeError(
-                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
-                    f" {type(prompt)}."
-                )
-            elif isinstance(negative_prompt, str):
-                uncond_tokens = [negative_prompt] * batch_size
-            elif batch_size != len(negative_prompt):
-                raise ValueError(
-                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
-                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
-                    " the batch size of `prompt`."
-                )
-            else:
-                uncond_tokens = negative_prompt
-
-            max_length = prompt_embeds.shape[1]
-            uncond_input = self.tokenizer(
-                uncond_tokens,
-                padding="max_length",
-                max_length=max_length,
-                truncation=True,
-                return_tensors="np",
-            )
-            negative_prompt_embeds = self.text_encoder(input_ids=uncond_input.input_ids.astype(np.int32))[0]
-
-        if do_classifier_free_guidance:
-            negative_prompt_embeds = np.repeat(negative_prompt_embeds, num_images_per_prompt, axis=0)
-
-            # For classifier free guidance, we need to do two forward passes.
-            # Here we concatenate the unconditional and text embeddings into a single batch
-            # to avoid doing two forward passes
-            prompt_embeds = np.concatenate([negative_prompt_embeds, prompt_embeds])
-
-        return prompt_embeds
-
-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
-    def decode_latents(self, latents):
-        warnings.warn(
-            "The decode_latents method is deprecated and will be removed in a future version. Please"
-            " use VaeImageProcessor instead",
-            FutureWarning,
-        )
-        latents = 1 / self.vae.config.scaling_factor * latents
-        image = self.vae.decode(latents, return_dict=False)[0]
-        image = (image / 2 + 0.5).clamp(0, 1)
-        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
-        image = image.cpu().permute(0, 2, 3, 1).float().numpy()
-        return image
-
-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
-    def prepare_extra_step_kwargs(self, generator, eta):
-        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
-        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
-        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
-        # and should be between [0, 1]
-
-        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
-        extra_step_kwargs = {}
-        if accepts_eta:
-            extra_step_kwargs["eta"] = eta
-
-        # check if the scheduler accepts generator
-        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
-        if accepts_generator:
-            extra_step_kwargs["generator"] = generator
-        return extra_step_kwargs
-
-    def check_inputs(
-        self,
-        num_controlnet,
-        prompt,
-        image,
-        callback_steps,
-        negative_prompt=None,
-        prompt_embeds=None,
-        negative_prompt_embeds=None,
-        controlnet_conditioning_scale=1.0,
-        control_guidance_start=0.0,
-        control_guidance_end=1.0,
-    ):
-        if (callback_steps is None) or (
-            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
-        ):
-            raise ValueError(
-                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
-                f" {type(callback_steps)}."
-            )
-
-        if prompt is not None and prompt_embeds is not None:
-            raise ValueError(
-                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
-                " only forward one of the two."
-            )
-        elif prompt is None and prompt_embeds is None:
-            raise ValueError(
-                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
-            )
-        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
-            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
-
-        if negative_prompt is not None and negative_prompt_embeds is not None:
-            raise ValueError(
-                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
-                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
-            )
-
-        if prompt_embeds is not None and negative_prompt_embeds is not None:
-            if prompt_embeds.shape != negative_prompt_embeds.shape:
-                raise ValueError(
-                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
-                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
-                    f" {negative_prompt_embeds.shape}."
-                )
-
-        # Check `image`
-        if num_controlnet == 1:
-            self.check_image(image, prompt, prompt_embeds)
-        elif num_controlnet > 1:
-            if not isinstance(image, list):
-                raise TypeError("For multiple controlnets: `image` must be type `list`")
-
-            # When `image` is a nested list:
-            # (e.g. [[canny_image_1, pose_image_1], [canny_image_2, pose_image_2]])
-            elif any(isinstance(i, list) for i in image):
-                raise ValueError("A single batch of multiple conditionings are supported at the moment.")
-            elif len(image) != num_controlnet:
-                raise ValueError(
-                    f"For multiple controlnets: `image` must have the same length as the number of controlnets, but got {len(image)} images and {num_controlnet} ControlNets."
-                )
-
-            for image_ in image:
-                self.check_image(image_, prompt, prompt_embeds)
-        else:
-            assert False
-
-        # Check `controlnet_conditioning_scale`
-        if num_controlnet == 1:
-            if not isinstance(controlnet_conditioning_scale, float):
-                raise TypeError("For single controlnet: `controlnet_conditioning_scale` must be type `float`.")
-        elif num_controlnet > 1:
-            if isinstance(controlnet_conditioning_scale, list):
-                if any(isinstance(i, list) for i in controlnet_conditioning_scale):
-                    raise ValueError("A single batch of multiple conditionings are supported at the moment.")
-            elif (
-                isinstance(controlnet_conditioning_scale, list)
-                and len(controlnet_conditioning_scale) != num_controlnet
-            ):
-                raise ValueError(
-                    "For multiple controlnets: When `controlnet_conditioning_scale` is specified as `list`, it must have"
-                    " the same length as the number of controlnets"
-                )
-        else:
-            assert False
-
-        if len(control_guidance_start) != len(control_guidance_end):
-            raise ValueError(
-                f"`control_guidance_start` has {len(control_guidance_start)} elements, but `control_guidance_end` has {len(control_guidance_end)} elements. Make sure to provide the same number of elements to each list."
-            )
-
-        if num_controlnet > 1:
-            if len(control_guidance_start) != num_controlnet:
-                raise ValueError(
-                    f"`control_guidance_start`: {control_guidance_start} has {len(control_guidance_start)} elements but there are {num_controlnet} controlnets available. Make sure to provide {num_controlnet}."
-                )
-
-        for start, end in zip(control_guidance_start, control_guidance_end):
-            if start >= end:
-                raise ValueError(
-                    f"control guidance start: {start} cannot be larger or equal to control guidance end: {end}."
-                )
-            if start < 0.0:
-                raise ValueError(f"control guidance start: {start} can't be smaller than 0.")
-            if end > 1.0:
-                raise ValueError(f"control guidance end: {end} can't be larger than 1.0.")
-
-    # Copied from diffusers.pipelines.controlnet.pipeline_controlnet.StableDiffusionControlNetPipeline.check_image
-    def check_image(self, image, prompt, prompt_embeds):
-        image_is_pil = isinstance(image, PIL.Image.Image)
-        image_is_tensor = isinstance(image, torch.Tensor)
-        image_is_np = isinstance(image, np.ndarray)
-        image_is_pil_list = isinstance(image, list) and isinstance(image[0], PIL.Image.Image)
-        image_is_tensor_list = isinstance(image, list) and isinstance(image[0], torch.Tensor)
-        image_is_np_list = isinstance(image, list) and isinstance(image[0], np.ndarray)
-
-        if (
-            not image_is_pil
-            and not image_is_tensor
-            and not image_is_np
-            and not image_is_pil_list
-            and not image_is_tensor_list
-            and not image_is_np_list
-        ):
-            raise TypeError(
-                f"image must be passed and be one of PIL image, numpy array, torch tensor, list of PIL images, list of numpy arrays or list of torch tensors, but is {type(image)}"
-            )
-
-        if image_is_pil:
-            image_batch_size = 1
-        else:
-            image_batch_size = len(image)
-
-        if prompt is not None and isinstance(prompt, str):
-            prompt_batch_size = 1
-        elif prompt is not None and isinstance(prompt, list):
-            prompt_batch_size = len(prompt)
-        elif prompt_embeds is not None:
-            prompt_batch_size = prompt_embeds.shape[0]
-
-        if image_batch_size != 1 and image_batch_size != prompt_batch_size:
-            raise ValueError(
-                f"If image batch size is not 1, image batch size must be same as prompt batch size. image batch size: {image_batch_size}, prompt batch size: {prompt_batch_size}"
-            )
-
-    # Copied from diffusers.pipelines.controlnet.pipeline_controlnet.StableDiffusionControlNetPipeline.prepare_image
-    def prepare_control_image(
-        self,
-        image,
-        width,
-        height,
-        batch_size,
-        num_images_per_prompt,
-        device,
-        dtype,
-        do_classifier_free_guidance=False,
-        guess_mode=False,
-    ):
-        image = self.control_image_processor.preprocess(image, height=height, width=width).to(dtype=torch.float32)
-        image_batch_size = image.shape[0]
-
-        if image_batch_size == 1:
-            repeat_by = batch_size
-        else:
-            # image batch size is the same as prompt batch size
-            repeat_by = num_images_per_prompt
-
-        image = image.repeat_interleave(repeat_by, dim=0)
-
-        image = image.to(device=device, dtype=dtype)
-
-        if do_classifier_free_guidance and not guess_mode:
-            image = torch.cat([image] * 2)
-
-        return image
-
-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.get_timesteps
-    def get_timesteps(self, num_inference_steps, strength, device):
-        # get the original timestep using init_timestep
-        init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
-
-        t_start = max(num_inference_steps - init_timestep, 0)
-        timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :]
-
-        return timesteps, num_inference_steps - t_start
-
-    def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dtype, device, generator=None):
-        if not isinstance(image, (torch.Tensor, PIL.Image.Image, list)):
-            raise ValueError(
-                f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is {type(image)}"
-            )
-
-        image = image.to(device=device, dtype=dtype)
-
-        batch_size = batch_size * num_images_per_prompt
-
-        if image.shape[1] == 4:
-            init_latents = image
-
-        else:
-            _image = image.cpu().detach().numpy()
-            init_latents = self.vae_encoder(sample=_image)[0]
-            init_latents = torch.from_numpy(init_latents).to(device=device, dtype=dtype)
-            init_latents = 0.18215 * init_latents
-
-        if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0:
-            # expand init_latents for batch_size
-            deprecation_message = (
-                f"You have passed {batch_size} text prompts (`prompt`), but only {init_latents.shape[0]} initial"
-                " images (`image`). Initial images are now duplicating to match the number of text prompts. Note"
-                " that this behavior is deprecated and will be removed in a version 1.0.0. Please make sure to update"
-                " your script to pass as many initial images as text prompts to suppress this warning."
-            )
-            deprecate("len(prompt) != len(image)", "1.0.0", deprecation_message, standard_warn=False)
-            additional_image_per_prompt = batch_size // init_latents.shape[0]
-            init_latents = torch.cat([init_latents] * additional_image_per_prompt, dim=0)
-        elif batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0:
-            raise ValueError(
-                f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts."
-            )
-        else:
-            init_latents = torch.cat([init_latents], dim=0)
-
-        shape = init_latents.shape
-        noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
-
-        # get latents
-        init_latents = self.scheduler.add_noise(init_latents, noise, timestep)
-        latents = init_latents
-
-        return latents
-
-    @torch.no_grad()
-    @replace_example_docstring(EXAMPLE_DOC_STRING)
-    def __call__(
-        self,
-        num_controlnet: int,
-        fp16: bool = True,
-        prompt: Union[str, List[str]] = None,
-        image: Union[
-            torch.FloatTensor,
-            PIL.Image.Image,
-            np.ndarray,
-            List[torch.FloatTensor],
-            List[PIL.Image.Image],
-            List[np.ndarray],
-        ] = None,
-        control_image: Union[
-            torch.FloatTensor,
-            PIL.Image.Image,
-            np.ndarray,
-            List[torch.FloatTensor],
-            List[PIL.Image.Image],
-            List[np.ndarray],
-        ] = None,
-        height: Optional[int] = None,
-        width: Optional[int] = None,
-        strength: float = 0.8,
-        num_inference_steps: int = 50,
-        guidance_scale: float = 7.5,
-        negative_prompt: Optional[Union[str, List[str]]] = None,
-        num_images_per_prompt: Optional[int] = 1,
-        eta: float = 0.0,
-        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
-        latents: Optional[torch.FloatTensor] = None,
-        prompt_embeds: Optional[torch.FloatTensor] = None,
-        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
-        output_type: Optional[str] = "pil",
-        return_dict: bool = True,
-        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
-        callback_steps: int = 1,
-        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
-        controlnet_conditioning_scale: Union[float, List[float]] = 0.8,
-        guess_mode: bool = False,
-        control_guidance_start: Union[float, List[float]] = 0.0,
-        control_guidance_end: Union[float, List[float]] = 1.0,
-    ):
-        r"""
-        Function invoked when calling the pipeline for generation.
-
-        Args:
-            prompt (`str` or `List[str]`, *optional*):
-                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
-                instead.
-            image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,:
-                    `List[List[torch.FloatTensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`):
-                The initial image will be used as the starting point for the image generation process. Can also accpet
-                image latents as `image`, if passing latents directly, it will not be encoded again.
-            control_image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,:
-                    `List[List[torch.FloatTensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`):
-                The ControlNet input condition. ControlNet uses this input condition to generate guidance to Unet. If
-                the type is specified as `Torch.FloatTensor`, it is passed to ControlNet as is. `PIL.Image.Image` can
-                also be accepted as an image. The dimensions of the output image defaults to `image`'s dimensions. If
-                height and/or width are passed, `image` is resized according to them. If multiple ControlNets are
-                specified in init, images must be passed as a list such that each element of the list can be correctly
-                batched for input to a single controlnet.
-            height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
-                The height in pixels of the generated image.
-            width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
-                The width in pixels of the generated image.
-            num_inference_steps (`int`, *optional*, defaults to 50):
-                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
-                expense of slower inference.
-            guidance_scale (`float`, *optional*, defaults to 7.5):
-                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
-                `guidance_scale` is defined as `w` of equation 2. of [Imagen
-                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
-                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
-                usually at the expense of lower image quality.
-            negative_prompt (`str` or `List[str]`, *optional*):
-                The prompt or prompts not to guide the image generation. If not defined, one has to pass
-                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
-                less than `1`).
-            num_images_per_prompt (`int`, *optional*, defaults to 1):
-                The number of images to generate per prompt.
-            eta (`float`, *optional*, defaults to 0.0):
-                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
-                [`schedulers.DDIMScheduler`], will be ignored for others.
-            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
-                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
-                to make generation deterministic.
-            latents (`torch.FloatTensor`, *optional*):
-                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
-                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
-                tensor will ge generated by sampling using the supplied random `generator`.
-            prompt_embeds (`torch.FloatTensor`, *optional*):
-                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
-                provided, text embeddings will be generated from `prompt` input argument.
-            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
-                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
-                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
-                argument.
-            output_type (`str`, *optional*, defaults to `"pil"`):
-                The output format of the generate image. Choose between
-                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
-            return_dict (`bool`, *optional*, defaults to `True`):
-                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
-                plain tuple.
-            callback (`Callable`, *optional*):
-                A function that will be called every `callback_steps` steps during inference. The function will be
-                called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
-            callback_steps (`int`, *optional*, defaults to 1):
-                The frequency at which the `callback` function will be called. If not specified, the callback will be
-                called at every step.
-            cross_attention_kwargs (`dict`, *optional*):
-                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
-                `self.processor` in
-                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
-            controlnet_conditioning_scale (`float` or `List[float]`, *optional*, defaults to 1.0):
-                The outputs of the controlnet are multiplied by `controlnet_conditioning_scale` before they are added
-                to the residual in the original unet. If multiple ControlNets are specified in init, you can set the
-                corresponding scale as a list. Note that by default, we use a smaller conditioning scale for inpainting
-                than for [`~StableDiffusionControlNetPipeline.__call__`].
-            guess_mode (`bool`, *optional*, defaults to `False`):
-                In this mode, the ControlNet encoder will try best to recognize the content of the input image even if
-                you remove all prompts. The `guidance_scale` between 3.0 and 5.0 is recommended.
-            control_guidance_start (`float` or `List[float]`, *optional*, defaults to 0.0):
-                The percentage of total steps at which the controlnet starts applying.
-            control_guidance_end (`float` or `List[float]`, *optional*, defaults to 1.0):
-                The percentage of total steps at which the controlnet stops applying.
-
-        Examples:
-
-        Returns:
-            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
-            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
-            When returning a tuple, the first element is a list with the generated images, and the second element is a
-            list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
-            (nsfw) content, according to the `safety_checker`.
-        """
-        if fp16:
-            torch_dtype = torch.float16
-            np_dtype = np.float16
-        else:
-            torch_dtype = torch.float32
-            np_dtype = np.float32
-
-        # align format for control guidance
-        if not isinstance(control_guidance_start, list) and isinstance(control_guidance_end, list):
-            control_guidance_start = len(control_guidance_end) * [control_guidance_start]
-        elif not isinstance(control_guidance_end, list) and isinstance(control_guidance_start, list):
-            control_guidance_end = len(control_guidance_start) * [control_guidance_end]
-        elif not isinstance(control_guidance_start, list) and not isinstance(control_guidance_end, list):
-            mult = num_controlnet
-            control_guidance_start, control_guidance_end = mult * [control_guidance_start], mult * [
-                control_guidance_end
-            ]
-
-        # 1. Check inputs. Raise error if not correct
-        self.check_inputs(
-            num_controlnet,
-            prompt,
-            control_image,
-            callback_steps,
-            negative_prompt,
-            prompt_embeds,
-            negative_prompt_embeds,
-            controlnet_conditioning_scale,
-            control_guidance_start,
-            control_guidance_end,
-        )
-
-        # 2. Define call parameters
-        if prompt is not None and isinstance(prompt, str):
-            batch_size = 1
-        elif prompt is not None and isinstance(prompt, list):
-            batch_size = len(prompt)
-        else:
-            batch_size = prompt_embeds.shape[0]
-
-        device = self._execution_device
-        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
-        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
-        # corresponds to doing no classifier free guidance.
-        do_classifier_free_guidance = guidance_scale > 1.0
-
-        if num_controlnet > 1 and isinstance(controlnet_conditioning_scale, float):
-            controlnet_conditioning_scale = [controlnet_conditioning_scale] * num_controlnet
-
-        # 3. Encode input prompt
-        prompt_embeds = self._encode_prompt(
-            prompt,
-            num_images_per_prompt,
-            do_classifier_free_guidance,
-            negative_prompt,
-            prompt_embeds=prompt_embeds,
-            negative_prompt_embeds=negative_prompt_embeds,
-        )
-        # 4. Prepare image
-        image = self.image_processor.preprocess(image).to(dtype=torch.float32)
-
-        # 5. Prepare controlnet_conditioning_image
-        if num_controlnet == 1:
-            control_image = self.prepare_control_image(
-                image=control_image,
-                width=width,
-                height=height,
-                batch_size=batch_size * num_images_per_prompt,
-                num_images_per_prompt=num_images_per_prompt,
-                device=device,
-                dtype=torch_dtype,
-                do_classifier_free_guidance=do_classifier_free_guidance,
-                guess_mode=guess_mode,
-            )
-        elif num_controlnet > 1:
-            control_images = []
-
-            for control_image_ in control_image:
-                control_image_ = self.prepare_control_image(
-                    image=control_image_,
-                    width=width,
-                    height=height,
-                    batch_size=batch_size * num_images_per_prompt,
-                    num_images_per_prompt=num_images_per_prompt,
-                    device=device,
-                    dtype=torch_dtype,
-                    do_classifier_free_guidance=do_classifier_free_guidance,
-                    guess_mode=guess_mode,
-                )
-
-                control_images.append(control_image_)
-
-            control_image = control_images
-        else:
-            assert False
-
-        # 5. Prepare timesteps
-        self.scheduler.set_timesteps(num_inference_steps, device=device)
-        timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device)
-        latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
-
-        # 6. Prepare latent variables
-        latents = self.prepare_latents(
-            image,
-            latent_timestep,
-            batch_size,
-            num_images_per_prompt,
-            torch_dtype,
-            device,
-            generator,
-        )
-
-        # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
-        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
-
-        # 7.1 Create tensor stating which controlnets to keep
-        controlnet_keep = []
-        for i in range(len(timesteps)):
-            keeps = [
-                1.0 - float(i / len(timesteps) < s or (i + 1) / len(timesteps) > e)
-                for s, e in zip(control_guidance_start, control_guidance_end)
-            ]
-            controlnet_keep.append(keeps[0] if num_controlnet == 1 else keeps)
-
-        # 8. Denoising loop
-        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
-        with self.progress_bar(total=num_inference_steps) as progress_bar:
-            for i, t in enumerate(timesteps):
-                # expand the latents if we are doing classifier free guidance
-                latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
-                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
-
-                if isinstance(controlnet_keep[i], list):
-                    cond_scale = [c * s for c, s in zip(controlnet_conditioning_scale, controlnet_keep[i])]
-                else:
-                    controlnet_cond_scale = controlnet_conditioning_scale
-                    if isinstance(controlnet_cond_scale, list):
-                        controlnet_cond_scale = controlnet_cond_scale[0]
-                    cond_scale = controlnet_cond_scale * controlnet_keep[i]
-
-                # predict the noise residual
-                _latent_model_input = latent_model_input.cpu().detach().numpy()
-                _prompt_embeds = np.array(prompt_embeds, dtype=np_dtype)
-                _t = np.array([t.cpu().detach().numpy()], dtype=np_dtype)
-
-                if num_controlnet == 1:
-                    control_images = np.array([control_image], dtype=np_dtype)
-                else:
-                    control_images = []
-                    for _control_img in control_image:
-                        _control_img = _control_img.cpu().detach().numpy()
-                        control_images.append(_control_img)
-                    control_images = np.array(control_images, dtype=np_dtype)
-
-                control_scales = np.array(cond_scale, dtype=np_dtype)
-                control_scales = np.resize(control_scales, (num_controlnet, 1))
-
-                noise_pred = self.unet(
-                    sample=_latent_model_input,
-                    timestep=_t,
-                    encoder_hidden_states=_prompt_embeds,
-                    controlnet_conds=control_images,
-                    conditioning_scales=control_scales,
-                )[0]
-                noise_pred = torch.from_numpy(noise_pred).to(device)
-
-                # perform guidance
-                if do_classifier_free_guidance:
-                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
-                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
-
-                # compute the previous noisy sample x_t -> x_t-1
-                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
-
-                # call the callback, if provided
-                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
-                    progress_bar.update()
-                    if callback is not None and i % callback_steps == 0:
-                        callback(i, t, latents)
-
-        if not output_type == "latent":
-            _latents = latents.cpu().detach().numpy() / 0.18215
-            _latents = np.array(_latents, dtype=np_dtype)
-            image = self.vae_decoder(latent_sample=_latents)[0]
-            image = torch.from_numpy(image).to(device, dtype=torch.float32)
-            has_nsfw_concept = None
-        else:
-            image = latents
-            has_nsfw_concept = None
-
-        if has_nsfw_concept is None:
-            do_denormalize = [True] * image.shape[0]
-        else:
-            do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
-
-        image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
-
-        if not return_dict:
-            return (image, has_nsfw_concept)
-
-        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-
-    parser.add_argument(
-        "--sd_model",
-        type=str,
-        required=True,
-        help="Path to the `diffusers` checkpoint to convert (either a local directory or on the Hub).",
-    )
-
-    parser.add_argument(
-        "--onnx_model_dir",
-        type=str,
-        required=True,
-        help="Path to the ONNX directory",
-    )
-
-    parser.add_argument("--qr_img_path", type=str, required=True, help="Path to the qr code image")
-
-    args = parser.parse_args()
-
-    qr_image = Image.open(args.qr_img_path)
-    qr_image = qr_image.resize((512, 512))
-
-    # init stable diffusion pipeline
-    pipeline = StableDiffusionImg2ImgPipeline.from_pretrained(args.sd_model)
-    pipeline.scheduler = UniPCMultistepScheduler.from_config(pipeline.scheduler.config)
-
-    provider = ["CUDAExecutionProvider", "CPUExecutionProvider"]
-    onnx_pipeline = OnnxStableDiffusionControlNetImg2ImgPipeline(
-        vae_encoder=OnnxRuntimeModel.from_pretrained(
-            os.path.join(args.onnx_model_dir, "vae_encoder"), provider=provider
-        ),
-        vae_decoder=OnnxRuntimeModel.from_pretrained(
-            os.path.join(args.onnx_model_dir, "vae_decoder"), provider=provider
-        ),
-        text_encoder=OnnxRuntimeModel.from_pretrained(
-            os.path.join(args.onnx_model_dir, "text_encoder"), provider=provider
-        ),
-        tokenizer=pipeline.tokenizer,
-        unet=OnnxRuntimeModel.from_pretrained(os.path.join(args.onnx_model_dir, "unet"), provider=provider),
-        scheduler=pipeline.scheduler,
-    )
-    onnx_pipeline = onnx_pipeline.to("cuda")
-
-    prompt = "a cute cat fly to the moon"
-    negative_prompt = "paintings, sketches, worst quality, low quality, normal quality, lowres, normal quality, monochrome, grayscale, skin spots, acnes, skin blemishes, age spot, glans, nsfw, nipples, necklace, worst quality, low quality, watermark, username, signature, multiple breasts, lowres, bad anatomy, bad hands, error, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality, normal quality, jpeg artifacts, signature, watermark, username, blurry, bad feet, single color, ugly, duplicate, morbid, mutilated, tranny, trans, trannsexual, hermaphrodite, extra fingers, mutated hands, poorly drawn hands, poorly drawn face, mutation, deformed, ugly, blurry, bad anatomy, bad proportions, extra limbs, disfigured, bad anatomy, gross proportions, malformed limbs, missing arms, missing legs, extra arms, extra legs, mutated hands, fused fingers, too many fingers, long neck, bad body perspect"
-
-    for i in range(10):
-        start_time = time.time()
-        image = onnx_pipeline(
-            num_controlnet=2,
-            prompt=prompt,
-            negative_prompt=negative_prompt,
-            image=qr_image,
-            control_image=[qr_image, qr_image],
-            width=512,
-            height=512,
-            strength=0.75,
-            num_inference_steps=20,
-            num_images_per_prompt=1,
-            controlnet_conditioning_scale=[0.8, 0.8],
-            control_guidance_start=[0.3, 0.3],
-            control_guidance_end=[0.9, 0.9],
-        ).images[0]
-        print(time.time() - start_time)
-        image.save("output_qr_code.png")
--- a/examples/community/run_tensorrt_controlnet.py
+++ b/examples/community/run_tensorrt_controlnet.py
--- a/Show More
+++ b/Show More