[WIP] Don't merge / CPU offload example

2026-02-24 03:40:37 +08:00 · 2023-01-10 16:58:22 +00:00
198 changed files with 3895 additions and 13643 deletions
--- a/README.md
+++ b/README.md
@@ -284,53 +284,6 @@ output = pipeline(
 output_images = pipeline.numpy_to_pil(np.asarray(output.reshape((num_samples,) + output.shape[-3:])))
 ```

-Diffusers also has a Text-guided inpainting pipeline with Flax/Jax
-
-```python
-import jax
-import numpy as np
-from flax.jax_utils import replicate
-from flax.training.common_utils import shard
-import PIL
-import requests
-from io import BytesIO
-
-
-from diffusers import FlaxStableDiffusionInpaintPipeline
-
-def download_image(url):
-    response = requests.get(url)
-    return PIL.Image.open(BytesIO(response.content)).convert("RGB")
-img_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo.png"
-mask_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo_mask.png"
-
-init_image = download_image(img_url).resize((512, 512))
-mask_image = download_image(mask_url).resize((512, 512))
-
-pipeline, params = FlaxStableDiffusionInpaintPipeline.from_pretrained("xvjiarui/stable-diffusion-2-inpainting")
-
-prompt = "Face of a yellow cat, high resolution, sitting on a park bench"
-prng_seed = jax.random.PRNGKey(0)
-num_inference_steps = 50
-
-num_samples = jax.device_count()
-prompt = num_samples * [prompt]
-init_image = num_samples * [init_image]
-mask_image = num_samples * [mask_image]
-prompt_ids, processed_masked_images, processed_masks = pipeline.prepare_inputs(prompt, init_image, mask_image)
-
-
-# shard inputs and rng
-params = replicate(params)
-prng_seed = jax.random.split(prng_seed, jax.device_count())
-prompt_ids = shard(prompt_ids)
-processed_masked_images = shard(processed_masked_images)
-processed_masks = shard(processed_masks)
-
-images = pipeline(prompt_ids, processed_masks, processed_masked_images, params, prng_seed, num_inference_steps, jit=True).images
-images = pipeline.numpy_to_pil(np.asarray(images.reshape((num_samples,) + images.shape[-3:])))
-```
-
 ### Image-to-Image text-guided generation with Stable Diffusion

 The `StableDiffusionImg2ImgPipeline` lets you pass a text prompt and an initial image to condition the generation of new images.
--- a/docker/diffusers-flax-cpu/Dockerfile
+++ b/docker/diffusers-flax-cpu/Dockerfile
@@ -34,8 +34,8 @@ RUN python3 -m pip install --no-cache-dir --upgrade pip && \
        datasets \
        hf-doc-builder \
        huggingface-hub \
-        Jinja2 \
        librosa \
+        modelcards \
        numpy \
        scipy \
        tensorboard \
--- a/docker/diffusers-flax-tpu/Dockerfile
+++ b/docker/diffusers-flax-tpu/Dockerfile
@@ -36,8 +36,8 @@ RUN python3 -m pip install --no-cache-dir --upgrade pip && \
        datasets \
        hf-doc-builder \
        huggingface-hub \
-        Jinja2 \
        librosa \        
+        modelcards \
        numpy \
        scipy \
        tensorboard \
--- a/docker/diffusers-onnxruntime-cpu/Dockerfile
+++ b/docker/diffusers-onnxruntime-cpu/Dockerfile
@@ -34,8 +34,8 @@ RUN python3 -m pip install --no-cache-dir --upgrade pip && \
        datasets \
        hf-doc-builder \
        huggingface-hub \
-        Jinja2 \
        librosa \
+        modelcards \
        numpy \
        scipy \
        tensorboard \
--- a/docker/diffusers-onnxruntime-cuda/Dockerfile
+++ b/docker/diffusers-onnxruntime-cuda/Dockerfile
@@ -34,8 +34,8 @@ RUN python3 -m pip install --no-cache-dir --upgrade pip && \
        datasets \
        hf-doc-builder \
        huggingface-hub \
-        Jinja2 \
        librosa \
+        modelcards \
        numpy \
        scipy \
        tensorboard \
--- a/docker/diffusers-pytorch-cpu/Dockerfile
+++ b/docker/diffusers-pytorch-cpu/Dockerfile
@@ -33,8 +33,8 @@ RUN python3 -m pip install --no-cache-dir --upgrade pip && \
        datasets \
        hf-doc-builder \
        huggingface-hub \
-        Jinja2 \
        librosa \
+        modelcards \
        numpy \
        scipy \
        tensorboard \
--- a/docker/diffusers-pytorch-cuda/Dockerfile
+++ b/docker/diffusers-pytorch-cuda/Dockerfile
@@ -33,8 +33,8 @@ RUN python3 -m pip install --no-cache-dir --upgrade pip && \
        datasets \
        hf-doc-builder \
        huggingface-hub \
-        Jinja2 \
        librosa \
+        modelcards \
        numpy \
        scipy \
        tensorboard \
--- a/docs/source/en/_toctree.yml
+++ b/docs/source/en/_toctree.yml
@@ -32,8 +32,6 @@
      title: Text-Guided Depth-to-Image
    - local: using-diffusers/reusing_seeds
      title: Reusing seeds for deterministic generation
-    - local: using-diffusers/reproducibility
-      title: Reproducibility
    - local: using-diffusers/custom_pipeline_examples
      title: Community Pipelines
    - local: using-diffusers/contribute_pipeline
@@ -73,8 +71,6 @@
    title: Dreambooth
  - local: training/text2image
    title: Text-to-image fine-tuning
-  - local: training/lora
-    title: LoRA Support in Diffusers
  title: Training
 - sections:
  - local: conceptual/philosophy
@@ -94,8 +90,6 @@
      title: Configuration
    - local: api/outputs
      title: Outputs
-    - local: api/loaders
-      title: Loaders
    title: Main Classes
  - sections:
    - local: api/pipelines/overview
@@ -112,8 +106,6 @@
      title: DDIM
    - local: api/pipelines/ddpm
      title: DDPM
-    - local: api/pipelines/dit
-      title: DiT
    - local: api/pipelines/latent_diffusion
      title: Latent Diffusion
    - local: api/pipelines/paint_by_example
@@ -141,8 +133,6 @@
        title: Image-Variation
      - local: api/pipelines/stable_diffusion/upscale
        title: Super-Resolution
-      - local: api/pipelines/stable_diffusion/pix2pix
-        title: InstructPix2Pix
      title: Stable Diffusion
    - local: api/pipelines/stable_diffusion_2
      title: Stable Diffusion 2
--- a/docs/source/en/api/loaders.mdx
+++ b/docs/source/en/api/loaders.mdx
@@ -1,30 +0,0 @@
-<!--Copyright 2022 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-->
-
-# Loaders
-
-There are many weights to train adapter neural networks for diffusion models, such as 
- [Textual Inversion](./training/text_inversion.mdx)
- [LoRA](https://github.com/cloneofsimo/lora)
- [Hypernetworks](https://arxiv.org/abs/1609.09106)
-
-Such adapter neural networks often only consist of a fraction of the number of weights compared 
-to the pretrained model and as such are very portable. The Diffusers library offers an easy-to-use
-API to load such adapter neural networks via the [`loaders.py` module](https://github.com/huggingface/diffusers/blob/main/src/diffusers/loaders.py). 
-
-**Note**: This module is still highly experimental and prone to future changes.
-
-## LoaderMixins
-
-### UNet2DConditionLoadersMixin
-
-[[autodoc]] loaders.UNet2DConditionLoadersMixin
--- a/docs/source/en/api/logging.mdx
+++ b/docs/source/en/api/logging.mdx
@@ -1,4 +1,4 @@
-<!--Copyright 2022 The HuggingFace Team. All rights reserved.
+<!--Copyright 2020 The HuggingFace Team. All rights reserved.

 Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 the License. You may obtain a copy of the License at
--- a/docs/source/en/api/pipelines/dit.mdx
+++ b/docs/source/en/api/pipelines/dit.mdx
@@ -1,59 +0,0 @@
-<!--Copyright 2022 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-->
-
-# Scalable Diffusion Models with Transformers (DiT)
-
-## Overview
-
-[Scalable Diffusion Models with Transformers](https://arxiv.org/abs/2212.09748) (DiT) by William Peebles and Saining Xie.
-
-The abstract of the paper is the following:
-
-*We explore a new class of diffusion models based on the transformer architecture. We train latent diffusion models of images, replacing the commonly-used U-Net backbone with a transformer that operates on latent patches. We analyze the scalability of our Diffusion Transformers (DiTs) through the lens of forward pass complexity as measured by Gflops. We find that DiTs with higher Gflops -- through increased transformer depth/width or increased number of input tokens -- consistently have lower FID. In addition to possessing good scalability properties, our largest DiT-XL/2 models outperform all prior diffusion models on the class-conditional ImageNet 512x512 and 256x256 benchmarks, achieving a state-of-the-art FID of 2.27 on the latter.*
-
-The original codebase of this paper can be found here: [facebookresearch/dit](https://github.com/facebookresearch/dit).
-
-## Available Pipelines:
-
-| Pipeline | Tasks | Colab
-|---|---|:---:|
-| [pipeline_dit.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/dit/pipeline_dit.py) | *Conditional Image Generation* | - |
-
-
-## Usage example
-
-```python
-from diffusers import DiTPipeline, DPMSolverMultistepScheduler
-import torch
-
-pipe = DiTPipeline.from_pretrained("facebook/DiT-XL-2-256", torch_dtype=torch.float16)
-pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
-pipe = pipe.to("cuda")
-
-# pick words from Imagenet class labels
-pipe.labels  # to print all available words
-
-# pick words that exist in ImageNet
-words = ["white shark", "umbrella"]
-
-class_ids = pipe.get_label_ids(words)
-
-generator = torch.manual_seed(33)
-output = pipe(class_labels=class_ids, num_inference_steps=25, generator=generator)
-
-image = output.images[0]  # label 'white shark'
-```
-
-## DiTPipeline
-[[autodoc]] DiTPipeline
-	- all
-	- __call__
--- a/docs/source/en/api/pipelines/stable_diffusion/overview.mdx
+++ b/docs/source/en/api/pipelines/stable_diffusion/overview.mdx
@@ -31,7 +31,6 @@ For more details about how Stable Diffusion works and how it differs from the ba
 | [StableDiffusionDepth2ImgPipeline](./depth2img) | **Experimental** – *Depth-to-Image Text-Guided Generation * | | Coming soon
 | [StableDiffusionImageVariationPipeline](./image_variation) | **Experimental** – *Image Variation Generation * | | [🤗 Stable Diffusion Image Variations](https://huggingface.co/spaces/lambdalabs/stable-diffusion-image-variations)
 | [StableDiffusionUpscalePipeline](./upscale) | **Experimental** – *Text-Guided Image Super-Resolution * | | Coming soon
-| [StableDiffusionInstructPix2PixPipeline](./pix2pix) | **Experimental** – *Text-Based Image Editing * | | [InstructPix2Pix: Learning to Follow Image Editing Instructions](https://huggingface.co/spaces/timbrooks/instruct-pix2pix)



--- a/docs/source/en/api/pipelines/stable_diffusion/pix2pix.mdx
+++ b/docs/source/en/api/pipelines/stable_diffusion/pix2pix.mdx
@@ -1,70 +0,0 @@
-<!--Copyright 2023 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-->
-
-# InstructPix2Pix: Learning to Follow Image Editing Instructions
-
-## Overview
-
-[InstructPix2Pix: Learning to Follow Image Editing Instructions](https://arxiv.org/abs/2211.09800) by Tim Brooks, Aleksander Holynski and Alexei A. Efros.
-
-The abstract of the paper is the following:
-
-*We propose a method for editing images from human instructions: given an input image and a written instruction that tells the model what to do, our model follows these instructions to edit the image. To obtain training data for this problem, we combine the knowledge of two large pretrained models -- a language model (GPT-3) and a text-to-image model (Stable Diffusion) -- to generate a large dataset of image editing examples. Our conditional diffusion model, InstructPix2Pix, is trained on our generated data, and generalizes to real images and user-written instructions at inference time. Since it performs edits in the forward pass and does not require per example fine-tuning or inversion, our model edits images quickly, in a matter of seconds. We show compelling editing results for a diverse collection of input images and written instructions.*
-
-Resources:
-
-* [Project Page](https://www.timothybrooks.com/instruct-pix2pix).
-* [Paper](https://arxiv.org/abs/2211.09800).
-* [Original Code](https://github.com/timothybrooks/instruct-pix2pix).
-* [Demo](https://huggingface.co/spaces/timbrooks/instruct-pix2pix).
-
-
-## Available Pipelines:
-
-| Pipeline | Tasks | Demo
-|---|---|:---:|
-| [StableDiffusionInstructPix2PixPipeline](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py) | *Text-Based Image Editing* | [🤗 Space](https://huggingface.co/spaces/timbrooks/instruct-pix2pix) |
-
-<!-- TODO: add Colab -->
-
-## Usage example
-
-```python
-import PIL
-import requests
-import torch
-from diffusers import StableDiffusionInstructPix2PixPipeline
-
-model_id = "timbrooks/instruct-pix2pix"
-pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained(model_id, torch_dtype=torch.float16).to("cuda")
-
-url = "https://huggingface.co/datasets/diffusers/diffusers-images-docs/resolve/main/mountain.png"
-
-
-def download_image(url):
-    image = PIL.Image.open(requests.get(url, stream=True).raw)
-    image = PIL.ImageOps.exif_transpose(image)
-    image = image.convert("RGB")
-    return image
-
-
-image = download_image(url)
-
-prompt = "make the mountains snowy"
-edit = pipe(prompt, image=image, num_inference_steps=20, image_guidance_scale=1.5, guidance_scale=7).images[0]
-images[0].save("snowy_mountains.png")
-```
-
-## StableDiffusionInstructPix2PixPipeline
-[[autodoc]] StableDiffusionInstructPix2PixPipeline
-	- __call__
-	- all
--- a/docs/source/en/api/schedulers/overview.mdx
+++ b/docs/source/en/api/schedulers/overview.mdx
@@ -37,7 +37,6 @@ To this end, the design of schedulers is such that:

 - Schedulers can be used interchangeably between diffusion models in inference to find the preferred trade-off between speed and generation quality.
 - Schedulers are currently by default in PyTorch, but are designed to be framework independent (partial Jax support currently exists).
- Many diffusion pipelines, such as [`StableDiffusionPipeline`] and [`DiTPipeline`] can use any of [`KarrasDiffusionSchedulers`]

 ## Schedulers Summary

@@ -81,6 +80,4 @@ The class [`SchedulerOutput`] contains the outputs from any schedulers `step(...

 [[autodoc]] schedulers.scheduling_utils.SchedulerOutput

-### KarrasDiffusionSchedulers

-[[autodoc]] schedulers.scheduling_utils.KarrasDiffusionSchedulers
--- a/docs/source/en/optimization/fp16.mdx
+++ b/docs/source/en/optimization/fp16.mdx
@@ -20,6 +20,7 @@ We'll discuss how the following settings impact performance and memory.
 | ---------------- | ------- | ------- |
 | original         | 9.50s   | x1      |
 | cuDNN auto-tuner | 9.37s   | x1.01   |
+| autocast (fp16)  | 5.47s   | x1.74   |
 | fp16             | 3.61s   | x2.63   |
 | channels last    | 3.30s   | x2.88   |
 | traced UNet      | 3.21s   | x2.96   |
@@ -53,9 +54,27 @@ import torch
 torch.backends.cuda.matmul.allow_tf32 = True
 ```

+## Automatic mixed precision (AMP)
+
+If you use a CUDA GPU, you can take advantage of `torch.autocast` to perform inference roughly twice as fast at the cost of slightly lower precision. All you need to do is put your inference call inside an `autocast` context manager. The following example shows how to do it using Stable Diffusion text-to-image generation as an example:
+
+```Python
+from torch import autocast
+from diffusers import StableDiffusionPipeline
+
+pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
+pipe = pipe.to("cuda")
+
+prompt = "a photo of an astronaut riding a horse on mars"
+with autocast("cuda"):
+    image = pipe(prompt).images[0]
+```
+
+Despite the precision loss, in our experience the final image results look the same as the `float32` versions. Feel free to experiment and report back!
+
 ## Half precision weights

-To save more GPU memory and get more speed, you can load and run the model weights directly in half precision. This involves loading the float16 version of the weights, which was saved to a branch named `fp16`, and telling PyTorch to use the `float16` type when loading them:
+To save more GPU memory and get even more speed, you can load and run the model weights directly in half precision. This involves loading the float16 version of the weights, which was saved to a branch named `fp16`, and telling PyTorch to use the `float16` type when loading them:

 ```Python
 pipe = StableDiffusionPipeline.from_pretrained(
@@ -69,11 +88,6 @@ prompt = "a photo of an astronaut riding a horse on mars"
 image = pipe(prompt).images[0]
 ```

-<Tip warning={true}>
-  It is strongly discouraged to make use of [`torch.autocast`](https://pytorch.org/docs/stable/amp.html#torch.autocast) in any of the pipelines as it can lead to black images and is always slower than using pure 
-  float16 precision.
-</Tip>
-
 ## Sliced attention for additional memory savings

 For even additional memory savings, you can use a sliced version of attention that performs the computation in steps instead of all at once.
@@ -135,7 +149,7 @@ You may see a small performance boost in VAE decode on multi-image batches. Ther

 ## Offloading to CPU with accelerate for memory savings

-For additional memory savings, you can offload the weights to CPU and only load them to GPU when performing the forward pass.
+For additional memory savings, you can offload the weights to CPU and load them to GPU when performing the forward pass.

 To perform CPU offloading, all you have to do is invoke [`~StableDiffusionPipeline.enable_sequential_cpu_offload`]:

@@ -148,15 +162,16 @@ pipe = StableDiffusionPipeline.from_pretrained(
    
    torch_dtype=torch.float16,
 )
+pipe = pipe.to("cuda")

 prompt = "a photo of an astronaut riding a horse on mars"
 pipe.enable_sequential_cpu_offload()
 image = pipe(prompt).images[0]
 ```

-And you can get the memory consumption to < 3GB.
+And you can get the memory consumption to < 2GB.

-If is also possible to chain it with attention slicing for minimal memory consumption (< 2GB).
+If is also possible to chain it with attention slicing for minimal memory consumption, running it in as little as < 800mb of GPU vRAM:

 ```Python
 import torch
@@ -167,6 +182,7 @@ pipe = StableDiffusionPipeline.from_pretrained(
    
    torch_dtype=torch.float16,
 )
+pipe = pipe.to("cuda")

 prompt = "a photo of an astronaut riding a horse on mars"
 pipe.enable_sequential_cpu_offload()
@@ -175,8 +191,6 @@ pipe.enable_attention_slicing(1)
 image = pipe(prompt).images[0]
 ```

-**Note**: When using `enable_sequential_cpu_offload()`, it is important to **not** move the pipeline to CUDA beforehand or else the gain in memory consumption will only be minimal. See [this issue](https://github.com/huggingface/diffusers/issues/1934) for more information.
-
 ## Using Channels Last memory format

 Channels last memory format is an alternative way of ordering NCHW tensors in memory preserving dimensions ordering. Channels last tensors ordered in such a way that channels become the densest dimension (aka storing images pixel-per-pixel). Since not all operators currently support channels last format it may result in a worst performance, so it's better to try it and see if it works for your model.
@@ -343,4 +357,4 @@ with torch.inference_mode():

 # optional: You can disable it via
 # pipe.disable_xformers_memory_efficient_attention()
-```
+```
--- a/docs/source/en/training/lora.mdx
+++ b/docs/source/en/training/lora.mdx
@@ -1,155 +0,0 @@
-<!--Copyright 2023 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-->
-
-# LoRA Support in Diffusers 
-
-Diffusers supports LoRA for faster fine-tuning of Stable Diffusion, allowing greater memory efficiency and easier portability. 
-
-Low-Rank Adaption of Large Language Models was first introduced by Microsoft in
-[LoRA: Low-Rank Adaptation of Large Language Models](https://arxiv.org/abs/2106.09685) by *Edward J. Hu, Yelong Shen, Phillip Wallis, Zeyuan Allen-Zhu, Yuanzhi Li, Shean Wang, Lu Wang, Weizhu Chen*.
-
-In a nutshell, LoRA allows adapting pretrained models by adding pairs of rank-decomposition weight matrices (called **update matrices**) 
-to existing weights and **only** training those newly added weights. This has a couple of advantages:
-
- Previous pretrained weights are kept frozen so that the model is not so prone to [catastrophic forgetting](https://www.pnas.org/doi/10.1073/pnas.1611835114). 
- Rank-decomposition matrices have significantly fewer parameters than the original model, which means that trained LoRA weights are easily portable.
- LoRA matrices are generally added to the attention layers of the original model and they control to which extent the model is adapted toward new training images via a `scale` parameter.
-
-**__Note that the usage of LoRA is not just limited to attention layers. In the original LoRA work, the authors found out that just amending
-the attention layers of a language model is sufficient to obtain good downstream performance with great efficiency. This is why, it's common
-to just add the LoRA weights to the attention layers of a model.__**
-
-[cloneofsimo](https://github.com/cloneofsimo) was the first to try out LoRA training for Stable Diffusion in the popular [lora](https://github.com/cloneofsimo/lora) GitHub repository.
-
-<Tip>
-
-LoRA allows us to achieve greater memory efficiency since the pretrained weights are kept frozen and only the LoRA weights are trained, thereby
-allowing us to run fine-tuning on consumer GPUs like Tesla T4, RTX 3080 or even RTX 2080 Ti! One can get access to GPUs like T4 in the free 
-tiers of Kaggle Kernels and Google Colab Notebooks.
-
-</Tip>
-
-## Getting started with LoRA for fine-tuning
-
-Stable Diffusion can be fine-tuned in different ways:
-
-* [Textual inversion](https://huggingface.co/docs/diffusers/main/en/training/text_inversion)
-* [DreamBooth](https://huggingface.co/docs/diffusers/main/en/training/dreambooth) 
-* [Text2Image fine-tuning](https://huggingface.co/docs/diffusers/main/en/training/text2image) 
-
-We provide two end-to-end examples that show how to run fine-tuning with LoRA:
-
-* [DreamBooth](https://github.com/huggingface/diffusers/tree/main/examples/dreambooth#training-with-low-rank-adaptation-of-large-language-models-lora) 
-* [Text2Image](https://github.com/huggingface/diffusers/tree/main/examples/text_to_image#training-with-lora)
-
-If you want to perform DreamBooth training with LoRA, for instance, you would run:
-
-```bash
-export MODEL_NAME="runwayml/stable-diffusion-v1-5"
-export INSTANCE_DIR="path-to-instance-images"
-export OUTPUT_DIR="path-to-save-model"
-
-accelerate launch train_dreambooth_lora.py \
-  --pretrained_model_name_or_path=$MODEL_NAME  \
-  --instance_data_dir=$INSTANCE_DIR \
-  --output_dir=$OUTPUT_DIR \
-  --instance_prompt="a photo of sks dog" \
-  --resolution=512 \
-  --train_batch_size=1 \
-  --gradient_accumulation_steps=1 \
-  --checkpointing_steps=100 \
-  --learning_rate=1e-4 \
-  --report_to="wandb" \
-  --lr_scheduler="constant" \
-  --lr_warmup_steps=0 \
-  --max_train_steps=500 \
-  --validation_prompt="A photo of sks dog in a bucket" \
-  --validation_epochs=50 \
-  --seed="0" \
-  --push_to_hub
-```
-
-A similar process can be followed to fully fine-tune Stable Diffusion on a custom dataset using the
-`examples/text_to_image/train_text_to_image_lora.py` script.
-
-Refer to the respective examples linked above to learn more. 
-
-<Tip>
-
-When using LoRA we can use a much higher learning rate (typically 1e-4 as opposed to ~1e-6) compared to non-LoRA Dreambooth fine-tuning.
-
-</Tip>
-
-But there is no free lunch. For the given dataset and expected generation quality, you'd still need to experiment with
-different hyperparameters. Here are some important ones:
-
-* Training time
-    * Learning rate 
-    * Number of training steps
-* Inference time 
-    * Number of steps 
-    * Scheduler type
-
-Additionally, you can follow [this blog](https://huggingface.co/blog/dreambooth) that documents some of our experimental
-findings for performing DreamBooth training of Stable Diffusion.
-
-When fine-tuning, the LoRA update matrices are only added to the attention layers. To enable this, we added new weight
-loading functionalities. Their details are available [here](https://huggingface.co/docs/diffusers/main/en/api/loaders).
-
-## Inference 
-
-Assuming you used the `examples/text_to_image/train_text_to_image_lora.py` to fine-tune Stable Diffusion on the [Pokemon
-dataset](https://huggingface.co/datasets/lambdalabs/pokemon-blip-captions), you can perform inference like so: 
-
-```py 
-from diffusers import StableDiffusionPipeline
-import torch
-
-model_path = "sayakpaul/sd-model-finetuned-lora-t4"
-pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16)
-pipe.unet.load_attn_procs(model_path)
-pipe.to("cuda")
-
-prompt = "A pokemon with blue eyes."
-image = pipe(prompt, num_inference_steps=30, guidance_scale=7.5).images[0]
-image.save("pokemon.png")
-```
-
-Here are some example images you can expect:
-
-<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pokemon-collage.png"/>
-
-[`sayakpaul/sd-model-finetuned-lora-t4`](https://huggingface.co/sayakpaul/sd-model-finetuned-lora-t4) contains [LoRA fine-tuned update matrices](https://huggingface.co/sayakpaul/sd-model-finetuned-lora-t4/blob/main/pytorch_lora_weights.bin)
-which is only 3 MBs in size. During inference, the pre-trained Stable Diffusion checkpoints are loaded alongside these update
-matrices and then they are combined to run inference.
-
-You can use the [`huggingface_hub`](https://github.com/huggingface/huggingface_hub) library to retrieve the base model
-from [`sayakpaul/sd-model-finetuned-lora-t4`](https://huggingface.co/sayakpaul/sd-model-finetuned-lora-t4) like so:
-
-```py
-from huggingface_hub.repocard import RepoCard
-
-card = RepoCard.load("sayakpaul/sd-model-finetuned-lora-t4")
-base_model = card.data.to_dict()["base_model"]
-# 'CompVis/stable-diffusion-v1-4'
-```
-
-And then you can use `pipe = StableDiffusionPipeline.from_pretrained(base_model, torch_dtype=torch.float16)`.
-
-This is especially useful when you don't want to hardcode the base model identifier during initializing the `StableDiffusionPipeline`.
-
-Inference for DreamBooth training remains the same. Check
-[this section](https://github.com/huggingface/diffusers/tree/main/examples/dreambooth#inference-1) for more details. 
-
-## Known limitations 
-
-* Currently, we only support LoRA for the attention layers of [`UNet2DConditionModel`](https://huggingface.co/docs/diffusers/main/en/api/models#diffusers.UNet2DConditionModel).
--- a/docs/source/en/training/overview.mdx
+++ b/docs/source/en/training/overview.mdx
@@ -37,7 +37,6 @@ Training examples show how to pretrain or fine-tune diffusion models for a varie
 - [Text-to-Image Training](./text2image)
 - [Text Inversion](./text_inversion)
 - [Dreambooth](./dreambooth)
- [LoRA Support](./lora)

 If possible, please [install xFormers](../optimization/xformers) for memory efficient attention. This could help make your training faster and less memory intensive.

--- a/docs/source/en/using-diffusers/reproducibility.mdx
+++ b/docs/source/en/using-diffusers/reproducibility.mdx
@@ -1,159 +0,0 @@
-<!--Copyright 2022 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-->
-
-# Reproducibility
-
-Before reading about reproducibility for Diffusers, it is strongly recommended to take a look at 
-[PyTorch's statement about reproducibility](https://pytorch.org/docs/stable/notes/randomness.html).
-
-PyTorch states that 
-> *completely reproducible results are not guaranteed across PyTorch releases, individual commits, or different platforms.*
-While one can never expect the same results across platforms, one can expect results to be reproducible 
-across releases, platforms, etc... within a certain tolerance. However, this tolerance strongly varies 
-depending on the diffusion pipeline and checkpoint.
-
-In the following, we show how to best control sources of randomness for diffusion models.
-
-## Inference
-
-During inference, diffusion pipelines heavily rely on random sampling operations, such as the creating the 
-gaussian noise tensors to be denoised and adding noise to the scheduling step.
-
-Let's have a look at an example. We run the [DDIM pipeline](./api/pipelines/ddim.mdx) 
-for just two inference steps and return a numpy tensor to look into the numerical values of the output.
-
-```python
-from diffusers import DDIMPipeline
-import numpy as np
-
-model_id = "google/ddpm-cifar10-32"
-
-# load model and scheduler
-ddim = DDIMPipeline.from_pretrained(model_id)
-
-# run pipeline for just two steps and return numpy tensor
-image = ddim(num_inference_steps=2, output_type="np").images
-print(np.abs(image).sum())
-```
-
-Running the above prints a value of 1464.2076, but running it again prints a different 
-value of 1495.1768. What is going on here? Every time the pipeline is run, gaussian noise 
-is created and step-wise denoised. To create the gaussian noise with [`torch.randn`](https://pytorch.org/docs/stable/generated/torch.randn.html), a different random seed is taken every time, thus leading to a different result.
-This is a desired property of diffusion pipelines, as it means that the pipeline can create a different random image every time it is run. In many cases, one would like to generate the exact same image of a certain 
-run, for which case an instance of a [PyTorch generator](https://pytorch.org/docs/stable/generated/torch.randn.html) has to be passed:
-
-```python
-import torch
-from diffusers import DDIMPipeline
-import numpy as np
-
-model_id = "google/ddpm-cifar10-32"
-
-# load model and scheduler
-ddim = DDIMPipeline.from_pretrained(model_id)
-
-# create a generator for reproducibility
-generator = torch.Generator(device="cpu").manual_seed(0)
-
-# run pipeline for just two steps and return numpy tensor
-image = ddim(num_inference_steps=2, output_type="np", generator=generator).images
-print(np.abs(image).sum())
-```
-
-Running the above always prints a value of 1491.1711 - also upon running it again because we 
-define the generator object to be passed to all random functions of the pipeline.
-
-If you run this code snippet on your specific hardware and version, you should get a similar, if not the same, result.
-
-<Tip>
-
-It might be a bit unintuitive at first to pass `generator` objects to the pipelines instead of 
-just integer values representing the seed, but this is the recommended design when dealing with 
-probabilistic models in PyTorch as generators are *random states* that are advanced and can thus be 
-passed to multiple pipelines in a sequence.
-
-</Tip>
-
-Great! Now, we know how to write reproducible pipelines, but it gets a bit trickier since the above example only runs on the CPU. How do we also achieve reproducibility on GPU? 
-In short, one should not expect full reproducibility across different hardware when running pipelines on GPU 
-as matrix multiplications are less deterministic on GPU than on CPU and diffusion pipelines tend to require
-a lot of matrix multiplications. Let's see what we can do to keep the randomness within limits across 
-different GPU hardware.
-
-To achieve maximum speed performance, it is recommended to create the generator directly on GPU when running 
-the pipeline on GPU:
-
-```python
-import torch
-from diffusers import DDIMPipeline
-import numpy as np
-
-model_id = "google/ddpm-cifar10-32"
-
-# load model and scheduler
-ddim = DDIMPipeline.from_pretrained(model_id)
-ddim.to("cuda")
-
-# create a generator for reproducibility
-generator = torch.Generator(device="cuda").manual_seed(0)
-
-# run pipeline for just two steps and return numpy tensor
-image = ddim(num_inference_steps=2, output_type="np", generator=generator).images
-print(np.abs(image).sum())
-```
-
-Running the above now prints a value of 1389.8634 - even though we're using the exact same seed! 
-This is unfortunate as it means we cannot reproduce the results we achieved on GPU, also on CPU.
-Nevertheless, it should be expected since the GPU uses a different random number generator than the CPU.
-
-To circumvent this problem, we created a [`randn_tensor`](#diffusers.utils.randn_tensor) function, which can create random noise 
-on the CPU and then move the tensor to GPU if necessary. The function is used everywhere inside the pipelines allowing the user to **always** pass a CPU generator even if the pipeline is run on GPU:
-
-```python
-import torch
-from diffusers import DDIMPipeline
-import numpy as np
-
-model_id = "google/ddpm-cifar10-32"
-
-# load model and scheduler
-ddim = DDIMPipeline.from_pretrained(model_id)
-ddim.to("cuda")
-
-# create a generator for reproducibility
-generator = torch.manual_seed(0)
-
-# run pipeline for just two steps and return numpy tensor
-image = ddim(num_inference_steps=2, output_type="np", generator=generator).images
-print(np.abs(image).sum())
-```
-
-Running the above now prints a value of 1491.1713, much closer to the value of 1491.1711 when 
-the pipeline is fully run on the CPU.
-
-<Tip>
-
-As a consequence, we recommend always passing a CPU generator if Reproducibility is important.
-The loss of performance is often neglectable, but one can be sure to generate much more similar 
-values than if the pipeline would have been run on CPU.
-
-</Tip>
-
-Finally, we noticed that more complex pipelines, such as [`UnCLIPPipeline`] are often extremely 
-susceptible to precision error propagation and thus one cannot expect even similar results across 
-different GPU hardware or PyTorch versions. In such cases, one has to make sure to run 
-exactly the same hardware and PyTorch version for full Reproducibility.
-
-## Randomness utilities
-
-### randn_tensor
-[[autodoc]] diffusers.utils.randn_tensor
--- a/examples/community/README.md
+++ b/examples/community/README.md
@@ -26,7 +26,6 @@ If a community doesn't work as expected, please open an issue and ping the autho
 | Checkpoint Merger Pipeline | Diffusion Pipeline that enables merging of saved model checkpoints | [Checkpoint Merger Pipeline](#checkpoint-merger-pipeline)                   | -                                                                                                                                                                                                                  | [Naga Sai Abhinay Devarinti](https://github.com/Abhinay1997/) | 
 Stable Diffusion v1.1-1.4 Comparison | Run all 4 model checkpoints for Stable Diffusion and compare their results together | [Stable Diffusion Comparison](#stable-diffusion-comparisons) | - | [Suvaditya Mukherjee](https://github.com/suvadityamuk) |
 MagicMix | Diffusion Pipeline for semantic mixing of an image and a text prompt | [MagicMix](#magic-mix) | - | [Partho Das](https://github.com/daspartho) |
-| Stable UnCLIP | Diffusion Pipeline for combining prior model (generate clip image embedding from text, UnCLIPPipeline `"kakaobrain/karlo-v1-alpha"`) and decoder pipeline (decode clip image embedding to image, StableDiffusionImageVariationPipeline `"lambdalabs/sd-image-variations-diffusers"` ). | [Stable UnCLIP](#stable-unclip) | -  |[Ray Wang](https://wrong.wang) |



@@ -641,6 +640,7 @@ from diffusers import DiffusionPipeline

 from PIL import Image
 import requests
+from torch import autocast

 processor = CLIPSegProcessor.from_pretrained("CIDAS/clipseg-rd64-refined")
 model = CLIPSegForImageSegmentation.from_pretrained("CIDAS/clipseg-rd64-refined")
@@ -659,7 +659,8 @@ image = Image.open(requests.get(url, stream=True).raw).resize((512, 512))
 text = "a glass"  # will mask out this text
 prompt = "a cup"  # the masked out region will be replaced with this

-image = pipe(image=image, text=text, prompt=prompt).images[0]
+with autocast("cuda"):
+    image = pipe(image=image, text=text, prompt=prompt).images[0]
 ```

 ### Bit Diffusion 
@@ -862,92 +863,3 @@ E.g. the above script generates the following image:
 ![206903104-913a671d-ef53-4ae4-919d-64c3059c8f67](https://user-images.githubusercontent.com/59410571/209578602-70f323fa-05b7-4dd6-b055-e40683e37914.jpg)

 For more example generations check out this [demo notebook](https://github.com/daspartho/MagicMix/blob/main/demo.ipynb).
-
-
-### Stable UnCLIP
-
-UnCLIPPipeline("kakaobrain/karlo-v1-alpha") provide a prior model that can generate clip image embedding from text.
-StableDiffusionImageVariationPipeline("lambdalabs/sd-image-variations-diffusers") provide a decoder model than can generate images from clip image embedding.
-
-```python
-import torch
-from diffusers import DiffusionPipeline
-
-device = torch.device("cpu" if not torch.cuda.is_available() else "cuda")
-
-pipeline = DiffusionPipeline.from_pretrained(
-    "kakaobrain/karlo-v1-alpha",
-    torch_dtype=torch.float16,
-    custom_pipeline="stable_unclip",
-    decoder_pipe_kwargs=dict(
-        image_encoder=None,
-    ),
-)
-pipeline.to(device)
-
-prompt = "a shiba inu wearing a beret and black turtleneck"
-random_generator = torch.Generator(device=device).manual_seed(1000)
-output = pipeline(
-    prompt=prompt,
-    width=512,
-    height=512,
-    generator=random_generator,
-    prior_guidance_scale=4,
-    prior_num_inference_steps=25,
-    decoder_guidance_scale=8,
-    decoder_num_inference_steps=50,
-)
-
-image = output.images[0]
-image.save("./shiba-inu.jpg")
-
-# debug
-
-# `pipeline.decoder_pipe` is a regular StableDiffusionImageVariationPipeline instance.
-# It is used to convert clip image embedding to latents, then fed into VAE decoder.
-print(pipeline.decoder_pipe.__class__)
-# <class 'diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_image_variation.StableDiffusionImageVariationPipeline'>
-
-# this pipeline only use prior module in "kakaobrain/karlo-v1-alpha"
-# It is used to convert clip text embedding to clip image embedding.
-print(pipeline)
-# StableUnCLIPPipeline {
-#   "_class_name": "StableUnCLIPPipeline",
-#   "_diffusers_version": "0.12.0.dev0",
-#   "prior": [
-#     "diffusers",
-#     "PriorTransformer"
-#   ],
-#   "prior_scheduler": [
-#     "diffusers",
-#     "UnCLIPScheduler"
-#   ],
-#   "text_encoder": [
-#     "transformers",
-#     "CLIPTextModelWithProjection"
-#   ],
-#   "tokenizer": [
-#     "transformers",
-#     "CLIPTokenizer"
-#   ]
-# }
-
-# pipeline.prior_scheduler is the scheduler used for prior in UnCLIP.
-print(pipeline.prior_scheduler)
-# UnCLIPScheduler {
-#   "_class_name": "UnCLIPScheduler",
-#   "_diffusers_version": "0.12.0.dev0",
-#   "clip_sample": true,
-#   "clip_sample_range": 5.0,
-#   "num_train_timesteps": 1000,
-#   "prediction_type": "sample",
-#   "variance_type": "fixed_small_log"
-# }
-```
-
-
-`shiba-inu.jpg`
-
-
-![shiba-inu](https://user-images.githubusercontent.com/16448529/209185639-6e5ec794-ce9d-4883-aa29-bd6852a2abad.jpg)
-
--- a/examples/community/checkpoint_merger.py
+++ b/examples/community/checkpoint_merger.py
@@ -4,12 +4,6 @@ from typing import Dict, List, Union

 import torch

-from diffusers.utils import is_safetensors_available
-
-
-if is_safetensors_available():
-    import safetensors.torch
-
 from diffusers import DiffusionPipeline, __version__
 from diffusers.schedulers.scheduling_utils import SCHEDULER_CONFIG_NAME
 from diffusers.utils import CONFIG_NAME, DIFFUSERS_CACHE, ONNX_WEIGHTS_NAME, WEIGHTS_NAME
@@ -38,7 +32,6 @@ class CheckpointMergerPipeline(DiffusionPipeline):
    """

    def __init__(self):
-        self.register_to_config()
        super().__init__()

    def _compare_model_configs(self, dict0, dict1):
@@ -99,8 +92,7 @@ class CheckpointMergerPipeline(DiffusionPipeline):
        alpha = kwargs.pop("alpha", 0.5)
        interp = kwargs.pop("interp", None)

-        print("Received list", pretrained_model_name_or_path_list)
-        print(f"Combining with alpha={alpha}, interpolation mode={interp}")
+        print("Recieved list", pretrained_model_name_or_path_list)

        checkpoint_count = len(pretrained_model_name_or_path_list)
        # Ignore result from model_index_json comparision of the two checkpoints
@@ -121,16 +113,17 @@ class CheckpointMergerPipeline(DiffusionPipeline):
        # Step 1: Load the model config and compare the checkpoints. We'll compare the model_index.json first while ignoring the keys starting with '_'
        config_dicts = []
        for pretrained_model_name_or_path in pretrained_model_name_or_path_list:
-            config_dict = DiffusionPipeline.load_config(
-                pretrained_model_name_or_path,
-                cache_dir=cache_dir,
-                resume_download=resume_download,
-                force_download=force_download,
-                proxies=proxies,
-                local_files_only=local_files_only,
-                use_auth_token=use_auth_token,
-                revision=revision,
-            )
+            if not os.path.isdir(pretrained_model_name_or_path):
+                config_dict = DiffusionPipeline.get_config_dict(
+                    pretrained_model_name_or_path,
+                    cache_dir=cache_dir,
+                    resume_download=resume_download,
+                    force_download=force_download,
+                    proxies=proxies,
+                    local_files_only=local_files_only,
+                    use_auth_token=use_auth_token,
+                    revision=revision,
+                )
            config_dicts.append(config_dict)

        comparison_result = True
@@ -155,30 +148,25 @@ class CheckpointMergerPipeline(DiffusionPipeline):
            requested_pipeline_class = config_dict.get("_class_name")
            user_agent = {"diffusers": __version__, "pipeline_class": requested_pipeline_class}

-            cached_folder = (
-                pretrained_model_name_or_path
-                if os.path.isdir(pretrained_model_name_or_path)
-                else snapshot_download(
-                    pretrained_model_name_or_path,
-                    cache_dir=cache_dir,
-                    resume_download=resume_download,
-                    proxies=proxies,
-                    local_files_only=local_files_only,
-                    use_auth_token=use_auth_token,
-                    revision=revision,
-                    allow_patterns=allow_patterns,
-                    user_agent=user_agent,
-                )
+            cached_folder = snapshot_download(
+                pretrained_model_name_or_path,
+                cache_dir=cache_dir,
+                resume_download=resume_download,
+                proxies=proxies,
+                local_files_only=local_files_only,
+                use_auth_token=use_auth_token,
+                revision=revision,
+                allow_patterns=allow_patterns,
+                user_agent=user_agent,
            )
            print("Cached Folder", cached_folder)
            cached_folders.append(cached_folder)

        # Step 3:-
-        # Load the first checkpoint as a diffusion pipeline and modify its module state_dict in place
+        # Load the first checkpoint as a diffusion pipeline and modify it's module state_dict in place
        final_pipe = DiffusionPipeline.from_pretrained(
            cached_folders[0], torch_dtype=torch_dtype, device_map=device_map
        )
-        final_pipe.to(self.device)

        checkpoint_path_2 = None
        if len(cached_folders) > 2:
@@ -198,56 +186,38 @@ class CheckpointMergerPipeline(DiffusionPipeline):
            if not attr.startswith("_"):
                checkpoint_path_1 = os.path.join(cached_folders[1], attr)
                if os.path.exists(checkpoint_path_1):
-                    files = list(
-                        (
-                            *glob.glob(os.path.join(checkpoint_path_1, "*.safetensors")),
-                            *glob.glob(os.path.join(checkpoint_path_1, "*.bin")),
-                        )
-                    )
+                    files = glob.glob(os.path.join(checkpoint_path_1, "*.bin"))
                    checkpoint_path_1 = files[0] if len(files) > 0 else None
                if checkpoint_path_2 is not None and os.path.exists(checkpoint_path_2):
-                    files = list(
-                        (
-                            *glob.glob(os.path.join(checkpoint_path_2, "*.safetensors")),
-                            *glob.glob(os.path.join(checkpoint_path_2, "*.bin")),
-                        )
-                    )
+                    files = glob.glob(os.path.join(checkpoint_path_2, "*.bin"))
                    checkpoint_path_2 = files[0] if len(files) > 0 else None
                # For an attr if both checkpoint_path_1 and 2 are None, ignore.
                # If atleast one is present, deal with it according to interp method, of course only if the state_dict keys match.
                if checkpoint_path_1 is None and checkpoint_path_2 is None:
-                    print(f"Skipping {attr}: not present in 2nd or 3d model")
+                    print("SKIPPING ATTR ", attr)
                    continue
                try:
                    module = getattr(final_pipe, attr)
-                    if isinstance(module, bool):  # ignore requires_safety_checker boolean
-                        continue
                    theta_0 = getattr(module, "state_dict")
                    theta_0 = theta_0()

                    update_theta_0 = getattr(module, "load_state_dict")
-                    theta_1 = (
-                        safetensors.torch.load_file(checkpoint_path_1)
-                        if (is_safetensors_available() and checkpoint_path_1.endswith(".safetensors"))
-                        else torch.load(checkpoint_path_1, map_location="cpu")
-                    )
-                    theta_2 = None
-                    if checkpoint_path_2:
-                        theta_2 = (
-                            safetensors.torch.load_file(checkpoint_path_2)
-                            if (is_safetensors_available() and checkpoint_path_2.endswith(".safetensors"))
-                            else torch.load(checkpoint_path_2, map_location="cpu")
-                        )
+                    theta_1 = torch.load(checkpoint_path_1)
+
+                    theta_2 = torch.load(checkpoint_path_2) if checkpoint_path_2 else None

                    if not theta_0.keys() == theta_1.keys():
-                        print(f"Skipping {attr}: key mismatch")
+                        print("SKIPPING ATTR ", attr, " DUE TO MISMATCH")
                        continue
                    if theta_2 and not theta_1.keys() == theta_2.keys():
-                        print(f"Skipping {attr}:y mismatch")
-                except Exception as e:
-                    print(f"Skipping {attr} do to an unexpected error: {str(e)}")
+                        print("SKIPPING ATTR ", attr, " DUE TO MISMATCH")
+                except:
+                    print("SKIPPING ATTR ", attr)
                    continue
-                print(f"MERGING {attr}")
+                print("Found dicts for")
+                print(attr)
+                print(checkpoint_path_1)
+                print(checkpoint_path_2)

                for key in theta_0.keys():
                    if theta_2:
@@ -260,6 +230,8 @@ class CheckpointMergerPipeline(DiffusionPipeline):
                update_theta_0(theta_0)

                del theta_0
+                print("Diffusion pipeline successfully updated with merged weights")
+
        return final_pipe

    @staticmethod
--- a/examples/community/clip_guided_stable_diffusion.py
+++ b/examples/community/clip_guided_stable_diffusion.py
@@ -78,12 +78,12 @@ class CLIPGuidedStableDiffusion(DiffusionPipeline):
        )

        self.normalize = transforms.Normalize(mean=feature_extractor.image_mean, std=feature_extractor.image_std)
-        self.cut_out_size = (
+        cut_out_size = (
            feature_extractor.size
            if isinstance(feature_extractor.size, int)
            else feature_extractor.size["shortest_edge"]
        )
-        self.make_cutouts = MakeCutouts(self.cut_out_size)
+        self.make_cutouts = MakeCutouts(cut_out_size)

        set_requires_grad(self.text_encoder, False)
        set_requires_grad(self.clip_model, False)
@@ -157,7 +157,7 @@ class CLIPGuidedStableDiffusion(DiffusionPipeline):
        if use_cutouts:
            image = self.make_cutouts(image, num_cutouts)
        else:
-            image = transforms.Resize(self.cut_out_size)(image)
+            image = transforms.Resize(self.feature_extractor.size)(image)
        image = self.normalize(image).to(latents.dtype)

        image_embeddings_clip = self.clip_model.get_image_features(image)
--- a/examples/community/stable_unclip.py
+++ b/examples/community/stable_unclip.py
@@ -1,287 +0,0 @@
-import types
-from typing import List, Optional, Tuple, Union
-
-import torch
-
-from diffusers.models import PriorTransformer
-from diffusers.pipelines import DiffusionPipeline, StableDiffusionImageVariationPipeline
-from diffusers.schedulers import UnCLIPScheduler
-from diffusers.utils import logging, randn_tensor
-from transformers import CLIPTextModelWithProjection, CLIPTokenizer
-from transformers.models.clip.modeling_clip import CLIPTextModelOutput
-
-
-logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
-
-
-def _encode_image(self, image, device, num_images_per_prompt, do_classifier_free_guidance):
-    image = image.to(device=device)
-    image_embeddings = image  # take image as image_embeddings
-    image_embeddings = image_embeddings.unsqueeze(1)
-
-    # duplicate image embeddings for each generation per prompt, using mps friendly method
-    bs_embed, seq_len, _ = image_embeddings.shape
-    image_embeddings = image_embeddings.repeat(1, num_images_per_prompt, 1)
-    image_embeddings = image_embeddings.view(bs_embed * num_images_per_prompt, seq_len, -1)
-
-    if do_classifier_free_guidance:
-        uncond_embeddings = torch.zeros_like(image_embeddings)
-
-        # For classifier free guidance, we need to do two forward passes.
-        # Here we concatenate the unconditional and text embeddings into a single batch
-        # to avoid doing two forward passes
-        image_embeddings = torch.cat([uncond_embeddings, image_embeddings])
-
-    return image_embeddings
-
-
-class StableUnCLIPPipeline(DiffusionPipeline):
-    def __init__(
-        self,
-        prior: PriorTransformer,
-        tokenizer: CLIPTokenizer,
-        text_encoder: CLIPTextModelWithProjection,
-        prior_scheduler: UnCLIPScheduler,
-        decoder_pipe_kwargs: Optional[dict] = None,
-    ):
-        super().__init__()
-
-        decoder_pipe_kwargs = dict(image_encoder=None) if decoder_pipe_kwargs is None else decoder_pipe_kwargs
-
-        decoder_pipe_kwargs["torch_dtype"] = decoder_pipe_kwargs.get("torch_dtype", None) or prior.dtype
-
-        self.decoder_pipe = StableDiffusionImageVariationPipeline.from_pretrained(
-            "lambdalabs/sd-image-variations-diffusers", **decoder_pipe_kwargs
-        )
-
-        # replace `_encode_image` method
-        self.decoder_pipe._encode_image = types.MethodType(_encode_image, self.decoder_pipe)
-
-        self.register_modules(
-            prior=prior,
-            tokenizer=tokenizer,
-            text_encoder=text_encoder,
-            prior_scheduler=prior_scheduler,
-        )
-
-    def _encode_prompt(
-        self,
-        prompt,
-        device,
-        num_images_per_prompt,
-        do_classifier_free_guidance,
-        text_model_output: Optional[Union[CLIPTextModelOutput, Tuple]] = None,
-        text_attention_mask: Optional[torch.Tensor] = None,
-    ):
-        if text_model_output is None:
-            batch_size = len(prompt) if isinstance(prompt, list) else 1
-            # get prompt text embeddings
-            text_inputs = self.tokenizer(
-                prompt,
-                padding="max_length",
-                max_length=self.tokenizer.model_max_length,
-                return_tensors="pt",
-            )
-            text_input_ids = text_inputs.input_ids
-            text_mask = text_inputs.attention_mask.bool().to(device)
-
-            if text_input_ids.shape[-1] > self.tokenizer.model_max_length:
-                removed_text = self.tokenizer.batch_decode(text_input_ids[:, self.tokenizer.model_max_length :])
-                logger.warning(
-                    "The following part of your input was truncated because CLIP can only handle sequences up to"
-                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
-                )
-                text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length]
-
-            text_encoder_output = self.text_encoder(text_input_ids.to(device))
-
-            text_embeddings = text_encoder_output.text_embeds
-            text_encoder_hidden_states = text_encoder_output.last_hidden_state
-
-        else:
-            batch_size = text_model_output[0].shape[0]
-            text_embeddings, text_encoder_hidden_states = text_model_output[0], text_model_output[1]
-            text_mask = text_attention_mask
-
-        text_embeddings = text_embeddings.repeat_interleave(num_images_per_prompt, dim=0)
-        text_encoder_hidden_states = text_encoder_hidden_states.repeat_interleave(num_images_per_prompt, dim=0)
-        text_mask = text_mask.repeat_interleave(num_images_per_prompt, dim=0)
-
-        if do_classifier_free_guidance:
-            uncond_tokens = [""] * batch_size
-
-            uncond_input = self.tokenizer(
-                uncond_tokens,
-                padding="max_length",
-                max_length=self.tokenizer.model_max_length,
-                truncation=True,
-                return_tensors="pt",
-            )
-            uncond_text_mask = uncond_input.attention_mask.bool().to(device)
-            uncond_embeddings_text_encoder_output = self.text_encoder(uncond_input.input_ids.to(device))
-
-            uncond_embeddings = uncond_embeddings_text_encoder_output.text_embeds
-            uncond_text_encoder_hidden_states = uncond_embeddings_text_encoder_output.last_hidden_state
-
-            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
-
-            seq_len = uncond_embeddings.shape[1]
-            uncond_embeddings = uncond_embeddings.repeat(1, num_images_per_prompt)
-            uncond_embeddings = uncond_embeddings.view(batch_size * num_images_per_prompt, seq_len)
-
-            seq_len = uncond_text_encoder_hidden_states.shape[1]
-            uncond_text_encoder_hidden_states = uncond_text_encoder_hidden_states.repeat(1, num_images_per_prompt, 1)
-            uncond_text_encoder_hidden_states = uncond_text_encoder_hidden_states.view(
-                batch_size * num_images_per_prompt, seq_len, -1
-            )
-            uncond_text_mask = uncond_text_mask.repeat_interleave(num_images_per_prompt, dim=0)
-
-            # done duplicates
-
-            # For classifier free guidance, we need to do two forward passes.
-            # Here we concatenate the unconditional and text embeddings into a single batch
-            # to avoid doing two forward passes
-            text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
-            text_encoder_hidden_states = torch.cat([uncond_text_encoder_hidden_states, text_encoder_hidden_states])
-
-            text_mask = torch.cat([uncond_text_mask, text_mask])
-
-        return text_embeddings, text_encoder_hidden_states, text_mask
-
-    @property
-    def _execution_device(self):
-        r"""
-        Returns the device on which the pipeline's models will be executed. After calling
-        `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module
-        hooks.
-        """
-        if self.device != torch.device("meta") or not hasattr(self.prior, "_hf_hook"):
-            return self.device
-        for module in self.prior.modules():
-            if (
-                hasattr(module, "_hf_hook")
-                and hasattr(module._hf_hook, "execution_device")
-                and module._hf_hook.execution_device is not None
-            ):
-                return torch.device(module._hf_hook.execution_device)
-        return self.device
-
-    def prepare_latents(self, shape, dtype, device, generator, latents, scheduler):
-        if latents is None:
-            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
-        else:
-            if latents.shape != shape:
-                raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}")
-            latents = latents.to(device)
-
-        latents = latents * scheduler.init_noise_sigma
-        return latents
-
-    def to(self, torch_device: Optional[Union[str, torch.device]] = None):
-        self.decoder_pipe.to(torch_device)
-        super().to(torch_device)
-
-    @torch.no_grad()
-    def __call__(
-        self,
-        prompt: Optional[Union[str, List[str]]] = None,
-        height: Optional[int] = None,
-        width: Optional[int] = None,
-        num_images_per_prompt: int = 1,
-        prior_num_inference_steps: int = 25,
-        generator: Optional[torch.Generator] = None,
-        prior_latents: Optional[torch.FloatTensor] = None,
-        text_model_output: Optional[Union[CLIPTextModelOutput, Tuple]] = None,
-        text_attention_mask: Optional[torch.Tensor] = None,
-        prior_guidance_scale: float = 4.0,
-        decoder_guidance_scale: float = 8.0,
-        decoder_num_inference_steps: int = 50,
-        decoder_num_images_per_prompt: Optional[int] = 1,
-        decoder_eta: float = 0.0,
-        output_type: Optional[str] = "pil",
-        return_dict: bool = True,
-    ):
-        if prompt is not None:
-            if isinstance(prompt, str):
-                batch_size = 1
-            elif isinstance(prompt, list):
-                batch_size = len(prompt)
-            else:
-                raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
-        else:
-            batch_size = text_model_output[0].shape[0]
-
-        device = self._execution_device
-
-        batch_size = batch_size * num_images_per_prompt
-
-        do_classifier_free_guidance = prior_guidance_scale > 1.0 or decoder_guidance_scale > 1.0
-
-        text_embeddings, text_encoder_hidden_states, text_mask = self._encode_prompt(
-            prompt, device, num_images_per_prompt, do_classifier_free_guidance, text_model_output, text_attention_mask
-        )
-
-        # prior
-
-        self.prior_scheduler.set_timesteps(prior_num_inference_steps, device=device)
-        prior_timesteps_tensor = self.prior_scheduler.timesteps
-
-        embedding_dim = self.prior.config.embedding_dim
-
-        prior_latents = self.prepare_latents(
-            (batch_size, embedding_dim),
-            text_embeddings.dtype,
-            device,
-            generator,
-            prior_latents,
-            self.prior_scheduler,
-        )
-
-        for i, t in enumerate(self.progress_bar(prior_timesteps_tensor)):
-            # expand the latents if we are doing classifier free guidance
-            latent_model_input = torch.cat([prior_latents] * 2) if do_classifier_free_guidance else prior_latents
-
-            predicted_image_embedding = self.prior(
-                latent_model_input,
-                timestep=t,
-                proj_embedding=text_embeddings,
-                encoder_hidden_states=text_encoder_hidden_states,
-                attention_mask=text_mask,
-            ).predicted_image_embedding
-
-            if do_classifier_free_guidance:
-                predicted_image_embedding_uncond, predicted_image_embedding_text = predicted_image_embedding.chunk(2)
-                predicted_image_embedding = predicted_image_embedding_uncond + prior_guidance_scale * (
-                    predicted_image_embedding_text - predicted_image_embedding_uncond
-                )
-
-            if i + 1 == prior_timesteps_tensor.shape[0]:
-                prev_timestep = None
-            else:
-                prev_timestep = prior_timesteps_tensor[i + 1]
-
-            prior_latents = self.prior_scheduler.step(
-                predicted_image_embedding,
-                timestep=t,
-                sample=prior_latents,
-                generator=generator,
-                prev_timestep=prev_timestep,
-            ).prev_sample
-
-        prior_latents = self.prior.post_process_latents(prior_latents)
-
-        image_embeddings = prior_latents
-
-        output = self.decoder_pipe(
-            image=image_embeddings,
-            height=height,
-            width=width,
-            num_inference_steps=decoder_num_inference_steps,
-            guidance_scale=decoder_guidance_scale,
-            generator=generator,
-            output_type=output_type,
-            return_dict=return_dict,
-            num_images_per_prompt=decoder_num_images_per_prompt,
-            eta=decoder_eta,
-        )
-        return output
--- a/examples/dreambooth/README.md
+++ b/examples/dreambooth/README.md
@@ -5,7 +5,6 @@ The `train_dreambooth.py` script shows how to implement the training procedure a


 ## Running locally with PyTorch
-
 ### Installing the dependencies

 Before running the scripts, make sure to install the library's training dependencies:
@@ -131,42 +130,6 @@ accelerate launch train_dreambooth.py \
  --max_train_steps=800
 ```

-
-### Training on a 12GB GPU:
-
-It is possible to run dreambooth on a 12GB GPU by using the following optimizations:
- [gradient checkpointing and the 8-bit optimizer](#training-on-a-16gb-gpu)
- [xformers](#training-with-xformers)
- [setting grads to none](#set-grads-to-none)
-
-```bash
-export MODEL_NAME="CompVis/stable-diffusion-v1-4"
-export INSTANCE_DIR="path-to-instance-images"
-export CLASS_DIR="path-to-class-images"
-export OUTPUT_DIR="path-to-save-model"
-
-accelerate launch train_dreambooth.py \
-  --pretrained_model_name_or_path=$MODEL_NAME  \
-  --instance_data_dir=$INSTANCE_DIR \
-  --class_data_dir=$CLASS_DIR \
-  --output_dir=$OUTPUT_DIR \
-  --with_prior_preservation --prior_loss_weight=1.0 \
-  --instance_prompt="a photo of sks dog" \
-  --class_prompt="a photo of dog" \
-  --resolution=512 \
-  --train_batch_size=1 \
-  --gradient_accumulation_steps=1 --gradient_checkpointing \
-  --use_8bit_adam \
-  --enable_xformers_memory_efficient_attention \
-  --set_grads_to_none \
-  --learning_rate=2e-6 \
-  --lr_scheduler="constant" \
-  --lr_warmup_steps=0 \
-  --num_class_images=200 \
-  --max_train_steps=800
-```
-
-
 ### Training on a 8 GB GPU:

 By using [DeepSpeed](https://www.deepspeed.ai/) it's possible to offload some
@@ -241,9 +204,10 @@ accelerate launch train_dreambooth.py \
  --max_train_steps=800
 ```

-### Using DreamBooth for pipelines other than Stable Diffusion
+### Using DreamBooth for other pipelines than Stable Diffusion

-The [AltDiffusion pipeline](https://huggingface.co/docs/diffusers/api/pipelines/alt_diffusion) also supports dreambooth fine-tuning. The process is the same as above, all you need to do is replace the `MODEL_NAME` like this:
+Altdiffusion also support dreambooth now, the runing comman is basically the same as abouve, all you need to do is replace the `MODEL_NAME` like this:
+One can now simply change the `pretrained_model_name_or_path` to another architecture such as [`AltDiffusion`](https://huggingface.co/docs/diffusers/api/pipelines/alt_diffusion).

 ```
 export MODEL_NAME="CompVis/stable-diffusion-v1-4" --> export MODEL_NAME="BAAI/AltDiffusion-m9"
@@ -253,7 +217,7 @@ export MODEL_NAME="CompVis/stable-diffusion-v1-4" --> export MODEL_NAME="BAAI/Al

 ### Inference

-Once you have trained a model using the above command, you can run inference simply using the `StableDiffusionPipeline`. Make sure to include the `identifier` (e.g. sks in above example) in your prompt.
+Once you have trained a model using above command, the inference can be done simply using the `StableDiffusionPipeline`. Make sure to include the `identifier`(e.g. sks in above example) in your prompt.

 ```python
 from diffusers import StableDiffusionPipeline
@@ -272,102 +236,6 @@ image.save("dog-bucket.png")

 You can also perform inference from one of the checkpoints saved during the training process, if you used the `--checkpointing_steps` argument. Please, refer to [the documentation](https://huggingface.co/docs/diffusers/main/en/training/dreambooth#performing-inference-using-a-saved-checkpoint) to see how to do it.

-## Training with Low-Rank Adaptation of Large Language Models (LoRA)
-
-Low-Rank Adaption of Large Language Models was first introduced by Microsoft in [LoRA: Low-Rank Adaptation of Large Language Models](https://arxiv.org/abs/2106.09685) by *Edward J. Hu, Yelong Shen, Phillip Wallis, Zeyuan Allen-Zhu, Yuanzhi Li, Shean Wang, Lu Wang, Weizhu Chen*
-
-In a nutshell, LoRA allows to adapt pretrained models by adding pairs of rank-decomposition matrices to existing weights and **only** training those newly added weights. This has a couple of advantages:
- Previous pretrained weights are kept frozen so that the model is not prone to [catastrophic forgetting](https://www.pnas.org/doi/10.1073/pnas.1611835114)
- Rank-decomposition matrices have significantly fewer parameters than the original model, which means that trained LoRA weights are easily portable.
- LoRA attention layers allow to control to which extent the model is adapted torwards new training images via a `scale` parameter.
-
-[cloneofsimo](https://github.com/cloneofsimo) was the first to try out LoRA training for Stable Diffusion in 
-the popular [lora](https://github.com/cloneofsimo/lora) GitHub repository.
-
-### Training
-
-Let's get started with a simple example. We will re-use the dog example of the [previous section](#dog-toy-example).
-
-First, you need to set-up your dreambooth training example as is explained in the [installation section](#Installing-the-dependencies).
-Next, let's download the dog dataset. Download images from [here](https://drive.google.com/drive/folders/1BO_dyz-p65qhBRRMRA4TbZ8qW4rB99JZ) and save them in a directory. Make sure to set `INSTANCE_DIR` to the name of your directory further below. This will be our training data.
-
-Now, you can launch the training. Here we will use [Stable Diffusion 1-5](https://huggingface.co/runwayml/stable-diffusion-v1-5).
-
-**___Note: Change the `resolution` to 768 if you are using the [stable-diffusion-2](https://huggingface.co/stabilityai/stable-diffusion-2) 768x768 model.___**
-
-**___Note: It is quite useful to monitor the training progress by regularly generating sample images during training. [wandb](https://docs.wandb.ai/quickstart) is a nice solution to easily see generating images during training. All you need to do is to run `pip install wandb` before training and pass `--report_to="wandb"` to automatically log images.___**
-
-
-```bash
-export MODEL_NAME="runwayml/stable-diffusion-v1-5"
-export INSTANCE_DIR="path-to-instance-images"
-export OUTPUT_DIR="path-to-save-model"
-```
-
-For this example we want to directly store the trained LoRA embeddings on the Hub, so 
-we need to be logged in and add the `--push_to_hub` flag.
-
-```bash
-huggingface-cli login
-```
-
-Now we can start training!
-
-```bash
-accelerate launch train_dreambooth_lora.py \
-  --pretrained_model_name_or_path=$MODEL_NAME  \
-  --instance_data_dir=$INSTANCE_DIR \
-  --output_dir=$OUTPUT_DIR \
-  --instance_prompt="a photo of sks dog" \
-  --resolution=512 \
-  --train_batch_size=1 \
-  --gradient_accumulation_steps=1 \
-  --checkpointing_steps=100 \
-  --learning_rate=1e-4 \
-  --report_to="wandb" \
-  --lr_scheduler="constant" \
-  --lr_warmup_steps=0 \
-  --max_train_steps=500 \
-  --validation_prompt="A photo of sks dog in a bucket" \
-  --validation_epochs=50 \
-  --seed="0" \
-  --push_to_hub
-```
-
-**___Note: When using LoRA we can use a much higher learning rate compared to vanilla dreambooth. Here we 
-use *1e-4* instead of the usual *2e-6*.___**
-
-The final LoRA embedding weights have been uploaded to [patrickvonplaten/lora_dreambooth_dog_example](https://huggingface.co/patrickvonplaten/lora_dreambooth_dog_example). **___Note: [The final weights](https://huggingface.co/patrickvonplaten/lora/blob/main/pytorch_attn_procs.bin) are only 3 MB in size which is orders of magnitudes smaller than the original model.**
-
-The training results are summarized [here](https://api.wandb.ai/report/patrickvonplaten/xm6cd5q5).
-You can use the `Step` slider to see how the model learned the features of our subject while the model trained.
-
-### Inference
-
-After training, LoRA weights can be loaded very easily into the original pipeline. First, you need to 
-load the original pipeline:
-
-```python
-from diffusers import DiffusionPipeline, DPMSolverMultistepScheduler
-import torch
-
-pipe = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16)
-pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
-pipe.to("cuda")
-```
-
-Next, we can load the adapter layers into the UNet with the [`load_attn_procs` function](https://huggingface.co/docs/diffusers/api/loaders#diffusers.loaders.UNet2DConditionLoadersMixin.load_attn_procs).
-
-```python
-pipe.unet.load_attn_procs("patrickvonplaten/lora_dreambooth_dog_example")
-```
-
-Finally, we can run the model in inference.
-
-```python
-image = pipe("A picture of a sks dog in a bucket", num_inference_steps=25).images[0]
-```
-
 ## Training with Flax/JAX

 For faster training on TPUs and GPUs you can leverage the flax training example. Follow the instructions above to get the model and dataset before running the script.
@@ -453,12 +321,3 @@ python train_dreambooth_flax.py \
 You can enable memory efficient attention by [installing xFormers](https://github.com/facebookresearch/xformers#installing-xformers) and padding the `--enable_xformers_memory_efficient_attention` argument to the script. This is not available with the Flax/JAX implementation.

 You can also use Dreambooth to train the specialized in-painting model. See [the script in the research folder for details](https://github.com/huggingface/diffusers/tree/main/examples/research_projects/dreambooth_inpaint).
-
-### Set grads to none
-
-To save even more memory, pass the `--set_grads_to_none` argument to the script. This will set grads to None instead of zero. However, be aware that it changes certain behaviors, so if you start experiencing any problems, remove this argument.
-
-More info: https://pytorch.org/docs/stable/generated/torch.optim.Optimizer.zero_grad.html
-
-### Experimental results
-You can refer to [this blog post](https://huggingface.co/blog/dreambooth) that discusses some of DreamBooth experiments in detail. Specifically, it recommends a set of DreamBooth-specific tips and tricks that we have found to work well for a variety of subjects. 
--- a/examples/dreambooth/requirements.txt
+++ b/examples/dreambooth/requirements.txt
@@ -3,4 +3,4 @@ torchvision
 transformers>=4.25.1
 ftfy
 tensorboard
-Jinja2
+modelcards
--- a/examples/dreambooth/requirements_flax.txt
+++ b/examples/dreambooth/requirements_flax.txt
@@ -5,4 +5,4 @@ torch
 torchvision
 ftfy
 tensorboard
-Jinja2
+modelcards
--- a/examples/dreambooth/train_dreambooth.py
+++ b/examples/dreambooth/train_dreambooth.py
@@ -1,18 +1,3 @@
-#!/usr/bin/env python
-# coding=utf-8
-# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-
 import argparse
 import hashlib
 import itertools
@@ -38,7 +23,7 @@ from diffusers import AutoencoderKL, DDPMScheduler, DiffusionPipeline, UNet2DCon
 from diffusers.optimization import get_scheduler
 from diffusers.utils import check_min_version
 from diffusers.utils.import_utils import is_xformers_available
-from huggingface_hub import HfFolder, Repository, create_repo, whoami
+from huggingface_hub import HfFolder, Repository, whoami
 from PIL import Image
 from torchvision import transforms
 from tqdm.auto import tqdm
@@ -46,7 +31,7 @@ from transformers import AutoTokenizer, PretrainedConfig


 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.12.0")
+check_min_version("0.10.0.dev0")

 logger = get_logger(__name__)

@@ -156,13 +141,7 @@ def parse_args(input_args=None):
        ),
    )
    parser.add_argument(
-        "--center_crop",
-        default=False,
-        action="store_true",
-        help=(
-            "Whether to center crop the input images to the resolution. If not set, the images will be randomly"
-            " cropped. The images will be resized to the resolution first before cropping."
-        ),
+        "--center_crop", action="store_true", help="Whether to center crop images before resizing to resolution"
    )
    parser.add_argument(
        "--train_text_encoder",
@@ -246,14 +225,6 @@ def parse_args(input_args=None):
    parser.add_argument(
        "--use_8bit_adam", action="store_true", help="Whether or not to use 8-bit Adam from bitsandbytes."
    )
-    parser.add_argument(
-        "--dataloader_num_workers",
-        type=int,
-        default=0,
-        help=(
-            "Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process."
-        ),
-    )
    parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.")
    parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.")
    parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.")
@@ -318,15 +289,6 @@ def parse_args(input_args=None):
    parser.add_argument(
        "--enable_xformers_memory_efficient_attention", action="store_true", help="Whether or not to use xformers."
    )
-    parser.add_argument(
-        "--set_grads_to_none",
-        action="store_true",
-        help=(
-            "Save more memory by using setting grads to None instead of zero. Be aware, that this changes certain"
-            " behaviors, so disable this argument if it causes any problems. More info:"
-            " https://pytorch.org/docs/stable/generated/torch.optim.Optimizer.zero_grad.html"
-        ),
-    )

    if input_args is not None:
        args = parser.parse_args(input_args)
@@ -374,7 +336,7 @@ class DreamBoothDataset(Dataset):

        self.instance_data_root = Path(instance_data_root)
        if not self.instance_data_root.exists():
-            raise ValueError(f"Instance {self.instance_data_root} images root doesn't exists.")
+            raise ValueError("Instance images root doesn't exists.")

        self.instance_images_path = list(Path(instance_data_root).iterdir())
        self.num_instance_images = len(self.instance_images_path)
@@ -574,8 +536,7 @@ def main(args):
                repo_name = get_full_repo_name(Path(args.output_dir).name, token=args.hub_token)
            else:
                repo_name = args.hub_model_id
-            create_repo(repo_name, exist_ok=True, token=args.hub_token)
-            repo = Repository(args.output_dir, clone_from=repo_name, token=args.hub_token)
+            repo = Repository(args.output_dir, clone_from=repo_name)

            with open(os.path.join(args.output_dir, ".gitignore"), "w+") as gitignore:
                if "step_*" not in gitignore:
@@ -675,7 +636,7 @@ def main(args):
        batch_size=args.train_batch_size,
        shuffle=True,
        collate_fn=lambda examples: collate_fn(examples, args.with_prior_preservation),
-        num_workers=args.dataloader_num_workers,
+        num_workers=1,
    )

    # Scheduler and math around the number of training steps.
@@ -722,16 +683,11 @@ def main(args):
        " doing mixed precision training. copy of the weights should still be float32."
    )

-    if accelerator.unwrap_model(unet).dtype != torch.float32:
-        raise ValueError(
-            f"Unet loaded as datatype {accelerator.unwrap_model(unet).dtype}. {low_precision_error_string}"
-        )
+    if unet.dtype != torch.float32:
+        raise ValueError(f"Unet loaded as datatype {unet.dtype}. {low_precision_error_string}")

-    if args.train_text_encoder and accelerator.unwrap_model(text_encoder).dtype != torch.float32:
-        raise ValueError(
-            f"Text encoder loaded as datatype {accelerator.unwrap_model(text_encoder).dtype}."
-            f" {low_precision_error_string}"
-        )
+    if args.train_text_encoder and text_encoder.dtype != torch.float32:
+        raise ValueError(f"Text encoder loaded as datatype {text_encoder.dtype}. {low_precision_error_string}")

    # We need to recalculate our total training steps as the size of the training dataloader may have changed.
    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
@@ -768,21 +724,14 @@ def main(args):
            dirs = os.listdir(args.output_dir)
            dirs = [d for d in dirs if d.startswith("checkpoint")]
            dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
-            path = dirs[-1] if len(dirs) > 0 else None
+            path = dirs[-1]
+        accelerator.print(f"Resuming from checkpoint {path}")
+        accelerator.load_state(os.path.join(args.output_dir, path))
+        global_step = int(path.split("-")[1])

-        if path is None:
-            accelerator.print(
-                f"Checkpoint '{args.resume_from_checkpoint}' does not exist. Starting a new training run."
-            )
-            args.resume_from_checkpoint = None
-        else:
-            accelerator.print(f"Resuming from checkpoint {path}")
-            accelerator.load_state(os.path.join(args.output_dir, path))
-            global_step = int(path.split("-")[1])
-
-            resume_global_step = global_step * args.gradient_accumulation_steps
-            first_epoch = global_step // num_update_steps_per_epoch
-            resume_step = resume_global_step % (num_update_steps_per_epoch * args.gradient_accumulation_steps)
+        resume_global_step = global_step * args.gradient_accumulation_steps
+        first_epoch = resume_global_step // num_update_steps_per_epoch
+        resume_step = resume_global_step % num_update_steps_per_epoch

    # Only show the progress bar once on each machine.
    progress_bar = tqdm(range(global_step, args.max_train_steps), disable=not accelerator.is_local_main_process)
@@ -855,7 +804,7 @@ def main(args):
                    accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm)
                optimizer.step()
                lr_scheduler.step()
-                optimizer.zero_grad(set_to_none=args.set_grads_to_none)
+                optimizer.zero_grad()

            # Checks if the accelerator has performed an optimization step behind the scenes
            if accelerator.sync_gradients:
--- a/examples/dreambooth/train_dreambooth_flax.py
+++ b/examples/dreambooth/train_dreambooth_flax.py
@@ -27,7 +27,7 @@ from diffusers.utils import check_min_version
 from flax import jax_utils
 from flax.training import train_state
 from flax.training.common_utils import shard
-from huggingface_hub import HfFolder, Repository, create_repo, whoami
+from huggingface_hub import HfFolder, Repository, whoami
 from jax.experimental.compilation_cache import compilation_cache as cc
 from PIL import Image
 from torchvision import transforms
@@ -36,7 +36,7 @@ from transformers import CLIPFeatureExtractor, CLIPTokenizer, FlaxCLIPTextModel,


 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.12.0")
+check_min_version("0.10.0.dev0")

 # Cache compiled models across invocations of this script.
 cc.initialize_cache(os.path.expanduser("~/.cache/jax/compilation_cache"))
@@ -132,13 +132,7 @@ def parse_args():
        ),
    )
    parser.add_argument(
-        "--center_crop",
-        default=False,
-        action="store_true",
-        help=(
-            "Whether to center crop the input images to the resolution. If not set, the images will be randomly"
-            " cropped. The images will be resized to the resolution first before cropping."
-        ),
+        "--center_crop", action="store_true", help="Whether to center crop images before resizing to resolution"
    )
    parser.add_argument("--train_text_encoder", action="store_true", help="Whether to train the text encoder")
    parser.add_argument(
@@ -393,8 +387,7 @@ def main():
                repo_name = get_full_repo_name(Path(args.output_dir).name, token=args.hub_token)
            else:
                repo_name = args.hub_model_id
-            create_repo(repo_name, exist_ok=True, token=args.hub_token)
-            repo = Repository(args.output_dir, clone_from=repo_name, token=args.hub_token)
+            repo = Repository(args.output_dir, clone_from=repo_name)

            with open(os.path.join(args.output_dir, ".gitignore"), "w+") as gitignore:
                if "step_*" not in gitignore:
--- a/examples/dreambooth/train_dreambooth_lora.py
+++ b/examples/dreambooth/train_dreambooth_lora.py
--- a/examples/research_projects/colossalai/README.md
+++ b/examples/research_projects/colossalai/README.md
@@ -10,24 +10,20 @@ By accommodating model data in CPU and GPU and moving the data to the computing
 Before running the scripts, make sure to install the library's training dependencies:

 ```bash
-pip install -r requirements.txt
+pip install -r requirements_colossalai.txt
 ```

-## Install [ColossalAI](https://github.com/hpcaitech/ColossalAI.git)
+### Install [colossalai](https://github.com/hpcaitech/ColossalAI.git)

-**From PyPI**
 ```bash
-pip install colossalai
+pip install colossalai==0.2.0+torch1.12cu11.3 -f https://release.colossalai.org
 ```

 **From source**

 ```bash
 git clone https://github.com/hpcaitech/ColossalAI.git
-cd ColossalAI
-
-# install colossalai
-pip install .
+python setup.py install
 ```

 ## Dataset for Teyvat BLIP captions
--- a/examples/research_projects/colossalai/requirement.txt
+++ b/examples/research_projects/colossalai/requirement.txt
@@ -3,5 +3,5 @@ torch
 torchvision
 ftfy
 tensorboard
-Jinja2
+modelcards
 transformers
--- a/examples/research_projects/colossalai/train_dreambooth_colossalai.py
+++ b/examples/research_projects/colossalai/train_dreambooth_colossalai.py
@@ -20,7 +20,7 @@ from colossalai.utils import get_current_device
 from colossalai.utils.model.colo_init_context import ColoInitContext
 from diffusers import AutoencoderKL, DDPMScheduler, DiffusionPipeline, UNet2DConditionModel
 from diffusers.optimization import get_scheduler
-from huggingface_hub import HfFolder, Repository, create_repo, whoami
+from huggingface_hub import HfFolder, Repository, whoami
 from PIL import Image
 from torchvision import transforms
 from tqdm.auto import tqdm
@@ -139,13 +139,7 @@ def parse_args(input_args=None):
        help="Placement Policy for Gemini. Valid when using colossalai as dist plan.",
    )
    parser.add_argument(
-        "--center_crop",
-        default=False,
-        action="store_true",
-        help=(
-            "Whether to center crop the input images to the resolution. If not set, the images will be randomly"
-            " cropped. The images will be resized to the resolution first before cropping."
-        ),
+        "--center_crop", action="store_true", help="Whether to center crop images before resizing to resolution"
    )
    parser.add_argument(
        "--train_batch_size", type=int, default=4, help="Batch size (per device) for the training dataloader."
@@ -371,11 +365,6 @@ def gemini_zero_dpp(model: torch.nn.Module, placememt_policy: str = "auto"):


 def main(args):
-    if args.seed is None:
-        colossalai.launch_from_torch(config={})
-    else:
-        colossalai.launch_from_torch(config={}, seed=args.seed)
-
    colossalai.launch_from_torch(config={})

    if args.seed is not None:
@@ -426,8 +415,7 @@ def main(args):
                repo_name = get_full_repo_name(Path(args.output_dir).name, token=args.hub_token)
            else:
                repo_name = args.hub_model_id
-            create_repo(repo_name, exist_ok=True, token=args.hub_token)
-            repo = Repository(args.output_dir, clone_from=repo_name, token=args.hub_token)
+            repo = Repository(args.output_dir, clone_from=repo_name)

            with open(os.path.join(args.output_dir, ".gitignore"), "w+") as gitignore:
                if "step_*" not in gitignore:
--- a/examples/research_projects/dreambooth_inpaint/requirements.txt
+++ b/examples/research_projects/dreambooth_inpaint/requirements.txt
@@ -4,4 +4,4 @@ torchvision
 transformers>=4.21.0
 ftfy
 tensorboard
-Jinja2
+modelcards
--- a/examples/research_projects/dreambooth_inpaint/train_dreambooth_inpaint.py
+++ b/examples/research_projects/dreambooth_inpaint/train_dreambooth_inpaint.py
@@ -25,7 +25,7 @@ from diffusers import (
 )
 from diffusers.optimization import get_scheduler
 from diffusers.utils import check_min_version
-from huggingface_hub import HfFolder, Repository, create_repo, whoami
+from huggingface_hub import HfFolder, Repository, whoami
 from PIL import Image, ImageDraw
 from torchvision import transforms
 from tqdm.auto import tqdm
@@ -154,13 +154,7 @@ def parse_args():
        ),
    )
    parser.add_argument(
-        "--center_crop",
-        default=False,
-        action="store_true",
-        help=(
-            "Whether to center crop the input images to the resolution. If not set, the images will be randomly"
-            " cropped. The images will be resized to the resolution first before cropping."
-        ),
+        "--center_crop", action="store_true", help="Whether to center crop images before resizing to resolution"
    )
    parser.add_argument("--train_text_encoder", action="store_true", help="Whether to train the text encoder")
    parser.add_argument(
@@ -477,8 +471,7 @@ def main():
                repo_name = get_full_repo_name(Path(args.output_dir).name, token=args.hub_token)
            else:
                repo_name = args.hub_model_id
-            create_repo(repo_name, exist_ok=True, token=args.hub_token)
-            repo = Repository(args.output_dir, clone_from=repo_name, token=args.hub_token)
+            repo = Repository(args.output_dir, clone_from=repo_name)

            with open(os.path.join(args.output_dir, ".gitignore"), "w+") as gitignore:
                if "step_*" not in gitignore:
@@ -666,21 +659,14 @@ def main():
            dirs = os.listdir(args.output_dir)
            dirs = [d for d in dirs if d.startswith("checkpoint")]
            dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
-            path = dirs[-1] if len(dirs) > 0 else None
+            path = dirs[-1]
+        accelerator.print(f"Resuming from checkpoint {path}")
+        accelerator.load_state(os.path.join(args.output_dir, path))
+        global_step = int(path.split("-")[1])

-        if path is None:
-            accelerator.print(
-                f"Checkpoint '{args.resume_from_checkpoint}' does not exist. Starting a new training run."
-            )
-            args.resume_from_checkpoint = None
-        else:
-            accelerator.print(f"Resuming from checkpoint {path}")
-            accelerator.load_state(os.path.join(args.output_dir, path))
-            global_step = int(path.split("-")[1])
-
-            resume_global_step = global_step * args.gradient_accumulation_steps
-            first_epoch = global_step // num_update_steps_per_epoch
-            resume_step = resume_global_step % (num_update_steps_per_epoch * args.gradient_accumulation_steps)
+        resume_global_step = global_step * args.gradient_accumulation_steps
+        first_epoch = resume_global_step // num_update_steps_per_epoch
+        resume_step = resume_global_step % num_update_steps_per_epoch

    # Only show the progress bar once on each machine.
    progress_bar = tqdm(range(global_step, args.max_train_steps), disable=not accelerator.is_local_main_process)
--- a/examples/research_projects/intel_opts/textual_inversion/requirements.txt
+++ b/examples/research_projects/intel_opts/textual_inversion/requirements.txt
@@ -3,5 +3,5 @@ torchvision
 transformers>=4.21.0
 ftfy
 tensorboard
-Jinja2
+modelcards
 intel_extension_for_pytorch>=1.13
--- a/examples/research_projects/intel_opts/textual_inversion/textual_inversion_bf16.py
+++ b/examples/research_projects/intel_opts/textual_inversion/textual_inversion_bf16.py
@@ -21,7 +21,7 @@ from diffusers import AutoencoderKL, DDPMScheduler, PNDMScheduler, StableDiffusi
 from diffusers.optimization import get_scheduler
 from diffusers.pipelines.stable_diffusion import StableDiffusionSafetyChecker
 from diffusers.utils import check_min_version
-from huggingface_hub import HfFolder, Repository, create_repo, whoami
+from huggingface_hub import HfFolder, Repository, whoami

 # TODO: remove and import from diffusers.utils when the new version of diffusers is released
 from packaging import version
@@ -130,7 +130,7 @@ def parse_args():
        ),
    )
    parser.add_argument(
-        "--center_crop", action="store_true", help="Whether to center crop images before resizing to resolution."
+        "--center_crop", action="store_true", help="Whether to center crop images before resizing to resolution"
    )
    parser.add_argument(
        "--train_batch_size", type=int, default=16, help="Batch size (per device) for the training dataloader."
@@ -336,10 +336,7 @@ class TextualInversionDataset(Dataset):

        if self.center_crop:
            crop = min(img.shape[0], img.shape[1])
-            (
-                h,
-                w,
-            ) = (
+            (h, w,) = (
                img.shape[0],
                img.shape[1],
            )
@@ -393,8 +390,7 @@ def main():
                repo_name = get_full_repo_name(Path(args.output_dir).name, token=args.hub_token)
            else:
                repo_name = args.hub_model_id
-            create_repo(repo_name, exist_ok=True, token=args.hub_token)
-            repo = Repository(args.output_dir, clone_from=repo_name, token=args.hub_token)
+            repo = Repository(args.output_dir, clone_from=repo_name)

            with open(os.path.join(args.output_dir, ".gitignore"), "w+") as gitignore:
                if "step_*" not in gitignore:
--- a/examples/research_projects/multi_subject_dreambooth/README.md
+++ b/examples/research_projects/multi_subject_dreambooth/README.md
@@ -1,291 +0,0 @@
-# Multi Subject DreamBooth training
-
-[DreamBooth](https://arxiv.org/abs/2208.12242) is a method to personalize text2image models like stable diffusion given just a few(3~5) images of a subject.
-This `train_multi_subject_dreambooth.py` script shows how to implement the training procedure for one or more subjects and adapt it for stable diffusion. Note that this code is based off of the `examples/dreambooth/train_dreambooth.py` script as of 01/06/2022.
-
-This script was added by @kopsahlong, and is not actively maintained. However, if you come across anything that could use fixing, feel free to open an issue and tag @kopsahlong.
-
-## Running locally with PyTorch
-### Installing the dependencies
-
-Before running the script, make sure to install the library's training dependencies:
-
-To start, execute the following steps in a new virtual environment:
-```bash
-git clone https://github.com/huggingface/diffusers
-cd diffusers
-pip install -e .
-```
-
-Then cd into the folder `diffusers/examples/research_projects/multi_subject_dreambooth` and run the following:
-```bash
-pip install -r requirements.txt
-```
-
-And initialize an [🤗Accelerate](https://github.com/huggingface/accelerate/) environment with:
-
-```bash
-accelerate config
-```
-
-Or for a default accelerate configuration without answering questions about your environment
-
-```bash
-accelerate config default
-```
-
-Or if your environment doesn't support an interactive shell e.g. a notebook
-
-```python
-from accelerate.utils import write_basic_config
-write_basic_config()
-```
-
-### Multi Subject Training Example
-In order to have your model learn multiple concepts at once, we simply add in the additional data directories and prompts to our `instance_data_dir` and `instance_prompt` (as well as `class_data_dir` and `class_prompt` if `--with_prior_preservation` is specified) as one comma separated string.
-
-See an example with 2 subjects below, which learns a model for one dog subject and one human subject:
-
-```bash
-export MODEL_NAME="CompVis/stable-diffusion-v1-4"
-export OUTPUT_DIR="path-to-save-model"
-
-# Subject 1
-export INSTANCE_DIR_1="path-to-instance-images-concept-1"
-export INSTANCE_PROMPT_1="a photo of a sks dog"
-export CLASS_DIR_1="path-to-class-images-dog"
-export CLASS_PROMPT_1="a photo of a dog"
-
-# Subject 2
-export INSTANCE_DIR_2="path-to-instance-images-concept-2"
-export INSTANCE_PROMPT_2="a photo of a t@y person"
-export CLASS_DIR_2="path-to-class-images-person"
-export CLASS_PROMPT_2="a photo of a person"
-
-accelerate launch train_multi_subject_dreambooth.py \
-  --pretrained_model_name_or_path=$MODEL_NAME  \
-  --instance_data_dir="$INSTANCE_DIR_1,$INSTANCE_DIR_2" \
-  --output_dir=$OUTPUT_DIR \
-  --train_text_encoder \
-  --instance_prompt="$INSTANCE_PROMPT_1,$INSTANCE_PROMPT_2" \
-  --with_prior_preservation \
-  --prior_loss_weight=1.0 \
-  --class_data_dir="$CLASS_DIR_1,$CLASS_DIR_2" \
-  --class_prompt="$CLASS_PROMPT_1,$CLASS_PROMPT_2"\
-  --num_class_images=50 \
-  --resolution=512 \
-  --train_batch_size=1 \
-  --gradient_accumulation_steps=1 \
-  --learning_rate=1e-6 \
-  --lr_scheduler="constant" \
-  --lr_warmup_steps=0 \
-  --max_train_steps=1500
-```
-
-This example shows training for 2 subjects, but please note that the model can be trained on any number of new concepts. This can be done by continuing to add in the corresponding directories and prompts to the corresponding comma separated string.
-
-Note also that in this script, `sks` and `t@y` were used as tokens to learn the new subjects ([this thread](https://github.com/XavierXiao/Dreambooth-Stable-Diffusion/issues/71) inspired the use of `t@y` as our second identifier). However, there may be better rare tokens to experiment with, and results also seemed to be good when more intuitive words are used.
-
-### Inference
-
-Once you have trained a model using above command, the inference can be done simply using the `StableDiffusionPipeline`. Make sure to include the `identifier`(e.g. sks in above example) in your prompt.
-
-```python
-from diffusers import StableDiffusionPipeline
-import torch
-
-model_id = "path-to-your-trained-model"
-pipe = StableDiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16).to("cuda")
-
-prompt = "A photo of a t@y person petting an sks dog"
-image = pipe(prompt, num_inference_steps=200, guidance_scale=7.5).images[0]
-
-image.save("person-petting-dog.png")
-```
-
-### Inference from a training checkpoint
-
-You can also perform inference from one of the checkpoints saved during the training process, if you used the `--checkpointing_steps` argument. Please, refer to [the documentation](https://huggingface.co/docs/diffusers/main/en/training/dreambooth#performing-inference-using-a-saved-checkpoint) to see how to do it.
-
-## Additional Dreambooth documentation
-Because the `train_multi_subject_dreambooth.py` script here was forked from an original version of `train_dreambooth.py` in the `examples/dreambooth` folder, I've included the original applicable training documentation for single subject examples below.
-
-This should explain how to play with training variables such as prior preservation, fine tuning the text encoder, etc. which is still applicable to our multi subject training code. Note also that the examples below, which are single subject examples, also work with `train_multi_subject_dreambooth.py`, as this script supports 1 (or more) subjects.
-
-### Single subject dog toy example
-
-Let's get our dataset. Download images from [here](https://drive.google.com/drive/folders/1BO_dyz-p65qhBRRMRA4TbZ8qW4rB99JZ) and save them in a directory. This will be our training data.
-
-And launch the training using
-
-**___Note: Change the `resolution` to 768 if you are using the [stable-diffusion-2](https://huggingface.co/stabilityai/stable-diffusion-2) 768x768 model.___**
-
-```bash
-export MODEL_NAME="CompVis/stable-diffusion-v1-4"
-export INSTANCE_DIR="path-to-instance-images"
-export OUTPUT_DIR="path-to-save-model"
-
-accelerate launch train_dreambooth.py \
-  --pretrained_model_name_or_path=$MODEL_NAME  \
-  --instance_data_dir=$INSTANCE_DIR \
-  --output_dir=$OUTPUT_DIR \
-  --instance_prompt="a photo of sks dog" \
-  --resolution=512 \
-  --train_batch_size=1 \
-  --gradient_accumulation_steps=1 \
-  --learning_rate=5e-6 \
-  --lr_scheduler="constant" \
-  --lr_warmup_steps=0 \
-  --max_train_steps=400
-```
-
-### Training with prior-preservation loss
-
-Prior-preservation is used to avoid overfitting and language-drift. Refer to the paper to learn more about it. For prior-preservation we first generate images using the model with a class prompt and then use those during training along with our data.
-According to the paper, it's recommended to generate `num_epochs * num_samples` images for prior-preservation. 200-300 works well for most cases. The `num_class_images` flag sets the number of images to generate with the class prompt. You can place existing images in `class_data_dir`, and the training script will generate any additional images so that `num_class_images` are present in `class_data_dir` during training time.
-
-```bash
-export MODEL_NAME="CompVis/stable-diffusion-v1-4"
-export INSTANCE_DIR="path-to-instance-images"
-export CLASS_DIR="path-to-class-images"
-export OUTPUT_DIR="path-to-save-model"
-
-accelerate launch train_dreambooth.py \
-  --pretrained_model_name_or_path=$MODEL_NAME  \
-  --instance_data_dir=$INSTANCE_DIR \
-  --class_data_dir=$CLASS_DIR \
-  --output_dir=$OUTPUT_DIR \
-  --with_prior_preservation --prior_loss_weight=1.0 \
-  --instance_prompt="a photo of sks dog" \
-  --class_prompt="a photo of dog" \
-  --resolution=512 \
-  --train_batch_size=1 \
-  --gradient_accumulation_steps=1 \
-  --learning_rate=5e-6 \
-  --lr_scheduler="constant" \
-  --lr_warmup_steps=0 \
-  --num_class_images=200 \
-  --max_train_steps=800
-```
-
-
-### Training on a 16GB GPU:
-
-With the help of gradient checkpointing and the 8-bit optimizer from bitsandbytes it's possible to run train dreambooth on a 16GB GPU.
-
-To install `bitandbytes` please refer to this [readme](https://github.com/TimDettmers/bitsandbytes#requirements--installation).
-
-```bash
-export MODEL_NAME="CompVis/stable-diffusion-v1-4"
-export INSTANCE_DIR="path-to-instance-images"
-export CLASS_DIR="path-to-class-images"
-export OUTPUT_DIR="path-to-save-model"
-
-accelerate launch train_dreambooth.py \
-  --pretrained_model_name_or_path=$MODEL_NAME  \
-  --instance_data_dir=$INSTANCE_DIR \
-  --class_data_dir=$CLASS_DIR \
-  --output_dir=$OUTPUT_DIR \
-  --with_prior_preservation --prior_loss_weight=1.0 \
-  --instance_prompt="a photo of sks dog" \
-  --class_prompt="a photo of dog" \
-  --resolution=512 \
-  --train_batch_size=1 \
-  --gradient_accumulation_steps=2 --gradient_checkpointing \
-  --use_8bit_adam \
-  --learning_rate=5e-6 \
-  --lr_scheduler="constant" \
-  --lr_warmup_steps=0 \
-  --num_class_images=200 \
-  --max_train_steps=800
-```
-
-### Training on a 8 GB GPU:
-
-By using [DeepSpeed](https://www.deepspeed.ai/) it's possible to offload some
-tensors from VRAM to either CPU or NVME allowing to train with less VRAM.
-
-DeepSpeed needs to be enabled with `accelerate config`. During configuration
-answer yes to "Do you want to use DeepSpeed?". With DeepSpeed stage 2, fp16
-mixed precision and offloading both parameters and optimizer state to cpu it's
-possible to train on under 8 GB VRAM with a drawback of requiring significantly
-more RAM (about 25 GB). See [documentation](https://huggingface.co/docs/accelerate/usage_guides/deepspeed) for more DeepSpeed configuration options.
-
-Changing the default Adam optimizer to DeepSpeed's special version of Adam
-`deepspeed.ops.adam.DeepSpeedCPUAdam` gives a substantial speedup but enabling
-it requires CUDA toolchain with the same version as pytorch. 8-bit optimizer
-does not seem to be compatible with DeepSpeed at the moment.
-
-```bash
-export MODEL_NAME="CompVis/stable-diffusion-v1-4"
-export INSTANCE_DIR="path-to-instance-images"
-export CLASS_DIR="path-to-class-images"
-export OUTPUT_DIR="path-to-save-model"
-
-accelerate launch --mixed_precision="fp16" train_dreambooth.py \
-  --pretrained_model_name_or_path=$MODEL_NAME \
-  --instance_data_dir=$INSTANCE_DIR \
-  --class_data_dir=$CLASS_DIR \
-  --output_dir=$OUTPUT_DIR \
-  --with_prior_preservation --prior_loss_weight=1.0 \
-  --instance_prompt="a photo of sks dog" \
-  --class_prompt="a photo of dog" \
-  --resolution=512 \
-  --train_batch_size=1 \
-  --sample_batch_size=1 \
-  --gradient_accumulation_steps=1 --gradient_checkpointing \
-  --learning_rate=5e-6 \
-  --lr_scheduler="constant" \
-  --lr_warmup_steps=0 \
-  --num_class_images=200 \
-  --max_train_steps=800
-```
-
-### Fine-tune text encoder with the UNet.
-
-The script also allows to fine-tune the `text_encoder` along with the `unet`. It's been observed experimentally that fine-tuning `text_encoder` gives much better results especially on faces. 
-Pass the `--train_text_encoder` argument to the script to enable training `text_encoder`.
-
-___Note: Training text encoder requires more memory, with this option the training won't fit on 16GB GPU. It needs at least 24GB VRAM.___
-
-```bash
-export MODEL_NAME="CompVis/stable-diffusion-v1-4"
-export INSTANCE_DIR="path-to-instance-images"
-export CLASS_DIR="path-to-class-images"
-export OUTPUT_DIR="path-to-save-model"
-
-accelerate launch train_dreambooth.py \
-  --pretrained_model_name_or_path=$MODEL_NAME  \
-  --train_text_encoder \
-  --instance_data_dir=$INSTANCE_DIR \
-  --class_data_dir=$CLASS_DIR \
-  --output_dir=$OUTPUT_DIR \
-  --with_prior_preservation --prior_loss_weight=1.0 \
-  --instance_prompt="a photo of sks dog" \
-  --class_prompt="a photo of dog" \
-  --resolution=512 \
-  --train_batch_size=1 \
-  --use_8bit_adam \
-  --gradient_checkpointing \
-  --learning_rate=2e-6 \
-  --lr_scheduler="constant" \
-  --lr_warmup_steps=0 \
-  --num_class_images=200 \
-  --max_train_steps=800
-```
-
-### Using DreamBooth for other pipelines than Stable Diffusion
-
-Altdiffusion also support dreambooth now, the runing comman is basically the same as abouve, all you need to do is replace the `MODEL_NAME` like this:
-One can now simply change the `pretrained_model_name_or_path` to another architecture such as [`AltDiffusion`](https://huggingface.co/docs/diffusers/api/pipelines/alt_diffusion).
-
-```
-export MODEL_NAME="CompVis/stable-diffusion-v1-4" --> export MODEL_NAME="BAAI/AltDiffusion-m9"
-or
-export MODEL_NAME="CompVis/stable-diffusion-v1-4" --> export MODEL_NAME="BAAI/AltDiffusion"
-```
-
-### Training with xformers:
-You can enable memory efficient attention by [installing xFormers](https://github.com/facebookresearch/xformers#installing-xformers) and padding the `--enable_xformers_memory_efficient_attention` argument to the script. This is not available with the Flax/JAX implementation.
-
-You can also use Dreambooth to train the specialized in-painting model. See [the script in the research folder for details](https://github.com/huggingface/diffusers/tree/main/examples/research_projects/dreambooth_inpaint).
--- a/examples/research_projects/multi_subject_dreambooth/requirements.txt
+++ b/examples/research_projects/multi_subject_dreambooth/requirements.txt
@@ -1,6 +0,0 @@
-accelerate
-torchvision
-transformers>=4.25.1
-ftfy
-tensorboard
-Jinja2
--- a/examples/research_projects/multi_subject_dreambooth/train_multi_subject_dreambooth.py
+++ b/examples/research_projects/multi_subject_dreambooth/train_multi_subject_dreambooth.py
@@ -1,883 +0,0 @@
-import argparse
-import hashlib
-import itertools
-import logging
-import math
-import os
-import warnings
-from pathlib import Path
-from typing import Optional
-
-import torch
-import torch.nn.functional as F
-import torch.utils.checkpoint
-from torch.utils.data import Dataset
-
-import datasets
-import diffusers
-import transformers
-from accelerate import Accelerator
-from accelerate.logging import get_logger
-from accelerate.utils import set_seed
-from diffusers import AutoencoderKL, DDPMScheduler, DiffusionPipeline, UNet2DConditionModel
-from diffusers.optimization import get_scheduler
-from diffusers.utils import check_min_version
-from diffusers.utils.import_utils import is_xformers_available
-from huggingface_hub import HfFolder, Repository, create_repo, whoami
-from PIL import Image
-from torchvision import transforms
-from tqdm.auto import tqdm
-from transformers import AutoTokenizer, PretrainedConfig
-
-
-# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.10.0.dev0")
-
-logger = get_logger(__name__)
-
-
-def import_model_class_from_model_name_or_path(pretrained_model_name_or_path: str, revision: str):
-    text_encoder_config = PretrainedConfig.from_pretrained(
-        pretrained_model_name_or_path,
-        subfolder="text_encoder",
-        revision=revision,
-    )
-    model_class = text_encoder_config.architectures[0]
-
-    if model_class == "CLIPTextModel":
-        from transformers import CLIPTextModel
-
-        return CLIPTextModel
-    elif model_class == "RobertaSeriesModelWithTransformation":
-        from diffusers.pipelines.alt_diffusion.modeling_roberta_series import RobertaSeriesModelWithTransformation
-
-        return RobertaSeriesModelWithTransformation
-    else:
-        raise ValueError(f"{model_class} is not supported.")
-
-
-def parse_args(input_args=None):
-    parser = argparse.ArgumentParser(description="Simple example of a training script.")
-    parser.add_argument(
-        "--pretrained_model_name_or_path",
-        type=str,
-        default=None,
-        required=True,
-        help="Path to pretrained model or model identifier from huggingface.co/models.",
-    )
-    parser.add_argument(
-        "--revision",
-        type=str,
-        default=None,
-        required=False,
-        help="Revision of pretrained model identifier from huggingface.co/models.",
-    )
-    parser.add_argument(
-        "--tokenizer_name",
-        type=str,
-        default=None,
-        help="Pretrained tokenizer name or path if not the same as model_name",
-    )
-    parser.add_argument(
-        "--instance_data_dir",
-        type=str,
-        default=None,
-        required=True,
-        help="A folder containing the training data of instance images.",
-    )
-    parser.add_argument(
-        "--class_data_dir",
-        type=str,
-        default=None,
-        required=False,
-        help="A folder containing the training data of class images.",
-    )
-    parser.add_argument(
-        "--instance_prompt",
-        type=str,
-        default=None,
-        required=True,
-        help="The prompt with identifier specifying the instance",
-    )
-    parser.add_argument(
-        "--class_prompt",
-        type=str,
-        default=None,
-        help="The prompt to specify images in the same class as provided instance images.",
-    )
-    parser.add_argument(
-        "--with_prior_preservation",
-        default=False,
-        action="store_true",
-        help="Flag to add prior preservation loss.",
-    )
-    parser.add_argument("--prior_loss_weight", type=float, default=1.0, help="The weight of prior preservation loss.")
-    parser.add_argument(
-        "--num_class_images",
-        type=int,
-        default=100,
-        help=(
-            "Minimal class images for prior preservation loss. If there are not enough images already present in"
-            " class_data_dir, additional images will be sampled with class_prompt."
-        ),
-    )
-    parser.add_argument(
-        "--output_dir",
-        type=str,
-        default="text-inversion-model",
-        help="The output directory where the model predictions and checkpoints will be written.",
-    )
-    parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
-    parser.add_argument(
-        "--resolution",
-        type=int,
-        default=512,
-        help=(
-            "The resolution for input images, all the images in the train/validation dataset will be resized to this"
-            " resolution"
-        ),
-    )
-    parser.add_argument(
-        "--center_crop",
-        default=False,
-        action="store_true",
-        help=(
-            "Whether to center crop the input images to the resolution. If not set, the images will be randomly"
-            " cropped. The images will be resized to the resolution first before cropping."
-        ),
-    )
-    parser.add_argument("--train_text_encoder", action="store_true", help="Whether to train the text encoder")
-    parser.add_argument(
-        "--train_batch_size", type=int, default=4, help="Batch size (per device) for the training dataloader."
-    )
-    parser.add_argument(
-        "--sample_batch_size", type=int, default=4, help="Batch size (per device) for sampling images."
-    )
-    parser.add_argument("--num_train_epochs", type=int, default=1)
-    parser.add_argument(
-        "--max_train_steps",
-        type=int,
-        default=None,
-        help="Total number of training steps to perform.  If provided, overrides num_train_epochs.",
-    )
-    parser.add_argument(
-        "--checkpointing_steps",
-        type=int,
-        default=500,
-        help=(
-            "Save a checkpoint of the training state every X updates. These checkpoints can be used both as final"
-            " checkpoints in case they are better than the last checkpoint, and are also suitable for resuming"
-            " training using `--resume_from_checkpoint`."
-        ),
-    )
-    parser.add_argument(
-        "--resume_from_checkpoint",
-        type=str,
-        default=None,
-        help=(
-            "Whether training should be resumed from a previous checkpoint. Use a path saved by"
-            ' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.'
-        ),
-    )
-    parser.add_argument(
-        "--gradient_accumulation_steps",
-        type=int,
-        default=1,
-        help="Number of updates steps to accumulate before performing a backward/update pass.",
-    )
-    parser.add_argument(
-        "--gradient_checkpointing",
-        action="store_true",
-        help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.",
-    )
-    parser.add_argument(
-        "--learning_rate",
-        type=float,
-        default=5e-6,
-        help="Initial learning rate (after the potential warmup period) to use.",
-    )
-    parser.add_argument(
-        "--scale_lr",
-        action="store_true",
-        default=False,
-        help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.",
-    )
-    parser.add_argument(
-        "--lr_scheduler",
-        type=str,
-        default="constant",
-        help=(
-            'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
-            ' "constant", "constant_with_warmup"]'
-        ),
-    )
-    parser.add_argument(
-        "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler."
-    )
-    parser.add_argument(
-        "--lr_num_cycles",
-        type=int,
-        default=1,
-        help="Number of hard resets of the lr in cosine_with_restarts scheduler.",
-    )
-    parser.add_argument("--lr_power", type=float, default=1.0, help="Power factor of the polynomial scheduler.")
-    parser.add_argument(
-        "--use_8bit_adam", action="store_true", help="Whether or not to use 8-bit Adam from bitsandbytes."
-    )
-    parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.")
-    parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.")
-    parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.")
-    parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer")
-    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
-    parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
-    parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.")
-    parser.add_argument(
-        "--hub_model_id",
-        type=str,
-        default=None,
-        help="The name of the repository to keep in sync with the local `output_dir`.",
-    )
-    parser.add_argument(
-        "--logging_dir",
-        type=str,
-        default="logs",
-        help=(
-            "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
-            " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."
-        ),
-    )
-    parser.add_argument(
-        "--allow_tf32",
-        action="store_true",
-        help=(
-            "Whether or not to allow TF32 on Ampere GPUs. Can be used to speed up training. For more information, see"
-            " https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices"
-        ),
-    )
-    parser.add_argument(
-        "--report_to",
-        type=str,
-        default="tensorboard",
-        help=(
-            'The integration to report the results and logs to. Supported platforms are `"tensorboard"`'
-            ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.'
-        ),
-    )
-    parser.add_argument(
-        "--mixed_precision",
-        type=str,
-        default=None,
-        choices=["no", "fp16", "bf16"],
-        help=(
-            "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >="
-            " 1.10.and an Nvidia Ampere GPU.  Default to the value of accelerate config of the current system or the"
-            " flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config."
-        ),
-    )
-    parser.add_argument(
-        "--prior_generation_precision",
-        type=str,
-        default=None,
-        choices=["no", "fp32", "fp16", "bf16"],
-        help=(
-            "Choose prior generation precision between fp32, fp16 and bf16 (bfloat16). Bf16 requires PyTorch >="
-            " 1.10.and an Nvidia Ampere GPU.  Default to  fp16 if a GPU is available else fp32."
-        ),
-    )
-    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
-    parser.add_argument(
-        "--enable_xformers_memory_efficient_attention", action="store_true", help="Whether or not to use xformers."
-    )
-
-    if input_args is not None:
-        args = parser.parse_args(input_args)
-    else:
-        args = parser.parse_args()
-
-    env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
-    if env_local_rank != -1 and env_local_rank != args.local_rank:
-        args.local_rank = env_local_rank
-
-    if args.with_prior_preservation:
-        if args.class_data_dir is None:
-            raise ValueError("You must specify a data directory for class images.")
-        if args.class_prompt is None:
-            raise ValueError("You must specify prompt for class images.")
-    else:
-        # logger is not available yet
-        if args.class_data_dir is not None:
-            warnings.warn("You need not use --class_data_dir without --with_prior_preservation.")
-        if args.class_prompt is not None:
-            warnings.warn("You need not use --class_prompt without --with_prior_preservation.")
-
-    return args
-
-
-class DreamBoothDataset(Dataset):
-    """
-    A dataset to prepare the instance and class images with the prompts for fine-tuning the model.
-    It pre-processes the images and the tokenizes prompts.
-    """
-
-    def __init__(
-        self,
-        instance_data_root,
-        instance_prompt,
-        tokenizer,
-        class_data_root=None,
-        class_prompt=None,
-        size=512,
-        center_crop=False,
-    ):
-        self.size = size
-        self.center_crop = center_crop
-        self.tokenizer = tokenizer
-
-        self.instance_data_root = []
-        self.instance_images_path = []
-        self.num_instance_images = []
-        self.instance_prompt = []
-        self.class_data_root = []
-        self.class_images_path = []
-        self.num_class_images = []
-        self.class_prompt = []
-        self._length = 0
-
-        for i in range(len(instance_data_root)):
-            self.instance_data_root.append(Path(instance_data_root[i]))
-            if not self.instance_data_root[i].exists():
-                raise ValueError("Instance images root doesn't exists.")
-
-            self.instance_images_path.append(list(Path(instance_data_root[i]).iterdir()))
-            self.num_instance_images.append(len(self.instance_images_path[i]))
-            self.instance_prompt.append(instance_prompt[i])
-            self._length += self.num_instance_images[i]
-
-            if class_data_root is not None:
-                self.class_data_root.append(Path(class_data_root[i]))
-                self.class_data_root[i].mkdir(parents=True, exist_ok=True)
-                self.class_images_path.append(list(self.class_data_root[i].iterdir()))
-                self.num_class_images.append(len(self.class_images_path))
-                if self.num_class_images[i] > self.num_instance_images[i]:
-                    self._length -= self.num_instance_images[i]
-                    self._length += self.num_class_images[i]
-                self.class_prompt.append(class_prompt[i])
-            else:
-                self.class_data_root = None
-
-        self.image_transforms = transforms.Compose(
-            [
-                transforms.Resize(size, interpolation=transforms.InterpolationMode.BILINEAR),
-                transforms.CenterCrop(size) if center_crop else transforms.RandomCrop(size),
-                transforms.ToTensor(),
-                transforms.Normalize([0.5], [0.5]),
-            ]
-        )
-
-    def __len__(self):
-        return self._length
-
-    def __getitem__(self, index):
-        example = {}
-        for i in range(len(self.instance_images_path)):
-            instance_image = Image.open(self.instance_images_path[i][index % self.num_instance_images[i]])
-            if not instance_image.mode == "RGB":
-                instance_image = instance_image.convert("RGB")
-            example[f"instance_images_{i}"] = self.image_transforms(instance_image)
-            example[f"instance_prompt_ids_{i}"] = self.tokenizer(
-                self.instance_prompt[i],
-                truncation=True,
-                padding="max_length",
-                max_length=self.tokenizer.model_max_length,
-                return_tensors="pt",
-            ).input_ids
-
-        if self.class_data_root:
-            for i in range(len(self.class_data_root)):
-                class_image = Image.open(self.class_images_path[i][index % self.num_class_images[i]])
-                if not class_image.mode == "RGB":
-                    class_image = class_image.convert("RGB")
-                example[f"class_images_{i}"] = self.image_transforms(class_image)
-                example[f"class_prompt_ids_{i}"] = self.tokenizer(
-                    self.class_prompt[i],
-                    truncation=True,
-                    padding="max_length",
-                    max_length=self.tokenizer.model_max_length,
-                    return_tensors="pt",
-                ).input_ids
-
-        return example
-
-
-def collate_fn(num_instances, examples, with_prior_preservation=False):
-    input_ids = []
-    pixel_values = []
-
-    for i in range(num_instances):
-        input_ids += [example[f"instance_prompt_ids_{i}"] for example in examples]
-        pixel_values += [example[f"instance_images_{i}"] for example in examples]
-
-    # Concat class and instance examples for prior preservation.
-    # We do this to avoid doing two forward passes.
-    if with_prior_preservation:
-        for i in range(num_instances):
-            input_ids += [example[f"class_prompt_ids_{i}"] for example in examples]
-            pixel_values += [example[f"class_images_{i}"] for example in examples]
-
-    pixel_values = torch.stack(pixel_values)
-    pixel_values = pixel_values.to(memory_format=torch.contiguous_format).float()
-
-    input_ids = torch.cat(input_ids, dim=0)
-
-    batch = {
-        "input_ids": input_ids,
-        "pixel_values": pixel_values,
-    }
-    return batch
-
-
-class PromptDataset(Dataset):
-    "A simple dataset to prepare the prompts to generate class images on multiple GPUs."
-
-    def __init__(self, prompt, num_samples):
-        self.prompt = prompt
-        self.num_samples = num_samples
-
-    def __len__(self):
-        return self.num_samples
-
-    def __getitem__(self, index):
-        example = {}
-        example["prompt"] = self.prompt
-        example["index"] = index
-        return example
-
-
-def get_full_repo_name(model_id: str, organization: Optional[str] = None, token: Optional[str] = None):
-    if token is None:
-        token = HfFolder.get_token()
-    if organization is None:
-        username = whoami(token)["name"]
-        return f"{username}/{model_id}"
-    else:
-        return f"{organization}/{model_id}"
-
-
-def main(args):
-    logging_dir = Path(args.output_dir, args.logging_dir)
-
-    accelerator = Accelerator(
-        gradient_accumulation_steps=args.gradient_accumulation_steps,
-        mixed_precision=args.mixed_precision,
-        log_with=args.report_to,
-        logging_dir=logging_dir,
-    )
-
-    # Currently, it's not possible to do gradient accumulation when training two models with accelerate.accumulate
-    # This will be enabled soon in accelerate. For now, we don't allow gradient accumulation when training two models.
-    # TODO (patil-suraj): Remove this check when gradient accumulation with two models is enabled in accelerate.
-    if args.train_text_encoder and args.gradient_accumulation_steps > 1 and accelerator.num_processes > 1:
-        raise ValueError(
-            "Gradient accumulation is not supported when training the text encoder in distributed training. "
-            "Please set gradient_accumulation_steps to 1. This feature will be supported in the future."
-        )
-
-    # Parse instance and class inputs, and double check that lengths match
-    instance_data_dir = args.instance_data_dir.split(",")
-    instance_prompt = args.instance_prompt.split(",")
-    assert all(
-        x == len(instance_data_dir) for x in [len(instance_data_dir), len(instance_prompt)]
-    ), "Instance data dir and prompt inputs are not of the same length."
-
-    if args.with_prior_preservation:
-        class_data_dir = args.class_data_dir.split(",")
-        class_prompt = args.class_prompt.split(",")
-        assert all(
-            x == len(instance_data_dir)
-            for x in [len(instance_data_dir), len(instance_prompt), len(class_data_dir), len(class_prompt)]
-        ), "Instance & class data dir or prompt inputs are not of the same length."
-    else:
-        class_data_dir = args.class_data_dir
-        class_prompt = args.class_prompt
-
-    # Make one log on every process with the configuration for debugging.
-    logging.basicConfig(
-        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
-        datefmt="%m/%d/%Y %H:%M:%S",
-        level=logging.INFO,
-    )
-    logger.info(accelerator.state, main_process_only=False)
-    if accelerator.is_local_main_process:
-        datasets.utils.logging.set_verbosity_warning()
-        transformers.utils.logging.set_verbosity_warning()
-        diffusers.utils.logging.set_verbosity_info()
-    else:
-        datasets.utils.logging.set_verbosity_error()
-        transformers.utils.logging.set_verbosity_error()
-        diffusers.utils.logging.set_verbosity_error()
-
-    # If passed along, set the training seed now.
-    if args.seed is not None:
-        set_seed(args.seed)
-
-    # Generate class images if prior preservation is enabled.
-    if args.with_prior_preservation:
-        for i in range(len(class_data_dir)):
-            class_images_dir = Path(class_data_dir[i])
-            if not class_images_dir.exists():
-                class_images_dir.mkdir(parents=True)
-            cur_class_images = len(list(class_images_dir.iterdir()))
-
-            if cur_class_images < args.num_class_images:
-                torch_dtype = torch.float16 if accelerator.device.type == "cuda" else torch.float32
-                if args.prior_generation_precision == "fp32":
-                    torch_dtype = torch.float32
-                elif args.prior_generation_precision == "fp16":
-                    torch_dtype = torch.float16
-                elif args.prior_generation_precision == "bf16":
-                    torch_dtype = torch.bfloat16
-                pipeline = DiffusionPipeline.from_pretrained(
-                    args.pretrained_model_name_or_path,
-                    torch_dtype=torch_dtype,
-                    safety_checker=None,
-                    revision=args.revision,
-                )
-                pipeline.set_progress_bar_config(disable=True)
-
-                num_new_images = args.num_class_images - cur_class_images
-                logger.info(f"Number of class images to sample: {num_new_images}.")
-
-                sample_dataset = PromptDataset(class_prompt[i], num_new_images)
-                sample_dataloader = torch.utils.data.DataLoader(sample_dataset, batch_size=args.sample_batch_size)
-
-                sample_dataloader = accelerator.prepare(sample_dataloader)
-                pipeline.to(accelerator.device)
-
-                for example in tqdm(
-                    sample_dataloader, desc="Generating class images", disable=not accelerator.is_local_main_process
-                ):
-                    images = pipeline(example["prompt"]).images
-
-                    for i, image in enumerate(images):
-                        hash_image = hashlib.sha1(image.tobytes()).hexdigest()
-                        image_filename = (
-                            class_images_dir / f"{example['index'][i] + cur_class_images}-{hash_image}.jpg"
-                        )
-                        image.save(image_filename)
-
-                del pipeline
-                if torch.cuda.is_available():
-                    torch.cuda.empty_cache()
-
-    # Handle the repository creation
-    if accelerator.is_main_process:
-        if args.push_to_hub:
-            if args.hub_model_id is None:
-                repo_name = get_full_repo_name(Path(args.output_dir).name, token=args.hub_token)
-            else:
-                repo_name = args.hub_model_id
-            create_repo(repo_name, exist_ok=True, token=args.hub_token)
-            repo = Repository(args.output_dir, clone_from=repo_name, token=args.hub_token)
-
-            with open(os.path.join(args.output_dir, ".gitignore"), "w+") as gitignore:
-                if "step_*" not in gitignore:
-                    gitignore.write("step_*\n")
-                if "epoch_*" not in gitignore:
-                    gitignore.write("epoch_*\n")
-        elif args.output_dir is not None:
-            os.makedirs(args.output_dir, exist_ok=True)
-
-    # Load the tokenizer
-    if args.tokenizer_name:
-        tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name, revision=args.revision, use_fast=False)
-    elif args.pretrained_model_name_or_path:
-        tokenizer = AutoTokenizer.from_pretrained(
-            args.pretrained_model_name_or_path,
-            subfolder="tokenizer",
-            revision=args.revision,
-            use_fast=False,
-        )
-
-    # import correct text encoder class
-    text_encoder_cls = import_model_class_from_model_name_or_path(args.pretrained_model_name_or_path, args.revision)
-
-    # Load scheduler and models
-    noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")
-    text_encoder = text_encoder_cls.from_pretrained(
-        args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision
-    )
-    vae = AutoencoderKL.from_pretrained(args.pretrained_model_name_or_path, subfolder="vae", revision=args.revision)
-    unet = UNet2DConditionModel.from_pretrained(
-        args.pretrained_model_name_or_path, subfolder="unet", revision=args.revision
-    )
-
-    vae.requires_grad_(False)
-    if not args.train_text_encoder:
-        text_encoder.requires_grad_(False)
-
-    if args.enable_xformers_memory_efficient_attention:
-        if is_xformers_available():
-            unet.enable_xformers_memory_efficient_attention()
-        else:
-            raise ValueError("xformers is not available. Make sure it is installed correctly")
-
-    if args.gradient_checkpointing:
-        unet.enable_gradient_checkpointing()
-        if args.train_text_encoder:
-            text_encoder.gradient_checkpointing_enable()
-
-    # Enable TF32 for faster training on Ampere GPUs,
-    # cf https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices
-    if args.allow_tf32:
-        torch.backends.cuda.matmul.allow_tf32 = True
-
-    if args.scale_lr:
-        args.learning_rate = (
-            args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * accelerator.num_processes
-        )
-
-    # Use 8-bit Adam for lower memory usage or to fine-tune the model in 16GB GPUs
-    if args.use_8bit_adam:
-        try:
-            import bitsandbytes as bnb
-        except ImportError:
-            raise ImportError(
-                "To use 8-bit Adam, please install the bitsandbytes library: `pip install bitsandbytes`."
-            )
-
-        optimizer_class = bnb.optim.AdamW8bit
-    else:
-        optimizer_class = torch.optim.AdamW
-
-    # Optimizer creation
-    params_to_optimize = (
-        itertools.chain(unet.parameters(), text_encoder.parameters()) if args.train_text_encoder else unet.parameters()
-    )
-    optimizer = optimizer_class(
-        params_to_optimize,
-        lr=args.learning_rate,
-        betas=(args.adam_beta1, args.adam_beta2),
-        weight_decay=args.adam_weight_decay,
-        eps=args.adam_epsilon,
-    )
-
-    # Dataset and DataLoaders creation:
-    train_dataset = DreamBoothDataset(
-        instance_data_root=instance_data_dir,
-        instance_prompt=instance_prompt,
-        class_data_root=class_data_dir if args.with_prior_preservation else None,
-        class_prompt=class_prompt,
-        tokenizer=tokenizer,
-        size=args.resolution,
-        center_crop=args.center_crop,
-    )
-
-    train_dataloader = torch.utils.data.DataLoader(
-        train_dataset,
-        batch_size=args.train_batch_size,
-        shuffle=True,
-        collate_fn=lambda examples: collate_fn(len(instance_data_dir), examples, args.with_prior_preservation),
-        num_workers=1,
-    )
-
-    # Scheduler and math around the number of training steps.
-    overrode_max_train_steps = False
-    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
-    if args.max_train_steps is None:
-        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
-        overrode_max_train_steps = True
-
-    lr_scheduler = get_scheduler(
-        args.lr_scheduler,
-        optimizer=optimizer,
-        num_warmup_steps=args.lr_warmup_steps * args.gradient_accumulation_steps,
-        num_training_steps=args.max_train_steps * args.gradient_accumulation_steps,
-        num_cycles=args.lr_num_cycles,
-        power=args.lr_power,
-    )
-
-    # Prepare everything with our `accelerator`.
-    if args.train_text_encoder:
-        unet, text_encoder, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
-            unet, text_encoder, optimizer, train_dataloader, lr_scheduler
-        )
-    else:
-        unet, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
-            unet, optimizer, train_dataloader, lr_scheduler
-        )
-
-    # For mixed precision training we cast the text_encoder and vae weights to half-precision
-    # as these models are only used for inference, keeping weights in full precision is not required.
-    weight_dtype = torch.float32
-    if accelerator.mixed_precision == "fp16":
-        weight_dtype = torch.float16
-    elif accelerator.mixed_precision == "bf16":
-        weight_dtype = torch.bfloat16
-
-    # Move vae and text_encoder to device and cast to weight_dtype
-    vae.to(accelerator.device, dtype=weight_dtype)
-    if not args.train_text_encoder:
-        text_encoder.to(accelerator.device, dtype=weight_dtype)
-
-    # We need to recalculate our total training steps as the size of the training dataloader may have changed.
-    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
-    if overrode_max_train_steps:
-        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
-    # Afterwards we recalculate our number of training epochs
-    args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
-
-    # We need to initialize the trackers we use, and also store our configuration.
-    # The trackers initializes automatically on the main process.
-    if accelerator.is_main_process:
-        accelerator.init_trackers("dreambooth", config=vars(args))
-
-    # Train!
-    total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
-
-    logger.info("***** Running training *****")
-    logger.info(f"  Num examples = {len(train_dataset)}")
-    logger.info(f"  Num batches each epoch = {len(train_dataloader)}")
-    logger.info(f"  Num Epochs = {args.num_train_epochs}")
-    logger.info(f"  Instantaneous batch size per device = {args.train_batch_size}")
-    logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
-    logger.info(f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
-    logger.info(f"  Total optimization steps = {args.max_train_steps}")
-    global_step = 0
-    first_epoch = 0
-
-    # Potentially load in the weights and states from a previous save
-    if args.resume_from_checkpoint:
-        if args.resume_from_checkpoint != "latest":
-            path = os.path.basename(args.resume_from_checkpoint)
-        else:
-            # Get the mos recent checkpoint
-            dirs = os.listdir(args.output_dir)
-            dirs = [d for d in dirs if d.startswith("checkpoint")]
-            dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
-            path = dirs[-1] if len(dirs) > 0 else None
-
-        if path is None:
-            accelerator.print(
-                f"Checkpoint '{args.resume_from_checkpoint}' does not exist. Starting a new training run."
-            )
-            args.resume_from_checkpoint = None
-        else:
-            accelerator.print(f"Resuming from checkpoint {path}")
-            accelerator.load_state(os.path.join(args.output_dir, path))
-            global_step = int(path.split("-")[1])
-
-            resume_global_step = global_step * args.gradient_accumulation_steps
-            first_epoch = global_step // num_update_steps_per_epoch
-            resume_step = resume_global_step % (num_update_steps_per_epoch * args.gradient_accumulation_steps)
-
-    # Only show the progress bar once on each machine.
-    progress_bar = tqdm(range(global_step, args.max_train_steps), disable=not accelerator.is_local_main_process)
-    progress_bar.set_description("Steps")
-
-    for epoch in range(first_epoch, args.num_train_epochs):
-        unet.train()
-        if args.train_text_encoder:
-            text_encoder.train()
-        for step, batch in enumerate(train_dataloader):
-            # Skip steps until we reach the resumed step
-            if args.resume_from_checkpoint and epoch == first_epoch and step < resume_step:
-                if step % args.gradient_accumulation_steps == 0:
-                    progress_bar.update(1)
-                continue
-
-            with accelerator.accumulate(unet):
-                # Convert images to latent space
-                latents = vae.encode(batch["pixel_values"].to(dtype=weight_dtype)).latent_dist.sample()
-                latents = latents * 0.18215
-
-                # Sample noise that we'll add to the latents
-                noise = torch.randn_like(latents)
-                bsz = latents.shape[0]
-                # Sample a random timestep for each image
-                timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps, (bsz,), device=latents.device)
-                timesteps = timesteps.long()
-
-                # Add noise to the latents according to the noise magnitude at each timestep
-                # (this is the forward diffusion process)
-                noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)
-
-                # Get the text embedding for conditioning
-                encoder_hidden_states = text_encoder(batch["input_ids"])[0]
-
-                # Predict the noise residual
-                model_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample
-
-                # Get the target for loss depending on the prediction type
-                if noise_scheduler.config.prediction_type == "epsilon":
-                    target = noise
-                elif noise_scheduler.config.prediction_type == "v_prediction":
-                    target = noise_scheduler.get_velocity(latents, noise, timesteps)
-                else:
-                    raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}")
-
-                if args.with_prior_preservation:
-                    # Chunk the noise and model_pred into two parts and compute the loss on each part separately.
-                    model_pred, model_pred_prior = torch.chunk(model_pred, 2, dim=0)
-                    target, target_prior = torch.chunk(target, 2, dim=0)
-
-                    # Compute instance loss
-                    loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean")
-
-                    # Compute prior loss
-                    prior_loss = F.mse_loss(model_pred_prior.float(), target_prior.float(), reduction="mean")
-
-                    # Add the prior loss to the instance loss.
-                    loss = loss + args.prior_loss_weight * prior_loss
-                else:
-                    loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean")
-
-                accelerator.backward(loss)
-                if accelerator.sync_gradients:
-                    params_to_clip = (
-                        itertools.chain(unet.parameters(), text_encoder.parameters())
-                        if args.train_text_encoder
-                        else unet.parameters()
-                    )
-                    accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm)
-                optimizer.step()
-                lr_scheduler.step()
-                optimizer.zero_grad()
-
-            # Checks if the accelerator has performed an optimization step behind the scenes
-            if accelerator.sync_gradients:
-                progress_bar.update(1)
-                global_step += 1
-
-                if global_step % args.checkpointing_steps == 0:
-                    if accelerator.is_main_process:
-                        save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}")
-                        accelerator.save_state(save_path)
-                        logger.info(f"Saved state to {save_path}")
-
-            logs = {"loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]}
-            progress_bar.set_postfix(**logs)
-            accelerator.log(logs, step=global_step)
-
-            if global_step >= args.max_train_steps:
-                break
-
-    # Create the pipeline using using the trained modules and save it.
-    accelerator.wait_for_everyone()
-    if accelerator.is_main_process:
-        pipeline = DiffusionPipeline.from_pretrained(
-            args.pretrained_model_name_or_path,
-            unet=accelerator.unwrap_model(unet),
-            text_encoder=accelerator.unwrap_model(text_encoder),
-            revision=args.revision,
-        )
-        pipeline.save_pretrained(args.output_dir)
-
-        if args.push_to_hub:
-            repo.push_to_hub(commit_message="End of training", blocking=False, auto_lfs_prune=True)
-
-    accelerator.end_training()
-
-
-if __name__ == "__main__":
-    args = parse_args()
-    main(args)
--- a/examples/test_examples.py
+++ b/examples/test_examples.py
@@ -102,7 +102,7 @@ class ExamplesTestsAccelerate(unittest.TestCase):
            test_args = f"""
                examples/textual_inversion/textual_inversion.py
                --pretrained_model_name_or_path runwayml/stable-diffusion-v1-5
-                --train_data_dir docs/source/en/imgs
+                --train_data_dir docs/source/imgs
                --learnable_property object
                --placeholder_token <cat-toy>
                --initializer_token toy
--- a/examples/text_to_image/README.md
+++ b/examples/text_to_image/README.md
@@ -110,88 +110,13 @@ image = pipe(prompt="yoda").images[0]
 image.save("yoda-pokemon.png")
 ```

-## Training with LoRA

-Low-Rank Adaption of Large Language Models was first introduced by Microsoft in [LoRA: Low-Rank Adaptation of Large Language Models](https://arxiv.org/abs/2106.09685) by *Edward J. Hu, Yelong Shen, Phillip Wallis, Zeyuan Allen-Zhu, Yuanzhi Li, Shean Wang, Lu Wang, Weizhu Chen*.
-
-In a nutshell, LoRA allows adapting pretrained models by adding pairs of rank-decomposition matrices to existing weights and **only** training those newly added weights. This has a couple of advantages:
-
- Previous pretrained weights are kept frozen so that model is not prone to [catastrophic forgetting](https://www.pnas.org/doi/10.1073/pnas.1611835114).
- Rank-decomposition matrices have significantly fewer parameters than original model, which means that trained LoRA weights are easily portable.
- LoRA attention layers allow to control to which extent the model is adapted toward new training images via a `scale` parameter.
-
-[cloneofsimo](https://github.com/cloneofsimo) was the first to try out LoRA training for Stable Diffusion in the popular [lora](https://github.com/cloneofsimo/lora) GitHub repository.
-
-With LoRA, it's possible to fine-tune Stable Diffusion on a custom image-caption pair dataset
-on consumer GPUs like Tesla T4, Tesla V100.
-
-### Training
-
-First, you need to set up your development environment as is explained in the [installation section](#installing-the-dependencies). Make sure to set the `MODEL_NAME` and `DATASET_NAME` environment variables. Here, we will use [Stable Diffusion v1-4](https://hf.co/CompVis/stable-diffusion-v1-4) and the [Pokemons dataset](https://hf.colambdalabs/pokemon-blip-captions).  
-
-**___Note: Change the `resolution` to 768 if you are using the [stable-diffusion-2](https://huggingface.co/stabilityai/stable-diffusion-2) 768x768 model.___**
-
-**___Note: It is quite useful to monitor the training progress by regularly generating sample images during training. [Weights and Biases](https://docs.wandb.ai/quickstart) is a nice solution to easily see generating images during training. All you need to do is to run `pip install wandb` before training to automatically log images.___**
-
-```bash
-export MODEL_NAME="CompVis/stable-diffusion-v1-4"
-export DATASET_NAME="lambdalabs/pokemon-blip-captions"
-```
-
-For this example we want to directly store the trained LoRA embeddings on the Hub, so 
-we need to be logged in and add the `--push_to_hub` flag.
-
-```bash
-huggingface-cli login
-```
-
-Now we can start training!
-
-```bash
-accelerate --mixed_precision="fp16" launch train_text_to_image_lora.py \
-  --pretrained_model_name_or_path=$MODEL_NAME \
-  --dataset_name=$DATASET_NAME --caption_column="text" \
-  --resolution=512 --random_flip \
-  --train_batch_size=1 \
-  --num_train_epochs=100 --checkpointing_steps=5000 \
-  --learning_rate=1e-04 --lr_scheduler="constant" --lr_warmup_steps=0 \
-  --seed=42 \
-  --output_dir="sd-pokemon-model-lora" \
-  --save_sample_prompt="cute dragon creature" --report_to="wandb"
-```
-
-The above command will also run inference as fine-tuning progresses and log the results to Weights and Biases.
-
-**___Note: When using LoRA we can use a much higher learning rate compared to non-LoRA fine-tuning. Here we use *1e-4* instead of the usual *1e-5*. Also, by using LoRA, it's possible to run `train_text_to_image_lora.py` in consumer GPUs like T4 or V100.___**
-
-The final LoRA embedding weights have been uploaded to [sayakpaul/sd-model-finetuned-lora-t4](https://huggingface.co/sayakpaul/sd-model-finetuned-lora-t4). **___Note: [The final weights](https://huggingface.co/sayakpaul/sd-model-finetuned-lora-t4/blob/main/pytorch_lora_weights.bin) are only 3 MB in size, which is orders of magnitudes smaller than the original model.___**
-
-You can check some inference samples that were logged during the course of the fine-tuning process [here](https://wandb.ai/sayakpaul/text2image-fine-tune/runs/q4lc0xsw). 
-
-### Inference
-
-Once you have trained a model using above command, the inference can be done simply using the `StableDiffusionPipeline` after loading the trained LoRA weights.  You 
-need to pass the `output_dir` for loading the LoRA weights which, in this case, is `sd-pokemon-model-lora`.
-
-```python
-from diffusers import StableDiffusionPipeline
-import torch
-
-model_path = "sayakpaul/sd-model-finetuned-lora-t4"
-pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16)
-pipe.unet.load_attn_procs(model_path)
-pipe.to("cuda")
-
-prompt = "A pokemon with green eyes and red legs."
-image = pipe(prompt, num_inference_steps=30, guidance_scale=7.5).images[0]
-image.save("pokemon.png")
-```

 ## Training with Flax/JAX

 For faster training on TPUs and GPUs you can leverage the flax training example. Follow the instructions above to get the model and dataset before running the script.

-**___Note: The flax example doesn't yet support features like gradient checkpoint, gradient accumulation etc, so to use flax for faster training we will need >30GB cards or TPU v3.___**
+____Note: The flax example don't yet support features like gradient checkpoint, gradient accumulation etc, so to use flax for faster training we will need >30GB cards.___


 Before running the scripts, make sure to install the library's training dependencies:
@@ -216,6 +141,7 @@ python train_text_to_image_flax.py \
  --output_dir="sd-pokemon-model" 
 ```

+
 To run on your own training files prepare the dataset according to the format required by `datasets`, you can find the instructions for how to do that in this [document](https://huggingface.co/docs/datasets/v2.4.0/en/image_load#imagefolder-with-metadata).
 If you wish to use custom loading logic, you should modify the script, we have left pointers for that in the training script.

--- a/examples/text_to_image/requirements.txt
+++ b/examples/text_to_image/requirements.txt
@@ -4,4 +4,4 @@ transformers>=4.25.1
 datasets
 ftfy
 tensorboard
-Jinja2
+modelcards
--- a/examples/text_to_image/requirements_flax.txt
+++ b/examples/text_to_image/requirements_flax.txt
@@ -6,4 +6,4 @@ torch
 torchvision
 ftfy
 tensorboard
-Jinja2
+modelcards
--- a/examples/text_to_image/train_text_to_image.py
+++ b/examples/text_to_image/train_text_to_image.py
@@ -1,25 +1,11 @@
-#!/usr/bin/env python
-# coding=utf-8
-# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-
 import argparse
+import copy
 import logging
 import math
 import os
 import random
 from pathlib import Path
-from typing import Optional
+from typing import Iterable, Optional

 import numpy as np
 import torch
@@ -35,17 +21,16 @@ from accelerate.utils import set_seed
 from datasets import load_dataset
 from diffusers import AutoencoderKL, DDPMScheduler, StableDiffusionPipeline, UNet2DConditionModel
 from diffusers.optimization import get_scheduler
-from diffusers.training_utils import EMAModel
 from diffusers.utils import check_min_version
 from diffusers.utils.import_utils import is_xformers_available
-from huggingface_hub import HfFolder, Repository, create_repo, whoami
+from huggingface_hub import HfFolder, Repository, whoami
 from torchvision import transforms
 from tqdm.auto import tqdm
 from transformers import CLIPTextModel, CLIPTokenizer


 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.12.0")
+check_min_version("0.10.0.dev0")

 logger = get_logger(__name__, log_level="INFO")

@@ -134,12 +119,8 @@ def parse_args():
    )
    parser.add_argument(
        "--center_crop",
-        default=False,
        action="store_true",
-        help=(
-            "Whether to center crop the input images to the resolution. If not set, the images will be randomly"
-            " cropped. The images will be resized to the resolution first before cropping."
-        ),
+        help="Whether to center crop images before resizing to resolution (if not set, random crop will be used)",
    )
    parser.add_argument(
        "--random_flip",
@@ -213,14 +194,6 @@ def parse_args():
            " remote repository specified with --pretrained_model_name_or_path."
        ),
    )
-    parser.add_argument(
-        "--dataloader_num_workers",
-        type=int,
-        default=0,
-        help=(
-            "Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process."
-        ),
-    )
    parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.")
    parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.")
    parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.")
@@ -317,6 +290,115 @@ dataset_name_mapping = {
 }


+# Adapted from torch-ema https://github.com/fadel/pytorch_ema/blob/master/torch_ema/ema.py#L14
+class EMAModel:
+    """
+    Exponential Moving Average of models weights
+    """
+
+    def __init__(self, parameters: Iterable[torch.nn.Parameter], decay=0.9999):
+        parameters = list(parameters)
+        self.shadow_params = [p.clone().detach() for p in parameters]
+
+        self.collected_params = None
+
+        self.decay = decay
+        self.optimization_step = 0
+
+    @torch.no_grad()
+    def step(self, parameters):
+        parameters = list(parameters)
+
+        self.optimization_step += 1
+
+        # Compute the decay factor for the exponential moving average.
+        value = (1 + self.optimization_step) / (10 + self.optimization_step)
+        one_minus_decay = 1 - min(self.decay, value)
+
+        for s_param, param in zip(self.shadow_params, parameters):
+            if param.requires_grad:
+                s_param.sub_(one_minus_decay * (s_param - param))
+            else:
+                s_param.copy_(param)
+
+        torch.cuda.empty_cache()
+
+    def copy_to(self, parameters: Iterable[torch.nn.Parameter]) -> None:
+        """
+        Copy current averaged parameters into given collection of parameters.
+
+        Args:
+            parameters: Iterable of `torch.nn.Parameter`; the parameters to be
+                updated with the stored moving averages. If `None`, the
+                parameters with which this `ExponentialMovingAverage` was
+                initialized will be used.
+        """
+        parameters = list(parameters)
+        for s_param, param in zip(self.shadow_params, parameters):
+            param.data.copy_(s_param.data)
+
+    def to(self, device=None, dtype=None) -> None:
+        r"""Move internal buffers of the ExponentialMovingAverage to `device`.
+
+        Args:
+            device: like `device` argument to `torch.Tensor.to`
+        """
+        # .to() on the tensors handles None correctly
+        self.shadow_params = [
+            p.to(device=device, dtype=dtype) if p.is_floating_point() else p.to(device=device)
+            for p in self.shadow_params
+        ]
+
+    def state_dict(self) -> dict:
+        r"""
+        Returns the state of the ExponentialMovingAverage as a dict.
+        This method is used by accelerate during checkpointing to save the ema state dict.
+        """
+        # Following PyTorch conventions, references to tensors are returned:
+        # "returns a reference to the state and not its copy!" -
+        # https://pytorch.org/tutorials/beginner/saving_loading_models.html#what-is-a-state-dict
+        return {
+            "decay": self.decay,
+            "optimization_step": self.optimization_step,
+            "shadow_params": self.shadow_params,
+            "collected_params": self.collected_params,
+        }
+
+    def load_state_dict(self, state_dict: dict) -> None:
+        r"""
+        Loads the ExponentialMovingAverage state.
+        This method is used by accelerate during checkpointing to save the ema state dict.
+        Args:
+            state_dict (dict): EMA state. Should be an object returned
+                from a call to :meth:`state_dict`.
+        """
+        # deepcopy, to be consistent with module API
+        state_dict = copy.deepcopy(state_dict)
+
+        self.decay = state_dict["decay"]
+        if self.decay < 0.0 or self.decay > 1.0:
+            raise ValueError("Decay must be between 0 and 1")
+
+        self.optimization_step = state_dict["optimization_step"]
+        if not isinstance(self.optimization_step, int):
+            raise ValueError("Invalid optimization_step")
+
+        self.shadow_params = state_dict["shadow_params"]
+        if not isinstance(self.shadow_params, list):
+            raise ValueError("shadow_params must be a list")
+        if not all(isinstance(p, torch.Tensor) for p in self.shadow_params):
+            raise ValueError("shadow_params must all be Tensors")
+
+        self.collected_params = state_dict["collected_params"]
+        if self.collected_params is not None:
+            if not isinstance(self.collected_params, list):
+                raise ValueError("collected_params must be a list")
+            if not all(isinstance(p, torch.Tensor) for p in self.collected_params):
+                raise ValueError("collected_params must all be Tensors")
+            if len(self.collected_params) != len(self.shadow_params):
+                raise ValueError("collected_params and shadow_params must have the same length")
+
+
 def main():
    args = parse_args()
    logging_dir = os.path.join(args.output_dir, args.logging_dir)
@@ -355,8 +437,7 @@ def main():
                repo_name = get_full_repo_name(Path(args.output_dir).name, token=args.hub_token)
            else:
                repo_name = args.hub_model_id
-            create_repo(repo_name, exist_ok=True, token=args.hub_token)
-            repo = Repository(args.output_dir, clone_from=repo_name, token=args.hub_token)
+            repo = Repository(args.output_dir, clone_from=repo_name)

            with open(os.path.join(args.output_dir, ".gitignore"), "w+") as gitignore:
                if "step_*" not in gitignore:
@@ -527,11 +608,7 @@ def main():

    # DataLoaders creation:
    train_dataloader = torch.utils.data.DataLoader(
-        train_dataset,
-        shuffle=True,
-        collate_fn=collate_fn,
-        batch_size=args.train_batch_size,
-        num_workers=args.dataloader_num_workers,
+        train_dataset, shuffle=True, collate_fn=collate_fn, batch_size=args.train_batch_size
    )

    # Scheduler and math around the number of training steps.
@@ -603,21 +680,13 @@ def main():
            dirs = os.listdir(args.output_dir)
            dirs = [d for d in dirs if d.startswith("checkpoint")]
            dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
-            path = dirs[-1] if len(dirs) > 0 else None
+            path = dirs[-1]
+        accelerator.print(f"Resuming from checkpoint {path}")
+        accelerator.load_state(os.path.join(args.output_dir, path))
+        global_step = int(path.split("-")[1])

-        if path is None:
-            accelerator.print(
-                f"Checkpoint '{args.resume_from_checkpoint}' does not exist. Starting a new training run."
-            )
-            args.resume_from_checkpoint = None
-        else:
-            accelerator.print(f"Resuming from checkpoint {path}")
-            accelerator.load_state(os.path.join(args.output_dir, path))
-            global_step = int(path.split("-")[1])
-
-            resume_global_step = global_step * args.gradient_accumulation_steps
-            first_epoch = global_step // num_update_steps_per_epoch
-            resume_step = resume_global_step % (num_update_steps_per_epoch * args.gradient_accumulation_steps)
+        first_epoch = global_step // num_update_steps_per_epoch
+        resume_step = global_step % num_update_steps_per_epoch

    # Only show the progress bar once on each machine.
    progress_bar = tqdm(range(global_step, args.max_train_steps), disable=not accelerator.is_local_main_process)
--- a/examples/text_to_image/train_text_to_image_flax.py
+++ b/examples/text_to_image/train_text_to_image_flax.py
@@ -27,14 +27,14 @@ from diffusers.utils import check_min_version
 from flax import jax_utils
 from flax.training import train_state
 from flax.training.common_utils import shard
-from huggingface_hub import HfFolder, Repository, create_repo, whoami
+from huggingface_hub import HfFolder, Repository, whoami
 from torchvision import transforms
 from tqdm.auto import tqdm
 from transformers import CLIPFeatureExtractor, CLIPTokenizer, FlaxCLIPTextModel, set_seed


 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.12.0")
+check_min_version("0.10.0.dev0")

 logger = logging.getLogger(__name__)

@@ -116,12 +116,8 @@ def parse_args():
    )
    parser.add_argument(
        "--center_crop",
-        default=False,
        action="store_true",
-        help=(
-            "Whether to center crop the input images to the resolution. If not set, the images will be randomly"
-            " cropped. The images will be resized to the resolution first before cropping."
-        ),
+        help="Whether to center crop images before resizing to resolution (if not set, random crop will be used)",
    )
    parser.add_argument(
        "--random_flip",
@@ -259,8 +255,7 @@ def main():
                repo_name = get_full_repo_name(Path(args.output_dir).name, token=args.hub_token)
            else:
                repo_name = args.hub_model_id
-            create_repo(repo_name, exist_ok=True, token=args.hub_token)
-            repo = Repository(args.output_dir, clone_from=repo_name, token=args.hub_token)
+            repo = Repository(args.output_dir, clone_from=repo_name)

            with open(os.path.join(args.output_dir, ".gitignore"), "w+") as gitignore:
                if "step_*" not in gitignore:
--- a/examples/text_to_image/train_text_to_image_lora.py
+++ b/examples/text_to_image/train_text_to_image_lora.py
@@ -1,842 +0,0 @@
-# coding=utf-8
-# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Fine-tuning script for Stable Diffusion for text2image with support for LoRA."""
-
-import argparse
-import logging
-import math
-import os
-import random
-from pathlib import Path
-from typing import Optional
-
-import numpy as np
-import torch
-import torch.nn.functional as F
-import torch.utils.checkpoint
-
-import datasets
-import diffusers
-import transformers
-from accelerate import Accelerator
-from accelerate.logging import get_logger
-from accelerate.utils import set_seed
-from datasets import load_dataset
-from diffusers import AutoencoderKL, DDPMScheduler, DiffusionPipeline, UNet2DConditionModel
-from diffusers.loaders import AttnProcsLayers
-from diffusers.models.cross_attention import LoRACrossAttnProcessor
-from diffusers.optimization import get_scheduler
-from diffusers.utils import check_min_version, is_wandb_available
-from diffusers.utils.import_utils import is_xformers_available
-from huggingface_hub import HfFolder, Repository, create_repo, whoami
-from torchvision import transforms
-from tqdm.auto import tqdm
-from transformers import CLIPTextModel, CLIPTokenizer
-
-
-# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.12.0")
-
-logger = get_logger(__name__, log_level="INFO")
-
-
-def save_model_card(repo_name, images=None, base_model=str, dataset_name=str, repo_folder=None):
-    img_str = ""
-    for i, image in enumerate(images):
-        image.save(os.path.join(repo_folder, f"image_{i}.png"))
-        img_str += f"![img_{i}](./image_{i}.png)\n"
-
-    yaml = f"""
---
-license: creativeml-openrail-m
-base_model: {base_model}
-tags:
- stable-diffusion
- stable-diffusion-diffusers
- text-to-image
- diffusers
- lora
-inference: true
---
-    """
-    model_card = f"""
-# LoRA text2image fine-tuning - {repo_name}
-These are LoRA adaption weights for {repo_name}. The weights were fine-tuned on the {dataset_name} dataset. You can find some example images in the following. \n
-{img_str}
-"""
-    with open(os.path.join(repo_folder, "README.md"), "w") as f:
-        f.write(yaml + model_card)
-
-
-def parse_args():
-    parser = argparse.ArgumentParser(description="Simple example of a training script.")
-    parser.add_argument(
-        "--pretrained_model_name_or_path",
-        type=str,
-        default=None,
-        required=True,
-        help="Path to pretrained model or model identifier from huggingface.co/models.",
-    )
-    parser.add_argument(
-        "--revision",
-        type=str,
-        default=None,
-        required=False,
-        help="Revision of pretrained model identifier from huggingface.co/models.",
-    )
-    parser.add_argument(
-        "--dataset_name",
-        type=str,
-        default=None,
-        help=(
-            "The name of the Dataset (from the HuggingFace hub) to train on (could be your own, possibly private,"
-            " dataset). It can also be a path pointing to a local copy of a dataset in your filesystem,"
-            " or to a folder containing files that 🤗 Datasets can understand."
-        ),
-    )
-    parser.add_argument(
-        "--dataset_config_name",
-        type=str,
-        default=None,
-        help="The config of the Dataset, leave as None if there's only one config.",
-    )
-    parser.add_argument(
-        "--train_data_dir",
-        type=str,
-        default=None,
-        help=(
-            "A folder containing the training data. Folder contents must follow the structure described in"
-            " https://huggingface.co/docs/datasets/image_dataset#imagefolder. In particular, a `metadata.jsonl` file"
-            " must exist to provide the captions for the images. Ignored if `dataset_name` is specified."
-        ),
-    )
-    parser.add_argument(
-        "--image_column", type=str, default="image", help="The column of the dataset containing an image."
-    )
-    parser.add_argument(
-        "--caption_column",
-        type=str,
-        default="text",
-        help="The column of the dataset containing a caption or a list of captions.",
-    )
-    parser.add_argument(
-        "--validation_prompt", type=str, default=None, help="A prompt that is sampled during training for inference."
-    )
-    parser.add_argument(
-        "--num_validation_images",
-        type=int,
-        default=4,
-        help="Number of images that should be generated during validation with `validation_prompt`.",
-    )
-    parser.add_argument(
-        "--validation_epochs",
-        type=int,
-        default=1,
-        help=(
-            "Run fine-tuning validation every X epochs. The validation process consists of running the prompt"
-            " `args.validation_prompt` multiple times: `args.num_validation_images`."
-        ),
-    )
-    parser.add_argument(
-        "--max_train_samples",
-        type=int,
-        default=None,
-        help=(
-            "For debugging purposes or quicker training, truncate the number of training examples to this "
-            "value if set."
-        ),
-    )
-    parser.add_argument(
-        "--output_dir",
-        type=str,
-        default="sd-model-finetuned-lora",
-        help="The output directory where the model predictions and checkpoints will be written.",
-    )
-    parser.add_argument(
-        "--cache_dir",
-        type=str,
-        default=None,
-        help="The directory where the downloaded models and datasets will be stored.",
-    )
-    parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
-    parser.add_argument(
-        "--resolution",
-        type=int,
-        default=512,
-        help=(
-            "The resolution for input images, all the images in the train/validation dataset will be resized to this"
-            " resolution"
-        ),
-    )
-    parser.add_argument(
-        "--center_crop",
-        default=False,
-        action="store_true",
-        help=(
-            "Whether to center crop the input images to the resolution. If not set, the images will be randomly"
-            " cropped. The images will be resized to the resolution first before cropping."
-        ),
-    )
-    parser.add_argument(
-        "--random_flip",
-        action="store_true",
-        help="whether to randomly flip images horizontally",
-    )
-    parser.add_argument(
-        "--train_batch_size", type=int, default=16, help="Batch size (per device) for the training dataloader."
-    )
-    parser.add_argument("--num_train_epochs", type=int, default=100)
-    parser.add_argument(
-        "--max_train_steps",
-        type=int,
-        default=None,
-        help="Total number of training steps to perform.  If provided, overrides num_train_epochs.",
-    )
-    parser.add_argument(
-        "--gradient_accumulation_steps",
-        type=int,
-        default=1,
-        help="Number of updates steps to accumulate before performing a backward/update pass.",
-    )
-    parser.add_argument(
-        "--gradient_checkpointing",
-        action="store_true",
-        help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.",
-    )
-    parser.add_argument(
-        "--learning_rate",
-        type=float,
-        default=1e-4,
-        help="Initial learning rate (after the potential warmup period) to use.",
-    )
-    parser.add_argument(
-        "--scale_lr",
-        action="store_true",
-        default=False,
-        help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.",
-    )
-    parser.add_argument(
-        "--lr_scheduler",
-        type=str,
-        default="constant",
-        help=(
-            'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
-            ' "constant", "constant_with_warmup"]'
-        ),
-    )
-    parser.add_argument(
-        "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler."
-    )
-    parser.add_argument(
-        "--use_8bit_adam", action="store_true", help="Whether or not to use 8-bit Adam from bitsandbytes."
-    )
-    parser.add_argument(
-        "--allow_tf32",
-        action="store_true",
-        help=(
-            "Whether or not to allow TF32 on Ampere GPUs. Can be used to speed up training. For more information, see"
-            " https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices"
-        ),
-    )
-    parser.add_argument(
-        "--dataloader_num_workers",
-        type=int,
-        default=0,
-        help=(
-            "Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process."
-        ),
-    )
-    parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.")
-    parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.")
-    parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.")
-    parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer")
-    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
-    parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
-    parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.")
-    parser.add_argument(
-        "--hub_model_id",
-        type=str,
-        default=None,
-        help="The name of the repository to keep in sync with the local `output_dir`.",
-    )
-    parser.add_argument(
-        "--logging_dir",
-        type=str,
-        default="logs",
-        help=(
-            "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
-            " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."
-        ),
-    )
-    parser.add_argument(
-        "--mixed_precision",
-        type=str,
-        default=None,
-        choices=["no", "fp16", "bf16"],
-        help=(
-            "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >="
-            " 1.10.and an Nvidia Ampere GPU.  Default to the value of accelerate config of the current system or the"
-            " flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config."
-        ),
-    )
-    parser.add_argument(
-        "--report_to",
-        type=str,
-        default="tensorboard",
-        help=(
-            'The integration to report the results and logs to. Supported platforms are `"tensorboard"`'
-            ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.'
-        ),
-    )
-    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
-    parser.add_argument(
-        "--checkpointing_steps",
-        type=int,
-        default=500,
-        help=(
-            "Save a checkpoint of the training state every X updates. These checkpoints are only suitable for resuming"
-            " training using `--resume_from_checkpoint`."
-        ),
-    )
-    parser.add_argument(
-        "--resume_from_checkpoint",
-        type=str,
-        default=None,
-        help=(
-            "Whether training should be resumed from a previous checkpoint. Use a path saved by"
-            ' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.'
-        ),
-    )
-    parser.add_argument(
-        "--enable_xformers_memory_efficient_attention", action="store_true", help="Whether or not to use xformers."
-    )
-
-    args = parser.parse_args()
-    env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
-    if env_local_rank != -1 and env_local_rank != args.local_rank:
-        args.local_rank = env_local_rank
-
-    # Sanity checks
-    if args.dataset_name is None and args.train_data_dir is None:
-        raise ValueError("Need either a dataset name or a training folder.")
-
-    return args
-
-
-def get_full_repo_name(model_id: str, organization: Optional[str] = None, token: Optional[str] = None):
-    if token is None:
-        token = HfFolder.get_token()
-    if organization is None:
-        username = whoami(token)["name"]
-        return f"{username}/{model_id}"
-    else:
-        return f"{organization}/{model_id}"
-
-
-DATASET_NAME_MAPPING = {
-    "lambdalabs/pokemon-blip-captions": ("image", "text"),
-}
-
-
-def main():
-    args = parse_args()
-    logging_dir = os.path.join(args.output_dir, args.logging_dir)
-
-    accelerator = Accelerator(
-        gradient_accumulation_steps=args.gradient_accumulation_steps,
-        mixed_precision=args.mixed_precision,
-        log_with=args.report_to,
-        logging_dir=logging_dir,
-    )
-    if args.report_to == "wandb":
-        if not is_wandb_available():
-            raise ImportError("Make sure to install wandb if you want to use it for logging during training.")
-        import wandb
-
-    # Make one log on every process with the configuration for debugging.
-    logging.basicConfig(
-        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
-        datefmt="%m/%d/%Y %H:%M:%S",
-        level=logging.INFO,
-    )
-    logger.info(accelerator.state, main_process_only=False)
-    if accelerator.is_local_main_process:
-        datasets.utils.logging.set_verbosity_warning()
-        transformers.utils.logging.set_verbosity_warning()
-        diffusers.utils.logging.set_verbosity_info()
-    else:
-        datasets.utils.logging.set_verbosity_error()
-        transformers.utils.logging.set_verbosity_error()
-        diffusers.utils.logging.set_verbosity_error()
-
-    # If passed along, set the training seed now.
-    if args.seed is not None:
-        set_seed(args.seed)
-
-    # Handle the repository creation
-    if accelerator.is_main_process:
-        if args.push_to_hub:
-            if args.hub_model_id is None:
-                repo_name = get_full_repo_name(Path(args.output_dir).name, token=args.hub_token)
-            else:
-                repo_name = args.hub_model_id
-            repo_name = create_repo(repo_name, exist_ok=True)
-            repo = Repository(args.output_dir, clone_from=repo_name)
-
-            with open(os.path.join(args.output_dir, ".gitignore"), "w+") as gitignore:
-                if "step_*" not in gitignore:
-                    gitignore.write("step_*\n")
-                if "epoch_*" not in gitignore:
-                    gitignore.write("epoch_*\n")
-        elif args.output_dir is not None:
-            os.makedirs(args.output_dir, exist_ok=True)
-
-    # Load scheduler, tokenizer and models.
-    noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")
-    tokenizer = CLIPTokenizer.from_pretrained(
-        args.pretrained_model_name_or_path, subfolder="tokenizer", revision=args.revision
-    )
-    text_encoder = CLIPTextModel.from_pretrained(
-        args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision
-    )
-    vae = AutoencoderKL.from_pretrained(args.pretrained_model_name_or_path, subfolder="vae", revision=args.revision)
-    unet = UNet2DConditionModel.from_pretrained(
-        args.pretrained_model_name_or_path, subfolder="unet", revision=args.revision
-    )
-
-    # For mixed precision training we cast the text_encoder and vae weights to half-precision
-    # as these models are only used for inference, keeping weights in full precision is not required.
-    weight_dtype = torch.float32
-    if accelerator.mixed_precision == "fp16":
-        weight_dtype = torch.float16
-    elif accelerator.mixed_precision == "bf16":
-        weight_dtype = torch.bfloat16
-
-    # Move unet, vae and text_encoder to device and cast to weight_dtype
-    unet.to(accelerator.device, dtype=weight_dtype)
-    vae.to(accelerator.device, dtype=weight_dtype)
-    text_encoder.to(accelerator.device, dtype=weight_dtype)
-
-    if args.enable_xformers_memory_efficient_attention:
-        if is_xformers_available():
-            unet.enable_xformers_memory_efficient_attention()
-        else:
-            raise ValueError("xformers is not available. Make sure it is installed correctly")
-
-    # now we will add new LoRA weights to the attention layers
-    # It's important to realize here how many attention weights will be added and of which sizes
-    # The sizes of the attention layers consist only of two different variables:
-    # 1) - the "hidden_size", which is increased according to `unet.config.block_out_channels`.
-    # 2) - the "cross attention size", which is set to `unet.config.cross_attention_dim`.
-
-    # Let's first see how many attention processors we will have to set.
-    # For Stable Diffusion, it should be equal to:
-    # - down blocks (2x attention layers) * (2x transformer layers) * (3x down blocks) = 12
-    # - mid blocks (2x attention layers) * (1x transformer layers) * (1x mid blocks) = 2
-    # - up blocks (2x attention layers) * (3x transformer layers) * (3x down blocks) = 18
-    # => 32 layers
-
-    # Set correct lora layers
-    lora_attn_procs = {}
-    for name in unet.attn_processors.keys():
-        cross_attention_dim = None if name.endswith("attn1.processor") else unet.config.cross_attention_dim
-        if name.startswith("mid_block"):
-            hidden_size = unet.config.block_out_channels[-1]
-        elif name.startswith("up_blocks"):
-            block_id = int(name[len("up_blocks.")])
-            hidden_size = list(reversed(unet.config.block_out_channels))[block_id]
-        elif name.startswith("down_blocks"):
-            block_id = int(name[len("down_blocks.")])
-            hidden_size = unet.config.block_out_channels[block_id]
-
-        lora_attn_procs[name] = LoRACrossAttnProcessor(
-            hidden_size=hidden_size, cross_attention_dim=cross_attention_dim
-        )
-
-    unet.set_attn_processor(lora_attn_procs)
-    lora_layers = AttnProcsLayers(unet.attn_processors)
-
-    # Enable TF32 for faster training on Ampere GPUs,
-    # cf https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices
-    if args.allow_tf32:
-        torch.backends.cuda.matmul.allow_tf32 = True
-
-    if args.scale_lr:
-        args.learning_rate = (
-            args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * accelerator.num_processes
-        )
-
-    # Initialize the optimizer
-    if args.use_8bit_adam:
-        try:
-            import bitsandbytes as bnb
-        except ImportError:
-            raise ImportError(
-                "Please install bitsandbytes to use 8-bit Adam. You can do so by running `pip install bitsandbytes`"
-            )
-
-        optimizer_cls = bnb.optim.AdamW8bit
-    else:
-        optimizer_cls = torch.optim.AdamW
-
-    optimizer = optimizer_cls(
-        lora_layers.parameters(),
-        lr=args.learning_rate,
-        betas=(args.adam_beta1, args.adam_beta2),
-        weight_decay=args.adam_weight_decay,
-        eps=args.adam_epsilon,
-    )
-
-    # Get the datasets: you can either provide your own training and evaluation files (see below)
-    # or specify a Dataset from the hub (the dataset will be downloaded automatically from the datasets Hub).
-
-    # In distributed training, the load_dataset function guarantees that only one local process can concurrently
-    # download the dataset.
-    if args.dataset_name is not None:
-        # Downloading and loading a dataset from the hub.
-        dataset = load_dataset(
-            args.dataset_name,
-            args.dataset_config_name,
-            cache_dir=args.cache_dir,
-        )
-    else:
-        data_files = {}
-        if args.train_data_dir is not None:
-            data_files["train"] = os.path.join(args.train_data_dir, "**")
-        dataset = load_dataset(
-            "imagefolder",
-            data_files=data_files,
-            cache_dir=args.cache_dir,
-        )
-        # See more about loading custom images at
-        # https://huggingface.co/docs/datasets/v2.4.0/en/image_load#imagefolder
-
-    # Preprocessing the datasets.
-    # We need to tokenize inputs and targets.
-    column_names = dataset["train"].column_names
-
-    # 6. Get the column names for input/target.
-    dataset_columns = DATASET_NAME_MAPPING.get(args.dataset_name, None)
-    if args.image_column is None:
-        image_column = dataset_columns[0] if dataset_columns is not None else column_names[0]
-    else:
-        image_column = args.image_column
-        if image_column not in column_names:
-            raise ValueError(
-                f"--image_column' value '{args.image_column}' needs to be one of: {', '.join(column_names)}"
-            )
-    if args.caption_column is None:
-        caption_column = dataset_columns[1] if dataset_columns is not None else column_names[1]
-    else:
-        caption_column = args.caption_column
-        if caption_column not in column_names:
-            raise ValueError(
-                f"--caption_column' value '{args.caption_column}' needs to be one of: {', '.join(column_names)}"
-            )
-
-    # Preprocessing the datasets.
-    # We need to tokenize input captions and transform the images.
-    def tokenize_captions(examples, is_train=True):
-        captions = []
-        for caption in examples[caption_column]:
-            if isinstance(caption, str):
-                captions.append(caption)
-            elif isinstance(caption, (list, np.ndarray)):
-                # take a random caption if there are multiple
-                captions.append(random.choice(caption) if is_train else caption[0])
-            else:
-                raise ValueError(
-                    f"Caption column `{caption_column}` should contain either strings or lists of strings."
-                )
-        inputs = tokenizer(
-            captions, max_length=tokenizer.model_max_length, padding="max_length", truncation=True, return_tensors="pt"
-        )
-        return inputs.input_ids
-
-    # Preprocessing the datasets.
-    train_transforms = transforms.Compose(
-        [
-            transforms.Resize(args.resolution, interpolation=transforms.InterpolationMode.BILINEAR),
-            transforms.CenterCrop(args.resolution) if args.center_crop else transforms.RandomCrop(args.resolution),
-            transforms.RandomHorizontalFlip() if args.random_flip else transforms.Lambda(lambda x: x),
-            transforms.ToTensor(),
-            transforms.Normalize([0.5], [0.5]),
-        ]
-    )
-
-    def preprocess_train(examples):
-        images = [image.convert("RGB") for image in examples[image_column]]
-        examples["pixel_values"] = [train_transforms(image) for image in images]
-        examples["input_ids"] = tokenize_captions(examples)
-        return examples
-
-    with accelerator.main_process_first():
-        if args.max_train_samples is not None:
-            dataset["train"] = dataset["train"].shuffle(seed=args.seed).select(range(args.max_train_samples))
-        # Set the training transforms
-        train_dataset = dataset["train"].with_transform(preprocess_train)
-
-    def collate_fn(examples):
-        pixel_values = torch.stack([example["pixel_values"] for example in examples])
-        pixel_values = pixel_values.to(memory_format=torch.contiguous_format).float()
-        input_ids = torch.stack([example["input_ids"] for example in examples])
-        return {"pixel_values": pixel_values, "input_ids": input_ids}
-
-    # DataLoaders creation:
-    train_dataloader = torch.utils.data.DataLoader(
-        train_dataset,
-        shuffle=True,
-        collate_fn=collate_fn,
-        batch_size=args.train_batch_size,
-        num_workers=args.dataloader_num_workers,
-    )
-
-    # Scheduler and math around the number of training steps.
-    overrode_max_train_steps = False
-    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
-    if args.max_train_steps is None:
-        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
-        overrode_max_train_steps = True
-
-    lr_scheduler = get_scheduler(
-        args.lr_scheduler,
-        optimizer=optimizer,
-        num_warmup_steps=args.lr_warmup_steps * args.gradient_accumulation_steps,
-        num_training_steps=args.max_train_steps * args.gradient_accumulation_steps,
-    )
-
-    # Prepare everything with our `accelerator`.
-    lora_layers, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
-        lora_layers, optimizer, train_dataloader, lr_scheduler
-    )
-
-    # We need to recalculate our total training steps as the size of the training dataloader may have changed.
-    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
-    if overrode_max_train_steps:
-        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
-    # Afterwards we recalculate our number of training epochs
-    args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
-
-    # We need to initialize the trackers we use, and also store our configuration.
-    # The trackers initializes automatically on the main process.
-    if accelerator.is_main_process:
-        accelerator.init_trackers("text2image-fine-tune", config=vars(args))
-
-    # Train!
-    total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
-
-    logger.info("***** Running training *****")
-    logger.info(f"  Num examples = {len(train_dataset)}")
-    logger.info(f"  Num Epochs = {args.num_train_epochs}")
-    logger.info(f"  Instantaneous batch size per device = {args.train_batch_size}")
-    logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
-    logger.info(f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
-    logger.info(f"  Total optimization steps = {args.max_train_steps}")
-    global_step = 0
-    first_epoch = 0
-
-    # Potentially load in the weights and states from a previous save
-    if args.resume_from_checkpoint:
-        if args.resume_from_checkpoint != "latest":
-            path = os.path.basename(args.resume_from_checkpoint)
-        else:
-            # Get the most recent checkpoint
-            dirs = os.listdir(args.output_dir)
-            dirs = [d for d in dirs if d.startswith("checkpoint")]
-            dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
-            path = dirs[-1] if len(dirs) > 0 else None
-
-        if path is None:
-            accelerator.print(
-                f"Checkpoint '{args.resume_from_checkpoint}' does not exist. Starting a new training run."
-            )
-            args.resume_from_checkpoint = None
-        else:
-            accelerator.print(f"Resuming from checkpoint {path}")
-            accelerator.load_state(os.path.join(args.output_dir, path))
-            global_step = int(path.split("-")[1])
-
-            resume_global_step = global_step * args.gradient_accumulation_steps
-            first_epoch = global_step // num_update_steps_per_epoch
-            resume_step = resume_global_step % (num_update_steps_per_epoch * args.gradient_accumulation_steps)
-
-    # Only show the progress bar once on each machine.
-    progress_bar = tqdm(range(global_step, args.max_train_steps), disable=not accelerator.is_local_main_process)
-    progress_bar.set_description("Steps")
-
-    for epoch in range(first_epoch, args.num_train_epochs):
-        unet.train()
-        train_loss = 0.0
-        for step, batch in enumerate(train_dataloader):
-            # Skip steps until we reach the resumed step
-            if args.resume_from_checkpoint and epoch == first_epoch and step < resume_step:
-                if step % args.gradient_accumulation_steps == 0:
-                    progress_bar.update(1)
-                continue
-
-            with accelerator.accumulate(unet):
-                # Convert images to latent space
-                latents = vae.encode(batch["pixel_values"].to(dtype=weight_dtype)).latent_dist.sample()
-                latents = latents * 0.18215
-
-                # Sample noise that we'll add to the latents
-                noise = torch.randn_like(latents)
-                bsz = latents.shape[0]
-                # Sample a random timestep for each image
-                timesteps = torch.randint(0, noise_scheduler.num_train_timesteps, (bsz,), device=latents.device)
-                timesteps = timesteps.long()
-
-                # Add noise to the latents according to the noise magnitude at each timestep
-                # (this is the forward diffusion process)
-                noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)
-
-                # Get the text embedding for conditioning
-                encoder_hidden_states = text_encoder(batch["input_ids"])[0]
-
-                # Get the target for loss depending on the prediction type
-                if noise_scheduler.config.prediction_type == "epsilon":
-                    target = noise
-                elif noise_scheduler.config.prediction_type == "v_prediction":
-                    target = noise_scheduler.get_velocity(latents, noise, timesteps)
-                else:
-                    raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}")
-
-                # Predict the noise residual and compute loss
-                model_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample
-                loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean")
-
-                # Gather the losses across all processes for logging (if we use distributed training).
-                avg_loss = accelerator.gather(loss.repeat(args.train_batch_size)).mean()
-                train_loss += avg_loss.item() / args.gradient_accumulation_steps
-
-                # Backpropagate
-                accelerator.backward(loss)
-                if accelerator.sync_gradients:
-                    params_to_clip = lora_layers.parameters()
-                    accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm)
-                optimizer.step()
-                lr_scheduler.step()
-                optimizer.zero_grad()
-
-            # Checks if the accelerator has performed an optimization step behind the scenes
-            if accelerator.sync_gradients:
-                progress_bar.update(1)
-                global_step += 1
-                accelerator.log({"train_loss": train_loss}, step=global_step)
-                train_loss = 0.0
-
-                if global_step % args.checkpointing_steps == 0:
-                    if accelerator.is_main_process:
-                        save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}")
-                        accelerator.save_state(save_path)
-                        logger.info(f"Saved state to {save_path}")
-
-            logs = {"step_loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]}
-            progress_bar.set_postfix(**logs)
-
-            if global_step >= args.max_train_steps:
-                break
-
-        if args.validation_prompt is not None and epoch % args.validation_epochs == 0:
-            logger.info(
-                f"Running validation... \n Generating {args.num_validation_images} images with prompt:"
-                f" {args.validation_prompt}."
-            )
-            # create pipeline
-            pipeline = DiffusionPipeline.from_pretrained(
-                args.pretrained_model_name_or_path,
-                unet=accelerator.unwrap_model(unet),
-                revision=args.revision,
-                torch_dtype=weight_dtype,
-            )
-            pipeline = pipeline.to(accelerator.device)
-            pipeline.set_progress_bar_config(disable=True)
-
-            # run inference
-            generator = torch.Generator(device=accelerator.device).manual_seed(args.seed)
-            images = []
-            for _ in range(args.num_validation_images):
-                images.append(pipeline(args.validation_prompt, num_inference_steps=30, generator=generator).images[0])
-
-            if accelerator.is_main_process:
-                for tracker in accelerator.trackers:
-                    if tracker.name == "tensorboard":
-                        np_images = np.stack([np.asarray(img) for img in images])
-                        tracker.writer.add_images("validation", np_images, epoch, dataformats="NHWC")
-                    if tracker.name == "wandb":
-                        tracker.log(
-                            {
-                                "validation": [
-                                    wandb.Image(image, caption=f"{i}: {args.validation_prompt}")
-                                    for i, image in enumerate(images)
-                                ]
-                            }
-                        )
-
-            del pipeline
-            torch.cuda.empty_cache()
-
-    # Save the lora layers
-    accelerator.wait_for_everyone()
-    if accelerator.is_main_process:
-        unet = unet.to(torch.float32)
-        unet.save_attn_procs(args.output_dir)
-
-        if args.push_to_hub:
-            save_model_card(
-                repo_name,
-                images=images,
-                base_model=args.pretrained_model_name_or_path,
-                dataset_name=args.dataset_name,
-                repo_folder=args.output_dir,
-            )
-            repo.push_to_hub(commit_message="End of training", blocking=False, auto_lfs_prune=True)
-
-    # Final inference
-    # Load previous pipeline
-    pipeline = DiffusionPipeline.from_pretrained(
-        args.pretrained_model_name_or_path, revision=args.revision, torch_dtype=weight_dtype
-    )
-    pipeline = pipeline.to(accelerator.device)
-
-    # load attention processors
-    pipeline.unet.load_attn_procs(args.output_dir)
-
-    # run inference
-    generator = torch.Generator(device=accelerator.device).manual_seed(args.seed)
-    images = []
-    for _ in range(args.num_validation_images):
-        images.append(pipeline(args.validation_prompt, num_inference_steps=30, generator=generator).images[0])
-
-    if accelerator.is_main_process:
-        for tracker in accelerator.trackers:
-            if tracker.name == "tensorboard":
-                np_images = np.stack([np.asarray(img) for img in images])
-                tracker.writer.add_images("test", np_images, epoch, dataformats="NHWC")
-            if tracker.name == "wandb":
-                tracker.log(
-                    {
-                        "test": [
-                            wandb.Image(image, caption=f"{i}: {args.validation_prompt}")
-                            for i, image in enumerate(images)
-                        ]
-                    }
-                )
-
-    accelerator.end_training()
-
-
-if __name__ == "__main__":
-    main()
--- a/examples/textual_inversion/requirements.txt
+++ b/examples/textual_inversion/requirements.txt
@@ -3,4 +3,4 @@ torchvision
 transformers>=4.25.1
 ftfy
 tensorboard
-Jinja2
+modelcards
--- a/examples/textual_inversion/requirements_flax.txt
+++ b/examples/textual_inversion/requirements_flax.txt
@@ -5,4 +5,4 @@ torch
 torchvision
 ftfy
 tensorboard
-Jinja2
+modelcards
--- a/examples/textual_inversion/textual_inversion.py
+++ b/examples/textual_inversion/textual_inversion.py
@@ -1,18 +1,3 @@
-#!/usr/bin/env python
-# coding=utf-8
-# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-
 import argparse
 import logging
 import math
@@ -38,7 +23,7 @@ from diffusers import AutoencoderKL, DDPMScheduler, StableDiffusionPipeline, UNe
 from diffusers.optimization import get_scheduler
 from diffusers.utils import check_min_version
 from diffusers.utils.import_utils import is_xformers_available
-from huggingface_hub import HfFolder, Repository, create_repo, whoami
+from huggingface_hub import HfFolder, Repository, whoami

 # TODO: remove and import from diffusers.utils when the new version of diffusers is released
 from packaging import version
@@ -68,7 +53,8 @@ else:


 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.12.0")
+check_min_version("0.10.0.dev0")
+

 logger = get_logger(__name__)

@@ -146,7 +132,7 @@ def parse_args():
        ),
    )
    parser.add_argument(
-        "--center_crop", action="store_true", help="Whether to center crop images before resizing to resolution."
+        "--center_crop", action="store_true", help="Whether to center crop images before resizing to resolution"
    )
    parser.add_argument(
        "--train_batch_size", type=int, default=16, help="Batch size (per device) for the training dataloader."
@@ -193,14 +179,6 @@ def parse_args():
    parser.add_argument(
        "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler."
    )
-    parser.add_argument(
-        "--dataloader_num_workers",
-        type=int,
-        default=0,
-        help=(
-            "Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process."
-        ),
-    )
    parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.")
    parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.")
    parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.")
@@ -403,10 +381,7 @@ class TextualInversionDataset(Dataset):

        if self.center_crop:
            crop = min(img.shape[0], img.shape[1])
-            (
-                h,
-                w,
-            ) = (
+            (h, w,) = (
                img.shape[0],
                img.shape[1],
            )
@@ -471,8 +446,7 @@ def main():
                repo_name = get_full_repo_name(Path(args.output_dir).name, token=args.hub_token)
            else:
                repo_name = args.hub_model_id
-            create_repo(repo_name, exist_ok=True, token=args.hub_token)
-            repo = Repository(args.output_dir, clone_from=repo_name, token=args.hub_token)
+            repo = Repository(args.output_dir, clone_from=repo_name)

            with open(os.path.join(args.output_dir, ".gitignore"), "w+") as gitignore:
                if "step_*" not in gitignore:
@@ -573,9 +547,7 @@ def main():
        center_crop=args.center_crop,
        set="train",
    )
-    train_dataloader = torch.utils.data.DataLoader(
-        train_dataset, batch_size=args.train_batch_size, shuffle=True, num_workers=args.dataloader_num_workers
-    )
+    train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=args.train_batch_size, shuffle=True)

    # Scheduler and math around the number of training steps.
    overrode_max_train_steps = False
@@ -596,7 +568,7 @@ def main():
        text_encoder, optimizer, train_dataloader, lr_scheduler
    )

-    # For mixed precision training we cast the unet and vae weights to half-precision
+    # For mixed precision training we cast the text_encoder and vae weights to half-precision
    # as these models are only used for inference, keeping weights in full precision is not required.
    weight_dtype = torch.float32
    if accelerator.mixed_precision == "fp16":
@@ -642,21 +614,14 @@ def main():
            dirs = os.listdir(args.output_dir)
            dirs = [d for d in dirs if d.startswith("checkpoint")]
            dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
-            path = dirs[-1] if len(dirs) > 0 else None
+            path = dirs[-1]
+        accelerator.print(f"Resuming from checkpoint {path}")
+        accelerator.load_state(os.path.join(args.output_dir, path))
+        global_step = int(path.split("-")[1])

-        if path is None:
-            accelerator.print(
-                f"Checkpoint '{args.resume_from_checkpoint}' does not exist. Starting a new training run."
-            )
-            args.resume_from_checkpoint = None
-        else:
-            accelerator.print(f"Resuming from checkpoint {path}")
-            accelerator.load_state(os.path.join(args.output_dir, path))
-            global_step = int(path.split("-")[1])
-
-            resume_global_step = global_step * args.gradient_accumulation_steps
-            first_epoch = global_step // num_update_steps_per_epoch
-            resume_step = resume_global_step % (num_update_steps_per_epoch * args.gradient_accumulation_steps)
+        resume_global_step = global_step * args.gradient_accumulation_steps
+        first_epoch = resume_global_step // num_update_steps_per_epoch
+        resume_step = resume_global_step % num_update_steps_per_epoch

    # Only show the progress bar once on each machine.
    progress_bar = tqdm(range(global_step, args.max_train_steps), disable=not accelerator.is_local_main_process)
--- a/examples/textual_inversion/textual_inversion_flax.py
+++ b/examples/textual_inversion/textual_inversion_flax.py
@@ -28,7 +28,7 @@ from diffusers.utils import check_min_version
 from flax import jax_utils
 from flax.training import train_state
 from flax.training.common_utils import shard
-from huggingface_hub import HfFolder, Repository, create_repo, whoami
+from huggingface_hub import HfFolder, Repository, whoami

 # TODO: remove and import from diffusers.utils when the new version of diffusers is released
 from packaging import version
@@ -57,7 +57,7 @@ else:
 # ------------------------------------------------------------------------------

 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.12.0")
+check_min_version("0.10.0.dev0")

 logger = logging.getLogger(__name__)

@@ -109,7 +109,7 @@ def parse_args():
        ),
    )
    parser.add_argument(
-        "--center_crop", action="store_true", help="Whether to center crop images before resizing to resolution."
+        "--center_crop", action="store_true", help="Whether to center crop images before resizing to resolution"
    )
    parser.add_argument(
        "--train_batch_size", type=int, default=16, help="Batch size (per device) for the training dataloader."
@@ -306,10 +306,7 @@ class TextualInversionDataset(Dataset):

        if self.center_crop:
            crop = min(img.shape[0], img.shape[1])
-            (
-                h,
-                w,
-            ) = (
+            (h, w,) = (
                img.shape[0],
                img.shape[1],
            )
@@ -372,8 +369,7 @@ def main():
                repo_name = get_full_repo_name(Path(args.output_dir).name, token=args.hub_token)
            else:
                repo_name = args.hub_model_id
-            create_repo(repo_name, exist_ok=True, token=args.hub_token)
-            repo = Repository(args.output_dir, clone_from=repo_name, token=args.hub_token)
+            repo = Repository(args.output_dir, clone_from=repo_name)

            with open(os.path.join(args.output_dir, ".gitignore"), "w+") as gitignore:
                if "step_*" not in gitignore:
--- a/examples/unconditional_image_generation/README.md
+++ b/examples/unconditional_image_generation/README.md
@@ -39,7 +39,6 @@ accelerate launch train_unconditional.py \
  --train_batch_size=16 \
  --num_epochs=100 \
  --gradient_accumulation_steps=1 \
-  --use_ema \
  --learning_rate=1e-4 \
  --lr_warmup_steps=500 \
  --mixed_precision=no \
@@ -64,7 +63,6 @@ accelerate launch train_unconditional.py \
  --train_batch_size=16 \
  --num_epochs=100 \
  --gradient_accumulation_steps=1 \
-  --use_ema \
  --learning_rate=1e-4 \
  --lr_warmup_steps=500 \
  --mixed_precision=no \
@@ -152,7 +150,6 @@ accelerate launch train_unconditional_ort.py \
  --dataset_name="huggan/flowers-102-categories" \
  --resolution=64 \
  --output_dir="ddpm-ema-flowers-64" \
-  --use_ema \
  --train_batch_size=16 \
  --num_epochs=1 \
  --gradient_accumulation_steps=1 \
--- a/examples/unconditional_image_generation/train_unconditional.py
+++ b/examples/unconditional_image_generation/train_unconditional.py
@@ -1,7 +1,5 @@
 import argparse
-import copy
 import inspect
-import logging
 import math
 import os
 from pathlib import Path
@@ -10,8 +8,6 @@ from typing import Optional
 import torch
 import torch.nn.functional as F

-import datasets
-import diffusers
 from accelerate import Accelerator
 from accelerate.logging import get_logger
 from datasets import load_dataset
@@ -19,7 +15,7 @@ from diffusers import DDPMPipeline, DDPMScheduler, UNet2DModel
 from diffusers.optimization import get_scheduler
 from diffusers.training_utils import EMAModel
 from diffusers.utils import check_min_version
-from huggingface_hub import HfFolder, Repository, create_repo, whoami
+from huggingface_hub import HfFolder, Repository, whoami
 from torchvision.transforms import (
    CenterCrop,
    Compose,
@@ -33,9 +29,10 @@ from tqdm.auto import tqdm


 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.12.0")
+check_min_version("0.10.0.dev0")

-logger = get_logger(__name__, log_level="INFO")
+
+logger = get_logger(__name__)


 def _extract_into_tensor(arr, timesteps, broadcast_shape):
@@ -159,6 +156,7 @@ def parse_args():
    parser.add_argument(
        "--use_ema",
        action="store_true",
+        default=True,
        help="Whether to use Exponential Moving Average for the final model weights.",
    )
    parser.add_argument("--ema_inv_gamma", type=float, default=1.0, help="The inverse gamma value for the EMA decay.")
@@ -257,7 +255,6 @@ def get_full_repo_name(model_id: str, organization: Optional[str] = None, token:

 def main(args):
    logging_dir = os.path.join(args.output_dir, args.logging_dir)
-
    accelerator = Accelerator(
        gradient_accumulation_steps=args.gradient_accumulation_steps,
        mixed_precision=args.mixed_precision,
@@ -265,39 +262,6 @@ def main(args):
        logging_dir=logging_dir,
    )

-    # Make one log on every process with the configuration for debugging.
-    logging.basicConfig(
-        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
-        datefmt="%m/%d/%Y %H:%M:%S",
-        level=logging.INFO,
-    )
-    logger.info(accelerator.state, main_process_only=False)
-    if accelerator.is_local_main_process:
-        datasets.utils.logging.set_verbosity_warning()
-        diffusers.utils.logging.set_verbosity_info()
-    else:
-        datasets.utils.logging.set_verbosity_error()
-        diffusers.utils.logging.set_verbosity_error()
-
-    # Handle the repository creation
-    if accelerator.is_main_process:
-        if args.push_to_hub:
-            if args.hub_model_id is None:
-                repo_name = get_full_repo_name(Path(args.output_dir).name, token=args.hub_token)
-            else:
-                repo_name = args.hub_model_id
-            create_repo(repo_name, exist_ok=True, token=args.hub_token)
-            repo = Repository(args.output_dir, clone_from=repo_name, token=args.hub_token)
-
-            with open(os.path.join(args.output_dir, ".gitignore"), "w+") as gitignore:
-                if "step_*" not in gitignore:
-                    gitignore.write("step_*\n")
-                if "epoch_*" not in gitignore:
-                    gitignore.write("epoch_*\n")
-        elif args.output_dir is not None:
-            os.makedirs(args.output_dir, exist_ok=True)
-
-    # Initialize the model
    model = UNet2DModel(
        sample_size=args.resolution,
        in_channels=3,
@@ -321,19 +285,8 @@ def main(args):
            "UpBlock2D",
        ),
    )
-
-    # Create EMA for the model.
-    if args.use_ema:
-        ema_model = EMAModel(
-            model.parameters(),
-            decay=args.ema_max_decay,
-            use_ema_warmup=True,
-            inv_gamma=args.ema_inv_gamma,
-            power=args.ema_power,
-        )
-
-    # Initialize the scheduler
    accepts_prediction_type = "prediction_type" in set(inspect.signature(DDPMScheduler.__init__).parameters.keys())
+
    if accepts_prediction_type:
        noise_scheduler = DDPMScheduler(
            num_train_timesteps=args.ddpm_num_steps,
@@ -343,7 +296,6 @@ def main(args):
    else:
        noise_scheduler = DDPMScheduler(num_train_timesteps=args.ddpm_num_steps, beta_schedule=args.ddpm_beta_schedule)

-    # Initialize the optimizer
    optimizer = torch.optim.AdamW(
        model.parameters(),
        lr=args.learning_rate,
@@ -352,24 +304,6 @@ def main(args):
        eps=args.adam_epsilon,
    )

-    # Get the datasets: you can either provide your own training and evaluation files (see below)
-    # or specify a Dataset from the hub (the dataset will be downloaded automatically from the datasets Hub).
-
-    # In distributed training, the load_dataset function guarantees that only one local process can concurrently
-    # download the dataset.
-    if args.dataset_name is not None:
-        dataset = load_dataset(
-            args.dataset_name,
-            args.dataset_config_name,
-            cache_dir=args.cache_dir,
-            split="train",
-        )
-    else:
-        dataset = load_dataset("imagefolder", data_dir=args.train_data_dir, cache_dir=args.cache_dir, split="train")
-        # See more about loading custom images at
-        # https://huggingface.co/docs/datasets/v2.4.0/en/image_load#imagefolder
-
-    # Preprocessing the datasets and DataLoaders creation.
    augmentations = Compose(
        [
            Resize(args.resolution, interpolation=InterpolationMode.BILINEAR),
@@ -380,6 +314,16 @@ def main(args):
        ]
    )

+    if args.dataset_name is not None:
+        dataset = load_dataset(
+            args.dataset_name,
+            args.dataset_config_name,
+            cache_dir=args.cache_dir,
+            split="train",
+        )
+    else:
+        dataset = load_dataset("imagefolder", data_dir=args.train_data_dir, cache_dir=args.cache_dir, split="train")
+
    def transforms(examples):
        images = [augmentations(image.convert("RGB")) for image in examples["image"]]
        return {"input": images}
@@ -391,45 +335,51 @@ def main(args):
        dataset, batch_size=args.train_batch_size, shuffle=True, num_workers=args.dataloader_num_workers
    )

-    # Initialize the learning rate scheduler
    lr_scheduler = get_scheduler(
        args.lr_scheduler,
        optimizer=optimizer,
-        num_warmup_steps=args.lr_warmup_steps * args.gradient_accumulation_steps,
-        num_training_steps=(len(train_dataloader) * args.num_epochs),
+        num_warmup_steps=args.lr_warmup_steps,
+        num_training_steps=(len(train_dataloader) * args.num_epochs) // args.gradient_accumulation_steps,
    )

-    # Prepare everything with our `accelerator`.
    model, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
        model, optimizer, train_dataloader, lr_scheduler
    )
+    accelerator.register_for_checkpointing(lr_scheduler)

-    if args.use_ema:
-        accelerator.register_for_checkpointing(ema_model)
-        ema_model.to(accelerator.device)
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+
+    ema_model = EMAModel(
+        accelerator.unwrap_model(model),
+        inv_gamma=args.ema_inv_gamma,
+        power=args.ema_power,
+        max_value=args.ema_max_decay,
+    )
+
+    # Handle the repository creation
+    if accelerator.is_main_process:
+        if args.push_to_hub:
+            if args.hub_model_id is None:
+                repo_name = get_full_repo_name(Path(args.output_dir).name, token=args.hub_token)
+            else:
+                repo_name = args.hub_model_id
+            repo = Repository(args.output_dir, clone_from=repo_name)
+
+            with open(os.path.join(args.output_dir, ".gitignore"), "w+") as gitignore:
+                if "step_*" not in gitignore:
+                    gitignore.write("step_*\n")
+                if "epoch_*" not in gitignore:
+                    gitignore.write("epoch_*\n")
+        elif args.output_dir is not None:
+            os.makedirs(args.output_dir, exist_ok=True)

-    # We need to initialize the trackers we use, and also store our configuration.
-    # The trackers initializes automatically on the main process.
    if accelerator.is_main_process:
        run = os.path.split(__file__)[-1].split(".")[0]
        accelerator.init_trackers(run)

-    total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
-    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
-    max_train_steps = args.num_epochs * num_update_steps_per_epoch
-
-    logger.info("***** Running training *****")
-    logger.info(f"  Num examples = {len(dataset)}")
-    logger.info(f"  Num Epochs = {args.num_epochs}")
-    logger.info(f"  Instantaneous batch size per device = {args.train_batch_size}")
-    logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
-    logger.info(f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
-    logger.info(f"  Total optimization steps = {max_train_steps}")
-
    global_step = 0
    first_epoch = 0

-    # Potentially load in the weights and states from a previous save
    if args.resume_from_checkpoint:
        if args.resume_from_checkpoint != "latest":
            path = os.path.basename(args.resume_from_checkpoint)
@@ -438,23 +388,15 @@ def main(args):
            dirs = os.listdir(args.output_dir)
            dirs = [d for d in dirs if d.startswith("checkpoint")]
            dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
-            path = dirs[-1] if len(dirs) > 0 else None
+            path = dirs[-1]
+        accelerator.print(f"Resuming from checkpoint {path}")
+        accelerator.load_state(os.path.join(args.output_dir, path))
+        global_step = int(path.split("-")[1])

-        if path is None:
-            accelerator.print(
-                f"Checkpoint '{args.resume_from_checkpoint}' does not exist. Starting a new training run."
-            )
-            args.resume_from_checkpoint = None
-        else:
-            accelerator.print(f"Resuming from checkpoint {path}")
-            accelerator.load_state(os.path.join(args.output_dir, path))
-            global_step = int(path.split("-")[1])
+        resume_global_step = global_step * args.gradient_accumulation_steps
+        first_epoch = resume_global_step // num_update_steps_per_epoch
+        resume_step = resume_global_step % num_update_steps_per_epoch

-            resume_global_step = global_step * args.gradient_accumulation_steps
-            first_epoch = global_step // num_update_steps_per_epoch
-            resume_step = resume_global_step % (num_update_steps_per_epoch * args.gradient_accumulation_steps)
-
-    # Train!
    for epoch in range(first_epoch, args.num_epochs):
        model.train()
        progress_bar = tqdm(total=num_update_steps_per_epoch, disable=not accelerator.is_local_main_process)
@@ -503,12 +445,12 @@ def main(args):
                    accelerator.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()
                lr_scheduler.step()
+                if args.use_ema:
+                    ema_model.step(model)
                optimizer.zero_grad()

            # Checks if the accelerator has performed an optimization step behind the scenes
            if accelerator.sync_gradients:
-                if args.use_ema:
-                    ema_model.step(model.parameters())
                progress_bar.update(1)
                global_step += 1

@@ -530,11 +472,8 @@ def main(args):
        # Generate sample images for visual inspection
        if accelerator.is_main_process:
            if epoch % args.save_images_epochs == 0 or epoch == args.num_epochs - 1:
-                unet = copy.deepcopy(accelerator.unwrap_model(model))
-                if args.use_ema:
-                    ema_model.copy_to(unet.parameters())
                pipeline = DDPMPipeline(
-                    unet=unet,
+                    unet=accelerator.unwrap_model(ema_model.averaged_model if args.use_ema else model),
                    scheduler=noise_scheduler,
                )

@@ -559,6 +498,7 @@ def main(args):
                pipeline.save_pretrained(args.output_dir)
                if args.push_to_hub:
                    repo.push_to_hub(commit_message=f"Epoch {epoch}", blocking=False)
+        accelerator.wait_for_everyone()

    accelerator.end_training()

--- a/examples/unconditional_image_generation/train_unconditional_ort.py
+++ b/examples/unconditional_image_generation/train_unconditional_ort.py
@@ -15,7 +15,7 @@ from diffusers import DDPMPipeline, DDPMScheduler, UNet2DModel
 from diffusers.optimization import get_scheduler
 from diffusers.training_utils import EMAModel
 from diffusers.utils import check_min_version
-from huggingface_hub import HfFolder, Repository, create_repo, whoami
+from huggingface_hub import HfFolder, Repository, whoami
 from onnxruntime.training.ortmodule import ORTModule
 from torchvision.transforms import (
    CenterCrop,
@@ -30,7 +30,8 @@ from tqdm.auto import tqdm


 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.12.0")
+check_min_version("0.10.0.dev0")
+

 logger = get_logger(__name__)

@@ -156,6 +157,7 @@ def parse_args():
    parser.add_argument(
        "--use_ema",
        action="store_true",
+        default=True,
        help="Whether to use Exponential Moving Average for the final model weights.",
    )
    parser.add_argument("--ema_inv_gamma", type=float, default=1.0, help="The inverse gamma value for the EMA decay.")
@@ -285,17 +287,8 @@ def main(args):
            "UpBlock2D",
        ),
    )
-
-    if args.use_ema:
-        ema_model = EMAModel(
-            model.parameters(),
-            decay=args.ema_max_decay,
-            use_ema_warmup=True,
-            inv_gamma=args.ema_inv_gamma,
-            power=args.ema_power,
-        )
-
    accepts_prediction_type = "prediction_type" in set(inspect.signature(DDPMScheduler.__init__).parameters.keys())
+
    if accepts_prediction_type:
        noise_scheduler = DDPMScheduler(
            num_train_timesteps=args.ddpm_num_steps,
@@ -354,13 +347,17 @@ def main(args):
    model, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
        model, optimizer, train_dataloader, lr_scheduler
    )
-
-    if args.use_ema:
-        accelerator.register_for_checkpointing(ema_model)
-        ema_model.to(accelerator.device)
+    accelerator.register_for_checkpointing(lr_scheduler)

    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)

+    ema_model = EMAModel(
+        accelerator.unwrap_model(model),
+        inv_gamma=args.ema_inv_gamma,
+        power=args.ema_power,
+        max_value=args.ema_max_decay,
+    )
+
    model = ORTModule(model)

    # Handle the repository creation
@@ -370,8 +367,7 @@ def main(args):
                repo_name = get_full_repo_name(Path(args.output_dir).name, token=args.hub_token)
            else:
                repo_name = args.hub_model_id
-            create_repo(repo_name, exist_ok=True, token=args.hub_token)
-            repo = Repository(args.output_dir, clone_from=repo_name, token=args.hub_token)
+            repo = Repository(args.output_dir, clone_from=repo_name)

            with open(os.path.join(args.output_dir, ".gitignore"), "w+") as gitignore:
                if "step_*" not in gitignore:
@@ -395,21 +391,13 @@ def main(args):
            dirs = os.listdir(args.output_dir)
            dirs = [d for d in dirs if d.startswith("checkpoint")]
            dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
-            path = dirs[-1] if len(dirs) > 0 else None
-
-        if path is None:
-            accelerator.print(
-                f"Checkpoint '{args.resume_from_checkpoint}' does not exist. Starting a new training run."
-            )
-            args.resume_from_checkpoint = None
-        else:
-            accelerator.print(f"Resuming from checkpoint {path}")
-            accelerator.load_state(os.path.join(args.output_dir, path))
-            global_step = int(path.split("-")[1])
-
-            resume_global_step = global_step * args.gradient_accumulation_steps
-            first_epoch = global_step // num_update_steps_per_epoch
-            resume_step = resume_global_step % (num_update_steps_per_epoch * args.gradient_accumulation_steps)
+            path = dirs[-1]
+        accelerator.print(f"Resuming from checkpoint {path}")
+        accelerator.load_state(os.path.join(args.output_dir, path))
+        global_step = int(path.split("-")[1])
+        resume_global_step = global_step * args.gradient_accumulation_steps
+        first_epoch = resume_global_step // num_update_steps_per_epoch
+        resume_step = resume_global_step % num_update_steps_per_epoch

    for epoch in range(first_epoch, args.num_epochs):
        model.train()
@@ -460,7 +448,7 @@ def main(args):
                optimizer.step()
                lr_scheduler.step()
                if args.use_ema:
-                    ema_model.step(model.parameters())
+                    ema_model.step(model)
                optimizer.zero_grad()

            # Checks if the accelerator has performed an optimization step behind the scenes
--- a/scripts/convert_diffusers_to_original_stable_diffusion.py
+++ b/scripts/convert_diffusers_to_original_stable_diffusion.py
@@ -8,8 +8,6 @@ import re

 import torch

-from safetensors.torch import load_file, save_file
-

 # =================#
 # UNet Conversion #
@@ -268,9 +266,6 @@ if __name__ == "__main__":
    parser.add_argument("--model_path", default=None, type=str, required=True, help="Path to the model to convert.")
    parser.add_argument("--checkpoint_path", default=None, type=str, required=True, help="Path to the output model.")
    parser.add_argument("--half", action="store_true", help="Save weights in half precision.")
-    parser.add_argument(
-        "--use_safetensors", action="store_true", help="Save weights use safetensors, default is ckpt."
-    )

    args = parser.parse_args()

@@ -278,38 +273,23 @@ if __name__ == "__main__":

    assert args.checkpoint_path is not None, "Must provide a checkpoint path!"

-    # Path for safetensors
-    unet_path = osp.join(args.model_path, "unet", "diffusion_pytorch_model.safetensors")
-    vae_path = osp.join(args.model_path, "vae", "diffusion_pytorch_model.safetensors")
-    text_enc_path = osp.join(args.model_path, "text_encoder", "model.safetensors")
-
-    # Load models from safetensors if it exists, if it doesn't pytorch
-    if osp.exists(unet_path):
-        unet_state_dict = load_file(unet_path, device="cpu")
-    else:
-        unet_path = osp.join(args.model_path, "unet", "diffusion_pytorch_model.bin")
-        unet_state_dict = torch.load(unet_path, map_location="cpu")
-
-    if osp.exists(vae_path):
-        vae_state_dict = load_file(vae_path, device="cpu")
-    else:
-        vae_path = osp.join(args.model_path, "vae", "diffusion_pytorch_model.bin")
-        vae_state_dict = torch.load(vae_path, map_location="cpu")
-
-    if osp.exists(text_enc_path):
-        text_enc_dict = load_file(text_enc_path, device="cpu")
-    else:
-        text_enc_path = osp.join(args.model_path, "text_encoder", "pytorch_model.bin")
-        text_enc_dict = torch.load(text_enc_path, map_location="cpu")
+    unet_path = osp.join(args.model_path, "unet", "diffusion_pytorch_model.bin")
+    vae_path = osp.join(args.model_path, "vae", "diffusion_pytorch_model.bin")
+    text_enc_path = osp.join(args.model_path, "text_encoder", "pytorch_model.bin")

    # Convert the UNet model
+    unet_state_dict = torch.load(unet_path, map_location="cpu")
    unet_state_dict = convert_unet_state_dict(unet_state_dict)
    unet_state_dict = {"model.diffusion_model." + k: v for k, v in unet_state_dict.items()}

    # Convert the VAE model
+    vae_state_dict = torch.load(vae_path, map_location="cpu")
    vae_state_dict = convert_vae_state_dict(vae_state_dict)
    vae_state_dict = {"first_stage_model." + k: v for k, v in vae_state_dict.items()}

+    # Convert the text encoder model
+    text_enc_dict = torch.load(text_enc_path, map_location="cpu")
+
    # Easiest way to identify v2.0 model seems to be that the text encoder (OpenCLIP) is deeper
    is_v20_model = "text_model.encoder.layers.22.layer_norm2.bias" in text_enc_dict

@@ -326,9 +306,5 @@ if __name__ == "__main__":
    state_dict = {**unet_state_dict, **vae_state_dict, **text_enc_dict}
    if args.half:
        state_dict = {k: v.half() for k, v in state_dict.items()}
-
-    if args.use_safetensors:
-        save_file(state_dict, args.checkpoint_path)
-    else:
-        state_dict = {"state_dict": state_dict}
-        torch.save(state_dict, args.checkpoint_path)
+    state_dict = {"state_dict": state_dict}
+    torch.save(state_dict, args.checkpoint_path)
--- a/scripts/convert_dit_to_diffusers.py
+++ b/scripts/convert_dit_to_diffusers.py
@@ -1,162 +0,0 @@
-import argparse
-import os
-
-import torch
-
-from diffusers import AutoencoderKL, DDIMScheduler, DiTPipeline, Transformer2DModel
-from torchvision.datasets.utils import download_url
-
-
-pretrained_models = {512: "DiT-XL-2-512x512.pt", 256: "DiT-XL-2-256x256.pt"}
-
-
-def download_model(model_name):
-    """
-    Downloads a pre-trained DiT model from the web.
-    """
-    local_path = f"pretrained_models/{model_name}"
-    if not os.path.isfile(local_path):
-        os.makedirs("pretrained_models", exist_ok=True)
-        web_path = f"https://dl.fbaipublicfiles.com/DiT/models/{model_name}"
-        download_url(web_path, "pretrained_models")
-    model = torch.load(local_path, map_location=lambda storage, loc: storage)
-    return model
-
-
-def main(args):
-    state_dict = download_model(pretrained_models[args.image_size])
-
-    state_dict["pos_embed.proj.weight"] = state_dict["x_embedder.proj.weight"]
-    state_dict["pos_embed.proj.bias"] = state_dict["x_embedder.proj.bias"]
-    state_dict.pop("x_embedder.proj.weight")
-    state_dict.pop("x_embedder.proj.bias")
-
-    for depth in range(28):
-        state_dict[f"transformer_blocks.{depth}.norm1.emb.timestep_embedder.linear_1.weight"] = state_dict[
-            "t_embedder.mlp.0.weight"
-        ]
-        state_dict[f"transformer_blocks.{depth}.norm1.emb.timestep_embedder.linear_1.bias"] = state_dict[
-            "t_embedder.mlp.0.bias"
-        ]
-        state_dict[f"transformer_blocks.{depth}.norm1.emb.timestep_embedder.linear_2.weight"] = state_dict[
-            "t_embedder.mlp.2.weight"
-        ]
-        state_dict[f"transformer_blocks.{depth}.norm1.emb.timestep_embedder.linear_2.bias"] = state_dict[
-            "t_embedder.mlp.2.bias"
-        ]
-        state_dict[f"transformer_blocks.{depth}.norm1.emb.class_embedder.embedding_table.weight"] = state_dict[
-            "y_embedder.embedding_table.weight"
-        ]
-
-        state_dict[f"transformer_blocks.{depth}.norm1.linear.weight"] = state_dict[
-            f"blocks.{depth}.adaLN_modulation.1.weight"
-        ]
-        state_dict[f"transformer_blocks.{depth}.norm1.linear.bias"] = state_dict[
-            f"blocks.{depth}.adaLN_modulation.1.bias"
-        ]
-
-        q, k, v = torch.chunk(state_dict[f"blocks.{depth}.attn.qkv.weight"], 3, dim=0)
-        q_bias, k_bias, v_bias = torch.chunk(state_dict[f"blocks.{depth}.attn.qkv.bias"], 3, dim=0)
-
-        state_dict[f"transformer_blocks.{depth}.attn1.to_q.weight"] = q
-        state_dict[f"transformer_blocks.{depth}.attn1.to_q.bias"] = q_bias
-        state_dict[f"transformer_blocks.{depth}.attn1.to_k.weight"] = k
-        state_dict[f"transformer_blocks.{depth}.attn1.to_k.bias"] = k_bias
-        state_dict[f"transformer_blocks.{depth}.attn1.to_v.weight"] = v
-        state_dict[f"transformer_blocks.{depth}.attn1.to_v.bias"] = v_bias
-
-        state_dict[f"transformer_blocks.{depth}.attn1.to_out.0.weight"] = state_dict[
-            f"blocks.{depth}.attn.proj.weight"
-        ]
-        state_dict[f"transformer_blocks.{depth}.attn1.to_out.0.bias"] = state_dict[f"blocks.{depth}.attn.proj.bias"]
-
-        state_dict[f"transformer_blocks.{depth}.ff.net.0.proj.weight"] = state_dict[f"blocks.{depth}.mlp.fc1.weight"]
-        state_dict[f"transformer_blocks.{depth}.ff.net.0.proj.bias"] = state_dict[f"blocks.{depth}.mlp.fc1.bias"]
-        state_dict[f"transformer_blocks.{depth}.ff.net.2.weight"] = state_dict[f"blocks.{depth}.mlp.fc2.weight"]
-        state_dict[f"transformer_blocks.{depth}.ff.net.2.bias"] = state_dict[f"blocks.{depth}.mlp.fc2.bias"]
-
-        state_dict.pop(f"blocks.{depth}.attn.qkv.weight")
-        state_dict.pop(f"blocks.{depth}.attn.qkv.bias")
-        state_dict.pop(f"blocks.{depth}.attn.proj.weight")
-        state_dict.pop(f"blocks.{depth}.attn.proj.bias")
-        state_dict.pop(f"blocks.{depth}.mlp.fc1.weight")
-        state_dict.pop(f"blocks.{depth}.mlp.fc1.bias")
-        state_dict.pop(f"blocks.{depth}.mlp.fc2.weight")
-        state_dict.pop(f"blocks.{depth}.mlp.fc2.bias")
-        state_dict.pop(f"blocks.{depth}.adaLN_modulation.1.weight")
-        state_dict.pop(f"blocks.{depth}.adaLN_modulation.1.bias")
-
-    state_dict.pop("t_embedder.mlp.0.weight")
-    state_dict.pop("t_embedder.mlp.0.bias")
-    state_dict.pop("t_embedder.mlp.2.weight")
-    state_dict.pop("t_embedder.mlp.2.bias")
-    state_dict.pop("y_embedder.embedding_table.weight")
-
-    state_dict["proj_out_1.weight"] = state_dict["final_layer.adaLN_modulation.1.weight"]
-    state_dict["proj_out_1.bias"] = state_dict["final_layer.adaLN_modulation.1.bias"]
-    state_dict["proj_out_2.weight"] = state_dict["final_layer.linear.weight"]
-    state_dict["proj_out_2.bias"] = state_dict["final_layer.linear.bias"]
-
-    state_dict.pop("final_layer.linear.weight")
-    state_dict.pop("final_layer.linear.bias")
-    state_dict.pop("final_layer.adaLN_modulation.1.weight")
-    state_dict.pop("final_layer.adaLN_modulation.1.bias")
-
-    # DiT XL/2
-    transformer = Transformer2DModel(
-        sample_size=args.image_size // 8,
-        num_layers=28,
-        attention_head_dim=72,
-        in_channels=4,
-        out_channels=8,
-        patch_size=2,
-        attention_bias=True,
-        num_attention_heads=16,
-        activation_fn="gelu-approximate",
-        num_embeds_ada_norm=1000,
-        norm_type="ada_norm_zero",
-        norm_elementwise_affine=False,
-    )
-    transformer.load_state_dict(state_dict, strict=True)
-
-    scheduler = DDIMScheduler(
-        num_train_timesteps=1000,
-        beta_schedule="linear",
-        prediction_type="epsilon",
-        clip_sample=False,
-    )
-
-    vae = AutoencoderKL.from_pretrained(args.vae_model)
-
-    pipeline = DiTPipeline(transformer=transformer, vae=vae, scheduler=scheduler)
-
-    if args.save:
-        pipeline.save_pretrained(args.checkpoint_path)
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-
-    parser.add_argument(
-        "--image_size",
-        default=256,
-        type=int,
-        required=False,
-        help="Image size of pretrained model, either 256 or 512.",
-    )
-    parser.add_argument(
-        "--vae_model",
-        default="stabilityai/sd-vae-ft-ema",
-        type=str,
-        required=False,
-        help="Path to pretrained VAE model, either stabilityai/sd-vae-ft-mse or stabilityai/sd-vae-ft-ema.",
-    )
-    parser.add_argument(
-        "--save", default=True, type=bool, required=False, help="Whether to save the converted pipeline or not."
-    )
-    parser.add_argument(
-        "--checkpoint_path", default=None, type=str, required=True, help="Path to the output pipeline."
-    )
-
-    args = parser.parse_args()
-    main(args)
--- a/scripts/convert_original_stable_diffusion_to_diffusers.py
+++ b/scripts/convert_original_stable_diffusion_to_diffusers.py
@@ -15,8 +15,770 @@
 """ Conversion script for the LDM checkpoints. """

 import argparse
+import os
+import re

-from diffusers.pipelines.stable_diffusion.convert_from_ckpt import load_pipeline_from_original_stable_diffusion_ckpt
+import torch
+
+
+try:
+    from omegaconf import OmegaConf
+except ImportError:
+    raise ImportError(
+        "OmegaConf is required to convert the LDM checkpoints. Please install it with `pip install OmegaConf`."
+    )
+
+from diffusers import (
+    AutoencoderKL,
+    DDIMScheduler,
+    DPMSolverMultistepScheduler,
+    EulerAncestralDiscreteScheduler,
+    EulerDiscreteScheduler,
+    HeunDiscreteScheduler,
+    LDMTextToImagePipeline,
+    LMSDiscreteScheduler,
+    PNDMScheduler,
+    StableDiffusionPipeline,
+    UNet2DConditionModel,
+)
+from diffusers.pipelines.latent_diffusion.pipeline_latent_diffusion import LDMBertConfig, LDMBertModel
+from diffusers.pipelines.paint_by_example import PaintByExampleImageEncoder, PaintByExamplePipeline
+from diffusers.pipelines.stable_diffusion import StableDiffusionSafetyChecker
+from transformers import AutoFeatureExtractor, BertTokenizerFast, CLIPTextModel, CLIPTokenizer, CLIPVisionConfig
+
+
+def shave_segments(path, n_shave_prefix_segments=1):
+    """
+    Removes segments. Positive values shave the first segments, negative shave the last segments.
+    """
+    if n_shave_prefix_segments >= 0:
+        return ".".join(path.split(".")[n_shave_prefix_segments:])
+    else:
+        return ".".join(path.split(".")[:n_shave_prefix_segments])
+
+
+def renew_resnet_paths(old_list, n_shave_prefix_segments=0):
+    """
+    Updates paths inside resnets to the new naming scheme (local renaming)
+    """
+    mapping = []
+    for old_item in old_list:
+        new_item = old_item.replace("in_layers.0", "norm1")
+        new_item = new_item.replace("in_layers.2", "conv1")
+
+        new_item = new_item.replace("out_layers.0", "norm2")
+        new_item = new_item.replace("out_layers.3", "conv2")
+
+        new_item = new_item.replace("emb_layers.1", "time_emb_proj")
+        new_item = new_item.replace("skip_connection", "conv_shortcut")
+
+        new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)
+
+        mapping.append({"old": old_item, "new": new_item})
+
+    return mapping
+
+
+def renew_vae_resnet_paths(old_list, n_shave_prefix_segments=0):
+    """
+    Updates paths inside resnets to the new naming scheme (local renaming)
+    """
+    mapping = []
+    for old_item in old_list:
+        new_item = old_item
+
+        new_item = new_item.replace("nin_shortcut", "conv_shortcut")
+        new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)
+
+        mapping.append({"old": old_item, "new": new_item})
+
+    return mapping
+
+
+def renew_attention_paths(old_list, n_shave_prefix_segments=0):
+    """
+    Updates paths inside attentions to the new naming scheme (local renaming)
+    """
+    mapping = []
+    for old_item in old_list:
+        new_item = old_item
+
+        #         new_item = new_item.replace('norm.weight', 'group_norm.weight')
+        #         new_item = new_item.replace('norm.bias', 'group_norm.bias')
+
+        #         new_item = new_item.replace('proj_out.weight', 'proj_attn.weight')
+        #         new_item = new_item.replace('proj_out.bias', 'proj_attn.bias')
+
+        #         new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)
+
+        mapping.append({"old": old_item, "new": new_item})
+
+    return mapping
+
+
+def renew_vae_attention_paths(old_list, n_shave_prefix_segments=0):
+    """
+    Updates paths inside attentions to the new naming scheme (local renaming)
+    """
+    mapping = []
+    for old_item in old_list:
+        new_item = old_item
+
+        new_item = new_item.replace("norm.weight", "group_norm.weight")
+        new_item = new_item.replace("norm.bias", "group_norm.bias")
+
+        new_item = new_item.replace("q.weight", "query.weight")
+        new_item = new_item.replace("q.bias", "query.bias")
+
+        new_item = new_item.replace("k.weight", "key.weight")
+        new_item = new_item.replace("k.bias", "key.bias")
+
+        new_item = new_item.replace("v.weight", "value.weight")
+        new_item = new_item.replace("v.bias", "value.bias")
+
+        new_item = new_item.replace("proj_out.weight", "proj_attn.weight")
+        new_item = new_item.replace("proj_out.bias", "proj_attn.bias")
+
+        new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)
+
+        mapping.append({"old": old_item, "new": new_item})
+
+    return mapping
+
+
+def assign_to_checkpoint(
+    paths, checkpoint, old_checkpoint, attention_paths_to_split=None, additional_replacements=None, config=None
+):
+    """
+    This does the final conversion step: take locally converted weights and apply a global renaming
+    to them. It splits attention layers, and takes into account additional replacements
+    that may arise.
+
+    Assigns the weights to the new checkpoint.
+    """
+    assert isinstance(paths, list), "Paths should be a list of dicts containing 'old' and 'new' keys."
+
+    # Splits the attention layers into three variables.
+    if attention_paths_to_split is not None:
+        for path, path_map in attention_paths_to_split.items():
+            old_tensor = old_checkpoint[path]
+            channels = old_tensor.shape[0] // 3
+
+            target_shape = (-1, channels) if len(old_tensor.shape) == 3 else (-1)
+
+            num_heads = old_tensor.shape[0] // config["num_head_channels"] // 3
+
+            old_tensor = old_tensor.reshape((num_heads, 3 * channels // num_heads) + old_tensor.shape[1:])
+            query, key, value = old_tensor.split(channels // num_heads, dim=1)
+
+            checkpoint[path_map["query"]] = query.reshape(target_shape)
+            checkpoint[path_map["key"]] = key.reshape(target_shape)
+            checkpoint[path_map["value"]] = value.reshape(target_shape)
+
+    for path in paths:
+        new_path = path["new"]
+
+        # These have already been assigned
+        if attention_paths_to_split is not None and new_path in attention_paths_to_split:
+            continue
+
+        # Global renaming happens here
+        new_path = new_path.replace("middle_block.0", "mid_block.resnets.0")
+        new_path = new_path.replace("middle_block.1", "mid_block.attentions.0")
+        new_path = new_path.replace("middle_block.2", "mid_block.resnets.1")
+
+        if additional_replacements is not None:
+            for replacement in additional_replacements:
+                new_path = new_path.replace(replacement["old"], replacement["new"])
+
+        # proj_attn.weight has to be converted from conv 1D to linear
+        if "proj_attn.weight" in new_path:
+            checkpoint[new_path] = old_checkpoint[path["old"]][:, :, 0]
+        else:
+            checkpoint[new_path] = old_checkpoint[path["old"]]
+
+
+def conv_attn_to_linear(checkpoint):
+    keys = list(checkpoint.keys())
+    attn_keys = ["query.weight", "key.weight", "value.weight"]
+    for key in keys:
+        if ".".join(key.split(".")[-2:]) in attn_keys:
+            if checkpoint[key].ndim > 2:
+                checkpoint[key] = checkpoint[key][:, :, 0, 0]
+        elif "proj_attn.weight" in key:
+            if checkpoint[key].ndim > 2:
+                checkpoint[key] = checkpoint[key][:, :, 0]
+
+
+def create_unet_diffusers_config(original_config, image_size: int):
+    """
+    Creates a config for the diffusers based on the config of the LDM model.
+    """
+    unet_params = original_config.model.params.unet_config.params
+    vae_params = original_config.model.params.first_stage_config.params.ddconfig
+
+    block_out_channels = [unet_params.model_channels * mult for mult in unet_params.channel_mult]
+
+    down_block_types = []
+    resolution = 1
+    for i in range(len(block_out_channels)):
+        block_type = "CrossAttnDownBlock2D" if resolution in unet_params.attention_resolutions else "DownBlock2D"
+        down_block_types.append(block_type)
+        if i != len(block_out_channels) - 1:
+            resolution *= 2
+
+    up_block_types = []
+    for i in range(len(block_out_channels)):
+        block_type = "CrossAttnUpBlock2D" if resolution in unet_params.attention_resolutions else "UpBlock2D"
+        up_block_types.append(block_type)
+        resolution //= 2
+
+    vae_scale_factor = 2 ** (len(vae_params.ch_mult) - 1)
+
+    head_dim = unet_params.num_heads if "num_heads" in unet_params else None
+    use_linear_projection = (
+        unet_params.use_linear_in_transformer if "use_linear_in_transformer" in unet_params else False
+    )
+    if use_linear_projection:
+        # stable diffusion 2-base-512 and 2-768
+        if head_dim is None:
+            head_dim = [5, 10, 20, 20]
+
+    config = dict(
+        sample_size=image_size // vae_scale_factor,
+        in_channels=unet_params.in_channels,
+        out_channels=unet_params.out_channels,
+        down_block_types=tuple(down_block_types),
+        up_block_types=tuple(up_block_types),
+        block_out_channels=tuple(block_out_channels),
+        layers_per_block=unet_params.num_res_blocks,
+        cross_attention_dim=unet_params.context_dim,
+        attention_head_dim=head_dim,
+        use_linear_projection=use_linear_projection,
+    )
+
+    return config
+
+
+def create_vae_diffusers_config(original_config, image_size: int):
+    """
+    Creates a config for the diffusers based on the config of the LDM model.
+    """
+    vae_params = original_config.model.params.first_stage_config.params.ddconfig
+    _ = original_config.model.params.first_stage_config.params.embed_dim
+
+    block_out_channels = [vae_params.ch * mult for mult in vae_params.ch_mult]
+    down_block_types = ["DownEncoderBlock2D"] * len(block_out_channels)
+    up_block_types = ["UpDecoderBlock2D"] * len(block_out_channels)
+
+    config = dict(
+        sample_size=image_size,
+        in_channels=vae_params.in_channels,
+        out_channels=vae_params.out_ch,
+        down_block_types=tuple(down_block_types),
+        up_block_types=tuple(up_block_types),
+        block_out_channels=tuple(block_out_channels),
+        latent_channels=vae_params.z_channels,
+        layers_per_block=vae_params.num_res_blocks,
+    )
+    return config
+
+
+def create_diffusers_schedular(original_config):
+    schedular = DDIMScheduler(
+        num_train_timesteps=original_config.model.params.timesteps,
+        beta_start=original_config.model.params.linear_start,
+        beta_end=original_config.model.params.linear_end,
+        beta_schedule="scaled_linear",
+    )
+    return schedular
+
+
+def create_ldm_bert_config(original_config):
+    bert_params = original_config.model.parms.cond_stage_config.params
+    config = LDMBertConfig(
+        d_model=bert_params.n_embed,
+        encoder_layers=bert_params.n_layer,
+        encoder_ffn_dim=bert_params.n_embed * 4,
+    )
+    return config
+
+
+def convert_ldm_unet_checkpoint(checkpoint, config, path=None, extract_ema=False):
+    """
+    Takes a state dict and a config, and returns a converted checkpoint.
+    """
+
+    # extract state_dict for UNet
+    unet_state_dict = {}
+    keys = list(checkpoint.keys())
+
+    unet_key = "model.diffusion_model."
+    # at least a 100 parameters have to start with `model_ema` in order for the checkpoint to be EMA
+    if sum(k.startswith("model_ema") for k in keys) > 100 and extract_ema:
+        print(f"Checkpoint {path} has both EMA and non-EMA weights.")
+        print(
+            "In this conversion only the EMA weights are extracted. If you want to instead extract the non-EMA"
+            " weights (useful to continue fine-tuning), please make sure to remove the `--extract_ema` flag."
+        )
+        for key in keys:
+            if key.startswith("model.diffusion_model"):
+                flat_ema_key = "model_ema." + "".join(key.split(".")[1:])
+                unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(flat_ema_key)
+    else:
+        if sum(k.startswith("model_ema") for k in keys) > 100:
+            print(
+                "In this conversion only the non-EMA weights are extracted. If you want to instead extract the EMA"
+                " weights (usually better for inference), please make sure to add the `--extract_ema` flag."
+            )
+
+        for key in keys:
+            if key.startswith(unet_key):
+                unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(key)
+
+    new_checkpoint = {}
+
+    new_checkpoint["time_embedding.linear_1.weight"] = unet_state_dict["time_embed.0.weight"]
+    new_checkpoint["time_embedding.linear_1.bias"] = unet_state_dict["time_embed.0.bias"]
+    new_checkpoint["time_embedding.linear_2.weight"] = unet_state_dict["time_embed.2.weight"]
+    new_checkpoint["time_embedding.linear_2.bias"] = unet_state_dict["time_embed.2.bias"]
+
+    new_checkpoint["conv_in.weight"] = unet_state_dict["input_blocks.0.0.weight"]
+    new_checkpoint["conv_in.bias"] = unet_state_dict["input_blocks.0.0.bias"]
+
+    new_checkpoint["conv_norm_out.weight"] = unet_state_dict["out.0.weight"]
+    new_checkpoint["conv_norm_out.bias"] = unet_state_dict["out.0.bias"]
+    new_checkpoint["conv_out.weight"] = unet_state_dict["out.2.weight"]
+    new_checkpoint["conv_out.bias"] = unet_state_dict["out.2.bias"]
+
+    # Retrieves the keys for the input blocks only
+    num_input_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "input_blocks" in layer})
+    input_blocks = {
+        layer_id: [key for key in unet_state_dict if f"input_blocks.{layer_id}" in key]
+        for layer_id in range(num_input_blocks)
+    }
+
+    # Retrieves the keys for the middle blocks only
+    num_middle_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "middle_block" in layer})
+    middle_blocks = {
+        layer_id: [key for key in unet_state_dict if f"middle_block.{layer_id}" in key]
+        for layer_id in range(num_middle_blocks)
+    }
+
+    # Retrieves the keys for the output blocks only
+    num_output_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "output_blocks" in layer})
+    output_blocks = {
+        layer_id: [key for key in unet_state_dict if f"output_blocks.{layer_id}" in key]
+        for layer_id in range(num_output_blocks)
+    }
+
+    for i in range(1, num_input_blocks):
+        block_id = (i - 1) // (config["layers_per_block"] + 1)
+        layer_in_block_id = (i - 1) % (config["layers_per_block"] + 1)
+
+        resnets = [
+            key for key in input_blocks[i] if f"input_blocks.{i}.0" in key and f"input_blocks.{i}.0.op" not in key
+        ]
+        attentions = [key for key in input_blocks[i] if f"input_blocks.{i}.1" in key]
+
+        if f"input_blocks.{i}.0.op.weight" in unet_state_dict:
+            new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.weight"] = unet_state_dict.pop(
+                f"input_blocks.{i}.0.op.weight"
+            )
+            new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.bias"] = unet_state_dict.pop(
+                f"input_blocks.{i}.0.op.bias"
+            )
+
+        paths = renew_resnet_paths(resnets)
+        meta_path = {"old": f"input_blocks.{i}.0", "new": f"down_blocks.{block_id}.resnets.{layer_in_block_id}"}
+        assign_to_checkpoint(
+            paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
+        )
+
+        if len(attentions):
+            paths = renew_attention_paths(attentions)
+            meta_path = {"old": f"input_blocks.{i}.1", "new": f"down_blocks.{block_id}.attentions.{layer_in_block_id}"}
+            assign_to_checkpoint(
+                paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
+            )
+
+    resnet_0 = middle_blocks[0]
+    attentions = middle_blocks[1]
+    resnet_1 = middle_blocks[2]
+
+    resnet_0_paths = renew_resnet_paths(resnet_0)
+    assign_to_checkpoint(resnet_0_paths, new_checkpoint, unet_state_dict, config=config)
+
+    resnet_1_paths = renew_resnet_paths(resnet_1)
+    assign_to_checkpoint(resnet_1_paths, new_checkpoint, unet_state_dict, config=config)
+
+    attentions_paths = renew_attention_paths(attentions)
+    meta_path = {"old": "middle_block.1", "new": "mid_block.attentions.0"}
+    assign_to_checkpoint(
+        attentions_paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
+    )
+
+    for i in range(num_output_blocks):
+        block_id = i // (config["layers_per_block"] + 1)
+        layer_in_block_id = i % (config["layers_per_block"] + 1)
+        output_block_layers = [shave_segments(name, 2) for name in output_blocks[i]]
+        output_block_list = {}
+
+        for layer in output_block_layers:
+            layer_id, layer_name = layer.split(".")[0], shave_segments(layer, 1)
+            if layer_id in output_block_list:
+                output_block_list[layer_id].append(layer_name)
+            else:
+                output_block_list[layer_id] = [layer_name]
+
+        if len(output_block_list) > 1:
+            resnets = [key for key in output_blocks[i] if f"output_blocks.{i}.0" in key]
+            attentions = [key for key in output_blocks[i] if f"output_blocks.{i}.1" in key]
+
+            resnet_0_paths = renew_resnet_paths(resnets)
+            paths = renew_resnet_paths(resnets)
+
+            meta_path = {"old": f"output_blocks.{i}.0", "new": f"up_blocks.{block_id}.resnets.{layer_in_block_id}"}
+            assign_to_checkpoint(
+                paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
+            )
+
+            output_block_list = {k: sorted(v) for k, v in output_block_list.items()}
+            if ["conv.bias", "conv.weight"] in output_block_list.values():
+                index = list(output_block_list.values()).index(["conv.bias", "conv.weight"])
+                new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.weight"] = unet_state_dict[
+                    f"output_blocks.{i}.{index}.conv.weight"
+                ]
+                new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.bias"] = unet_state_dict[
+                    f"output_blocks.{i}.{index}.conv.bias"
+                ]
+
+                # Clear attentions as they have been attributed above.
+                if len(attentions) == 2:
+                    attentions = []
+
+            if len(attentions):
+                paths = renew_attention_paths(attentions)
+                meta_path = {
+                    "old": f"output_blocks.{i}.1",
+                    "new": f"up_blocks.{block_id}.attentions.{layer_in_block_id}",
+                }
+                assign_to_checkpoint(
+                    paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
+                )
+        else:
+            resnet_0_paths = renew_resnet_paths(output_block_layers, n_shave_prefix_segments=1)
+            for path in resnet_0_paths:
+                old_path = ".".join(["output_blocks", str(i), path["old"]])
+                new_path = ".".join(["up_blocks", str(block_id), "resnets", str(layer_in_block_id), path["new"]])
+
+                new_checkpoint[new_path] = unet_state_dict[old_path]
+
+    return new_checkpoint
+
+
+def convert_ldm_vae_checkpoint(checkpoint, config):
+    # extract state dict for VAE
+    vae_state_dict = {}
+    vae_key = "first_stage_model."
+    keys = list(checkpoint.keys())
+    for key in keys:
+        if key.startswith(vae_key):
+            vae_state_dict[key.replace(vae_key, "")] = checkpoint.get(key)
+
+    new_checkpoint = {}
+
+    new_checkpoint["encoder.conv_in.weight"] = vae_state_dict["encoder.conv_in.weight"]
+    new_checkpoint["encoder.conv_in.bias"] = vae_state_dict["encoder.conv_in.bias"]
+    new_checkpoint["encoder.conv_out.weight"] = vae_state_dict["encoder.conv_out.weight"]
+    new_checkpoint["encoder.conv_out.bias"] = vae_state_dict["encoder.conv_out.bias"]
+    new_checkpoint["encoder.conv_norm_out.weight"] = vae_state_dict["encoder.norm_out.weight"]
+    new_checkpoint["encoder.conv_norm_out.bias"] = vae_state_dict["encoder.norm_out.bias"]
+
+    new_checkpoint["decoder.conv_in.weight"] = vae_state_dict["decoder.conv_in.weight"]
+    new_checkpoint["decoder.conv_in.bias"] = vae_state_dict["decoder.conv_in.bias"]
+    new_checkpoint["decoder.conv_out.weight"] = vae_state_dict["decoder.conv_out.weight"]
+    new_checkpoint["decoder.conv_out.bias"] = vae_state_dict["decoder.conv_out.bias"]
+    new_checkpoint["decoder.conv_norm_out.weight"] = vae_state_dict["decoder.norm_out.weight"]
+    new_checkpoint["decoder.conv_norm_out.bias"] = vae_state_dict["decoder.norm_out.bias"]
+
+    new_checkpoint["quant_conv.weight"] = vae_state_dict["quant_conv.weight"]
+    new_checkpoint["quant_conv.bias"] = vae_state_dict["quant_conv.bias"]
+    new_checkpoint["post_quant_conv.weight"] = vae_state_dict["post_quant_conv.weight"]
+    new_checkpoint["post_quant_conv.bias"] = vae_state_dict["post_quant_conv.bias"]
+
+    # Retrieves the keys for the encoder down blocks only
+    num_down_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "encoder.down" in layer})
+    down_blocks = {
+        layer_id: [key for key in vae_state_dict if f"down.{layer_id}" in key] for layer_id in range(num_down_blocks)
+    }
+
+    # Retrieves the keys for the decoder up blocks only
+    num_up_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "decoder.up" in layer})
+    up_blocks = {
+        layer_id: [key for key in vae_state_dict if f"up.{layer_id}" in key] for layer_id in range(num_up_blocks)
+    }
+
+    for i in range(num_down_blocks):
+        resnets = [key for key in down_blocks[i] if f"down.{i}" in key and f"down.{i}.downsample" not in key]
+
+        if f"encoder.down.{i}.downsample.conv.weight" in vae_state_dict:
+            new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.weight"] = vae_state_dict.pop(
+                f"encoder.down.{i}.downsample.conv.weight"
+            )
+            new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.bias"] = vae_state_dict.pop(
+                f"encoder.down.{i}.downsample.conv.bias"
+            )
+
+        paths = renew_vae_resnet_paths(resnets)
+        meta_path = {"old": f"down.{i}.block", "new": f"down_blocks.{i}.resnets"}
+        assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
+
+    mid_resnets = [key for key in vae_state_dict if "encoder.mid.block" in key]
+    num_mid_res_blocks = 2
+    for i in range(1, num_mid_res_blocks + 1):
+        resnets = [key for key in mid_resnets if f"encoder.mid.block_{i}" in key]
+
+        paths = renew_vae_resnet_paths(resnets)
+        meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"}
+        assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
+
+    mid_attentions = [key for key in vae_state_dict if "encoder.mid.attn" in key]
+    paths = renew_vae_attention_paths(mid_attentions)
+    meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"}
+    assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
+    conv_attn_to_linear(new_checkpoint)
+
+    for i in range(num_up_blocks):
+        block_id = num_up_blocks - 1 - i
+        resnets = [
+            key for key in up_blocks[block_id] if f"up.{block_id}" in key and f"up.{block_id}.upsample" not in key
+        ]
+
+        if f"decoder.up.{block_id}.upsample.conv.weight" in vae_state_dict:
+            new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.weight"] = vae_state_dict[
+                f"decoder.up.{block_id}.upsample.conv.weight"
+            ]
+            new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.bias"] = vae_state_dict[
+                f"decoder.up.{block_id}.upsample.conv.bias"
+            ]
+
+        paths = renew_vae_resnet_paths(resnets)
+        meta_path = {"old": f"up.{block_id}.block", "new": f"up_blocks.{i}.resnets"}
+        assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
+
+    mid_resnets = [key for key in vae_state_dict if "decoder.mid.block" in key]
+    num_mid_res_blocks = 2
+    for i in range(1, num_mid_res_blocks + 1):
+        resnets = [key for key in mid_resnets if f"decoder.mid.block_{i}" in key]
+
+        paths = renew_vae_resnet_paths(resnets)
+        meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"}
+        assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
+
+    mid_attentions = [key for key in vae_state_dict if "decoder.mid.attn" in key]
+    paths = renew_vae_attention_paths(mid_attentions)
+    meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"}
+    assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
+    conv_attn_to_linear(new_checkpoint)
+    return new_checkpoint
+
+
+def convert_ldm_bert_checkpoint(checkpoint, config):
+    def _copy_attn_layer(hf_attn_layer, pt_attn_layer):
+        hf_attn_layer.q_proj.weight.data = pt_attn_layer.to_q.weight
+        hf_attn_layer.k_proj.weight.data = pt_attn_layer.to_k.weight
+        hf_attn_layer.v_proj.weight.data = pt_attn_layer.to_v.weight
+
+        hf_attn_layer.out_proj.weight = pt_attn_layer.to_out.weight
+        hf_attn_layer.out_proj.bias = pt_attn_layer.to_out.bias
+
+    def _copy_linear(hf_linear, pt_linear):
+        hf_linear.weight = pt_linear.weight
+        hf_linear.bias = pt_linear.bias
+
+    def _copy_layer(hf_layer, pt_layer):
+        # copy layer norms
+        _copy_linear(hf_layer.self_attn_layer_norm, pt_layer[0][0])
+        _copy_linear(hf_layer.final_layer_norm, pt_layer[1][0])
+
+        # copy attn
+        _copy_attn_layer(hf_layer.self_attn, pt_layer[0][1])
+
+        # copy MLP
+        pt_mlp = pt_layer[1][1]
+        _copy_linear(hf_layer.fc1, pt_mlp.net[0][0])
+        _copy_linear(hf_layer.fc2, pt_mlp.net[2])
+
+    def _copy_layers(hf_layers, pt_layers):
+        for i, hf_layer in enumerate(hf_layers):
+            if i != 0:
+                i += i
+            pt_layer = pt_layers[i : i + 2]
+            _copy_layer(hf_layer, pt_layer)
+
+    hf_model = LDMBertModel(config).eval()
+
+    # copy  embeds
+    hf_model.model.embed_tokens.weight = checkpoint.transformer.token_emb.weight
+    hf_model.model.embed_positions.weight.data = checkpoint.transformer.pos_emb.emb.weight
+
+    # copy layer norm
+    _copy_linear(hf_model.model.layer_norm, checkpoint.transformer.norm)
+
+    # copy hidden layers
+    _copy_layers(hf_model.model.layers, checkpoint.transformer.attn_layers.layers)
+
+    _copy_linear(hf_model.to_logits, checkpoint.transformer.to_logits)
+
+    return hf_model
+
+
+def convert_ldm_clip_checkpoint(checkpoint):
+    text_model = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14")
+
+    keys = list(checkpoint.keys())
+
+    text_model_dict = {}
+
+    for key in keys:
+        if key.startswith("cond_stage_model.transformer"):
+            text_model_dict[key[len("cond_stage_model.transformer.") :]] = checkpoint[key]
+
+    text_model.load_state_dict(text_model_dict)
+
+    return text_model
+
+
+textenc_conversion_lst = [
+    ("cond_stage_model.model.positional_embedding", "text_model.embeddings.position_embedding.weight"),
+    ("cond_stage_model.model.token_embedding.weight", "text_model.embeddings.token_embedding.weight"),
+    ("cond_stage_model.model.ln_final.weight", "text_model.final_layer_norm.weight"),
+    ("cond_stage_model.model.ln_final.bias", "text_model.final_layer_norm.bias"),
+]
+textenc_conversion_map = {x[0]: x[1] for x in textenc_conversion_lst}
+
+textenc_transformer_conversion_lst = [
+    # (stable-diffusion, HF Diffusers)
+    ("resblocks.", "text_model.encoder.layers."),
+    ("ln_1", "layer_norm1"),
+    ("ln_2", "layer_norm2"),
+    (".c_fc.", ".fc1."),
+    (".c_proj.", ".fc2."),
+    (".attn", ".self_attn"),
+    ("ln_final.", "transformer.text_model.final_layer_norm."),
+    ("token_embedding.weight", "transformer.text_model.embeddings.token_embedding.weight"),
+    ("positional_embedding", "transformer.text_model.embeddings.position_embedding.weight"),
+]
+protected = {re.escape(x[0]): x[1] for x in textenc_transformer_conversion_lst}
+textenc_pattern = re.compile("|".join(protected.keys()))
+
+
+def convert_paint_by_example_checkpoint(checkpoint):
+    config = CLIPVisionConfig.from_pretrained("openai/clip-vit-large-patch14")
+    model = PaintByExampleImageEncoder(config)
+
+    keys = list(checkpoint.keys())
+
+    text_model_dict = {}
+
+    for key in keys:
+        if key.startswith("cond_stage_model.transformer"):
+            text_model_dict[key[len("cond_stage_model.transformer.") :]] = checkpoint[key]
+
+    # load clip vision
+    model.model.load_state_dict(text_model_dict)
+
+    # load mapper
+    keys_mapper = {
+        k[len("cond_stage_model.mapper.res") :]: v
+        for k, v in checkpoint.items()
+        if k.startswith("cond_stage_model.mapper")
+    }
+
+    MAPPING = {
+        "attn.c_qkv": ["attn1.to_q", "attn1.to_k", "attn1.to_v"],
+        "attn.c_proj": ["attn1.to_out.0"],
+        "ln_1": ["norm1"],
+        "ln_2": ["norm3"],
+        "mlp.c_fc": ["ff.net.0.proj"],
+        "mlp.c_proj": ["ff.net.2"],
+    }
+
+    mapped_weights = {}
+    for key, value in keys_mapper.items():
+        prefix = key[: len("blocks.i")]
+        suffix = key.split(prefix)[-1].split(".")[-1]
+        name = key.split(prefix)[-1].split(suffix)[0][1:-1]
+        mapped_names = MAPPING[name]
+
+        num_splits = len(mapped_names)
+        for i, mapped_name in enumerate(mapped_names):
+            new_name = ".".join([prefix, mapped_name, suffix])
+            shape = value.shape[0] // num_splits
+            mapped_weights[new_name] = value[i * shape : (i + 1) * shape]
+
+    model.mapper.load_state_dict(mapped_weights)
+
+    # load final layer norm
+    model.final_layer_norm.load_state_dict(
+        {
+            "bias": checkpoint["cond_stage_model.final_ln.bias"],
+            "weight": checkpoint["cond_stage_model.final_ln.weight"],
+        }
+    )
+
+    # load final proj
+    model.proj_out.load_state_dict(
+        {
+            "bias": checkpoint["proj_out.bias"],
+            "weight": checkpoint["proj_out.weight"],
+        }
+    )
+
+    # load uncond vector
+    model.uncond_vector.data = torch.nn.Parameter(checkpoint["learnable_vector"])
+    return model
+
+
+def convert_open_clip_checkpoint(checkpoint):
+    text_model = CLIPTextModel.from_pretrained("stabilityai/stable-diffusion-2", subfolder="text_encoder")
+
+    keys = list(checkpoint.keys())
+
+    text_model_dict = {}
+
+    d_model = int(checkpoint["cond_stage_model.model.text_projection"].shape[0])
+
+    text_model_dict["text_model.embeddings.position_ids"] = text_model.text_model.embeddings.get_buffer("position_ids")
+
+    for key in keys:
+        if "resblocks.23" in key:  # Diffusers drops the final layer and only uses the penultimate layer
+            continue
+        if key in textenc_conversion_map:
+            text_model_dict[textenc_conversion_map[key]] = checkpoint[key]
+        if key.startswith("cond_stage_model.model.transformer."):
+            new_key = key[len("cond_stage_model.model.transformer.") :]
+            if new_key.endswith(".in_proj_weight"):
+                new_key = new_key[: -len(".in_proj_weight")]
+                new_key = textenc_pattern.sub(lambda m: protected[re.escape(m.group(0))], new_key)
+                text_model_dict[new_key + ".q_proj.weight"] = checkpoint[key][:d_model, :]
+                text_model_dict[new_key + ".k_proj.weight"] = checkpoint[key][d_model : d_model * 2, :]
+                text_model_dict[new_key + ".v_proj.weight"] = checkpoint[key][d_model * 2 :, :]
+            elif new_key.endswith(".in_proj_bias"):
+                new_key = new_key[: -len(".in_proj_bias")]
+                new_key = textenc_pattern.sub(lambda m: protected[re.escape(m.group(0))], new_key)
+                text_model_dict[new_key + ".q_proj.bias"] = checkpoint[key][:d_model]
+                text_model_dict[new_key + ".k_proj.bias"] = checkpoint[key][d_model : d_model * 2]
+                text_model_dict[new_key + ".v_proj.bias"] = checkpoint[key][d_model * 2 :]
+            else:
+                new_key = textenc_pattern.sub(lambda m: protected[re.escape(m.group(0))], new_key)
+
+                text_model_dict[new_key] = checkpoint[key]
+
+    text_model.load_state_dict(text_model_dict)
+
+    return text_model


 if __name__ == "__main__":
@@ -78,7 +840,7 @@ if __name__ == "__main__":
        ),
    )
    parser.add_argument(
-        "--upcast_attention",
+        "--upcast_attn",
        default=False,
        type=bool,
        help=(
@@ -86,30 +848,179 @@ if __name__ == "__main__":
            " diffusion 2.1."
        ),
    )
-    parser.add_argument(
-        "--from_safetensors",
-        action="store_true",
-        help="If `--checkpoint_path` is in `safetensors` format, load checkpoint with safetensors instead of PyTorch.",
-    )
-    parser.add_argument(
-        "--to_safetensors",
-        action="store_true",
-        help="Whether to store pipeline in safetensors format or not.",
-    )
    parser.add_argument("--dump_path", default=None, type=str, required=True, help="Path to the output model.")
    parser.add_argument("--device", type=str, help="Device to use (e.g. cpu, cuda:0, cuda:1, etc.)")
    args = parser.parse_args()

-    pipe = load_pipeline_from_original_stable_diffusion_ckpt(
-        checkpoint_path=args.checkpoint_path,
-        original_config_file=args.original_config_file,
-        image_size=args.image_size,
-        prediction_type=args.prediction_type,
-        model_type=args.pipeline_type,
-        extract_ema=args.extract_ema,
-        scheduler_type=args.scheduler_type,
-        num_in_channels=args.num_in_channels,
-        upcast_attention=args.upcast_attention,
-        from_safetensors=args.from_safetensors,
+    image_size = args.image_size
+    prediction_type = args.prediction_type
+
+    if args.device is None:
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+        checkpoint = torch.load(args.checkpoint_path, map_location=device)
+    else:
+        checkpoint = torch.load(args.checkpoint_path, map_location=args.device)
+
+    # Sometimes models don't have the global_step item
+    if "global_step" in checkpoint:
+        global_step = checkpoint["global_step"]
+    else:
+        print("global_step key not found in model")
+        global_step = None
+
+    if "state_dict" in checkpoint:
+        checkpoint = checkpoint["state_dict"]
+
+    upcast_attention = False
+    if args.original_config_file is None:
+        key_name = "model.diffusion_model.input_blocks.2.1.transformer_blocks.0.attn2.to_k.weight"
+
+        if key_name in checkpoint and checkpoint[key_name].shape[-1] == 1024:
+            if not os.path.isfile("v2-inference-v.yaml"):
+                # model_type = "v2"
+                os.system(
+                    "wget https://raw.githubusercontent.com/Stability-AI/stablediffusion/main/configs/stable-diffusion/v2-inference-v.yaml"
+                    " -O v2-inference-v.yaml"
+                )
+            args.original_config_file = "./v2-inference-v.yaml"
+
+            if global_step == 110000:
+                # v2.1 needs to upcast attention
+                upcast_attention = True
+        else:
+            if not os.path.isfile("v1-inference.yaml"):
+                # model_type = "v1"
+                os.system(
+                    "wget https://raw.githubusercontent.com/CompVis/stable-diffusion/main/configs/stable-diffusion/v1-inference.yaml"
+                    " -O v1-inference.yaml"
+                )
+            args.original_config_file = "./v1-inference.yaml"
+
+    original_config = OmegaConf.load(args.original_config_file)
+
+    if args.num_in_channels is not None:
+        original_config["model"]["params"]["unet_config"]["params"]["in_channels"] = args.num_in_channels
+
+    if (
+        "parameterization" in original_config["model"]["params"]
+        and original_config["model"]["params"]["parameterization"] == "v"
+    ):
+        if prediction_type is None:
+            # NOTE: For stable diffusion 2 base it is recommended to pass `prediction_type=="epsilon"`
+            # as it relies on a brittle global step parameter here
+            prediction_type = "epsilon" if global_step == 875000 else "v_prediction"
+        if image_size is None:
+            # NOTE: For stable diffusion 2 base one has to pass `image_size==512`
+            # as it relies on a brittle global step parameter here
+            image_size = 512 if global_step == 875000 else 768
+    else:
+        if prediction_type is None:
+            prediction_type = "epsilon"
+        if image_size is None:
+            image_size = 512
+
+    num_train_timesteps = original_config.model.params.timesteps
+    beta_start = original_config.model.params.linear_start
+    beta_end = original_config.model.params.linear_end
+
+    scheduler = DDIMScheduler(
+        beta_end=beta_end,
+        beta_schedule="scaled_linear",
+        beta_start=beta_start,
+        num_train_timesteps=num_train_timesteps,
+        steps_offset=1,
+        clip_sample=False,
+        set_alpha_to_one=False,
+        prediction_type=prediction_type,
    )
-    pipe.save_pretrained(args.dump_path, safe_serialization=args.to_safetensors)
+    # make sure scheduler works correctly with DDIM
+    scheduler.register_to_config(clip_sample=False)
+
+    if args.scheduler_type == "pndm":
+        config = dict(scheduler.config)
+        config["skip_prk_steps"] = True
+        scheduler = PNDMScheduler.from_config(config)
+    elif args.scheduler_type == "lms":
+        scheduler = LMSDiscreteScheduler.from_config(scheduler.config)
+    elif args.scheduler_type == "heun":
+        scheduler = HeunDiscreteScheduler.from_config(scheduler.config)
+    elif args.scheduler_type == "euler":
+        scheduler = EulerDiscreteScheduler.from_config(scheduler.config)
+    elif args.scheduler_type == "euler-ancestral":
+        scheduler = EulerAncestralDiscreteScheduler.from_config(scheduler.config)
+    elif args.scheduler_type == "dpm":
+        scheduler = DPMSolverMultistepScheduler.from_config(scheduler.config)
+    elif args.scheduler_type == "ddim":
+        scheduler = scheduler
+    else:
+        raise ValueError(f"Scheduler of type {args.scheduler_type} doesn't exist!")
+
+    # Convert the UNet2DConditionModel model.
+    unet_config = create_unet_diffusers_config(original_config, image_size=image_size)
+    unet_config["upcast_attention"] = upcast_attention
+    unet = UNet2DConditionModel(**unet_config)
+
+    converted_unet_checkpoint = convert_ldm_unet_checkpoint(
+        checkpoint, unet_config, path=args.checkpoint_path, extract_ema=args.extract_ema
+    )
+
+    unet.load_state_dict(converted_unet_checkpoint)
+
+    # Convert the VAE model.
+    vae_config = create_vae_diffusers_config(original_config, image_size=image_size)
+    converted_vae_checkpoint = convert_ldm_vae_checkpoint(checkpoint, vae_config)
+
+    vae = AutoencoderKL(**vae_config)
+    vae.load_state_dict(converted_vae_checkpoint)
+
+    # Convert the text model.
+    model_type = args.pipeline_type
+    if model_type is None:
+        model_type = original_config.model.params.cond_stage_config.target.split(".")[-1]
+
+    if model_type == "FrozenOpenCLIPEmbedder":
+        text_model = convert_open_clip_checkpoint(checkpoint)
+        tokenizer = CLIPTokenizer.from_pretrained("stabilityai/stable-diffusion-2", subfolder="tokenizer")
+        pipe = StableDiffusionPipeline(
+            vae=vae,
+            text_encoder=text_model,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+            safety_checker=None,
+            feature_extractor=None,
+            requires_safety_checker=False,
+        )
+    elif model_type == "PaintByExample":
+        vision_model = convert_paint_by_example_checkpoint(checkpoint)
+        tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
+        feature_extractor = AutoFeatureExtractor.from_pretrained("CompVis/stable-diffusion-safety-checker")
+        pipe = PaintByExamplePipeline(
+            vae=vae,
+            image_encoder=vision_model,
+            unet=unet,
+            scheduler=scheduler,
+            safety_checker=None,
+            feature_extractor=feature_extractor,
+        )
+    elif model_type == "FrozenCLIPEmbedder":
+        text_model = convert_ldm_clip_checkpoint(checkpoint)
+        tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
+        safety_checker = StableDiffusionSafetyChecker.from_pretrained("CompVis/stable-diffusion-safety-checker")
+        feature_extractor = AutoFeatureExtractor.from_pretrained("CompVis/stable-diffusion-safety-checker")
+        pipe = StableDiffusionPipeline(
+            vae=vae,
+            text_encoder=text_model,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+        )
+    else:
+        text_config = create_ldm_bert_config(original_config)
+        text_model = convert_ldm_bert_checkpoint(checkpoint, text_config)
+        tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
+        pipe = LDMTextToImagePipeline(vqvae=vae, bert=text_model, tokenizer=tokenizer, unet=unet, scheduler=scheduler)
+
+    pipe.save_pretrained(args.dump_path)
--- a/setup.py
+++ b/setup.py
@@ -80,7 +80,7 @@ from setuptools import find_packages, setup
 _deps = [
    "Pillow",  # keep the PIL.Image.Resampling deprecation away
    "accelerate>=0.11.0",
-    "black==22.12",
+    "black==22.8",
    "datasets",
    "filelock",
    "flake8>=3.8.3",
@@ -91,9 +91,9 @@ _deps = [
    "isort>=5.5.4",
    "jax>=0.2.8,!=0.3.2",
    "jaxlib>=0.1.65",
-    "Jinja2",
-    "k-diffusion>=0.0.12",
+    "k-diffusion",
    "librosa",
+    "modelcards>=0.1.4",
    "numpy",
    "parameterized",
    "pytest",
@@ -180,10 +180,9 @@ extras = {}
 extras = {}
 extras["quality"] = deps_list("black", "isort", "flake8", "hf-doc-builder")
 extras["docs"] = deps_list("hf-doc-builder")
-extras["training"] = deps_list("accelerate", "datasets", "tensorboard", "Jinja2")
+extras["training"] = deps_list("accelerate", "datasets", "tensorboard", "modelcards")
 extras["test"] = deps_list(
    "datasets",
-    "Jinja2",
    "k-diffusion",
    "librosa",
    "parameterized",
@@ -219,7 +218,7 @@ install_requires = [

 setup(
    name="diffusers",
-    version="0.12.1",  # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots)
+    version="0.12.0.dev0",  # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots)
    description="Diffusers",
    long_description=open("README.md", "r", encoding="utf-8").read(),
    long_description_content_type="text/markdown",
--- a/src/diffusers/init.py
+++ b/src/diffusers/init.py
@@ -1,4 +1,4 @@
-__version__ = "0.12.1"
+__version__ = "0.12.0.dev0"

 from .configuration_utils import ConfigMixin
 from .utils import (
@@ -6,7 +6,6 @@ from .utils import (
    is_flax_available,
    is_inflect_available,
    is_k_diffusion_available,
-    is_k_diffusion_version,
    is_librosa_available,
    is_onnx_available,
    is_scipy_available,
@@ -57,7 +56,6 @@ else:
        DDIMPipeline,
        DDPMPipeline,
        DiffusionPipeline,
-        DiTPipeline,
        ImagePipelineOutput,
        KarrasVePipeline,
        LDMPipeline,
@@ -114,7 +112,6 @@ else:
        StableDiffusionImg2ImgPipeline,
        StableDiffusionInpaintPipeline,
        StableDiffusionInpaintPipelineLegacy,
-        StableDiffusionInstructPix2PixPipeline,
        StableDiffusionPipeline,
        StableDiffusionPipelineSafe,
        StableDiffusionUpscalePipeline,
@@ -185,8 +182,4 @@ try:
 except OptionalDependencyNotAvailable:
    from .utils.dummy_flax_and_transformers_objects import *  # noqa F403
 else:
-    from .pipelines import (
-        FlaxStableDiffusionImg2ImgPipeline,
-        FlaxStableDiffusionInpaintPipeline,
-        FlaxStableDiffusionPipeline,
-    )
+    from .pipelines import FlaxStableDiffusionImg2ImgPipeline, FlaxStableDiffusionPipeline
--- a/src/diffusers/dependency_versions_check.py
+++ b/src/diffusers/dependency_versions_check.py
@@ -1,4 +1,4 @@
-# Copyright 2022 The HuggingFace Team. All rights reserved.
+# Copyright 2020 The HuggingFace Team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
--- a/src/diffusers/dependency_versions_table.py
+++ b/src/diffusers/dependency_versions_table.py
@@ -4,7 +4,7 @@
 deps = {
    "Pillow": "Pillow",
    "accelerate": "accelerate>=0.11.0",
-    "black": "black==22.12",
+    "black": "black==22.8",
    "datasets": "datasets",
    "filelock": "filelock",
    "flake8": "flake8>=3.8.3",
@@ -15,9 +15,9 @@ deps = {
    "isort": "isort>=5.5.4",
    "jax": "jax>=0.2.8,!=0.3.2",
    "jaxlib": "jaxlib>=0.1.65",
-    "Jinja2": "Jinja2",
-    "k-diffusion": "k-diffusion>=0.0.12",
+    "k-diffusion": "k-diffusion",
    "librosa": "librosa",
+    "modelcards": "modelcards>=0.1.4",
    "numpy": "numpy",
    "parameterized": "parameterized",
    "pytest": "pytest",
--- a/src/diffusers/loaders.py
+++ b/src/diffusers/loaders.py
@@ -1,243 +0,0 @@
-# Copyright 2022 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import os
-from collections import defaultdict
-from typing import Callable, Dict, Union
-
-import torch
-
-from .models.cross_attention import LoRACrossAttnProcessor
-from .models.modeling_utils import _get_model_file
-from .utils import DIFFUSERS_CACHE, HF_HUB_OFFLINE, logging
-
-
-logger = logging.get_logger(__name__)
-
-
-LORA_WEIGHT_NAME = "pytorch_lora_weights.bin"
-
-
-class AttnProcsLayers(torch.nn.Module):
-    def __init__(self, state_dict: Dict[str, torch.Tensor]):
-        super().__init__()
-        self.layers = torch.nn.ModuleList(state_dict.values())
-        self.mapping = {k: v for k, v in enumerate(state_dict.keys())}
-        self.rev_mapping = {v: k for k, v in enumerate(state_dict.keys())}
-
-        # we add a hook to state_dict() and load_state_dict() so that the
-        # naming fits with `unet.attn_processors`
-        def map_to(module, state_dict, *args, **kwargs):
-            new_state_dict = {}
-            for key, value in state_dict.items():
-                num = int(key.split(".")[1])  # 0 is always "layers"
-                new_key = key.replace(f"layers.{num}", module.mapping[num])
-                new_state_dict[new_key] = value
-
-            return new_state_dict
-
-        def map_from(module, state_dict, *args, **kwargs):
-            all_keys = list(state_dict.keys())
-            for key in all_keys:
-                replace_key = key.split(".processor")[0] + ".processor"
-                new_key = key.replace(replace_key, f"layers.{module.rev_mapping[replace_key]}")
-                state_dict[new_key] = state_dict[key]
-                del state_dict[key]
-
-        self._register_state_dict_hook(map_to)
-        self._register_load_state_dict_pre_hook(map_from, with_module=True)
-
-
-class UNet2DConditionLoadersMixin:
-    def load_attn_procs(self, pretrained_model_name_or_path_or_dict: Union[str, Dict[str, torch.Tensor]], **kwargs):
-        r"""
-        Load pretrained attention processor layers into `UNet2DConditionModel`. Attention processor layers have to be
-        defined in
-        [cross_attention.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py)
-        and be a `torch.nn.Module` class.
-
-        <Tip warning={true}>
-
-            This function is experimental and might change in the future
-
-        </Tip>
-
-        Parameters:
-            pretrained_model_name_or_path_or_dict (`str` or `os.PathLike` or `dict`):
-                Can be either:
-
-                    - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
-                      Valid model ids should have an organization name, like `google/ddpm-celebahq-256`.
-                    - A path to a *directory* containing model weights saved using [`~ModelMixin.save_config`], e.g.,
-                      `./my_model_directory/`.
-                    - A [torch state
-                      dict](https://pytorch.org/tutorials/beginner/saving_loading_models.html#what-is-a-state-dict).
-
-            cache_dir (`Union[str, os.PathLike]`, *optional*):
-                Path to a directory in which a downloaded pretrained model configuration should be cached if the
-                standard cache should not be used.
-            force_download (`bool`, *optional*, defaults to `False`):
-                Whether or not to force the (re-)download of the model weights and configuration files, overriding the
-                cached versions if they exist.
-            resume_download (`bool`, *optional*, defaults to `False`):
-                Whether or not to delete incompletely received files. Will attempt to resume the download if such a
-                file exists.
-            proxies (`Dict[str, str]`, *optional*):
-                A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
-                'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
-            local_files_only(`bool`, *optional*, defaults to `False`):
-                Whether or not to only look at local files (i.e., do not try to download the model).
-            use_auth_token (`str` or *bool*, *optional*):
-                The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated
-                when running `diffusers-cli login` (stored in `~/.huggingface`).
-            revision (`str`, *optional*, defaults to `"main"`):
-                The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
-                git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
-                identifier allowed by git.
-            subfolder (`str`, *optional*, defaults to `""`):
-                In case the relevant files are located inside a subfolder of the model repo (either remote in
-                huggingface.co or downloaded locally), you can specify the folder name here.
-
-            mirror (`str`, *optional*):
-                Mirror source to accelerate downloads in China. If you are from China and have an accessibility
-                problem, you can set this option to resolve it. Note that we do not guarantee the timeliness or safety.
-                Please refer to the mirror site for more information.
-
-        <Tip>
-
-         It is required to be logged in (`huggingface-cli login`) when you want to use private or [gated
-         models](https://huggingface.co/docs/hub/models-gated#gated-models).
-
-        </Tip>
-
-        <Tip>
-
-        Activate the special ["offline-mode"](https://huggingface.co/diffusers/installation.html#offline-mode) to use
-        this method in a firewalled environment.
-
-        </Tip>
-        """
-
-        cache_dir = kwargs.pop("cache_dir", DIFFUSERS_CACHE)
-        force_download = kwargs.pop("force_download", False)
-        resume_download = kwargs.pop("resume_download", False)
-        proxies = kwargs.pop("proxies", None)
-        local_files_only = kwargs.pop("local_files_only", HF_HUB_OFFLINE)
-        use_auth_token = kwargs.pop("use_auth_token", None)
-        revision = kwargs.pop("revision", None)
-        subfolder = kwargs.pop("subfolder", None)
-        weight_name = kwargs.pop("weight_name", LORA_WEIGHT_NAME)
-
-        user_agent = {
-            "file_type": "attn_procs_weights",
-            "framework": "pytorch",
-        }
-
-        if not isinstance(pretrained_model_name_or_path_or_dict, dict):
-            model_file = _get_model_file(
-                pretrained_model_name_or_path_or_dict,
-                weights_name=weight_name,
-                cache_dir=cache_dir,
-                force_download=force_download,
-                resume_download=resume_download,
-                proxies=proxies,
-                local_files_only=local_files_only,
-                use_auth_token=use_auth_token,
-                revision=revision,
-                subfolder=subfolder,
-                user_agent=user_agent,
-            )
-            state_dict = torch.load(model_file, map_location="cpu")
-        else:
-            state_dict = pretrained_model_name_or_path_or_dict
-
-        # fill attn processors
-        attn_processors = {}
-
-        is_lora = all("lora" in k for k in state_dict.keys())
-
-        if is_lora:
-            lora_grouped_dict = defaultdict(dict)
-            for key, value in state_dict.items():
-                attn_processor_key, sub_key = ".".join(key.split(".")[:-3]), ".".join(key.split(".")[-3:])
-                lora_grouped_dict[attn_processor_key][sub_key] = value
-
-            for key, value_dict in lora_grouped_dict.items():
-                rank = value_dict["to_k_lora.down.weight"].shape[0]
-                cross_attention_dim = value_dict["to_k_lora.down.weight"].shape[1]
-                hidden_size = value_dict["to_k_lora.up.weight"].shape[0]
-
-                attn_processors[key] = LoRACrossAttnProcessor(
-                    hidden_size=hidden_size, cross_attention_dim=cross_attention_dim, rank=rank
-                )
-                attn_processors[key].load_state_dict(value_dict)
-
-        else:
-            raise ValueError(f"{model_file} does not seem to be in the correct format expected by LoRA training.")
-
-        # set correct dtype & device
-        attn_processors = {k: v.to(device=self.device, dtype=self.dtype) for k, v in attn_processors.items()}
-
-        # set layers
-        self.set_attn_processor(attn_processors)
-
-    def save_attn_procs(
-        self,
-        save_directory: Union[str, os.PathLike],
-        is_main_process: bool = True,
-        weights_name: str = LORA_WEIGHT_NAME,
-        save_function: Callable = None,
-    ):
-        r"""
-        Save an attention procesor to a directory, so that it can be re-loaded using the
-        `[`~loaders.UNet2DConditionLoadersMixin.load_attn_procs`]` method.
-
-        Arguments:
-            save_directory (`str` or `os.PathLike`):
-                Directory to which to save. Will be created if it doesn't exist.
-            is_main_process (`bool`, *optional*, defaults to `True`):
-                Whether the process calling this is the main process or not. Useful when in distributed training like
-                TPUs and need to call this function on all processes. In this case, set `is_main_process=True` only on
-                the main process to avoid race conditions.
-            save_function (`Callable`):
-                The function to use to save the state dictionary. Useful on distributed training like TPUs when one
-                need to replace `torch.save` by another method. Can be configured with the environment variable
-                `DIFFUSERS_SAVE_MODE`.
-        """
-        if os.path.isfile(save_directory):
-            logger.error(f"Provided path ({save_directory}) should be a directory, not a file")
-            return
-
-        if save_function is None:
-            save_function = torch.save
-
-        os.makedirs(save_directory, exist_ok=True)
-
-        model_to_save = AttnProcsLayers(self.attn_processors)
-
-        # Save the model
-        state_dict = model_to_save.state_dict()
-
-        # Clean the folder from a previous save
-        for filename in os.listdir(save_directory):
-            full_filename = os.path.join(save_directory, filename)
-            # If we have a shard file that is not going to be replaced, we delete it, but only from the main process
-            # in distributed settings to avoid race conditions.
-            weights_no_suffix = weights_name.replace(".bin", "")
-            if filename.startswith(weights_no_suffix) and os.path.isfile(full_filename) and is_main_process:
-                os.remove(full_filename)
-
-        # Save the model
-        save_function(state_dict, os.path.join(save_directory, weights_name))
-
-        logger.info(f"Model weights saved in {os.path.join(save_directory, weights_name)}")
--- a/src/diffusers/models/attention.py
+++ b/src/diffusers/models/attention.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import math
-from typing import Callable, Optional
+from typing import Optional

 import torch
 import torch.nn.functional as F
@@ -20,7 +20,6 @@ from torch import nn

 from ..utils.import_utils import is_xformers_available
 from .cross_attention import CrossAttention
-from .embeddings import CombinedTimestepLabelEmbeddings


 if is_xformers_available():
@@ -72,7 +71,6 @@ class AttentionBlock(nn.Module):
        self.proj_attn = nn.Linear(channels, channels, 1)

        self._use_memory_efficient_attention_xformers = False
-        self._attention_op = None

    def reshape_heads_to_batch_dim(self, tensor):
        batch_size, seq_len, dim = tensor.shape
@@ -88,16 +86,12 @@ class AttentionBlock(nn.Module):
        tensor = tensor.permute(0, 2, 1, 3).reshape(batch_size // head_size, seq_len, dim * head_size)
        return tensor

-    def set_use_memory_efficient_attention_xformers(
-        self, use_memory_efficient_attention_xformers: bool, attention_op: Optional[Callable] = None
-    ):
+    def set_use_memory_efficient_attention_xformers(self, use_memory_efficient_attention_xformers: bool):
        if use_memory_efficient_attention_xformers:
            if not is_xformers_available():
                raise ModuleNotFoundError(
-                    (
-                        "Refer to https://github.com/facebookresearch/xformers for more information on how to install"
-                        " xformers"
-                    ),
+                    "Refer to https://github.com/facebookresearch/xformers for more information on how to install"
+                    " xformers",
                    name="xformers",
                )
            elif not torch.cuda.is_available():
@@ -116,7 +110,6 @@ class AttentionBlock(nn.Module):
                except Exception as e:
                    raise e
        self._use_memory_efficient_attention_xformers = use_memory_efficient_attention_xformers
-        self._attention_op = attention_op

    def forward(self, hidden_states):
        residual = hidden_states
@@ -140,9 +133,7 @@ class AttentionBlock(nn.Module):

        if self._use_memory_efficient_attention_xformers:
            # Memory efficient attention
-            hidden_states = xformers.ops.memory_efficient_attention(
-                query_proj, key_proj, value_proj, attn_bias=None, op=self._attention_op
-            )
+            hidden_states = xformers.ops.memory_efficient_attention(query_proj, key_proj, value_proj, attn_bias=None)
            hidden_states = hidden_states.to(query_proj.dtype)
        else:
            attention_scores = torch.baddbmm(
@@ -203,21 +194,10 @@ class BasicTransformerBlock(nn.Module):
        attention_bias: bool = False,
        only_cross_attention: bool = False,
        upcast_attention: bool = False,
-        norm_elementwise_affine: bool = True,
-        norm_type: str = "layer_norm",
-        final_dropout: bool = False,
    ):
        super().__init__()
        self.only_cross_attention = only_cross_attention
-
-        self.use_ada_layer_norm_zero = (num_embeds_ada_norm is not None) and norm_type == "ada_norm_zero"
-        self.use_ada_layer_norm = (num_embeds_ada_norm is not None) and norm_type == "ada_norm"
-
-        if norm_type in ("ada_norm", "ada_norm_zero") and num_embeds_ada_norm is None:
-            raise ValueError(
-                f"`norm_type` is set to {norm_type}, but `num_embeds_ada_norm` is not defined. Please make sure to"
-                f" define `num_embeds_ada_norm` if setting `norm_type` to {norm_type}."
-            )
+        self.use_ada_layer_norm = num_embeds_ada_norm is not None

        # 1. Self-Attn
        self.attn1 = CrossAttention(
@@ -230,7 +210,7 @@ class BasicTransformerBlock(nn.Module):
            upcast_attention=upcast_attention,
        )

-        self.ff = FeedForward(dim, dropout=dropout, activation_fn=activation_fn, final_dropout=final_dropout)
+        self.ff = FeedForward(dim, dropout=dropout, activation_fn=activation_fn)

        # 2. Cross-Attn
        if cross_attention_dim is not None:
@@ -246,27 +226,15 @@ class BasicTransformerBlock(nn.Module):
        else:
            self.attn2 = None

-        if self.use_ada_layer_norm:
-            self.norm1 = AdaLayerNorm(dim, num_embeds_ada_norm)
-        elif self.use_ada_layer_norm_zero:
-            self.norm1 = AdaLayerNormZero(dim, num_embeds_ada_norm)
-        else:
-            self.norm1 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine)
+        self.norm1 = AdaLayerNorm(dim, num_embeds_ada_norm) if self.use_ada_layer_norm else nn.LayerNorm(dim)

        if cross_attention_dim is not None:
-            # We currently only use AdaLayerNormZero for self attention where there will only be one attention block.
-            # I.e. the number of returned modulation chunks from AdaLayerZero would not make sense if returned during
-            # the second cross attention block.
-            self.norm2 = (
-                AdaLayerNorm(dim, num_embeds_ada_norm)
-                if self.use_ada_layer_norm
-                else nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine)
-            )
+            self.norm2 = AdaLayerNorm(dim, num_embeds_ada_norm) if self.use_ada_layer_norm else nn.LayerNorm(dim)
        else:
            self.norm2 = None

        # 3. Feed-forward
-        self.norm3 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine)
+        self.norm3 = nn.LayerNorm(dim)

    def forward(
        self,
@@ -275,18 +243,11 @@ class BasicTransformerBlock(nn.Module):
        timestep=None,
        attention_mask=None,
        cross_attention_kwargs=None,
-        class_labels=None,
    ):
-        if self.use_ada_layer_norm:
-            norm_hidden_states = self.norm1(hidden_states, timestep)
-        elif self.use_ada_layer_norm_zero:
-            norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1(
-                hidden_states, timestep, class_labels, hidden_dtype=hidden_states.dtype
-            )
-        else:
-            norm_hidden_states = self.norm1(hidden_states)
-
        # 1. Self-Attention
+        norm_hidden_states = (
+            self.norm1(hidden_states, timestep) if self.use_ada_layer_norm else self.norm1(hidden_states)
+        )
        cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {}
        attn_output = self.attn1(
            norm_hidden_states,
@@ -294,16 +255,13 @@ class BasicTransformerBlock(nn.Module):
            attention_mask=attention_mask,
            **cross_attention_kwargs,
        )
-        if self.use_ada_layer_norm_zero:
-            attn_output = gate_msa.unsqueeze(1) * attn_output
        hidden_states = attn_output + hidden_states

        if self.attn2 is not None:
+            # 2. Cross-Attention
            norm_hidden_states = (
                self.norm2(hidden_states, timestep) if self.use_ada_layer_norm else self.norm2(hidden_states)
            )
-
-            # 2. Cross-Attention
            attn_output = self.attn2(
                norm_hidden_states,
                encoder_hidden_states=encoder_hidden_states,
@@ -313,17 +271,7 @@ class BasicTransformerBlock(nn.Module):
            hidden_states = attn_output + hidden_states

        # 3. Feed-forward
-        norm_hidden_states = self.norm3(hidden_states)
-
-        if self.use_ada_layer_norm_zero:
-            norm_hidden_states = norm_hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None]
-
-        ff_output = self.ff(norm_hidden_states)
-
-        if self.use_ada_layer_norm_zero:
-            ff_output = gate_mlp.unsqueeze(1) * ff_output
-
-        hidden_states = ff_output + hidden_states
+        hidden_states = self.ff(self.norm3(hidden_states)) + hidden_states

        return hidden_states

@@ -338,7 +286,6 @@ class FeedForward(nn.Module):
        mult (`int`, *optional*, defaults to 4): The multiplier to use for the hidden dimension.
        dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
        activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward.
-        final_dropout (`bool` *optional*, defaults to False): Apply a final dropout.
    """

    def __init__(
@@ -348,7 +295,6 @@ class FeedForward(nn.Module):
        mult: int = 4,
        dropout: float = 0.0,
        activation_fn: str = "geglu",
-        final_dropout: bool = False,
    ):
        super().__init__()
        inner_dim = int(dim * mult)
@@ -356,8 +302,6 @@ class FeedForward(nn.Module):

        if activation_fn == "gelu":
            act_fn = GELU(dim, inner_dim)
-        if activation_fn == "gelu-approximate":
-            act_fn = GELU(dim, inner_dim, approximate="tanh")
        elif activation_fn == "geglu":
            act_fn = GEGLU(dim, inner_dim)
        elif activation_fn == "geglu-approximate":
@@ -370,9 +314,6 @@ class FeedForward(nn.Module):
        self.net.append(nn.Dropout(dropout))
        # project out
        self.net.append(nn.Linear(inner_dim, dim_out))
-        # FF as used in Vision Transformer, MLP-Mixer, etc. have a final dropout
-        if final_dropout:
-            self.net.append(nn.Dropout(dropout))

    def forward(self, hidden_states):
        for module in self.net:
@@ -382,19 +323,18 @@ class FeedForward(nn.Module):

 class GELU(nn.Module):
    r"""
-    GELU activation function with tanh approximation support with `approximate="tanh"`.
+    GELU activation function
    """

-    def __init__(self, dim_in: int, dim_out: int, approximate: str = "none"):
+    def __init__(self, dim_in: int, dim_out: int):
        super().__init__()
        self.proj = nn.Linear(dim_in, dim_out)
-        self.approximate = approximate

    def gelu(self, gate):
        if gate.device.type != "mps":
-            return F.gelu(gate, approximate=self.approximate)
+            return F.gelu(gate)
        # mps: gelu is not implemented for float16
-        return F.gelu(gate.to(dtype=torch.float32), approximate=self.approximate).to(dtype=gate.dtype)
+        return F.gelu(gate.to(dtype=torch.float32)).to(dtype=gate.dtype)

    def forward(self, hidden_states):
        hidden_states = self.proj(hidden_states)
@@ -402,6 +342,7 @@ class GELU(nn.Module):
        return hidden_states


+# feedforward
 class GEGLU(nn.Module):
    r"""
    A variant of the gated linear unit activation function from https://arxiv.org/abs/2002.05202.
@@ -459,24 +400,3 @@ class AdaLayerNorm(nn.Module):
        scale, shift = torch.chunk(emb, 2)
        x = self.norm(x) * (1 + scale) + shift
        return x
-
-
-class AdaLayerNormZero(nn.Module):
-    """
-    Norm layer adaptive layer norm zero (adaLN-Zero).
-    """
-
-    def __init__(self, embedding_dim, num_embeddings):
-        super().__init__()
-
-        self.emb = CombinedTimestepLabelEmbeddings(num_embeddings, embedding_dim)
-
-        self.silu = nn.SiLU()
-        self.linear = nn.Linear(embedding_dim, 6 * embedding_dim, bias=True)
-        self.norm = nn.LayerNorm(embedding_dim, elementwise_affine=False, eps=1e-6)
-
-    def forward(self, x, timestep, class_labels, hidden_dtype=None):
-        emb = self.linear(self.silu(self.emb(timestep, class_labels, hidden_dtype=hidden_dtype)))
-        shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = emb.chunk(6, dim=1)
-        x = self.norm(x) * (1 + scale_msa[:, None]) + shift_msa[:, None]
-        return x, gate_msa, shift_mlp, scale_mlp, gate_mlp
--- a/src/diffusers/models/cross_attention.py
+++ b/src/diffusers/models/cross_attention.py
@@ -11,7 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from typing import Callable, Optional, Union
+from typing import Optional, Union

 import torch
 import torch.nn.functional as F
@@ -93,9 +93,7 @@ class CrossAttention(nn.Module):
        processor = processor if processor is not None else CrossAttnProcessor()
        self.set_processor(processor)

-    def set_use_memory_efficient_attention_xformers(
-        self, use_memory_efficient_attention_xformers: bool, attention_op: Optional[Callable] = None
-    ):
+    def set_use_memory_efficient_attention_xformers(self, use_memory_efficient_attention_xformers: bool):
        if use_memory_efficient_attention_xformers:
            if self.added_kv_proj_dim is not None:
                # TODO(Anton, Patrick, Suraj, William) - currently xformers doesn't work for UnCLIP
@@ -107,10 +105,8 @@ class CrossAttention(nn.Module):
                )
            elif not is_xformers_available():
                raise ModuleNotFoundError(
-                    (
-                        "Refer to https://github.com/facebookresearch/xformers for more information on how to install"
-                        " xformers"
-                    ),
+                    "Refer to https://github.com/facebookresearch/xformers for more information on how to install"
+                    " xformers",
                    name="xformers",
                )
            elif not torch.cuda.is_available():
@@ -129,7 +125,7 @@ class CrossAttention(nn.Module):
                except Exception as e:
                    raise e

-            processor = XFormersCrossAttnProcessor(attention_op=attention_op)
+            processor = XFormersCrossAttnProcessor()
        else:
            processor = CrossAttnProcessor()

@@ -210,14 +206,7 @@ class CrossAttention(nn.Module):
            return attention_mask

        if attention_mask.shape[-1] != target_length:
-            if attention_mask.device.type == "mps":
-                # HACK: MPS: Does not support padding by greater than dimension of input tensor.
-                # Instead, we can manually construct the padding tensor.
-                padding_shape = (attention_mask.shape[0], attention_mask.shape[1], target_length)
-                padding = torch.zeros(padding_shape, device=attention_mask.device)
-                attention_mask = torch.concat([attention_mask, padding], dim=2)
-            else:
-                attention_mask = F.pad(attention_mask, (0, target_length), value=0.0)
+            attention_mask = F.pad(attention_mask, (0, target_length), value=0.0)
            attention_mask = attention_mask.repeat_interleave(head_size, dim=0)
        return attention_mask

@@ -248,68 +237,6 @@ class CrossAttnProcessor:
        return hidden_states


-class LoRALinearLayer(nn.Module):
-    def __init__(self, in_features, out_features, rank=4):
-        super().__init__()
-
-        if rank > min(in_features, out_features):
-            raise ValueError(f"LoRA rank {rank} must be less or equal than {min(in_features, out_features)}")
-
-        self.down = nn.Linear(in_features, rank, bias=False)
-        self.up = nn.Linear(rank, out_features, bias=False)
-        self.scale = 1.0
-
-        nn.init.normal_(self.down.weight, std=1 / rank)
-        nn.init.zeros_(self.up.weight)
-
-    def forward(self, hidden_states):
-        orig_dtype = hidden_states.dtype
-        dtype = self.down.weight.dtype
-
-        down_hidden_states = self.down(hidden_states.to(dtype))
-        up_hidden_states = self.up(down_hidden_states)
-
-        return up_hidden_states.to(orig_dtype)
-
-
-class LoRACrossAttnProcessor(nn.Module):
-    def __init__(self, hidden_size, cross_attention_dim=None, rank=4):
-        super().__init__()
-
-        self.to_q_lora = LoRALinearLayer(hidden_size, hidden_size)
-        self.to_k_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size)
-        self.to_v_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size)
-        self.to_out_lora = LoRALinearLayer(hidden_size, hidden_size)
-
-    def __call__(
-        self, attn: CrossAttention, hidden_states, encoder_hidden_states=None, attention_mask=None, scale=1.0
-    ):
-        batch_size, sequence_length, _ = hidden_states.shape
-        attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length)
-
-        query = attn.to_q(hidden_states) + scale * self.to_q_lora(hidden_states)
-        query = attn.head_to_batch_dim(query)
-
-        encoder_hidden_states = encoder_hidden_states if encoder_hidden_states is not None else hidden_states
-
-        key = attn.to_k(encoder_hidden_states) + scale * self.to_k_lora(encoder_hidden_states)
-        value = attn.to_v(encoder_hidden_states) + scale * self.to_v_lora(encoder_hidden_states)
-
-        key = attn.head_to_batch_dim(key)
-        value = attn.head_to_batch_dim(value)
-
-        attention_probs = attn.get_attention_scores(query, key, attention_mask)
-        hidden_states = torch.bmm(attention_probs, value)
-        hidden_states = attn.batch_to_head_dim(hidden_states)
-
-        # linear proj
-        hidden_states = attn.to_out[0](hidden_states) + scale * self.to_out_lora(hidden_states)
-        # dropout
-        hidden_states = attn.to_out[1](hidden_states)
-
-        return hidden_states
-
-
 class CrossAttnAddedKVProcessor:
    def __call__(self, attn: CrossAttention, hidden_states, encoder_hidden_states=None, attention_mask=None):
        residual = hidden_states
@@ -353,9 +280,6 @@ class CrossAttnAddedKVProcessor:


 class XFormersCrossAttnProcessor:
-    def __init__(self, attention_op: Optional[Callable] = None):
-        self.attention_op = attention_op
-
    def __call__(self, attn: CrossAttention, hidden_states, encoder_hidden_states=None, attention_mask=None):
        batch_size, sequence_length, _ = hidden_states.shape

@@ -371,9 +295,7 @@ class XFormersCrossAttnProcessor:
        key = attn.head_to_batch_dim(key).contiguous()
        value = attn.head_to_batch_dim(value).contiguous()

-        hidden_states = xformers.ops.memory_efficient_attention(
-            query, key, value, attn_bias=attention_mask, op=self.attention_op
-        )
+        hidden_states = xformers.ops.memory_efficient_attention(query, key, value, attn_bias=attention_mask)
        hidden_states = hidden_states.to(query.dtype)
        hidden_states = attn.batch_to_head_dim(hidden_states)

@@ -381,41 +303,6 @@ class XFormersCrossAttnProcessor:
        hidden_states = attn.to_out[0](hidden_states)
        # dropout
        hidden_states = attn.to_out[1](hidden_states)
-        return hidden_states
-
-
-class LoRAXFormersCrossAttnProcessor(nn.Module):
-    def __init__(self, hidden_size, cross_attention_dim, rank=4):
-        super().__init__()
-
-        self.to_q_lora = LoRALinearLayer(hidden_size, hidden_size)
-        self.to_k_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size)
-        self.to_v_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size)
-        self.to_out_lora = LoRALinearLayer(hidden_size, hidden_size)
-
-    def __call__(
-        self, attn: CrossAttention, hidden_states, encoder_hidden_states=None, attention_mask=None, scale=1.0
-    ):
-        batch_size, sequence_length, _ = hidden_states.shape
-        attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length)
-
-        query = attn.to_q(hidden_states) + scale * self.to_q_lora(hidden_states)
-        query = attn.head_to_batch_dim(query).contiguous()
-
-        encoder_hidden_states = encoder_hidden_states if encoder_hidden_states is not None else hidden_states
-
-        key = attn.to_k(encoder_hidden_states) + scale * self.to_k_lora(encoder_hidden_states)
-        value = attn.to_v(encoder_hidden_states) + scale * self.to_v_lora(encoder_hidden_states)
-
-        key = attn.head_to_batch_dim(key).contiguous()
-        value = attn.head_to_batch_dim(value).contiguous()
-
-        hidden_states = xformers.ops.memory_efficient_attention(query, key, value, attn_bias=attention_mask)
-
-        # linear proj
-        hidden_states = attn.to_out[0](hidden_states) + scale * self.to_out_lora(hidden_states)
-        # dropout
-        hidden_states = attn.to_out[1](hidden_states)

        return hidden_states

--- a/src/diffusers/models/embeddings.py
+++ b/src/diffusers/models/embeddings.py
@@ -61,96 +61,6 @@ def get_timestep_embedding(
    return emb


-def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False, extra_tokens=0):
-    """
-    grid_size: int of the grid height and width return: pos_embed: [grid_size*grid_size, embed_dim] or
-    [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token)
-    """
-    grid_h = np.arange(grid_size, dtype=np.float32)
-    grid_w = np.arange(grid_size, dtype=np.float32)
-    grid = np.meshgrid(grid_w, grid_h)  # here w goes first
-    grid = np.stack(grid, axis=0)
-
-    grid = grid.reshape([2, 1, grid_size, grid_size])
-    pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
-    if cls_token and extra_tokens > 0:
-        pos_embed = np.concatenate([np.zeros([extra_tokens, embed_dim]), pos_embed], axis=0)
-    return pos_embed
-
-
-def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
-    if embed_dim % 2 != 0:
-        raise ValueError("embed_dim must be divisible by 2")
-
-    # use half of dimensions to encode grid_h
-    emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0])  # (H*W, D/2)
-    emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1])  # (H*W, D/2)
-
-    emb = np.concatenate([emb_h, emb_w], axis=1)  # (H*W, D)
-    return emb
-
-
-def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
-    """
-    embed_dim: output dimension for each position pos: a list of positions to be encoded: size (M,) out: (M, D)
-    """
-    if embed_dim % 2 != 0:
-        raise ValueError("embed_dim must be divisible by 2")
-
-    omega = np.arange(embed_dim // 2, dtype=np.float64)
-    omega /= embed_dim / 2.0
-    omega = 1.0 / 10000**omega  # (D/2,)
-
-    pos = pos.reshape(-1)  # (M,)
-    out = np.einsum("m,d->md", pos, omega)  # (M, D/2), outer product
-
-    emb_sin = np.sin(out)  # (M, D/2)
-    emb_cos = np.cos(out)  # (M, D/2)
-
-    emb = np.concatenate([emb_sin, emb_cos], axis=1)  # (M, D)
-    return emb
-
-
-class PatchEmbed(nn.Module):
-    """2D Image to Patch Embedding"""
-
-    def __init__(
-        self,
-        height=224,
-        width=224,
-        patch_size=16,
-        in_channels=3,
-        embed_dim=768,
-        layer_norm=False,
-        flatten=True,
-        bias=True,
-    ):
-        super().__init__()
-
-        num_patches = (height // patch_size) * (width // patch_size)
-        self.flatten = flatten
-        self.layer_norm = layer_norm
-
-        self.proj = nn.Conv2d(
-            in_channels, embed_dim, kernel_size=(patch_size, patch_size), stride=patch_size, bias=bias
-        )
-        if layer_norm:
-            self.norm = nn.LayerNorm(embed_dim, elementwise_affine=False, eps=1e-6)
-        else:
-            self.norm = None
-
-        pos_embed = get_2d_sincos_pos_embed(embed_dim, int(num_patches**0.5))
-        self.register_buffer("pos_embed", torch.from_numpy(pos_embed).float().unsqueeze(0), persistent=False)
-
-    def forward(self, latent):
-        latent = self.proj(latent)
-        if self.flatten:
-            latent = latent.flatten(2).transpose(1, 2)  # BCHW -> BNC
-        if self.layer_norm:
-            latent = self.norm(latent)
-        return latent + self.pos_embed
-
-
 class TimestepEmbedding(nn.Module):
    def __init__(self, in_channels: int, time_embed_dim: int, act_fn: str = "silu", out_dim: int = None):
        super().__init__()
@@ -288,58 +198,3 @@ class ImagePositionalEmbeddings(nn.Module):
        emb = emb + pos_emb[:, : emb.shape[1], :]

        return emb
-
-
-class LabelEmbedding(nn.Module):
-    """
-    Embeds class labels into vector representations. Also handles label dropout for classifier-free guidance.
-
-    Args:
-        num_classes (`int`): The number of classes.
-        hidden_size (`int`): The size of the vector embeddings.
-        dropout_prob (`float`): The probability of dropping a label.
-    """
-
-    def __init__(self, num_classes, hidden_size, dropout_prob):
-        super().__init__()
-        use_cfg_embedding = dropout_prob > 0
-        self.embedding_table = nn.Embedding(num_classes + use_cfg_embedding, hidden_size)
-        self.num_classes = num_classes
-        self.dropout_prob = dropout_prob
-
-    def token_drop(self, labels, force_drop_ids=None):
-        """
-        Drops labels to enable classifier-free guidance.
-        """
-        if force_drop_ids is None:
-            drop_ids = torch.rand(labels.shape[0], device=labels.device) < self.dropout_prob
-        else:
-            drop_ids = torch.tensor(force_drop_ids == 1)
-        labels = torch.where(drop_ids, self.num_classes, labels)
-        return labels
-
-    def forward(self, labels, force_drop_ids=None):
-        use_dropout = self.dropout_prob > 0
-        if (self.training and use_dropout) or (force_drop_ids is not None):
-            labels = self.token_drop(labels, force_drop_ids)
-        embeddings = self.embedding_table(labels)
-        return embeddings
-
-
-class CombinedTimestepLabelEmbeddings(nn.Module):
-    def __init__(self, num_classes, embedding_dim, class_dropout_prob=0.1):
-        super().__init__()
-
-        self.time_proj = Timesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=1)
-        self.timestep_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=embedding_dim)
-        self.class_embedder = LabelEmbedding(num_classes, embedding_dim, class_dropout_prob)
-
-    def forward(self, timestep, class_labels, hidden_dtype=None):
-        timesteps_proj = self.time_proj(timestep)
-        timesteps_emb = self.timestep_embedder(timesteps_proj.to(dtype=hidden_dtype))  # (N, D)
-
-        class_labels = self.class_embedder(class_labels)  # (N, D)
-
-        conditioning = timesteps_emb + class_labels  # (N, D)
-
-        return conditioning
--- a/src/diffusers/models/modeling_pytorch_flax_utils.py
+++ b/src/diffusers/models/modeling_pytorch_flax_utils.py
@@ -1,156 +0,0 @@
-# coding=utf-8
-# Copyright 2022 The HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" PyTorch - Flax general utilities."""
-
-from pickle import UnpicklingError
-
-import numpy as np
-
-import jax
-import jax.numpy as jnp
-from flax.serialization import from_bytes
-from flax.traverse_util import flatten_dict
-
-from ..utils import logging
-
-
-logger = logging.get_logger(__name__)
-
-
-#####################
-# Flax => PyTorch #
-#####################
-
-
-# from https://github.com/huggingface/transformers/blob/main/src/transformers/modeling_flax_pytorch_utils.py#L224-L352
-def load_flax_checkpoint_in_pytorch_model(pt_model, model_file):
-    try:
-        with open(model_file, "rb") as flax_state_f:
-            flax_state = from_bytes(None, flax_state_f.read())
-    except UnpicklingError as e:
-        try:
-            with open(model_file) as f:
-                if f.read().startswith("version"):
-                    raise OSError(
-                        "You seem to have cloned a repository without having git-lfs installed. Please"
-                        " install git-lfs and run `git lfs install` followed by `git lfs pull` in the"
-                        " folder you cloned."
-                    )
-                else:
-                    raise ValueError from e
-        except (UnicodeDecodeError, ValueError):
-            raise EnvironmentError(f"Unable to convert {model_file} to Flax deserializable object. ")
-
-    return load_flax_weights_in_pytorch_model(pt_model, flax_state)
-
-
-def load_flax_weights_in_pytorch_model(pt_model, flax_state):
-    """Load flax checkpoints in a PyTorch model"""
-
-    try:
-        import torch  # noqa: F401
-    except ImportError:
-        logger.error(
-            "Loading Flax weights in PyTorch requires both PyTorch and Flax to be installed. Please see"
-            " https://pytorch.org/ and https://flax.readthedocs.io/en/latest/installation.html for installation"
-            " instructions."
-        )
-        raise
-
-    # check if we have bf16 weights
-    is_type_bf16 = flatten_dict(jax.tree_util.tree_map(lambda x: x.dtype == jnp.bfloat16, flax_state)).values()
-    if any(is_type_bf16):
-        # convert all weights to fp32 if they are bf16 since torch.from_numpy can-not handle bf16
-
-        # and bf16 is not fully supported in PT yet.
-        logger.warning(
-            "Found ``bfloat16`` weights in Flax model. Casting all ``bfloat16`` weights to ``float32`` "
-            "before loading those in PyTorch model."
-        )
-        flax_state = jax.tree_util.tree_map(
-            lambda params: params.astype(np.float32) if params.dtype == jnp.bfloat16 else params, flax_state
-        )
-
-    pt_model.base_model_prefix = ""
-
-    flax_state_dict = flatten_dict(flax_state, sep=".")
-    pt_model_dict = pt_model.state_dict()
-
-    # keep track of unexpected & missing keys
-    unexpected_keys = []
-    missing_keys = set(pt_model_dict.keys())
-
-    for flax_key_tuple, flax_tensor in flax_state_dict.items():
-        flax_key_tuple_array = flax_key_tuple.split(".")
-
-        if flax_key_tuple_array[-1] == "kernel" and flax_tensor.ndim == 4:
-            flax_key_tuple_array = flax_key_tuple_array[:-1] + ["weight"]
-            flax_tensor = jnp.transpose(flax_tensor, (3, 2, 0, 1))
-        elif flax_key_tuple_array[-1] == "kernel":
-            flax_key_tuple_array = flax_key_tuple_array[:-1] + ["weight"]
-            flax_tensor = flax_tensor.T
-        elif flax_key_tuple_array[-1] == "scale":
-            flax_key_tuple_array = flax_key_tuple_array[:-1] + ["weight"]
-
-        if "time_embedding" not in flax_key_tuple_array:
-            for i, flax_key_tuple_string in enumerate(flax_key_tuple_array):
-                flax_key_tuple_array[i] = (
-                    flax_key_tuple_string.replace("_0", ".0")
-                    .replace("_1", ".1")
-                    .replace("_2", ".2")
-                    .replace("_3", ".3")
-                )
-
-        flax_key = ".".join(flax_key_tuple_array)
-
-        if flax_key in pt_model_dict:
-            if flax_tensor.shape != pt_model_dict[flax_key].shape:
-                raise ValueError(
-                    f"Flax checkpoint seems to be incorrect. Weight {flax_key_tuple} was expected "
-                    f"to be of shape {pt_model_dict[flax_key].shape}, but is {flax_tensor.shape}."
-                )
-            else:
-                # add weight to pytorch dict
-                flax_tensor = np.asarray(flax_tensor) if not isinstance(flax_tensor, np.ndarray) else flax_tensor
-                pt_model_dict[flax_key] = torch.from_numpy(flax_tensor)
-                # remove from missing keys
-                missing_keys.remove(flax_key)
-        else:
-            # weight is not expected by PyTorch model
-            unexpected_keys.append(flax_key)
-
-    pt_model.load_state_dict(pt_model_dict)
-
-    # re-transform missing_keys to list
-    missing_keys = list(missing_keys)
-
-    if len(unexpected_keys) > 0:
-        logger.warning(
-            "Some weights of the Flax model were not used when initializing the PyTorch model"
-            f" {pt_model.__class__.__name__}: {unexpected_keys}\n- This IS expected if you are initializing"
-            f" {pt_model.__class__.__name__} from a Flax model trained on another task or with another architecture"
-            " (e.g. initializing a BertForSequenceClassification model from a FlaxBertForPreTraining model).\n- This"
-            f" IS NOT expected if you are initializing {pt_model.__class__.__name__} from a Flax model that you expect"
-            " to be exactly identical (e.g. initializing a BertForSequenceClassification model from a"
-            " FlaxBertForSequenceClassification model)."
-        )
-    if len(missing_keys) > 0:
-        logger.warning(
-            f"Some weights of {pt_model.__class__.__name__} were not initialized from the Flax model and are newly"
-            f" initialized: {missing_keys}\nYou should probably TRAIN this model on a down-stream task to be able to"
-            " use it for predictions and inference."
-        )
-
-    return pt_model
--- a/src/diffusers/models/modeling_utils.py
+++ b/src/diffusers/models/modeling_utils.py
@@ -30,7 +30,6 @@ from .. import __version__
 from ..utils import (
    CONFIG_NAME,
    DIFFUSERS_CACHE,
-    FLAX_WEIGHTS_NAME,
    HF_HUB_OFFLINE,
    HUGGINGFACE_CO_RESOLVE_ENDPOINT,
    SAFETENSORS_WEIGHTS_NAME,
@@ -190,15 +189,13 @@ class ModelMixin(torch.nn.Module):
        if self._supports_gradient_checkpointing:
            self.apply(partial(self._set_gradient_checkpointing, value=False))

-    def set_use_memory_efficient_attention_xformers(
-        self, valid: bool, attention_op: Optional[Callable] = None
-    ) -> None:
+    def set_use_memory_efficient_attention_xformers(self, valid: bool) -> None:
        # Recursively walk through all the children.
        # Any children which exposes the set_use_memory_efficient_attention_xformers method
        # gets the message
        def fn_recursive_set_mem_eff(module: torch.nn.Module):
            if hasattr(module, "set_use_memory_efficient_attention_xformers"):
-                module.set_use_memory_efficient_attention_xformers(valid, attention_op)
+                module.set_use_memory_efficient_attention_xformers(valid)

            for child in module.children():
                fn_recursive_set_mem_eff(child)
@@ -207,7 +204,7 @@ class ModelMixin(torch.nn.Module):
            if isinstance(module, torch.nn.Module):
                fn_recursive_set_mem_eff(module)

-    def enable_xformers_memory_efficient_attention(self, attention_op: Optional[Callable] = None):
+    def enable_xformers_memory_efficient_attention(self):
        r"""
        Enable memory efficient attention as implemented in xformers.

@@ -216,28 +213,8 @@ class ModelMixin(torch.nn.Module):

        Warning: When Memory Efficient Attention and Sliced attention are both enabled, the Memory Efficient Attention
        is used.
-
-        Parameters:
-            attention_op (`Callable`, *optional*):
-                Override the default `None` operator for use as `op` argument to the
-                [`memory_efficient_attention()`](https://facebookresearch.github.io/xformers/components/ops.html#xformers.ops.memory_efficient_attention)
-                function of xFormers.
-
-        Examples:
-
-        ```py
-        >>> import torch
-        >>> from diffusers import UNet2DConditionModel
-        >>> from xformers.ops import MemoryEfficientAttentionFlashAttentionOp
-
-        >>> model = UNet2DConditionModel.from_pretrained(
-        ...     "stabilityai/stable-diffusion-2-1", subfolder="unet", torch_dtype=torch.float16
-        ... )
-        >>> model = model.to("cuda")
-        >>> model.enable_xformers_memory_efficient_attention(attention_op=MemoryEfficientAttentionFlashAttentionOp)
-        ```
        """
-        self.set_use_memory_efficient_attention_xformers(True, attention_op)
+        self.set_use_memory_efficient_attention_xformers(True)

    def disable_xformers_memory_efficient_attention(self):
        r"""
@@ -294,6 +271,15 @@ class ModelMixin(torch.nn.Module):

        weights_name = SAFETENSORS_WEIGHTS_NAME if safe_serialization else WEIGHTS_NAME

+        # Clean the folder from a previous save
+        for filename in os.listdir(save_directory):
+            full_filename = os.path.join(save_directory, filename)
+            # If we have a shard file that is not going to be replaced, we delete it, but only from the main process
+            # in distributed settings to avoid race conditions.
+            weights_no_suffix = weights_name.replace(".bin", "").replace(".safetensors", "")
+            if filename.startswith(weights_no_suffix) and os.path.isfile(full_filename) and is_main_process:
+                os.remove(full_filename)
+
        # Save the model
        save_function(state_dict, os.path.join(save_directory, weights_name))

@@ -349,8 +335,6 @@ class ModelMixin(torch.nn.Module):
                The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
                git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
                identifier allowed by git.
-            from_flax (`bool`, *optional*, defaults to `False`):
-                Load the model weights from a Flax checkpoint save file.
            subfolder (`str`, *optional*, defaults to `""`):
                In case the relevant files are located inside a subfolder of the model repo (either remote in
                huggingface.co or downloaded locally), you can specify the folder name here.
@@ -391,7 +375,6 @@ class ModelMixin(torch.nn.Module):
        cache_dir = kwargs.pop("cache_dir", DIFFUSERS_CACHE)
        ignore_mismatched_sizes = kwargs.pop("ignore_mismatched_sizes", False)
        force_download = kwargs.pop("force_download", False)
-        from_flax = kwargs.pop("from_flax", False)
        resume_download = kwargs.pop("resume_download", False)
        proxies = kwargs.pop("proxies", None)
        output_loading_info = kwargs.pop("output_loading_info", False)
@@ -450,62 +433,11 @@ class ModelMixin(torch.nn.Module):
        # Load model

        model_file = None
-        if from_flax:
-            model_file = _get_model_file(
-                pretrained_model_name_or_path,
-                weights_name=FLAX_WEIGHTS_NAME,
-                cache_dir=cache_dir,
-                force_download=force_download,
-                resume_download=resume_download,
-                proxies=proxies,
-                local_files_only=local_files_only,
-                use_auth_token=use_auth_token,
-                revision=revision,
-                subfolder=subfolder,
-                user_agent=user_agent,
-            )
-            config, unused_kwargs = cls.load_config(
-                config_path,
-                cache_dir=cache_dir,
-                return_unused_kwargs=True,
-                force_download=force_download,
-                resume_download=resume_download,
-                proxies=proxies,
-                local_files_only=local_files_only,
-                use_auth_token=use_auth_token,
-                revision=revision,
-                subfolder=subfolder,
-                device_map=device_map,
-                **kwargs,
-            )
-            model = cls.from_config(config, **unused_kwargs)
-
-            # Convert the weights
-            from .modeling_pytorch_flax_utils import load_flax_checkpoint_in_pytorch_model
-
-            model = load_flax_checkpoint_in_pytorch_model(model, model_file)
-        else:
-            if is_safetensors_available():
-                try:
-                    model_file = _get_model_file(
-                        pretrained_model_name_or_path,
-                        weights_name=SAFETENSORS_WEIGHTS_NAME,
-                        cache_dir=cache_dir,
-                        force_download=force_download,
-                        resume_download=resume_download,
-                        proxies=proxies,
-                        local_files_only=local_files_only,
-                        use_auth_token=use_auth_token,
-                        revision=revision,
-                        subfolder=subfolder,
-                        user_agent=user_agent,
-                    )
-                except:
-                    pass
-            if model_file is None:
-                model_file = _get_model_file(
+        if is_safetensors_available():
+            try:
+                model_file = cls._get_model_file(
                    pretrained_model_name_or_path,
-                    weights_name=WEIGHTS_NAME,
+                    weights_name=SAFETENSORS_WEIGHTS_NAME,
                    cache_dir=cache_dir,
                    force_download=force_download,
                    resume_download=resume_download,
@@ -516,53 +448,26 @@ class ModelMixin(torch.nn.Module):
                    subfolder=subfolder,
                    user_agent=user_agent,
                )
+            except:
+                pass
+        if model_file is None:
+            model_file = cls._get_model_file(
+                pretrained_model_name_or_path,
+                weights_name=WEIGHTS_NAME,
+                cache_dir=cache_dir,
+                force_download=force_download,
+                resume_download=resume_download,
+                proxies=proxies,
+                local_files_only=local_files_only,
+                use_auth_token=use_auth_token,
+                revision=revision,
+                subfolder=subfolder,
+                user_agent=user_agent,
+            )

-            if low_cpu_mem_usage:
-                # Instantiate model with empty weights
-                with accelerate.init_empty_weights():
-                    config, unused_kwargs = cls.load_config(
-                        config_path,
-                        cache_dir=cache_dir,
-                        return_unused_kwargs=True,
-                        force_download=force_download,
-                        resume_download=resume_download,
-                        proxies=proxies,
-                        local_files_only=local_files_only,
-                        use_auth_token=use_auth_token,
-                        revision=revision,
-                        subfolder=subfolder,
-                        device_map=device_map,
-                        **kwargs,
-                    )
-                    model = cls.from_config(config, **unused_kwargs)
-
-                # if device_map is None, load the state dict and move the params from meta device to the cpu
-                if device_map is None:
-                    param_device = "cpu"
-                    state_dict = load_state_dict(model_file)
-                    # move the params from meta device to cpu
-                    for param_name, param in state_dict.items():
-                        accepts_dtype = "dtype" in set(
-                            inspect.signature(set_module_tensor_to_device).parameters.keys()
-                        )
-                        if accepts_dtype:
-                            set_module_tensor_to_device(
-                                model, param_name, param_device, value=param, dtype=torch_dtype
-                            )
-                        else:
-                            set_module_tensor_to_device(model, param_name, param_device, value=param)
-                else:  # else let accelerate handle loading and dispatching.
-                    # Load weights and dispatch according to the device_map
-                    # by deafult the device_map is None and the weights are loaded on the CPU
-                    accelerate.load_checkpoint_and_dispatch(model, model_file, device_map, dtype=torch_dtype)
-
-                loading_info = {
-                    "missing_keys": [],
-                    "unexpected_keys": [],
-                    "mismatched_keys": [],
-                    "error_msgs": [],
-                }
-            else:
+        if low_cpu_mem_usage:
+            # Instantiate model with empty weights
+            with accelerate.init_empty_weights():
                config, unused_kwargs = cls.load_config(
                    config_path,
                    cache_dir=cache_dir,
@@ -579,22 +484,61 @@ class ModelMixin(torch.nn.Module):
                )
                model = cls.from_config(config, **unused_kwargs)

+            # if device_map is Non,e load the state dict on move the params from meta device to the cpu
+            if device_map is None:
+                param_device = "cpu"
                state_dict = load_state_dict(model_file)
+                # move the parms from meta device to cpu
+                for param_name, param in state_dict.items():
+                    accepts_dtype = "dtype" in set(inspect.signature(set_module_tensor_to_device).parameters.keys())
+                    if accepts_dtype:
+                        set_module_tensor_to_device(model, param_name, param_device, value=param, dtype=torch_dtype)
+                    else:
+                        set_module_tensor_to_device(model, param_name, param_device, value=param)
+            else:  # else let accelerate handle loading and dispatching.
+                # Load weights and dispatch according to the device_map
+                # by deafult the device_map is None and the weights are loaded on the CPU
+                accelerate.load_checkpoint_and_dispatch(model, model_file, device_map, dtype=torch_dtype)

-                model, missing_keys, unexpected_keys, mismatched_keys, error_msgs = cls._load_pretrained_model(
-                    model,
-                    state_dict,
-                    model_file,
-                    pretrained_model_name_or_path,
-                    ignore_mismatched_sizes=ignore_mismatched_sizes,
-                )
+            loading_info = {
+                "missing_keys": [],
+                "unexpected_keys": [],
+                "mismatched_keys": [],
+                "error_msgs": [],
+            }
+        else:
+            config, unused_kwargs = cls.load_config(
+                config_path,
+                cache_dir=cache_dir,
+                return_unused_kwargs=True,
+                force_download=force_download,
+                resume_download=resume_download,
+                proxies=proxies,
+                local_files_only=local_files_only,
+                use_auth_token=use_auth_token,
+                revision=revision,
+                subfolder=subfolder,
+                device_map=device_map,
+                **kwargs,
+            )
+            model = cls.from_config(config, **unused_kwargs)

-                loading_info = {
-                    "missing_keys": missing_keys,
-                    "unexpected_keys": unexpected_keys,
-                    "mismatched_keys": mismatched_keys,
-                    "error_msgs": error_msgs,
-                }
+            state_dict = load_state_dict(model_file)
+
+            model, missing_keys, unexpected_keys, mismatched_keys, error_msgs = cls._load_pretrained_model(
+                model,
+                state_dict,
+                model_file,
+                pretrained_model_name_or_path,
+                ignore_mismatched_sizes=ignore_mismatched_sizes,
+            )
+
+            loading_info = {
+                "missing_keys": missing_keys,
+                "unexpected_keys": unexpected_keys,
+                "mismatched_keys": mismatched_keys,
+                "error_msgs": error_msgs,
+            }

        if torch_dtype is not None and not isinstance(torch_dtype, torch.dtype):
            raise ValueError(
@@ -612,6 +556,92 @@ class ModelMixin(torch.nn.Module):

        return model

+    @classmethod
+    def _get_model_file(
+        cls,
+        pretrained_model_name_or_path,
+        *,
+        weights_name,
+        subfolder,
+        cache_dir,
+        force_download,
+        proxies,
+        resume_download,
+        local_files_only,
+        use_auth_token,
+        user_agent,
+        revision,
+    ):
+        pretrained_model_name_or_path = str(pretrained_model_name_or_path)
+        if os.path.isdir(pretrained_model_name_or_path):
+            if os.path.isfile(os.path.join(pretrained_model_name_or_path, weights_name)):
+                # Load from a PyTorch checkpoint
+                model_file = os.path.join(pretrained_model_name_or_path, weights_name)
+            elif subfolder is not None and os.path.isfile(
+                os.path.join(pretrained_model_name_or_path, subfolder, weights_name)
+            ):
+                model_file = os.path.join(pretrained_model_name_or_path, subfolder, weights_name)
+            else:
+                raise EnvironmentError(
+                    f"Error no file named {weights_name} found in directory {pretrained_model_name_or_path}."
+                )
+            return model_file
+        else:
+            try:
+                # Load from URL or cache if already cached
+                model_file = hf_hub_download(
+                    pretrained_model_name_or_path,
+                    filename=weights_name,
+                    cache_dir=cache_dir,
+                    force_download=force_download,
+                    proxies=proxies,
+                    resume_download=resume_download,
+                    local_files_only=local_files_only,
+                    use_auth_token=use_auth_token,
+                    user_agent=user_agent,
+                    subfolder=subfolder,
+                    revision=revision,
+                )
+                return model_file
+
+            except RepositoryNotFoundError:
+                raise EnvironmentError(
+                    f"{pretrained_model_name_or_path} is not a local folder and is not a valid model identifier "
+                    "listed on 'https://huggingface.co/models'\nIf this is a private repository, make sure to pass a "
+                    "token having permission to this repo with `use_auth_token` or log in with `huggingface-cli "
+                    "login`."
+                )
+            except RevisionNotFoundError:
+                raise EnvironmentError(
+                    f"{revision} is not a valid git identifier (branch name, tag name or commit id) that exists for "
+                    "this model name. Check the model page at "
+                    f"'https://huggingface.co/{pretrained_model_name_or_path}' for available revisions."
+                )
+            except EntryNotFoundError:
+                raise EnvironmentError(
+                    f"{pretrained_model_name_or_path} does not appear to have a file named {weights_name}."
+                )
+            except HTTPError as err:
+                raise EnvironmentError(
+                    "There was a specific connection error when trying to load"
+                    f" {pretrained_model_name_or_path}:\n{err}"
+                )
+            except ValueError:
+                raise EnvironmentError(
+                    f"We couldn't connect to '{HUGGINGFACE_CO_RESOLVE_ENDPOINT}' to load this model, couldn't find it"
+                    f" in the cached files and it looks like {pretrained_model_name_or_path} is not the path to a"
+                    f" directory containing a file named {weights_name} or"
+                    " \nCheckout your internet connection or see how to run the library in"
+                    " offline mode at 'https://huggingface.co/docs/diffusers/installation#offline-mode'."
+                )
+            except EnvironmentError:
+                raise EnvironmentError(
+                    f"Can't load the model for '{pretrained_model_name_or_path}'. If you were trying to load it from "
+                    "'https://huggingface.co/models', make sure you don't have a local directory with the same name. "
+                    f"Otherwise, make sure '{pretrained_model_name_or_path}' is the correct path to a directory "
+                    f"containing a file named {weights_name}"
+                )
+
    @classmethod
    def _load_pretrained_model(
        cls,
@@ -775,9 +805,7 @@ def _get_model_file(
    revision,
 ):
    pretrained_model_name_or_path = str(pretrained_model_name_or_path)
-    if os.path.isfile(pretrained_model_name_or_path):
-        return pretrained_model_name_or_path
-    elif os.path.isdir(pretrained_model_name_or_path):
+    if os.path.isdir(pretrained_model_name_or_path):
        if os.path.isfile(os.path.join(pretrained_model_name_or_path, weights_name)):
            # Load from a PyTorch checkpoint
            model_file = os.path.join(pretrained_model_name_or_path, weights_name)
--- a/src/diffusers/models/transformer_2d.py
+++ b/src/diffusers/models/transformer_2d.py
@@ -20,9 +20,8 @@ from torch import nn

 from ..configuration_utils import ConfigMixin, register_to_config
 from ..models.embeddings import ImagePositionalEmbeddings
-from ..utils import BaseOutput, deprecate
+from ..utils import BaseOutput
 from .attention import BasicTransformerBlock
-from .embeddings import PatchEmbed
 from .modeling_utils import ModelMixin


@@ -82,7 +81,6 @@ class Transformer2DModel(ModelMixin, ConfigMixin):
        num_attention_heads: int = 16,
        attention_head_dim: int = 88,
        in_channels: Optional[int] = None,
-        out_channels: Optional[int] = None,
        num_layers: int = 1,
        dropout: float = 0.0,
        norm_num_groups: int = 32,
@@ -90,14 +88,11 @@ class Transformer2DModel(ModelMixin, ConfigMixin):
        attention_bias: bool = False,
        sample_size: Optional[int] = None,
        num_vector_embeds: Optional[int] = None,
-        patch_size: Optional[int] = None,
        activation_fn: str = "geglu",
        num_embeds_ada_norm: Optional[int] = None,
        use_linear_projection: bool = False,
        only_cross_attention: bool = False,
        upcast_attention: bool = False,
-        norm_type: str = "layer_norm",
-        norm_elementwise_affine: bool = True,
    ):
        super().__init__()
        self.use_linear_projection = use_linear_projection
@@ -107,35 +102,18 @@ class Transformer2DModel(ModelMixin, ConfigMixin):

        # 1. Transformer2DModel can process both standard continous images of shape `(batch_size, num_channels, width, height)` as well as quantized image embeddings of shape `(batch_size, num_image_vectors)`
        # Define whether input is continuous or discrete depending on configuration
-        self.is_input_continuous = (in_channels is not None) and (patch_size is None)
+        self.is_input_continuous = in_channels is not None
        self.is_input_vectorized = num_vector_embeds is not None
-        self.is_input_patches = in_channels is not None and patch_size is not None
-
-        if norm_type == "layer_norm" and num_embeds_ada_norm is not None:
-            deprecation_message = (
-                f"The configuration file of this model: {self.__class__} is outdated. `norm_type` is either not set or"
-                " incorrectly set to `'layer_norm'`.Make sure to set `norm_type` to `'ada_norm'` in the config."
-                " Please make sure to update the config accordingly as leaving `norm_type` might led to incorrect"
-                " results in future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it"
-                " would be very nice if you could open a Pull request for the `transformer/config.json` file"
-            )
-            deprecate("norm_type!=num_embeds_ada_norm", "1.0.0", deprecation_message, standard_warn=False)
-            norm_type = "ada_norm"

        if self.is_input_continuous and self.is_input_vectorized:
            raise ValueError(
                f"Cannot define both `in_channels`: {in_channels} and `num_vector_embeds`: {num_vector_embeds}. Make"
                " sure that either `in_channels` or `num_vector_embeds` is None."
            )
-        elif self.is_input_vectorized and self.is_input_patches:
+        elif not self.is_input_continuous and not self.is_input_vectorized:
            raise ValueError(
-                f"Cannot define both `num_vector_embeds`: {num_vector_embeds} and `patch_size`: {patch_size}. Make"
-                " sure that either `num_vector_embeds` or `num_patches` is None."
-            )
-        elif not self.is_input_continuous and not self.is_input_vectorized and not self.is_input_patches:
-            raise ValueError(
-                f"Has to define `in_channels`: {in_channels}, `num_vector_embeds`: {num_vector_embeds}, or patch_size:"
-                f" {patch_size}. Make sure that `in_channels`, `num_vector_embeds` or `num_patches` is not None."
+                f"Has to define either `in_channels`: {in_channels} or `num_vector_embeds`: {num_vector_embeds}. Make"
+                " sure that either `in_channels` or `num_vector_embeds` is not None."
            )

        # 2. Define input layers
@@ -159,20 +137,6 @@ class Transformer2DModel(ModelMixin, ConfigMixin):
            self.latent_image_embedding = ImagePositionalEmbeddings(
                num_embed=num_vector_embeds, embed_dim=inner_dim, height=self.height, width=self.width
            )
-        elif self.is_input_patches:
-            assert sample_size is not None, "Transformer2DModel over patched input must provide sample_size"
-
-            self.height = sample_size
-            self.width = sample_size
-
-            self.patch_size = patch_size
-            self.pos_embed = PatchEmbed(
-                height=sample_size,
-                width=sample_size,
-                patch_size=patch_size,
-                in_channels=in_channels,
-                embed_dim=inner_dim,
-            )

        # 3. Define transformers blocks
        self.transformer_blocks = nn.ModuleList(
@@ -188,17 +152,13 @@ class Transformer2DModel(ModelMixin, ConfigMixin):
                    attention_bias=attention_bias,
                    only_cross_attention=only_cross_attention,
                    upcast_attention=upcast_attention,
-                    norm_type=norm_type,
-                    norm_elementwise_affine=norm_elementwise_affine,
                )
                for d in range(num_layers)
            ]
        )

        # 4. Define output layers
-        self.out_channels = in_channels if out_channels is None else out_channels
        if self.is_input_continuous:
-            # TODO: should use out_channels for continous projections
            if use_linear_projection:
                self.proj_out = nn.Linear(in_channels, inner_dim)
            else:
@@ -206,17 +166,12 @@ class Transformer2DModel(ModelMixin, ConfigMixin):
        elif self.is_input_vectorized:
            self.norm_out = nn.LayerNorm(inner_dim)
            self.out = nn.Linear(inner_dim, self.num_vector_embeds - 1)
-        elif self.is_input_patches:
-            self.norm_out = nn.LayerNorm(inner_dim, elementwise_affine=False, eps=1e-6)
-            self.proj_out_1 = nn.Linear(inner_dim, 2 * inner_dim)
-            self.proj_out_2 = nn.Linear(inner_dim, patch_size * patch_size * self.out_channels)

    def forward(
        self,
        hidden_states,
        encoder_hidden_states=None,
        timestep=None,
-        class_labels=None,
        cross_attention_kwargs=None,
        return_dict: bool = True,
    ):
@@ -230,9 +185,6 @@ class Transformer2DModel(ModelMixin, ConfigMixin):
                self-attention.
            timestep ( `torch.long`, *optional*):
                Optional timestep to be applied as an embedding in AdaLayerNorm's. Used to indicate denoising step.
-            class_labels ( `torch.LongTensor` of shape `(batch size, num classes)`, *optional*):
-                Optional class labels to be applied as an embedding in AdaLayerZeroNorm. Used to indicate class labels
-                conditioning.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`models.unet_2d_condition.UNet2DConditionOutput`] instead of a plain tuple.

@@ -243,7 +195,7 @@ class Transformer2DModel(ModelMixin, ConfigMixin):
        """
        # 1. Input
        if self.is_input_continuous:
-            batch, _, height, width = hidden_states.shape
+            batch, channel, height, width = hidden_states.shape
            residual = hidden_states

            hidden_states = self.norm(hidden_states)
@@ -257,8 +209,6 @@ class Transformer2DModel(ModelMixin, ConfigMixin):
                hidden_states = self.proj_in(hidden_states)
        elif self.is_input_vectorized:
            hidden_states = self.latent_image_embedding(hidden_states)
-        elif self.is_input_patches:
-            hidden_states = self.pos_embed(hidden_states)

        # 2. Blocks
        for block in self.transformer_blocks:
@@ -267,7 +217,6 @@ class Transformer2DModel(ModelMixin, ConfigMixin):
                encoder_hidden_states=encoder_hidden_states,
                timestep=timestep,
                cross_attention_kwargs=cross_attention_kwargs,
-                class_labels=class_labels,
            )

        # 3. Output
@@ -288,24 +237,6 @@ class Transformer2DModel(ModelMixin, ConfigMixin):

            # log(p(x_0))
            output = F.log_softmax(logits.double(), dim=1).float()
-        elif self.is_input_patches:
-            # TODO: cleanup!
-            conditioning = self.transformer_blocks[0].norm1.emb(
-                timestep, class_labels, hidden_dtype=hidden_states.dtype
-            )
-            shift, scale = self.proj_out_1(F.silu(conditioning)).chunk(2, dim=1)
-            hidden_states = self.norm_out(hidden_states) * (1 + scale[:, None]) + shift[:, None]
-            hidden_states = self.proj_out_2(hidden_states)
-
-            # unpatchify
-            height = width = int(hidden_states.shape[1] ** 0.5)
-            hidden_states = hidden_states.reshape(
-                shape=(-1, height, width, self.patch_size, self.patch_size, self.out_channels)
-            )
-            hidden_states = torch.einsum("nhwpqc->nchpwq", hidden_states)
-            output = hidden_states.reshape(
-                shape=(-1, self.out_channels, height * self.patch_size, width * self.patch_size)
-            )

        if not return_dict:
            return (output,)
--- a/src/diffusers/models/unet_2d_condition.py
+++ b/src/diffusers/models/unet_2d_condition.py
@@ -19,7 +19,6 @@ import torch.nn as nn
 import torch.utils.checkpoint

 from ..configuration_utils import ConfigMixin, register_to_config
-from ..loaders import UNet2DConditionLoadersMixin
 from ..utils import BaseOutput, logging
 from .cross_attention import AttnProcessor
 from .embeddings import TimestepEmbedding, Timesteps
@@ -50,7 +49,7 @@ class UNet2DConditionOutput(BaseOutput):
    sample: torch.FloatTensor


-class UNet2DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin):
+class UNet2DConditionModel(ModelMixin, ConfigMixin):
    r"""
    UNet2DConditionModel is a conditional 2D UNet model that takes in a noisy sample, conditional state, and a timestep
    and returns sample shaped output.
@@ -267,59 +266,17 @@ class UNet2DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin)
        self.conv_act = nn.SiLU()
        self.conv_out = nn.Conv2d(block_out_channels[0], out_channels, kernel_size=3, padding=1)

-    @property
-    def attn_processors(self) -> Dict[str, AttnProcessor]:
-        r"""
-        Returns:
-            `dict` of attention processors: A dictionary containing all attention processors used in the model with
-            indexed by its weight name.
-        """
+    def set_attn_processor(self, processor: AttnProcessor):
        # set recursively
-        processors = {}
-
-        def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttnProcessor]):
+        def fn_recursive_attn_processor(module: torch.nn.Module):
            if hasattr(module, "set_processor"):
-                processors[f"{name}.processor"] = module.processor
+                module.set_processor(processor)

-            for sub_name, child in module.named_children():
-                fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
+            for child in module.children():
+                fn_recursive_attn_processor(child)

-            return processors
-
-        for name, module in self.named_children():
-            fn_recursive_add_processors(name, module, processors)
-
-        return processors
-
-    def set_attn_processor(self, processor: Union[AttnProcessor, Dict[str, AttnProcessor]]):
-        r"""
-        Parameters:
-            `processor (`dict` of `AttnProcessor` or `AttnProcessor`):
-                The instantiated processor class or a dictionary of processor classes that will be set as the processor
-                of **all** `CrossAttention` layers.
-            In case `processor` is a dict, the key needs to define the path to the corresponding cross attention processor. This is strongly recommended when setting trainablae attention processors.:
-
-        """
-        count = len(self.attn_processors.keys())
-
-        if isinstance(processor, dict) and len(processor) != count:
-            raise ValueError(
-                f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
-                f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
-            )
-
-        def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
-            if hasattr(module, "set_processor"):
-                if not isinstance(processor, dict):
-                    module.set_processor(processor)
-                else:
-                    module.set_processor(processor.pop(f"{name}.processor"))
-
-            for sub_name, child in module.named_children():
-                fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
-
-        for name, module in self.named_children():
-            fn_recursive_attn_processor(name, module, processor)
+        for module in self.children():
+            fn_recursive_attn_processor(module)

    def set_attention_slice(self, slice_size):
        r"""
--- a/src/diffusers/pipelines/init.py
+++ b/src/diffusers/pipelines/init.py
@@ -18,7 +18,6 @@ else:
    from .dance_diffusion import DanceDiffusionPipeline
    from .ddim import DDIMPipeline
    from .ddpm import DDPMPipeline
-    from .dit import DiTPipeline
    from .latent_diffusion import LDMSuperResolutionPipeline
    from .latent_diffusion_uncond import LDMPipeline
    from .pipeline_utils import AudioPipelineOutput, DiffusionPipeline, ImagePipelineOutput
@@ -51,7 +50,6 @@ else:
        StableDiffusionImg2ImgPipeline,
        StableDiffusionInpaintPipeline,
        StableDiffusionInpaintPipelineLegacy,
-        StableDiffusionInstructPix2PixPipeline,
        StableDiffusionPipeline,
        StableDiffusionUpscalePipeline,
    )
@@ -110,8 +108,4 @@ try:
 except OptionalDependencyNotAvailable:
    from ..utils.dummy_flax_and_transformers_objects import *  # noqa F403
 else:
-    from .stable_diffusion import (
-        FlaxStableDiffusionImg2ImgPipeline,
-        FlaxStableDiffusionInpaintPipeline,
-        FlaxStableDiffusionPipeline,
-    )
+    from .stable_diffusion import FlaxStableDiffusionImg2ImgPipeline, FlaxStableDiffusionPipeline
--- a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py
+++ b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py
@@ -23,7 +23,14 @@ from transformers import CLIPFeatureExtractor, XLMRobertaTokenizer

 from ...configuration_utils import FrozenDict
 from ...models import AutoencoderKL, UNet2DConditionModel
-from ...schedulers import KarrasDiffusionSchedulers
+from ...schedulers import (
+    DDIMScheduler,
+    DPMSolverMultistepScheduler,
+    EulerAncestralDiscreteScheduler,
+    EulerDiscreteScheduler,
+    LMSDiscreteScheduler,
+    PNDMScheduler,
+)
 from ...utils import deprecate, logging, randn_tensor, replace_example_docstring
 from ..pipeline_utils import DiffusionPipeline
 from ..stable_diffusion.safety_checker import StableDiffusionSafetyChecker
@@ -84,7 +91,14 @@ class AltDiffusionPipeline(DiffusionPipeline):
        text_encoder: RobertaSeriesModelWithTransformation,
        tokenizer: XLMRobertaTokenizer,
        unet: UNet2DConditionModel,
-        scheduler: KarrasDiffusionSchedulers,
+        scheduler: Union[
+            DDIMScheduler,
+            PNDMScheduler,
+            LMSDiscreteScheduler,
+            EulerDiscreteScheduler,
+            EulerAncestralDiscreteScheduler,
+            DPMSolverMultistepScheduler,
+        ],
        safety_checker: StableDiffusionSafetyChecker,
        feature_extractor: CLIPFeatureExtractor,
        requires_safety_checker: bool = True,
@@ -197,10 +211,13 @@ class AltDiffusionPipeline(DiffusionPipeline):
        device = torch.device(f"cuda:{gpu_id}")

        for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae]:
-            cpu_offload(cpu_offloaded_model, device)
+            if cpu_offloaded_model is not None:
+                cpu_offload(cpu_offloaded_model, device)

        if self.safety_checker is not None:
-            cpu_offload(self.safety_checker, execution_device=device, offload_buffers=True)
+            # TODO(Patrick) - there is currently a bug with cpu offload of nn.Parameter in accelerate
+            # fix by only offloading self.safety_checker for now
+            cpu_offload(self.safety_checker.vision_model, device)

    @property
    def _execution_device(self):
@@ -220,21 +237,12 @@ class AltDiffusionPipeline(DiffusionPipeline):
                return torch.device(module._hf_hook.execution_device)
        return self.device

-    def _encode_prompt(
-        self,
-        prompt,
-        device,
-        num_images_per_prompt,
-        do_classifier_free_guidance,
-        negative_prompt=None,
-        prompt_embeds: Optional[torch.FloatTensor] = None,
-        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
-    ):
+    def _encode_prompt(self, prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt):
        r"""
        Encodes the prompt into text encoder hidden states.

        Args:
-             prompt (`str` or `List[str]`, *optional*):
+            prompt (`str` or `list(int)`):
                prompt to be encoded
            device: (`torch.device`):
                torch device
@@ -242,67 +250,47 @@ class AltDiffusionPipeline(DiffusionPipeline):
                number of images that should be generated per prompt
            do_classifier_free_guidance (`bool`):
                whether to use classifier free guidance or not
-            negative_ prompt (`str` or `List[str]`, *optional*):
-                The prompt or prompts not to guide the image generation. If not defined, one has to pass
-                `negative_prompt_embeds`. instead. If not defined, one has to pass `negative_prompt_embeds`. instead.
-                Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`).
-            prompt_embeds (`torch.FloatTensor`, *optional*):
-                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
-                provided, text embeddings will be generated from `prompt` input argument.
-            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
-                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
-                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
-                argument.
+            negative_prompt (`str` or `List[str]`):
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `guidance_scale` is less than `1`).
        """
-        if prompt is not None and isinstance(prompt, str):
-            batch_size = 1
-        elif prompt is not None and isinstance(prompt, list):
-            batch_size = len(prompt)
+        batch_size = len(prompt) if isinstance(prompt, list) else 1
+
+        text_inputs = self.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=self.tokenizer.model_max_length,
+            truncation=True,
+            return_tensors="pt",
+        )
+        text_input_ids = text_inputs.input_ids
+        untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+        if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
+            removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1])
+            logger.warning(
+                "The following part of your input was truncated because CLIP can only handle sequences up to"
+                f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+            )
+
+        if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+            attention_mask = text_inputs.attention_mask.to(device)
        else:
-            batch_size = prompt_embeds.shape[0]
+            attention_mask = None

-        if prompt_embeds is None:
-            text_inputs = self.tokenizer(
-                prompt,
-                padding="max_length",
-                max_length=self.tokenizer.model_max_length,
-                truncation=True,
-                return_tensors="pt",
-            )
-            text_input_ids = text_inputs.input_ids
-            untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+        text_embeddings = self.text_encoder(
+            text_input_ids.to(device),
+            attention_mask=attention_mask,
+        )
+        text_embeddings = text_embeddings[0]

-            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
-                text_input_ids, untruncated_ids
-            ):
-                removed_text = self.tokenizer.batch_decode(
-                    untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
-                )
-                logger.warning(
-                    "The following part of your input was truncated because CLIP can only handle sequences up to"
-                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
-                )
-
-            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
-                attention_mask = text_inputs.attention_mask.to(device)
-            else:
-                attention_mask = None
-
-            prompt_embeds = self.text_encoder(
-                text_input_ids.to(device),
-                attention_mask=attention_mask,
-            )
-            prompt_embeds = prompt_embeds[0]
-
-        prompt_embeds = prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
-
-        bs_embed, seq_len, _ = prompt_embeds.shape
        # duplicate text embeddings for each generation per prompt, using mps friendly method
-        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
-        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+        bs_embed, seq_len, _ = text_embeddings.shape
+        text_embeddings = text_embeddings.repeat(1, num_images_per_prompt, 1)
+        text_embeddings = text_embeddings.view(bs_embed * num_images_per_prompt, seq_len, -1)

        # get unconditional embeddings for classifier free guidance
-        if do_classifier_free_guidance and negative_prompt_embeds is None:
+        if do_classifier_free_guidance:
            uncond_tokens: List[str]
            if negative_prompt is None:
                uncond_tokens = [""] * batch_size
@@ -322,7 +310,7 @@ class AltDiffusionPipeline(DiffusionPipeline):
            else:
                uncond_tokens = negative_prompt

-            max_length = prompt_embeds.shape[1]
+            max_length = text_input_ids.shape[-1]
            uncond_input = self.tokenizer(
                uncond_tokens,
                padding="max_length",
@@ -336,27 +324,23 @@ class AltDiffusionPipeline(DiffusionPipeline):
            else:
                attention_mask = None

-            negative_prompt_embeds = self.text_encoder(
+            uncond_embeddings = self.text_encoder(
                uncond_input.input_ids.to(device),
                attention_mask=attention_mask,
            )
-            negative_prompt_embeds = negative_prompt_embeds[0]
+            uncond_embeddings = uncond_embeddings[0]

-        if do_classifier_free_guidance:
            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
-            seq_len = negative_prompt_embeds.shape[1]
-
-            negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
-
-            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
-            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+            seq_len = uncond_embeddings.shape[1]
+            uncond_embeddings = uncond_embeddings.repeat(1, num_images_per_prompt, 1)
+            uncond_embeddings = uncond_embeddings.view(batch_size * num_images_per_prompt, seq_len, -1)

            # For classifier free guidance, we need to do two forward passes.
            # Here we concatenate the unconditional and text embeddings into a single batch
            # to avoid doing two forward passes
-            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+            text_embeddings = torch.cat([uncond_embeddings, text_embeddings])

-        return prompt_embeds
+        return text_embeddings

    def run_safety_checker(self, image, device, dtype):
        if self.safety_checker is not None:
@@ -393,16 +377,10 @@ class AltDiffusionPipeline(DiffusionPipeline):
            extra_step_kwargs["generator"] = generator
        return extra_step_kwargs

-    def check_inputs(
-        self,
-        prompt,
-        height,
-        width,
-        callback_steps,
-        negative_prompt=None,
-        prompt_embeds=None,
-        negative_prompt_embeds=None,
-    ):
+    def check_inputs(self, prompt, height, width, callback_steps):
+        if not isinstance(prompt, str) and not isinstance(prompt, list):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
        if height % 8 != 0 or width % 8 != 0:
            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")

@@ -414,32 +392,6 @@ class AltDiffusionPipeline(DiffusionPipeline):
                f" {type(callback_steps)}."
            )

-        if prompt is not None and prompt_embeds is not None:
-            raise ValueError(
-                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
-                " only forward one of the two."
-            )
-        elif prompt is None and prompt_embeds is None:
-            raise ValueError(
-                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
-            )
-        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
-            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
-
-        if negative_prompt is not None and negative_prompt_embeds is not None:
-            raise ValueError(
-                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
-                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
-            )
-
-        if prompt_embeds is not None and negative_prompt_embeds is not None:
-            if prompt_embeds.shape != negative_prompt_embeds.shape:
-                raise ValueError(
-                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
-                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
-                    f" {negative_prompt_embeds.shape}."
-                )
-
    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
        shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
        if isinstance(generator, list) and len(generator) != batch_size:
@@ -461,7 +413,7 @@ class AltDiffusionPipeline(DiffusionPipeline):
    @replace_example_docstring(EXAMPLE_DOC_STRING)
    def __call__(
        self,
-        prompt: Union[str, List[str]] = None,
+        prompt: Union[str, List[str]],
        height: Optional[int] = None,
        width: Optional[int] = None,
        num_inference_steps: int = 50,
@@ -471,8 +423,6 @@ class AltDiffusionPipeline(DiffusionPipeline):
        eta: float = 0.0,
        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
        latents: Optional[torch.FloatTensor] = None,
-        prompt_embeds: Optional[torch.FloatTensor] = None,
-        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
        output_type: Optional[str] = "pil",
        return_dict: bool = True,
        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
@@ -482,9 +432,8 @@ class AltDiffusionPipeline(DiffusionPipeline):
        Function invoked when calling the pipeline for generation.

        Args:
-            prompt (`str` or `List[str]`, *optional*):
-                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
-                instead.
+            prompt (`str` or `List[str]`):
+                The prompt or prompts to guide the image generation.
            height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
                The height in pixels of the generated image.
            width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
@@ -499,28 +448,20 @@ class AltDiffusionPipeline(DiffusionPipeline):
                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
                usually at the expense of lower image quality.
            negative_prompt (`str` or `List[str]`, *optional*):
-                The prompt or prompts not to guide the image generation. If not defined, one has to pass
-                `negative_prompt_embeds`. instead. If not defined, one has to pass `negative_prompt_embeds`. instead.
-                Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`).
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `guidance_scale` is less than `1`).
            num_images_per_prompt (`int`, *optional*, defaults to 1):
                The number of images to generate per prompt.
            eta (`float`, *optional*, defaults to 0.0):
                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
                [`schedulers.DDIMScheduler`], will be ignored for others.
-            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+            generator (`torch.Generator`, *optional*):
                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
                to make generation deterministic.
            latents (`torch.FloatTensor`, *optional*):
                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                tensor will ge generated by sampling using the supplied random `generator`.
-            prompt_embeds (`torch.FloatTensor`, *optional*):
-                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
-                provided, text embeddings will be generated from `prompt` input argument.
-            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
-                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
-                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
-                argument.
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generate image. Choose between
                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
@@ -548,18 +489,10 @@ class AltDiffusionPipeline(DiffusionPipeline):
        width = width or self.unet.config.sample_size * self.vae_scale_factor

        # 1. Check inputs. Raise error if not correct
-        self.check_inputs(
-            prompt, height, width, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds
-        )
+        self.check_inputs(prompt, height, width, callback_steps)

        # 2. Define call parameters
-        if prompt is not None and isinstance(prompt, str):
-            batch_size = 1
-        elif prompt is not None and isinstance(prompt, list):
-            batch_size = len(prompt)
-        else:
-            batch_size = prompt_embeds.shape[0]
-
+        batch_size = 1 if isinstance(prompt, str) else len(prompt)
        device = self._execution_device
        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
@@ -567,14 +500,8 @@ class AltDiffusionPipeline(DiffusionPipeline):
        do_classifier_free_guidance = guidance_scale > 1.0

        # 3. Encode input prompt
-        prompt_embeds = self._encode_prompt(
-            prompt,
-            device,
-            num_images_per_prompt,
-            do_classifier_free_guidance,
-            negative_prompt,
-            prompt_embeds=prompt_embeds,
-            negative_prompt_embeds=negative_prompt_embeds,
+        text_embeddings = self._encode_prompt(
+            prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt
        )

        # 4. Prepare timesteps
@@ -588,7 +515,7 @@ class AltDiffusionPipeline(DiffusionPipeline):
            num_channels_latents,
            height,
            width,
-            prompt_embeds.dtype,
+            text_embeddings.dtype,
            device,
            generator,
            latents,
@@ -606,7 +533,7 @@ class AltDiffusionPipeline(DiffusionPipeline):
                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)

                # predict the noise residual
-                noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=prompt_embeds).sample
+                noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample

                # perform guidance
                if do_classifier_free_guidance:
@@ -626,7 +553,7 @@ class AltDiffusionPipeline(DiffusionPipeline):
        image = self.decode_latents(latents)

        # 9. Run safety checker
-        image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
+        image, has_nsfw_concept = self.run_safety_checker(image, device, text_embeddings.dtype)

        # 10. Convert to PIL
        if output_type == "pil":
--- a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py
+++ b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py
@@ -25,7 +25,14 @@ from transformers import CLIPFeatureExtractor, XLMRobertaTokenizer

 from ...configuration_utils import FrozenDict
 from ...models import AutoencoderKL, UNet2DConditionModel
-from ...schedulers import KarrasDiffusionSchedulers
+from ...schedulers import (
+    DDIMScheduler,
+    DPMSolverMultistepScheduler,
+    EulerAncestralDiscreteScheduler,
+    EulerDiscreteScheduler,
+    LMSDiscreteScheduler,
+    PNDMScheduler,
+)
 from ...utils import PIL_INTERPOLATION, deprecate, logging, randn_tensor, replace_example_docstring
 from ..pipeline_utils import DiffusionPipeline
 from ..stable_diffusion.safety_checker import StableDiffusionSafetyChecker
@@ -73,7 +80,7 @@ def preprocess(image):

    if isinstance(image[0], PIL.Image.Image):
        w, h = image[0].size
-        w, h = map(lambda x: x - x % 8, (w, h))  # resize to integer multiple of 8
+        w, h = map(lambda x: x - x % 32, (w, h))  # resize to integer multiple of 32

        image = [np.array(i.resize((w, h), resample=PIL_INTERPOLATION["lanczos"]))[None, :] for i in image]
        image = np.concatenate(image, axis=0)
@@ -122,7 +129,14 @@ class AltDiffusionImg2ImgPipeline(DiffusionPipeline):
        text_encoder: RobertaSeriesModelWithTransformation,
        tokenizer: XLMRobertaTokenizer,
        unet: UNet2DConditionModel,
-        scheduler: KarrasDiffusionSchedulers,
+        scheduler: Union[
+            DDIMScheduler,
+            PNDMScheduler,
+            LMSDiscreteScheduler,
+            EulerDiscreteScheduler,
+            EulerAncestralDiscreteScheduler,
+            DPMSolverMultistepScheduler,
+        ],
        safety_checker: StableDiffusionSafetyChecker,
        feature_extractor: CLIPFeatureExtractor,
        requires_safety_checker: bool = True,
@@ -219,10 +233,13 @@ class AltDiffusionImg2ImgPipeline(DiffusionPipeline):
        device = torch.device(f"cuda:{gpu_id}")

        for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae]:
-            cpu_offload(cpu_offloaded_model, device)
+            if cpu_offloaded_model is not None:
+                cpu_offload(cpu_offloaded_model, device)

        if self.safety_checker is not None:
-            cpu_offload(self.safety_checker, execution_device=device, offload_buffers=True)
+            # TODO(Patrick) - there is currently a bug with cpu offload of nn.Parameter in accelerate
+            # fix by only offloading self.safety_checker for now
+            cpu_offload(self.safety_checker.vision_model, device)

    @property
    def _execution_device(self):
@@ -242,21 +259,12 @@ class AltDiffusionImg2ImgPipeline(DiffusionPipeline):
                return torch.device(module._hf_hook.execution_device)
        return self.device

-    def _encode_prompt(
-        self,
-        prompt,
-        device,
-        num_images_per_prompt,
-        do_classifier_free_guidance,
-        negative_prompt=None,
-        prompt_embeds: Optional[torch.FloatTensor] = None,
-        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
-    ):
+    def _encode_prompt(self, prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt):
        r"""
        Encodes the prompt into text encoder hidden states.

        Args:
-             prompt (`str` or `List[str]`, *optional*):
+            prompt (`str` or `list(int)`):
                prompt to be encoded
            device: (`torch.device`):
                torch device
@@ -264,67 +272,47 @@ class AltDiffusionImg2ImgPipeline(DiffusionPipeline):
                number of images that should be generated per prompt
            do_classifier_free_guidance (`bool`):
                whether to use classifier free guidance or not
-            negative_ prompt (`str` or `List[str]`, *optional*):
-                The prompt or prompts not to guide the image generation. If not defined, one has to pass
-                `negative_prompt_embeds`. instead. If not defined, one has to pass `negative_prompt_embeds`. instead.
-                Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`).
-            prompt_embeds (`torch.FloatTensor`, *optional*):
-                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
-                provided, text embeddings will be generated from `prompt` input argument.
-            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
-                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
-                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
-                argument.
+            negative_prompt (`str` or `List[str]`):
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `guidance_scale` is less than `1`).
        """
-        if prompt is not None and isinstance(prompt, str):
-            batch_size = 1
-        elif prompt is not None and isinstance(prompt, list):
-            batch_size = len(prompt)
+        batch_size = len(prompt) if isinstance(prompt, list) else 1
+
+        text_inputs = self.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=self.tokenizer.model_max_length,
+            truncation=True,
+            return_tensors="pt",
+        )
+        text_input_ids = text_inputs.input_ids
+        untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+        if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
+            removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1])
+            logger.warning(
+                "The following part of your input was truncated because CLIP can only handle sequences up to"
+                f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+            )
+
+        if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+            attention_mask = text_inputs.attention_mask.to(device)
        else:
-            batch_size = prompt_embeds.shape[0]
+            attention_mask = None

-        if prompt_embeds is None:
-            text_inputs = self.tokenizer(
-                prompt,
-                padding="max_length",
-                max_length=self.tokenizer.model_max_length,
-                truncation=True,
-                return_tensors="pt",
-            )
-            text_input_ids = text_inputs.input_ids
-            untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+        text_embeddings = self.text_encoder(
+            text_input_ids.to(device),
+            attention_mask=attention_mask,
+        )
+        text_embeddings = text_embeddings[0]

-            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
-                text_input_ids, untruncated_ids
-            ):
-                removed_text = self.tokenizer.batch_decode(
-                    untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
-                )
-                logger.warning(
-                    "The following part of your input was truncated because CLIP can only handle sequences up to"
-                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
-                )
-
-            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
-                attention_mask = text_inputs.attention_mask.to(device)
-            else:
-                attention_mask = None
-
-            prompt_embeds = self.text_encoder(
-                text_input_ids.to(device),
-                attention_mask=attention_mask,
-            )
-            prompt_embeds = prompt_embeds[0]
-
-        prompt_embeds = prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
-
-        bs_embed, seq_len, _ = prompt_embeds.shape
        # duplicate text embeddings for each generation per prompt, using mps friendly method
-        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
-        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+        bs_embed, seq_len, _ = text_embeddings.shape
+        text_embeddings = text_embeddings.repeat(1, num_images_per_prompt, 1)
+        text_embeddings = text_embeddings.view(bs_embed * num_images_per_prompt, seq_len, -1)

        # get unconditional embeddings for classifier free guidance
-        if do_classifier_free_guidance and negative_prompt_embeds is None:
+        if do_classifier_free_guidance:
            uncond_tokens: List[str]
            if negative_prompt is None:
                uncond_tokens = [""] * batch_size
@@ -344,7 +332,7 @@ class AltDiffusionImg2ImgPipeline(DiffusionPipeline):
            else:
                uncond_tokens = negative_prompt

-            max_length = prompt_embeds.shape[1]
+            max_length = text_input_ids.shape[-1]
            uncond_input = self.tokenizer(
                uncond_tokens,
                padding="max_length",
@@ -358,27 +346,23 @@ class AltDiffusionImg2ImgPipeline(DiffusionPipeline):
            else:
                attention_mask = None

-            negative_prompt_embeds = self.text_encoder(
+            uncond_embeddings = self.text_encoder(
                uncond_input.input_ids.to(device),
                attention_mask=attention_mask,
            )
-            negative_prompt_embeds = negative_prompt_embeds[0]
+            uncond_embeddings = uncond_embeddings[0]

-        if do_classifier_free_guidance:
            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
-            seq_len = negative_prompt_embeds.shape[1]
-
-            negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
-
-            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
-            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+            seq_len = uncond_embeddings.shape[1]
+            uncond_embeddings = uncond_embeddings.repeat(1, num_images_per_prompt, 1)
+            uncond_embeddings = uncond_embeddings.view(batch_size * num_images_per_prompt, seq_len, -1)

            # For classifier free guidance, we need to do two forward passes.
            # Here we concatenate the unconditional and text embeddings into a single batch
            # to avoid doing two forward passes
-            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+            text_embeddings = torch.cat([uncond_embeddings, text_embeddings])

-        return prompt_embeds
+        return text_embeddings

    def run_safety_checker(self, image, device, dtype):
        if self.safety_checker is not None:
@@ -415,14 +399,12 @@ class AltDiffusionImg2ImgPipeline(DiffusionPipeline):
            extra_step_kwargs["generator"] = generator
        return extra_step_kwargs

-    def check_inputs(
-        self, prompt, strength, callback_steps, negative_prompt=None, prompt_embeds=None, negative_prompt_embeds=None
-    ):
+    def check_inputs(self, prompt, strength, callback_steps):
        if not isinstance(prompt, str) and not isinstance(prompt, list):
            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")

        if strength < 0 or strength > 1:
-            raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}")
+            raise ValueError(f"The value of strength should in [1.0, 1.0] but is {strength}")

        if (callback_steps is None) or (
            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
@@ -432,32 +414,6 @@ class AltDiffusionImg2ImgPipeline(DiffusionPipeline):
                f" {type(callback_steps)}."
            )

-        if prompt is not None and prompt_embeds is not None:
-            raise ValueError(
-                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
-                " only forward one of the two."
-            )
-        elif prompt is None and prompt_embeds is None:
-            raise ValueError(
-                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
-            )
-        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
-            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
-
-        if negative_prompt is not None and negative_prompt_embeds is not None:
-            raise ValueError(
-                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
-                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
-            )
-
-        if prompt_embeds is not None and negative_prompt_embeds is not None:
-            if prompt_embeds.shape != negative_prompt_embeds.shape:
-                raise ValueError(
-                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
-                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
-                    f" {negative_prompt_embeds.shape}."
-                )
-
    def get_timesteps(self, num_inference_steps, strength, device):
        # get the original timestep using init_timestep
        init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
@@ -523,7 +479,7 @@ class AltDiffusionImg2ImgPipeline(DiffusionPipeline):
    @replace_example_docstring(EXAMPLE_DOC_STRING)
    def __call__(
        self,
-        prompt: Union[str, List[str]] = None,
+        prompt: Union[str, List[str]],
        image: Union[torch.FloatTensor, PIL.Image.Image] = None,
        strength: float = 0.8,
        num_inference_steps: Optional[int] = 50,
@@ -532,8 +488,6 @@ class AltDiffusionImg2ImgPipeline(DiffusionPipeline):
        num_images_per_prompt: Optional[int] = 1,
        eta: Optional[float] = 0.0,
        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
-        prompt_embeds: Optional[torch.FloatTensor] = None,
-        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
        output_type: Optional[str] = "pil",
        return_dict: bool = True,
        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
@@ -544,9 +498,8 @@ class AltDiffusionImg2ImgPipeline(DiffusionPipeline):
        Function invoked when calling the pipeline for generation.

        Args:
-            prompt (`str` or `List[str]`, *optional*):
-                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
-                instead.
+            prompt (`str` or `List[str]`):
+                The prompt or prompts to guide the image generation.
            image (`torch.FloatTensor` or `PIL.Image.Image`):
                `Image`, or tensor representing an image batch, that will be used as the starting point for the
                process.
@@ -566,9 +519,8 @@ class AltDiffusionImg2ImgPipeline(DiffusionPipeline):
                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
                usually at the expense of lower image quality.
            negative_prompt (`str` or `List[str]`, *optional*):
-                The prompt or prompts not to guide the image generation. If not defined, one has to pass
-                `negative_prompt_embeds`. instead. Ignored when not using guidance (i.e., ignored if `guidance_scale`
-                is less than `1`).
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `guidance_scale` is less than `1`).
            num_images_per_prompt (`int`, *optional*, defaults to 1):
                The number of images to generate per prompt.
            eta (`float`, *optional*, defaults to 0.0):
@@ -577,13 +529,6 @@ class AltDiffusionImg2ImgPipeline(DiffusionPipeline):
            generator (`torch.Generator`, *optional*):
                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
                to make generation deterministic.
-            prompt_embeds (`torch.FloatTensor`, *optional*):
-                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
-                provided, text embeddings will be generated from `prompt` input argument.
-            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
-                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
-                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
-                argument.
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generate image. Choose between
                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
@@ -609,8 +554,8 @@ class AltDiffusionImg2ImgPipeline(DiffusionPipeline):
        init_image = deprecate("init_image", "0.13.0", message, take_from=kwargs)
        image = init_image or image

-        # 1. Check inputs. Raise error if not correct
-        self.check_inputs(prompt, strength, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds)
+        # 1. Check inputs
+        self.check_inputs(prompt, strength, callback_steps)

        # 2. Define call parameters
        batch_size = 1 if isinstance(prompt, str) else len(prompt)
@@ -621,14 +566,8 @@ class AltDiffusionImg2ImgPipeline(DiffusionPipeline):
        do_classifier_free_guidance = guidance_scale > 1.0

        # 3. Encode input prompt
-        prompt_embeds = self._encode_prompt(
-            prompt,
-            device,
-            num_images_per_prompt,
-            do_classifier_free_guidance,
-            negative_prompt,
-            prompt_embeds=prompt_embeds,
-            negative_prompt_embeds=negative_prompt_embeds,
+        text_embeddings = self._encode_prompt(
+            prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt
        )

        # 4. Preprocess image
@@ -641,7 +580,7 @@ class AltDiffusionImg2ImgPipeline(DiffusionPipeline):

        # 6. Prepare latent variables
        latents = self.prepare_latents(
-            image, latent_timestep, batch_size, num_images_per_prompt, prompt_embeds.dtype, device, generator
+            image, latent_timestep, batch_size, num_images_per_prompt, text_embeddings.dtype, device, generator
        )

        # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
@@ -656,7 +595,7 @@ class AltDiffusionImg2ImgPipeline(DiffusionPipeline):
                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)

                # predict the noise residual
-                noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=prompt_embeds).sample
+                noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample

                # perform guidance
                if do_classifier_free_guidance:
@@ -676,7 +615,7 @@ class AltDiffusionImg2ImgPipeline(DiffusionPipeline):
        image = self.decode_latents(latents)

        # 10. Run safety checker
-        image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
+        image, has_nsfw_concept = self.run_safety_checker(image, device, text_embeddings.dtype)

        # 11. Convert to PIL
        if output_type == "pil":
--- a/src/diffusers/pipelines/ddim/pipeline_ddim.py
+++ b/src/diffusers/pipelines/ddim/pipeline_ddim.py
@@ -17,7 +17,7 @@ from typing import List, Optional, Tuple, Union
 import torch

 from ...schedulers import DDIMScheduler
-from ...utils import randn_tensor
+from ...utils import deprecate, randn_tensor
 from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput


@@ -78,6 +78,24 @@ class DDIMPipeline(DiffusionPipeline):
            True, otherwise a `tuple. When returning a tuple, the first element is a list with the generated images.
        """

+        if (
+            generator is not None
+            and isinstance(generator, torch.Generator)
+            and generator.device.type != self.device.type
+            and self.device.type != "mps"
+        ):
+            message = (
+                f"The `generator` device is `{generator.device}` and does not match the pipeline "
+                f"device `{self.device}`, so the `generator` will be ignored. "
+                f'Please use `generator=torch.Generator(device="{self.device}")` instead.'
+            )
+            deprecate(
+                "generator.device == 'cpu'",
+                "0.13.0",
+                message,
+            )
+            generator = None
+
        # Sample gaussian noise to begin loop
        if isinstance(self.unet.sample_size, int):
            image_shape = (batch_size, self.unet.in_channels, self.unet.sample_size, self.unet.sample_size)
--- a/src/diffusers/pipelines/dit/init.py
+++ b/src/diffusers/pipelines/dit/init.py
@@ -1 +0,0 @@
-from .pipeline_dit import DiTPipeline
--- a/src/diffusers/pipelines/dit/pipeline_dit.py
+++ b/src/diffusers/pipelines/dit/pipeline_dit.py
@@ -1,199 +0,0 @@
-# Attribution-NonCommercial 4.0 International (CC BY-NC 4.0)
-# William Peebles and Saining Xie
-#
-# Copyright (c) 2021 OpenAI
-# MIT License
-#
-# Copyright 2022 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from typing import Dict, List, Optional, Tuple, Union
-
-import torch
-
-from ...models import AutoencoderKL, Transformer2DModel
-from ...schedulers import KarrasDiffusionSchedulers
-from ...utils import randn_tensor
-from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput
-
-
-class DiTPipeline(DiffusionPipeline):
-    r"""
-    This pipeline inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
-    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
-
-    Parameters:
-        transformer ([`Transformer2DModel`]):
-            Class conditioned Transformer in Diffusion model to denoise the encoded image latents.
-        vae ([`AutoencoderKL`]):
-            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
-        scheduler ([`DDIMScheduler`]):
-            A scheduler to be used in combination with `dit` to denoise the encoded image latents.
-    """
-
-    def __init__(
-        self,
-        transformer: Transformer2DModel,
-        vae: AutoencoderKL,
-        scheduler: KarrasDiffusionSchedulers,
-        id2label: Optional[Dict[int, str]] = None,
-    ):
-        super().__init__()
-        self.register_modules(transformer=transformer, vae=vae, scheduler=scheduler)
-
-        # create a imagenet -> id dictionary for easier use
-        self.labels = {}
-        if id2label is not None:
-            for key, value in id2label.items():
-                for label in value.split(","):
-                    self.labels[label.lstrip().rstrip()] = int(key)
-            self.labels = dict(sorted(self.labels.items()))
-
-    def get_label_ids(self, label: Union[str, List[str]]) -> List[int]:
-        r"""
-
-        Map label strings, *e.g.* from ImageNet, to corresponding class ids.
-
-        Parameters:
-            label (`str` or `dict` of `str`): label strings to be mapped to class ids.
-
-        Returns:
-            `list` of `int`: Class ids to be processed by pipeline.
-        """
-
-        if not isinstance(label, list):
-            label = list(label)
-
-        for l in label:
-            if l not in self.labels:
-                raise ValueError(
-                    f"{l} does not exist. Please make sure to select one of the following labels: \n {self.labels}."
-                )
-
-        return [self.labels[l] for l in label]
-
-    @torch.no_grad()
-    def __call__(
-        self,
-        class_labels: List[int],
-        guidance_scale: float = 4.0,
-        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
-        num_inference_steps: int = 50,
-        output_type: Optional[str] = "pil",
-        return_dict: bool = True,
-    ) -> Union[ImagePipelineOutput, Tuple]:
-        r"""
-        Function invoked when calling the pipeline for generation.
-
-        Args:
-            class_labels (List[int]):
-                List of imagenet class labels for the images to be generated.
-            guidance_scale (`float`, *optional*, defaults to 4.0):
-                Scale of the guidance signal.
-            generator (`torch.Generator`, *optional*):
-                A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
-                deterministic.
-            num_inference_steps (`int`, *optional*, defaults to 250):
-                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
-                expense of slower inference.
-            output_type (`str`, *optional*, defaults to `"pil"`):
-                The output format of the generate image. Choose between
-                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
-            return_dict (`bool`, *optional*, defaults to `True`):
-                Whether or not to return a [`ImagePipelineOutput`] instead of a plain tuple.
-        """
-
-        batch_size = len(class_labels)
-        latent_size = self.transformer.config.sample_size
-        latent_channels = self.transformer.config.in_channels
-
-        latents = randn_tensor(
-            shape=(batch_size, latent_channels, latent_size, latent_size),
-            generator=generator,
-            device=self.device,
-            dtype=self.transformer.dtype,
-        )
-        latent_model_input = torch.cat([latents] * 2) if guidance_scale > 1 else latents
-
-        class_labels = torch.tensor(class_labels, device=self.device).reshape(-1)
-        class_null = torch.tensor([1000] * batch_size, device=self.device)
-        class_labels_input = torch.cat([class_labels, class_null], 0) if guidance_scale > 1 else class_labels
-
-        # set step values
-        self.scheduler.set_timesteps(num_inference_steps)
-
-        for t in self.progress_bar(self.scheduler.timesteps):
-            if guidance_scale > 1:
-                half = latent_model_input[: len(latent_model_input) // 2]
-                latent_model_input = torch.cat([half, half], dim=0)
-            latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
-
-            timesteps = t
-            if not torch.is_tensor(timesteps):
-                # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can
-                # This would be a good case for the `match` statement (Python 3.10+)
-                is_mps = latent_model_input.device.type == "mps"
-                if isinstance(timesteps, float):
-                    dtype = torch.float32 if is_mps else torch.float64
-                else:
-                    dtype = torch.int32 if is_mps else torch.int64
-                timesteps = torch.tensor([timesteps], dtype=dtype, device=latent_model_input.device)
-            elif len(timesteps.shape) == 0:
-                timesteps = timesteps[None].to(latent_model_input.device)
-            # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
-            timesteps = timesteps.expand(latent_model_input.shape[0])
-            # predict noise model_output
-            noise_pred = self.transformer(
-                latent_model_input, timestep=timesteps, class_labels=class_labels_input
-            ).sample
-
-            # perform guidance
-            if guidance_scale > 1:
-                eps, rest = noise_pred[:, :latent_channels], noise_pred[:, latent_channels:]
-                cond_eps, uncond_eps = torch.split(eps, len(eps) // 2, dim=0)
-
-                half_eps = uncond_eps + guidance_scale * (cond_eps - uncond_eps)
-                eps = torch.cat([half_eps, half_eps], dim=0)
-
-                noise_pred = torch.cat([eps, rest], dim=1)
-
-            # learned sigma
-            if self.transformer.config.out_channels // 2 == latent_channels:
-                model_output, _ = torch.split(noise_pred, latent_channels, dim=1)
-            else:
-                model_output = noise_pred
-
-            # compute previous image: x_t -> x_t-1
-            latent_model_input = self.scheduler.step(model_output, t, latent_model_input).prev_sample
-
-        if guidance_scale > 1:
-            latents, _ = latent_model_input.chunk(2, dim=0)
-        else:
-            latents = latent_model_input
-
-        latents = 1 / 0.18215 * latents
-        samples = self.vae.decode(latents).sample
-
-        samples = (samples / 2 + 0.5).clamp(0, 1)
-
-        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
-        samples = samples.cpu().permute(0, 2, 3, 1).float().numpy()
-
-        if output_type == "pil":
-            samples = self.numpy_to_pil(samples)
-
-        if not return_dict:
-            return (samples,)
-
-        return ImagePipelineOutput(images=samples)
--- a/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py
+++ b/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py
@@ -129,11 +129,11 @@ class LDMTextToImagePipeline(DiffusionPipeline):
            uncond_input = self.tokenizer(
                [""] * batch_size, padding="max_length", max_length=77, truncation=True, return_tensors="pt"
            )
-            negative_prompt_embeds = self.bert(uncond_input.input_ids.to(self.device))[0]
+            uncond_embeddings = self.bert(uncond_input.input_ids.to(self.device))[0]

        # get prompt text embeddings
        text_input = self.tokenizer(prompt, padding="max_length", max_length=77, truncation=True, return_tensors="pt")
-        prompt_embeds = self.bert(text_input.input_ids.to(self.device))[0]
+        text_embeddings = self.bert(text_input.input_ids.to(self.device))[0]

        # get the initial random noise unless the user supplied it
        latents_shape = (batch_size, self.unet.in_channels, height // 8, width // 8)
@@ -144,7 +144,7 @@ class LDMTextToImagePipeline(DiffusionPipeline):
            )

        if latents is None:
-            latents = randn_tensor(latents_shape, generator=generator, device=self.device, dtype=prompt_embeds.dtype)
+            latents = randn_tensor(latents_shape, generator=generator, device=self.device, dtype=text_embeddings.dtype)
        else:
            if latents.shape != latents_shape:
                raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}")
@@ -163,13 +163,13 @@ class LDMTextToImagePipeline(DiffusionPipeline):
            if guidance_scale == 1.0:
                # guidance_scale of 1 means no guidance
                latents_input = latents
-                context = prompt_embeds
+                context = text_embeddings
            else:
                # For classifier free guidance, we need to do two forward passes.
                # Here we concatenate the unconditional and text embeddings into a single batch
                # to avoid doing two forward passes
                latents_input = torch.cat([latents] * 2)
-                context = torch.cat([negative_prompt_embeds, prompt_embeds])
+                context = torch.cat([uncond_embeddings, text_embeddings])

            # predict the noise residual
            noise_pred = self.unet(latents_input, t, encoder_hidden_states=context).sample
--- a/src/diffusers/pipelines/paint_by_example/image_encoder.py
+++ b/src/diffusers/pipelines/paint_by_example/image_encoder.py
@@ -36,15 +36,12 @@ class PaintByExampleImageEncoder(CLIPPreTrainedModel):
        # uncondition for scaling
        self.uncond_vector = nn.Parameter(torch.randn((1, 1, self.proj_size)))

-    def forward(self, pixel_values, return_uncond_vector=False):
+    def forward(self, pixel_values):
        clip_output = self.model(pixel_values=pixel_values)
        latent_states = clip_output.pooler_output
        latent_states = self.mapper(latent_states[:, None])
        latent_states = self.final_layer_norm(latent_states)
        latent_states = self.proj_out(latent_states)
-        if return_uncond_vector:
-            return latent_states, self.uncond_vector
-
        return latent_states


--- a/src/diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py
+++ b/src/diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py
@@ -201,11 +201,14 @@ class PaintByExamplePipeline(DiffusionPipeline):

        device = torch.device(f"cuda:{gpu_id}")

-        for cpu_offloaded_model in [self.unet, self.vae, self.image_encoder]:
-            cpu_offload(cpu_offloaded_model, execution_device=device)
+        for cpu_offloaded_model in [self.unet, self.vae]:
+            if cpu_offloaded_model is not None:
+                cpu_offload(cpu_offloaded_model, device)

        if self.safety_checker is not None:
-            cpu_offload(self.safety_checker, execution_device=device, offload_buffers=True)
+            # TODO(Patrick) - there is currently a bug with cpu offload of nn.Parameter in accelerate
+            # fix by only offloading self.safety_checker for now
+            cpu_offload(self.safety_checker.vision_model, device)

    @property
    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._execution_device
@@ -364,7 +367,7 @@ class PaintByExamplePipeline(DiffusionPipeline):
            image = self.feature_extractor(images=image, return_tensors="pt").pixel_values

        image = image.to(device=device, dtype=dtype)
-        image_embeddings, negative_prompt_embeds = self.image_encoder(image, return_uncond_vector=True)
+        image_embeddings = self.image_encoder(image)

        # duplicate image embeddings for each generation per prompt, using mps friendly method
        bs_embed, seq_len, _ = image_embeddings.shape
@@ -372,13 +375,14 @@ class PaintByExamplePipeline(DiffusionPipeline):
        image_embeddings = image_embeddings.view(bs_embed * num_images_per_prompt, seq_len, -1)

        if do_classifier_free_guidance:
-            negative_prompt_embeds = negative_prompt_embeds.repeat(1, image_embeddings.shape[0], 1)
-            negative_prompt_embeds = negative_prompt_embeds.view(bs_embed * num_images_per_prompt, 1, -1)
+            uncond_embeddings = self.image_encoder.uncond_vector
+            uncond_embeddings = uncond_embeddings.repeat(1, image_embeddings.shape[0], 1)
+            uncond_embeddings = uncond_embeddings.view(bs_embed * num_images_per_prompt, 1, -1)

            # For classifier free guidance, we need to do two forward passes.
            # Here we concatenate the unconditional and text embeddings into a single batch
            # to avoid doing two forward passes
-            image_embeddings = torch.cat([negative_prompt_embeds, image_embeddings])
+            image_embeddings = torch.cat([uncond_embeddings, image_embeddings])

        return image_embeddings

--- a/src/diffusers/pipelines/pipeline_utils.py
+++ b/src/diffusers/pipelines/pipeline_utils.py
@@ -19,7 +19,7 @@ import inspect
 import os
 from dataclasses import dataclass
 from pathlib import Path
-from typing import Any, Callable, Dict, List, Optional, Union
+from typing import Any, Dict, List, Optional, Union

 import numpy as np
 import torch
@@ -37,7 +37,6 @@ from ..schedulers.scheduling_utils import SCHEDULER_CONFIG_NAME
 from ..utils import (
    CONFIG_NAME,
    DIFFUSERS_CACHE,
-    FLAX_WEIGHTS_NAME,
    HF_HUB_OFFLINE,
    ONNX_WEIGHTS_NAME,
    WEIGHTS_NAME,
@@ -446,7 +445,6 @@ class DiffusionPipeline(ConfigMixin):
        local_files_only = kwargs.pop("local_files_only", HF_HUB_OFFLINE)
        use_auth_token = kwargs.pop("use_auth_token", None)
        revision = kwargs.pop("revision", None)
-        from_flax = kwargs.pop("from_flax", False)
        torch_dtype = kwargs.pop("torch_dtype", None)
        custom_pipeline = kwargs.pop("custom_pipeline", None)
        custom_revision = kwargs.pop("custom_revision", None)
@@ -472,26 +470,11 @@ class DiffusionPipeline(ConfigMixin):
            # make sure we only download sub-folders and `diffusers` filenames
            folder_names = [k for k in config_dict.keys() if not k.startswith("_")]
            allow_patterns = [os.path.join(k, "*") for k in folder_names]
-            allow_patterns += [
-                WEIGHTS_NAME,
-                SCHEDULER_CONFIG_NAME,
-                CONFIG_NAME,
-                ONNX_WEIGHTS_NAME,
-                cls.config_name,
-            ]
+            allow_patterns += [WEIGHTS_NAME, SCHEDULER_CONFIG_NAME, CONFIG_NAME, ONNX_WEIGHTS_NAME, cls.config_name]

            # make sure we don't download flax weights
            ignore_patterns = ["*.msgpack"]

-            if from_flax:
-                ignore_patterns = ["*.bin", "*.safetensors"]
-                allow_patterns += [
-                    SCHEDULER_CONFIG_NAME,
-                    CONFIG_NAME,
-                    FLAX_WEIGHTS_NAME,
-                    cls.config_name,
-                ]
-
            if custom_pipeline is not None:
                allow_patterns += [CUSTOM_PIPELINE_FILE_NAME]

@@ -505,7 +488,7 @@ class DiffusionPipeline(ConfigMixin):

            user_agent = http_user_agent(user_agent)

-            if is_safetensors_available() and not local_files_only:
+            if is_safetensors_available():
                info = model_info(
                    pretrained_model_name_or_path,
                    use_auth_token=use_auth_token,
@@ -514,11 +497,7 @@ class DiffusionPipeline(ConfigMixin):
                if is_safetensors_compatible(info):
                    ignore_patterns.append("*.bin")
                else:
-                    # as a safety mechanism we also don't download safetensors if
-                    # not all safetensors files are there
                    ignore_patterns.append("*.safetensors")
-            else:
-                ignore_patterns.append("*.safetensors")

            # download all allow_patterns
            cached_folder = snapshot_download(
@@ -725,14 +704,7 @@ class DiffusionPipeline(ConfigMixin):
                # This makes sure that the weights won't be initialized which significantly speeds up loading.
                if is_diffusers_model or is_transformers_model:
                    loading_kwargs["device_map"] = device_map
-                    if from_flax:
-                        loading_kwargs["from_flax"] = True
-
-                    # if `from_flax` and model is transformer model, can currently not load with `low_cpu_mem_usage`
-                    if not (from_flax and is_transformers_model):
-                        loading_kwargs["low_cpu_mem_usage"] = low_cpu_mem_usage
-                    else:
-                        loading_kwargs["low_cpu_mem_usage"] = False
+                    loading_kwargs["low_cpu_mem_usage"] = low_cpu_mem_usage

                # check if the module is in a subdirectory
                if os.path.isdir(os.path.join(cached_folder, name)):
@@ -842,7 +814,7 @@ class DiffusionPipeline(ConfigMixin):
    def set_progress_bar_config(self, **kwargs):
        self._progress_bar_config = kwargs

-    def enable_xformers_memory_efficient_attention(self, attention_op: Optional[Callable] = None):
+    def enable_xformers_memory_efficient_attention(self):
        r"""
        Enable memory efficient attention as implemented in xformers.

@@ -851,28 +823,8 @@ class DiffusionPipeline(ConfigMixin):

        Warning: When Memory Efficient Attention and Sliced attention are both enabled, the Memory Efficient Attention
        is used.
-
-        Parameters:
-            attention_op (`Callable`, *optional*):
-                Override the default `None` operator for use as `op` argument to the
-                [`memory_efficient_attention()`](https://facebookresearch.github.io/xformers/components/ops.html#xformers.ops.memory_efficient_attention)
-                function of xFormers.
-
-        Examples:
-
-        ```py
-        >>> import torch
-        >>> from diffusers import DiffusionPipeline
-        >>> from xformers.ops import MemoryEfficientAttentionFlashAttentionOp
-
-        >>> pipe = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-1", torch_dtype=torch.float16)
-        >>> pipe = pipe.to("cuda")
-        >>> pipe.enable_xformers_memory_efficient_attention(attention_op=MemoryEfficientAttentionFlashAttentionOp)
-        >>> # Workaround for not accepting attention shape using VAE for Flash Attention
-        >>> pipe.vae.enable_xformers_memory_efficient_attention(attention_op=None)
-        ```
        """
-        self.set_use_memory_efficient_attention_xformers(True, attention_op)
+        self.set_use_memory_efficient_attention_xformers(True)

    def disable_xformers_memory_efficient_attention(self):
        r"""
@@ -880,15 +832,13 @@ class DiffusionPipeline(ConfigMixin):
        """
        self.set_use_memory_efficient_attention_xformers(False)

-    def set_use_memory_efficient_attention_xformers(
-        self, valid: bool, attention_op: Optional[Callable] = None
-    ) -> None:
+    def set_use_memory_efficient_attention_xformers(self, valid: bool) -> None:
        # Recursively walk through all the children.
        # Any children which exposes the set_use_memory_efficient_attention_xformers method
        # gets the message
        def fn_recursive_set_mem_eff(module: torch.nn.Module):
            if hasattr(module, "set_use_memory_efficient_attention_xformers"):
-                module.set_use_memory_efficient_attention_xformers(valid, attention_op)
+                module.set_use_memory_efficient_attention_xformers(valid)

            for child in module.children():
                fn_recursive_set_mem_eff(child)
--- a/src/diffusers/pipelines/pndm/pipeline_pndm.py
+++ b/src/diffusers/pipelines/pndm/pipeline_pndm.py
@@ -39,9 +39,6 @@ class PNDMPipeline(DiffusionPipeline):

    def __init__(self, unet: UNet2DModel, scheduler: PNDMScheduler):
        super().__init__()
-
-        scheduler = PNDMScheduler.from_config(scheduler.config)
-
        self.register_modules(unet=unet, scheduler=scheduler)

    @torch.no_grad()
--- a/src/diffusers/pipelines/repaint/pipeline_repaint.py
+++ b/src/diffusers/pipelines/repaint/pipeline_repaint.py
@@ -38,7 +38,7 @@ def _preprocess_image(image: Union[List, PIL.Image.Image, torch.Tensor]):

    if isinstance(image[0], PIL.Image.Image):
        w, h = image[0].size
-        w, h = map(lambda x: x - x % 8, (w, h))  # resize to integer multiple of 8
+        w, h = map(lambda x: x - x % 32, (w, h))  # resize to integer multiple of 32

        image = [np.array(i.resize((w, h), resample=PIL_INTERPOLATION["lanczos"]))[None, :] for i in image]
        image = np.concatenate(image, axis=0)
--- a/src/diffusers/pipelines/stable_diffusion/init.py
+++ b/src/diffusers/pipelines/stable_diffusion/init.py
@@ -11,7 +11,6 @@ from ...utils import (
    OptionalDependencyNotAvailable,
    is_flax_available,
    is_k_diffusion_available,
-    is_k_diffusion_version,
    is_onnx_available,
    is_torch_available,
    is_transformers_available,
@@ -43,7 +42,6 @@ if is_transformers_available() and is_torch_available():
    from .pipeline_stable_diffusion_img2img import StableDiffusionImg2ImgPipeline
    from .pipeline_stable_diffusion_inpaint import StableDiffusionInpaintPipeline
    from .pipeline_stable_diffusion_inpaint_legacy import StableDiffusionInpaintPipelineLegacy
-    from .pipeline_stable_diffusion_instruct_pix2pix import StableDiffusionInstructPix2PixPipeline
    from .pipeline_stable_diffusion_upscale import StableDiffusionUpscalePipeline
    from .safety_checker import StableDiffusionSafetyChecker

@@ -57,7 +55,7 @@ else:


 try:
-    if not (is_transformers_available() and is_torch_available() and is_transformers_version(">=", "4.26.0")):
+    if not (is_transformers_available() and is_torch_available() and is_transformers_version(">=", "4.26.0.dev0")):
        raise OptionalDependencyNotAvailable()
 except OptionalDependencyNotAvailable:
    from ...utils.dummy_torch_and_transformers_objects import StableDiffusionDepth2ImgPipeline
@@ -66,7 +64,7 @@ else:


 try:
-    if not (is_torch_available() and is_transformers_available() and is_k_diffusion_version(">=", "0.0.12")):
+    if not (is_torch_available() and is_transformers_available() and is_k_diffusion_available()):
        raise OptionalDependencyNotAvailable()
 except OptionalDependencyNotAvailable:
    from ...utils.dummy_torch_and_transformers_and_k_diffusion_objects import *  # noqa F403
@@ -101,5 +99,4 @@ if is_transformers_available() and is_flax_available():
    from ...schedulers.scheduling_pndm_flax import PNDMSchedulerState
    from .pipeline_flax_stable_diffusion import FlaxStableDiffusionPipeline
    from .pipeline_flax_stable_diffusion_img2img import FlaxStableDiffusionImg2ImgPipeline
-    from .pipeline_flax_stable_diffusion_inpaint import FlaxStableDiffusionInpaintPipeline
    from .safety_checker_flax import FlaxStableDiffusionSafetyChecker
--- a/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py
+++ b/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py
@@ -44,7 +44,7 @@ def preprocess(image):

    if isinstance(image[0], PIL.Image.Image):
        w, h = image[0].size
-        w, h = map(lambda x: x - x % 8, (w, h))  # resize to integer multiple of 8
+        w, h = map(lambda x: x - x % 32, (w, h))  # resize to integer multiple of 32

        image = [np.array(i.resize((w, h), resample=PIL_INTERPOLATION["lanczos"]))[None, :] for i in image]
        image = np.concatenate(image, axis=0)
@@ -236,10 +236,13 @@ class CycleDiffusionPipeline(DiffusionPipeline):
        device = torch.device(f"cuda:{gpu_id}")

        for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae]:
-            cpu_offload(cpu_offloaded_model, device)
+            if cpu_offloaded_model is not None:
+                cpu_offload(cpu_offloaded_model, device)

        if self.safety_checker is not None:
-            cpu_offload(self.safety_checker, execution_device=device, offload_buffers=True)
+            # TODO(Patrick) - there is currently a bug with cpu offload of nn.Parameter in accelerate
+            # fix by only offloading self.safety_checker for now
+            cpu_offload(self.safety_checker.vision_model, device)

    @property
    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._execution_device
@@ -261,21 +264,12 @@ class CycleDiffusionPipeline(DiffusionPipeline):
        return self.device

    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
-    def _encode_prompt(
-        self,
-        prompt,
-        device,
-        num_images_per_prompt,
-        do_classifier_free_guidance,
-        negative_prompt=None,
-        prompt_embeds: Optional[torch.FloatTensor] = None,
-        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
-    ):
+    def _encode_prompt(self, prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt):
        r"""
        Encodes the prompt into text encoder hidden states.

        Args:
-             prompt (`str` or `List[str]`, *optional*):
+            prompt (`str` or `list(int)`):
                prompt to be encoded
            device: (`torch.device`):
                torch device
@@ -283,67 +277,47 @@ class CycleDiffusionPipeline(DiffusionPipeline):
                number of images that should be generated per prompt
            do_classifier_free_guidance (`bool`):
                whether to use classifier free guidance or not
-            negative_ prompt (`str` or `List[str]`, *optional*):
-                The prompt or prompts not to guide the image generation. If not defined, one has to pass
-                `negative_prompt_embeds`. instead. If not defined, one has to pass `negative_prompt_embeds`. instead.
-                Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`).
-            prompt_embeds (`torch.FloatTensor`, *optional*):
-                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
-                provided, text embeddings will be generated from `prompt` input argument.
-            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
-                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
-                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
-                argument.
+            negative_prompt (`str` or `List[str]`):
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `guidance_scale` is less than `1`).
        """
-        if prompt is not None and isinstance(prompt, str):
-            batch_size = 1
-        elif prompt is not None and isinstance(prompt, list):
-            batch_size = len(prompt)
+        batch_size = len(prompt) if isinstance(prompt, list) else 1
+
+        text_inputs = self.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=self.tokenizer.model_max_length,
+            truncation=True,
+            return_tensors="pt",
+        )
+        text_input_ids = text_inputs.input_ids
+        untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+        if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
+            removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1])
+            logger.warning(
+                "The following part of your input was truncated because CLIP can only handle sequences up to"
+                f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+            )
+
+        if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+            attention_mask = text_inputs.attention_mask.to(device)
        else:
-            batch_size = prompt_embeds.shape[0]
+            attention_mask = None

-        if prompt_embeds is None:
-            text_inputs = self.tokenizer(
-                prompt,
-                padding="max_length",
-                max_length=self.tokenizer.model_max_length,
-                truncation=True,
-                return_tensors="pt",
-            )
-            text_input_ids = text_inputs.input_ids
-            untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+        text_embeddings = self.text_encoder(
+            text_input_ids.to(device),
+            attention_mask=attention_mask,
+        )
+        text_embeddings = text_embeddings[0]

-            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
-                text_input_ids, untruncated_ids
-            ):
-                removed_text = self.tokenizer.batch_decode(
-                    untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
-                )
-                logger.warning(
-                    "The following part of your input was truncated because CLIP can only handle sequences up to"
-                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
-                )
-
-            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
-                attention_mask = text_inputs.attention_mask.to(device)
-            else:
-                attention_mask = None
-
-            prompt_embeds = self.text_encoder(
-                text_input_ids.to(device),
-                attention_mask=attention_mask,
-            )
-            prompt_embeds = prompt_embeds[0]
-
-        prompt_embeds = prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
-
-        bs_embed, seq_len, _ = prompt_embeds.shape
        # duplicate text embeddings for each generation per prompt, using mps friendly method
-        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
-        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+        bs_embed, seq_len, _ = text_embeddings.shape
+        text_embeddings = text_embeddings.repeat(1, num_images_per_prompt, 1)
+        text_embeddings = text_embeddings.view(bs_embed * num_images_per_prompt, seq_len, -1)

        # get unconditional embeddings for classifier free guidance
-        if do_classifier_free_guidance and negative_prompt_embeds is None:
+        if do_classifier_free_guidance:
            uncond_tokens: List[str]
            if negative_prompt is None:
                uncond_tokens = [""] * batch_size
@@ -363,7 +337,7 @@ class CycleDiffusionPipeline(DiffusionPipeline):
            else:
                uncond_tokens = negative_prompt

-            max_length = prompt_embeds.shape[1]
+            max_length = text_input_ids.shape[-1]
            uncond_input = self.tokenizer(
                uncond_tokens,
                padding="max_length",
@@ -377,37 +351,31 @@ class CycleDiffusionPipeline(DiffusionPipeline):
            else:
                attention_mask = None

-            negative_prompt_embeds = self.text_encoder(
+            uncond_embeddings = self.text_encoder(
                uncond_input.input_ids.to(device),
                attention_mask=attention_mask,
            )
-            negative_prompt_embeds = negative_prompt_embeds[0]
+            uncond_embeddings = uncond_embeddings[0]

-        if do_classifier_free_guidance:
            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
-            seq_len = negative_prompt_embeds.shape[1]
-
-            negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
-
-            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
-            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+            seq_len = uncond_embeddings.shape[1]
+            uncond_embeddings = uncond_embeddings.repeat(1, num_images_per_prompt, 1)
+            uncond_embeddings = uncond_embeddings.view(batch_size * num_images_per_prompt, seq_len, -1)

            # For classifier free guidance, we need to do two forward passes.
            # Here we concatenate the unconditional and text embeddings into a single batch
            # to avoid doing two forward passes
-            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+            text_embeddings = torch.cat([uncond_embeddings, text_embeddings])

-        return prompt_embeds
+        return text_embeddings

    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.check_inputs
-    def check_inputs(
-        self, prompt, strength, callback_steps, negative_prompt=None, prompt_embeds=None, negative_prompt_embeds=None
-    ):
+    def check_inputs(self, prompt, strength, callback_steps):
        if not isinstance(prompt, str) and not isinstance(prompt, list):
            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")

        if strength < 0 or strength > 1:
-            raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}")
+            raise ValueError(f"The value of strength should in [1.0, 1.0] but is {strength}")

        if (callback_steps is None) or (
            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
@@ -417,32 +385,6 @@ class CycleDiffusionPipeline(DiffusionPipeline):
                f" {type(callback_steps)}."
            )

-        if prompt is not None and prompt_embeds is not None:
-            raise ValueError(
-                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
-                " only forward one of the two."
-            )
-        elif prompt is None and prompt_embeds is None:
-            raise ValueError(
-                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
-            )
-        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
-            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
-
-        if negative_prompt is not None and negative_prompt_embeds is not None:
-            raise ValueError(
-                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
-                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
-            )
-
-        if prompt_embeds is not None and negative_prompt_embeds is not None:
-            if prompt_embeds.shape != negative_prompt_embeds.shape:
-                raise ValueError(
-                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
-                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
-                    f" {negative_prompt_embeds.shape}."
-                )
-
    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
    def prepare_extra_step_kwargs(self, generator, eta):
        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
@@ -553,7 +495,6 @@ class CycleDiffusionPipeline(DiffusionPipeline):
        num_images_per_prompt: Optional[int] = 1,
        eta: Optional[float] = 0.1,
        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
-        prompt_embeds: Optional[torch.FloatTensor] = None,
        output_type: Optional[str] = "pil",
        return_dict: bool = True,
        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
@@ -595,13 +536,6 @@ class CycleDiffusionPipeline(DiffusionPipeline):
            generator (`torch.Generator`, *optional*):
                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
                to make generation deterministic.
-            prompt_embeds (`torch.FloatTensor`, *optional*):
-                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
-                provided, text embeddings will be generated from `prompt` input argument.
-            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
-                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
-                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
-                argument.
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generate image. Choose between
                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
@@ -638,14 +572,8 @@ class CycleDiffusionPipeline(DiffusionPipeline):
        do_classifier_free_guidance = guidance_scale > 1.0

        # 3. Encode input prompt
-        prompt_embeds = self._encode_prompt(
-            prompt,
-            device,
-            num_images_per_prompt,
-            do_classifier_free_guidance,
-            prompt_embeds=prompt_embeds,
-        )
-        source_prompt_embeds = self._encode_prompt(
+        text_embeddings = self._encode_prompt(prompt, device, num_images_per_prompt, do_classifier_free_guidance, None)
+        source_text_embeddings = self._encode_prompt(
            source_prompt, device, num_images_per_prompt, do_classifier_free_guidance, None
        )

@@ -659,7 +587,7 @@ class CycleDiffusionPipeline(DiffusionPipeline):

        # 6. Prepare latent variables
        latents, clean_latents = self.prepare_latents(
-            image, latent_timestep, batch_size, num_images_per_prompt, prompt_embeds.dtype, device, generator
+            image, latent_timestep, batch_size, num_images_per_prompt, text_embeddings.dtype, device, generator
        )
        source_latents = latents

@@ -687,17 +615,17 @@ class CycleDiffusionPipeline(DiffusionPipeline):
                    ],
                    dim=0,
                )
-                concat_prompt_embeds = torch.stack(
+                concat_text_embeddings = torch.stack(
                    [
-                        source_prompt_embeds[0],
-                        prompt_embeds[0],
-                        source_prompt_embeds[1],
-                        prompt_embeds[1],
+                        source_text_embeddings[0],
+                        text_embeddings[0],
+                        source_text_embeddings[1],
+                        text_embeddings[1],
                    ],
                    dim=0,
                )
                concat_noise_pred = self.unet(
-                    concat_latent_model_input, t, encoder_hidden_states=concat_prompt_embeds
+                    concat_latent_model_input, t, encoder_hidden_states=concat_text_embeddings
                ).sample

                # perform guidance
@@ -737,7 +665,7 @@ class CycleDiffusionPipeline(DiffusionPipeline):
        image = self.decode_latents(latents)

        # 10. Run safety checker
-        image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
+        image, has_nsfw_concept = self.run_safety_checker(image, device, text_embeddings.dtype)

        # 11. Convert to PIL
        if output_type == "pil":
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion.py
@@ -196,7 +196,7 @@ class FlaxStableDiffusionPipeline(FlaxDiffusionPipeline):
            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")

        # get prompt text embeddings
-        prompt_embeds = self.text_encoder(prompt_ids, params=params["text_encoder"])[0]
+        text_embeddings = self.text_encoder(prompt_ids, params=params["text_encoder"])[0]

        # TODO: currently it is assumed `do_classifier_free_guidance = guidance_scale > 1.0`
        # implement this conditional `do_classifier_free_guidance = guidance_scale > 1.0`
@@ -210,8 +210,8 @@ class FlaxStableDiffusionPipeline(FlaxDiffusionPipeline):
            ).input_ids
        else:
            uncond_input = neg_prompt_ids
-        negative_prompt_embeds = self.text_encoder(uncond_input, params=params["text_encoder"])[0]
-        context = jnp.concatenate([negative_prompt_embeds, prompt_embeds])
+        uncond_embeddings = self.text_encoder(uncond_input, params=params["text_encoder"])[0]
+        context = jnp.concatenate([uncond_embeddings, text_embeddings])

        latents_shape = (
            batch_size,
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion_img2img.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion_img2img.py
@@ -192,7 +192,7 @@ class FlaxStableDiffusionImg2ImgPipeline(FlaxDiffusionPipeline):
            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")

        # get prompt text embeddings
-        prompt_embeds = self.text_encoder(prompt_ids, params=params["text_encoder"])[0]
+        text_embeddings = self.text_encoder(prompt_ids, params=params["text_encoder"])[0]

        # TODO: currently it is assumed `do_classifier_free_guidance = guidance_scale > 1.0`
        # implement this conditional `do_classifier_free_guidance = guidance_scale > 1.0`
@@ -206,8 +206,8 @@ class FlaxStableDiffusionImg2ImgPipeline(FlaxDiffusionPipeline):
            ).input_ids
        else:
            uncond_input = neg_prompt_ids
-        negative_prompt_embeds = self.text_encoder(uncond_input, params=params["text_encoder"])[0]
-        context = jnp.concatenate([negative_prompt_embeds, prompt_embeds])
+        uncond_embeddings = self.text_encoder(uncond_input, params=params["text_encoder"])[0]
+        context = jnp.concatenate([uncond_embeddings, text_embeddings])

        latents_shape = (
            batch_size,
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion_inpaint.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion_inpaint.py
@@ -1,524 +0,0 @@
-# Copyright 2022 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import warnings
-from functools import partial
-from typing import Dict, List, Optional, Union
-
-import numpy as np
-
-import jax
-import jax.numpy as jnp
-from flax.core.frozen_dict import FrozenDict
-from flax.jax_utils import unreplicate
-from flax.training.common_utils import shard
-from packaging import version
-from PIL import Image
-from transformers import CLIPFeatureExtractor, CLIPTokenizer, FlaxCLIPTextModel
-
-from ...models import FlaxAutoencoderKL, FlaxUNet2DConditionModel
-from ...schedulers import (
-    FlaxDDIMScheduler,
-    FlaxDPMSolverMultistepScheduler,
-    FlaxLMSDiscreteScheduler,
-    FlaxPNDMScheduler,
-)
-from ...utils import PIL_INTERPOLATION, deprecate, logging
-from ..pipeline_flax_utils import FlaxDiffusionPipeline
-from . import FlaxStableDiffusionPipelineOutput
-from .safety_checker_flax import FlaxStableDiffusionSafetyChecker
-
-
-logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
-
-# Set to True to use python for loop instead of jax.fori_loop for easier debugging
-DEBUG = False
-
-
-class FlaxStableDiffusionInpaintPipeline(FlaxDiffusionPipeline):
-    r"""
-    Pipeline for text-guided image inpainting using Stable Diffusion. *This is an experimental feature*.
-
-    This model inherits from [`FlaxDiffusionPipeline`]. Check the superclass documentation for the generic methods the
-    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
-
-    Args:
-        vae ([`FlaxAutoencoderKL`]):
-            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
-        text_encoder ([`FlaxCLIPTextModel`]):
-            Frozen text-encoder. Stable Diffusion uses the text portion of
-            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.FlaxCLIPTextModel),
-            specifically the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
-        tokenizer (`CLIPTokenizer`):
-            Tokenizer of class
-            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
-        unet ([`FlaxUNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
-        scheduler ([`SchedulerMixin`]):
-            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
-            [`FlaxDDIMScheduler`], [`FlaxLMSDiscreteScheduler`], [`FlaxPNDMScheduler`], or
-            [`FlaxDPMSolverMultistepScheduler`].
-        safety_checker ([`FlaxStableDiffusionSafetyChecker`]):
-            Classification module that estimates whether generated images could be considered offensive or harmful.
-            Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details.
-        feature_extractor ([`CLIPFeatureExtractor`]):
-            Model that extracts features from generated images to be used as inputs for the `safety_checker`.
-    """
-
-    def __init__(
-        self,
-        vae: FlaxAutoencoderKL,
-        text_encoder: FlaxCLIPTextModel,
-        tokenizer: CLIPTokenizer,
-        unet: FlaxUNet2DConditionModel,
-        scheduler: Union[
-            FlaxDDIMScheduler, FlaxPNDMScheduler, FlaxLMSDiscreteScheduler, FlaxDPMSolverMultistepScheduler
-        ],
-        safety_checker: FlaxStableDiffusionSafetyChecker,
-        feature_extractor: CLIPFeatureExtractor,
-        dtype: jnp.dtype = jnp.float32,
-    ):
-        super().__init__()
-        self.dtype = dtype
-
-        if safety_checker is None:
-            logger.warning(
-                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
-                " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
-                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
-                " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
-                " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
-                " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
-            )
-
-        is_unet_version_less_0_9_0 = hasattr(unet.config, "_diffusers_version") and version.parse(
-            version.parse(unet.config._diffusers_version).base_version
-        ) < version.parse("0.9.0.dev0")
-        is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64
-        if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64:
-            deprecation_message = (
-                "The configuration file of the unet has set the default `sample_size` to smaller than"
-                " 64 which seems highly unlikely .If you're checkpoint is a fine-tuned version of any of the"
-                " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-"
-                " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5"
-                " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the"
-                " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`"
-                " in the config might lead to incorrect results in future versions. If you have downloaded this"
-                " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for"
-                " the `unet/config.json` file"
-            )
-            deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False)
-            new_config = dict(unet.config)
-            new_config["sample_size"] = 64
-            unet._internal_dict = FrozenDict(new_config)
-
-        self.register_modules(
-            vae=vae,
-            text_encoder=text_encoder,
-            tokenizer=tokenizer,
-            unet=unet,
-            scheduler=scheduler,
-            safety_checker=safety_checker,
-            feature_extractor=feature_extractor,
-        )
-        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
-
-    def prepare_inputs(
-        self,
-        prompt: Union[str, List[str]],
-        image: Union[Image.Image, List[Image.Image]],
-        mask: Union[Image.Image, List[Image.Image]],
-    ):
-        if not isinstance(prompt, (str, list)):
-            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
-
-        if not isinstance(image, (Image.Image, list)):
-            raise ValueError(f"image has to be of type `PIL.Image.Image` or list but is {type(image)}")
-
-        if isinstance(image, Image.Image):
-            image = [image]
-
-        if not isinstance(mask, (Image.Image, list)):
-            raise ValueError(f"image has to be of type `PIL.Image.Image` or list but is {type(image)}")
-
-        if isinstance(mask, Image.Image):
-            mask = [mask]
-
-        processed_images = jnp.concatenate([preprocess_image(img, jnp.float32) for img in image])
-        processed_masks = jnp.concatenate([preprocess_mask(m, jnp.float32) for m in mask])
-        # processed_masks[processed_masks < 0.5] = 0
-        processed_masks = processed_masks.at[processed_masks < 0.5].set(0)
-        # processed_masks[processed_masks >= 0.5] = 1
-        processed_masks = processed_masks.at[processed_masks >= 0.5].set(1)
-
-        processed_masked_images = processed_images * (processed_masks < 0.5)
-
-        text_input = self.tokenizer(
-            prompt,
-            padding="max_length",
-            max_length=self.tokenizer.model_max_length,
-            truncation=True,
-            return_tensors="np",
-        )
-        return text_input.input_ids, processed_masked_images, processed_masks
-
-    def _get_has_nsfw_concepts(self, features, params):
-        has_nsfw_concepts = self.safety_checker(features, params)
-        return has_nsfw_concepts
-
-    def _run_safety_checker(self, images, safety_model_params, jit=False):
-        # safety_model_params should already be replicated when jit is True
-        pil_images = [Image.fromarray(image) for image in images]
-        features = self.feature_extractor(pil_images, return_tensors="np").pixel_values
-
-        if jit:
-            features = shard(features)
-            has_nsfw_concepts = _p_get_has_nsfw_concepts(self, features, safety_model_params)
-            has_nsfw_concepts = unshard(has_nsfw_concepts)
-            safety_model_params = unreplicate(safety_model_params)
-        else:
-            has_nsfw_concepts = self._get_has_nsfw_concepts(features, safety_model_params)
-
-        images_was_copied = False
-        for idx, has_nsfw_concept in enumerate(has_nsfw_concepts):
-            if has_nsfw_concept:
-                if not images_was_copied:
-                    images_was_copied = True
-                    images = images.copy()
-
-                images[idx] = np.zeros(images[idx].shape, dtype=np.uint8)  # black image
-
-            if any(has_nsfw_concepts):
-                warnings.warn(
-                    "Potential NSFW content was detected in one or more images. A black image will be returned"
-                    " instead. Try again with a different prompt and/or seed."
-                )
-
-        return images, has_nsfw_concepts
-
-    def _generate(
-        self,
-        prompt_ids: jnp.array,
-        mask: jnp.array,
-        masked_image: jnp.array,
-        params: Union[Dict, FrozenDict],
-        prng_seed: jax.random.KeyArray,
-        num_inference_steps: int,
-        height: int,
-        width: int,
-        guidance_scale: float,
-        latents: Optional[jnp.array] = None,
-        neg_prompt_ids: Optional[jnp.array] = None,
-    ):
-        if height % 8 != 0 or width % 8 != 0:
-            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
-
-        # get prompt text embeddings
-        prompt_embeds = self.text_encoder(prompt_ids, params=params["text_encoder"])[0]
-
-        # TODO: currently it is assumed `do_classifier_free_guidance = guidance_scale > 1.0`
-        # implement this conditional `do_classifier_free_guidance = guidance_scale > 1.0`
-        batch_size = prompt_ids.shape[0]
-
-        max_length = prompt_ids.shape[-1]
-
-        if neg_prompt_ids is None:
-            uncond_input = self.tokenizer(
-                [""] * batch_size, padding="max_length", max_length=max_length, return_tensors="np"
-            ).input_ids
-        else:
-            uncond_input = neg_prompt_ids
-        negative_prompt_embeds = self.text_encoder(uncond_input, params=params["text_encoder"])[0]
-        context = jnp.concatenate([negative_prompt_embeds, prompt_embeds])
-
-        latents_shape = (
-            batch_size,
-            self.vae.config.latent_channels,
-            height // self.vae_scale_factor,
-            width // self.vae_scale_factor,
-        )
-        if latents is None:
-            latents = jax.random.normal(prng_seed, shape=latents_shape, dtype=self.dtype)
-        else:
-            if latents.shape != latents_shape:
-                raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}")
-
-        prng_seed, mask_prng_seed = jax.random.split(prng_seed)
-
-        masked_image_latent_dist = self.vae.apply(
-            {"params": params["vae"]}, masked_image, method=self.vae.encode
-        ).latent_dist
-        masked_image_latents = masked_image_latent_dist.sample(key=mask_prng_seed).transpose((0, 3, 1, 2))
-        masked_image_latents = 0.18215 * masked_image_latents
-        del mask_prng_seed
-
-        mask = jax.image.resize(mask, (*mask.shape[:-2], *masked_image_latents.shape[-2:]), method="nearest")
-
-        # 8. Check that sizes of mask, masked image and latents match
-        num_channels_latents = self.vae.config.latent_channels
-        num_channels_mask = mask.shape[1]
-        num_channels_masked_image = masked_image_latents.shape[1]
-        if num_channels_latents + num_channels_mask + num_channels_masked_image != self.unet.config.in_channels:
-            raise ValueError(
-                f"Incorrect configuration settings! The config of `pipeline.unet`: {self.unet.config} expects"
-                f" {self.unet.config.in_channels} but received `num_channels_latents`: {num_channels_latents} +"
-                f" `num_channels_mask`: {num_channels_mask} + `num_channels_masked_image`: {num_channels_masked_image}"
-                f" = {num_channels_latents+num_channels_masked_image+num_channels_mask}. Please verify the config of"
-                " `pipeline.unet` or your `mask_image` or `image` input."
-            )
-
-        def loop_body(step, args):
-            latents, mask, masked_image_latents, scheduler_state = args
-            # For classifier free guidance, we need to do two forward passes.
-            # Here we concatenate the unconditional and text embeddings into a single batch
-            # to avoid doing two forward passes
-            latents_input = jnp.concatenate([latents] * 2)
-            mask_input = jnp.concatenate([mask] * 2)
-            masked_image_latents_input = jnp.concatenate([masked_image_latents] * 2)
-
-            t = jnp.array(scheduler_state.timesteps, dtype=jnp.int32)[step]
-            timestep = jnp.broadcast_to(t, latents_input.shape[0])
-
-            latents_input = self.scheduler.scale_model_input(scheduler_state, latents_input, t)
-            # concat latents, mask, masked_image_latents in the channel dimension
-            latents_input = jnp.concatenate([latents_input, mask_input, masked_image_latents_input], axis=1)
-
-            # predict the noise residual
-            noise_pred = self.unet.apply(
-                {"params": params["unet"]},
-                jnp.array(latents_input),
-                jnp.array(timestep, dtype=jnp.int32),
-                encoder_hidden_states=context,
-            ).sample
-            # perform guidance
-            noise_pred_uncond, noise_prediction_text = jnp.split(noise_pred, 2, axis=0)
-            noise_pred = noise_pred_uncond + guidance_scale * (noise_prediction_text - noise_pred_uncond)
-
-            # compute the previous noisy sample x_t -> x_t-1
-            latents, scheduler_state = self.scheduler.step(scheduler_state, noise_pred, t, latents).to_tuple()
-            return latents, mask, masked_image_latents, scheduler_state
-
-        scheduler_state = self.scheduler.set_timesteps(
-            params["scheduler"], num_inference_steps=num_inference_steps, shape=latents.shape
-        )
-
-        # scale the initial noise by the standard deviation required by the scheduler
-        latents = latents * params["scheduler"].init_noise_sigma
-
-        if DEBUG:
-            # run with python for loop
-            for i in range(num_inference_steps):
-                latents, mask, masked_image_latents, scheduler_state = loop_body(
-                    i, (latents, mask, masked_image_latents, scheduler_state)
-                )
-        else:
-            latents, _, _, _ = jax.lax.fori_loop(
-                0, num_inference_steps, loop_body, (latents, mask, masked_image_latents, scheduler_state)
-            )
-
-        # scale and decode the image latents with vae
-        latents = 1 / 0.18215 * latents
-        image = self.vae.apply({"params": params["vae"]}, latents, method=self.vae.decode).sample
-
-        image = (image / 2 + 0.5).clip(0, 1).transpose(0, 2, 3, 1)
-        return image
-
-    def __call__(
-        self,
-        prompt_ids: jnp.array,
-        mask: jnp.array,
-        masked_image: jnp.array,
-        params: Union[Dict, FrozenDict],
-        prng_seed: jax.random.KeyArray,
-        num_inference_steps: int = 50,
-        height: Optional[int] = None,
-        width: Optional[int] = None,
-        guidance_scale: Union[float, jnp.array] = 7.5,
-        latents: jnp.array = None,
-        neg_prompt_ids: jnp.array = None,
-        return_dict: bool = True,
-        jit: bool = False,
-    ):
-        r"""
-        Function invoked when calling the pipeline for generation.
-
-        Args:
-            prompt (`str` or `List[str]`):
-                The prompt or prompts to guide the image generation.
-            height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
-                The height in pixels of the generated image.
-            width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
-                The width in pixels of the generated image.
-            num_inference_steps (`int`, *optional*, defaults to 50):
-                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
-                expense of slower inference.
-            guidance_scale (`float`, *optional*, defaults to 7.5):
-                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
-                `guidance_scale` is defined as `w` of equation 2. of [Imagen
-                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
-                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
-                usually at the expense of lower image quality.
-            latents (`jnp.array`, *optional*):
-                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
-                generation. Can be used to tweak the same generation with different prompts. tensor will ge generated
-                by sampling using the supplied random `generator`.
-            jit (`bool`, defaults to `False`):
-                Whether to run `pmap` versions of the generation and safety scoring functions. NOTE: This argument
-                exists because `__call__` is not yet end-to-end pmap-able. It will be removed in a future release.
-            return_dict (`bool`, *optional*, defaults to `True`):
-                Whether or not to return a [`~pipelines.stable_diffusion.FlaxStableDiffusionPipelineOutput`] instead of
-                a plain tuple.
-
-        Returns:
-            [`~pipelines.stable_diffusion.FlaxStableDiffusionPipelineOutput`] or `tuple`:
-                [`~pipelines.stable_diffusion.FlaxStableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a
-            `tuple. When returning a tuple, the first element is a list with the generated images, and the second
-            element is a list of `bool`s denoting whether the corresponding generated image likely represents
-            "not-safe-for-work" (nsfw) content, according to the `safety_checker`.
-        """
-        # 0. Default height and width to unet
-        height = height or self.unet.config.sample_size * self.vae_scale_factor
-        width = width or self.unet.config.sample_size * self.vae_scale_factor
-
-        masked_image = jax.image.resize(masked_image, (*masked_image.shape[:-2], height, width), method="bicubic")
-        mask = jax.image.resize(mask, (*mask.shape[:-2], height, width), method="nearest")
-
-        if isinstance(guidance_scale, float):
-            # Convert to a tensor so each device gets a copy. Follow the prompt_ids for
-            # shape information, as they may be sharded (when `jit` is `True`), or not.
-            guidance_scale = jnp.array([guidance_scale] * prompt_ids.shape[0])
-            if len(prompt_ids.shape) > 2:
-                # Assume sharded
-                guidance_scale = guidance_scale[:, None]
-
-        if jit:
-            images = _p_generate(
-                self,
-                prompt_ids,
-                mask,
-                masked_image,
-                params,
-                prng_seed,
-                num_inference_steps,
-                height,
-                width,
-                guidance_scale,
-                latents,
-                neg_prompt_ids,
-            )
-        else:
-            images = self._generate(
-                prompt_ids,
-                mask,
-                masked_image,
-                params,
-                prng_seed,
-                num_inference_steps,
-                height,
-                width,
-                guidance_scale,
-                latents,
-                neg_prompt_ids,
-            )
-
-        if self.safety_checker is not None:
-            safety_params = params["safety_checker"]
-            images_uint8_casted = (images * 255).round().astype("uint8")
-            num_devices, batch_size = images.shape[:2]
-
-            images_uint8_casted = np.asarray(images_uint8_casted).reshape(num_devices * batch_size, height, width, 3)
-            images_uint8_casted, has_nsfw_concept = self._run_safety_checker(images_uint8_casted, safety_params, jit)
-            images = np.asarray(images)
-
-            # block images
-            if any(has_nsfw_concept):
-                for i, is_nsfw in enumerate(has_nsfw_concept):
-                    if is_nsfw:
-                        images[i] = np.asarray(images_uint8_casted[i])
-
-            images = images.reshape(num_devices, batch_size, height, width, 3)
-        else:
-            images = np.asarray(images)
-            has_nsfw_concept = False
-
-        if not return_dict:
-            return (images, has_nsfw_concept)
-
-        return FlaxStableDiffusionPipelineOutput(images=images, nsfw_content_detected=has_nsfw_concept)
-
-
-# Static argnums are pipe, num_inference_steps, height, width. A change would trigger recompilation.
-# Non-static args are (sharded) input tensors mapped over their first dimension (hence, `0`).
-@partial(
-    jax.pmap,
-    in_axes=(None, 0, 0, 0, 0, 0, None, None, None, 0, 0, 0),
-    static_broadcasted_argnums=(0, 6, 7, 8),
-)
-def _p_generate(
-    pipe,
-    prompt_ids,
-    mask,
-    masked_image,
-    params,
-    prng_seed,
-    num_inference_steps,
-    height,
-    width,
-    guidance_scale,
-    latents,
-    neg_prompt_ids,
-):
-    return pipe._generate(
-        prompt_ids,
-        mask,
-        masked_image,
-        params,
-        prng_seed,
-        num_inference_steps,
-        height,
-        width,
-        guidance_scale,
-        latents,
-        neg_prompt_ids,
-    )
-
-
-@partial(jax.pmap, static_broadcasted_argnums=(0,))
-def _p_get_has_nsfw_concepts(pipe, features, params):
-    return pipe._get_has_nsfw_concepts(features, params)
-
-
-def unshard(x: jnp.ndarray):
-    # einops.rearrange(x, 'd b ... -> (d b) ...')
-    num_devices, batch_size = x.shape[:2]
-    rest = x.shape[2:]
-    return x.reshape(num_devices * batch_size, *rest)
-
-
-def preprocess_image(image, dtype):
-    w, h = image.size
-    w, h = map(lambda x: x - x % 32, (w, h))  # resize to integer multiple of 32
-    image = image.resize((w, h), resample=PIL_INTERPOLATION["lanczos"])
-    image = jnp.array(image).astype(dtype) / 255.0
-    image = image[None].transpose(0, 3, 1, 2)
-    return 2.0 * image - 1.0
-
-
-def preprocess_mask(mask, dtype):
-    w, h = mask.size
-    w, h = map(lambda x: x - x % 32, (w, h))  # resize to integer multiple of 32
-    mask = mask.resize((w, h))
-    mask = jnp.array(mask.convert("L")).astype(dtype) / 255.0
-    mask = jnp.expand_dims(mask, axis=(0, 1))
-
-    return mask
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py
@@ -117,7 +117,7 @@ class OnnxStableDiffusionPipeline(DiffusionPipeline):
        Encodes the prompt into text encoder hidden states.

        Args:
-            prompt (`str` or `List[str]`):
+            prompt (`str` or `list(int)`):
                prompt to be encoded
            num_images_per_prompt (`int`):
                number of images that should be generated per prompt
@@ -147,8 +147,8 @@ class OnnxStableDiffusionPipeline(DiffusionPipeline):
                f" {self.tokenizer.model_max_length} tokens: {removed_text}"
            )

-        prompt_embeds = self.text_encoder(input_ids=text_input_ids.astype(np.int32))[0]
-        prompt_embeds = np.repeat(prompt_embeds, num_images_per_prompt, axis=0)
+        text_embeddings = self.text_encoder(input_ids=text_input_ids.astype(np.int32))[0]
+        text_embeddings = np.repeat(text_embeddings, num_images_per_prompt, axis=0)

        # get unconditional embeddings for classifier free guidance
        if do_classifier_free_guidance:
@@ -179,15 +179,15 @@ class OnnxStableDiffusionPipeline(DiffusionPipeline):
                truncation=True,
                return_tensors="np",
            )
-            negative_prompt_embeds = self.text_encoder(input_ids=uncond_input.input_ids.astype(np.int32))[0]
-            negative_prompt_embeds = np.repeat(negative_prompt_embeds, num_images_per_prompt, axis=0)
+            uncond_embeddings = self.text_encoder(input_ids=uncond_input.input_ids.astype(np.int32))[0]
+            uncond_embeddings = np.repeat(uncond_embeddings, num_images_per_prompt, axis=0)

            # For classifier free guidance, we need to do two forward passes.
            # Here we concatenate the unconditional and text embeddings into a single batch
            # to avoid doing two forward passes
-            prompt_embeds = np.concatenate([negative_prompt_embeds, prompt_embeds])
+            text_embeddings = np.concatenate([uncond_embeddings, text_embeddings])

-        return prompt_embeds
+        return text_embeddings

    def __call__(
        self,
@@ -232,12 +232,12 @@ class OnnxStableDiffusionPipeline(DiffusionPipeline):
        # corresponds to doing no classifier free guidance.
        do_classifier_free_guidance = guidance_scale > 1.0

-        prompt_embeds = self._encode_prompt(
+        text_embeddings = self._encode_prompt(
            prompt, num_images_per_prompt, do_classifier_free_guidance, negative_prompt
        )

        # get the initial random noise unless the user supplied it
-        latents_dtype = prompt_embeds.dtype
+        latents_dtype = text_embeddings.dtype
        latents_shape = (batch_size * num_images_per_prompt, 4, height // 8, width // 8)
        if latents is None:
            latents = generator.randn(*latents_shape).astype(latents_dtype)
@@ -271,7 +271,7 @@ class OnnxStableDiffusionPipeline(DiffusionPipeline):

            # predict the noise residual
            timestep = np.array([t], dtype=timestep_dtype)
-            noise_pred = self.unet(sample=latent_model_input, timestep=timestep, encoder_hidden_states=prompt_embeds)
+            noise_pred = self.unet(sample=latent_model_input, timestep=timestep, encoder_hidden_states=text_embeddings)
            noise_pred = noise_pred[0]

            # perform guidance
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py
@@ -32,7 +32,7 @@ from . import StableDiffusionPipelineOutput
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name


-# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.preprocess with 8->64
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.preprocess
 def preprocess(image):
    if isinstance(image, torch.Tensor):
        return image
@@ -41,7 +41,7 @@ def preprocess(image):

    if isinstance(image[0], PIL.Image.Image):
        w, h = image[0].size
-        w, h = map(lambda x: x - x % 64, (w, h))  # resize to integer multiple of 64
+        w, h = map(lambda x: x - x % 32, (w, h))  # resize to integer multiple of 32

        image = [np.array(i.resize((w, h), resample=PIL_INTERPOLATION["lanczos"]))[None, :] for i in image]
        image = np.concatenate(image, axis=0)
@@ -167,7 +167,7 @@ class OnnxStableDiffusionImg2ImgPipeline(DiffusionPipeline):
        Encodes the prompt into text encoder hidden states.

        Args:
-            prompt (`str` or `List[str]`):
+            prompt (`str` or `list(int)`):
                prompt to be encoded
            num_images_per_prompt (`int`):
                number of images that should be generated per prompt
@@ -197,8 +197,8 @@ class OnnxStableDiffusionImg2ImgPipeline(DiffusionPipeline):
                f" {self.tokenizer.model_max_length} tokens: {removed_text}"
            )

-        prompt_embeds = self.text_encoder(input_ids=text_input_ids.astype(np.int32))[0]
-        prompt_embeds = np.repeat(prompt_embeds, num_images_per_prompt, axis=0)
+        text_embeddings = self.text_encoder(input_ids=text_input_ids.astype(np.int32))[0]
+        text_embeddings = np.repeat(text_embeddings, num_images_per_prompt, axis=0)

        # get unconditional embeddings for classifier free guidance
        if do_classifier_free_guidance:
@@ -229,15 +229,15 @@ class OnnxStableDiffusionImg2ImgPipeline(DiffusionPipeline):
                truncation=True,
                return_tensors="np",
            )
-            negative_prompt_embeds = self.text_encoder(input_ids=uncond_input.input_ids.astype(np.int32))[0]
-            negative_prompt_embeds = np.repeat(negative_prompt_embeds, num_images_per_prompt, axis=0)
+            uncond_embeddings = self.text_encoder(input_ids=uncond_input.input_ids.astype(np.int32))[0]
+            uncond_embeddings = np.repeat(uncond_embeddings, num_images_per_prompt, axis=0)

            # For classifier free guidance, we need to do two forward passes.
            # Here we concatenate the unconditional and text embeddings into a single batch
            # to avoid doing two forward passes
-            prompt_embeds = np.concatenate([negative_prompt_embeds, prompt_embeds])
+            text_embeddings = np.concatenate([uncond_embeddings, text_embeddings])

-        return prompt_embeds
+        return text_embeddings

    def __call__(
        self,
@@ -345,11 +345,11 @@ class OnnxStableDiffusionImg2ImgPipeline(DiffusionPipeline):
        # corresponds to doing no classifier free guidance.
        do_classifier_free_guidance = guidance_scale > 1.0

-        prompt_embeds = self._encode_prompt(
+        text_embeddings = self._encode_prompt(
            prompt, num_images_per_prompt, do_classifier_free_guidance, negative_prompt
        )

-        latents_dtype = prompt_embeds.dtype
+        latents_dtype = text_embeddings.dtype
        image = image.astype(latents_dtype)
        # encode the init image into latents and scale the latents
        init_latents = self.vae_encoder(sample=image)[0]
@@ -417,9 +417,9 @@ class OnnxStableDiffusionImg2ImgPipeline(DiffusionPipeline):

            # predict the noise residual
            timestep = np.array([t], dtype=timestep_dtype)
-            noise_pred = self.unet(sample=latent_model_input, timestep=timestep, encoder_hidden_states=prompt_embeds)[
-                0
-            ]
+            noise_pred = self.unet(
+                sample=latent_model_input, timestep=timestep, encoder_hidden_states=text_embeddings
+            )[0]

            # perform guidance
            if do_classifier_free_guidance:
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py
@@ -168,7 +168,7 @@ class OnnxStableDiffusionInpaintPipeline(DiffusionPipeline):
        Encodes the prompt into text encoder hidden states.

        Args:
-            prompt (`str` or `List[str]`):
+            prompt (`str` or `list(int)`):
                prompt to be encoded
            num_images_per_prompt (`int`):
                number of images that should be generated per prompt
@@ -198,8 +198,8 @@ class OnnxStableDiffusionInpaintPipeline(DiffusionPipeline):
                f" {self.tokenizer.model_max_length} tokens: {removed_text}"
            )

-        prompt_embeds = self.text_encoder(input_ids=text_input_ids.astype(np.int32))[0]
-        prompt_embeds = np.repeat(prompt_embeds, num_images_per_prompt, axis=0)
+        text_embeddings = self.text_encoder(input_ids=text_input_ids.astype(np.int32))[0]
+        text_embeddings = np.repeat(text_embeddings, num_images_per_prompt, axis=0)

        # get unconditional embeddings for classifier free guidance
        if do_classifier_free_guidance:
@@ -230,15 +230,15 @@ class OnnxStableDiffusionInpaintPipeline(DiffusionPipeline):
                truncation=True,
                return_tensors="np",
            )
-            negative_prompt_embeds = self.text_encoder(input_ids=uncond_input.input_ids.astype(np.int32))[0]
-            negative_prompt_embeds = np.repeat(negative_prompt_embeds, num_images_per_prompt, axis=0)
+            uncond_embeddings = self.text_encoder(input_ids=uncond_input.input_ids.astype(np.int32))[0]
+            uncond_embeddings = np.repeat(uncond_embeddings, num_images_per_prompt, axis=0)

            # For classifier free guidance, we need to do two forward passes.
            # Here we concatenate the unconditional and text embeddings into a single batch
            # to avoid doing two forward passes
-            prompt_embeds = np.concatenate([negative_prompt_embeds, prompt_embeds])
+            text_embeddings = np.concatenate([uncond_embeddings, text_embeddings])

-        return prompt_embeds
+        return text_embeddings

    @torch.no_grad()
    def __call__(
@@ -351,13 +351,13 @@ class OnnxStableDiffusionInpaintPipeline(DiffusionPipeline):
        # corresponds to doing no classifier free guidance.
        do_classifier_free_guidance = guidance_scale > 1.0

-        prompt_embeds = self._encode_prompt(
+        text_embeddings = self._encode_prompt(
            prompt, num_images_per_prompt, do_classifier_free_guidance, negative_prompt
        )

        num_channels_latents = NUM_LATENT_CHANNELS
        latents_shape = (batch_size * num_images_per_prompt, num_channels_latents, height // 8, width // 8)
-        latents_dtype = prompt_embeds.dtype
+        latents_dtype = text_embeddings.dtype
        if latents is None:
            latents = generator.randn(*latents_shape).astype(latents_dtype)
        else:
@@ -424,9 +424,9 @@ class OnnxStableDiffusionInpaintPipeline(DiffusionPipeline):

            # predict the noise residual
            timestep = np.array([t], dtype=timestep_dtype)
-            noise_pred = self.unet(sample=latent_model_input, timestep=timestep, encoder_hidden_states=prompt_embeds)[
-                0
-            ]
+            noise_pred = self.unet(
+                sample=latent_model_input, timestep=timestep, encoder_hidden_states=text_embeddings
+            )[0]

            # perform guidance
            if do_classifier_free_guidance:
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint_legacy.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint_legacy.py
@@ -153,7 +153,7 @@ class OnnxStableDiffusionInpaintPipelineLegacy(DiffusionPipeline):
        Encodes the prompt into text encoder hidden states.

        Args:
-            prompt (`str` or `List[str]`):
+            prompt (`str` or `list(int)`):
                prompt to be encoded
            num_images_per_prompt (`int`):
                number of images that should be generated per prompt
@@ -183,8 +183,8 @@ class OnnxStableDiffusionInpaintPipelineLegacy(DiffusionPipeline):
                f" {self.tokenizer.model_max_length} tokens: {removed_text}"
            )

-        prompt_embeds = self.text_encoder(input_ids=text_input_ids.astype(np.int32))[0]
-        prompt_embeds = np.repeat(prompt_embeds, num_images_per_prompt, axis=0)
+        text_embeddings = self.text_encoder(input_ids=text_input_ids.astype(np.int32))[0]
+        text_embeddings = np.repeat(text_embeddings, num_images_per_prompt, axis=0)

        # get unconditional embeddings for classifier free guidance
        if do_classifier_free_guidance:
@@ -215,15 +215,15 @@ class OnnxStableDiffusionInpaintPipelineLegacy(DiffusionPipeline):
                truncation=True,
                return_tensors="np",
            )
-            negative_prompt_embeds = self.text_encoder(input_ids=uncond_input.input_ids.astype(np.int32))[0]
-            negative_prompt_embeds = np.repeat(negative_prompt_embeds, num_images_per_prompt, axis=0)
+            uncond_embeddings = self.text_encoder(input_ids=uncond_input.input_ids.astype(np.int32))[0]
+            uncond_embeddings = np.repeat(uncond_embeddings, num_images_per_prompt, axis=0)

            # For classifier free guidance, we need to do two forward passes.
            # Here we concatenate the unconditional and text embeddings into a single batch
            # to avoid doing two forward passes
-            prompt_embeds = np.concatenate([negative_prompt_embeds, prompt_embeds])
+            text_embeddings = np.concatenate([uncond_embeddings, text_embeddings])

-        return prompt_embeds
+        return text_embeddings

    def __call__(
        self,
@@ -338,11 +338,11 @@ class OnnxStableDiffusionInpaintPipelineLegacy(DiffusionPipeline):
        # corresponds to doing no classifier free guidance.
        do_classifier_free_guidance = guidance_scale > 1.0

-        prompt_embeds = self._encode_prompt(
+        text_embeddings = self._encode_prompt(
            prompt, num_images_per_prompt, do_classifier_free_guidance, negative_prompt
        )

-        latents_dtype = prompt_embeds.dtype
+        latents_dtype = text_embeddings.dtype
        image = image.astype(latents_dtype)

        # encode the init image into latents and scale the latents
@@ -399,7 +399,7 @@ class OnnxStableDiffusionInpaintPipelineLegacy(DiffusionPipeline):

            # predict the noise residual
            noise_pred = self.unet(
-                sample=latent_model_input, timestep=np.array([t]), encoder_hidden_states=prompt_embeds
+                sample=latent_model_input, timestep=np.array([t]), encoder_hidden_states=text_embeddings
            )[0]

            # perform guidance
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
@@ -22,7 +22,14 @@ from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer

 from ...configuration_utils import FrozenDict
 from ...models import AutoencoderKL, UNet2DConditionModel
-from ...schedulers import KarrasDiffusionSchedulers
+from ...schedulers import (
+    DDIMScheduler,
+    DPMSolverMultistepScheduler,
+    EulerAncestralDiscreteScheduler,
+    EulerDiscreteScheduler,
+    LMSDiscreteScheduler,
+    PNDMScheduler,
+)
 from ...utils import deprecate, is_accelerate_available, logging, randn_tensor, replace_example_docstring
 from ..pipeline_utils import DiffusionPipeline
 from . import StableDiffusionPipelineOutput
@@ -81,7 +88,14 @@ class StableDiffusionPipeline(DiffusionPipeline):
        text_encoder: CLIPTextModel,
        tokenizer: CLIPTokenizer,
        unet: UNet2DConditionModel,
-        scheduler: KarrasDiffusionSchedulers,
+        scheduler: Union[
+            DDIMScheduler,
+            PNDMScheduler,
+            LMSDiscreteScheduler,
+            EulerDiscreteScheduler,
+            EulerAncestralDiscreteScheduler,
+            DPMSolverMultistepScheduler,
+        ],
        safety_checker: StableDiffusionSafetyChecker,
        feature_extractor: CLIPFeatureExtractor,
        requires_safety_checker: bool = True,
@@ -194,10 +208,13 @@ class StableDiffusionPipeline(DiffusionPipeline):
        device = torch.device(f"cuda:{gpu_id}")

        for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae]:
-            cpu_offload(cpu_offloaded_model, device)
+            if cpu_offloaded_model is not None:
+                cpu_offload(cpu_offloaded_model, device)

        if self.safety_checker is not None:
-            cpu_offload(self.safety_checker, execution_device=device, offload_buffers=True)
+            # TODO(Patrick) - there is currently a bug with cpu offload of nn.Parameter in accelerate
+            # fix by only offloading self.safety_checker for now
+            cpu_offload(self.safety_checker.vision_model, device)

    @property
    def _execution_device(self):
@@ -217,21 +234,12 @@ class StableDiffusionPipeline(DiffusionPipeline):
                return torch.device(module._hf_hook.execution_device)
        return self.device

-    def _encode_prompt(
-        self,
-        prompt,
-        device,
-        num_images_per_prompt,
-        do_classifier_free_guidance,
-        negative_prompt=None,
-        prompt_embeds: Optional[torch.FloatTensor] = None,
-        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
-    ):
+    def _encode_prompt(self, prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt):
        r"""
        Encodes the prompt into text encoder hidden states.

        Args:
-             prompt (`str` or `List[str]`, *optional*):
+            prompt (`str` or `list(int)`):
                prompt to be encoded
            device: (`torch.device`):
                torch device
@@ -239,67 +247,47 @@ class StableDiffusionPipeline(DiffusionPipeline):
                number of images that should be generated per prompt
            do_classifier_free_guidance (`bool`):
                whether to use classifier free guidance or not
-            negative_ prompt (`str` or `List[str]`, *optional*):
-                The prompt or prompts not to guide the image generation. If not defined, one has to pass
-                `negative_prompt_embeds`. instead. If not defined, one has to pass `negative_prompt_embeds`. instead.
-                Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`).
-            prompt_embeds (`torch.FloatTensor`, *optional*):
-                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
-                provided, text embeddings will be generated from `prompt` input argument.
-            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
-                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
-                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
-                argument.
+            negative_prompt (`str` or `List[str]`):
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `guidance_scale` is less than `1`).
        """
-        if prompt is not None and isinstance(prompt, str):
-            batch_size = 1
-        elif prompt is not None and isinstance(prompt, list):
-            batch_size = len(prompt)
+        batch_size = len(prompt) if isinstance(prompt, list) else 1
+
+        text_inputs = self.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=self.tokenizer.model_max_length,
+            truncation=True,
+            return_tensors="pt",
+        )
+        text_input_ids = text_inputs.input_ids
+        untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+        if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
+            removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1])
+            logger.warning(
+                "The following part of your input was truncated because CLIP can only handle sequences up to"
+                f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+            )
+
+        if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+            attention_mask = text_inputs.attention_mask.to(device)
        else:
-            batch_size = prompt_embeds.shape[0]
+            attention_mask = None

-        if prompt_embeds is None:
-            text_inputs = self.tokenizer(
-                prompt,
-                padding="max_length",
-                max_length=self.tokenizer.model_max_length,
-                truncation=True,
-                return_tensors="pt",
-            )
-            text_input_ids = text_inputs.input_ids
-            untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+        text_embeddings = self.text_encoder(
+            text_input_ids.to(device),
+            attention_mask=attention_mask,
+        )
+        text_embeddings = text_embeddings[0]

-            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
-                text_input_ids, untruncated_ids
-            ):
-                removed_text = self.tokenizer.batch_decode(
-                    untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
-                )
-                logger.warning(
-                    "The following part of your input was truncated because CLIP can only handle sequences up to"
-                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
-                )
-
-            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
-                attention_mask = text_inputs.attention_mask.to(device)
-            else:
-                attention_mask = None
-
-            prompt_embeds = self.text_encoder(
-                text_input_ids.to(device),
-                attention_mask=attention_mask,
-            )
-            prompt_embeds = prompt_embeds[0]
-
-        prompt_embeds = prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
-
-        bs_embed, seq_len, _ = prompt_embeds.shape
        # duplicate text embeddings for each generation per prompt, using mps friendly method
-        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
-        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+        bs_embed, seq_len, _ = text_embeddings.shape
+        text_embeddings = text_embeddings.repeat(1, num_images_per_prompt, 1)
+        text_embeddings = text_embeddings.view(bs_embed * num_images_per_prompt, seq_len, -1)

        # get unconditional embeddings for classifier free guidance
-        if do_classifier_free_guidance and negative_prompt_embeds is None:
+        if do_classifier_free_guidance:
            uncond_tokens: List[str]
            if negative_prompt is None:
                uncond_tokens = [""] * batch_size
@@ -319,7 +307,7 @@ class StableDiffusionPipeline(DiffusionPipeline):
            else:
                uncond_tokens = negative_prompt

-            max_length = prompt_embeds.shape[1]
+            max_length = text_input_ids.shape[-1]
            uncond_input = self.tokenizer(
                uncond_tokens,
                padding="max_length",
@@ -333,27 +321,23 @@ class StableDiffusionPipeline(DiffusionPipeline):
            else:
                attention_mask = None

-            negative_prompt_embeds = self.text_encoder(
+            uncond_embeddings = self.text_encoder(
                uncond_input.input_ids.to(device),
                attention_mask=attention_mask,
            )
-            negative_prompt_embeds = negative_prompt_embeds[0]
+            uncond_embeddings = uncond_embeddings[0]

-        if do_classifier_free_guidance:
            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
-            seq_len = negative_prompt_embeds.shape[1]
-
-            negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
-
-            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
-            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+            seq_len = uncond_embeddings.shape[1]
+            uncond_embeddings = uncond_embeddings.repeat(1, num_images_per_prompt, 1)
+            uncond_embeddings = uncond_embeddings.view(batch_size * num_images_per_prompt, seq_len, -1)

            # For classifier free guidance, we need to do two forward passes.
            # Here we concatenate the unconditional and text embeddings into a single batch
            # to avoid doing two forward passes
-            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+            text_embeddings = torch.cat([uncond_embeddings, text_embeddings])

-        return prompt_embeds
+        return text_embeddings

    def run_safety_checker(self, image, device, dtype):
        if self.safety_checker is not None:
@@ -390,16 +374,10 @@ class StableDiffusionPipeline(DiffusionPipeline):
            extra_step_kwargs["generator"] = generator
        return extra_step_kwargs

-    def check_inputs(
-        self,
-        prompt,
-        height,
-        width,
-        callback_steps,
-        negative_prompt=None,
-        prompt_embeds=None,
-        negative_prompt_embeds=None,
-    ):
+    def check_inputs(self, prompt, height, width, callback_steps):
+        if not isinstance(prompt, str) and not isinstance(prompt, list):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
        if height % 8 != 0 or width % 8 != 0:
            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")

@@ -411,32 +389,6 @@ class StableDiffusionPipeline(DiffusionPipeline):
                f" {type(callback_steps)}."
            )

-        if prompt is not None and prompt_embeds is not None:
-            raise ValueError(
-                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
-                " only forward one of the two."
-            )
-        elif prompt is None and prompt_embeds is None:
-            raise ValueError(
-                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
-            )
-        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
-            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
-
-        if negative_prompt is not None and negative_prompt_embeds is not None:
-            raise ValueError(
-                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
-                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
-            )
-
-        if prompt_embeds is not None and negative_prompt_embeds is not None:
-            if prompt_embeds.shape != negative_prompt_embeds.shape:
-                raise ValueError(
-                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
-                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
-                    f" {negative_prompt_embeds.shape}."
-                )
-
    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
        shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
        if isinstance(generator, list) and len(generator) != batch_size:
@@ -458,7 +410,7 @@ class StableDiffusionPipeline(DiffusionPipeline):
    @replace_example_docstring(EXAMPLE_DOC_STRING)
    def __call__(
        self,
-        prompt: Union[str, List[str]] = None,
+        prompt: Union[str, List[str]],
        height: Optional[int] = None,
        width: Optional[int] = None,
        num_inference_steps: int = 50,
@@ -468,8 +420,6 @@ class StableDiffusionPipeline(DiffusionPipeline):
        eta: float = 0.0,
        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
        latents: Optional[torch.FloatTensor] = None,
-        prompt_embeds: Optional[torch.FloatTensor] = None,
-        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
        output_type: Optional[str] = "pil",
        return_dict: bool = True,
        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
@@ -479,9 +429,8 @@ class StableDiffusionPipeline(DiffusionPipeline):
        Function invoked when calling the pipeline for generation.

        Args:
-            prompt (`str` or `List[str]`, *optional*):
-                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
-                instead.
+            prompt (`str` or `List[str]`):
+                The prompt or prompts to guide the image generation.
            height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
                The height in pixels of the generated image.
            width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
@@ -496,28 +445,20 @@ class StableDiffusionPipeline(DiffusionPipeline):
                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
                usually at the expense of lower image quality.
            negative_prompt (`str` or `List[str]`, *optional*):
-                The prompt or prompts not to guide the image generation. If not defined, one has to pass
-                `negative_prompt_embeds`. instead. If not defined, one has to pass `negative_prompt_embeds`. instead.
-                Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`).
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `guidance_scale` is less than `1`).
            num_images_per_prompt (`int`, *optional*, defaults to 1):
                The number of images to generate per prompt.
            eta (`float`, *optional*, defaults to 0.0):
                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
                [`schedulers.DDIMScheduler`], will be ignored for others.
-            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+            generator (`torch.Generator`, *optional*):
                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
                to make generation deterministic.
            latents (`torch.FloatTensor`, *optional*):
                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                tensor will ge generated by sampling using the supplied random `generator`.
-            prompt_embeds (`torch.FloatTensor`, *optional*):
-                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
-                provided, text embeddings will be generated from `prompt` input argument.
-            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
-                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
-                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
-                argument.
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generate image. Choose between
                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
@@ -545,18 +486,10 @@ class StableDiffusionPipeline(DiffusionPipeline):
        width = width or self.unet.config.sample_size * self.vae_scale_factor

        # 1. Check inputs. Raise error if not correct
-        self.check_inputs(
-            prompt, height, width, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds
-        )
+        self.check_inputs(prompt, height, width, callback_steps)

        # 2. Define call parameters
-        if prompt is not None and isinstance(prompt, str):
-            batch_size = 1
-        elif prompt is not None and isinstance(prompt, list):
-            batch_size = len(prompt)
-        else:
-            batch_size = prompt_embeds.shape[0]
-
+        batch_size = 1 if isinstance(prompt, str) else len(prompt)
        device = self._execution_device
        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
@@ -564,15 +497,10 @@ class StableDiffusionPipeline(DiffusionPipeline):
        do_classifier_free_guidance = guidance_scale > 1.0

        # 3. Encode input prompt
-        prompt_embeds = self._encode_prompt(
-            prompt,
-            device,
-            num_images_per_prompt,
-            do_classifier_free_guidance,
-            negative_prompt,
-            prompt_embeds=prompt_embeds,
-            negative_prompt_embeds=negative_prompt_embeds,
+        text_embeddings = self._encode_prompt(
+            prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt
        )
+        torch.cuda.empty_cache()

        # 4. Prepare timesteps
        self.scheduler.set_timesteps(num_inference_steps, device=device)
@@ -585,7 +513,7 @@ class StableDiffusionPipeline(DiffusionPipeline):
            num_channels_latents,
            height,
            width,
-            prompt_embeds.dtype,
+            text_embeddings.dtype,
            device,
            generator,
            latents,
@@ -603,7 +531,7 @@ class StableDiffusionPipeline(DiffusionPipeline):
                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)

                # predict the noise residual
-                noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=prompt_embeds).sample
+                noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample

                # perform guidance
                if do_classifier_free_guidance:
@@ -623,7 +551,7 @@ class StableDiffusionPipeline(DiffusionPipeline):
        image = self.decode_latents(latents)

        # 9. Run safety checker
-        image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
+        image, has_nsfw_concept = self.run_safety_checker(image, device, text_embeddings.dtype)

        # 10. Convert to PIL
        if output_type == "pil":
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py
@@ -25,7 +25,14 @@ from transformers import CLIPTextModel, CLIPTokenizer, DPTFeatureExtractor, DPTF

 from ...configuration_utils import FrozenDict
 from ...models import AutoencoderKL, UNet2DConditionModel
-from ...schedulers import KarrasDiffusionSchedulers
+from ...schedulers import (
+    DDIMScheduler,
+    DPMSolverMultistepScheduler,
+    EulerAncestralDiscreteScheduler,
+    EulerDiscreteScheduler,
+    LMSDiscreteScheduler,
+    PNDMScheduler,
+)
 from ...utils import PIL_INTERPOLATION, deprecate, is_accelerate_available, logging, randn_tensor
 from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput

@@ -42,7 +49,7 @@ def preprocess(image):

    if isinstance(image[0], PIL.Image.Image):
        w, h = image[0].size
-        w, h = map(lambda x: x - x % 8, (w, h))  # resize to integer multiple of 8
+        w, h = map(lambda x: x - x % 32, (w, h))  # resize to integer multiple of 32

        image = [np.array(i.resize((w, h), resample=PIL_INTERPOLATION["lanczos"]))[None, :] for i in image]
        image = np.concatenate(image, axis=0)
@@ -84,7 +91,14 @@ class StableDiffusionDepth2ImgPipeline(DiffusionPipeline):
        text_encoder: CLIPTextModel,
        tokenizer: CLIPTokenizer,
        unet: UNet2DConditionModel,
-        scheduler: KarrasDiffusionSchedulers,
+        scheduler: Union[
+            DDIMScheduler,
+            PNDMScheduler,
+            LMSDiscreteScheduler,
+            EulerDiscreteScheduler,
+            EulerAncestralDiscreteScheduler,
+            DPMSolverMultistepScheduler,
+        ],
        depth_estimator: DPTForDepthEstimation,
        feature_extractor: DPTFeatureExtractor,
    ):
@@ -159,21 +173,12 @@ class StableDiffusionDepth2ImgPipeline(DiffusionPipeline):
        return self.device

    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
-    def _encode_prompt(
-        self,
-        prompt,
-        device,
-        num_images_per_prompt,
-        do_classifier_free_guidance,
-        negative_prompt=None,
-        prompt_embeds: Optional[torch.FloatTensor] = None,
-        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
-    ):
+    def _encode_prompt(self, prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt):
        r"""
        Encodes the prompt into text encoder hidden states.

        Args:
-             prompt (`str` or `List[str]`, *optional*):
+            prompt (`str` or `list(int)`):
                prompt to be encoded
            device: (`torch.device`):
                torch device
@@ -181,67 +186,47 @@ class StableDiffusionDepth2ImgPipeline(DiffusionPipeline):
                number of images that should be generated per prompt
            do_classifier_free_guidance (`bool`):
                whether to use classifier free guidance or not
-            negative_ prompt (`str` or `List[str]`, *optional*):
-                The prompt or prompts not to guide the image generation. If not defined, one has to pass
-                `negative_prompt_embeds`. instead. If not defined, one has to pass `negative_prompt_embeds`. instead.
-                Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`).
-            prompt_embeds (`torch.FloatTensor`, *optional*):
-                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
-                provided, text embeddings will be generated from `prompt` input argument.
-            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
-                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
-                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
-                argument.
+            negative_prompt (`str` or `List[str]`):
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `guidance_scale` is less than `1`).
        """
-        if prompt is not None and isinstance(prompt, str):
-            batch_size = 1
-        elif prompt is not None and isinstance(prompt, list):
-            batch_size = len(prompt)
+        batch_size = len(prompt) if isinstance(prompt, list) else 1
+
+        text_inputs = self.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=self.tokenizer.model_max_length,
+            truncation=True,
+            return_tensors="pt",
+        )
+        text_input_ids = text_inputs.input_ids
+        untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+        if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
+            removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1])
+            logger.warning(
+                "The following part of your input was truncated because CLIP can only handle sequences up to"
+                f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+            )
+
+        if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+            attention_mask = text_inputs.attention_mask.to(device)
        else:
-            batch_size = prompt_embeds.shape[0]
+            attention_mask = None

-        if prompt_embeds is None:
-            text_inputs = self.tokenizer(
-                prompt,
-                padding="max_length",
-                max_length=self.tokenizer.model_max_length,
-                truncation=True,
-                return_tensors="pt",
-            )
-            text_input_ids = text_inputs.input_ids
-            untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+        text_embeddings = self.text_encoder(
+            text_input_ids.to(device),
+            attention_mask=attention_mask,
+        )
+        text_embeddings = text_embeddings[0]

-            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
-                text_input_ids, untruncated_ids
-            ):
-                removed_text = self.tokenizer.batch_decode(
-                    untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
-                )
-                logger.warning(
-                    "The following part of your input was truncated because CLIP can only handle sequences up to"
-                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
-                )
-
-            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
-                attention_mask = text_inputs.attention_mask.to(device)
-            else:
-                attention_mask = None
-
-            prompt_embeds = self.text_encoder(
-                text_input_ids.to(device),
-                attention_mask=attention_mask,
-            )
-            prompt_embeds = prompt_embeds[0]
-
-        prompt_embeds = prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
-
-        bs_embed, seq_len, _ = prompt_embeds.shape
        # duplicate text embeddings for each generation per prompt, using mps friendly method
-        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
-        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+        bs_embed, seq_len, _ = text_embeddings.shape
+        text_embeddings = text_embeddings.repeat(1, num_images_per_prompt, 1)
+        text_embeddings = text_embeddings.view(bs_embed * num_images_per_prompt, seq_len, -1)

        # get unconditional embeddings for classifier free guidance
-        if do_classifier_free_guidance and negative_prompt_embeds is None:
+        if do_classifier_free_guidance:
            uncond_tokens: List[str]
            if negative_prompt is None:
                uncond_tokens = [""] * batch_size
@@ -261,7 +246,7 @@ class StableDiffusionDepth2ImgPipeline(DiffusionPipeline):
            else:
                uncond_tokens = negative_prompt

-            max_length = prompt_embeds.shape[1]
+            max_length = text_input_ids.shape[-1]
            uncond_input = self.tokenizer(
                uncond_tokens,
                padding="max_length",
@@ -275,27 +260,23 @@ class StableDiffusionDepth2ImgPipeline(DiffusionPipeline):
            else:
                attention_mask = None

-            negative_prompt_embeds = self.text_encoder(
+            uncond_embeddings = self.text_encoder(
                uncond_input.input_ids.to(device),
                attention_mask=attention_mask,
            )
-            negative_prompt_embeds = negative_prompt_embeds[0]
+            uncond_embeddings = uncond_embeddings[0]

-        if do_classifier_free_guidance:
            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
-            seq_len = negative_prompt_embeds.shape[1]
-
-            negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
-
-            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
-            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+            seq_len = uncond_embeddings.shape[1]
+            uncond_embeddings = uncond_embeddings.repeat(1, num_images_per_prompt, 1)
+            uncond_embeddings = uncond_embeddings.view(batch_size * num_images_per_prompt, seq_len, -1)

            # For classifier free guidance, we need to do two forward passes.
            # Here we concatenate the unconditional and text embeddings into a single batch
            # to avoid doing two forward passes
-            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+            text_embeddings = torch.cat([uncond_embeddings, text_embeddings])

-        return prompt_embeds
+        return text_embeddings

    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
    def run_safety_checker(self, image, device, dtype):
@@ -335,15 +316,12 @@ class StableDiffusionDepth2ImgPipeline(DiffusionPipeline):
            extra_step_kwargs["generator"] = generator
        return extra_step_kwargs

-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.check_inputs
-    def check_inputs(
-        self, prompt, strength, callback_steps, negative_prompt=None, prompt_embeds=None, negative_prompt_embeds=None
-    ):
+    def check_inputs(self, prompt, strength, callback_steps):
        if not isinstance(prompt, str) and not isinstance(prompt, list):
            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")

        if strength < 0 or strength > 1:
-            raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}")
+            raise ValueError(f"The value of strength should in [1.0, 1.0] but is {strength}")

        if (callback_steps is None) or (
            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
@@ -353,32 +331,6 @@ class StableDiffusionDepth2ImgPipeline(DiffusionPipeline):
                f" {type(callback_steps)}."
            )

-        if prompt is not None and prompt_embeds is not None:
-            raise ValueError(
-                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
-                " only forward one of the two."
-            )
-        elif prompt is None and prompt_embeds is None:
-            raise ValueError(
-                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
-            )
-        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
-            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
-
-        if negative_prompt is not None and negative_prompt_embeds is not None:
-            raise ValueError(
-                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
-                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
-            )
-
-        if prompt_embeds is not None and negative_prompt_embeds is not None:
-            if prompt_embeds.shape != negative_prompt_embeds.shape:
-                raise ValueError(
-                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
-                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
-                    f" {negative_prompt_embeds.shape}."
-                )
-
    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.get_timesteps
    def get_timesteps(self, num_inference_steps, strength, device):
        # get the original timestep using init_timestep
@@ -486,8 +438,8 @@ class StableDiffusionDepth2ImgPipeline(DiffusionPipeline):
    @torch.no_grad()
    def __call__(
        self,
-        prompt: Union[str, List[str]] = None,
-        image: Union[torch.FloatTensor, PIL.Image.Image] = None,
+        prompt: Union[str, List[str]],
+        image: Union[torch.FloatTensor, PIL.Image.Image],
        depth_map: Optional[torch.FloatTensor] = None,
        strength: float = 0.8,
        num_inference_steps: Optional[int] = 50,
@@ -496,8 +448,6 @@ class StableDiffusionDepth2ImgPipeline(DiffusionPipeline):
        num_images_per_prompt: Optional[int] = 1,
        eta: Optional[float] = 0.0,
        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
-        prompt_embeds: Optional[torch.FloatTensor] = None,
-        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
        output_type: Optional[str] = "pil",
        return_dict: bool = True,
        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
@@ -507,9 +457,8 @@ class StableDiffusionDepth2ImgPipeline(DiffusionPipeline):
        Function invoked when calling the pipeline for generation.

        Args:
-            prompt (`str` or `List[str]`, *optional*):
-                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
-                instead.
+            prompt (`str` or `List[str]`):
+                The prompt or prompts to guide the image generation.
            image (`torch.FloatTensor` or `PIL.Image.Image`):
                `Image`, or tensor representing an image batch, that will be used as the starting point for the
                process.
@@ -529,9 +478,8 @@ class StableDiffusionDepth2ImgPipeline(DiffusionPipeline):
                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
                usually at the expense of lower image quality.
            negative_prompt (`str` or `List[str]`, *optional*):
-                The prompt or prompts not to guide the image generation. If not defined, one has to pass
-                `negative_prompt_embeds`. instead. Ignored when not using guidance (i.e., ignored if `guidance_scale`
-                is less than `1`).
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `guidance_scale` is less than `1`).
            num_images_per_prompt (`int`, *optional*, defaults to 1):
                The number of images to generate per prompt.
            eta (`float`, *optional*, defaults to 0.0):
@@ -540,13 +488,6 @@ class StableDiffusionDepth2ImgPipeline(DiffusionPipeline):
            generator (`torch.Generator`, *optional*):
                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
                to make generation deterministic.
-            prompt_embeds (`torch.FloatTensor`, *optional*):
-                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
-                provided, text embeddings will be generated from `prompt` input argument.
-            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
-                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
-                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
-                argument.
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generate image. Choose between
                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
@@ -593,9 +534,6 @@ class StableDiffusionDepth2ImgPipeline(DiffusionPipeline):
        # 1. Check inputs
        self.check_inputs(prompt, strength, callback_steps)

-        if image is None:
-            raise ValueError("`image` input cannot be undefined.")
-
        # 2. Define call parameters
        batch_size = 1 if isinstance(prompt, str) else len(prompt)
        device = self._execution_device
@@ -605,14 +543,8 @@ class StableDiffusionDepth2ImgPipeline(DiffusionPipeline):
        do_classifier_free_guidance = guidance_scale > 1.0

        # 3. Encode input prompt
-        prompt_embeds = self._encode_prompt(
-            prompt,
-            device,
-            num_images_per_prompt,
-            do_classifier_free_guidance,
-            negative_prompt,
-            prompt_embeds=prompt_embeds,
-            negative_prompt_embeds=negative_prompt_embeds,
+        text_embeddings = self._encode_prompt(
+            prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt
        )

        # 4. Prepare depth mask
@@ -621,7 +553,7 @@ class StableDiffusionDepth2ImgPipeline(DiffusionPipeline):
            depth_map,
            batch_size * num_images_per_prompt,
            do_classifier_free_guidance,
-            prompt_embeds.dtype,
+            text_embeddings.dtype,
            device,
        )

@@ -635,7 +567,7 @@ class StableDiffusionDepth2ImgPipeline(DiffusionPipeline):

        # 7. Prepare latent variables
        latents = self.prepare_latents(
-            image, latent_timestep, batch_size, num_images_per_prompt, prompt_embeds.dtype, device, generator
+            image, latent_timestep, batch_size, num_images_per_prompt, text_embeddings.dtype, device, generator
        )

        # 8. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
@@ -651,7 +583,7 @@ class StableDiffusionDepth2ImgPipeline(DiffusionPipeline):
                latent_model_input = torch.cat([latent_model_input, depth_mask], dim=1)

                # predict the noise residual
-                noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=prompt_embeds).sample
+                noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample

                # perform guidance
                if do_classifier_free_guidance:
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py
@@ -23,7 +23,14 @@ from transformers import CLIPFeatureExtractor, CLIPVisionModelWithProjection

 from ...configuration_utils import FrozenDict
 from ...models import AutoencoderKL, UNet2DConditionModel
-from ...schedulers import KarrasDiffusionSchedulers
+from ...schedulers import (
+    DDIMScheduler,
+    DPMSolverMultistepScheduler,
+    EulerAncestralDiscreteScheduler,
+    EulerDiscreteScheduler,
+    LMSDiscreteScheduler,
+    PNDMScheduler,
+)
 from ...utils import deprecate, is_accelerate_available, logging, randn_tensor
 from ..pipeline_utils import DiffusionPipeline
 from . import StableDiffusionPipelineOutput
@@ -66,7 +73,14 @@ class StableDiffusionImageVariationPipeline(DiffusionPipeline):
        vae: AutoencoderKL,
        image_encoder: CLIPVisionModelWithProjection,
        unet: UNet2DConditionModel,
-        scheduler: KarrasDiffusionSchedulers,
+        scheduler: Union[
+            DDIMScheduler,
+            PNDMScheduler,
+            LMSDiscreteScheduler,
+            EulerDiscreteScheduler,
+            EulerAncestralDiscreteScheduler,
+            DPMSolverMultistepScheduler,
+        ],
        safety_checker: StableDiffusionSafetyChecker,
        feature_extractor: CLIPFeatureExtractor,
        requires_safety_checker: bool = True,
@@ -173,12 +187,12 @@ class StableDiffusionImageVariationPipeline(DiffusionPipeline):
        image_embeddings = image_embeddings.view(bs_embed * num_images_per_prompt, seq_len, -1)

        if do_classifier_free_guidance:
-            negative_prompt_embeds = torch.zeros_like(image_embeddings)
+            uncond_embeddings = torch.zeros_like(image_embeddings)

            # For classifier free guidance, we need to do two forward passes.
            # Here we concatenate the unconditional and text embeddings into a single batch
            # to avoid doing two forward passes
-            image_embeddings = torch.cat([negative_prompt_embeds, image_embeddings])
+            image_embeddings = torch.cat([uncond_embeddings, image_embeddings])

        return image_embeddings

--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py
@@ -24,7 +24,14 @@ from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer

 from ...configuration_utils import FrozenDict
 from ...models import AutoencoderKL, UNet2DConditionModel
-from ...schedulers import KarrasDiffusionSchedulers
+from ...schedulers import (
+    DDIMScheduler,
+    DPMSolverMultistepScheduler,
+    EulerAncestralDiscreteScheduler,
+    EulerDiscreteScheduler,
+    LMSDiscreteScheduler,
+    PNDMScheduler,
+)
 from ...utils import (
    PIL_INTERPOLATION,
    deprecate,
@@ -77,7 +84,7 @@ def preprocess(image):

    if isinstance(image[0], PIL.Image.Image):
        w, h = image[0].size
-        w, h = map(lambda x: x - x % 8, (w, h))  # resize to integer multiple of 8
+        w, h = map(lambda x: x - x % 32, (w, h))  # resize to integer multiple of 32

        image = [np.array(i.resize((w, h), resample=PIL_INTERPOLATION["lanczos"]))[None, :] for i in image]
        image = np.concatenate(image, axis=0)
@@ -126,7 +133,14 @@ class StableDiffusionImg2ImgPipeline(DiffusionPipeline):
        text_encoder: CLIPTextModel,
        tokenizer: CLIPTokenizer,
        unet: UNet2DConditionModel,
-        scheduler: KarrasDiffusionSchedulers,
+        scheduler: Union[
+            DDIMScheduler,
+            PNDMScheduler,
+            LMSDiscreteScheduler,
+            EulerDiscreteScheduler,
+            EulerAncestralDiscreteScheduler,
+            DPMSolverMultistepScheduler,
+        ],
        safety_checker: StableDiffusionSafetyChecker,
        feature_extractor: CLIPFeatureExtractor,
        requires_safety_checker: bool = True,
@@ -224,10 +238,13 @@ class StableDiffusionImg2ImgPipeline(DiffusionPipeline):
        device = torch.device(f"cuda:{gpu_id}")

        for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae]:
-            cpu_offload(cpu_offloaded_model, device)
+            if cpu_offloaded_model is not None:
+                cpu_offload(cpu_offloaded_model, device)

        if self.safety_checker is not None:
-            cpu_offload(self.safety_checker, execution_device=device, offload_buffers=True)
+            # TODO(Patrick) - there is currently a bug with cpu offload of nn.Parameter in accelerate
+            # fix by only offloading self.safety_checker for now
+            cpu_offload(self.safety_checker.vision_model, device)

    @property
    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._execution_device
@@ -249,21 +266,12 @@ class StableDiffusionImg2ImgPipeline(DiffusionPipeline):
        return self.device

    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
-    def _encode_prompt(
-        self,
-        prompt,
-        device,
-        num_images_per_prompt,
-        do_classifier_free_guidance,
-        negative_prompt=None,
-        prompt_embeds: Optional[torch.FloatTensor] = None,
-        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
-    ):
+    def _encode_prompt(self, prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt):
        r"""
        Encodes the prompt into text encoder hidden states.

        Args:
-             prompt (`str` or `List[str]`, *optional*):
+            prompt (`str` or `list(int)`):
                prompt to be encoded
            device: (`torch.device`):
                torch device
@@ -271,67 +279,47 @@ class StableDiffusionImg2ImgPipeline(DiffusionPipeline):
                number of images that should be generated per prompt
            do_classifier_free_guidance (`bool`):
                whether to use classifier free guidance or not
-            negative_ prompt (`str` or `List[str]`, *optional*):
-                The prompt or prompts not to guide the image generation. If not defined, one has to pass
-                `negative_prompt_embeds`. instead. If not defined, one has to pass `negative_prompt_embeds`. instead.
-                Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`).
-            prompt_embeds (`torch.FloatTensor`, *optional*):
-                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
-                provided, text embeddings will be generated from `prompt` input argument.
-            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
-                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
-                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
-                argument.
+            negative_prompt (`str` or `List[str]`):
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `guidance_scale` is less than `1`).
        """
-        if prompt is not None and isinstance(prompt, str):
-            batch_size = 1
-        elif prompt is not None and isinstance(prompt, list):
-            batch_size = len(prompt)
+        batch_size = len(prompt) if isinstance(prompt, list) else 1
+
+        text_inputs = self.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=self.tokenizer.model_max_length,
+            truncation=True,
+            return_tensors="pt",
+        )
+        text_input_ids = text_inputs.input_ids
+        untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+        if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
+            removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1])
+            logger.warning(
+                "The following part of your input was truncated because CLIP can only handle sequences up to"
+                f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+            )
+
+        if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+            attention_mask = text_inputs.attention_mask.to(device)
        else:
-            batch_size = prompt_embeds.shape[0]
+            attention_mask = None

-        if prompt_embeds is None:
-            text_inputs = self.tokenizer(
-                prompt,
-                padding="max_length",
-                max_length=self.tokenizer.model_max_length,
-                truncation=True,
-                return_tensors="pt",
-            )
-            text_input_ids = text_inputs.input_ids
-            untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+        text_embeddings = self.text_encoder(
+            text_input_ids.to(device),
+            attention_mask=attention_mask,
+        )
+        text_embeddings = text_embeddings[0]

-            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
-                text_input_ids, untruncated_ids
-            ):
-                removed_text = self.tokenizer.batch_decode(
-                    untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
-                )
-                logger.warning(
-                    "The following part of your input was truncated because CLIP can only handle sequences up to"
-                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
-                )
-
-            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
-                attention_mask = text_inputs.attention_mask.to(device)
-            else:
-                attention_mask = None
-
-            prompt_embeds = self.text_encoder(
-                text_input_ids.to(device),
-                attention_mask=attention_mask,
-            )
-            prompt_embeds = prompt_embeds[0]
-
-        prompt_embeds = prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
-
-        bs_embed, seq_len, _ = prompt_embeds.shape
        # duplicate text embeddings for each generation per prompt, using mps friendly method
-        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
-        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+        bs_embed, seq_len, _ = text_embeddings.shape
+        text_embeddings = text_embeddings.repeat(1, num_images_per_prompt, 1)
+        text_embeddings = text_embeddings.view(bs_embed * num_images_per_prompt, seq_len, -1)

        # get unconditional embeddings for classifier free guidance
-        if do_classifier_free_guidance and negative_prompt_embeds is None:
+        if do_classifier_free_guidance:
            uncond_tokens: List[str]
            if negative_prompt is None:
                uncond_tokens = [""] * batch_size
@@ -351,7 +339,7 @@ class StableDiffusionImg2ImgPipeline(DiffusionPipeline):
            else:
                uncond_tokens = negative_prompt

-            max_length = prompt_embeds.shape[1]
+            max_length = text_input_ids.shape[-1]
            uncond_input = self.tokenizer(
                uncond_tokens,
                padding="max_length",
@@ -365,27 +353,23 @@ class StableDiffusionImg2ImgPipeline(DiffusionPipeline):
            else:
                attention_mask = None

-            negative_prompt_embeds = self.text_encoder(
+            uncond_embeddings = self.text_encoder(
                uncond_input.input_ids.to(device),
                attention_mask=attention_mask,
            )
-            negative_prompt_embeds = negative_prompt_embeds[0]
+            uncond_embeddings = uncond_embeddings[0]

-        if do_classifier_free_guidance:
            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
-            seq_len = negative_prompt_embeds.shape[1]
-
-            negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
-
-            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
-            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+            seq_len = uncond_embeddings.shape[1]
+            uncond_embeddings = uncond_embeddings.repeat(1, num_images_per_prompt, 1)
+            uncond_embeddings = uncond_embeddings.view(batch_size * num_images_per_prompt, seq_len, -1)

            # For classifier free guidance, we need to do two forward passes.
            # Here we concatenate the unconditional and text embeddings into a single batch
            # to avoid doing two forward passes
-            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+            text_embeddings = torch.cat([uncond_embeddings, text_embeddings])

-        return prompt_embeds
+        return text_embeddings

    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
    def run_safety_checker(self, image, device, dtype):
@@ -425,14 +409,12 @@ class StableDiffusionImg2ImgPipeline(DiffusionPipeline):
            extra_step_kwargs["generator"] = generator
        return extra_step_kwargs

-    def check_inputs(
-        self, prompt, strength, callback_steps, negative_prompt=None, prompt_embeds=None, negative_prompt_embeds=None
-    ):
+    def check_inputs(self, prompt, strength, callback_steps):
        if not isinstance(prompt, str) and not isinstance(prompt, list):
            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")

        if strength < 0 or strength > 1:
-            raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}")
+            raise ValueError(f"The value of strength should in [1.0, 1.0] but is {strength}")

        if (callback_steps is None) or (
            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
@@ -442,32 +424,6 @@ class StableDiffusionImg2ImgPipeline(DiffusionPipeline):
                f" {type(callback_steps)}."
            )

-        if prompt is not None and prompt_embeds is not None:
-            raise ValueError(
-                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
-                " only forward one of the two."
-            )
-        elif prompt is None and prompt_embeds is None:
-            raise ValueError(
-                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
-            )
-        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
-            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
-
-        if negative_prompt is not None and negative_prompt_embeds is not None:
-            raise ValueError(
-                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
-                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
-            )
-
-        if prompt_embeds is not None and negative_prompt_embeds is not None:
-            if prompt_embeds.shape != negative_prompt_embeds.shape:
-                raise ValueError(
-                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
-                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
-                    f" {negative_prompt_embeds.shape}."
-                )
-
    def get_timesteps(self, num_inference_steps, strength, device):
        # get the original timestep using init_timestep
        init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
@@ -533,7 +489,7 @@ class StableDiffusionImg2ImgPipeline(DiffusionPipeline):
    @replace_example_docstring(EXAMPLE_DOC_STRING)
    def __call__(
        self,
-        prompt: Union[str, List[str]] = None,
+        prompt: Union[str, List[str]],
        image: Union[torch.FloatTensor, PIL.Image.Image] = None,
        strength: float = 0.8,
        num_inference_steps: Optional[int] = 50,
@@ -542,8 +498,6 @@ class StableDiffusionImg2ImgPipeline(DiffusionPipeline):
        num_images_per_prompt: Optional[int] = 1,
        eta: Optional[float] = 0.0,
        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
-        prompt_embeds: Optional[torch.FloatTensor] = None,
-        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
        output_type: Optional[str] = "pil",
        return_dict: bool = True,
        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
@@ -554,9 +508,8 @@ class StableDiffusionImg2ImgPipeline(DiffusionPipeline):
        Function invoked when calling the pipeline for generation.

        Args:
-            prompt (`str` or `List[str]`, *optional*):
-                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
-                instead.
+            prompt (`str` or `List[str]`):
+                The prompt or prompts to guide the image generation.
            image (`torch.FloatTensor` or `PIL.Image.Image`):
                `Image`, or tensor representing an image batch, that will be used as the starting point for the
                process.
@@ -576,9 +529,8 @@ class StableDiffusionImg2ImgPipeline(DiffusionPipeline):
                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
                usually at the expense of lower image quality.
            negative_prompt (`str` or `List[str]`, *optional*):
-                The prompt or prompts not to guide the image generation. If not defined, one has to pass
-                `negative_prompt_embeds`. instead. Ignored when not using guidance (i.e., ignored if `guidance_scale`
-                is less than `1`).
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `guidance_scale` is less than `1`).
            num_images_per_prompt (`int`, *optional*, defaults to 1):
                The number of images to generate per prompt.
            eta (`float`, *optional*, defaults to 0.0):
@@ -587,13 +539,6 @@ class StableDiffusionImg2ImgPipeline(DiffusionPipeline):
            generator (`torch.Generator`, *optional*):
                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
                to make generation deterministic.
-            prompt_embeds (`torch.FloatTensor`, *optional*):
-                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
-                provided, text embeddings will be generated from `prompt` input argument.
-            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
-                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
-                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
-                argument.
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generate image. Choose between
                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
@@ -619,8 +564,8 @@ class StableDiffusionImg2ImgPipeline(DiffusionPipeline):
        init_image = deprecate("init_image", "0.13.0", message, take_from=kwargs)
        image = init_image or image

-        # 1. Check inputs. Raise error if not correct
-        self.check_inputs(prompt, strength, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds)
+        # 1. Check inputs
+        self.check_inputs(prompt, strength, callback_steps)

        # 2. Define call parameters
        batch_size = 1 if isinstance(prompt, str) else len(prompt)
@@ -631,14 +576,8 @@ class StableDiffusionImg2ImgPipeline(DiffusionPipeline):
        do_classifier_free_guidance = guidance_scale > 1.0

        # 3. Encode input prompt
-        prompt_embeds = self._encode_prompt(
-            prompt,
-            device,
-            num_images_per_prompt,
-            do_classifier_free_guidance,
-            negative_prompt,
-            prompt_embeds=prompt_embeds,
-            negative_prompt_embeds=negative_prompt_embeds,
+        text_embeddings = self._encode_prompt(
+            prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt
        )

        # 4. Preprocess image
@@ -651,7 +590,7 @@ class StableDiffusionImg2ImgPipeline(DiffusionPipeline):

        # 6. Prepare latent variables
        latents = self.prepare_latents(
-            image, latent_timestep, batch_size, num_images_per_prompt, prompt_embeds.dtype, device, generator
+            image, latent_timestep, batch_size, num_images_per_prompt, text_embeddings.dtype, device, generator
        )

        # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
@@ -666,7 +605,7 @@ class StableDiffusionImg2ImgPipeline(DiffusionPipeline):
                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)

                # predict the noise residual
-                noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=prompt_embeds).sample
+                noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample

                # perform guidance
                if do_classifier_free_guidance:
@@ -686,7 +625,7 @@ class StableDiffusionImg2ImgPipeline(DiffusionPipeline):
        image = self.decode_latents(latents)

        # 10. Run safety checker
-        image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
+        image, has_nsfw_concept = self.run_safety_checker(image, device, text_embeddings.dtype)

        # 11. Convert to PIL
        if output_type == "pil":
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
@@ -24,7 +24,7 @@ from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer

 from ...configuration_utils import FrozenDict
 from ...models import AutoencoderKL, UNet2DConditionModel
-from ...schedulers import KarrasDiffusionSchedulers
+from ...schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
 from ...utils import deprecate, is_accelerate_available, logging, randn_tensor
 from ..pipeline_utils import DiffusionPipeline
 from . import StableDiffusionPipelineOutput
@@ -173,7 +173,7 @@ class StableDiffusionInpaintPipeline(DiffusionPipeline):
        text_encoder: CLIPTextModel,
        tokenizer: CLIPTokenizer,
        unet: UNet2DConditionModel,
-        scheduler: KarrasDiffusionSchedulers,
+        scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler],
        safety_checker: StableDiffusionSafetyChecker,
        feature_extractor: CLIPFeatureExtractor,
        requires_safety_checker: bool = True,
@@ -272,10 +272,13 @@ class StableDiffusionInpaintPipeline(DiffusionPipeline):
        device = torch.device(f"cuda:{gpu_id}")

        for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae]:
-            cpu_offload(cpu_offloaded_model, device)
+            if cpu_offloaded_model is not None:
+                cpu_offload(cpu_offloaded_model, device)

        if self.safety_checker is not None:
-            cpu_offload(self.safety_checker, execution_device=device, offload_buffers=True)
+            # TODO(Patrick) - there is currently a bug with cpu offload of nn.Parameter in accelerate
+            # fix by only offloading self.safety_checker for now
+            cpu_offload(self.safety_checker.vision_model, device)

    @property
    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._execution_device
@@ -297,21 +300,12 @@ class StableDiffusionInpaintPipeline(DiffusionPipeline):
        return self.device

    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
-    def _encode_prompt(
-        self,
-        prompt,
-        device,
-        num_images_per_prompt,
-        do_classifier_free_guidance,
-        negative_prompt=None,
-        prompt_embeds: Optional[torch.FloatTensor] = None,
-        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
-    ):
+    def _encode_prompt(self, prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt):
        r"""
        Encodes the prompt into text encoder hidden states.

        Args:
-             prompt (`str` or `List[str]`, *optional*):
+            prompt (`str` or `list(int)`):
                prompt to be encoded
            device: (`torch.device`):
                torch device
@@ -319,67 +313,47 @@ class StableDiffusionInpaintPipeline(DiffusionPipeline):
                number of images that should be generated per prompt
            do_classifier_free_guidance (`bool`):
                whether to use classifier free guidance or not
-            negative_ prompt (`str` or `List[str]`, *optional*):
-                The prompt or prompts not to guide the image generation. If not defined, one has to pass
-                `negative_prompt_embeds`. instead. If not defined, one has to pass `negative_prompt_embeds`. instead.
-                Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`).
-            prompt_embeds (`torch.FloatTensor`, *optional*):
-                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
-                provided, text embeddings will be generated from `prompt` input argument.
-            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
-                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
-                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
-                argument.
+            negative_prompt (`str` or `List[str]`):
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `guidance_scale` is less than `1`).
        """
-        if prompt is not None and isinstance(prompt, str):
-            batch_size = 1
-        elif prompt is not None and isinstance(prompt, list):
-            batch_size = len(prompt)
+        batch_size = len(prompt) if isinstance(prompt, list) else 1
+
+        text_inputs = self.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=self.tokenizer.model_max_length,
+            truncation=True,
+            return_tensors="pt",
+        )
+        text_input_ids = text_inputs.input_ids
+        untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+        if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
+            removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1])
+            logger.warning(
+                "The following part of your input was truncated because CLIP can only handle sequences up to"
+                f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+            )
+
+        if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+            attention_mask = text_inputs.attention_mask.to(device)
        else:
-            batch_size = prompt_embeds.shape[0]
+            attention_mask = None

-        if prompt_embeds is None:
-            text_inputs = self.tokenizer(
-                prompt,
-                padding="max_length",
-                max_length=self.tokenizer.model_max_length,
-                truncation=True,
-                return_tensors="pt",
-            )
-            text_input_ids = text_inputs.input_ids
-            untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+        text_embeddings = self.text_encoder(
+            text_input_ids.to(device),
+            attention_mask=attention_mask,
+        )
+        text_embeddings = text_embeddings[0]

-            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
-                text_input_ids, untruncated_ids
-            ):
-                removed_text = self.tokenizer.batch_decode(
-                    untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
-                )
-                logger.warning(
-                    "The following part of your input was truncated because CLIP can only handle sequences up to"
-                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
-                )
-
-            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
-                attention_mask = text_inputs.attention_mask.to(device)
-            else:
-                attention_mask = None
-
-            prompt_embeds = self.text_encoder(
-                text_input_ids.to(device),
-                attention_mask=attention_mask,
-            )
-            prompt_embeds = prompt_embeds[0]
-
-        prompt_embeds = prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
-
-        bs_embed, seq_len, _ = prompt_embeds.shape
        # duplicate text embeddings for each generation per prompt, using mps friendly method
-        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
-        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+        bs_embed, seq_len, _ = text_embeddings.shape
+        text_embeddings = text_embeddings.repeat(1, num_images_per_prompt, 1)
+        text_embeddings = text_embeddings.view(bs_embed * num_images_per_prompt, seq_len, -1)

        # get unconditional embeddings for classifier free guidance
-        if do_classifier_free_guidance and negative_prompt_embeds is None:
+        if do_classifier_free_guidance:
            uncond_tokens: List[str]
            if negative_prompt is None:
                uncond_tokens = [""] * batch_size
@@ -399,7 +373,7 @@ class StableDiffusionInpaintPipeline(DiffusionPipeline):
            else:
                uncond_tokens = negative_prompt

-            max_length = prompt_embeds.shape[1]
+            max_length = text_input_ids.shape[-1]
            uncond_input = self.tokenizer(
                uncond_tokens,
                padding="max_length",
@@ -413,27 +387,23 @@ class StableDiffusionInpaintPipeline(DiffusionPipeline):
            else:
                attention_mask = None

-            negative_prompt_embeds = self.text_encoder(
+            uncond_embeddings = self.text_encoder(
                uncond_input.input_ids.to(device),
                attention_mask=attention_mask,
            )
-            negative_prompt_embeds = negative_prompt_embeds[0]
+            uncond_embeddings = uncond_embeddings[0]

-        if do_classifier_free_guidance:
            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
-            seq_len = negative_prompt_embeds.shape[1]
-
-            negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
-
-            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
-            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+            seq_len = uncond_embeddings.shape[1]
+            uncond_embeddings = uncond_embeddings.repeat(1, num_images_per_prompt, 1)
+            uncond_embeddings = uncond_embeddings.view(batch_size * num_images_per_prompt, seq_len, -1)

            # For classifier free guidance, we need to do two forward passes.
            # Here we concatenate the unconditional and text embeddings into a single batch
            # to avoid doing two forward passes
-            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+            text_embeddings = torch.cat([uncond_embeddings, text_embeddings])

-        return prompt_embeds
+        return text_embeddings

    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
    def run_safety_checker(self, image, device, dtype):
@@ -474,16 +444,10 @@ class StableDiffusionInpaintPipeline(DiffusionPipeline):
        return image

    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.check_inputs
-    def check_inputs(
-        self,
-        prompt,
-        height,
-        width,
-        callback_steps,
-        negative_prompt=None,
-        prompt_embeds=None,
-        negative_prompt_embeds=None,
-    ):
+    def check_inputs(self, prompt, height, width, callback_steps):
+        if not isinstance(prompt, str) and not isinstance(prompt, list):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
        if height % 8 != 0 or width % 8 != 0:
            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")

@@ -495,32 +459,6 @@ class StableDiffusionInpaintPipeline(DiffusionPipeline):
                f" {type(callback_steps)}."
            )

-        if prompt is not None and prompt_embeds is not None:
-            raise ValueError(
-                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
-                " only forward one of the two."
-            )
-        elif prompt is None and prompt_embeds is None:
-            raise ValueError(
-                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
-            )
-        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
-            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
-
-        if negative_prompt is not None and negative_prompt_embeds is not None:
-            raise ValueError(
-                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
-                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
-            )
-
-        if prompt_embeds is not None and negative_prompt_embeds is not None:
-            if prompt_embeds.shape != negative_prompt_embeds.shape:
-                raise ValueError(
-                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
-                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
-                    f" {negative_prompt_embeds.shape}."
-                )
-
    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
        shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
@@ -593,9 +531,9 @@ class StableDiffusionInpaintPipeline(DiffusionPipeline):
    @torch.no_grad()
    def __call__(
        self,
-        prompt: Union[str, List[str]] = None,
-        image: Union[torch.FloatTensor, PIL.Image.Image] = None,
-        mask_image: Union[torch.FloatTensor, PIL.Image.Image] = None,
+        prompt: Union[str, List[str]],
+        image: Union[torch.FloatTensor, PIL.Image.Image],
+        mask_image: Union[torch.FloatTensor, PIL.Image.Image],
        height: Optional[int] = None,
        width: Optional[int] = None,
        num_inference_steps: int = 50,
@@ -605,8 +543,6 @@ class StableDiffusionInpaintPipeline(DiffusionPipeline):
        eta: float = 0.0,
        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
        latents: Optional[torch.FloatTensor] = None,
-        nrompt_embeds: Optional[torch.FloatTensor] = None,
-        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
        output_type: Optional[str] = "pil",
        return_dict: bool = True,
        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
@@ -616,9 +552,8 @@ class StableDiffusionInpaintPipeline(DiffusionPipeline):
        Function invoked when calling the pipeline for generation.

        Args:
-            prompt (`str` or `List[str]`, *optional*):
-                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
-                instead.
+            prompt (`str` or `List[str]`):
+                The prompt or prompts to guide the image generation.
            image (`PIL.Image.Image`):
                `Image`, or tensor representing an image batch which will be inpainted, *i.e.* parts of the image will
                be masked out with `mask_image` and repainted according to `prompt`.
@@ -641,9 +576,8 @@ class StableDiffusionInpaintPipeline(DiffusionPipeline):
                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
                usually at the expense of lower image quality.
            negative_prompt (`str` or `List[str]`, *optional*):
-                The prompt or prompts not to guide the image generation. If not defined, one has to pass
-                `negative_prompt_embeds`. instead. Ignored when not using guidance (i.e., ignored if `guidance_scale`
-                is less than `1`).
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `guidance_scale` is less than `1`).
            num_images_per_prompt (`int`, *optional*, defaults to 1):
                The number of images to generate per prompt.
            eta (`float`, *optional*, defaults to 0.0):
@@ -656,13 +590,6 @@ class StableDiffusionInpaintPipeline(DiffusionPipeline):
                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                tensor will ge generated by sampling using the supplied random `generator`.
-            prompt_embeds (`torch.FloatTensor`, *optional*):
-                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
-                provided, text embeddings will be generated from `prompt` input argument.
-            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
-                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
-                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
-                argument.
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generate image. Choose between
                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
@@ -721,12 +648,6 @@ class StableDiffusionInpaintPipeline(DiffusionPipeline):
        # 1. Check inputs
        self.check_inputs(prompt, height, width, callback_steps)

-        if image is None:
-            raise ValueError("`image` input cannot be undefined.")
-
-        if mask_image is None:
-            raise ValueError("`mask_image` input cannot be undefined.")
-
        # 2. Define call parameters
        batch_size = 1 if isinstance(prompt, str) else len(prompt)
        device = self._execution_device
@@ -736,7 +657,7 @@ class StableDiffusionInpaintPipeline(DiffusionPipeline):
        do_classifier_free_guidance = guidance_scale > 1.0

        # 3. Encode input prompt
-        prompt_embeds = self._encode_prompt(
+        text_embeddings = self._encode_prompt(
            prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt
        )

@@ -754,7 +675,7 @@ class StableDiffusionInpaintPipeline(DiffusionPipeline):
            num_channels_latents,
            height,
            width,
-            prompt_embeds.dtype,
+            text_embeddings.dtype,
            device,
            generator,
            latents,
@@ -767,7 +688,7 @@ class StableDiffusionInpaintPipeline(DiffusionPipeline):
            batch_size * num_images_per_prompt,
            height,
            width,
-            prompt_embeds.dtype,
+            text_embeddings.dtype,
            device,
            generator,
            do_classifier_free_guidance,
@@ -800,7 +721,7 @@ class StableDiffusionInpaintPipeline(DiffusionPipeline):
                latent_model_input = torch.cat([latent_model_input, mask, masked_image_latents], dim=1)

                # predict the noise residual
-                noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=prompt_embeds).sample
+                noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample

                # perform guidance
                if do_classifier_free_guidance:
@@ -820,7 +741,7 @@ class StableDiffusionInpaintPipeline(DiffusionPipeline):
        image = self.decode_latents(latents)

        # 12. Run safety checker
-        image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
+        image, has_nsfw_concept = self.run_safety_checker(image, device, text_embeddings.dtype)

        # 13. Convert to PIL
        if output_type == "pil":
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py
@@ -24,7 +24,14 @@ from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer

 from ...configuration_utils import FrozenDict
 from ...models import AutoencoderKL, UNet2DConditionModel
-from ...schedulers import KarrasDiffusionSchedulers
+from ...schedulers import (
+    DDIMScheduler,
+    DPMSolverMultistepScheduler,
+    EulerAncestralDiscreteScheduler,
+    EulerDiscreteScheduler,
+    LMSDiscreteScheduler,
+    PNDMScheduler,
+)
 from ...utils import PIL_INTERPOLATION, deprecate, is_accelerate_available, logging, randn_tensor
 from ..pipeline_utils import DiffusionPipeline
 from . import StableDiffusionPipelineOutput
@@ -93,7 +100,14 @@ class StableDiffusionInpaintPipelineLegacy(DiffusionPipeline):
        text_encoder: CLIPTextModel,
        tokenizer: CLIPTokenizer,
        unet: UNet2DConditionModel,
-        scheduler: KarrasDiffusionSchedulers,
+        scheduler: Union[
+            DDIMScheduler,
+            PNDMScheduler,
+            LMSDiscreteScheduler,
+            EulerDiscreteScheduler,
+            EulerAncestralDiscreteScheduler,
+            DPMSolverMultistepScheduler,
+        ],
        safety_checker: StableDiffusionSafetyChecker,
        feature_extractor: CLIPFeatureExtractor,
        requires_safety_checker: bool = True,
@@ -191,10 +205,13 @@ class StableDiffusionInpaintPipelineLegacy(DiffusionPipeline):
        device = torch.device(f"cuda:{gpu_id}")

        for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae]:
-            cpu_offload(cpu_offloaded_model, device)
+            if cpu_offloaded_model is not None:
+                cpu_offload(cpu_offloaded_model, device)

        if self.safety_checker is not None:
-            cpu_offload(self.safety_checker, execution_device=device, offload_buffers=True)
+            # TODO(Patrick) - there is currently a bug with cpu offload of nn.Parameter in accelerate
+            # fix by only offloading self.safety_checker for now
+            cpu_offload(self.safety_checker.vision_model, device)

    @property
    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._execution_device
@@ -216,21 +233,12 @@ class StableDiffusionInpaintPipelineLegacy(DiffusionPipeline):
        return self.device

    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
-    def _encode_prompt(
-        self,
-        prompt,
-        device,
-        num_images_per_prompt,
-        do_classifier_free_guidance,
-        negative_prompt=None,
-        prompt_embeds: Optional[torch.FloatTensor] = None,
-        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
-    ):
+    def _encode_prompt(self, prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt):
        r"""
        Encodes the prompt into text encoder hidden states.

        Args:
-             prompt (`str` or `List[str]`, *optional*):
+            prompt (`str` or `list(int)`):
                prompt to be encoded
            device: (`torch.device`):
                torch device
@@ -238,67 +246,47 @@ class StableDiffusionInpaintPipelineLegacy(DiffusionPipeline):
                number of images that should be generated per prompt
            do_classifier_free_guidance (`bool`):
                whether to use classifier free guidance or not
-            negative_ prompt (`str` or `List[str]`, *optional*):
-                The prompt or prompts not to guide the image generation. If not defined, one has to pass
-                `negative_prompt_embeds`. instead. If not defined, one has to pass `negative_prompt_embeds`. instead.
-                Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`).
-            prompt_embeds (`torch.FloatTensor`, *optional*):
-                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
-                provided, text embeddings will be generated from `prompt` input argument.
-            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
-                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
-                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
-                argument.
+            negative_prompt (`str` or `List[str]`):
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `guidance_scale` is less than `1`).
        """
-        if prompt is not None and isinstance(prompt, str):
-            batch_size = 1
-        elif prompt is not None and isinstance(prompt, list):
-            batch_size = len(prompt)
+        batch_size = len(prompt) if isinstance(prompt, list) else 1
+
+        text_inputs = self.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=self.tokenizer.model_max_length,
+            truncation=True,
+            return_tensors="pt",
+        )
+        text_input_ids = text_inputs.input_ids
+        untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+        if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
+            removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1])
+            logger.warning(
+                "The following part of your input was truncated because CLIP can only handle sequences up to"
+                f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+            )
+
+        if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+            attention_mask = text_inputs.attention_mask.to(device)
        else:
-            batch_size = prompt_embeds.shape[0]
+            attention_mask = None

-        if prompt_embeds is None:
-            text_inputs = self.tokenizer(
-                prompt,
-                padding="max_length",
-                max_length=self.tokenizer.model_max_length,
-                truncation=True,
-                return_tensors="pt",
-            )
-            text_input_ids = text_inputs.input_ids
-            untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+        text_embeddings = self.text_encoder(
+            text_input_ids.to(device),
+            attention_mask=attention_mask,
+        )
+        text_embeddings = text_embeddings[0]

-            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
-                text_input_ids, untruncated_ids
-            ):
-                removed_text = self.tokenizer.batch_decode(
-                    untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
-                )
-                logger.warning(
-                    "The following part of your input was truncated because CLIP can only handle sequences up to"
-                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
-                )
-
-            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
-                attention_mask = text_inputs.attention_mask.to(device)
-            else:
-                attention_mask = None
-
-            prompt_embeds = self.text_encoder(
-                text_input_ids.to(device),
-                attention_mask=attention_mask,
-            )
-            prompt_embeds = prompt_embeds[0]
-
-        prompt_embeds = prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
-
-        bs_embed, seq_len, _ = prompt_embeds.shape
        # duplicate text embeddings for each generation per prompt, using mps friendly method
-        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
-        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+        bs_embed, seq_len, _ = text_embeddings.shape
+        text_embeddings = text_embeddings.repeat(1, num_images_per_prompt, 1)
+        text_embeddings = text_embeddings.view(bs_embed * num_images_per_prompt, seq_len, -1)

        # get unconditional embeddings for classifier free guidance
-        if do_classifier_free_guidance and negative_prompt_embeds is None:
+        if do_classifier_free_guidance:
            uncond_tokens: List[str]
            if negative_prompt is None:
                uncond_tokens = [""] * batch_size
@@ -318,7 +306,7 @@ class StableDiffusionInpaintPipelineLegacy(DiffusionPipeline):
            else:
                uncond_tokens = negative_prompt

-            max_length = prompt_embeds.shape[1]
+            max_length = text_input_ids.shape[-1]
            uncond_input = self.tokenizer(
                uncond_tokens,
                padding="max_length",
@@ -332,27 +320,23 @@ class StableDiffusionInpaintPipelineLegacy(DiffusionPipeline):
            else:
                attention_mask = None

-            negative_prompt_embeds = self.text_encoder(
+            uncond_embeddings = self.text_encoder(
                uncond_input.input_ids.to(device),
                attention_mask=attention_mask,
            )
-            negative_prompt_embeds = negative_prompt_embeds[0]
+            uncond_embeddings = uncond_embeddings[0]

-        if do_classifier_free_guidance:
            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
-            seq_len = negative_prompt_embeds.shape[1]
-
-            negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
-
-            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
-            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+            seq_len = uncond_embeddings.shape[1]
+            uncond_embeddings = uncond_embeddings.repeat(1, num_images_per_prompt, 1)
+            uncond_embeddings = uncond_embeddings.view(batch_size * num_images_per_prompt, seq_len, -1)

            # For classifier free guidance, we need to do two forward passes.
            # Here we concatenate the unconditional and text embeddings into a single batch
            # to avoid doing two forward passes
-            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+            text_embeddings = torch.cat([uncond_embeddings, text_embeddings])

-        return prompt_embeds
+        return text_embeddings

    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
    def run_safety_checker(self, image, device, dtype):
@@ -393,14 +377,12 @@ class StableDiffusionInpaintPipelineLegacy(DiffusionPipeline):
        return extra_step_kwargs

    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.check_inputs
-    def check_inputs(
-        self, prompt, strength, callback_steps, negative_prompt=None, prompt_embeds=None, negative_prompt_embeds=None
-    ):
+    def check_inputs(self, prompt, strength, callback_steps):
        if not isinstance(prompt, str) and not isinstance(prompt, list):
            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")

        if strength < 0 or strength > 1:
-            raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}")
+            raise ValueError(f"The value of strength should in [1.0, 1.0] but is {strength}")

        if (callback_steps is None) or (
            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
@@ -410,32 +392,6 @@ class StableDiffusionInpaintPipelineLegacy(DiffusionPipeline):
                f" {type(callback_steps)}."
            )

-        if prompt is not None and prompt_embeds is not None:
-            raise ValueError(
-                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
-                " only forward one of the two."
-            )
-        elif prompt is None and prompt_embeds is None:
-            raise ValueError(
-                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
-            )
-        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
-            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
-
-        if negative_prompt is not None and negative_prompt_embeds is not None:
-            raise ValueError(
-                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
-                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
-            )
-
-        if prompt_embeds is not None and negative_prompt_embeds is not None:
-            if prompt_embeds.shape != negative_prompt_embeds.shape:
-                raise ValueError(
-                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
-                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
-                    f" {negative_prompt_embeds.shape}."
-                )
-
    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.get_timesteps
    def get_timesteps(self, num_inference_steps, strength, device):
        # get the original timestep using init_timestep
@@ -476,8 +432,6 @@ class StableDiffusionInpaintPipelineLegacy(DiffusionPipeline):
        add_predicted_noise: Optional[bool] = False,
        eta: Optional[float] = 0.0,
        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
-        prompt_embeds: Optional[torch.FloatTensor] = None,
-        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
        output_type: Optional[str] = "pil",
        return_dict: bool = True,
        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
@@ -488,9 +442,8 @@ class StableDiffusionInpaintPipelineLegacy(DiffusionPipeline):
        Function invoked when calling the pipeline for generation.

        Args:
-            prompt (`str` or `List[str]`, *optional*):
-                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
-                instead.
+            prompt (`str` or `List[str]`):
+                The prompt or prompts to guide the image generation.
            image (`torch.FloatTensor` or `PIL.Image.Image`):
                `Image`, or tensor representing an image batch, that will be used as the starting point for the
                process. This is the image whose masked region will be inpainted.
@@ -514,9 +467,8 @@ class StableDiffusionInpaintPipelineLegacy(DiffusionPipeline):
                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
                usually at the expense of lower image quality.
            negative_prompt (`str` or `List[str]`, *optional*):
-                The prompt or prompts not to guide the image generation. If not defined, one has to pass
-                `negative_prompt_embeds`. instead. Ignored when not using guidance (i.e., ignored if `guidance_scale`
-                is less than `1`).
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `guidance_scale` is less than `1`).
            num_images_per_prompt (`int`, *optional*, defaults to 1):
                The number of images to generate per prompt.
            add_predicted_noise (`bool`, *optional*, defaults to True):
@@ -528,13 +480,6 @@ class StableDiffusionInpaintPipelineLegacy(DiffusionPipeline):
            generator (`torch.Generator`, *optional*):
                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
                to make generation deterministic.
-            prompt_embeds (`torch.FloatTensor`, *optional*):
-                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
-                provided, text embeddings will be generated from `prompt` input argument.
-            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
-                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
-                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
-                argument.
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generate image. Choose between
                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
@@ -571,14 +516,8 @@ class StableDiffusionInpaintPipelineLegacy(DiffusionPipeline):
        do_classifier_free_guidance = guidance_scale > 1.0

        # 3. Encode input prompt
-        prompt_embeds = self._encode_prompt(
-            prompt,
-            device,
-            num_images_per_prompt,
-            do_classifier_free_guidance,
-            negative_prompt,
-            prompt_embeds=prompt_embeds,
-            negative_prompt_embeds=negative_prompt_embeds,
+        text_embeddings = self._encode_prompt(
+            prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt
        )

        # 4. Preprocess image and mask
@@ -596,7 +535,7 @@ class StableDiffusionInpaintPipelineLegacy(DiffusionPipeline):
        # 6. Prepare latent variables
        # encode the init image into latents and scale the latents
        latents, init_latents_orig, noise = self.prepare_latents(
-            image, latent_timestep, batch_size, num_images_per_prompt, prompt_embeds.dtype, device, generator
+            image, latent_timestep, batch_size, num_images_per_prompt, text_embeddings.dtype, device, generator
        )

        # 7. Prepare mask latent
@@ -615,7 +554,7 @@ class StableDiffusionInpaintPipelineLegacy(DiffusionPipeline):
                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)

                # predict the noise residual
-                noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=prompt_embeds).sample
+                noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample

                # perform guidance
                if do_classifier_free_guidance:
@@ -644,7 +583,7 @@ class StableDiffusionInpaintPipelineLegacy(DiffusionPipeline):
        image = self.decode_latents(latents)

        # 11. Run safety checker
-        image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
+        image, has_nsfw_concept = self.run_safety_checker(image, device, text_embeddings.dtype)

        # 12. Convert to PIL
        if output_type == "pil":
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py
@@ -1,673 +0,0 @@
-# Copyright 2023 The InstructPix2Pix Authors and The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import inspect
-from typing import Callable, List, Optional, Union
-
-import numpy as np
-import torch
-
-import PIL
-from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
-
-from ...models import AutoencoderKL, UNet2DConditionModel
-from ...schedulers import KarrasDiffusionSchedulers
-from ...utils import PIL_INTERPOLATION, deprecate, is_accelerate_available, logging, randn_tensor
-from ..pipeline_utils import DiffusionPipeline
-from . import StableDiffusionPipelineOutput
-from .safety_checker import StableDiffusionSafetyChecker
-
-
-logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
-
-
-# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.preprocess
-def preprocess(image):
-    if isinstance(image, torch.Tensor):
-        return image
-    elif isinstance(image, PIL.Image.Image):
-        image = [image]
-
-    if isinstance(image[0], PIL.Image.Image):
-        w, h = image[0].size
-        w, h = map(lambda x: x - x % 8, (w, h))  # resize to integer multiple of 8
-
-        image = [np.array(i.resize((w, h), resample=PIL_INTERPOLATION["lanczos"]))[None, :] for i in image]
-        image = np.concatenate(image, axis=0)
-        image = np.array(image).astype(np.float32) / 255.0
-        image = image.transpose(0, 3, 1, 2)
-        image = 2.0 * image - 1.0
-        image = torch.from_numpy(image)
-    elif isinstance(image[0], torch.Tensor):
-        image = torch.cat(image, dim=0)
-    return image
-
-
-class StableDiffusionInstructPix2PixPipeline(DiffusionPipeline):
-    r"""
-    Pipeline for pixel-level image editing by following text instructions. Based on Stable Diffusion.
-
-    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
-    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
-
-    Args:
-        vae ([`AutoencoderKL`]):
-            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
-        text_encoder ([`CLIPTextModel`]):
-            Frozen text-encoder. Stable Diffusion uses the text portion of
-            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
-            the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
-        tokenizer (`CLIPTokenizer`):
-            Tokenizer of class
-            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
-        unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
-        scheduler ([`SchedulerMixin`]):
-            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
-            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
-        safety_checker ([`StableDiffusionSafetyChecker`]):
-            Classification module that estimates whether generated images could be considered offensive or harmful.
-            Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details.
-        feature_extractor ([`CLIPFeatureExtractor`]):
-            Model that extracts features from generated images to be used as inputs for the `safety_checker`.
-    """
-    _optional_components = ["safety_checker", "feature_extractor"]
-
-    def __init__(
-        self,
-        vae: AutoencoderKL,
-        text_encoder: CLIPTextModel,
-        tokenizer: CLIPTokenizer,
-        unet: UNet2DConditionModel,
-        scheduler: KarrasDiffusionSchedulers,
-        safety_checker: StableDiffusionSafetyChecker,
-        feature_extractor: CLIPFeatureExtractor,
-        requires_safety_checker: bool = True,
-    ):
-        super().__init__()
-
-        if safety_checker is None and requires_safety_checker:
-            logger.warning(
-                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
-                " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
-                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
-                " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
-                " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
-                " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
-            )
-
-        if safety_checker is not None and feature_extractor is None:
-            raise ValueError(
-                "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
-                " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
-            )
-
-        self.register_modules(
-            vae=vae,
-            text_encoder=text_encoder,
-            tokenizer=tokenizer,
-            unet=unet,
-            scheduler=scheduler,
-            safety_checker=safety_checker,
-            feature_extractor=feature_extractor,
-        )
-        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
-        self.register_to_config(requires_safety_checker=requires_safety_checker)
-
-    @torch.no_grad()
-    def __call__(
-        self,
-        prompt: Union[str, List[str]] = None,
-        image: Union[torch.FloatTensor, PIL.Image.Image] = None,
-        num_inference_steps: int = 100,
-        guidance_scale: float = 7.5,
-        image_guidance_scale: float = 1.5,
-        negative_prompt: Optional[Union[str, List[str]]] = None,
-        num_images_per_prompt: Optional[int] = 1,
-        eta: float = 0.0,
-        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
-        latents: Optional[torch.FloatTensor] = None,
-        prompt_embeds: Optional[torch.FloatTensor] = None,
-        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
-        output_type: Optional[str] = "pil",
-        return_dict: bool = True,
-        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
-        callback_steps: Optional[int] = 1,
-    ):
-        r"""
-        Function invoked when calling the pipeline for generation.
-
-        Args:
-             prompt (`str` or `List[str]`, *optional*):
-                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
-                instead.
-            image (`PIL.Image.Image`):
-                `Image`, or tensor representing an image batch which will be repainted according to `prompt`.
-            num_inference_steps (`int`, *optional*, defaults to 100):
-                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
-                expense of slower inference.
-            guidance_scale (`float`, *optional*, defaults to 7.5):
-                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
-                `guidance_scale` is defined as `w` of equation 2. of [Imagen
-                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
-                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
-                usually at the expense of lower image quality. This pipeline requires a value of at least `1`.
-            image_guidance_scale (`float`, *optional*, defaults to 1.5):
-                Image guidance scale is to push the generated image towards the inital image `image`. Image guidance
-                scale is enabled by setting `image_guidance_scale > 1`. Higher image guidance scale encourages to
-                generate images that are closely linked to the source image `image`, usually at the expense of lower
-                image quality. This pipeline requires a value of at least `1`.
-            negative_prompt (`str` or `List[str]`, *optional*):
-                The prompt or prompts not to guide the image generation. If not defined, one has to pass
-                `negative_prompt_embeds`. instead. Ignored when not using guidance (i.e., ignored if `guidance_scale`
-                is less than `1`).
-            num_images_per_prompt (`int`, *optional*, defaults to 1):
-                The number of images to generate per prompt.
-            eta (`float`, *optional*, defaults to 0.0):
-                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
-                [`schedulers.DDIMScheduler`], will be ignored for others.
-            generator (`torch.Generator`, *optional*):
-                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
-                to make generation deterministic.
-            latents (`torch.FloatTensor`, *optional*):
-                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
-                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
-                tensor will ge generated by sampling using the supplied random `generator`.
-            prompt_embeds (`torch.FloatTensor`, *optional*):
-                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
-                provided, text embeddings will be generated from `prompt` input argument.
-            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
-                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
-                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
-                argument.
-            output_type (`str`, *optional*, defaults to `"pil"`):
-                The output format of the generate image. Choose between
-                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
-            return_dict (`bool`, *optional*, defaults to `True`):
-                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
-                plain tuple.
-            callback (`Callable`, *optional*):
-                A function that will be called every `callback_steps` steps during inference. The function will be
-                called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
-            callback_steps (`int`, *optional*, defaults to 1):
-                The frequency at which the `callback` function will be called. If not specified, the callback will be
-                called at every step.
-
-        Examples:
-
-        ```py
-        >>> import PIL
-        >>> import requests
-        >>> import torch
-        >>> from io import BytesIO
-
-        >>> from diffusers import StableDiffusionInstructPix2PixPipeline
-
-
-        >>> def download_image(url):
-        ...     response = requests.get(url)
-        ...     return PIL.Image.open(BytesIO(response.content)).convert("RGB")
-
-
-        >>> img_url = "https://huggingface.co/datasets/diffusers/diffusers-images-docs/resolve/main/mountain.png"
-
-        >>> image = download_image(img_url).resize((512, 512))
-
-        >>> pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained(
-        ...     "timbrooks/instruct-pix2pix", torch_dtype=torch.float16
-        ... )
-        >>> pipe = pipe.to("cuda")
-
-        >>> prompt = "make the mountains snowy"
-        >>> image = pipe(prompt=prompt, image=image).images[0]
-        ```
-
-        Returns:
-            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
-            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
-            When returning a tuple, the first element is a list with the generated images, and the second element is a
-            list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
-            (nsfw) content, according to the `safety_checker`.
-        """
-        # 0. Check inputs
-        self.check_inputs(prompt, callback_steps)
-
-        if image is None:
-            raise ValueError("`image` input cannot be undefined.")
-
-        # 1. Define call parameters
-        batch_size = 1 if isinstance(prompt, str) else len(prompt)
-        device = self._execution_device
-        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
-        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
-        # corresponds to doing no classifier free guidance.
-        do_classifier_free_guidance = guidance_scale > 1.0 and image_guidance_scale >= 1.0
-        # check if scheduler is in sigmas space
-        scheduler_is_in_sigma_space = hasattr(self.scheduler, "sigmas")
-
-        # 2. Encode input prompt
-        prompt_embeds = self._encode_prompt(
-            prompt,
-            device,
-            num_images_per_prompt,
-            do_classifier_free_guidance,
-            negative_prompt,
-            prompt_embeds=prompt_embeds,
-            negative_prompt_embeds=negative_prompt_embeds,
-        )
-
-        # 3. Preprocess image
-        image = preprocess(image)
-        height, width = image.shape[-2:]
-
-        # 4. set timesteps
-        self.scheduler.set_timesteps(num_inference_steps, device=device)
-        timesteps = self.scheduler.timesteps
-
-        # 5. Prepare Image latents
-        image_latents = self.prepare_image_latents(
-            image,
-            batch_size,
-            num_images_per_prompt,
-            prompt_embeds.dtype,
-            device,
-            do_classifier_free_guidance,
-            generator,
-        )
-
-        # 6. Prepare latent variables
-        num_channels_latents = self.vae.config.latent_channels
-        latents = self.prepare_latents(
-            batch_size * num_images_per_prompt,
-            num_channels_latents,
-            height,
-            width,
-            prompt_embeds.dtype,
-            device,
-            generator,
-            latents,
-        )
-
-        # 7. Check that shapes of latents and image match the UNet channels
-        num_channels_image = image_latents.shape[1]
-        if num_channels_latents + num_channels_image != self.unet.config.in_channels:
-            raise ValueError(
-                f"Incorrect configuration settings! The config of `pipeline.unet`: {self.unet.config} expects"
-                f" {self.unet.config.in_channels} but received `num_channels_latents`: {num_channels_latents} +"
-                f" `num_channels_image`: {num_channels_image} "
-                f" = {num_channels_latents+num_channels_image}. Please verify the config of"
-                " `pipeline.unet` or your `image` input."
-            )
-
-        # 8. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
-        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
-
-        # 9. Denoising loop
-        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
-        with self.progress_bar(total=num_inference_steps) as progress_bar:
-            for i, t in enumerate(timesteps):
-                # Expand the latents if we are doing classifier free guidance.
-                # The latents are expanded 3 times because for pix2pix the guidance\
-                # is applied for both the text and the input image.
-                latent_model_input = torch.cat([latents] * 3) if do_classifier_free_guidance else latents
-
-                # concat latents, image_latents in the channel dimension
-                scaled_latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
-                scaled_latent_model_input = torch.cat([scaled_latent_model_input, image_latents], dim=1)
-
-                # predict the noise residual
-                noise_pred = self.unet(scaled_latent_model_input, t, encoder_hidden_states=prompt_embeds).sample
-
-                # Hack:
-                # For karras style schedulers the model does classifer free guidance using the
-                # predicted_original_sample instead of the noise_pred. So we need to compute the
-                # predicted_original_sample here if we are using a karras style scheduler.
-                if scheduler_is_in_sigma_space:
-                    step_index = (self.scheduler.timesteps == t).nonzero().item()
-                    sigma = self.scheduler.sigmas[step_index]
-                    noise_pred = latent_model_input - sigma * noise_pred
-
-                # perform guidance
-                if do_classifier_free_guidance:
-                    noise_pred_text, noise_pred_image, noise_pred_uncond = noise_pred.chunk(3)
-                    noise_pred = (
-                        noise_pred_uncond
-                        + guidance_scale * (noise_pred_text - noise_pred_image)
-                        + image_guidance_scale * (noise_pred_image - noise_pred_uncond)
-                    )
-
-                # Hack:
-                # For karras style schedulers the model does classifer free guidance using the
-                # predicted_original_sample instead of the noise_pred. But the scheduler.step function
-                # expects the noise_pred and computes the predicted_original_sample internally. So we
-                # need to overwrite the noise_pred here such that the value of the computed
-                # predicted_original_sample is correct.
-                if scheduler_is_in_sigma_space:
-                    noise_pred = (noise_pred - latents) / (-sigma)
-
-                # compute the previous noisy sample x_t -> x_t-1
-                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
-
-                # call the callback, if provided
-                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
-                    progress_bar.update()
-                    if callback is not None and i % callback_steps == 0:
-                        callback(i, t, latents)
-
-        # 10. Post-processing
-        image = self.decode_latents(latents)
-
-        # 11. Run safety checker
-        image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
-
-        # 12. Convert to PIL
-        if output_type == "pil":
-            image = self.numpy_to_pil(image)
-
-        if not return_dict:
-            return (image, has_nsfw_concept)
-
-        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
-
-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_sequential_cpu_offload
-    def enable_sequential_cpu_offload(self, gpu_id=0):
-        r"""
-        Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet,
-        text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a
-        `torch.device('meta') and loaded to GPU only when their specific submodule has its `forward` method called.
-        """
-        if is_accelerate_available():
-            from accelerate import cpu_offload
-        else:
-            raise ImportError("Please install accelerate via `pip install accelerate`")
-
-        device = torch.device(f"cuda:{gpu_id}")
-
-        for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae]:
-            cpu_offload(cpu_offloaded_model, device)
-
-        if self.safety_checker is not None:
-            cpu_offload(self.safety_checker, execution_device=device, offload_buffers=True)
-
-    @property
-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._execution_device
-    def _execution_device(self):
-        r"""
-        Returns the device on which the pipeline's models will be executed. After calling
-        `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module
-        hooks.
-        """
-        if self.device != torch.device("meta") or not hasattr(self.unet, "_hf_hook"):
-            return self.device
-        for module in self.unet.modules():
-            if (
-                hasattr(module, "_hf_hook")
-                and hasattr(module._hf_hook, "execution_device")
-                and module._hf_hook.execution_device is not None
-            ):
-                return torch.device(module._hf_hook.execution_device)
-        return self.device
-
-    def _encode_prompt(
-        self,
-        prompt,
-        device,
-        num_images_per_prompt,
-        do_classifier_free_guidance,
-        negative_prompt=None,
-        prompt_embeds: Optional[torch.FloatTensor] = None,
-        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
-    ):
-        r"""
-        Encodes the prompt into text encoder hidden states.
-
-        Args:
-             prompt (`str` or `List[str]`, *optional*):
-                prompt to be encoded
-            device: (`torch.device`):
-                torch device
-            num_images_per_prompt (`int`):
-                number of images that should be generated per prompt
-            do_classifier_free_guidance (`bool`):
-                whether to use classifier free guidance or not
-            negative_ prompt (`str` or `List[str]`, *optional*):
-                The prompt or prompts not to guide the image generation. If not defined, one has to pass
-                `negative_prompt_embeds`. instead. If not defined, one has to pass `negative_prompt_embeds`. instead.
-                Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`).
-            prompt_embeds (`torch.FloatTensor`, *optional*):
-                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
-                provided, text embeddings will be generated from `prompt` input argument.
-            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
-                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
-                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
-                argument.
-        """
-        if prompt is not None and isinstance(prompt, str):
-            batch_size = 1
-        elif prompt is not None and isinstance(prompt, list):
-            batch_size = len(prompt)
-        else:
-            batch_size = prompt_embeds.shape[0]
-
-        if prompt_embeds is None:
-            text_inputs = self.tokenizer(
-                prompt,
-                padding="max_length",
-                max_length=self.tokenizer.model_max_length,
-                truncation=True,
-                return_tensors="pt",
-            )
-            text_input_ids = text_inputs.input_ids
-            untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
-
-            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
-                text_input_ids, untruncated_ids
-            ):
-                removed_text = self.tokenizer.batch_decode(
-                    untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
-                )
-                logger.warning(
-                    "The following part of your input was truncated because CLIP can only handle sequences up to"
-                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
-                )
-
-            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
-                attention_mask = text_inputs.attention_mask.to(device)
-            else:
-                attention_mask = None
-
-            prompt_embeds = self.text_encoder(
-                text_input_ids.to(device),
-                attention_mask=attention_mask,
-            )
-            prompt_embeds = prompt_embeds[0]
-
-        prompt_embeds = prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
-
-        bs_embed, seq_len, _ = prompt_embeds.shape
-        # duplicate text embeddings for each generation per prompt, using mps friendly method
-        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
-        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
-
-        # get unconditional embeddings for classifier free guidance
-        if do_classifier_free_guidance and negative_prompt_embeds is None:
-            uncond_tokens: List[str]
-            if negative_prompt is None:
-                uncond_tokens = [""] * batch_size
-            elif type(prompt) is not type(negative_prompt):
-                raise TypeError(
-                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
-                    f" {type(prompt)}."
-                )
-            elif isinstance(negative_prompt, str):
-                uncond_tokens = [negative_prompt]
-            elif batch_size != len(negative_prompt):
-                raise ValueError(
-                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
-                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
-                    " the batch size of `prompt`."
-                )
-            else:
-                uncond_tokens = negative_prompt
-
-            max_length = prompt_embeds.shape[1]
-            uncond_input = self.tokenizer(
-                uncond_tokens,
-                padding="max_length",
-                max_length=max_length,
-                truncation=True,
-                return_tensors="pt",
-            )
-
-            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
-                attention_mask = uncond_input.attention_mask.to(device)
-            else:
-                attention_mask = None
-
-            negative_prompt_embeds = self.text_encoder(
-                uncond_input.input_ids.to(device),
-                attention_mask=attention_mask,
-            )
-            negative_prompt_embeds = negative_prompt_embeds[0]
-
-        if do_classifier_free_guidance:
-            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
-            seq_len = negative_prompt_embeds.shape[1]
-
-            negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
-
-            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
-            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
-
-            # For classifier free guidance, we need to do two forward passes.
-            # Here we concatenate the unconditional and text embeddings into a single batch
-            # to avoid doing two forward passes
-            # pix2pix has two  negative embeddings, and unlike in other pipelines latents are ordered [prompt_embeds, negative_prompt_embeds, negative_prompt_embeds]
-            prompt_embeds = torch.cat([prompt_embeds, negative_prompt_embeds, negative_prompt_embeds])
-
-        return prompt_embeds
-
-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
-    def run_safety_checker(self, image, device, dtype):
-        if self.safety_checker is not None:
-            safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(device)
-            image, has_nsfw_concept = self.safety_checker(
-                images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
-            )
-        else:
-            has_nsfw_concept = None
-        return image, has_nsfw_concept
-
-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
-    def prepare_extra_step_kwargs(self, generator, eta):
-        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
-        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
-        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
-        # and should be between [0, 1]
-
-        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
-        extra_step_kwargs = {}
-        if accepts_eta:
-            extra_step_kwargs["eta"] = eta
-
-        # check if the scheduler accepts generator
-        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
-        if accepts_generator:
-            extra_step_kwargs["generator"] = generator
-        return extra_step_kwargs
-
-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
-    def decode_latents(self, latents):
-        latents = 1 / 0.18215 * latents
-        image = self.vae.decode(latents).sample
-        image = (image / 2 + 0.5).clamp(0, 1)
-        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloa16
-        image = image.cpu().permute(0, 2, 3, 1).float().numpy()
-        return image
-
-    def check_inputs(self, prompt, callback_steps):
-        if not isinstance(prompt, str) and not isinstance(prompt, list):
-            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
-
-        if (callback_steps is None) or (
-            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
-        ):
-            raise ValueError(
-                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
-                f" {type(callback_steps)}."
-            )
-
-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
-    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
-        shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
-        if isinstance(generator, list) and len(generator) != batch_size:
-            raise ValueError(
-                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
-                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
-            )
-
-        if latents is None:
-            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
-        else:
-            latents = latents.to(device)
-
-        # scale the initial noise by the standard deviation required by the scheduler
-        latents = latents * self.scheduler.init_noise_sigma
-        return latents
-
-    def prepare_image_latents(
-        self, image, batch_size, num_images_per_prompt, dtype, device, do_classifier_free_guidance, generator=None
-    ):
-        if not isinstance(image, (torch.Tensor, PIL.Image.Image, list)):
-            raise ValueError(
-                f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is {type(image)}"
-            )
-
-        image = image.to(device=device, dtype=dtype)
-
-        batch_size = batch_size * num_images_per_prompt
-        if isinstance(generator, list) and len(generator) != batch_size:
-            raise ValueError(
-                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
-                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
-            )
-
-        if isinstance(generator, list):
-            image_latents = [self.vae.encode(image[i : i + 1]).latent_dist.mode() for i in range(batch_size)]
-            image_latents = torch.cat(image_latents, dim=0)
-        else:
-            image_latents = self.vae.encode(image).latent_dist.mode()
-
-        if batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] == 0:
-            # expand image_latents for batch_size
-            deprecation_message = (
-                f"You have passed {batch_size} text prompts (`prompt`), but only {image_latents.shape[0]} initial"
-                " images (`image`). Initial images are now duplicating to match the number of text prompts. Note"
-                " that this behavior is deprecated and will be removed in a version 1.0.0. Please make sure to update"
-                " your script to pass as many initial images as text prompts to suppress this warning."
-            )
-            deprecate("len(prompt) != len(image)", "1.0.0", deprecation_message, standard_warn=False)
-            additional_image_per_prompt = batch_size // image_latents.shape[0]
-            image_latents = torch.cat([image_latents] * additional_image_per_prompt, dim=0)
-        elif batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] != 0:
-            raise ValueError(
-                f"Cannot duplicate `image` of batch size {image_latents.shape[0]} to {batch_size} text prompts."
-            )
-        else:
-            image_latents = torch.cat([image_latents], dim=0)
-
-        if do_classifier_free_guidance:
-            uncond_image_latents = torch.zeros_like(image_latents)
-            image_latents = torch.cat([image_latents, image_latents, uncond_image_latents], dim=0)
-
-        return image_latents
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_k_diffusion.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_k_diffusion.py
@@ -137,10 +137,13 @@ class StableDiffusionKDiffusionPipeline(DiffusionPipeline):
        device = torch.device(f"cuda:{gpu_id}")

        for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae]:
-            cpu_offload(cpu_offloaded_model, device)
+            if cpu_offloaded_model is not None:
+                cpu_offload(cpu_offloaded_model, device)

        if self.safety_checker is not None:
-            cpu_offload(self.safety_checker, execution_device=device, offload_buffers=True)
+            # TODO(Patrick) - there is currently a bug with cpu offload of nn.Parameter in accelerate
+            # fix by only offloading self.safety_checker for now
+            cpu_offload(self.safety_checker.vision_model, device)

    @property
    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._execution_device
@@ -162,21 +165,12 @@ class StableDiffusionKDiffusionPipeline(DiffusionPipeline):
        return self.device

    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
-    def _encode_prompt(
-        self,
-        prompt,
-        device,
-        num_images_per_prompt,
-        do_classifier_free_guidance,
-        negative_prompt=None,
-        prompt_embeds: Optional[torch.FloatTensor] = None,
-        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
-    ):
+    def _encode_prompt(self, prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt):
        r"""
        Encodes the prompt into text encoder hidden states.

        Args:
-             prompt (`str` or `List[str]`, *optional*):
+            prompt (`str` or `list(int)`):
                prompt to be encoded
            device: (`torch.device`):
                torch device
@@ -184,67 +178,47 @@ class StableDiffusionKDiffusionPipeline(DiffusionPipeline):
                number of images that should be generated per prompt
            do_classifier_free_guidance (`bool`):
                whether to use classifier free guidance or not
-            negative_ prompt (`str` or `List[str]`, *optional*):
-                The prompt or prompts not to guide the image generation. If not defined, one has to pass
-                `negative_prompt_embeds`. instead. If not defined, one has to pass `negative_prompt_embeds`. instead.
-                Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`).
-            prompt_embeds (`torch.FloatTensor`, *optional*):
-                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
-                provided, text embeddings will be generated from `prompt` input argument.
-            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
-                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
-                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
-                argument.
+            negative_prompt (`str` or `List[str]`):
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `guidance_scale` is less than `1`).
        """
-        if prompt is not None and isinstance(prompt, str):
-            batch_size = 1
-        elif prompt is not None and isinstance(prompt, list):
-            batch_size = len(prompt)
+        batch_size = len(prompt) if isinstance(prompt, list) else 1
+
+        text_inputs = self.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=self.tokenizer.model_max_length,
+            truncation=True,
+            return_tensors="pt",
+        )
+        text_input_ids = text_inputs.input_ids
+        untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+        if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
+            removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1])
+            logger.warning(
+                "The following part of your input was truncated because CLIP can only handle sequences up to"
+                f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+            )
+
+        if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+            attention_mask = text_inputs.attention_mask.to(device)
        else:
-            batch_size = prompt_embeds.shape[0]
+            attention_mask = None

-        if prompt_embeds is None:
-            text_inputs = self.tokenizer(
-                prompt,
-                padding="max_length",
-                max_length=self.tokenizer.model_max_length,
-                truncation=True,
-                return_tensors="pt",
-            )
-            text_input_ids = text_inputs.input_ids
-            untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+        text_embeddings = self.text_encoder(
+            text_input_ids.to(device),
+            attention_mask=attention_mask,
+        )
+        text_embeddings = text_embeddings[0]

-            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
-                text_input_ids, untruncated_ids
-            ):
-                removed_text = self.tokenizer.batch_decode(
-                    untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
-                )
-                logger.warning(
-                    "The following part of your input was truncated because CLIP can only handle sequences up to"
-                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
-                )
-
-            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
-                attention_mask = text_inputs.attention_mask.to(device)
-            else:
-                attention_mask = None
-
-            prompt_embeds = self.text_encoder(
-                text_input_ids.to(device),
-                attention_mask=attention_mask,
-            )
-            prompt_embeds = prompt_embeds[0]
-
-        prompt_embeds = prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
-
-        bs_embed, seq_len, _ = prompt_embeds.shape
        # duplicate text embeddings for each generation per prompt, using mps friendly method
-        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
-        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+        bs_embed, seq_len, _ = text_embeddings.shape
+        text_embeddings = text_embeddings.repeat(1, num_images_per_prompt, 1)
+        text_embeddings = text_embeddings.view(bs_embed * num_images_per_prompt, seq_len, -1)

        # get unconditional embeddings for classifier free guidance
-        if do_classifier_free_guidance and negative_prompt_embeds is None:
+        if do_classifier_free_guidance:
            uncond_tokens: List[str]
            if negative_prompt is None:
                uncond_tokens = [""] * batch_size
@@ -264,7 +238,7 @@ class StableDiffusionKDiffusionPipeline(DiffusionPipeline):
            else:
                uncond_tokens = negative_prompt

-            max_length = prompt_embeds.shape[1]
+            max_length = text_input_ids.shape[-1]
            uncond_input = self.tokenizer(
                uncond_tokens,
                padding="max_length",
@@ -278,27 +252,23 @@ class StableDiffusionKDiffusionPipeline(DiffusionPipeline):
            else:
                attention_mask = None

-            negative_prompt_embeds = self.text_encoder(
+            uncond_embeddings = self.text_encoder(
                uncond_input.input_ids.to(device),
                attention_mask=attention_mask,
            )
-            negative_prompt_embeds = negative_prompt_embeds[0]
+            uncond_embeddings = uncond_embeddings[0]

-        if do_classifier_free_guidance:
            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
-            seq_len = negative_prompt_embeds.shape[1]
-
-            negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
-
-            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
-            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+            seq_len = uncond_embeddings.shape[1]
+            uncond_embeddings = uncond_embeddings.repeat(1, num_images_per_prompt, 1)
+            uncond_embeddings = uncond_embeddings.view(batch_size * num_images_per_prompt, seq_len, -1)

            # For classifier free guidance, we need to do two forward passes.
            # Here we concatenate the unconditional and text embeddings into a single batch
            # to avoid doing two forward passes
-            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+            text_embeddings = torch.cat([uncond_embeddings, text_embeddings])

-        return prompt_embeds
+        return text_embeddings

    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
    def run_safety_checker(self, image, device, dtype):
@@ -350,7 +320,7 @@ class StableDiffusionKDiffusionPipeline(DiffusionPipeline):
    @torch.no_grad()
    def __call__(
        self,
-        prompt: Union[str, List[str]] = None,
+        prompt: Union[str, List[str]],
        height: Optional[int] = None,
        width: Optional[int] = None,
        num_inference_steps: int = 50,
@@ -360,8 +330,6 @@ class StableDiffusionKDiffusionPipeline(DiffusionPipeline):
        eta: float = 0.0,
        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
        latents: Optional[torch.FloatTensor] = None,
-        prompt_embeds: Optional[torch.FloatTensor] = None,
-        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
        output_type: Optional[str] = "pil",
        return_dict: bool = True,
        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
@@ -371,9 +339,8 @@ class StableDiffusionKDiffusionPipeline(DiffusionPipeline):
        Function invoked when calling the pipeline for generation.

        Args:
-            prompt (`str` or `List[str]`, *optional*):
-                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
-                instead.
+            prompt (`str` or `List[str]`):
+                The prompt or prompts to guide the image generation.
            height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
                The height in pixels of the generated image.
            width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
@@ -388,9 +355,8 @@ class StableDiffusionKDiffusionPipeline(DiffusionPipeline):
                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
                usually at the expense of lower image quality.
            negative_prompt (`str` or `List[str]`, *optional*):
-                The prompt or prompts not to guide the image generation. If not defined, one has to pass
-                `negative_prompt_embeds`. instead. Ignored when not using guidance (i.e., ignored if `guidance_scale`
-                is less than `1`).
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `guidance_scale` is less than `1`).
            num_images_per_prompt (`int`, *optional*, defaults to 1):
                The number of images to generate per prompt.
            eta (`float`, *optional*, defaults to 0.0):
@@ -403,13 +369,6 @@ class StableDiffusionKDiffusionPipeline(DiffusionPipeline):
                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                tensor will ge generated by sampling using the supplied random `generator`.
-            prompt_embeds (`torch.FloatTensor`, *optional*):
-                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
-                provided, text embeddings will be generated from `prompt` input argument.
-            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
-                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
-                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
-                argument.
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generate image. Choose between
                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
@@ -448,20 +407,14 @@ class StableDiffusionKDiffusionPipeline(DiffusionPipeline):
            raise ValueError("has to use guidance_scale")

        # 3. Encode input prompt
-        prompt_embeds = self._encode_prompt(
-            prompt,
-            device,
-            num_images_per_prompt,
-            do_classifier_free_guidance,
-            negative_prompt,
-            prompt_embeds=prompt_embeds,
-            negative_prompt_embeds=negative_prompt_embeds,
+        text_embeddings = self._encode_prompt(
+            prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt
        )

        # 4. Prepare timesteps
-        self.scheduler.set_timesteps(num_inference_steps, device=prompt_embeds.device)
+        self.scheduler.set_timesteps(num_inference_steps, device=text_embeddings.device)
        sigmas = self.scheduler.sigmas
-        sigmas = sigmas.to(prompt_embeds.dtype)
+        sigmas = sigmas.to(text_embeddings.dtype)

        # 5. Prepare latent variables
        num_channels_latents = self.unet.in_channels
@@ -470,7 +423,7 @@ class StableDiffusionKDiffusionPipeline(DiffusionPipeline):
            num_channels_latents,
            height,
            width,
-            prompt_embeds.dtype,
+            text_embeddings.dtype,
            device,
            generator,
            latents,
@@ -484,7 +437,7 @@ class StableDiffusionKDiffusionPipeline(DiffusionPipeline):
            latent_model_input = torch.cat([x] * 2)
            t = torch.cat([t] * 2)

-            noise_pred = self.k_diffusion_model(latent_model_input, t, cond=prompt_embeds)
+            noise_pred = self.k_diffusion_model(latent_model_input, t, cond=text_embeddings)

            noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
            noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
@@ -497,7 +450,7 @@ class StableDiffusionKDiffusionPipeline(DiffusionPipeline):
        image = self.decode_latents(latents)

        # 9. Run safety checker
-        image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
+        image, has_nsfw_concept = self.run_safety_checker(image, device, text_embeddings.dtype)

        # 10. Convert to PIL
        if output_type == "pil":
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py
@@ -22,7 +22,7 @@ import PIL
 from transformers import CLIPTextModel, CLIPTokenizer

 from ...models import AutoencoderKL, UNet2DConditionModel
-from ...schedulers import DDPMScheduler, KarrasDiffusionSchedulers
+from ...schedulers import DDIMScheduler, DDPMScheduler, LMSDiscreteScheduler, PNDMScheduler
 from ...utils import is_accelerate_available, logging, randn_tensor
 from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput

@@ -84,7 +84,7 @@ class StableDiffusionUpscalePipeline(DiffusionPipeline):
        tokenizer: CLIPTokenizer,
        unet: UNet2DConditionModel,
        low_res_scheduler: DDPMScheduler,
-        scheduler: KarrasDiffusionSchedulers,
+        scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler],
        max_noise_level: int = 350,
    ):
        super().__init__()
@@ -136,21 +136,12 @@ class StableDiffusionUpscalePipeline(DiffusionPipeline):
        return self.device

    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
-    def _encode_prompt(
-        self,
-        prompt,
-        device,
-        num_images_per_prompt,
-        do_classifier_free_guidance,
-        negative_prompt=None,
-        prompt_embeds: Optional[torch.FloatTensor] = None,
-        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
-    ):
+    def _encode_prompt(self, prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt):
        r"""
        Encodes the prompt into text encoder hidden states.

        Args:
-             prompt (`str` or `List[str]`, *optional*):
+            prompt (`str` or `list(int)`):
                prompt to be encoded
            device: (`torch.device`):
                torch device
@@ -158,67 +149,47 @@ class StableDiffusionUpscalePipeline(DiffusionPipeline):
                number of images that should be generated per prompt
            do_classifier_free_guidance (`bool`):
                whether to use classifier free guidance or not
-            negative_ prompt (`str` or `List[str]`, *optional*):
-                The prompt or prompts not to guide the image generation. If not defined, one has to pass
-                `negative_prompt_embeds`. instead. If not defined, one has to pass `negative_prompt_embeds`. instead.
-                Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`).
-            prompt_embeds (`torch.FloatTensor`, *optional*):
-                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
-                provided, text embeddings will be generated from `prompt` input argument.
-            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
-                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
-                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
-                argument.
+            negative_prompt (`str` or `List[str]`):
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `guidance_scale` is less than `1`).
        """
-        if prompt is not None and isinstance(prompt, str):
-            batch_size = 1
-        elif prompt is not None and isinstance(prompt, list):
-            batch_size = len(prompt)
+        batch_size = len(prompt) if isinstance(prompt, list) else 1
+
+        text_inputs = self.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=self.tokenizer.model_max_length,
+            truncation=True,
+            return_tensors="pt",
+        )
+        text_input_ids = text_inputs.input_ids
+        untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+        if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
+            removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1])
+            logger.warning(
+                "The following part of your input was truncated because CLIP can only handle sequences up to"
+                f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+            )
+
+        if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+            attention_mask = text_inputs.attention_mask.to(device)
        else:
-            batch_size = prompt_embeds.shape[0]
+            attention_mask = None

-        if prompt_embeds is None:
-            text_inputs = self.tokenizer(
-                prompt,
-                padding="max_length",
-                max_length=self.tokenizer.model_max_length,
-                truncation=True,
-                return_tensors="pt",
-            )
-            text_input_ids = text_inputs.input_ids
-            untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+        text_embeddings = self.text_encoder(
+            text_input_ids.to(device),
+            attention_mask=attention_mask,
+        )
+        text_embeddings = text_embeddings[0]

-            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
-                text_input_ids, untruncated_ids
-            ):
-                removed_text = self.tokenizer.batch_decode(
-                    untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
-                )
-                logger.warning(
-                    "The following part of your input was truncated because CLIP can only handle sequences up to"
-                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
-                )
-
-            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
-                attention_mask = text_inputs.attention_mask.to(device)
-            else:
-                attention_mask = None
-
-            prompt_embeds = self.text_encoder(
-                text_input_ids.to(device),
-                attention_mask=attention_mask,
-            )
-            prompt_embeds = prompt_embeds[0]
-
-        prompt_embeds = prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
-
-        bs_embed, seq_len, _ = prompt_embeds.shape
        # duplicate text embeddings for each generation per prompt, using mps friendly method
-        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
-        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+        bs_embed, seq_len, _ = text_embeddings.shape
+        text_embeddings = text_embeddings.repeat(1, num_images_per_prompt, 1)
+        text_embeddings = text_embeddings.view(bs_embed * num_images_per_prompt, seq_len, -1)

        # get unconditional embeddings for classifier free guidance
-        if do_classifier_free_guidance and negative_prompt_embeds is None:
+        if do_classifier_free_guidance:
            uncond_tokens: List[str]
            if negative_prompt is None:
                uncond_tokens = [""] * batch_size
@@ -238,7 +209,7 @@ class StableDiffusionUpscalePipeline(DiffusionPipeline):
            else:
                uncond_tokens = negative_prompt

-            max_length = prompt_embeds.shape[1]
+            max_length = text_input_ids.shape[-1]
            uncond_input = self.tokenizer(
                uncond_tokens,
                padding="max_length",
@@ -252,27 +223,23 @@ class StableDiffusionUpscalePipeline(DiffusionPipeline):
            else:
                attention_mask = None

-            negative_prompt_embeds = self.text_encoder(
+            uncond_embeddings = self.text_encoder(
                uncond_input.input_ids.to(device),
                attention_mask=attention_mask,
            )
-            negative_prompt_embeds = negative_prompt_embeds[0]
+            uncond_embeddings = uncond_embeddings[0]

-        if do_classifier_free_guidance:
            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
-            seq_len = negative_prompt_embeds.shape[1]
-
-            negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
-
-            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
-            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+            seq_len = uncond_embeddings.shape[1]
+            uncond_embeddings = uncond_embeddings.repeat(1, num_images_per_prompt, 1)
+            uncond_embeddings = uncond_embeddings.view(batch_size * num_images_per_prompt, seq_len, -1)

            # For classifier free guidance, we need to do two forward passes.
            # Here we concatenate the unconditional and text embeddings into a single batch
            # to avoid doing two forward passes
-            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+            text_embeddings = torch.cat([uncond_embeddings, text_embeddings])

-        return prompt_embeds
+        return text_embeddings

    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
    def prepare_extra_step_kwargs(self, generator, eta):
@@ -358,8 +325,8 @@ class StableDiffusionUpscalePipeline(DiffusionPipeline):
    @torch.no_grad()
    def __call__(
        self,
-        prompt: Union[str, List[str]] = None,
-        image: Union[torch.FloatTensor, PIL.Image.Image, List[PIL.Image.Image]] = None,
+        prompt: Union[str, List[str]],
+        image: Union[torch.FloatTensor, PIL.Image.Image, List[PIL.Image.Image]],
        num_inference_steps: int = 75,
        guidance_scale: float = 9.0,
        noise_level: int = 20,
@@ -368,8 +335,6 @@ class StableDiffusionUpscalePipeline(DiffusionPipeline):
        eta: float = 0.0,
        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
        latents: Optional[torch.FloatTensor] = None,
-        prompt_embeds: Optional[torch.FloatTensor] = None,
-        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
        output_type: Optional[str] = "pil",
        return_dict: bool = True,
        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
@@ -379,9 +344,8 @@ class StableDiffusionUpscalePipeline(DiffusionPipeline):
        Function invoked when calling the pipeline for generation.

        Args:
-            prompt (`str` or `List[str]`, *optional*):
-                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
-                instead.
+            prompt (`str` or `List[str]`):
+                The prompt or prompts to guide the image generation.
            image (`PIL.Image.Image` or List[`PIL.Image.Image`] or `torch.FloatTensor`):
                `Image`, or tensor representing an image batch which will be upscaled. *
            num_inference_steps (`int`, *optional*, defaults to 50):
@@ -394,9 +358,8 @@ class StableDiffusionUpscalePipeline(DiffusionPipeline):
                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
                usually at the expense of lower image quality.
            negative_prompt (`str` or `List[str]`, *optional*):
-                The prompt or prompts not to guide the image generation. If not defined, one has to pass
-                `negative_prompt_embeds`. instead. Ignored when not using guidance (i.e., ignored if `guidance_scale`
-                is less than `1`).
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `guidance_scale` is less than `1`).
            num_images_per_prompt (`int`, *optional*, defaults to 1):
                The number of images to generate per prompt.
            eta (`float`, *optional*, defaults to 0.0):
@@ -409,13 +372,6 @@ class StableDiffusionUpscalePipeline(DiffusionPipeline):
                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                tensor will ge generated by sampling using the supplied random `generator`.
-            prompt_embeds (`torch.FloatTensor`, *optional*):
-                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
-                provided, text embeddings will be generated from `prompt` input argument.
-            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
-                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
-                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
-                argument.
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generate image. Choose between
                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
@@ -466,9 +422,6 @@ class StableDiffusionUpscalePipeline(DiffusionPipeline):
        # 1. Check inputs
        self.check_inputs(prompt, image, noise_level, callback_steps)

-        if image is None:
-            raise ValueError("`image` input cannot be undefined.")
-
        # 2. Define call parameters
        batch_size = 1 if isinstance(prompt, str) else len(prompt)
        device = self._execution_device
@@ -478,19 +431,13 @@ class StableDiffusionUpscalePipeline(DiffusionPipeline):
        do_classifier_free_guidance = guidance_scale > 1.0

        # 3. Encode input prompt
-        prompt_embeds = self._encode_prompt(
-            prompt,
-            device,
-            num_images_per_prompt,
-            do_classifier_free_guidance,
-            negative_prompt,
-            prompt_embeds=prompt_embeds,
-            negative_prompt_embeds=negative_prompt_embeds,
+        text_embeddings = self._encode_prompt(
+            prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt
        )

        # 4. Preprocess image
        image = preprocess(image)
-        image = image.to(dtype=prompt_embeds.dtype, device=device)
+        image = image.to(dtype=text_embeddings.dtype, device=device)

        # 5. set timesteps
        self.scheduler.set_timesteps(num_inference_steps, device=device)
@@ -498,7 +445,7 @@ class StableDiffusionUpscalePipeline(DiffusionPipeline):

        # 5. Add noise to image
        noise_level = torch.tensor([noise_level], dtype=torch.long, device=device)
-        noise = randn_tensor(image.shape, generator=generator, device=device, dtype=prompt_embeds.dtype)
+        noise = randn_tensor(image.shape, generator=generator, device=device, dtype=text_embeddings.dtype)
        image = self.low_res_scheduler.add_noise(image, noise, noise_level)

        batch_multiplier = 2 if do_classifier_free_guidance else 1
@@ -513,7 +460,7 @@ class StableDiffusionUpscalePipeline(DiffusionPipeline):
            num_channels_latents,
            height,
            width,
-            prompt_embeds.dtype,
+            text_embeddings.dtype,
            device,
            generator,
            latents,
@@ -546,13 +493,13 @@ class StableDiffusionUpscalePipeline(DiffusionPipeline):

                # predict the noise residual
                noise_pred = self.unet(
-                    latent_model_input, t, encoder_hidden_states=prompt_embeds, class_labels=noise_level
+                    latent_model_input, t, encoder_hidden_states=text_embeddings, class_labels=noise_level
                ).sample

                # perform guidance
                if do_classifier_free_guidance:
                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
-                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)

                # compute the previous noisy sample x_t -> x_t-1
                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
--- a/src/diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py
+++ b/src/diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py
@@ -10,7 +10,14 @@ from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer

 from ...configuration_utils import FrozenDict
 from ...models import AutoencoderKL, UNet2DConditionModel
-from ...schedulers import KarrasDiffusionSchedulers
+from ...schedulers import (
+    DDIMScheduler,
+    DPMSolverMultistepScheduler,
+    EulerAncestralDiscreteScheduler,
+    EulerDiscreteScheduler,
+    LMSDiscreteScheduler,
+    PNDMScheduler,
+)
 from ...utils import deprecate, is_accelerate_available, logging, randn_tensor
 from ..pipeline_utils import DiffusionPipeline
 from . import StableDiffusionSafePipelineOutput
@@ -58,7 +65,14 @@ class StableDiffusionPipelineSafe(DiffusionPipeline):
        text_encoder: CLIPTextModel,
        tokenizer: CLIPTokenizer,
        unet: UNet2DConditionModel,
-        scheduler: KarrasDiffusionSchedulers,
+        scheduler: Union[
+            DDIMScheduler,
+            DPMSolverMultistepScheduler,
+            EulerAncestralDiscreteScheduler,
+            EulerDiscreteScheduler,
+            LMSDiscreteScheduler,
+            PNDMScheduler,
+        ],
        safety_checker: SafeStableDiffusionSafetyChecker,
        feature_extractor: CLIPFeatureExtractor,
        requires_safety_checker: bool = True,
@@ -217,7 +231,7 @@ class StableDiffusionPipelineSafe(DiffusionPipeline):
        Encodes the prompt into text encoder hidden states.

        Args:
-            prompt (`str` or `List[str]`):
+            prompt (`str` or `list(int)`):
                prompt to be encoded
            device: (`torch.device`):
                torch device
@@ -253,16 +267,16 @@ class StableDiffusionPipelineSafe(DiffusionPipeline):
        else:
            attention_mask = None

-        prompt_embeds = self.text_encoder(
+        text_embeddings = self.text_encoder(
            text_input_ids.to(device),
            attention_mask=attention_mask,
        )
-        prompt_embeds = prompt_embeds[0]
+        text_embeddings = text_embeddings[0]

        # duplicate text embeddings for each generation per prompt, using mps friendly method
-        bs_embed, seq_len, _ = prompt_embeds.shape
-        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
-        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+        bs_embed, seq_len, _ = text_embeddings.shape
+        text_embeddings = text_embeddings.repeat(1, num_images_per_prompt, 1)
+        text_embeddings = text_embeddings.view(bs_embed * num_images_per_prompt, seq_len, -1)

        # get unconditional embeddings for classifier free guidance
        if do_classifier_free_guidance:
@@ -299,16 +313,16 @@ class StableDiffusionPipelineSafe(DiffusionPipeline):
            else:
                attention_mask = None

-            negative_prompt_embeds = self.text_encoder(
+            uncond_embeddings = self.text_encoder(
                uncond_input.input_ids.to(device),
                attention_mask=attention_mask,
            )
-            negative_prompt_embeds = negative_prompt_embeds[0]
+            uncond_embeddings = uncond_embeddings[0]

            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
-            seq_len = negative_prompt_embeds.shape[1]
-            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
-            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+            seq_len = uncond_embeddings.shape[1]
+            uncond_embeddings = uncond_embeddings.repeat(1, num_images_per_prompt, 1)
+            uncond_embeddings = uncond_embeddings.view(batch_size * num_images_per_prompt, seq_len, -1)

            # Encode the safety concept text
            if enable_safety_guidance:
@@ -329,15 +343,15 @@ class StableDiffusionPipelineSafe(DiffusionPipeline):
                # For classifier free guidance + sld, we need to do three forward passes.
                # Here we concatenate the unconditional and text embeddings into a single batch
                # to avoid doing three forward passes
-                prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds, safety_embeddings])
+                text_embeddings = torch.cat([uncond_embeddings, text_embeddings, safety_embeddings])

            else:
                # For classifier free guidance, we need to do two forward passes.
                # Here we concatenate the unconditional and text embeddings into a single batch
                # to avoid doing two forward passes
-                prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+                text_embeddings = torch.cat([uncond_embeddings, text_embeddings])

-        return prompt_embeds
+        return text_embeddings

    def run_safety_checker(self, image, device, dtype, enable_safety_guidance):
        if self.safety_checker is not None:
@@ -390,16 +404,10 @@ class StableDiffusionPipelineSafe(DiffusionPipeline):
        return extra_step_kwargs

    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.check_inputs
-    def check_inputs(
-        self,
-        prompt,
-        height,
-        width,
-        callback_steps,
-        negative_prompt=None,
-        prompt_embeds=None,
-        negative_prompt_embeds=None,
-    ):
+    def check_inputs(self, prompt, height, width, callback_steps):
+        if not isinstance(prompt, str) and not isinstance(prompt, list):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
        if height % 8 != 0 or width % 8 != 0:
            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")

@@ -411,32 +419,6 @@ class StableDiffusionPipelineSafe(DiffusionPipeline):
                f" {type(callback_steps)}."
            )

-        if prompt is not None and prompt_embeds is not None:
-            raise ValueError(
-                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
-                " only forward one of the two."
-            )
-        elif prompt is None and prompt_embeds is None:
-            raise ValueError(
-                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
-            )
-        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
-            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
-
-        if negative_prompt is not None and negative_prompt_embeds is not None:
-            raise ValueError(
-                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
-                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
-            )
-
-        if prompt_embeds is not None and negative_prompt_embeds is not None:
-            if prompt_embeds.shape != negative_prompt_embeds.shape:
-                raise ValueError(
-                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
-                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
-                    f" {negative_prompt_embeds.shape}."
-                )
-
    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
        shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
@@ -615,7 +597,7 @@ class StableDiffusionPipelineSafe(DiffusionPipeline):
            warnings.warn("Safety checker disabled!")

        # 3. Encode input prompt
-        prompt_embeds = self._encode_prompt(
+        text_embeddings = self._encode_prompt(
            prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt, enable_safety_guidance
        )

@@ -630,7 +612,7 @@ class StableDiffusionPipelineSafe(DiffusionPipeline):
            num_channels_latents,
            height,
            width,
-            prompt_embeds.dtype,
+            text_embeddings.dtype,
            device,
            generator,
            latents,
@@ -653,7 +635,7 @@ class StableDiffusionPipelineSafe(DiffusionPipeline):
                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)

                # predict the noise residual
-                noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=prompt_embeds).sample
+                noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample

                # perform guidance
                if do_classifier_free_guidance:
@@ -712,7 +694,7 @@ class StableDiffusionPipelineSafe(DiffusionPipeline):

        # 9. Run safety checker
        image, has_nsfw_concept, flagged_images = self.run_safety_checker(
-            image, device, prompt_embeds.dtype, enable_safety_guidance
+            image, device, text_embeddings.dtype, enable_safety_guidance
        )

        # 10. Convert to PIL
--- a/src/diffusers/pipelines/stochastic_karras_ve/pipeline_stochastic_karras_ve.py
+++ b/src/diffusers/pipelines/stochastic_karras_ve/pipeline_stochastic_karras_ve.py
@@ -82,7 +82,7 @@ class KarrasVePipeline(DiffusionPipeline):
        model = self.unet

        # sample x_0 ~ N(0, sigma_0^2 * I)
-        sample = randn_tensor(shape, generator=generator, device=self.device) * self.scheduler.init_noise_sigma
+        sample = randn_tensor(shape, device=self.device) * self.scheduler.init_noise_sigma

        self.scheduler.set_timesteps(num_inference_steps)

--- a/Show More
+++ b/Show More