[Tests] Refactor integration tests

2026-02-24 11:50:35 +08:00 · 2022-10-31 09:51:04 +00:00
285 changed files with 5456 additions and 41006 deletions
--- a/.github/ISSUE_TEMPLATE/bug-report.yml
+++ b/.github/ISSUE_TEMPLATE/bug-report.yml
@@ -5,20 +5,7 @@ body:
  - type: markdown
    attributes:
      value: |
-        Thanks a lot for taking the time to file this issue 🤗.
-        Issues do not only help to improve the library, but also publicly document common problems, questions, workflows for the whole community!
-        Thus, issues are of the same importance as pull requests when contributing to this library ❤️.
-        In order to make your issue as **useful for the community as possible**, let's try to stick to some simple guidelines:
-        - 1. Please try to be as precise and concise as possible.
-             *Give your issue a fitting title. Assume that someone which very limited knowledge of diffusers can understand your issue. Add links to the source code, documentation other issues, pull requests etc...*
-        - 2. If your issue is about something not working, **always** provide a reproducible code snippet. The reader should be able to reproduce your issue by **only copy-pasting your code snippet into a Python shell**.
-             *The community cannot solve your issue if it cannot reproduce it. If your bug is related to training, add your training script and make everything needed to train public. Otherwise, just add a simple Python code snippet.*
-        - 3. Add the **minimum amount of code / context that is needed to understand, reproduce your issue**.
-             *Make the life of maintainers easy. `diffusers` is getting many issues every day. Make sure your issue is about one bug and one bug only. Make sure you add only the context, code needed to understand your issues - nothing more. Generally, every issue is a way of documenting this library, try to make it a good documentation entry.*
-  - type: markdown
-    attributes:
-      value: |
-        For more in-detail information on how to write good issues you can have a look [here](https://huggingface.co/course/chapter8/5?fw=pt)
+        Thanks for taking the time to fill out this bug report!
  - type: textarea
    id: bug-description
    attributes:
@@ -33,8 +20,6 @@ body:
      label: Reproduction
      description: Please provide a minimal reproducible code which we can copy/paste and reproduce the issue.
      placeholder: Reproduction
-    validations:
-      required: true
  - type: textarea
    id: logs
    attributes:
--- a/.github/ISSUE_TEMPLATE/config.yml
+++ b/.github/ISSUE_TEMPLATE/config.yml
@@ -1,4 +1,7 @@
 contact_links:
+  - name: Forum
+    url: https://discuss.huggingface.co/c/discussion-related-to-httpsgithubcomhuggingfacediffusers/63
+    about: General usage questions and community discussions
  - name: Blank issue
    url: https://github.com/huggingface/diffusers/issues/new
-    about: General usage questions and community discussions
+    about: Please note that the Forum is in most cases the right place for discussions
--- a/.github/workflows/build_docker_images.yml
+++ b/.github/workflows/build_docker_images.yml
@@ -1,50 +0,0 @@
-name: Build Docker images (nightly)
-
-on:
-  workflow_dispatch:
-  schedule:
-    - cron: "0 0 * * *" # every day at midnight
-
-concurrency:
-  group: docker-image-builds
-  cancel-in-progress: false
-
-env:
-  REGISTRY: diffusers
-
-jobs:
-  build-docker-images:
-    runs-on: ubuntu-latest
-
-    permissions:
-      contents: read
-      packages: write
-
-    strategy:
-      fail-fast: false
-      matrix:
-        image-name:
-          - diffusers-pytorch-cpu
-          - diffusers-pytorch-cuda
-          - diffusers-flax-cpu
-          - diffusers-flax-tpu
-          - diffusers-onnxruntime-cpu
-          - diffusers-onnxruntime-cuda
-
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v3
-
-      - name: Login to Docker Hub
-        uses: docker/login-action@v2
-        with:
-          username: ${{ env.REGISTRY }}
-          password: ${{ secrets.DOCKERHUB_TOKEN }}
-
-      - name: Build and push
-        uses: docker/build-push-action@v3
-        with:
-          no-cache: true
-          context: ./docker/${{ matrix.image-name }}
-          push: true
-          tags: ${{ env.REGISTRY }}/${{ matrix.image-name }}:latest
--- a/.github/workflows/nightly_tests.yml
+++ b/.github/workflows/nightly_tests.yml
@@ -1,66 +0,0 @@
-name: Nightly integration tests
-
-on:
-  schedule:
-    - cron: "0 0 * * *" # every day at midnight
-
-env:
-  DIFFUSERS_IS_CI: yes
-  HF_HOME: /mnt/cache
-  OMP_NUM_THREADS: 8
-  MKL_NUM_THREADS: 8
-  PYTEST_TIMEOUT: 1000
-  RUN_SLOW: yes
-
-jobs:
-  run_slow_tests_apple_m1:
-    name: Slow PyTorch MPS tests on MacOS
-    runs-on: [ self-hosted, apple-m1 ]
-
-    steps:
-      - name: Checkout diffusers
-        uses: actions/checkout@v3
-        with:
-          fetch-depth: 2
-
-      - name: Clean checkout
-        shell: arch -arch arm64 bash {0}
-        run: |
-          git clean -fxd
-
-      - name: Setup miniconda
-        uses: ./.github/actions/setup-miniconda
-        with:
-          python-version: 3.9
-
-      - name: Install dependencies
-        shell: arch -arch arm64 bash {0}
-        run: |
-          ${CONDA_RUN} python -m pip install --upgrade pip
-          ${CONDA_RUN} python -m pip install -e .[quality,test]
-          ${CONDA_RUN} python -m pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cpu
-          ${CONDA_RUN} python -m pip install git+https://github.com/huggingface/accelerate
-
-      - name: Environment
-        shell: arch -arch arm64 bash {0}
-        run: |
-          ${CONDA_RUN} python utils/print_env.py
-
-      - name: Run slow PyTorch tests on M1 (MPS)
-        shell: arch -arch arm64 bash {0}
-        env:
-          HF_HOME: /System/Volumes/Data/mnt/cache
-          HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
-        run: |
-          ${CONDA_RUN} python -m pytest -n 1 -s -v --make-reports=tests_torch_mps tests/
-
-      - name: Failure short reports
-        if: ${{ failure() }}
-        run: cat reports/tests_torch_mps_failures_short.txt
-
-      - name: Test suite reports artifacts
-        if: ${{ always() }}
-        uses: actions/upload-artifact@v2
-        with:
-          name: torch_mps_test_reports
-          path: reports
--- a/.github/workflows/pr_tests.yml
+++ b/.github/workflows/pr_tests.yml
@@ -10,45 +10,19 @@ concurrency:
  cancel-in-progress: true

 env:
-  DIFFUSERS_IS_CI: yes
-  OMP_NUM_THREADS: 4
-  MKL_NUM_THREADS: 4
+  OMP_NUM_THREADS: 8
+  MKL_NUM_THREADS: 8
  PYTEST_TIMEOUT: 60
+  MPS_TORCH_VERSION: 1.13.0

 jobs:
-  run_fast_tests:
-    strategy:
-      fail-fast: false
-      matrix:
-        config:
-          - name: Fast PyTorch CPU tests on Ubuntu
-            framework: pytorch
-            runner: docker-cpu
-            image: diffusers/diffusers-pytorch-cpu
-            report: torch_cpu
-          - name: Fast Flax CPU tests on Ubuntu
-            framework: flax
-            runner: docker-cpu
-            image: diffusers/diffusers-flax-cpu
-            report: flax_cpu
-          - name: Fast ONNXRuntime CPU tests on Ubuntu
-            framework: onnxruntime
-            runner: docker-cpu
-            image: diffusers/diffusers-onnxruntime-cpu
-            report: onnx_cpu
-
-    name: ${{ matrix.config.name }}
-
-    runs-on: ${{ matrix.config.runner }}
-
+  run_tests_cpu:
+    name: CPU tests on Ubuntu
+    runs-on: [ self-hosted, docker-gpu ]
    container:
-      image: ${{ matrix.config.image }}
+      image: python:3.7
      options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/

-    defaults:
-      run:
-        shell: bash
-
    steps:
    - name: Checkout diffusers
      uses: actions/checkout@v3
@@ -57,52 +31,32 @@ jobs:

    - name: Install dependencies
      run: |
-        apt-get update && apt-get install libsndfile1-dev -y
+        python -m pip install --upgrade pip
+        python -m pip install torch --extra-index-url https://download.pytorch.org/whl/cpu
        python -m pip install -e .[quality,test]
        python -m pip install git+https://github.com/huggingface/accelerate
-        python -m pip install -U git+https://github.com/huggingface/transformers

    - name: Environment
      run: |
        python utils/print_env.py

-    - name: Run fast PyTorch CPU tests
-      if: ${{ matrix.config.framework == 'pytorch' }}
+    - name: Run all fast tests on CPU
      run: |
-        python -m pytest -n 2 --max-worker-restart=0 --dist=loadfile \
-          -s -v -k "not Flax and not Onnx" \
-          --make-reports=tests_${{ matrix.config.report }} \
-          tests/
-
-    - name: Run fast Flax TPU tests
-      if: ${{ matrix.config.framework == 'flax' }}
-      run: |
-        python -m pytest -n 2 --max-worker-restart=0 --dist=loadfile \
-          -s -v -k "Flax" \
-          --make-reports=tests_${{ matrix.config.report }} \
-          tests/
-
-    - name: Run fast ONNXRuntime CPU tests
-      if: ${{ matrix.config.framework == 'onnxruntime' }}
-      run: |
-        python -m pytest -n 2 --max-worker-restart=0 --dist=loadfile \
-          -s -v -k "Onnx" \
-          --make-reports=tests_${{ matrix.config.report }} \
-          tests/
+        python -m pytest -n 2 --max-worker-restart=0 --dist=loadfile -s -v --make-reports=tests_torch_cpu tests/

    - name: Failure short reports
      if: ${{ failure() }}
-      run: cat reports/tests_${{ matrix.config.report }}_failures_short.txt
+      run: cat reports/tests_torch_cpu_failures_short.txt

    - name: Test suite reports artifacts
      if: ${{ always() }}
      uses: actions/upload-artifact@v2
      with:
-        name: pr_${{ matrix.config.report }}_test_reports
+        name: pr_torch_cpu_test_reports
        path: reports

-  run_fast_tests_apple_m1:
-    name: Fast PyTorch MPS tests on MacOS
+  run_tests_apple_m1:
+    name: MPS tests on Apple M1
    runs-on: [ self-hosted, apple-m1 ]

    steps:
@@ -126,22 +80,18 @@ jobs:
      run: |
        ${CONDA_RUN} python -m pip install --upgrade pip
        ${CONDA_RUN} python -m pip install -e .[quality,test]
-        ${CONDA_RUN} python -m pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cpu
+        ${CONDA_RUN} python -m pip install --pre torch==${MPS_TORCH_VERSION} --extra-index-url https://download.pytorch.org/whl/test/cpu
        ${CONDA_RUN} python -m pip install git+https://github.com/huggingface/accelerate
-        ${CONDA_RUN} python -m pip install -U git+https://github.com/huggingface/transformers

    - name: Environment
      shell: arch -arch arm64 bash {0}
      run: |
        ${CONDA_RUN} python utils/print_env.py

-    - name: Run fast PyTorch tests on M1 (MPS)
+    - name: Run all fast tests on MPS
      shell: arch -arch arm64 bash {0}
-      env:
-        HF_HOME: /System/Volumes/Data/mnt/cache
-        HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
      run: |
-        ${CONDA_RUN} python -m pytest -n 0 -s -v --make-reports=tests_torch_mps tests/
+        ${CONDA_RUN} python -m pytest -n 1 -s -v --make-reports=tests_torch_mps tests/

    - name: Failure short reports
      if: ${{ failure() }}
--- a/.github/workflows/push_tests.yml
+++ b/.github/workflows/push_tests.yml
@@ -6,7 +6,6 @@ on:
      - main

 env:
-  DIFFUSERS_IS_CI: yes
  HF_HOME: /mnt/cache
  OMP_NUM_THREADS: 8
  MKL_NUM_THREADS: 8
@@ -14,38 +13,12 @@ env:
  RUN_SLOW: yes

 jobs:
-  run_slow_tests:
-    strategy:
-      fail-fast: false
-      matrix:
-        config:
-          - name: Slow PyTorch CUDA tests on Ubuntu
-            framework: pytorch
-            runner: docker-gpu
-            image: diffusers/diffusers-pytorch-cuda
-            report: torch_cuda
-          - name: Slow Flax TPU tests on Ubuntu
-            framework: flax
-            runner: docker-tpu
-            image: diffusers/diffusers-flax-tpu
-            report: flax_tpu
-          - name: Slow ONNXRuntime CUDA tests on Ubuntu
-            framework: onnxruntime
-            runner: docker-gpu
-            image: diffusers/diffusers-onnxruntime-cuda
-            report: onnx_cuda
-
-    name: ${{ matrix.config.name }}
-
-    runs-on: ${{ matrix.config.runner }}
-
+  run_tests_single_gpu:
+    name: Diffusers tests
+    runs-on: [ self-hosted, docker-gpu, single-gpu ]
    container:
-      image: ${{ matrix.config.image }}
-      options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ ${{ matrix.config.runner == 'docker-tpu' && '--privileged' || '--gpus 0'}}
-
-    defaults:
-      run:
-        shell: bash
+      image: nvcr.io/nvidia/pytorch:22.07-py3
+      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache

    steps:
    - name: Checkout diffusers
@@ -54,69 +27,44 @@ jobs:
        fetch-depth: 2

    - name: NVIDIA-SMI
-      if : ${{ matrix.config.runner == 'docker-gpu' }}
      run: |
        nvidia-smi

    - name: Install dependencies
      run: |
+        python -m pip install --upgrade pip
+        python -m pip uninstall -y torch torchvision torchtext
+        python -m pip install torch --extra-index-url https://download.pytorch.org/whl/cu117
        python -m pip install -e .[quality,test]
        python -m pip install git+https://github.com/huggingface/accelerate
-        python -m pip install -U git+https://github.com/huggingface/transformers

    - name: Environment
      run: |
        python utils/print_env.py

-    - name: Run slow PyTorch CUDA tests
-      if: ${{ matrix.config.framework == 'pytorch' }}
+    - name: Run all (incl. slow) tests on GPU
      env:
        HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
      run: |
-        python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
-          -s -v -k "not Flax and not Onnx" \
-          --make-reports=tests_${{ matrix.config.report }} \
-          tests/
-
-    - name: Run slow Flax TPU tests
-      if: ${{ matrix.config.framework == 'flax' }}
-      env:
-        HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
-      run: |
-        python -m pytest -n 0 \
-          -s -v -k "Flax" \
-          --make-reports=tests_${{ matrix.config.report }} \
-          tests/
-
-    - name: Run slow ONNXRuntime CUDA tests
-      if: ${{ matrix.config.framework == 'onnxruntime' }}
-      env:
-        HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
-      run: |
-        python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
-          -s -v -k "Onnx" \
-          --make-reports=tests_${{ matrix.config.report }} \
-          tests/
+        python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile -s -v --make-reports=tests_torch_gpu tests/

    - name: Failure short reports
      if: ${{ failure() }}
-      run: cat reports/tests_${{ matrix.config.report }}_failures_short.txt
+      run: cat reports/tests_torch_gpu_failures_short.txt

    - name: Test suite reports artifacts
      if: ${{ always() }}
      uses: actions/upload-artifact@v2
      with:
-        name: ${{ matrix.config.report }}_test_reports
+        name: torch_test_reports
        path: reports

-  run_examples_tests:
-    name: Examples PyTorch CUDA tests on Ubuntu
-
-    runs-on: docker-gpu
-
+  run_examples_single_gpu:
+    name: Examples tests
+    runs-on: [ self-hosted, docker-gpu, single-gpu ]
    container:
-      image: diffusers/diffusers-pytorch-cuda
-      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/
+      image: nvcr.io/nvidia/pytorch:22.07-py3
+      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache

    steps:
    - name: Checkout diffusers
@@ -130,9 +78,11 @@ jobs:

    - name: Install dependencies
      run: |
+        python -m pip install --upgrade pip
+        python -m pip uninstall -y torch torchvision torchtext
+        python -m pip install torch --extra-index-url https://download.pytorch.org/whl/cu117
        python -m pip install -e .[quality,test,training]
        python -m pip install git+https://github.com/huggingface/accelerate
-        python -m pip install -U git+https://github.com/huggingface/transformers

    - name: Environment
      run: |
@@ -142,15 +92,15 @@ jobs:
      env:
        HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
      run: |
-        python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile -s -v --make-reports=examples_torch_cuda examples/
+        python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile -s -v --make-reports=examples_torch_gpu examples/

    - name: Failure short reports
      if: ${{ failure() }}
-      run: cat reports/examples_torch_cuda_failures_short.txt
+      run: cat reports/examples_torch_gpu_failures_short.txt

    - name: Test suite reports artifacts
      if: ${{ always() }}
      uses: actions/upload-artifact@v2
      with:
        name: examples_test_reports
-        path: reports
+        path: reports
--- a/.gitignore
+++ b/.gitignore
@@ -163,6 +163,4 @@ tags
 *.lock

 # DS_Store (MacOS)
-.DS_Store
-# RL pipelines may produce mp4 outputs
-*.mp4
+.DS_Store
--- a/README.md
+++ b/README.md
@@ -27,12 +27,10 @@ More precisely, 🤗 Diffusers offers:

 ## Installation

-### For PyTorch
-
 **With `pip`**
    
 ```bash
-pip install --upgrade diffusers[torch]
+pip install --upgrade diffusers
 ```

 **With `conda`**
@@ -41,14 +39,6 @@ pip install --upgrade diffusers[torch]
 conda install -c conda-forge diffusers
 ```

-### For Flax
-
-**With `pip`**
-
-```bash
-pip install --upgrade diffusers[flax]
-```
-
 **Apple Silicon (M1/M2) support**

 Please, refer to [the documentation](https://huggingface.co/docs/diffusers/optimization/mps).
@@ -152,7 +142,19 @@ it before the pipeline and pass it to `from_pretrained`.
 ```python
 from diffusers import LMSDiscreteScheduler

-pipe.scheduler = LMSDiscreteScheduler.from_config(pipe.scheduler.config)
+lms = LMSDiscreteScheduler(
+    beta_start=0.00085, 
+    beta_end=0.012, 
+    beta_schedule="scaled_linear"
+)
+
+pipe = StableDiffusionPipeline.from_pretrained(
+    "runwayml/stable-diffusion-v1-5", 
+    revision="fp16", 
+    torch_dtype=torch.float16,
+    scheduler=lms,
+)
+pipe = pipe.to("cuda")

 prompt = "a photo of an astronaut riding a horse on mars"
 image = pipe(prompt).images[0]  
@@ -280,7 +282,7 @@ init_image = init_image.resize((768, 512))

 prompt = "A fantasy landscape, trending on artstation"

-images = pipe(prompt=prompt, image=init_image, strength=0.75, guidance_scale=7.5).images
+images = pipe(prompt=prompt, init_image=init_image, strength=0.75, guidance_scale=7.5).images

 images[0].save("fantasy_landscape.png")
 ```
@@ -338,15 +340,14 @@ Textual Inversion is a technique for capturing novel concepts from a small numbe

 - Textual Inversion. Capture novel concepts from a small set of sample images, and associate them with new "words" in the embedding space of the text encoder. Please, refer to [our training examples](https://github.com/huggingface/diffusers/tree/main/examples/textual_inversion) or [documentation](https://huggingface.co/docs/diffusers/training/text_inversion) to try for yourself.

- Dreambooth. Another technique to capture new concepts in Stable Diffusion. This method fine-tunes the UNet (and, optionally, also the text encoder) of the pipeline to achieve impressive results. Please, refer to [our training example](https://github.com/huggingface/diffusers/tree/main/examples/dreambooth) and [training report](https://huggingface.co/blog/dreambooth) for additional details and training recommendations.
+- Dreambooth. Another technique to capture new concepts in Stable Diffusion. This method fine-tunes the UNet (and, optionally, also the text encoder) of the pipeline to achieve impressive results. Please, refer to [our training examples](https://github.com/huggingface/diffusers/tree/main/examples/dreambooth) and [training report](https://wandb.ai/psuraj/dreambooth/reports/Dreambooth-Training-Analysis--VmlldzoyNzk0NDc3) for additional details and training recommendations.

 - Full Stable Diffusion fine-tuning. If you have a more sizable dataset with a specific look or style, you can fine-tune Stable Diffusion so that it outputs images following those examples. This was the approach taken to create [a Pokémon Stable Diffusion model](https://huggingface.co/justinpinkney/pokemon-stable-diffusion) (by Justing Pinkney / Lambda Labs), [a Japanese specific version of Stable Diffusion](https://huggingface.co/spaces/rinna/japanese-stable-diffusion) (by [Rinna Co.](https://github.com/rinnakk/japanese-stable-diffusion/) and others. You can start at [our text-to-image fine-tuning example](https://github.com/huggingface/diffusers/tree/main/examples/text_to_image) and go from there.


 ## Stable Diffusion Community Pipelines

-The release of Stable Diffusion as an open source model has fostered a lot of interesting ideas and experimentation. 
-Our [Community Examples folder](https://github.com/huggingface/diffusers/tree/main/examples/community) contains many ideas worth exploring, like interpolating to create animated videos, using CLIP Guidance for additional prompt fidelity, term weighting, and much more! [Take a look](https://huggingface.co/docs/diffusers/using-diffusers/custom_pipeline_overview) and [contribute your own](https://huggingface.co/docs/diffusers/using-diffusers/contribute_pipeline).
+The release of Stable Diffusion as an open source model has fostered a lot of interesting ideas and experimentation. Our [Community Examples folder](https://github.com/huggingface/diffusers/tree/main/examples/community) contains many ideas worth exploring, like interpolating to create animated videos, using CLIP Guidance for additional prompt fidelity, term weighting, and much more! Take a look and [contribute your own](https://huggingface.co/docs/diffusers/using-diffusers/custom_pipelines).

 ## Other Examples

@@ -357,7 +358,7 @@ There are many ways to try running Diffusers! Here we outline code-focused tools
 If you want to run the code yourself 💻, you can try out:
 - [Text-to-Image Latent Diffusion](https://huggingface.co/CompVis/ldm-text2im-large-256)
 ```python
-# !pip install diffusers["torch"] transformers
+# !pip install diffusers transformers
 from diffusers import DiffusionPipeline

 device = "cuda"
@@ -376,7 +377,7 @@ image.save("squirrel.png")
 ```
 - [Unconditional Diffusion with discrete scheduler](https://huggingface.co/google/ddpm-celebahq-256)
 ```python
-# !pip install diffusers["torch"]
+# !pip install diffusers
 from diffusers import DDPMPipeline, DDIMPipeline, PNDMPipeline

 model_id = "google/ddpm-celebahq-256"
@@ -395,14 +396,10 @@ image.save("ddpm_generated_image.png")
 - [Unconditional Latent Diffusion](https://huggingface.co/CompVis/ldm-celebahq-256)
 - [Unconditional Diffusion with continuous scheduler](https://huggingface.co/google/ncsnpp-ffhq-1024)

-**Other Image Notebooks**:
+**Other Notebooks**:
 * [image-to-image generation with Stable Diffusion](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/image_2_image_using_diffusers.ipynb) ![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg),
 * [tweak images via repeated Stable Diffusion seeds](https://colab.research.google.com/github/pcuenca/diffusers-examples/blob/main/notebooks/stable-diffusion-seeds.ipynb) ![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg),

-**Diffusers for Other Modalities**:
-* [Molecule conformation generation](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/geodiff_molecule_conformation.ipynb) ![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg),
-* [Model-based reinforcement learning](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/reinforcement_learning_with_diffusers.ipynb) ![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg),
-
 ### Web Demos
 If you just want to play around with some web demos, you can try out the following 🚀 Spaces:
 | Model                          	| Hugging Face Spaces                                                                                                                                               	|
@@ -425,7 +422,7 @@ If you just want to play around with some web demos, you can try out the followi
 <p>
    
 **Schedulers**: Algorithm class for both **inference** and **training**.
-The class provides functionality to compute previous image according to alpha, beta schedule as well as predict noise for training. Also known as **Samplers**.
+The class provides functionality to compute the previous image according to the alpha, beta schedule as well as predict noise for training.
 *Examples*: [DDPM](https://arxiv.org/abs/2006.11239), [DDIM](https://arxiv.org/abs/2010.02502), [PNDM](https://arxiv.org/abs/2202.09778), [DEIS](https://arxiv.org/abs/2204.13902)

 <p align="center">
--- a/docker/diffusers-flax-cpu/Dockerfile
+++ b/docker/diffusers-flax-cpu/Dockerfile
@@ -1,44 +0,0 @@
-FROM ubuntu:20.04
-LABEL maintainer="Hugging Face"
-LABEL repository="diffusers"
-
-ENV DEBIAN_FRONTEND=noninteractive
-
-RUN apt update && \
-    apt install -y bash \
-                   build-essential \
-                   git \
-                   git-lfs \
-                   curl \
-                   ca-certificates \
-                   libsndfile1-dev \
-                   python3.8 \
-                   python3-pip \
-                   python3.8-venv && \
-    rm -rf /var/lib/apt/lists
-
-# make sure to use venv
-RUN python3 -m venv /opt/venv
-ENV PATH="/opt/venv/bin:$PATH"
-
-# pre-install the heavy dependencies (these can later be overridden by the deps from setup.py)
-# follow the instructions here: https://cloud.google.com/tpu/docs/run-in-container#train_a_jax_model_in_a_docker_container
-RUN python3 -m pip install --no-cache-dir --upgrade pip && \
-    python3 -m pip install --upgrade --no-cache-dir \
-        clu \
-        "jax[cpu]>=0.2.16,!=0.3.2" \
-        "flax>=0.4.1" \
-        "jaxlib>=0.1.65" && \
-    python3 -m pip install --no-cache-dir \
-        accelerate \
-        datasets \
-        hf-doc-builder \
-        huggingface-hub \
-        librosa \
-        modelcards \
-        numpy \
-        scipy \
-        tensorboard \
-        transformers
-
-CMD ["/bin/bash"]
--- a/docker/diffusers-flax-tpu/Dockerfile
+++ b/docker/diffusers-flax-tpu/Dockerfile
@@ -1,46 +0,0 @@
-FROM ubuntu:20.04
-LABEL maintainer="Hugging Face"
-LABEL repository="diffusers"
-
-ENV DEBIAN_FRONTEND=noninteractive
-
-RUN apt update && \
-    apt install -y bash \
-                   build-essential \
-                   git \
-                   git-lfs \
-                   curl \
-                   ca-certificates \
-                   libsndfile1-dev \
-                   python3.8 \
-                   python3-pip \
-                   python3.8-venv && \
-    rm -rf /var/lib/apt/lists
-
-# make sure to use venv
-RUN python3 -m venv /opt/venv
-ENV PATH="/opt/venv/bin:$PATH"
-
-# pre-install the heavy dependencies (these can later be overridden by the deps from setup.py)
-# follow the instructions here: https://cloud.google.com/tpu/docs/run-in-container#train_a_jax_model_in_a_docker_container
-RUN python3 -m pip install --no-cache-dir --upgrade pip && \
-    python3 -m pip install --no-cache-dir \
-        "jax[tpu]>=0.2.16,!=0.3.2" \
-        -f https://storage.googleapis.com/jax-releases/libtpu_releases.html && \
-    python3 -m pip install --upgrade --no-cache-dir \
-        clu \
-        "flax>=0.4.1" \
-        "jaxlib>=0.1.65" && \
-    python3 -m pip install --no-cache-dir \
-        accelerate \
-        datasets \
-        hf-doc-builder \
-        huggingface-hub \
-        librosa \        
-        modelcards \
-        numpy \
-        scipy \
-        tensorboard \
-        transformers
-
-CMD ["/bin/bash"]
--- a/docker/diffusers-onnxruntime-cpu/Dockerfile
+++ b/docker/diffusers-onnxruntime-cpu/Dockerfile
@@ -1,44 +0,0 @@
-FROM ubuntu:20.04
-LABEL maintainer="Hugging Face"
-LABEL repository="diffusers"
-
-ENV DEBIAN_FRONTEND=noninteractive
-
-RUN apt update && \
-    apt install -y bash \
-                   build-essential \
-                   git \
-                   git-lfs \
-                   curl \
-                   ca-certificates \
-                   libsndfile1-dev \
-                   python3.8 \
-                   python3-pip \
-                   python3.8-venv && \
-    rm -rf /var/lib/apt/lists
-
-# make sure to use venv
-RUN python3 -m venv /opt/venv
-ENV PATH="/opt/venv/bin:$PATH"
-
-# pre-install the heavy dependencies (these can later be overridden by the deps from setup.py)
-RUN python3 -m pip install --no-cache-dir --upgrade pip && \
-    python3 -m pip install --no-cache-dir \
-        torch \
-        torchvision \
-        torchaudio \
-        onnxruntime \
-        --extra-index-url https://download.pytorch.org/whl/cpu && \
-    python3 -m pip install --no-cache-dir \
-        accelerate \
-        datasets \
-        hf-doc-builder \
-        huggingface-hub \
-        librosa \
-        modelcards \
-        numpy \
-        scipy \
-        tensorboard \
-        transformers
-
-CMD ["/bin/bash"]
--- a/docker/diffusers-onnxruntime-cuda/Dockerfile
+++ b/docker/diffusers-onnxruntime-cuda/Dockerfile
@@ -1,44 +0,0 @@
-FROM nvidia/cuda:11.6.2-cudnn8-devel-ubuntu20.04
-LABEL maintainer="Hugging Face"
-LABEL repository="diffusers"
-
-ENV DEBIAN_FRONTEND=noninteractive
-
-RUN apt update && \
-    apt install -y bash \
-                   build-essential \
-                   git \
-                   git-lfs \
-                   curl \
-                   ca-certificates \
-                   libsndfile1-dev \
-                   python3.8 \
-                   python3-pip \
-                   python3.8-venv && \
-    rm -rf /var/lib/apt/lists
-
-# make sure to use venv
-RUN python3 -m venv /opt/venv
-ENV PATH="/opt/venv/bin:$PATH"
-
-# pre-install the heavy dependencies (these can later be overridden by the deps from setup.py)
-RUN python3 -m pip install --no-cache-dir --upgrade pip && \
-    python3 -m pip install --no-cache-dir \
-        torch \
-        torchvision \
-        torchaudio \
-        "onnxruntime-gpu>=1.13.1" \
-        --extra-index-url https://download.pytorch.org/whl/cu117 && \
-    python3 -m pip install --no-cache-dir \
-        accelerate \
-        datasets \
-        hf-doc-builder \
-        huggingface-hub \
-        librosa \
-        modelcards \
-        numpy \
-        scipy \
-        tensorboard \
-        transformers
-
-CMD ["/bin/bash"]
--- a/docker/diffusers-pytorch-cpu/Dockerfile
+++ b/docker/diffusers-pytorch-cpu/Dockerfile
@@ -1,43 +0,0 @@
-FROM ubuntu:20.04
-LABEL maintainer="Hugging Face"
-LABEL repository="diffusers"
-
-ENV DEBIAN_FRONTEND=noninteractive
-
-RUN apt update && \
-    apt install -y bash \
-                   build-essential \
-                   git \
-                   git-lfs \
-                   curl \
-                   ca-certificates \
-                   libsndfile1-dev \
-                   python3.8 \
-                   python3-pip \
-                   python3.8-venv && \
-    rm -rf /var/lib/apt/lists
-
-# make sure to use venv
-RUN python3 -m venv /opt/venv
-ENV PATH="/opt/venv/bin:$PATH"
-
-# pre-install the heavy dependencies (these can later be overridden by the deps from setup.py)
-RUN python3 -m pip install --no-cache-dir --upgrade pip && \
-    python3 -m pip install --no-cache-dir \
-        torch \
-        torchvision \
-        torchaudio \
-        --extra-index-url https://download.pytorch.org/whl/cpu && \
-    python3 -m pip install --no-cache-dir \
-        accelerate \
-        datasets \
-        hf-doc-builder \
-        huggingface-hub \
-        librosa \
-        modelcards \
-        numpy \
-        scipy \
-        tensorboard \
-        transformers
-
-CMD ["/bin/bash"]
--- a/docker/diffusers-pytorch-cuda/Dockerfile
+++ b/docker/diffusers-pytorch-cuda/Dockerfile
@@ -1,43 +0,0 @@
-FROM nvidia/cuda:11.7.1-cudnn8-runtime-ubuntu20.04
-LABEL maintainer="Hugging Face"
-LABEL repository="diffusers"
-
-ENV DEBIAN_FRONTEND=noninteractive
-
-RUN apt update && \
-    apt install -y bash \
-                   build-essential \
-                   git \
-                   git-lfs \
-                   curl \
-                   ca-certificates \
-                   libsndfile1-dev \
-                   python3.8 \
-                   python3-pip \
-                   python3.8-venv && \
-    rm -rf /var/lib/apt/lists
-
-# make sure to use venv
-RUN python3 -m venv /opt/venv
-ENV PATH="/opt/venv/bin:$PATH"
-
-# pre-install the heavy dependencies (these can later be overridden by the deps from setup.py)
-RUN python3 -m pip install --no-cache-dir --upgrade pip && \
-    python3 -m pip install --no-cache-dir \
-        torch \
-        torchvision \
-        torchaudio \
-        --extra-index-url https://download.pytorch.org/whl/cu117 && \
-    python3 -m pip install --no-cache-dir \
-        accelerate \
-        datasets \
-        hf-doc-builder \
-        huggingface-hub \
-        librosa \
-        modelcards \
-        numpy \
-        scipy \
-        tensorboard \
-        transformers
-
-CMD ["/bin/bash"]
--- a/docs/source/_toctree.yml
+++ b/docs/source/_toctree.yml
@@ -10,8 +10,6 @@
  - sections:
    - local: using-diffusers/loading
      title: "Loading Pipelines, Models, and Schedulers"
-    - local: using-diffusers/schedulers
-      title: "Using different Schedulers"
    - local: using-diffusers/configuration
      title: "Configuring Pipelines, Models, and Schedulers"
    - local: using-diffusers/custom_pipeline_overview
@@ -31,14 +29,6 @@
    - local: using-diffusers/contribute_pipeline
      title: "How to contribute a Pipeline"
    title: "Pipelines for Inference"
-  - sections:
-    - local: using-diffusers/rl
-      title: "Reinforcement Learning"
-    - local: using-diffusers/audio
-      title: "Audio"
-    - local: using-diffusers/other-modalities
-      title: "Other Modalities"
-    title: "Taking Diffusers Beyond Images"
  title: "Using Diffusers"
 - sections:
  - local: optimization/fp16
@@ -49,8 +39,6 @@
    title: "OpenVINO"
  - local: optimization/mps
    title: "MPS"
-  - local: optimization/habana
-    title: "Habana Gaudi"
  title: "Optimization/Special Hardware"
 - sections:
  - local: training/overview
@@ -90,10 +78,6 @@
  - sections:
    - local: api/pipelines/overview
      title: "Overview"
-    - local: api/pipelines/alt_diffusion
-      title: "AltDiffusion"
-    - local: api/pipelines/cycle_diffusion
-      title: "Cycle Diffusion"
    - local: api/pipelines/ddim
      title: "DDIM"
    - local: api/pipelines/ddpm
@@ -102,33 +86,15 @@
      title: "Latent Diffusion"
    - local: api/pipelines/latent_diffusion_uncond
      title: "Unconditional Latent Diffusion"
-    - local: api/pipelines/paint_by_example
-      title: "PaintByExample"
    - local: api/pipelines/pndm
      title: "PNDM"
    - local: api/pipelines/score_sde_ve
      title: "Score SDE VE"
    - local: api/pipelines/stable_diffusion
      title: "Stable Diffusion"
-    - local: api/pipelines/stable_diffusion_2
-      title: "Stable Diffusion 2"
-    - local: api/pipelines/stable_diffusion_safe
-      title: "Safe Stable Diffusion"
    - local: api/pipelines/stochastic_karras_ve
      title: "Stochastic Karras VE"
    - local: api/pipelines/dance_diffusion
      title: "Dance Diffusion"
-    - local: api/pipelines/versatile_diffusion
-      title: "Versatile Diffusion"
-    - local: api/pipelines/vq_diffusion
-      title: "VQ Diffusion"
-    - local: api/pipelines/repaint
-      title: "RePaint"
-    - local: api/pipelines/audio_diffusion
-      title: "Audio Diffusion"
    title: "Pipelines"
-  - sections:
-    - local: api/experimental/rl
-      title: "RL Planning"
-    title: "Experimental Features"
  title: "API"
--- a/docs/source/api/configuration.mdx
+++ b/docs/source/api/configuration.mdx
@@ -15,9 +15,9 @@ specific language governing permissions and limitations under the License.
 In Diffusers, schedulers of type [`schedulers.scheduling_utils.SchedulerMixin`], and models of type [`ModelMixin`] inherit from [`ConfigMixin`] which conveniently takes care of storing all parameters that are 
 passed to the respective `__init__` methods in a JSON-configuration file.

-## ConfigMixin
+<!-- TODO(PVP): add an example and more detailed information here -->

+## ConfigMixin
 [[autodoc]] ConfigMixin
-	- load_config
 	- from_config
 	- save_config
--- a/docs/source/api/experimental/rl.mdx
+++ b/docs/source/api/experimental/rl.mdx
@@ -1,15 +0,0 @@
-<!--Copyright 2022 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-->
-
-# TODO
-
-Coming soon!
--- a/docs/source/api/models.mdx
+++ b/docs/source/api/models.mdx
@@ -22,15 +22,12 @@ The models are built on the base class ['ModelMixin'] that is a `torch.nn.module
 ## UNet2DOutput
 [[autodoc]] models.unet_2d.UNet2DOutput

-## UNet2DModel
-[[autodoc]] UNet2DModel
-
-## UNet1DOutput
-[[autodoc]] models.unet_1d.UNet1DOutput
-
 ## UNet1DModel
 [[autodoc]] UNet1DModel

+## UNet2DModel
+[[autodoc]] UNet2DModel
+
 ## UNet2DConditionOutput
 [[autodoc]] models.unet_2d_condition.UNet2DConditionOutput

@@ -52,12 +49,6 @@ The models are built on the base class ['ModelMixin'] that is a `torch.nn.module
 ## AutoencoderKL
 [[autodoc]] AutoencoderKL

-## Transformer2DModel
-[[autodoc]] Transformer2DModel
-
-## Transformer2DModelOutput
-[[autodoc]] models.attention.Transformer2DModelOutput
-
 ## FlaxModelMixin
 [[autodoc]] FlaxModelMixin

--- a/docs/source/api/pipelines/alt_diffusion.mdx
+++ b/docs/source/api/pipelines/alt_diffusion.mdx
@@ -1,83 +0,0 @@
-<!--Copyright 2022 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-->
-
-# AltDiffusion
-
-AltDiffusion was proposed in [AltCLIP: Altering the Language Encoder in CLIP for Extended Language Capabilities](https://arxiv.org/abs/2211.06679) by Zhongzhi Chen, Guang Liu, Bo-Wen Zhang, Fulong Ye, Qinghong Yang, Ledell Wu
-
-The abstract of the paper is the following:
-
-*In this work, we present a conceptually simple and effective method to train a strong bilingual multimodal representation model. Starting from the pretrained multimodal representation model CLIP released by OpenAI, we switched its text encoder with a pretrained multilingual text encoder XLM-R, and aligned both languages and image representations by a two-stage training schema consisting of teacher learning and contrastive learning. We validate our method through evaluations of a wide range of tasks. We set new state-of-the-art performances on a bunch of tasks including ImageNet-CN, Flicker30k- CN, and COCO-CN. Further, we obtain very close performances with CLIP on almost all tasks, suggesting that one can simply alter the text encoder in CLIP for extended capabilities such as multilingual understanding.*
-
-
-*Overview*:
-
-| Pipeline | Tasks | Colab | Demo
-|---|---|:---:|:---:|
-| [pipeline_alt_diffusion.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py) | *Text-to-Image Generation* | - | -
-| [pipeline_alt_diffusion_img2img.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py) | *Image-to-Image Text-Guided Generation* | - |-
-
-## Tips
-
- AltDiffusion is conceptually exaclty the same as [Stable Diffusion](./api/pipelines/stable_diffusion).
-
- *Run AltDiffusion*
-
-AltDiffusion can be tested very easily with the [`AltDiffusionPipeline`], [`AltDiffusionImg2ImgPipeline`] and the `"BAAI/AltDiffusion-m9"` checkpoint exactly in the same way it is shown in the [Conditional Image Generation Guide](./using-diffusers/conditional_image_generation) and the [Image-to-Image Generation Guide](./using-diffusers/img2img).
-
- *How to load and use different schedulers.*
-
-The alt diffusion pipeline uses [`DDIMScheduler`] scheduler by default. But `diffusers` provides many other schedulers that can be used with the alt diffusion pipeline such as [`PNDMScheduler`], [`LMSDiscreteScheduler`], [`EulerDiscreteScheduler`], [`EulerAncestralDiscreteScheduler`] etc.
-To use a different scheduler, you can either change it via the [`ConfigMixin.from_config`] method or pass the `scheduler` argument to the `from_pretrained` method of the pipeline. For example, to use the [`EulerDiscreteScheduler`], you can do the following:
-
-```python
->>> from diffusers import AltDiffusionPipeline, EulerDiscreteScheduler
-
->>> pipeline = AltDiffusionPipeline.from_pretrained("BAAI/AltDiffusion-m9")
->>> pipeline.scheduler = EulerDiscreteScheduler.from_config(pipeline.scheduler.config)
-
->>> # or
->>> euler_scheduler = EulerDiscreteScheduler.from_pretrained("BAAI/AltDiffusion-m9", subfolder="scheduler")
->>> pipeline = AltDiffusionPipeline.from_pretrained("BAAI/AltDiffusion-m9", scheduler=euler_scheduler)
-```
-
-
- *How to convert all use cases with multiple or single pipeline*
-
-If you want to use all possible use cases in a single `DiffusionPipeline` we recommend using the `components` functionality to instantiate all components in the most memory-efficient way:
-
-```python
->>> from diffusers import (
-...     AltDiffusionPipeline,
-...     AltDiffusionImg2ImgPipeline,
-... )
-
->>> text2img = AltDiffusionPipeline.from_pretrained("BAAI/AltDiffusion-m9")
->>> img2img = AltDiffusionImg2ImgPipeline(**text2img.components)
-
->>> # now you can use text2img(...) and img2img(...) just like the call methods of each respective pipeline
-```
-
-## AltDiffusionPipelineOutput
-[[autodoc]] pipelines.alt_diffusion.AltDiffusionPipelineOutput
-
-## AltDiffusionPipeline
-[[autodoc]] AltDiffusionPipeline
-	- __call__
-	- enable_attention_slicing
-	- disable_attention_slicing
-
-## AltDiffusionImg2ImgPipeline
-[[autodoc]] AltDiffusionImg2ImgPipeline
-	- __call__
-	- enable_attention_slicing
-	- disable_attention_slicing
--- a/docs/source/api/pipelines/audio_diffusion.mdx
+++ b/docs/source/api/pipelines/audio_diffusion.mdx
@@ -1,102 +0,0 @@
-<!--Copyright 2022 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-->
-
-# Audio Diffusion
-
-## Overview
-
-[Audio Diffusion](https://github.com/teticio/audio-diffusion) by Robert Dargavel Smith.
-
-Audio Diffusion leverages the recent advances in image generation using diffusion models by converting audio samples to
-and from mel spectrogram images.
-
-The original codebase of this implementation can be found [here](https://github.com/teticio/audio-diffusion), including
-training scripts and example notebooks.
-
-## Available Pipelines:
-
-| Pipeline | Tasks | Colab
-|---|---|:---:|
-| [pipeline_audio_diffusion.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/audio_diffusion/pipeline_audio_diffusion.py) | *Unconditional Audio Generation* | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/teticio/audio-diffusion/blob/master/notebooks/audio_diffusion_pipeline.ipynb) |
-
-
-## Examples:
-
-### Audio Diffusion
-
-```python
-import torch
-from IPython.display import Audio
-from diffusers import DiffusionPipeline
-
-device = "cuda" if torch.cuda.is_available() else "cpu"
-pipe = DiffusionPipeline.from_pretrained("teticio/audio-diffusion-256").to(device)
-
-output = pipe()
-display(output.images[0])
-display(Audio(output.audios[0], rate=mel.get_sample_rate()))
-```
-
-### Latent Audio Diffusion
-
-```python
-import torch
-from IPython.display import Audio
-from diffusers import DiffusionPipeline
-
-device = "cuda" if torch.cuda.is_available() else "cpu"
-pipe = DiffusionPipeline.from_pretrained("teticio/latent-audio-diffusion-256").to(device)
-
-output = pipe()
-display(output.images[0])
-display(Audio(output.audios[0], rate=pipe.mel.get_sample_rate()))
-```
-
-### Audio Diffusion with DDIM (faster)
-
-```python
-import torch
-from IPython.display import Audio
-from diffusers import DiffusionPipeline
-
-device = "cuda" if torch.cuda.is_available() else "cpu"
-pipe = DiffusionPipeline.from_pretrained("teticio/audio-diffusion-ddim-256").to(device)
-
-output = pipe()
-display(output.images[0])
-display(Audio(output.audios[0], rate=pipe.mel.get_sample_rate()))
-```
-
-### Variations, in-painting, out-painting etc.
-
-```python
-output = pipe(
-    raw_audio=output.audios[0, 0],
-    start_step=int(pipe.get_default_steps() / 2),
-    mask_start_secs=1,
-    mask_end_secs=1,
-)
-display(output.images[0])
-display(Audio(output.audios[0], rate=pipe.mel.get_sample_rate()))
-```
-
-## AudioDiffusionPipeline
-[[autodoc]] AudioDiffusionPipeline
-    - __call__
-    - encode
-    - slerp
-
-
-## Mel
-[[autodoc]] Mel
-    - audio_slice_to_image
-    - image_to_audio
--- a/docs/source/api/pipelines/cycle_diffusion.mdx
+++ b/docs/source/api/pipelines/cycle_diffusion.mdx
@@ -1,99 +0,0 @@
-<!--Copyright 2022 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-->
-
-# Cycle Diffusion
-
-## Overview
-
-Cycle Diffusion is a Text-Guided Image-to-Image Generation model proposed in [Unifying Diffusion Models' Latent Space, with Applications to CycleDiffusion and Guidance](https://arxiv.org/abs/2210.05559) by Chen Henry Wu, Fernando De la Torre.
-
-The abstract of the paper is the following:
-
-*Diffusion models have achieved unprecedented performance in generative modeling. The commonly-adopted formulation of the latent code of diffusion models is a sequence of gradually denoised samples, as opposed to the simpler (e.g., Gaussian) latent space of GANs, VAEs, and normalizing flows. This paper provides an alternative, Gaussian formulation of the latent space of various diffusion models, as well as an invertible DPM-Encoder that maps images into the latent space. While our formulation is purely based on the definition of diffusion models, we demonstrate several intriguing consequences. (1) Empirically, we observe that a common latent space emerges from two diffusion models trained independently on related domains. In light of this finding, we propose CycleDiffusion, which uses DPM-Encoder for unpaired image-to-image translation. Furthermore, applying CycleDiffusion to text-to-image diffusion models, we show that large-scale text-to-image diffusion models can be used as zero-shot image-to-image editors. (2) One can guide pre-trained diffusion models and GANs by controlling the latent codes in a unified, plug-and-play formulation based on energy-based models. Using the CLIP model and a face recognition model as guidance, we demonstrate that diffusion models have better coverage of low-density sub-populations and individuals than GANs.*
-
-*Tips*:
- The Cycle Diffusion pipeline is fully compatible with any [Stable Diffusion](./stable_diffusion) checkpoints
- Currently Cycle Diffusion only works with the [`DDIMScheduler`].
-
-*Example*:
-
-In the following we should how to best use the [`CycleDiffusionPipeline`]
-
-```python
-import requests
-import torch
-from PIL import Image
-from io import BytesIO
-
-from diffusers import CycleDiffusionPipeline, DDIMScheduler
-
-# load the pipeline
-# make sure you're logged in with `huggingface-cli login`
-model_id_or_path = "CompVis/stable-diffusion-v1-4"
-scheduler = DDIMScheduler.from_pretrained(model_id_or_path, subfolder="scheduler")
-pipe = CycleDiffusionPipeline.from_pretrained(model_id_or_path, scheduler=scheduler).to("cuda")
-
-# let's download an initial image
-url = "https://raw.githubusercontent.com/ChenWu98/cycle-diffusion/main/data/dalle2/An%20astronaut%20riding%20a%20horse.png"
-response = requests.get(url)
-init_image = Image.open(BytesIO(response.content)).convert("RGB")
-init_image = init_image.resize((512, 512))
-init_image.save("horse.png")
-
-# let's specify a prompt
-source_prompt = "An astronaut riding a horse"
-prompt = "An astronaut riding an elephant"
-
-# call the pipeline
-image = pipe(
-    prompt=prompt,
-    source_prompt=source_prompt,
-    image=init_image,
-    num_inference_steps=100,
-    eta=0.1,
-    strength=0.8,
-    guidance_scale=2,
-    source_guidance_scale=1,
-).images[0]
-
-image.save("horse_to_elephant.png")
-
-# let's try another example
-# See more samples at the original repo: https://github.com/ChenWu98/cycle-diffusion
-url = "https://raw.githubusercontent.com/ChenWu98/cycle-diffusion/main/data/dalle2/A%20black%20colored%20car.png"
-response = requests.get(url)
-init_image = Image.open(BytesIO(response.content)).convert("RGB")
-init_image = init_image.resize((512, 512))
-init_image.save("black.png")
-
-source_prompt = "A black colored car"
-prompt = "A blue colored car"
-
-# call the pipeline
-torch.manual_seed(0)
-image = pipe(
-    prompt=prompt,
-    source_prompt=source_prompt,
-    image=init_image,
-    num_inference_steps=100,
-    eta=0.1,
-    strength=0.85,
-    guidance_scale=3,
-    source_guidance_scale=1,
-).images[0]
-
-image.save("black_to_blue.png")
-```
-
-## CycleDiffusionPipeline
-[[autodoc]] CycleDiffusionPipeline
-	- __call__
--- a/docs/source/api/pipelines/ddim.mdx
+++ b/docs/source/api/pipelines/ddim.mdx
@@ -20,8 +20,7 @@ The abstract of the paper is the following:

 Denoising diffusion probabilistic models (DDPMs) have achieved high quality image generation without adversarial training, yet they require simulating a Markov chain for many steps to produce a sample. To accelerate sampling, we present denoising diffusion implicit models (DDIMs), a more efficient class of iterative implicit probabilistic models with the same training procedure as DDPMs. In DDPMs, the generative process is defined as the reverse of a Markovian diffusion process. We construct a class of non-Markovian diffusion processes that lead to the same training objective, but whose reverse process can be much faster to sample from. We empirically demonstrate that DDIMs can produce high quality samples 10× to 50× faster in terms of wall-clock time compared to DDPMs, allow us to trade off computation for sample quality, and can perform semantically meaningful image interpolation directly in the latent space.

-The original codebase of this paper can be found here: [ermongroup/ddim](https://github.com/ermongroup/ddim).
-For questions, feel free to contact the author on [tsong.me](https://tsong.me/).
+The original codebase of this paper can be found [here](https://github.com/ermongroup/ddim).

 ## Available Pipelines:

--- a/docs/source/api/pipelines/latent_diffusion.mdx
+++ b/docs/source/api/pipelines/latent_diffusion.mdx
@@ -33,15 +33,10 @@ The original codebase can be found [here](https://github.com/CompVis/latent-diff
 | Pipeline | Tasks | Colab
 |---|---|:---:|
 | [pipeline_latent_diffusion.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py) | *Text-to-Image Generation* | - |
-| [pipeline_latent_diffusion_superresolution.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py) | *Super Resolution* | - |

 ## Examples:


 ## LDMTextToImagePipeline
-[[autodoc]] LDMTextToImagePipeline
-    - __call__
-
-## LDMSuperResolutionPipeline
-[[autodoc]] LDMSuperResolutionPipeline
+[[autodoc]] pipelines.latent_diffusion.pipeline_latent_diffusion.LDMTextToImagePipeline
    - __call__
--- a/docs/source/api/pipelines/overview.mdx
+++ b/docs/source/api/pipelines/overview.mdx
@@ -28,7 +28,7 @@ or created independently from each other.

 To that end, we strive to offer all open-sourced, state-of-the-art diffusion system under a unified API. 
 More specifically, we strive to provide pipelines that
- 1. can load the officially published weights and yield 1-to-1 the same outputs as the original implementation according to the corresponding paper (*e.g.* [LDMTextToImagePipeline](https://github.com/huggingface/diffusers/tree/main/src/diffusers/pipelines/latent_diffusion), uses the officially released weights of [High-Resolution Image Synthesis with Latent Diffusion Models](https://arxiv.org/abs/2112.10752)),
+- 1. can load the officially published weights and yield 1-to-1 the same outputs as the original implementation according to the corresponding paper (*e.g.* [LDMTextToImagePipeline](https://github.com/huggingface/diffusers/tree/main/src/diffusers/pipelines/latent_diffusion), which uses the officially released weights of [High-Resolution Image Synthesis with Latent Diffusion Models](https://arxiv.org/abs/2112.10752)),
 - 2. have a simple user interface to run the model in inference (see the [Pipelines API](#pipelines-api) section), 
 - 3. are easy to understand with code that is self-explanatory and can be read along-side the official paper (see [Pipelines summary](#pipelines-summary)),
 - 4. can easily be contributed by the community (see the [Contribution](#contribution) section).
@@ -41,35 +41,19 @@ If you are looking for *official* training examples, please have a look at [exam
 The following table summarizes all officially supported pipelines, their corresponding paper, and if 
 available a colab notebook to directly try them out.

-
 | Pipeline | Paper | Tasks | Colab
 |---|---|:---:|:---:|
-| [alt_diffusion](./api/pipelines/alt_diffusion) | [**AltDiffusion**](https://arxiv.org/abs/2211.06679) | Image-to-Image Text-Guided Generation | -
-| [audio_diffusion](./api/pipelines/audio_diffusion) | [**Audio Diffusion**](https://github.com/teticio/audio_diffusion.git) | Unconditional Audio Generation |
-| [cycle_diffusion](./api/pipelines/cycle_diffusion) | [**Cycle Diffusion**](https://arxiv.org/abs/2210.05559) | Image-to-Image Text-Guided Generation |
-| [dance_diffusion](./api/pipelines/dance_diffusion) | [**Dance Diffusion**](https://github.com/williamberman/diffusers.git) | Unconditional Audio Generation |
-| [ddpm](./api/pipelines/ddpm) | [**Denoising Diffusion Probabilistic Models**](https://arxiv.org/abs/2006.11239) | Unconditional Image Generation |
-| [ddim](./api/pipelines/ddim) | [**Denoising Diffusion Implicit Models**](https://arxiv.org/abs/2010.02502) | Unconditional Image Generation |
-| [latent_diffusion](./api/pipelines/latent_diffusion) | [**High-Resolution Image Synthesis with Latent Diffusion Models**](https://arxiv.org/abs/2112.10752)| Text-to-Image Generation | 
-| [latent_diffusion](./api/pipelines/latent_diffusion) | [**High-Resolution Image Synthesis with Latent Diffusion Models**](https://arxiv.org/abs/2112.10752)| Super Resolution Image-to-Image | 
-| [latent_diffusion_uncond](./api/pipelines/latent_diffusion_uncond) | [**High-Resolution Image Synthesis with Latent Diffusion Models**](https://arxiv.org/abs/2112.10752) | Unconditional Image Generation | 
-| [paint_by_example](./api/pipelines/paint_by_example) | [**Paint by Example: Exemplar-based Image Editing with Diffusion Models**](https://arxiv.org/abs/2211.13227) | Image-Guided Image Inpainting | 
-| [pndm](./api/pipelines/pndm) | [**Pseudo Numerical Methods for Diffusion Models on Manifolds**](https://arxiv.org/abs/2202.09778) | Unconditional Image Generation | 
-| [score_sde_ve](./api/pipelines/score_sde_ve) | [**Score-Based Generative Modeling through Stochastic Differential Equations**](https://openreview.net/forum?id=PxTIG12RRHS) | Unconditional Image Generation | 
-| [score_sde_vp](./api/pipelines/score_sde_vp) | [**Score-Based Generative Modeling through Stochastic Differential Equations**](https://openreview.net/forum?id=PxTIG12RRHS) | Unconditional Image Generation | 
-| [stable_diffusion](./api/pipelines/stable_diffusion) | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release) | Text-to-Image Generation | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/training_example.ipynb)
-| [stable_diffusion](./api/pipelines/stable_diffusion) | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release) | Image-to-Image Text-Guided Generation | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/image_2_image_using_diffusers.ipynb)
-| [stable_diffusion](./api/pipelines/stable_diffusion) | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release) | Text-Guided Image Inpainting | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/in_painting_with_stable_diffusion_using_diffusers.ipynb)
-| [stable_diffusion_2](./api/pipelines/stable_diffusion_2) | [**Stable Diffusion 2**](https://stability.ai/blog/stable-diffusion-v2-release) | Text-to-Image Generation | 
-| [stable_diffusion_2](./api/pipelines/stable_diffusion_2) | [**Stable Diffusion 2**](https://stability.ai/blog/stable-diffusion-v2-release) | Text-Guided Image Inpainting | 
-| [stable_diffusion_2](./api/pipelines/stable_diffusion_2) | [**Stable Diffusion 2**](https://stability.ai/blog/stable-diffusion-v2-release) | Text-Guided Super Resolution Image-to-Image |
-| [stable_diffusion_safe](./api/pipelines/stable_diffusion_safe) | [**Safe Stable Diffusion**](https://arxiv.org/abs/2211.05105) | Text-Guided Generation | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ml-research/safe-latent-diffusion/blob/main/examples/Safe%20Latent%20Diffusion.ipynb)
-| [stochastic_karras_ve](./api/pipelines/stochastic_karras_ve) | [**Elucidating the Design Space of Diffusion-Based Generative Models**](https://arxiv.org/abs/2206.00364) | Unconditional Image Generation |
-| [versatile_diffusion](./api/pipelines/versatile_diffusion) | [Versatile Diffusion: Text, Images and Variations All in One Diffusion Model](https://arxiv.org/abs/2211.08332) | Text-to-Image Generation | 
-| [versatile_diffusion](./api/pipelines/versatile_diffusion) | [Versatile Diffusion: Text, Images and Variations All in One Diffusion Model](https://arxiv.org/abs/2211.08332) | Image Variations Generation | 
-| [versatile_diffusion](./api/pipelines/versatile_diffusion) | [Versatile Diffusion: Text, Images and Variations All in One Diffusion Model](https://arxiv.org/abs/2211.08332) | Dual Image and Text Guided Generation | 
-| [vq_diffusion](./api/pipelines/vq_diffusion) | [Vector Quantized Diffusion Model for Text-to-Image Synthesis](https://arxiv.org/abs/2111.14822) | Text-to-Image Generation | 
-
+| [ddpm](./ddpm) | [**Denoising Diffusion Probabilistic Models**](https://arxiv.org/abs/2006.11239) | Unconditional Image Generation |
+| [ddim](./ddim) | [**Denoising Diffusion Implicit Models**](https://arxiv.org/abs/2010.02502) | Unconditional Image Generation | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/training_example.ipynb)
+| [latent_diffusion](./latent_diffusion) | [**High-Resolution Image Synthesis with Latent Diffusion Models**](https://arxiv.org/abs/2112.10752)| Text-to-Image Generation | 
+| [latent_diffusion_uncond](./latent_diffusion_uncond) | [**High-Resolution Image Synthesis with Latent Diffusion Models**](https://arxiv.org/abs/2112.10752) | Unconditional Image Generation | 
+| [pndm](./pndm) | [**Pseudo Numerical Methods for Diffusion Models on Manifolds**](https://arxiv.org/abs/2202.09778) | Unconditional Image Generation | 
+| [score_sde_ve](./score_sde_ve) | [**Score-Based Generative Modeling through Stochastic Differential Equations**](https://openreview.net/forum?id=PxTIG12RRHS) | Unconditional Image Generation | 
+| [score_sde_vp](./score_sde_vp) | [**Score-Based Generative Modeling through Stochastic Differential Equations**](https://openreview.net/forum?id=PxTIG12RRHS) | Unconditional Image Generation | 
+| [stable_diffusion](./stable_diffusion) | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release) | Text-to-Image Generation | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/training_example.ipynb)
+| [stable_diffusion](./stable_diffusion) | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release) | Image-to-Image Text-Guided Generation | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/image_2_image_using_diffusers.ipynb)
+| [stable_diffusion](./stable_diffusion) | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release) | Text-Guided Image Inpainting | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/in_painting_with_stable_diffusion_using_diffusers.ipynb)
+| [stochastic_karras_ve](./stochastic_karras_ve) | [**Elucidating the Design Space of Diffusion-Based Generative Models**](https://arxiv.org/abs/2206.00364) | Unconditional Image Generation | 

 **Note**: Pipelines are simple examples of how to play around with the diffusion systems as described in the corresponding papers. 

@@ -151,7 +135,7 @@ init_image = init_image.resize((768, 512))

 prompt = "A fantasy landscape, trending on artstation"

-images = pipe(prompt=prompt, image=init_image, strength=0.75, guidance_scale=7.5).images
+images = pipe(prompt=prompt, init_image=init_image, strength=0.75, guidance_scale=7.5).images

 images[0].save("fantasy_landscape.png")
 ```
--- a/docs/source/api/pipelines/paint_by_example.mdx
+++ b/docs/source/api/pipelines/paint_by_example.mdx
@@ -1,73 +0,0 @@
-<!--Copyright 2022 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-->
-
-# PaintByExample
-
-## Overview
-
-[Paint by Example: Exemplar-based Image Editing with Diffusion Models](https://arxiv.org/abs/2211.13227) by Binxin Yang, Shuyang Gu, Bo Zhang, Ting Zhang, Xuejin Chen, Xiaoyan Sun, Dong Chen, Fang Wen
-
-The abstract of the paper is the following:
-
-*Language-guided image editing has achieved great success recently. In this paper, for the first time, we investigate exemplar-guided image editing for more precise control. We achieve this goal by leveraging self-supervised training to disentangle and re-organize the source image and the exemplar. However, the naive approach will cause obvious fusing artifacts. We carefully analyze it and propose an information bottleneck and strong augmentations to avoid the trivial solution of directly copying and pasting the exemplar image. Meanwhile, to ensure the controllability of the editing process, we design an arbitrary shape mask for the exemplar image and leverage the classifier-free guidance to increase the similarity to the exemplar image. The whole framework involves a single forward of the diffusion model without any iterative optimization. We demonstrate that our method achieves an impressive performance and enables controllable editing on in-the-wild images with high fidelity.*
-
-The original codebase can be found [here](https://github.com/Fantasy-Studio/Paint-by-Example).
-
-## Available Pipelines:
-
-| Pipeline | Tasks | Colab
-|---|---|:---:|
-| [pipeline_paint_by_example.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py) | *Image-Guided Image Painting* | - |
-
-## Tips
-
- PaintByExample is supported by the official [Fantasy-Studio/Paint-by-Example](https://huggingface.co/Fantasy-Studio/Paint-by-Example) checkpoint. The checkpoint was warm-started from [CompVis/stable-diffusion-v1-4](https://huggingface.co/CompVis/stable-diffusion-v1-4) with the objective of inpainting partly masked images conditioned on example / reference images.
- To quickly demo *PaintByExample*, please have a look at [this demo](https://huggingface.co/spaces/Fantasy-Studio/Paint-by-Example)
- You can run the following code snippet as an example:
-
-
-```python
-# !pip install diffusers transformers
-
-import PIL
-import requests
-import torch
-from io import BytesIO
-from diffusers import DiffusionPipeline
-
-
-def download_image(url):
-    response = requests.get(url)
-    return PIL.Image.open(BytesIO(response.content)).convert("RGB")
-
-
-img_url = "https://raw.githubusercontent.com/Fantasy-Studio/Paint-by-Example/main/examples/image/example_1.png"
-mask_url = "https://raw.githubusercontent.com/Fantasy-Studio/Paint-by-Example/main/examples/mask/example_1.png"
-example_url = "https://raw.githubusercontent.com/Fantasy-Studio/Paint-by-Example/main/examples/reference/example_1.jpg"
-
-init_image = download_image(img_url).resize((512, 512))
-mask_image = download_image(mask_url).resize((512, 512))
-example_image = download_image(example_url).resize((512, 512))
-
-pipe = DiffusionPipeline.from_pretrained(
-    "Fantasy-Studio/Paint-by-Example",
-    torch_dtype=torch.float16,
-)
-pipe = pipe.to("cuda")
-
-image = pipe(image=init_image, mask_image=mask_image, example_image=example_image).images[0]
-image
-```
-
-## PaintByExamplePipeline
-[[autodoc]] pipelines.paint_by_example.pipeline_paint_by_example.PaintByExamplePipeline
-    - __call__
--- a/docs/source/api/pipelines/repaint.mdx
+++ b/docs/source/api/pipelines/repaint.mdx
@@ -1,77 +0,0 @@
-<!--Copyright 2022 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-->
-
-# RePaint
-
-## Overview
-
-[RePaint: Inpainting using Denoising Diffusion Probabilistic Models](https://arxiv.org/abs/2201.09865) by Andreas Lugmayr, Martin Danelljan, Andres Romero, Fisher Yu, Radu Timofte, Luc Van Gool.
-
-The abstract of the paper is the following:
-
-Free-form inpainting is the task of adding new content to an image in the regions specified by an arbitrary binary mask. Most existing approaches train for a certain distribution of masks, which limits their generalization capabilities to unseen mask types. Furthermore, training with pixel-wise and perceptual losses often leads to simple textural extensions towards the missing areas instead of semantically meaningful generation. In this work, we propose RePaint: A Denoising Diffusion Probabilistic Model (DDPM) based inpainting approach that is applicable to even extreme masks. We employ a pretrained unconditional DDPM as the generative prior. To condition the generation process, we only alter the reverse diffusion iterations by sampling the unmasked regions using the given image information. Since this technique does not modify or condition the original DDPM network itself, the model produces high-quality and diverse output images for any inpainting form. We validate our method for both faces and general-purpose image inpainting using standard and extreme masks.
-RePaint outperforms state-of-the-art Autoregressive, and GAN approaches for at least five out of six mask distributions.
-
-The original codebase can be found [here](https://github.com/andreas128/RePaint).
-
-## Available Pipelines:
-
-| Pipeline                                                                                                                      | Tasks              | Colab
-|-------------------------------------------------------------------------------------------------------------------------------|--------------------|:---:|
-| [pipeline_repaint.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/repaint/pipeline_repaint.py) | *Image Inpainting* | - |
-
-## Usage example
-
-```python
-from io import BytesIO
-
-import torch
-
-import PIL
-import requests
-from diffusers import RePaintPipeline, RePaintScheduler
-
-
-def download_image(url):
-    response = requests.get(url)
-    return PIL.Image.open(BytesIO(response.content)).convert("RGB")
-
-
-img_url = "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/repaint/celeba_hq_256.png"
-mask_url = "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/repaint/mask_256.png"
-
-# Load the original image and the mask as PIL images
-original_image = download_image(img_url).resize((256, 256))
-mask_image = download_image(mask_url).resize((256, 256))
-
-# Load the RePaint scheduler and pipeline based on a pretrained DDPM model
-scheduler = RePaintScheduler.from_pretrained("google/ddpm-ema-celebahq-256")
-pipe = RePaintPipeline.from_pretrained("google/ddpm-ema-celebahq-256", scheduler=scheduler)
-pipe = pipe.to("cuda")
-
-generator = torch.Generator(device="cuda").manual_seed(0)
-output = pipe(
-    original_image=original_image,
-    mask_image=mask_image,
-    num_inference_steps=250,
-    eta=0.0,
-    jump_length=10,
-    jump_n_sample=10,
-    generator=generator,
-)
-inpainted_image = output.images[0]
-```
-
-## RePaintPipeline
-[[autodoc]] pipelines.repaint.pipeline_repaint.RePaintPipeline
-    - __call__
-
--- a/docs/source/api/pipelines/stable_diffusion.mdx
+++ b/docs/source/api/pipelines/stable_diffusion.mdx
@@ -31,25 +31,6 @@ For more details about how Stable Diffusion works and how it differs from the ba

 ## Tips

-### How to load and use different schedulers.
-
-The stable diffusion pipeline uses [`PNDMScheduler`] scheduler by default. But `diffusers` provides many other schedulers that can be used with the stable diffusion pipeline such as [`DDIMScheduler`], [`LMSDiscreteScheduler`], [`EulerDiscreteScheduler`], [`EulerAncestralDiscreteScheduler`] etc.
-To use a different scheduler, you can either change it via the [`ConfigMixin.from_config`] method or pass the `scheduler` argument to the `from_pretrained` method of the pipeline. For example, to use the [`EulerDiscreteScheduler`], you can do the following:
-
-```python
->>> from diffusers import StableDiffusionPipeline, EulerDiscreteScheduler
-
->>> pipeline = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4")
->>> pipeline.scheduler = EulerDiscreteScheduler.from_config(pipeline.scheduler.config)
-
->>> # or
->>> euler_scheduler = EulerDiscreteScheduler.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="scheduler")
->>> pipeline = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", scheduler=euler_scheduler)
-```
-
-
-### How to convert all use cases with multiple or single pipeline
-
 If you want to use all possible use cases in a single `DiffusionPipeline` you can either:
 - Make use of the [Stable Diffusion Mega Pipeline](https://github.com/huggingface/diffusers/tree/main/examples/community#stable-diffusion-mega) or 
 - Make use of the `components` functionality to instantiate all components in the most memory-efficient way:
@@ -61,11 +42,11 @@ If you want to use all possible use cases in a single `DiffusionPipeline` you ca
 ...     StableDiffusionInpaintPipeline,
 ... )

->>> text2img = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4")
->>> img2img = StableDiffusionImg2ImgPipeline(**text2img.components)
->>> inpaint = StableDiffusionInpaintPipeline(**text2img.components)
+>>> text2img = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4")
+>>> img2img = StableDiffusionImg2ImgPipeline(**text2img.components)
+>>> inpaint = StableDiffusionInpaintPipeline(**text2img.components)

->>> # now you can use text2img(...), img2img(...), inpaint(...) just like the call methods of each respective pipeline
+>>> # now you can use text2img(...), img2img(...), inpaint(...) just like the call methods of each respective pipeline
 ```

 ## StableDiffusionPipelineOutput
@@ -76,40 +57,15 @@ If you want to use all possible use cases in a single `DiffusionPipeline` you ca
 	- __call__
 	- enable_attention_slicing
 	- disable_attention_slicing
-	- enable_vae_slicing
-	- disable_vae_slicing
-	- enable_xformers_memory_efficient_attention
-	- disable_xformers_memory_efficient_attention

 ## StableDiffusionImg2ImgPipeline
 [[autodoc]] StableDiffusionImg2ImgPipeline
 	- __call__
 	- enable_attention_slicing
 	- disable_attention_slicing
-	- enable_xformers_memory_efficient_attention
-	- disable_xformers_memory_efficient_attention

 ## StableDiffusionInpaintPipeline
 [[autodoc]] StableDiffusionInpaintPipeline
 	- __call__
 	- enable_attention_slicing
 	- disable_attention_slicing
-	- enable_xformers_memory_efficient_attention
-	- disable_xformers_memory_efficient_attention
-
-## StableDiffusionImageVariationPipeline
-[[autodoc]] StableDiffusionImageVariationPipeline
-	- __call__
-	- enable_attention_slicing
-	- disable_attention_slicing
-	- enable_xformers_memory_efficient_attention
-	- disable_xformers_memory_efficient_attention
-
-
-## StableDiffusionUpscalePipeline
-[[autodoc]] StableDiffusionUpscalePipeline
-	- __call__
-	- enable_attention_slicing
-	- disable_attention_slicing
-	- enable_xformers_memory_efficient_attention
-	- disable_xformers_memory_efficient_attention
--- a/docs/source/api/pipelines/stable_diffusion_2.mdx
+++ b/docs/source/api/pipelines/stable_diffusion_2.mdx
@@ -1,142 +0,0 @@
-<!--Copyright 2022 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-->
-
-# Stable diffusion 2
-
-Stable Diffusion 2 is a text-to-image _latent diffusion_ model built upon the work of [Stable Diffusion 1](https://stability.ai/blog/stable-diffusion-public-release). 
-The project to train Stable Diffusion 2 was led by Robin Rombach and Katherine Crowson from [Stability AI](https://stability.ai/) and [LAION](https://laion.ai/).
-
-*The Stable Diffusion 2.0 release includes robust text-to-image models trained using a brand new text encoder (OpenCLIP), developed by LAION with support from Stability AI, which greatly improves the quality of the generated images compared to earlier V1 releases. The text-to-image models in this release can generate images with default resolutions of both 512x512 pixels and 768x768 pixels. 
-These models are trained on an aesthetic subset of the [LAION-5B dataset](https://laion.ai/blog/laion-5b/) created by the DeepFloyd team at Stability AI, which is then further filtered to remove adult content using [LAION’s NSFW filter](https://openreview.net/forum?id=M3Y74vmsMcY).*
-
-For more details about how Stable Diffusion 2 works and how it differs from Stable Diffusion 1, please refer to the official [launch announcement post](https://stability.ai/blog/stable-diffusion-v2-release).
-
-## Tips
-
-### Available checkpoints:
-
-Note that the architecture is more or less identical to [Stable Diffusion 1](./api/pipelines/stable_diffusion) so please refer to [this page](./api/pipelines/stable_diffusion) for API documentation.
-
- *Text-to-Image (512x512 resolution)*: [stabilityai/stable-diffusion-2-base](https://huggingface.co/stabilityai/stable-diffusion-2-base) with [`StableDiffusionPipeline`]
- *Text-to-Image (768x768 resolution)*: [stabilityai/stable-diffusion-2](https://huggingface.co/stabilityai/stable-diffusion-2) with [`StableDiffusionPipeline`]
- *Image Inpainting (512x512 resolution)*: [stabilityai/stable-diffusion-2-inpainting](https://huggingface.co/stabilityai/stable-diffusion-2-inpainting) with [`StableDiffusionInpaintPipeline`]
- *Image Upscaling (x4 resolution)*: [stabilityai/stable-diffusion-x4-upscaler](https://huggingface.co/stabilityai/stable-diffusion-x4-upscaler) with [`StableDiffusionUpscalePipeline`]
-
-We recommend using the [`DPMSolverMultistepScheduler`] as it's currently the fastest scheduler there is.
-
- *Text-to-Image (512x512 resolution)*:
-
-```python
-from diffusers import DiffusionPipeline, DPMSolverMultistepScheduler
-import torch
-
-repo_id = "stabilityai/stable-diffusion-2-base"
-pipe = DiffusionPipeline.from_pretrained(repo_id, torch_dtype=torch.float16, revision="fp16")
-
-pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
-pipe = pipe.to("cuda")
-
-prompt = "High quality photo of an astronaut riding a horse in space"
-image = pipe(prompt, num_inference_steps=25).images[0]
-image.save("astronaut.png")
-```
-
- *Text-to-Image (768x768 resolution)*:
-
-```python
-from diffusers import DiffusionPipeline, DPMSolverMultistepScheduler
-import torch
-
-repo_id = "stabilityai/stable-diffusion-2"
-pipe = DiffusionPipeline.from_pretrained(repo_id, torch_dtype=torch.float16, revision="fp16")
-
-pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
-pipe = pipe.to("cuda")
-
-prompt = "High quality photo of an astronaut riding a horse in space"
-image = pipe(prompt, guidance_scale=9, num_inference_steps=25).images[0]
-image.save("astronaut.png")
-```
-
- *Image Inpainting (512x512 resolution)*:
-
-```python
-import PIL
-import requests
-import torch
-from io import BytesIO
-
-from diffusers import DiffusionPipeline, DPMSolverMultistepScheduler
-
-
-def download_image(url):
-    response = requests.get(url)
-    return PIL.Image.open(BytesIO(response.content)).convert("RGB")
-
-
-img_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo.png"
-mask_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo_mask.png"
-
-init_image = download_image(img_url).resize((512, 512))
-mask_image = download_image(mask_url).resize((512, 512))
-
-repo_id = "stabilityai/stable-diffusion-2-inpainting"
-pipe = DiffusionPipeline.from_pretrained(repo_id, torch_dtype=torch.float16, revision="fp16")
-
-pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
-pipe = pipe.to("cuda")
-
-prompt = "Face of a yellow cat, high resolution, sitting on a park bench"
-image = pipe(prompt=prompt, image=init_image, mask_image=mask_image, num_inference_steps=25).images[0]
-
-image.save("yellow_cat.png")
-```
-
- *Image Upscaling (x4 resolution)*: [stable-diffusion-x4-upscaler](https://huggingface.co/stabilityai/stable-diffusion-x4-upscaler) [`StableDiffusionUpscalePipeline`]
-
-```python
-import requests
-from PIL import Image
-from io import BytesIO
-from diffusers import StableDiffusionUpscalePipeline
-import torch
-
-# load model and scheduler
-model_id = "stabilityai/stable-diffusion-x4-upscaler"
-pipeline = StableDiffusionUpscalePipeline.from_pretrained(model_id, revision="fp16", torch_dtype=torch.float16)
-pipeline = pipeline.to("cuda")
-
-# let's download an  image
-url = "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd2-upscale/low_res_cat.png"
-response = requests.get(url)
-low_res_img = Image.open(BytesIO(response.content)).convert("RGB")
-low_res_img = low_res_img.resize((128, 128))
-prompt = "a white cat"
-upscaled_image = pipeline(prompt=prompt, image=low_res_img).images[0]
-upscaled_image.save("upsampled_cat.png")
-```
-
-### How to load and use different schedulers.
-
-The stable diffusion pipeline uses [`DDIMScheduler`] scheduler by default. But `diffusers` provides many other schedulers that can be used with the stable diffusion pipeline such as [`PNDMScheduler`], [`LMSDiscreteScheduler`], [`EulerDiscreteScheduler`], [`EulerAncestralDiscreteScheduler`] etc.
-To use a different scheduler, you can either change it via the [`ConfigMixin.from_config`] method or pass the `scheduler` argument to the `from_pretrained` method of the pipeline. For example, to use the [`EulerDiscreteScheduler`], you can do the following:
-
-```python
->>> from diffusers import StableDiffusionPipeline, EulerDiscreteScheduler
-
->>> pipeline = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2")
->>> pipeline.scheduler = EulerDiscreteScheduler.from_config(pipeline.scheduler.config)
-
->>> # or
->>> euler_scheduler = EulerDiscreteScheduler.from_pretrained("stabilityai/stable-diffusion-2", subfolder="scheduler")
->>> pipeline = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2", scheduler=euler_scheduler)
-```
--- a/docs/source/api/pipelines/stable_diffusion_safe.mdx
+++ b/docs/source/api/pipelines/stable_diffusion_safe.mdx
@@ -1,90 +0,0 @@
-<!--Copyright 2022 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-->
-
-# Safe Stable Diffusion
-
-Safe Stable Diffusion was proposed in [Safe Latent Diffusion: Mitigating Inappropriate Degeneration in Diffusion Models](https://arxiv.org/abs/2211.05105) and mitigates the well known issue that models like Stable Diffusion that are trained on unfiltered, web-crawled datasets tend to suffer from inappropriate degeneration. For instance Stable Diffusion may unexpectedly generate nudity, violence, images depicting self-harm, or otherwise offensive content.
-Safe Stable Diffusion is an extension to the Stable Diffusion that drastically reduces content like this.
-
-The abstract of the paper is the following:
-
-*Text-conditioned image generation models have recently achieved astonishing results in image quality and text alignment and are consequently employed in a fast-growing number of applications. Since they are highly data-driven, relying on billion-sized datasets randomly scraped from the internet, they also suffer, as we demonstrate, from degenerated and biased human behavior. In turn, they may even reinforce such biases. To help combat these undesired side effects, we present safe latent diffusion (SLD). Specifically, to measure the inappropriate degeneration due to unfiltered and imbalanced training sets, we establish a novel image generation test bed-inappropriate image prompts (I2P)-containing dedicated, real-world image-to-text prompts covering concepts such as nudity and violence. As our exhaustive empirical evaluation demonstrates, the introduced SLD removes and suppresses inappropriate image parts during the diffusion process, with no additional training required and no adverse effect on overall image quality or text alignment.*
-
-
-*Overview*:
-
-| Pipeline | Tasks | Colab | Demo
-|---|---|:---:|:---:|
-| [pipeline_stable_diffusion_safe.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py) | *Text-to-Image Generation* |  [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ml-research/safe-latent-diffusion/blob/main/examples/Safe%20Latent%20Diffusion.ipynb) | -
-
-## Tips
-
- Safe Stable Diffusion may also be used with weights of [Stable Diffusion](./api/pipelines/stable_diffusion).
-
-### Run Safe Stable Diffusion
-
-Safe Stable Diffusion can be tested very easily with the [`StableDiffusionPipelineSafe`], and the `"AIML-TUDA/stable-diffusion-safe"` checkpoint exactly in the same way it is shown in the [Conditional Image Generation Guide](./using-diffusers/conditional_image_generation).
-
-### Interacting with the Safety Concept
-
-To check and edit the currently used safety concept, use the `safety_concept` property of [`StableDiffusionPipelineSafe`]
-```python
->>> from diffusers import StableDiffusionPipelineSafe
-
->>> pipeline = StableDiffusionPipelineSafe.from_pretrained("AIML-TUDA/stable-diffusion-safe")
->>> pipeline.safety_concept
-```
-For each image generation the active concept is also contained in [`StableDiffusionSafePipelineOutput`].
-
-### Using pre-defined safety configurations
-
-You may use the 4 configurations defined in the [Safe Latent Diffusion paper](https://arxiv.org/abs/2211.05105) as follows:
-
-```python
->>> from diffusers import StableDiffusionPipelineSafe
->>> from diffusers.pipelines.stable_diffusion_safe import SafetyConfig
-
->>> pipeline = StableDiffusionPipelineSafe.from_pretrained("AIML-TUDA/stable-diffusion-safe")
->>> prompt = "the four horsewomen of the apocalypse, painting by tom of finland, gaston bussiere, craig mullins, j. c. leyendecker"
->>> out = pipeline(prompt=prompt, **SafetyConfig.MAX)
-```
-
-The following configurations are available: `SafetyConfig.WEAK`, `SafetyConfig.MEDIUM`, `SafetyConfig.STRONG`, and `SafetyConfig.MAX`.
-
-### How to load and use different schedulers.
-
-The safe stable diffusion pipeline uses [`PNDMScheduler`] scheduler by default. But `diffusers` provides many other schedulers that can be used with the stable diffusion pipeline such as [`DDIMScheduler`], [`LMSDiscreteScheduler`], [`EulerDiscreteScheduler`], [`EulerAncestralDiscreteScheduler`] etc.
-To use a different scheduler, you can either change it via the [`ConfigMixin.from_config`] method or pass the `scheduler` argument to the `from_pretrained` method of the pipeline. For example, to use the [`EulerDiscreteScheduler`], you can do the following:
-
-```python
->>> from diffusers import StableDiffusionPipelineSafe, EulerDiscreteScheduler
-
->>> pipeline = StableDiffusionPipelineSafe.from_pretrained("AIML-TUDA/stable-diffusion-safe")
->>> pipeline.scheduler = EulerDiscreteScheduler.from_config(pipeline.scheduler.config)
-
->>> # or
->>> euler_scheduler = EulerDiscreteScheduler.from_pretrained("AIML-TUDA/stable-diffusion-safe", subfolder="scheduler")
->>> pipeline = StableDiffusionPipelineSafe.from_pretrained(
-...     "AIML-TUDA/stable-diffusion-safe", scheduler=euler_scheduler
-... )
-```
-
-
-## StableDiffusionSafePipelineOutput
-[[autodoc]] pipelines.stable_diffusion_safe.StableDiffusionSafePipelineOutput
-
-## StableDiffusionPipelineSafe
-[[autodoc]] StableDiffusionPipelineSafe
-	- __call__
-	- enable_attention_slicing
-	- disable_attention_slicing
-
--- a/docs/source/api/pipelines/versatile_diffusion.mdx
+++ b/docs/source/api/pipelines/versatile_diffusion.mdx
@@ -1,73 +0,0 @@
-<!--Copyright 2022 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-->
-
-# VersatileDiffusion
-
-VersatileDiffusion was proposed in [Versatile Diffusion: Text, Images and Variations All in One Diffusion Model](https://arxiv.org/abs/2211.08332) by Xingqian Xu, Zhangyang Wang, Eric Zhang, Kai Wang, Humphrey Shi.
-
-The abstract of the paper is the following:
-
-*The recent advances in diffusion models have set an impressive milestone in many generation tasks. Trending works such as DALL-E2, Imagen, and Stable Diffusion have attracted great interest in academia and industry. Despite the rapid landscape changes, recent new approaches focus on extensions and performance rather than capacity, thus requiring separate models for separate tasks. In this work, we expand the existing single-flow diffusion pipeline into a multi-flow network, dubbed Versatile Diffusion (VD), that handles text-to-image, image-to-text, image-variation, and text-variation in one unified model. Moreover, we generalize VD to a unified multi-flow multimodal diffusion framework with grouped layers, swappable streams, and other propositions that can process modalities beyond images and text. Through our experiments, we demonstrate that VD and its underlying framework have the following merits: a) VD handles all subtasks with competitive quality; b) VD initiates novel extensions and applications such as disentanglement of style and semantic, image-text dual-guided generation, etc.; c) Through these experiments and applications, VD provides more semantic insights of the generated outputs.*
-
-## Tips
-
- VersatileDiffusion is conceptually very similar to [Stable Diffusion](./api/pipelines/stable_diffusion), but instead of providing just an image data stream conditioned on text, VersatileDiffusion provides both an image and a text data stream and can be conditioned on both text and image.
-
-### *Run VersatileDiffusion*
-
-You can either load the memory-intensive "all-in-one" [`VersatileDiffusionPipeline`], which can run all tasks
-with the same class, as shown in [`VersatileDiffusionPipeline.text_to_image`], [`VersatileDiffusionPipeline.image_variation`], and [`VersatileDiffusionPipeline.dual_guided`]
-
-**or**
-
-You can run the individual pipelines which are much more memory efficient:
-
- *Text-to-Image*: [`VersatileDiffusionTextToImagePipeline.__call__`]
- *Image Variation*: [`VersatileDiffusionImageVariationPipeline.__call__`]
- *Dual Text and Image Guided Generation*: [`VersatileDiffusionDualGuidedPipeline.__call__`]
-
-### *How to load and use different schedulers.*
-
-The versatile diffusion pipelines use the [`DDIMScheduler`] scheduler by default. But `diffusers` provides many other schedulers that can be used with the versatile diffusion pipelines such as [`PNDMScheduler`], [`LMSDiscreteScheduler`], [`EulerDiscreteScheduler`], [`EulerAncestralDiscreteScheduler`] etc.
-To use a different scheduler, you can either change it via the [`ConfigMixin.from_config`] method or pass the `scheduler` argument to the `from_pretrained` method of the pipeline. For example, to use the [`EulerDiscreteScheduler`], you can do the following:
-
-```python
->>> from diffusers import VersatileDiffusionPipeline, EulerDiscreteScheduler
-
->>> pipeline = VersatileDiffusionPipeline.from_pretrained("shi-labs/versatile-diffusion")
->>> pipeline.scheduler = EulerDiscreteScheduler.from_config(pipeline.scheduler.config)
-
->>> # or
->>> euler_scheduler = EulerDiscreteScheduler.from_pretrained("shi-labs/versatile-diffusion", subfolder="scheduler")
->>> pipeline = VersatileDiffusionPipeline.from_pretrained("shi-labs/versatile-diffusion", scheduler=euler_scheduler)
-```
-
-## VersatileDiffusionPipeline
-[[autodoc]] VersatileDiffusionPipeline
-
-## VersatileDiffusionTextToImagePipeline
-[[autodoc]] VersatileDiffusionTextToImagePipeline
-	- __call__
-	- enable_attention_slicing
-	- disable_attention_slicing
-
-## VersatileDiffusionImageVariationPipeline
-[[autodoc]] VersatileDiffusionImageVariationPipeline
-	- __call__
-	- enable_attention_slicing
-	- disable_attention_slicing
-
-## VersatileDiffusionDualGuidedPipeline
-[[autodoc]] VersatileDiffusionDualGuidedPipeline
-	- __call__
-	- enable_attention_slicing
-	- disable_attention_slicing
--- a/docs/source/api/pipelines/vq_diffusion.mdx
+++ b/docs/source/api/pipelines/vq_diffusion.mdx
@@ -1,34 +0,0 @@
-<!--Copyright 2022 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-->
-
-# VQDiffusion
-
-## Overview
-
-[Vector Quantized Diffusion Model for Text-to-Image Synthesis](https://arxiv.org/abs/2111.14822) by Shuyang Gu, Dong Chen, Jianmin Bao, Fang Wen, Bo Zhang, Dongdong Chen, Lu Yuan, Baining Guo
-
-The abstract of the paper is the following:
-
-We present the vector quantized diffusion (VQ-Diffusion) model for text-to-image generation. This method is based on a vector quantized variational autoencoder (VQ-VAE) whose latent space is modeled by a conditional variant of the recently developed Denoising Diffusion Probabilistic Model (DDPM). We find that this latent-space method is well-suited for text-to-image generation tasks because it not only eliminates the unidirectional bias with existing methods but also allows us to incorporate a mask-and-replace diffusion strategy to avoid the accumulation of errors, which is a serious problem with existing methods. Our experiments show that the VQ-Diffusion produces significantly better text-to-image generation results when compared with conventional autoregressive (AR) models with similar numbers of parameters. Compared with previous GAN-based text-to-image methods, our VQ-Diffusion can handle more complex scenes and improve the synthesized image quality by a large margin. Finally, we show that the image generation computation in our method can be made highly efficient by reparameterization. With traditional AR methods, the text-to-image generation time increases linearly with the output image resolution and hence is quite time consuming even for normal size images. The VQ-Diffusion allows us to achieve a better trade-off between quality and speed. Our experiments indicate that the VQ-Diffusion model with the reparameterization is fifteen times faster than traditional AR methods while achieving a better image quality.
-
-The original codebase can be found [here](https://github.com/microsoft/VQ-Diffusion).
-
-## Available Pipelines:
-
-| Pipeline | Tasks | Colab
-|---|---|:---:|
-| [pipeline_vq_diffusion.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/vq_diffusion/pipeline_vq_diffusion.py) | *Text-to-Image Generation* | - |
-
-
-## VQDiffusionPipeline
-[[autodoc]] pipelines.vq_diffusion.pipeline_vq_diffusion.VQDiffusionPipeline
-    - __call__
--- a/docs/source/api/schedulers.mdx
+++ b/docs/source/api/schedulers.mdx
@@ -16,7 +16,7 @@ Diffusers contains multiple pre-built schedule functions for the diffusion proce

 ## What is a scheduler?

-The schedule functions, denoted *Schedulers* in the library take in the output of a trained model, a sample which the diffusion process is iterating on, and a timestep to return a denoised sample. That's why schedulers may also be called *Samplers* in other diffusion models implementations.
+The schedule functions, denoted *Schedulers* in the library, take in the output of a trained model, a sample which the diffusion process is iterating on, and a timestep to return a denoised sample.

 - Schedulers define the methodology for iteratively adding noise to an image or for updating a sample based on model outputs.
    - adding noise in different manners represent the algorithmic processes to train a diffusion model by adding noise to images.
@@ -70,45 +70,6 @@ Original paper can be found [here](https://arxiv.org/abs/2010.02502).

 [[autodoc]] DDPMScheduler

-#### Singlestep DPM-Solver
-
-Original paper can be found [here](https://arxiv.org/abs/2206.00927) and the [improved version](https://arxiv.org/abs/2211.01095). The original implementation can be found [here](https://github.com/LuChengTHU/dpm-solver).
-
-[[autodoc]] DPMSolverSinglestepScheduler
-
-#### Multistep DPM-Solver
-
-Original paper can be found [here](https://arxiv.org/abs/2206.00927) and the [improved version](https://arxiv.org/abs/2211.01095). The original implementation can be found [here](https://github.com/LuChengTHU/dpm-solver).
-
-[[autodoc]] DPMSolverMultistepScheduler
-
-#### Heun scheduler inspired by Karras et. al paper
-
-Algorithm 1 of [Karras et. al](https://arxiv.org/abs/2206.00364).
-Scheduler ported from @crowsonkb's https://github.com/crowsonkb/k-diffusion library:
-
-All credit for making this scheduler work goes to [Katherine Crowson](https://github.com/crowsonkb/)
-
-[[autodoc]] HeunDiscreteScheduler
-
-#### DPM Discrete Scheduler inspired by Karras et. al paper
-
-Inspired by [Karras et. al](https://arxiv.org/abs/2206.00364).
-Scheduler ported from @crowsonkb's https://github.com/crowsonkb/k-diffusion library:
-
-All credit for making this scheduler work goes to [Katherine Crowson](https://github.com/crowsonkb/)
-
-[[autodoc]] KDPM2DiscreteScheduler
-
-#### DPM Discrete Scheduler with ancestral sampling inspired by Karras et. al paper
-
-Inspired by [Karras et. al](https://arxiv.org/abs/2206.00364).
-Scheduler ported from @crowsonkb's https://github.com/crowsonkb/k-diffusion library:
-
-All credit for making this scheduler work goes to [Katherine Crowson](https://github.com/crowsonkb/)
-
-[[autodoc]] KDPM2AncestralDiscreteScheduler
-
 #### Variance exploding, stochastic sampling from Karras et. al

 Original paper can be found [here](https://arxiv.org/abs/2006.11239).
@@ -119,6 +80,7 @@ Original paper can be found [here](https://arxiv.org/abs/2006.11239).

 Original implementation can be found [here](https://arxiv.org/abs/2206.00364).

+
 [[autodoc]] LMSDiscreteScheduler

 #### Pseudo numerical methods for diffusion models (PNDM)
@@ -150,34 +112,3 @@ Score SDE-VP is under construction.
 </Tip>

 [[autodoc]] schedulers.scheduling_sde_vp.ScoreSdeVpScheduler
-
-#### Euler scheduler
-
-Euler scheduler (Algorithm 2) from the paper [Elucidating the Design Space of Diffusion-Based Generative Models](https://arxiv.org/abs/2206.00364) by Karras et al. (2022). Based on the original [k-diffusion](https://github.com/crowsonkb/k-diffusion/blob/481677d114f6ea445aa009cf5bd7a9cdee909e47/k_diffusion/sampling.py#L51) implementation by Katherine Crowson.
-Fast scheduler which often times generates good outputs with 20-30 steps.
-
-[[autodoc]] EulerDiscreteScheduler
-
-
-#### Euler Ancestral scheduler
-
-Ancestral sampling with Euler method steps. Based on the original (k-diffusion)[https://github.com/crowsonkb/k-diffusion/blob/481677d114f6ea445aa009cf5bd7a9cdee909e47/k_diffusion/sampling.py#L72] implementation by Katherine Crowson.
-Fast scheduler which often times generates good outputs with 20-30 steps.
-
-[[autodoc]] EulerAncestralDiscreteScheduler
-
-
-#### VQDiffusionScheduler
-
-Original paper can be found [here](https://arxiv.org/abs/2111.14822)
-
-[[autodoc]] VQDiffusionScheduler
-
-#### RePaint scheduler
-
-DDPM-based inpainting scheduler for unsupervised inpainting with extreme masks. 
-Intended for use with [`RePaintPipeline`].
-Based on the paper [RePaint: Inpainting using Denoising Diffusion Probabilistic Models](https://arxiv.org/abs/2201.09865) 
-and the original implementation by Andreas Lugmayr et al.: https://github.com/andreas128/RePaint
-
-[[autodoc]] RePaintScheduler
--- a/docs/source/imgs/access_request.png
+++ b/docs/source/imgs/access_request.png
--- a/docs/source/index.mdx
+++ b/docs/source/index.mdx
@@ -34,30 +34,16 @@ available a colab notebook to directly try them out.

 | Pipeline | Paper | Tasks | Colab
 |---|---|:---:|:---:|
-| [alt_diffusion](./api/pipelines/alt_diffusion) | [**AltDiffusion**](https://arxiv.org/abs/2211.06679) | Image-to-Image Text-Guided Generation |
-| [audio_diffusion](./api/pipelines/audio_diffusion) | [**Audio Diffusion**](https://github.com/teticio/audio-diffusion.git) | Unconditional Audio Generation | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/teticio/audio-diffusion/blob/master/notebooks/audio_diffusion_pipeline.ipynb)
-| [cycle_diffusion](./api/pipelines/cycle_diffusion) | [**Cycle Diffusion**](https://arxiv.org/abs/2210.05559) | Image-to-Image Text-Guided Generation |
-| [dance_diffusion](./api/pipelines/dance_diffusion) | [**Dance Diffusion**](https://github.com/williamberman/diffusers.git) | Unconditional Audio Generation |
 | [ddpm](./api/pipelines/ddpm) | [**Denoising Diffusion Probabilistic Models**](https://arxiv.org/abs/2006.11239) | Unconditional Image Generation |
 | [ddim](./api/pipelines/ddim) | [**Denoising Diffusion Implicit Models**](https://arxiv.org/abs/2010.02502) | Unconditional Image Generation |
 | [latent_diffusion](./api/pipelines/latent_diffusion) | [**High-Resolution Image Synthesis with Latent Diffusion Models**](https://arxiv.org/abs/2112.10752)| Text-to-Image Generation | 
-| [latent_diffusion](./api/pipelines/latent_diffusion) | [**High-Resolution Image Synthesis with Latent Diffusion Models**](https://arxiv.org/abs/2112.10752)| Super Resolution Image-to-Image | 
 | [latent_diffusion_uncond](./api/pipelines/latent_diffusion_uncond) | [**High-Resolution Image Synthesis with Latent Diffusion Models**](https://arxiv.org/abs/2112.10752) | Unconditional Image Generation | 
-| [paint_by_example](./api/pipelines/paint_by_example) | [**Paint by Example: Exemplar-based Image Editing with Diffusion Models**](https://arxiv.org/abs/2211.13227) | Image-Guided Image Inpainting | 
 | [pndm](./api/pipelines/pndm) | [**Pseudo Numerical Methods for Diffusion Models on Manifolds**](https://arxiv.org/abs/2202.09778) | Unconditional Image Generation | 
 | [score_sde_ve](./api/pipelines/score_sde_ve) | [**Score-Based Generative Modeling through Stochastic Differential Equations**](https://openreview.net/forum?id=PxTIG12RRHS) | Unconditional Image Generation | 
 | [score_sde_vp](./api/pipelines/score_sde_vp) | [**Score-Based Generative Modeling through Stochastic Differential Equations**](https://openreview.net/forum?id=PxTIG12RRHS) | Unconditional Image Generation | 
 | [stable_diffusion](./api/pipelines/stable_diffusion) | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release) | Text-to-Image Generation | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/training_example.ipynb)
 | [stable_diffusion](./api/pipelines/stable_diffusion) | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release) | Image-to-Image Text-Guided Generation | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/image_2_image_using_diffusers.ipynb)
 | [stable_diffusion](./api/pipelines/stable_diffusion) | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release) | Text-Guided Image Inpainting | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/in_painting_with_stable_diffusion_using_diffusers.ipynb)
-| [stable_diffusion_2](./api/pipelines/stable_diffusion_2) | [**Stable Diffusion 2**](https://stability.ai/blog/stable-diffusion-v2-release) | Text-to-Image Generation | 
-| [stable_diffusion_2](./api/pipelines/stable_diffusion_2) | [**Stable Diffusion 2**](https://stability.ai/blog/stable-diffusion-v2-release) | Text-Guided Image Inpainting | 
-| [stable_diffusion_2](./api/pipelines/stable_diffusion_2) | [**Stable Diffusion 2**](https://stability.ai/blog/stable-diffusion-v2-release) | Text-Guided Super Resolution Image-to-Image |
-| [stable_diffusion_safe](./api/pipelines/stable_diffusion_safe) | [**Safe Stable Diffusion**](https://arxiv.org/abs/2211.05105) | Text-Guided Generation | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ml-research/safe-latent-diffusion/blob/main/examples/Safe%20Latent%20Diffusion.ipynb)
 | [stochastic_karras_ve](./api/pipelines/stochastic_karras_ve) | [**Elucidating the Design Space of Diffusion-Based Generative Models**](https://arxiv.org/abs/2206.00364) | Unconditional Image Generation | 
-| [versatile_diffusion](./api/pipelines/versatile_diffusion) | [Versatile Diffusion: Text, Images and Variations All in One Diffusion Model](https://arxiv.org/abs/2211.08332) | Text-to-Image Generation | 
-| [versatile_diffusion](./api/pipelines/versatile_diffusion) | [Versatile Diffusion: Text, Images and Variations All in One Diffusion Model](https://arxiv.org/abs/2211.08332) | Image Variations Generation | 
-| [versatile_diffusion](./api/pipelines/versatile_diffusion) | [Versatile Diffusion: Text, Images and Variations All in One Diffusion Model](https://arxiv.org/abs/2211.08332) | Dual Image and Text Guided Generation | 
-| [vq_diffusion](./api/pipelines/vq_diffusion) | [Vector Quantized Diffusion Model for Text-to-Image Synthesis](https://arxiv.org/abs/2111.14822) | Text-to-Image Generation | 

 **Note**: Pipelines are simple examples of how to play around with the diffusion systems as described in the corresponding papers. 
--- a/docs/source/installation.mdx
+++ b/docs/source/installation.mdx
@@ -12,12 +12,9 @@ specific language governing permissions and limitations under the License.

 # Installation

-Install 🤗 Diffusers for whichever deep learning library you’re working with.
+Install Diffusers with PyTorch. Support for other libraries will come in the future.

-🤗 Diffusers is tested on Python 3.7+, PyTorch 1.7.0+ and flax. Follow the installation instructions below for the deep learning library you are using:
-
- [PyTorch](https://pytorch.org/get-started/locally/) installation instructions.
- [Flax](https://flax.readthedocs.io/en/latest/) installation instructions.
+🤗 Diffusers is tested on Python 3.7+ and PyTorch 1.7.0+.

 ## Install with pip

@@ -39,30 +36,12 @@ source .env/bin/activate

 Now you're ready to install 🤗 Diffusers with the following command:

-**For PyTorch**
-
 ```bash
-pip install diffusers["torch"]
-```
-
-**For Flax**
-
-```bash
-pip install diffusers["flax"]
+pip install diffusers
 ```

 ## Install from source

-Before intsalling `diffusers` from source, make sure you have `torch` and `accelerate` installed.
-
-For `torch` installation refer to the `torch` [docs](https://pytorch.org/get-started/locally/#start-locally).
-
-To install `accelerate`
-
-```bash
-pip install accelerate
-```
-
 Install 🤗 Diffusers from source with the following command:

 ```bash
@@ -88,18 +67,7 @@ Clone the repository and install 🤗 Diffusers with the following commands:
 ```bash
 git clone https://github.com/huggingface/diffusers.git
 cd diffusers
-```
-
-**For PyTorch**
-
-```
-pip install -e ".[torch]"
-```
-
-**For Flax**
-
-```
-pip install -e ".[flax]"
+pip install -e .
 ```

 These commands will link the folder you cloned the repository to and your Python library paths.
@@ -120,24 +88,3 @@ git pull
 ```

 Your Python environment will find the `main` version of 🤗 Diffusers on the next run.
-
-## Notice on telemetry logging
-
-Our library gathers telemetry information during `from_pretrained()` requests.
-This data includes the version of Diffusers and PyTorch/Flax, the requested model or pipeline class,
-and the path to a pretrained checkpoint if it is hosted on the Hub.
-This usage data helps us debug issues and prioritize new features.
-No private data, such as paths to models saved locally on disk, is ever collected.
-
-We understand that not everyone wants to share additional information, and we respect your privacy,
-so you can disable telemetry collection by setting the `DISABLE_TELEMETRY` environment variable from your terminal:
-
-On Linux/MacOS:
-```bash
-export DISABLE_TELEMETRY=YES
-```
-
-On Windows:
-```bash
-set DISABLE_TELEMETRY=YES
-```
--- a/docs/source/optimization/fp16.mdx
+++ b/docs/source/optimization/fp16.mdx
@@ -22,7 +22,6 @@ We present some techniques and ideas to optimize 🤗 Diffusers _inference_ for
 | fp16             | 3.61s   | x2.63   |
 | channels last    | 3.30s   | x2.88   |
 | traced UNet      | 3.21s   | x2.96   |
-| memory efficient attention  | 2.63s  | x3.61   |

 <em>
  obtained on NVIDIA TITAN RTX by generating a single image of size 512x512 from
@@ -117,34 +116,6 @@ image = pipe(prompt).images[0]

 There's a small performance penalty of about 10% slower inference times, but this method allows you to use Stable Diffusion in as little as 3.2 GB of VRAM!

-
-## Sliced VAE decode for larger batches
-
-To decode large batches of images with limited VRAM, or to enable batches with 32 images or more, you can use sliced VAE decode that decodes the batch latents one image at a time.
-
-You likely want to couple this with [`~StableDiffusionPipeline.enable_attention_slicing`] or [`~StableDiffusionPipeline.enable_xformers_memory_efficient_attention`] to further minimize memory use.
-
-To perform the VAE decode one image at a time, invoke [`~StableDiffusionPipeline.enable_vae_slicing`] in your pipeline before inference. For example:
-
-```Python
-import torch
-from diffusers import StableDiffusionPipeline
-
-pipe = StableDiffusionPipeline.from_pretrained(
-    "runwayml/stable-diffusion-v1-5",
-    revision="fp16",
-    torch_dtype=torch.float16,
-)
-pipe = pipe.to("cuda")
-
-prompt = "a photo of an astronaut riding a horse on mars"
-pipe.enable_vae_slicing()
-images = pipe([prompt] * 32).images
-```
-
-You may see a small performance boost in VAE decode on multi-image batches. There should be no performance impact on single-image batches.
-
-
 ## Offloading to CPU with accelerate for memory savings

 For additional memory savings, you can offload the weights to CPU and load them to GPU when performing the forward pass.
@@ -319,41 +290,3 @@ pipe.unet = TracedUNet()
 with torch.inference_mode():
    image = pipe([prompt] * 1, num_inference_steps=50).images[0]
 ```
-
-
-## Memory Efficient Attention
-Recent work on optimizing the bandwitdh in the attention block have generated huge speed ups and gains in GPU memory usage. The most recent being Flash Attention (from @tridao, [code](https://github.com/HazyResearch/flash-attention), [paper](https://arxiv.org/pdf/2205.14135.pdf)) .
-Here are the speedups we obtain on a few Nvidia GPUs when running the inference at 512x512 with a batch size of 1 (one prompt):
-
-| GPU              	| Base Attention FP16 	| Memory Efficient Attention FP16 	|
-|------------------	|---------------------	|---------------------------------	|
-| NVIDIA Tesla T4  	| 3.5it/s             	| 5.5it/s                         	|
-| NVIDIA 3060 RTX  	| 4.6it/s             	| 7.8it/s                         	|
-| NVIDIA A10G      	| 8.88it/s            	| 15.6it/s                        	|
-| NVIDIA RTX A6000 	| 11.7it/s            	| 21.09it/s                       	|
-| NVIDIA TITAN RTX  | 12.51it/s         	| 18.22it/s                       	|
-| A100-SXM4-40GB    	| 18.6it/s            	| 29.it/s                        	|
-| A100-SXM-80GB    	| 18.7it/s            	| 29.5it/s                        	|
-
-To leverage it just make sure you have: 
- - PyTorch > 1.12
- - Cuda available
- - Installed the [xformers](https://github.com/facebookresearch/xformers) library
-```python
-from diffusers import StableDiffusionPipeline
-import torch
-
-pipe = StableDiffusionPipeline.from_pretrained(
-    "runwayml/stable-diffusion-v1-5",
-    revision="fp16",
-    torch_dtype=torch.float16,
-).to("cuda")
-
-pipe.enable_xformers_memory_efficient_attention()
-
-with torch.inference_mode():
-    sample = pipe("a small cat")
-
-# optional: You can disable it via
-# pipe.disable_xformers_memory_efficient_attention()
-```
--- a/docs/source/optimization/habana.mdx
+++ b/docs/source/optimization/habana.mdx
@@ -1,70 +0,0 @@
-<!--Copyright 2022 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-->
-
-# How to use Stable Diffusion on Habana Gaudi
-
-🤗 Diffusers is compatible with Habana Gaudi through 🤗 [Optimum Habana](https://huggingface.co/docs/optimum/habana/usage_guides/stable_diffusion).
-
-## Requirements
-
- Optimum Habana 1.3 or later, [here](https://huggingface.co/docs/optimum/habana/installation) is how to install it.
- SynapseAI 1.7.
-
-
-## Inference Pipeline
-
-To generate images with Stable Diffusion 1 and 2 on Gaudi, you need to instantiate two instances:
- A pipeline with [`GaudiStableDiffusionPipeline`](https://huggingface.co/docs/optimum/habana/package_reference/stable_diffusion_pipeline). This pipeline supports *text-to-image generation*.
- A scheduler with [`GaudiDDIMScheduler`](https://huggingface.co/docs/optimum/habana/package_reference/stable_diffusion_pipeline#optimum.habana.diffusers.GaudiDDIMScheduler). This scheduler has been optimized for Habana Gaudi.
-
-When initializing the pipeline, you have to specify `use_habana=True` to deploy it on HPUs.
-Furthermore, in order to get the fastest possible generations you should enable **HPU graphs** with `use_hpu_graphs=True`.
-Finally, you will need to specify a [Gaudi configuration](https://huggingface.co/docs/optimum/habana/package_reference/gaudi_config) which can be downloaded from the [Hugging Face Hub](https://huggingface.co/Habana).
-
-```python
-from optimum.habana import GaudiConfig
-from optimum.habana.diffusers import GaudiDDIMScheduler, GaudiStableDiffusionPipeline
-
-model_name = "stabilityai/stable-diffusion-2-base"
-scheduler = GaudiDDIMScheduler.from_pretrained(model_name, subfolder="scheduler")
-pipeline = GaudiStableDiffusionPipeline.from_pretrained(
-    model_name,
-    scheduler=scheduler,
-    use_habana=True,
-    use_hpu_graphs=True,
-    gaudi_config="Habana/stable-diffusion",
-)
-```
-
-You can then call the pipeline to generate images by batches from one or several prompts:
-```python
-outputs = pipeline(
-    prompt=[
-        "High quality photo of an astronaut riding a horse in space",
-        "Face of a yellow cat, high resolution, sitting on a park bench",
-    ],
-    num_images_per_prompt=10,
-    batch_size=4,
-)
-```
-
-For more information, check out Optimum Habana's [documentation](https://huggingface.co/docs/optimum/habana/usage_guides/stable_diffusion) and the [example](https://github.com/huggingface/optimum-habana/tree/main/examples/stable-diffusion) provided in the official Github repository.
-
-
-## Benchmark
-
-Here are the latencies for Habana Gaudi 1 and Gaudi 2 with the [Habana/stable-diffusion](https://huggingface.co/Habana/stable-diffusion) Gaudi configuration (mixed precision bf16/fp32):
-
-|         | Latency | Batch size |
-| ------- |:-------:|:----------:|
-| Gaudi 1 | 4.37s   | 4/8        |
-| Gaudi 2 | 1.19s   | 4/8        |
--- a/docs/source/optimization/mps.mdx
+++ b/docs/source/optimization/mps.mdx
@@ -19,8 +19,11 @@ specific language governing permissions and limitations under the License.
 - Mac computer with Apple silicon (M1/M2) hardware.
 - macOS 12.6 or later (13.0 or later recommended).
 - arm64 version of Python.
- PyTorch 1.13. You can install it with `pip` or `conda` using the instructions in https://pytorch.org/get-started/locally/.
+- PyTorch 1.13.0 RC (Release Candidate). You can install it with `pip` using:

+```
+pip3 install --pre torch --extra-index-url https://download.pytorch.org/whl/test/cpu
+```

 ## Inference Pipeline

@@ -60,4 +63,4 @@ pipeline.enable_attention_slicing()
 ## Known Issues

 - As mentioned above, we are investigating a strange [first-time inference issue](https://github.com/huggingface/diffusers/issues/372).
- Generating multiple prompts in a batch [crashes or doesn't work reliably](https://github.com/huggingface/diffusers/issues/363). We believe this is related to the [`mps` backend in PyTorch](https://github.com/pytorch/pytorch/issues/84039). This is being resolved, but for now we recommend to iterate instead of batching.
+- Generating multiple prompts in a batch [crashes or doesn't work reliably](https://github.com/huggingface/diffusers/issues/363). We believe this is related to the [`mps` backend in PyTorch](https://github.com/pytorch/pytorch/issues/84039). For now, we recommend iterating instead of batching.
--- a/docs/source/quicktour.mdx
+++ b/docs/source/quicktour.mdx
@@ -41,7 +41,7 @@ In this guide though, you'll use [`DiffusionPipeline`] for text-to-image generat
 ```python
 >>> from diffusers import DiffusionPipeline

->>> pipeline = DiffusionPipeline.from_pretrained("CompVis/ldm-text2im-large-256")
+>>> generator = DiffusionPipeline.from_pretrained("CompVis/ldm-text2im-large-256")
 ```

 The [`DiffusionPipeline`] downloads and caches all modeling, tokenization, and scheduling components. 
@@ -49,13 +49,13 @@ Because the model consists of roughly 1.4 billion parameters, we strongly recomm
 You can move the generator object to GPU, just like you would in PyTorch.

 ```python
->>> pipeline.to("cuda")
+>>> generator.to("cuda")
 ```

-Now you can use the `pipeline` on your text prompt:
+Now you can use the `generator` on your text prompt:

 ```python
->>> image = pipeline("An image of a squirrel in Picasso style").images[0]
+>>> image = generator("An image of a squirrel in Picasso style").images[0]
 ```

 The output is by default wrapped into a [PIL Image object](https://pillow.readthedocs.io/en/stable/reference/Image.html?highlight=image#the-image-class).
@@ -82,7 +82,7 @@ just like we did before only that now you need to pass your `AUTH_TOKEN`:
 ```python
 >>> from diffusers import DiffusionPipeline

->>> pipeline = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", use_auth_token=AUTH_TOKEN)
+>>> generator = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", use_auth_token=AUTH_TOKEN)
 ```

 If you do not pass your authentication token you will see that the diffusion system will not be correctly 
@@ -102,7 +102,7 @@ token. Assuming that `"./stable-diffusion-v1-5"` is the local path to the cloned
 you can also load the pipeline as follows:

 ```python
->>> pipeline = DiffusionPipeline.from_pretrained("./stable-diffusion-v1-5")
+>>> generator = DiffusionPipeline.from_pretrained("./stable-diffusion-v1-5")
 ```

 Running the pipeline is then identical to the code above as it's the same model architecture.
@@ -115,20 +115,19 @@ Running the pipeline is then identical to the code above as it's the same model

 Diffusion systems can be used with multiple different [schedulers](./api/schedulers) each with their
 pros and cons. By default, Stable Diffusion runs with [`PNDMScheduler`], but it's very simple to 
-use a different scheduler. *E.g.* if you would instead like to use the [`EulerDiscreteScheduler`] scheduler,
+use a different scheduler. *E.g.* if you would instead like to use the [`LMSDiscreteScheduler`] scheduler,
 you could use it as follows:

 ```python
->>> from diffusers import EulerDiscreteScheduler
+>>> from diffusers import LMSDiscreteScheduler

->>> pipeline = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", use_auth_token=AUTH_TOKEN)
+>>> scheduler = LMSDiscreteScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear")

->>> # change scheduler to Euler
->>> pipeline.scheduler = EulerDiscreteScheduler.from_config(pipeline.scheduler.config)
+>>> generator = StableDiffusionPipeline.from_pretrained(
+...     "runwayml/stable-diffusion-v1-5", scheduler=scheduler, use_auth_token=AUTH_TOKEN
+... )
 ```

-For more in-detail information on how to change between schedulers, please refer to the [Using Schedulers](./using-diffusers/schedulers) guide.
-
 [Stability AI's](https://stability.ai/) Stable Diffusion model is an impressive image generation model
 and can do much more than just generating images from text. We have dedicated a whole documentation page,
 just for Stable Diffusion [here](./conceptual/stable_diffusion).
--- a/docs/source/training/dreambooth.mdx
+++ b/docs/source/training/dreambooth.mdx
@@ -23,7 +23,7 @@ The [Dreambooth training script](https://github.com/huggingface/diffusers/tree/m

 <!-- TODO: replace with our blog when it's done -->

-Dreambooth fine-tuning is very sensitive to hyperparameters and easy to overfit. We recommend you take a look at our [in-depth analysis](https://huggingface.co/blog/dreambooth) with recommended settings for different subjects, and go from there.
+Dreambooth fine-tuning is very sensitive to hyperparameters and easy to overfit. We recommend you take a look at our [in-depth analysis](https://wandb.ai/psuraj/dreambooth/reports/Dreambooth-Training-Analysis--VmlldzoyNzk0NDc3) with recommended settings for different subjects, and go from there.

 </Tip>

@@ -148,7 +148,7 @@ accelerate launch train_dreambooth.py \

 ### Fine-tune the text encoder in addition to the UNet

-The script also allows to fine-tune the `text_encoder` along with the `unet`. It has been observed experimentally that this gives much better results, especially on faces. Please, refer to [our blog](https://huggingface.co/blog/dreambooth) for more details.
+The script also allows you to fine-tune the `text_encoder` along with the `unet`. It has been observed experimentally that this gives much better results, especially on faces. Please refer to [our report](https://wandb.ai/psuraj/dreambooth/reports/Dreambooth-Training-Analysis--VmlldzoyNzk0NDc3) for more details.

 To enable this option, pass the `--train_text_encoder` argument to the training script.

--- a/docs/source/training/text2image.mdx
+++ b/docs/source/training/text2image.mdx
@@ -13,7 +13,7 @@ specific language governing permissions and limitations under the License.

 # Stable Diffusion text-to-image fine-tuning

-The [`train_text_to_image.py`](https://github.com/huggingface/diffusers/tree/main/examples/text_to_image) script shows how to fine-tune the stable diffusion model on your own dataset.
+The [`train_text_to_image.py`](https://github.com/huggingface/diffusers/tree/main/examples/text_to_image) script shows how to fine-tune the stable diffusion model on your own dataset.

 <Tip warning={true}>

--- a/docs/source/using-diffusers/audio.mdx
+++ b/docs/source/using-diffusers/audio.mdx
@@ -1,16 +0,0 @@
-<!--Copyright 2022 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-->
-
-# Using Diffusers for audio
-
-[`DanceDiffusionPipeline`] and [`AudioDiffusionPipeline`] can be used to generate
-audio rapidly! More coming soon!
--- a/docs/source/using-diffusers/conditional_image_generation.mdx
+++ b/docs/source/using-diffusers/conditional_image_generation.mdx
@@ -44,3 +44,5 @@ You can save the image by simply calling:
 ```python
 >>> image.save("image_of_squirrel_painting.png")
 ```
+
+
--- a/docs/source/using-diffusers/contribute_pipeline.mdx
+++ b/docs/source/using-diffusers/contribute_pipeline.mdx
@@ -128,7 +128,7 @@ pipe = DiffusionPipeline.from_pretrained("google/ddpm-cifar10-32", custom_pipeli
 pipe()
 ```

-Another way to upload your custom_pipeline, besides sending a PR, is uploading the code that contains it to the Hugging Face Hub, [as exemplified here](https://huggingface.co/docs/diffusers/using-diffusers/custom_pipeline_overview#loading-custom-pipelines-from-the-hub).
+Another way to upload your custom_pipeline, besides sending a PR, is uploading the code that contains it to the Hugging Face Hub, [as exemplified here](https://huggingface.co/docs/diffusers/using-diffusers/custom_pipelines#loading-custom-pipelines-from-the-hub).

 **Try it out now - it works!**

--- a/docs/source/using-diffusers/custom_pipeline_examples.mdx
+++ b/docs/source/using-diffusers/custom_pipeline_examples.mdx
@@ -177,7 +177,7 @@ init_image = download_image(

 prompt = "A fantasy landscape, trending on artstation"

-images = pipe.img2img(prompt=prompt, image=init_image, strength=0.75, guidance_scale=7.5).images
+images = pipe.img2img(prompt=prompt, init_image=init_image, strength=0.75, guidance_scale=7.5).images

 ### Inpainting

@@ -187,7 +187,7 @@ init_image = download_image(img_url).resize((512, 512))
 mask_image = download_image(mask_url).resize((512, 512))

 prompt = "a cat sitting on a bench"
-images = pipe.inpaint(prompt=prompt, image=init_image, mask_image=mask_image, strength=0.75).images
+images = pipe.inpaint(prompt=prompt, init_image=init_image, mask_image=mask_image, strength=0.75).images
 ```

 As shown above this one pipeline can run all both "text-to-image", "image-to-image", and "inpainting" in one pipeline.
--- a/docs/source/using-diffusers/custom_pipeline_overview.mdx
+++ b/docs/source/using-diffusers/custom_pipeline_overview.mdx
@@ -10,7 +10,7 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
 specific language governing permissions and limitations under the License.
 -->

-# Loading and Adding Custom Pipelines
+# Loading and Saving Custom Pipelines

 Diffusers allows you to conveniently load any custom pipeline from the Hugging Face Hub as well as any [official community pipeline](https://github.com/huggingface/diffusers/tree/main/examples/community) 
 via the [`DiffusionPipeline`] class.
--- a/docs/source/using-diffusers/img2img.mdx
+++ b/docs/source/using-diffusers/img2img.mdx
@@ -33,11 +33,11 @@ url = "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/assets/st

 response = requests.get(url)
 init_image = Image.open(BytesIO(response.content)).convert("RGB")
-init_image.thumbnail((768, 768))
+init_image = init_image.resize((768, 512))

 prompt = "A fantasy landscape, trending on artstation"

-images = pipe(prompt=prompt, image=init_image, strength=0.75, guidance_scale=7.5).images
+images = pipe(prompt=prompt, init_image=init_image, strength=0.75, guidance_scale=7.5).images

 images[0].save("fantasy_landscape.png")
 ```
--- a/docs/source/using-diffusers/loading.mdx
+++ b/docs/source/using-diffusers/loading.mdx
@@ -12,369 +12,23 @@ specific language governing permissions and limitations under the License.

 # Loading

-A core premise of the diffusers library is to make diffusion models **as accessible as possible**.
-Accessibility is therefore achieved by providing an API to load complete diffusion pipelines as well as individual components with a single line of code. 
+Saving and loading in `Diffusers` is built around the Hugging Face Hub: the methods below load from and save to Hub-compatible repositories.

-In the following we explain in-detail how to easily load:
+[[autodoc]] modeling_utils.ModelMixin
+    - from_pretrained
+    - save_pretrained

- *Complete Diffusion Pipelines* via the [`DiffusionPipeline.from_pretrained`]
- *Diffusion Models* via [`ModelMixin.from_pretrained`]
- *Schedulers* via [`SchedulerMixin.from_pretrained`]
+[[autodoc]] pipeline_utils.DiffusionPipeline
+    - from_pretrained
+    - save_pretrained

-## Loading pipelines
+[[autodoc]] modeling_flax_utils.FlaxModelMixin
+    - from_pretrained
+    - save_pretrained

-The [`DiffusionPipeline`] class is the easiest way to access any diffusion model that is [available on the Hub](https://huggingface.co/models?library=diffusers). Let's look at an example on how to download [CompVis' Latent Diffusion model](https://huggingface.co/CompVis/ldm-text2im-large-256).
+[[autodoc]] pipeline_flax_utils.FlaxDiffusionPipeline
+    - from_pretrained
+    - save_pretrained

-```python
-from diffusers import DiffusionPipeline

-repo_id = "CompVis/ldm-text2im-large-256"
-ldm = DiffusionPipeline.from_pretrained(repo_id)
-```
-
-Here [`DiffusionPipeline`] automatically detects the correct pipeline (*i.e.* [`LDMTextToImagePipeline`]), downloads and caches all required configuration and weight files (if not already done so), and finally returns a pipeline instance, called `ldm`.
-The pipeline instance can then be called using [`LDMTextToImagePipeline.__call__`] (i.e., `ldm("image of a astronaut riding a horse")`) for text-to-image generation.
-
-Instead of using the generic [`DiffusionPipeline`] class for loading, you can also load the appropriate pipeline class directly. The code snippet above yields the same instance as when doing:
-
-```python
-from diffusers import LDMTextToImagePipeline
-
-repo_id = "CompVis/ldm-text2im-large-256"
-ldm = LDMTextToImagePipeline.from_pretrained(repo_id)
-```
-
-Diffusion pipelines like `LDMTextToImagePipeline` often consist of multiple components. These components can be both parameterized models, such as `"unet"`, `"vqvae"` and "bert", tokenizers or schedulers. These components can interact in complex ways with each other when using the pipeline in inference, *e.g.* for [`LDMTextToImagePipeline`] or [`StableDiffusionPipeline`] the inference call is explained [here](https://huggingface.co/blog/stable_diffusion#how-does-stable-diffusion-work).
-The purpose of the [pipeline classes](./api/overview#diffusers-summary) is to wrap the complexity of these diffusion systems and give the user an easy-to-use API while staying flexible for customization, as will be shown later.
-
-### Loading pipelines that require access request
-
-Due to the capabilities of diffusion models to generate extremely realistic images, there is a certain danger that such models might be misused for unwanted applications, *e.g.* generating pornography or violent images.
-In order to minimize the possibility of such unsolicited use cases, some of the most powerful diffusion models require users to acknowledge a license before being able to use the model. If the user does not agree to the license, the pipeline cannot be downloaded.
-If you try to load [`runwayml/stable-diffusion-v1-5`](https://huggingface.co/runwayml/stable-diffusion-v1-5) the same way as done previously:
-
-```python
-from diffusers import DiffusionPipeline
-
-repo_id = "runwayml/stable-diffusion-v1-5"
-stable_diffusion = DiffusionPipeline.from_pretrained(repo_id)
-```
-
-it will only work if you have both *click-accepted* the license on [the model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) and are logged into the Hugging Face Hub. Otherwise you will get an error message
-such as the following:
-
-```
-OSError: runwayml/stable-diffusion-v1-5 is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
-If this is a private repository, make sure to pass a token having permission to this repo with `use_auth_token` or log in with `huggingface-cli login`
-```
-
-Therefore, we need to make sure to *click-accept* the license. You can do this by simply visiting 
-the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) and clicking on "Agree and access repository":
-
-<p align="center">
-    <br>
-    <img src="https://raw.githubusercontent.com/huggingface/diffusers/main/docs/source/imgs/access_request.png" width="400"/>
-    <br>
-</p>
-
-Second, you need to login with your access token:
-
-```
-huggingface-cli login
-```
-
-before trying to load the model. Or alternatively, you can pass [your access token](https://huggingface.co/docs/hub/security-tokens#user-access-tokens) directly via the flag `use_auth_token`. In this case you do **not** need 
-to run `huggingface-cli login` before:
-
-```python
-from diffusers import DiffusionPipeline
-
-repo_id = "runwayml/stable-diffusion-v1-5"
-stable_diffusion = DiffusionPipeline.from_pretrained(repo_id, use_auth_token="<your-access-token>")
-```
-
-The final option to use pipelines that require access without having to rely on the Hugging Face Hub is to load the pipeline locally as explained in the next section.
-
-### Loading pipelines locally
-
-If you prefer to have complete control over the pipeline and its corresponding files or, as said before, if you want to use pipelines that require an access request without having to be connected to the Hugging Face Hub, 
-we recommend loading pipelines locally.
-
-To load a diffusion pipeline locally, you first need to manually download the whole folder structure on your local disk and then pass a local path to the [`DiffusionPipeline.from_pretrained`]. Let's again look at an example for 
-[CompVis' Latent Diffusion model](https://huggingface.co/CompVis/ldm-text2im-large-256).
-
-First, you should make use of [`git-lfs`](https://git-lfs.github.com/) to download the whole folder structure that has been uploaded to the [model repository](https://huggingface.co/CompVis/ldm-text2im-large-256/tree/main):
-
-```
-git lfs install
-git clone https://huggingface.co/runwayml/stable-diffusion-v1-5
-```
-
-The command above will create a local folder called `./stable-diffusion-v1-5` on your disk.
-Now, all you have to do is to simply pass the local folder path to `from_pretrained`:
-
-```python
-from diffusers import DiffusionPipeline
-
-repo_id = "./stable-diffusion-v1-5"
-stable_diffusion = DiffusionPipeline.from_pretrained(repo_id)
-```
-
-If `repo_id` is a local path, as it is the case here, [`DiffusionPipeline.from_pretrained`] will automatically detect it and therefore not try to download any files from the Hub.
-While we usually recommend to load weights directly from the Hub to be certain to stay up to date with the newest changes, loading pipelines locally should be preferred if one 
-wants to stay anonymous, self-contained applications, etc...
-
-### Loading customized pipelines
-
-Advanced users that want to load customized versions of diffusion pipelines can do so by swapping any of the default components, *e.g.* the scheduler, with other scheduler classes.
-A classical use case of this functionality is to swap the scheduler. [Stable Diffusion v1-5](https://huggingface.co/runwayml/stable-diffusion-v1-5) uses the [`PNDMScheduler`] by default which is generally not the most performant scheduler. Since the release
-of stable diffusion, multiple improved schedulers have been published. To use those, the user has to manually load their preferred scheduler and pass it into [`DiffusionPipeline.from_pretrained`].
-
-*E.g.* to use [`EulerDiscreteScheduler`] or [`DPMSolverMultistepScheduler`] to have a better quality vs. generation speed trade-off for inference, one could load them as follows:
-
-```python
-from diffusers import DiffusionPipeline, EulerDiscreteScheduler, DPMSolverMultistepScheduler
-
-repo_id = "runwayml/stable-diffusion-v1-5"
-
-scheduler = EulerDiscreteScheduler.from_pretrained(repo_id, subfolder="scheduler")
-# or
-# scheduler = DPMSolverMultistepScheduler.from_pretrained(repo_id, subfolder="scheduler")
-
-stable_diffusion = DiffusionPipeline.from_pretrained(repo_id, scheduler=scheduler)
-```
-
-Three things are worth paying attention to here.
- First, the scheduler is loaded with [`SchedulerMixin.from_pretrained`]
- Second, the scheduler is loaded with a function argument, called `subfolder="scheduler"` as the configuration of stable diffusion's scheduling is defined in a [subfolder of the official pipeline repository](https://huggingface.co/runwayml/stable-diffusion-v1-5/tree/main/scheduler)
- Third, the scheduler instance can simply be passed with the `scheduler` keyword argument to [`DiffusionPipeline.from_pretrained`]. This works because the [`StableDiffusionPipeline`] defines its scheduler with the `scheduler` attribute. It's not possible to use a different name, such as `sampler=scheduler` since `sampler` is not a defined keyword for [`StableDiffusionPipeline.__init__`]
-
-Not only the scheduler components can be customized for diffusion pipelines; in theory, all components of a pipeline can be customized. In practice, however, it often only makes sense to switch out a component that has **compatible** alternatives to what the pipeline expects.
-Many scheduler classes are compatible with each other as can be seen [here](https://github.com/huggingface/diffusers/blob/0dd8c6b4dbab4069de9ed1cafb53cbd495873879/src/diffusers/schedulers/scheduling_ddim.py#L112). This is not always the case for other components, such as the `"unet"`.
-
-One special case that can also be customized is the `"safety_checker"` of stable diffusion. If you believe the safety checker doesn't serve you any good, you can simply disable it by passing `None`:
-
-```python
-from diffusers import DiffusionPipeline, EulerDiscreteScheduler, DPMSolverMultistepScheduler
-
-stable_diffusion = DiffusionPipeline.from_pretrained(repo_id, safety_checker=None)
-```
-
-Another common use case is to reuse the same components in multiple pipelines, *e.g.* the weights and configurations of [`"runwayml/stable-diffusion-v1-5"`](https://huggingface.co/runwayml/stable-diffusion-v1-5) can be used for both [`StableDiffusionPipeline`] and [`StableDiffusionImg2ImgPipeline`] and we might not want to 
-use the exact same weights into RAM twice. In this case, customizing all the input instances would help us 
-to only load the weights into RAM once:
-
-```python
-from diffusers import StableDiffusionPipeline, StableDiffusionImg2ImgPipeline
-
-model_id = "runwayml/stable-diffusion-v1-5"
-stable_diffusion_txt2img = StableDiffusionPipeline.from_pretrained(model_id)
-
-components = stable_diffusion_txt2img.components
-
-# weights are not reloaded into RAM
-stable_diffusion_img2img = StableDiffusionImg2ImgPipeline(**components)
-```
-
-Note how the above code snippet makes use of [`DiffusionPipeline.components`].
-
-### How does loading work?
-
-As a class method, [`DiffusionPipeline.from_pretrained`] is responsible for two things:
- Download the latest version of the folder structure required to run the `repo_id` with `diffusers` and cache them. If the latest folder structure is available in the local cache, [`DiffusionPipeline.from_pretrained`] will simply reuse the cache and **not** re-download the files.
- Load the cached weights into the _correct_ pipeline class – one of the [officially supported pipeline classes](./api/overview#diffusers-summary) - and return an instance of the class. The _correct_ pipeline class is thereby retrieved from the `model_index.json` file.
-
-The underlying folder structure of diffusion pipelines correspond 1-to-1 to their corresponding class instances, *e.g.* [`LDMTextToImagePipeline`] for [`CompVis/ldm-text2im-large-256`](https://huggingface.co/CompVis/ldm-text2im-large-256)
-This can be understood better by looking at an example. Let's print out pipeline class instance `pipeline` we just defined:
-
-```python
-from diffusers import DiffusionPipeline
-
-repo_id = "CompVis/ldm-text2im-large-256"
-ldm = DiffusionPipeline.from_pretrained(repo_id)
-print(ldm)
-```
-
-*Output*:
-```
-LDMTextToImagePipeline {
-  "bert": [
-    "latent_diffusion",
-    "LDMBertModel"
-  ],
-  "scheduler": [
-    "diffusers",
-    "DDIMScheduler"
-  ],
-  "tokenizer": [
-    "transformers",
-    "BertTokenizer"
-  ],
-  "unet": [
-    "diffusers",
-    "UNet2DConditionModel"
-  ],
-  "vqvae": [
-    "diffusers",
-    "AutoencoderKL"
-  ]
-}
-```
-
-First, we see that the official pipeline is the [`LDMTextToImagePipeline`], and second we see that the `LDMTextToImagePipeline` consists of 5 components:
- `"bert"` of class `LDMBertModel` as defined [in the pipeline](https://github.com/huggingface/diffusers/blob/cd502b25cf0debac6f98d27a6638ef95208d1ea2/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py#L664)
- `"scheduler"` of class [`DDIMScheduler`]
- `"tokenizer"` of class `BertTokenizer` as defined [in `transformers`](https://huggingface.co/docs/transformers/model_doc/bert#transformers.BertTokenizer)
- `"unet"` of class [`UNet2DConditionModel`]
- `"vqvae"` of class [`AutoencoderKL`]
-
-Let's now compare the pipeline instance to the folder structure of the model repository `CompVis/ldm-text2im-large-256`. Looking at the folder structure of [`CompVis/ldm-text2im-large-256`](https://huggingface.co/CompVis/ldm-text2im-large-256/tree/main) on the Hub, we can see it matches 1-to-1 the printed out instance of `LDMTextToImagePipeline` above:
-
-```
-.
-├── bert
-│   ├── config.json
-│   └── pytorch_model.bin
-├── model_index.json
-├── scheduler
-│   └── scheduler_config.json
-├── tokenizer
-│   ├── special_tokens_map.json
-│   ├── tokenizer_config.json
-│   └── vocab.txt
-├── unet
-│   ├── config.json
-│   └── diffusion_pytorch_model.bin
-└── vqvae
-    ├── config.json
-    └── diffusion_pytorch_model.bin
-```
-
-As we can see each attribute of the instance of `LDMTextToImagePipeline` has its configuration and possibly weights defined in a subfolder that is called **exactly** like the class attribute (`"bert"`, `"scheduler"`, `"tokenizer"`, `"unet"`, `"vqvae"`). Importantly, every pipeline expects a `model_index.json` file that tells the `DiffusionPipeline` both:
- which pipeline class should be loaded, and
- what sub-classes from which library are stored in which subfolders
-
-In the case of `CompVis/ldm-text2im-large-256` the `model_index.json` is therefore defined as follows:
-
-```
-{
-  "_class_name": "LDMTextToImagePipeline",
-  "_diffusers_version": "0.0.4",
-  "bert": [
-    "latent_diffusion",
-    "LDMBertModel"
-  ],
-  "scheduler": [
-    "diffusers",
-    "DDIMScheduler"
-  ],
-  "tokenizer": [
-    "transformers",
-    "BertTokenizer"
-  ],
-  "unet": [
-    "diffusers",
-    "UNet2DConditionModel"
-  ],
-  "vqvae": [
-    "diffusers",
-    "AutoencoderKL"
-  ]
-}
-```
-
- `_class_name` tells `DiffusionPipeline` which pipeline class should be loaded. 
- `_diffusers_version` can be useful to know under which `diffusers` version this model was created.
- Every component of the pipeline is then defined under the form:
-```
-"name" : [
-  "library",
-  "class"
-]
-```
-	- The `"name"` field corresponds both to the name of the subfolder in which the configuration and weights are stored as well as the attribute name of the pipeline class (as can be seen [here](https://huggingface.co/CompVis/ldm-text2im-large-256/tree/main/bert) and [here](https://github.com/huggingface/diffusers/blob/cd502b25cf0debac6f98d27a6638ef95208d1ea2/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py#L42)
-	- The `"library"` field corresponds to the name of the library, *e.g.* `diffusers` or `transformers` from which the `"class"` should be loaded
-	- The `"class"` field corresponds to the name of the class, *e.g.* [`BertTokenizer`](https://huggingface.co/docs/transformers/model_doc/bert#transformers.BertTokenizer) or [`UNet2DConditionModel`]
-
-
-## Loading models
-
-Models as defined under [src/diffusers/models](https://github.com/huggingface/diffusers/tree/main/src/diffusers/models) can be loaded via the [`ModelMixin.from_pretrained`] function. The API is very similar the [`DiffusionPipeline.from_pretrained`] and works in the same way:
- Download the latest version of the model weights and configuration with `diffusers` and cache them. If the latest files are available in the local cache, [`ModelMixin.from_pretrained`] will simply reuse the cache and **not** re-download the files.
- Load the cached weights into the _defined_ model class - one of [the existing model classes](./api/models) - and return an instance of the class.
-
-In constrast to [`DiffusionPipeline.from_pretrained`], models rely on fewer files that usually don't require a folder structure, but just a `diffusion_pytorch_model.bin` and `config.json` file.
-
-Let's look at an example:
-
-```python
-from diffusers import UNet2DConditionModel
-
-repo_id = "CompVis/ldm-text2im-large-256"
-model = UNet2DConditionModel.from_pretrained(repo_id, subfolder="unet")
-```
-
-Note how we have to define the `subfolder="unet"` argument to tell [`ModelMixin.from_pretrained`] that the model weights are located in a [subfolder of the repository](https://huggingface.co/CompVis/ldm-text2im-large-256/tree/main/unet).
-
-As explained in [Loading customized pipelines]("./using-diffusers/loading#loading-customized-pipelines"), one can pass a loaded model to a diffusion pipeline, via [`DiffusionPipeline.from_pretrained`]:
-
-```python
-from diffusers import DiffusionPipeline
-
-repo_id = "CompVis/ldm-text2im-large-256"
-ldm = DiffusionPipeline.from_pretrained(repo_id, unet=model)
-```
-
-If the model files can be found directly at the root level, which is usually only the case for some very simple diffusion models, such as [`google/ddpm-cifar10-32`](https://huggingface.co/google/ddpm-cifar10-32), we don't 
-need to pass a `subfolder` argument:
-
-```python
-from diffusers import UNet2DModel
-
-repo_id = "google/ddpm-cifar10-32"
-model = UNet2DModel.from_pretrained(repo_id)
-```
-
-## Loading schedulers
-
-Schedulers rely on [`SchedulerMixin.from_pretrained`]. Schedulers are **not parameterized** or **trained**, but instead purely defined by a configuration file.
-For consistency, we use the same method name as we do for models or pipelines, but no weights are loaded in this case.
-
-In constrast to pipelines or models, loading schedulers does not consume any significant amount of memory and the same configuration file can often be used for a variety of different schedulers.
-For example, all of:
-
- [`DDPMScheduler`]
- [`DDIMScheduler`]
- [`PNDMScheduler`]
- [`LMSDiscreteScheduler`]
- [`EulerDiscreteScheduler`]
- [`EulerAncestralDiscreteScheduler`]
- [`DPMSolverMultistepScheduler`]
-
-are compatible with [`StableDiffusionPipeline`] and therefore the same scheduler configuration file can be loaded in any of those classes:
-
-```python
-from diffusers import StableDiffusionPipeline
-from diffusers import (
-    DDPMScheduler,
-    DDIMScheduler,
-    PNDMScheduler,
-    LMSDiscreteScheduler,
-    EulerDiscreteScheduler,
-    EulerAncestralDiscreteScheduler,
-    DPMSolverMultistepScheduler,
-)
-
-repo_id = "runwayml/stable-diffusion-v1-5"
-
-ddpm = DDPMScheduler.from_pretrained(repo_id, subfolder="scheduler")
-ddim = DDIMScheduler.from_pretrained(repo_id, subfolder="scheduler")
-pndm = PNDMScheduler.from_pretrained(repo_id, subfolder="scheduler")
-lms = LMSDiscreteScheduler.from_pretrained(repo_id, subfolder="scheduler")
-euler_anc = EulerAncestralDiscreteScheduler.from_pretrained(repo_id, subfolder="scheduler")
-euler = EulerDiscreteScheduler.from_pretrained(repo_id, subfolder="scheduler")
-dpm = DPMSolverMultistepScheduler.from_pretrained(repo_id, subfolder="scheduler")
-
-# replace `dpm` with any of `ddpm`, `ddim`, `pndm`, `lms`, `euler`, `euler_anc`
-pipeline = StableDiffusionPipeline.from_pretrained(repo_id, scheduler=dpm)
-```
+This page is still under construction 🚧. Open a [PR](https://github.com/huggingface/diffusers/compare) if you want to contribute!
--- a/docs/source/using-diffusers/other-modalities.mdx
+++ b/docs/source/using-diffusers/other-modalities.mdx
@@ -1,21 +0,0 @@
-<!--Copyright 2022 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-->
-
-# Using Diffusers with other modalities
-
-Diffusers is in the process of expanding to modalities other than images.
-
-Example type        | Colab | Pipeline |
-:-------------------------:|:-------------------------:|:-------------------------:|
-[Molecule conformation](https://www.nature.com/subjects/molecular-conformation#:~:text=Definition,to%20changes%20in%20their%20environment.) generation | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/geodiff_molecule_conformation.ipynb) | ❌
-
-More coming soon!
--- a/docs/source/using-diffusers/rl.mdx
+++ b/docs/source/using-diffusers/rl.mdx
@@ -1,25 +0,0 @@
-<!--Copyright 2022 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-->
-
-# Using Diffusers for reinforcement learning
-
-Support for one RL model and related pipelines is included in the `experimental` source of diffusers.
-More models and examples coming soon!
-
-# Diffuser Value-guided Planning
-
-You can run the model from [*Planning with Diffusion for Flexible Behavior Synthesis*](https://arxiv.org/abs/2205.09991) with Diffusers.
-The script is located in the [RL Examples](https://github.com/huggingface/diffusers/tree/main/examples/rl) folder.
-
-Or, run this example in Colab [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/reinforcement_learning_with_diffusers.ipynb)
-
-[[autodoc]] diffusers.experimental.ValueGuidedRLPipeline
--- a/docs/source/using-diffusers/schedulers.mdx
+++ b/docs/source/using-diffusers/schedulers.mdx
@@ -1,262 +0,0 @@
-<!--Copyright 2022 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-->
-
-# Schedulers
-
-Diffusion pipelines are inherently a collection of diffusion models and schedulers that are partly independent from each other. This means that one is able to switch out parts of the pipeline to better customize 
-a pipeline to one's use case. The best example of this are the [Schedulers](../api/schedulers.mdx).
-
-Whereas diffusion models usually simply define the forward pass from noise to a less noisy sample, 
-schedulers define the whole denoising process, *i.e.*:
- How many denoising steps?
- Stochastic or deterministic?
- What algorithm to use to find the denoised sample
-
-They can be quite complex and often define a trade-off between **denoising speed** and **denoising quality**.
-It is extremely difficult to measure quantitatively which scheduler works best for a given diffusion pipeline, so it is often recommended to simply try out which works best.
-
-The following paragraphs shows how to do so with the 🧨 Diffusers library.
-
-## Load pipeline
-
-Let's start by loading the stable diffusion pipeline.
-Remember that you have to be a registered user on the 🤗 Hugging Face Hub, and have "click-accepted" the [license](https://huggingface.co/runwayml/stable-diffusion-v1-5) in order to use stable diffusion.
-
-```python
-from huggingface_hub import login
-from diffusers import DiffusionPipeline
-import torch
-
-# first we need to login with our access token
-login()
-
-# Now we can download the pipeline
-pipeline = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16)
-```
-
-Next, we move it to GPU:
-
-```python
-pipeline.to("cuda")
-```
-
-## Access the scheduler
-
-The scheduler is always one of the components of the pipeline and is usually called `"scheduler"`.
-So it can be accessed via the `"scheduler"` property.
-
-```python
-pipeline.scheduler
-```
-
-**Output**:
-```
-PNDMScheduler {
-  "_class_name": "PNDMScheduler",
-  "_diffusers_version": "0.8.0.dev0",
-  "beta_end": 0.012,
-  "beta_schedule": "scaled_linear",
-  "beta_start": 0.00085,
-  "clip_sample": false,
-  "num_train_timesteps": 1000,
-  "set_alpha_to_one": false,
-  "skip_prk_steps": true,
-  "steps_offset": 1,
-  "trained_betas": null
-}
-```
-
-We can see that the scheduler is of type [`PNDMScheduler`]. 
-Cool, now let's compare the scheduler in its performance to other schedulers.
-First we define a prompt on which we will test all the different schedulers:
-
-```python
-prompt = "A photograph of an astronaut riding a horse on Mars, high resolution, high definition."
-```
-
-Next, we create a generator from a random seed that will ensure that we can generate similar images as well as run the pipeline:
-
-```python
-generator = torch.Generator(device="cuda").manual_seed(8)
-image = pipeline(prompt, generator=generator).images[0]
-image
-```
-
-<p align="center">
-    <br>
-    <img src="https://huggingface.co/datasets/patrickvonplaten/images/resolve/main/diffusers_docs/astronaut_pndm.png" width="400"/>
-    <br>
-</p>
-
-
-## Changing the scheduler
-
-Now we show how easy it is to change the scheduler of a pipeline. Every scheduler has a property [`SchedulerMixin.compatibles`] 
-which defines all compatible schedulers. You can take a look at all available, compatible schedulers for the Stable Diffusion pipeline as follows.
-
-```python
-pipeline.scheduler.compatibles
-```
-
-**Output**:
-```
-[diffusers.schedulers.scheduling_lms_discrete.LMSDiscreteScheduler,
- diffusers.schedulers.scheduling_ddim.DDIMScheduler,
- diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler,
- diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler,
- diffusers.schedulers.scheduling_pndm.PNDMScheduler,
- diffusers.schedulers.scheduling_ddpm.DDPMScheduler,
- diffusers.schedulers.scheduling_euler_ancestral_discrete.EulerAncestralDiscreteScheduler]
-```
-
-Cool, lots of schedulers to look at. Feel free to have a look at their respective class definitions: 
-
- [`LMSDiscreteScheduler`], 
- [`DDIMScheduler`], 
- [`DPMSolverMultistepScheduler`], 
- [`EulerDiscreteScheduler`], 
- [`PNDMScheduler`], 
- [`DDPMScheduler`], 
- [`EulerAncestralDiscreteScheduler`].
-
-We will now compare the input prompt with all other schedulers. To change the scheduler of the pipeline you can make use of the 
-convenient [`ConfigMixin.config`] property in combination with the [`ConfigMixin.from_config`] function.
-
-```python
-pipeline.scheduler.config
-```
-
-returns a dictionary of the configuration of the scheduler:
-
-**Output**:
-```
-FrozenDict([('num_train_timesteps', 1000),
-            ('beta_start', 0.00085),
-            ('beta_end', 0.012),
-            ('beta_schedule', 'scaled_linear'),
-            ('trained_betas', None),
-            ('skip_prk_steps', True),
-            ('set_alpha_to_one', False),
-            ('steps_offset', 1),
-            ('_class_name', 'PNDMScheduler'),
-            ('_diffusers_version', '0.8.0.dev0'),
-            ('clip_sample', False)])
-```
-
-This configuration can then be used to instantiate a scheduler
-of a different class that is compatible with the pipeline. Here, 
-we change the scheduler to the [`DDIMScheduler`].
-
-```python
-from diffusers import DDIMScheduler
-
-pipeline.scheduler = DDIMScheduler.from_config(pipeline.scheduler.config)
-```
-
-Cool, now we can run the pipeline again to compare the generation quality.
-
-```python
-generator = torch.Generator(device="cuda").manual_seed(8)
-image = pipeline(prompt, generator=generator).images[0]
-image
-```
-
-<p align="center">
-    <br>
-    <img src="https://huggingface.co/datasets/patrickvonplaten/images/resolve/main/diffusers_docs/astronaut_ddim.png" width="400"/>
-    <br>
-</p>
-
-
-## Compare schedulers
-
-So far we have tried running the stable diffusion pipeline with two schedulers: [`PNDMScheduler`] and [`DDIMScheduler`]. 
-A number of better schedulers have been released that can be run with much fewer steps, let's compare them here:
-
-[`LMSDiscreteScheduler`] usually leads to better results:
-
-```python
-from diffusers import LMSDiscreteScheduler
-
-pipeline.scheduler = LMSDiscreteScheduler.from_config(pipeline.scheduler.config)
-
-generator = torch.Generator(device="cuda").manual_seed(8)
-image = pipeline(prompt, generator=generator).images[0]
-image
-```
-
-<p align="center">
-    <br>
-    <img src="https://huggingface.co/datasets/patrickvonplaten/images/resolve/main/diffusers_docs/astronaut_lms.png" width="400"/>
-    <br>
-</p>
-
-
-[`EulerDiscreteScheduler`] and [`EulerAncestralDiscreteScheduler`] can generate high quality results with as little as 30 steps.
-
-```python
-from diffusers import EulerDiscreteScheduler
-
-pipeline.scheduler = EulerDiscreteScheduler.from_config(pipeline.scheduler.config)
-
-generator = torch.Generator(device="cuda").manual_seed(8)
-image = pipeline(prompt, generator=generator, num_inference_steps=30).images[0]
-image
-```
-
-<p align="center">
-    <br>
-    <img src="https://huggingface.co/datasets/patrickvonplaten/images/resolve/main/diffusers_docs/astronaut_euler_discrete.png" width="400"/>
-    <br>
-</p>
-
-
-and:
-
-```python
-from diffusers import EulerAncestralDiscreteScheduler
-
-pipeline.scheduler = EulerAncestralDiscreteScheduler.from_config(pipeline.scheduler.config)
-
-generator = torch.Generator(device="cuda").manual_seed(8)
-image = pipeline(prompt, generator=generator, num_inference_steps=30).images[0]
-image
-```
-
-<p align="center">
-    <br>
-    <img src="https://huggingface.co/datasets/patrickvonplaten/images/resolve/main/diffusers_docs/astronaut_euler_ancestral.png" width="400"/>
-    <br>
-</p>
-
-
-At the time of writing this doc [`DPMSolverMultistepScheduler`] gives arguably the best speed/quality trade-off and can be run with as little 
-as 20 steps.
-
-```python
-from diffusers import DPMSolverMultistepScheduler
-
-pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config)
-
-generator = torch.Generator(device="cuda").manual_seed(8)
-image = pipeline(prompt, generator=generator, num_inference_steps=20).images[0]
-image
-```
-
-<p align="center">
-    <br>
-    <img src="https://huggingface.co/datasets/patrickvonplaten/images/resolve/main/diffusers_docs/astronaut_dpm.png" width="400"/>
-    <br>
-</p>
-
-As you can see most images look very similar and are arguably of very similar quality. It often really depends on the specific use case which scheduler to choose. A good approach is always to run multiple different
-schedulers to compare results.
--- a/examples/README.md
+++ b/examples/README.md
@@ -38,11 +38,11 @@ Training examples show how to pretrain or fine-tune diffusion models for a varie

 | Task | 🤗 Accelerate | 🤗 Datasets | Colab
 |---|---|:---:|:---:|
-| [**Unconditional Image Generation**](./unconditional_image_generation) | ✅ | ✅ | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/training_example.ipynb)
-| [**Text-to-Image fine-tuning**](./text_to_image) | ✅ | ✅ | 
-| [**Textual Inversion**](./textual_inversion) | ✅ | - | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/sd_textual_inversion_training.ipynb)
+| [**Unconditional Image Generation**](./unconditional_training) | ✅ | ✅ | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/training_example.ipynb)
+| [**Text-to-Image fine-tuning**](./text2image) | ✅ | ✅ | 
+| [**Textual Inversion**](./text_inversion) | ✅ | - | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/sd_textual_inversion_training.ipynb)
 | [**Dreambooth**](./dreambooth) | ✅ | - | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/sd_dreambooth_training.ipynb)
-| [**Reinforcement Learning for Control**](https://github.com/huggingface/diffusers/blob/main/examples/rl/run_diffusers_locomotion.py)                    | - | - | coming soon.
+

 ## Community

--- a/examples/community/README.md
+++ b/examples/community/README.md
@@ -15,15 +15,8 @@ If a community doesn't work as expected, please open an issue and ping the autho
 | Long Prompt Weighting Stable Diffusion | **One** Stable Diffusion Pipeline without tokens length limit, and support parsing weighting in prompt.                                                                                                                                                                                                                                                                                                                                                                                                  | [Long Prompt Weighting Stable Diffusion](#long-prompt-weighting-stable-diffusion)                                                                 | -                                                                                                                                                                                                                  |                        [SkyTNT](https://github.com/SkyTNT) |
 | Speech to Image                        | Using automatic-speech-recognition to transcribe text and Stable Diffusion to generate images                                                                                                                                                                                                                                                                                                                                                                                                            | [Speech to Image](#speech-to-image)                               | -                                                                                                                                                                                                                  | [Mikail Duzenli](https://github.com/MikailINTech)
 | Wild Card Stable Diffusion | Stable Diffusion Pipeline that supports prompts that contain wildcard terms (indicated by surrounding double underscores), with values instantiated randomly from a corresponding txt file or a dictionary of possible values                                                                                                                                                                                                                                                                                                     | [Wildcard Stable Diffusion](#wildcard-stable-diffusion)                                                                 | -                                                                                                                                                                                                                  |                        [Shyam Sudhakaran](https://github.com/shyamsn97) |
-| [Composable Stable Diffusion](https://energy-based-model.github.io/Compositional-Visual-Generation-with-Composable-Diffusion-Models/) | Stable Diffusion Pipeline that supports prompts that contain "&#124;" in prompts (as an AND condition) and weights (separated by "&#124;" as well) to positively / negatively weight prompts.                                                                                                                                                                                                                                                                                                     | [Composable Stable Diffusion](#composable-stable-diffusion)                                                                 | -                                                                                                                                                                                                                  |                        [Mark Rich](https://github.com/MarkRich) |
+| Composable Stable Diffusion | Stable Diffusion Pipeline that supports prompts containing "&#124;" (as an AND condition) and weights (also separated by "&#124;") to positively / negatively weight prompts.                                                                                                                                                                                                                                                                                                     | [Composable Stable Diffusion](#composable-stable-diffusion)                                                                 | -                                                                                                                                                                                                                  |                        [Mark Rich](https://github.com/MarkRich) |
 | Seed Resizing Stable Diffusion| Stable Diffusion Pipeline that supports resizing an image and retaining the concepts of the 512 by 512 generation.                                                                                                                                                                                                                                                                                                     | [Seed Resizing](#seed-resizing)                                                                 | -                                                                                                                                                                                                                  |                        [Mark Rich](https://github.com/MarkRich) |
-| Imagic Stable Diffusion | Stable Diffusion Pipeline that enables writing a text prompt to edit an existing image| [Imagic Stable Diffusion](#imagic-stable-diffusion)                                                                 | -                                                                                                                                                                                                                  |                        [Mark Rich](https://github.com/MarkRich) |
-| Multilingual Stable Diffusion| Stable Diffusion Pipeline that supports prompts in 50 different languages.                                                                                                                                                                                                                                                                                                     | [Multilingual Stable Diffusion](#multilingual-stable-diffusion-pipeline)                                                                 | -                                                                                                                                                                                                                  |                        [Juan Carlos Piñeros](https://github.com/juancopi81) |
-| Image to Image Inpainting Stable Diffusion | Stable Diffusion Pipeline that enables the overlaying of two images and subsequent inpainting| [Image to Image Inpainting Stable Diffusion](#image-to-image-inpainting-stable-diffusion)                                                                 | -                                                                                                                                                                                                                  |                        [Alex McKinney](https://github.com/vvvm23) |
-| Text Based Inpainting Stable Diffusion | Stable Diffusion Inpainting Pipeline that enables passing a text prompt to generate the mask for inpainting| [Text Based Inpainting Stable Diffusion](#image-to-image-inpainting-stable-diffusion)                                                                 | -                                                                                                                                                                                                                  |                        [Dhruv Karan](https://github.com/unography) |
-| Bit Diffusion | Diffusion on discrete data | [Bit Diffusion](#bit-diffusion) | -  |[Stuti R.](https://github.com/kingstut) |
-| K-Diffusion Stable Diffusion | Run Stable Diffusion with any of [K-Diffusion's samplers](https://github.com/crowsonkb/k-diffusion/blob/master/k_diffusion/sampling.py) | [Stable Diffusion with K Diffusion](#stable-diffusion-with-k-diffusion) | -  | [Patrick von Platen](https://github.com/patrickvonplaten/) |
-| Checkpoint Merger Pipeline | Diffusion Pipeline that enables merging of saved model checkpoints | [Checkpoint Merger Pipeline](#checkpoint-merger-pipeline)                   | -                                                                                                                                                                                                                  | [Naga Sai Abhinay Devarinti](https://github.com/Abhinay1997/) |



@@ -167,7 +160,7 @@ init_image = download_image("https://raw.githubusercontent.com/CompVis/stable-di

 prompt = "A fantasy landscape, trending on artstation"

-images = pipe.img2img(prompt=prompt, image=init_image, strength=0.75, guidance_scale=7.5).images
+images = pipe.img2img(prompt=prompt, init_image=init_image, strength=0.75, guidance_scale=7.5).images

 ### Inpainting

@@ -177,26 +170,15 @@ init_image = download_image(img_url).resize((512, 512))
 mask_image = download_image(mask_url).resize((512, 512))

 prompt = "a cat sitting on a bench"
-images = pipe.inpaint(prompt=prompt, image=init_image, mask_image=mask_image, strength=0.75).images
+images = pipe.inpaint(prompt=prompt, init_image=init_image, mask_image=mask_image, strength=0.75).images
 ```

 As shown above this one pipeline can run all both "text-to-image", "image-to-image", and "inpainting" in one pipeline.

 ### Long Prompt Weighting Stable Diffusion
-Features of this custom pipeline:
- Input a prompt without the 77 token length limit.
- Includes tx2img, img2img. and inpainting pipelines.
- Emphasize/weigh part of your prompt with parentheses as so: `a baby deer with (big eyes)`
- De-emphasize part of your prompt as so: `a [baby] deer with big eyes`
- Precisely weigh part of your prompt as so: `a baby deer with (big eyes:1.3)`

-Prompt weighting equivalents:
- `a baby deer with` == `(a baby deer with:1.0)`
- `(big eyes)` == `(big eyes:1.1)`
- `((big eyes))` == `(big eyes:1.21)`
- `[big eyes]` == `(big eyes:0.91)`
-
-You can run this custom pipeline as so:
+This pipeline lets you input a prompt without the 77-token length limit. You can increase the weighting of words by wrapping them in "()" or decrease it by wrapping them in "[]".
+The pipeline also covers the main use cases of the Stable Diffusion pipeline in a single class.

 #### pytorch

@@ -346,9 +328,8 @@ out = pipe(
 )
 ```

-### Composable Stable diffusion 

-[Composable Stable Diffusion](https://energy-based-model.github.io/Compositional-Visual-Generation-with-Composable-Diffusion-Models/) proposes conjunction and negation (negative prompts) operators for compositional generation with conditional diffusion models.
+### Composable Stable Diffusion

 ```python
 import torch as th
@@ -372,7 +353,7 @@ def dummy(images, **kwargs):
 pipe.safety_checker = dummy

 images = []
-generator = torch.Generator("cuda").manual_seed(0)
+generator = th.Generator("cuda").manual_seed(0)

 seed = 0
 prompt = "a forest | a camel"
@@ -392,49 +373,6 @@ for i in range(4):
 for i, img in enumerate(images):
    img.save(f"./composable_diffusion/image_{i}.png")
 ```
-
-### Imagic Stable Diffusion
-Allows you to edit an image using stable diffusion. 
-
-```python
-import requests
-from PIL import Image
-from io import BytesIO
-import torch
-import os
-from diffusers import DiffusionPipeline, DDIMScheduler
-has_cuda = torch.cuda.is_available()
-device = torch.device('cpu' if not has_cuda else 'cuda')
-pipe = DiffusionPipeline.from_pretrained(
-    "CompVis/stable-diffusion-v1-4",
-        safety_checker=None,
-    use_auth_token=True,
-    custom_pipeline="imagic_stable_diffusion",
-    scheduler = DDIMScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", clip_sample=False, set_alpha_to_one=False)
-).to(device)
-generator = torch.Generator("cuda").manual_seed(0)
-seed = 0
-prompt = "A photo of Barack Obama smiling with a big grin"
-url = 'https://www.dropbox.com/s/6tlwzr73jd1r9yk/obama.png?dl=1'
-response = requests.get(url)
-init_image = Image.open(BytesIO(response.content)).convert("RGB")
-init_image = init_image.resize((512, 512))
-res = pipe.train(
-    prompt,
-    image=init_image,
-    generator=generator)
-res = pipe(alpha=1, guidance_scale=7.5, num_inference_steps=50)
-os.makedirs("imagic", exist_ok=True)
-image = res.images[0]
-image.save('./imagic/imagic_image_alpha_1.png')
-res = pipe(alpha=1.5, guidance_scale=7.5, num_inference_steps=50)
-image = res.images[0]
-image.save('./imagic/imagic_image_alpha_1_5.png')
-res = pipe(alpha=2, guidance_scale=7.5, num_inference_steps=50)
-image = res.images[0]
-image.save('./imagic/imagic_image_alpha_2.png')
-```
-
 ### Seed Resizing 
 Test seed resizing. Originally generate an image in 512 by 512, then generate image with same seed at 512 by 592 using seed resizing. Finally, generate 512 by 592 using original stable diffusion pipeline.

@@ -518,259 +456,4 @@ res = pipe_compare(

 image = res.images[0]
 image.save('./seed_resize/seed_resize_{w}_{h}_image_compare.png'.format(w=width, h=height))
-```
-
-### Multilingual Stable Diffusion Pipeline
-
-The following code can generate an images from texts in different languages using the pre-trained [mBART-50 many-to-one multilingual machine translation model](https://huggingface.co/facebook/mbart-large-50-many-to-one-mmt) and Stable Diffusion.
-
-```python
-from PIL import Image
-
-import torch
-
-from diffusers import DiffusionPipeline
-from transformers import (
-    pipeline,
-    MBart50TokenizerFast,
-    MBartForConditionalGeneration,
-)
-device = "cuda" if torch.cuda.is_available() else "cpu"
-device_dict = {"cuda": 0, "cpu": -1}
-
-# helper function taken from: https://huggingface.co/blog/stable_diffusion
-def image_grid(imgs, rows, cols):
-    assert len(imgs) == rows*cols
-
-    w, h = imgs[0].size
-    grid = Image.new('RGB', size=(cols*w, rows*h))
-    grid_w, grid_h = grid.size
-
-    for i, img in enumerate(imgs):
-        grid.paste(img, box=(i%cols*w, i//cols*h))
-    return grid
-
-# Add language detection pipeline
-language_detection_model_ckpt = "papluca/xlm-roberta-base-language-detection"
-language_detection_pipeline = pipeline("text-classification",
-                                       model=language_detection_model_ckpt,
-                                       device=device_dict[device])
-
-# Add model for language translation
-trans_tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50-many-to-one-mmt")
-trans_model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-many-to-one-mmt").to(device)
-
-diffuser_pipeline = DiffusionPipeline.from_pretrained(
-    "CompVis/stable-diffusion-v1-4",
-    custom_pipeline="multilingual_stable_diffusion",
-    detection_pipeline=language_detection_pipeline,
-    translation_model=trans_model,
-    translation_tokenizer=trans_tokenizer,
-    revision="fp16",
-    torch_dtype=torch.float16,
-)
-
-diffuser_pipeline.enable_attention_slicing()
-diffuser_pipeline = diffuser_pipeline.to(device)
-
-prompt = ["a photograph of an astronaut riding a horse", 
-          "Una casa en la playa",
-          "Ein Hund, der Orange isst",
-          "Un restaurant parisien"]
-
-output = diffuser_pipeline(prompt)
-
-images = output.images
-
-grid = image_grid(images, rows=2, cols=2)
-```
-
-This example produces the following images:
-![image](https://user-images.githubusercontent.com/4313860/198328706-295824a4-9856-4ce5-8e66-278ceb42fd29.png)
-
-### Image to Image Inpainting Stable Diffusion
-
-Similar to the standard stable diffusion inpainting example, except with the addition of an `inner_image` argument.
-
-`image`, `inner_image`, and `mask` should have the same dimensions. `inner_image` should have an alpha (transparency) channel.
-
-The aim is to overlay two images, then mask out the boundary between `image` and `inner_image` to allow stable diffusion to make the connection more seamless.
-For example, this could be used to place a logo on a shirt and make it blend seamlessly.
-
-```python
-import PIL
-import torch
-
-from diffusers import DiffusionPipeline
-
-image_path = "./path-to-image.png"
-inner_image_path = "./path-to-inner-image.png"
-mask_path = "./path-to-mask.png"
-
-init_image = PIL.Image.open(image_path).convert("RGB").resize((512, 512))
-inner_image = PIL.Image.open(inner_image_path).convert("RGBA").resize((512, 512))
-mask_image = PIL.Image.open(mask_path).convert("RGB").resize((512, 512))
-
-pipe = DiffusionPipeline.from_pretrained(
-    "runwayml/stable-diffusion-inpainting",
-    custom_pipeline="img2img_inpainting",
-    revision="fp16",
-    torch_dtype=torch.float16
-)
-pipe = pipe.to("cuda")
-
-prompt = "Your prompt here!"
-image = pipe(prompt=prompt, image=init_image, inner_image=inner_image, mask_image=mask_image).images[0]
-```
-
-![2 by 2 grid demonstrating image to image inpainting.](https://user-images.githubusercontent.com/44398246/203506577-ec303be4-887e-4ebd-a773-c83fcb3dd01a.png)
-
-### Text Based Inpainting Stable Diffusion
-
-Use a text prompt to generate the mask for the area to be inpainted.
-Currently uses the CLIPSeg model for mask generation, then calls the standard Stable Diffusion Inpainting pipeline to perform the inpainting.
-
-```python
-from transformers import CLIPSegProcessor, CLIPSegForImageSegmentation
-from diffusers import DiffusionPipeline
-
-from PIL import Image
-import requests
-from torch import autocast
-
-processor = CLIPSegProcessor.from_pretrained("CIDAS/clipseg-rd64-refined")
-model = CLIPSegForImageSegmentation.from_pretrained("CIDAS/clipseg-rd64-refined")
-
-pipe = DiffusionPipeline.from_pretrained(
-    "runwayml/stable-diffusion-inpainting",
-    custom_pipeline="text_inpainting",
-    segmentation_model=model,
-    segmentation_processor=processor
-)
-pipe = pipe.to("cuda")
-
-
-url = "https://github.com/timojl/clipseg/blob/master/example_image.jpg?raw=true"
-image = Image.open(requests.get(url, stream=True).raw).resize((512, 512))
-text = "a glass"  # will mask out this text
-prompt = "a cup"  # the masked out region will be replaced with this
-
-with autocast("cuda"):
-    image = pipe(image=image, text=text, prompt=prompt).images[0]
-```
-
-### Bit Diffusion 
-Based https://arxiv.org/abs/2208.04202, this is used for diffusion on discrete data - eg, discreate image data, DNA sequence data. An unconditional discreate image can be generated like this: 
-
-```python
-from diffusers import DiffusionPipeline
-pipe = DiffusionPipeline.from_pretrained("google/ddpm-cifar10-32", custom_pipeline="bit_diffusion")
-image = pipe().images[0]
-
-```
-
-### Stable Diffusion with K Diffusion
-
-Make sure you have @crowsonkb's https://github.com/crowsonkb/k-diffusion installed:
-
-```
-pip install k-diffusion
-```
-
-You can use the community pipeline as follows:
-
-```python
-from diffusers import DiffusionPipeline
-
-pipe = DiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", custom_pipeline="sd_text2img_k_diffusion")
-pipe = pipe.to("cuda")
-
-prompt = "an astronaut riding a horse on mars"
-pipe.set_scheduler("sample_heun")
-generator = torch.Generator(device="cuda").manual_seed(seed)
-image = pipe(prompt, generator=generator, num_inference_steps=20).images[0]
-
-image.save("./astronaut_heun_k_diffusion.png")
-```
-
-To make sure that K Diffusion and `diffusers` yield the same results:
-
-**Diffusers**:
-```python
-from diffusers import DiffusionPipeline, EulerDiscreteScheduler
-
-seed = 33
-
-pipe = DiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4")
-pipe.scheduler = EulerDiscreteScheduler.from_config(pipe.scheduler.config)
-pipe = pipe.to("cuda")
-
-generator = torch.Generator(device="cuda").manual_seed(seed)
-image = pipe(prompt, generator=generator, num_inference_steps=50).images[0]
-```
-
-![diffusers_euler](https://huggingface.co/datasets/patrickvonplaten/images/resolve/main/k_diffusion/astronaut_euler.png)
-
-**K Diffusion**:
-```python
-from diffusers import DiffusionPipeline, EulerDiscreteScheduler
-
-seed = 33
-
-pipe = DiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", custom_pipeline="sd_text2img_k_diffusion")
-pipe.scheduler = EulerDiscreteScheduler.from_config(pipe.scheduler.config)
-pipe = pipe.to("cuda")
-
-pipe.set_scheduler("sample_euler")
-generator = torch.Generator(device="cuda").manual_seed(seed)
-image = pipe(prompt, generator=generator, num_inference_steps=50).images[0]
-```
-
-![diffusers_euler](https://huggingface.co/datasets/patrickvonplaten/images/resolve/main/k_diffusion/astronaut_euler_k_diffusion.png)
-
-### Checkpoint Merger Pipeline
-Based on the AUTOMATIC1111/webui for checkpoint merging. This is a custom pipeline that merges upto 3 pretrained model checkpoints as long as they are in the HuggingFace model_index.json format.
-
-The checkpoint merging is currently memory intensive as it modifies the weights of a DiffusionPipeline object in place. Expect atleast 13GB RAM Usage on Kaggle GPU kernels and
-on colab you might run out of the 12GB memory even while merging two checkpoints.
-
-Usage:-
-```python
-from diffusers import DiffusionPipeline
-
-#Return a CheckpointMergerPipeline class that allows you to merge checkpoints. 
-#The checkpoint passed here is ignored. But still pass one of the checkpoints you plan to 
-#merge for convenience
-pipe = DiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", custom_pipeline="checkpoint_merger")
-
-#There are multiple possible scenarios:
-#The pipeline with the merged checkpoints is returned in all the scenarios
-
-#Compatible checkpoints a.k.a matched model_index.json files. Ignores the meta attributes in model_index.json during comparision.( attrs with _ as prefix )
-merged_pipe = pipe.merge(["CompVis/stable-diffusion-v1-4","CompVis/stable-diffusion-v1-2"], interp = "sigmoid", alpha = 0.4)
-
-#Incompatible checkpoints in model_index.json but merge might be possible. Use force = True to ignore model_index.json compatibility
-merged_pipe_1 = pipe.merge(["CompVis/stable-diffusion-v1-4","hakurei/waifu-diffusion"], force = True, interp = "sigmoid", alpha = 0.4)
-
-#Three checkpoint merging. Only "add_difference" method actually works on all three checkpoints. Using any other options will ignore the 3rd checkpoint.
-merged_pipe_2 = pipe.merge(["CompVis/stable-diffusion-v1-4","hakurei/waifu-diffusion","prompthero/openjourney"], force = True, interp = "add_difference", alpha = 0.4)
-
-prompt = "An astronaut riding a horse on Mars"
-
-image = merged_pipe(prompt).images[0]
-
-```
-Some examples along with the merge details:
-
-1. "CompVis/stable-diffusion-v1-4" + "hakurei/waifu-diffusion" ; Sigmoid interpolation; alpha = 0.8 
-
-![Stable plus Waifu Sigmoid 0.8](https://huggingface.co/datasets/NagaSaiAbhinay/CheckpointMergerSamples/resolve/main/stability_v1_4_waifu_sig_0.8.png)
-
-2. "hakurei/waifu-diffusion" + "prompthero/openjourney" ; Inverse Sigmoid interpolation; alpha = 0.8 
-
-![Stable plus Waifu Sigmoid 0.8](https://huggingface.co/datasets/NagaSaiAbhinay/CheckpointMergerSamples/resolve/main/waifu_openjourney_inv_sig_0.8.png)
-
-
-3. "CompVis/stable-diffusion-v1-4" + "hakurei/waifu-diffusion" + "prompthero/openjourney"; Add Difference interpolation; alpha = 0.5 
-
-![Stable plus Waifu plus openjourney add_diff 0.5](https://huggingface.co/datasets/NagaSaiAbhinay/CheckpointMergerSamples/resolve/main/stable_waifu_openjourney_add_diff_0.5.png)
+```
--- a/examples/community/bit_diffusion.py
+++ b/examples/community/bit_diffusion.py
@@ -1,265 +0,0 @@
-from typing import Optional, Tuple, Union
-
-import torch
-
-from diffusers import DDIMScheduler, DDPMScheduler, DiffusionPipeline, UNet2DConditionModel
-from diffusers.pipeline_utils import ImagePipelineOutput
-from diffusers.schedulers.scheduling_ddim import DDIMSchedulerOutput
-from diffusers.schedulers.scheduling_ddpm import DDPMSchedulerOutput
-from einops import rearrange, reduce
-
-
-BITS = 8
-
-
-# convert to bit representations and back taken from https://github.com/lucidrains/bit-diffusion/blob/main/bit_diffusion/bit_diffusion.py
-def decimal_to_bits(x, bits=BITS):
-    """expects image tensor ranging from 0 to 1, outputs bit tensor ranging from -1 to 1"""
-    device = x.device
-
-    x = (x * 255).int().clamp(0, 255)
-
-    mask = 2 ** torch.arange(bits - 1, -1, -1, device=device)
-    mask = rearrange(mask, "d -> d 1 1")
-    x = rearrange(x, "b c h w -> b c 1 h w")
-
-    bits = ((x & mask) != 0).float()
-    bits = rearrange(bits, "b c d h w -> b (c d) h w")
-    bits = bits * 2 - 1
-    return bits
-
-
-def bits_to_decimal(x, bits=BITS):
-    """expects bits from -1 to 1, outputs image tensor from 0 to 1"""
-    device = x.device
-
-    x = (x > 0).int()
-    mask = 2 ** torch.arange(bits - 1, -1, -1, device=device, dtype=torch.int32)
-
-    mask = rearrange(mask, "d -> d 1 1")
-    x = rearrange(x, "b (c d) h w -> b c d h w", d=8)
-    dec = reduce(x * mask, "b c d h w -> b c h w", "sum")
-    return (dec / 255).clamp(0.0, 1.0)
-
-
-# modified scheduler step functions for clamping the predicted x_0 between -bit_scale and +bit_scale
-def ddim_bit_scheduler_step(
-    self,
-    model_output: torch.FloatTensor,
-    timestep: int,
-    sample: torch.FloatTensor,
-    eta: float = 0.0,
-    use_clipped_model_output: bool = True,
-    generator=None,
-    return_dict: bool = True,
-) -> Union[DDIMSchedulerOutput, Tuple]:
-    """
-    Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion
-    process from the learned model outputs (most often the predicted noise).
-    Args:
-        model_output (`torch.FloatTensor`): direct output from learned diffusion model.
-        timestep (`int`): current discrete timestep in the diffusion chain.
-        sample (`torch.FloatTensor`):
-            current instance of sample being created by diffusion process.
-        eta (`float`): weight of noise for added noise in diffusion step.
-        use_clipped_model_output (`bool`): TODO
-        generator: random number generator.
-        return_dict (`bool`): option for returning tuple rather than DDIMSchedulerOutput class
-    Returns:
-        [`~schedulers.scheduling_utils.DDIMSchedulerOutput`] or `tuple`:
-        [`~schedulers.scheduling_utils.DDIMSchedulerOutput`] if `return_dict` is True, otherwise a `tuple`. When
-        returning a tuple, the first element is the sample tensor.
-    """
-    if self.num_inference_steps is None:
-        raise ValueError(
-            "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler"
-        )
-
-    # See formulas (12) and (16) of DDIM paper https://arxiv.org/pdf/2010.02502.pdf
-    # Ideally, read DDIM paper in-detail understanding
-
-    # Notation (<variable name> -> <name in paper>
-    # - pred_noise_t -> e_theta(x_t, t)
-    # - pred_original_sample -> f_theta(x_t, t) or x_0
-    # - std_dev_t -> sigma_t
-    # - eta -> η
-    # - pred_sample_direction -> "direction pointing to x_t"
-    # - pred_prev_sample -> "x_t-1"
-
-    # 1. get previous step value (=t-1)
-    prev_timestep = timestep - self.config.num_train_timesteps // self.num_inference_steps
-
-    # 2. compute alphas, betas
-    alpha_prod_t = self.alphas_cumprod[timestep]
-    alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod
-
-    beta_prod_t = 1 - alpha_prod_t
-
-    # 3. compute predicted original sample from predicted noise also called
-    # "predicted x_0" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf
-    pred_original_sample = (sample - beta_prod_t ** (0.5) * model_output) / alpha_prod_t ** (0.5)
-
-    # 4. Clip "predicted x_0"
-    scale = self.bit_scale
-    if self.config.clip_sample:
-        pred_original_sample = torch.clamp(pred_original_sample, -scale, scale)
-
-    # 5. compute variance: "sigma_t(η)" -> see formula (16)
-    # σ_t = sqrt((1 − α_t−1)/(1 − α_t)) * sqrt(1 − α_t/α_t−1)
-    variance = self._get_variance(timestep, prev_timestep)
-    std_dev_t = eta * variance ** (0.5)
-
-    if use_clipped_model_output:
-        # the model_output is always re-derived from the clipped x_0 in Glide
-        model_output = (sample - alpha_prod_t ** (0.5) * pred_original_sample) / beta_prod_t ** (0.5)
-
-    # 6. compute "direction pointing to x_t" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf
-    pred_sample_direction = (1 - alpha_prod_t_prev - std_dev_t**2) ** (0.5) * model_output
-
-    # 7. compute x_t without "random noise" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf
-    prev_sample = alpha_prod_t_prev ** (0.5) * pred_original_sample + pred_sample_direction
-
-    if eta > 0:
-        # randn_like does not support generator https://github.com/pytorch/pytorch/issues/27072
-        device = model_output.device if torch.is_tensor(model_output) else "cpu"
-        noise = torch.randn(model_output.shape, dtype=model_output.dtype, generator=generator).to(device)
-        variance = self._get_variance(timestep, prev_timestep) ** (0.5) * eta * noise
-
-        prev_sample = prev_sample + variance
-
-    if not return_dict:
-        return (prev_sample,)
-
-    return DDIMSchedulerOutput(prev_sample=prev_sample, pred_original_sample=pred_original_sample)
-
-
-def ddpm_bit_scheduler_step(
-    self,
-    model_output: torch.FloatTensor,
-    timestep: int,
-    sample: torch.FloatTensor,
-    prediction_type="epsilon",
-    generator=None,
-    return_dict: bool = True,
-) -> Union[DDPMSchedulerOutput, Tuple]:
-    """
-    Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion
-    process from the learned model outputs (most often the predicted noise).
-    Args:
-        model_output (`torch.FloatTensor`): direct output from learned diffusion model.
-        timestep (`int`): current discrete timestep in the diffusion chain.
-        sample (`torch.FloatTensor`):
-            current instance of sample being created by diffusion process.
-        prediction_type (`str`, default `epsilon`):
-            indicates whether the model predicts the noise (epsilon), or the samples (`sample`).
-        generator: random number generator.
-        return_dict (`bool`): option for returning tuple rather than DDPMSchedulerOutput class
-    Returns:
-        [`~schedulers.scheduling_utils.DDPMSchedulerOutput`] or `tuple`:
-        [`~schedulers.scheduling_utils.DDPMSchedulerOutput`] if `return_dict` is True, otherwise a `tuple`. When
-        returning a tuple, the first element is the sample tensor.
-    """
-    t = timestep
-
-    if model_output.shape[1] == sample.shape[1] * 2 and self.variance_type in ["learned", "learned_range"]:
-        model_output, predicted_variance = torch.split(model_output, sample.shape[1], dim=1)
-    else:
-        predicted_variance = None
-
-    # 1. compute alphas, betas
-    alpha_prod_t = self.alphas_cumprod[t]
-    alpha_prod_t_prev = self.alphas_cumprod[t - 1] if t > 0 else self.one
-    beta_prod_t = 1 - alpha_prod_t
-    beta_prod_t_prev = 1 - alpha_prod_t_prev
-
-    # 2. compute predicted original sample from predicted noise also called
-    # "predicted x_0" of formula (15) from https://arxiv.org/pdf/2006.11239.pdf
-    if prediction_type == "epsilon":
-        pred_original_sample = (sample - beta_prod_t ** (0.5) * model_output) / alpha_prod_t ** (0.5)
-    elif prediction_type == "sample":
-        pred_original_sample = model_output
-    else:
-        raise ValueError(f"Unsupported prediction_type {prediction_type}.")
-
-    # 3. Clip "predicted x_0"
-    scale = self.bit_scale
-    if self.config.clip_sample:
-        pred_original_sample = torch.clamp(pred_original_sample, -scale, scale)
-
-    # 4. Compute coefficients for pred_original_sample x_0 and current sample x_t
-    # See formula (7) from https://arxiv.org/pdf/2006.11239.pdf
-    pred_original_sample_coeff = (alpha_prod_t_prev ** (0.5) * self.betas[t]) / beta_prod_t
-    current_sample_coeff = self.alphas[t] ** (0.5) * beta_prod_t_prev / beta_prod_t
-
-    # 5. Compute predicted previous sample µ_t
-    # See formula (7) from https://arxiv.org/pdf/2006.11239.pdf
-    pred_prev_sample = pred_original_sample_coeff * pred_original_sample + current_sample_coeff * sample
-
-    # 6. Add noise
-    variance = 0
-    if t > 0:
-        noise = torch.randn(
-            model_output.size(), dtype=model_output.dtype, layout=model_output.layout, generator=generator
-        ).to(model_output.device)
-        variance = (self._get_variance(t, predicted_variance=predicted_variance) ** 0.5) * noise
-
-    pred_prev_sample = pred_prev_sample + variance
-
-    if not return_dict:
-        return (pred_prev_sample,)
-
-    return DDPMSchedulerOutput(prev_sample=pred_prev_sample, pred_original_sample=pred_original_sample)
-
-
-class BitDiffusion(DiffusionPipeline):
-    def __init__(
-        self,
-        unet: UNet2DConditionModel,
-        scheduler: Union[DDIMScheduler, DDPMScheduler],
-        bit_scale: Optional[float] = 1.0,
-    ):
-        super().__init__()
-        self.bit_scale = bit_scale
-        self.scheduler.step = (
-            ddim_bit_scheduler_step if isinstance(scheduler, DDIMScheduler) else ddpm_bit_scheduler_step
-        )
-
-        self.register_modules(unet=unet, scheduler=scheduler)
-
-    @torch.no_grad()
-    def __call__(
-        self,
-        height: Optional[int] = 256,
-        width: Optional[int] = 256,
-        num_inference_steps: Optional[int] = 50,
-        generator: Optional[torch.Generator] = None,
-        batch_size: Optional[int] = 1,
-        output_type: Optional[str] = "pil",
-        return_dict: bool = True,
-        **kwargs,
-    ) -> Union[Tuple, ImagePipelineOutput]:
-        latents = torch.randn(
-            (batch_size, self.unet.in_channels, height, width),
-            generator=generator,
-        )
-        latents = decimal_to_bits(latents) * self.bit_scale
-        latents = latents.to(self.device)
-
-        self.scheduler.set_timesteps(num_inference_steps)
-
-        for t in self.progress_bar(self.scheduler.timesteps):
-            # predict the noise residual
-            noise_pred = self.unet(latents, t).sample
-
-            # compute the previous noisy sample x_t -> x_t-1
-            latents = self.scheduler.step(noise_pred, t, latents).prev_sample
-
-        image = bits_to_decimal(latents)
-
-        if output_type == "pil":
-            image = self.numpy_to_pil(image)
-
-        if not return_dict:
-            return (image,)
-
-        return ImagePipelineOutput(images=image)
--- a/examples/community/checkpoint_merger.py
+++ b/examples/community/checkpoint_merger.py
@@ -1,262 +0,0 @@
-import glob
-import os
-from typing import Dict, List, Union
-
-import torch
-
-from diffusers import DiffusionPipeline, __version__
-from diffusers.pipeline_utils import (
-    CONFIG_NAME,
-    DIFFUSERS_CACHE,
-    ONNX_WEIGHTS_NAME,
-    SCHEDULER_CONFIG_NAME,
-    WEIGHTS_NAME,
-)
-from huggingface_hub import snapshot_download
-
-
-class CheckpointMergerPipeline(DiffusionPipeline):
-    """
-    A class that that supports merging diffusion models based on the discussion here:
-    https://github.com/huggingface/diffusers/issues/877
-
-    Example usage:-
-
-    pipe = DiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", custom_pipeline="checkpoint_merger.py")
-
-    merged_pipe = pipe.merge(["CompVis/stable-diffusion-v1-4","prompthero/openjourney"], interp = 'inv_sigmoid', alpha = 0.8, force = True)
-
-    merged_pipe.to('cuda')
-
-    prompt = "An astronaut riding a unicycle on Mars"
-
-    results = merged_pipe(prompt)
-
-    ## For more details, see the docstring for the merge method.
-
-    """
-
-    def __init__(self):
-        super().__init__()
-
-    def _compare_model_configs(self, dict0, dict1):
-        if dict0 == dict1:
-            return True
-        else:
-            config0, meta_keys0 = self._remove_meta_keys(dict0)
-            config1, meta_keys1 = self._remove_meta_keys(dict1)
-            if config0 == config1:
-                print(f"Warning !: Mismatch in keys {meta_keys0} and {meta_keys1}.")
-                return True
-        return False
-
-    def _remove_meta_keys(self, config_dict: Dict):
-        meta_keys = []
-        temp_dict = config_dict.copy()
-        for key in config_dict.keys():
-            if key.startswith("_"):
-                temp_dict.pop(key)
-                meta_keys.append(key)
-        return (temp_dict, meta_keys)
-
-    @torch.no_grad()
-    def merge(self, pretrained_model_name_or_path_list: List[Union[str, os.PathLike]], **kwargs):
-        """
-        Returns a new pipeline object of the class 'DiffusionPipeline' with the merged checkpoints(weights) of the models passed
-        in the argument 'pretrained_model_name_or_path_list' as a list.
-
-        Parameters:
-        -----------
-            pretrained_model_name_or_path_list : A list of valid pretrained model names in the HuggingFace hub or paths to locally stored models in the HuggingFace format.
-
-            **kwargs:
-                Supports all the default DiffusionPipeline.get_config_dict kwargs viz..
-
-                cache_dir, resume_download, force_download, proxies, local_files_only, use_auth_token, revision, torch_dtype, device_map.
-
-                alpha - The interpolation parameter. Ranges from 0 to 1.  It affects the ratio in which the checkpoints are merged. A 0.8 alpha
-                    would mean that the first model checkpoints would affect the final result far less than an alpha of 0.2
-
-                interp - The interpolation method to use for the merging. Supports "sigmoid", "inv_sigmoid", "add_difference" and None.
-                    Passing None uses the default interpolation which is weighted sum interpolation. For merging three checkpoints, only "add_difference" is supported.
-
-                force - Whether to ignore mismatch in model_config.json for the current models. Defaults to False.
-
-        """
-        # Default kwargs from DiffusionPipeline
-        cache_dir = kwargs.pop("cache_dir", DIFFUSERS_CACHE)
-        resume_download = kwargs.pop("resume_download", False)
-        force_download = kwargs.pop("force_download", False)
-        proxies = kwargs.pop("proxies", None)
-        local_files_only = kwargs.pop("local_files_only", False)
-        use_auth_token = kwargs.pop("use_auth_token", None)
-        revision = kwargs.pop("revision", None)
-        torch_dtype = kwargs.pop("torch_dtype", None)
-        device_map = kwargs.pop("device_map", None)
-
-        alpha = kwargs.pop("alpha", 0.5)
-        interp = kwargs.pop("interp", None)
-
-        print("Recieved list", pretrained_model_name_or_path_list)
-
-        checkpoint_count = len(pretrained_model_name_or_path_list)
-        # Ignore result from model_index_json comparision of the two checkpoints
-        force = kwargs.pop("force", False)
-
-        # If less than 2 checkpoints, nothing to merge. If more than 3, not supported for now.
-        if checkpoint_count > 3 or checkpoint_count < 2:
-            raise ValueError(
-                "Received incorrect number of checkpoints to merge. Ensure that either 2 or 3 checkpoints are being"
-                " passed."
-            )
-
-        print("Received the right number of checkpoints")
-        # chkpt0, chkpt1 = pretrained_model_name_or_path_list[0:2]
-        # chkpt2 = pretrained_model_name_or_path_list[2] if checkpoint_count == 3 else None
-
-        # Validate that the checkpoints can be merged
-        # Step 1: Load the model config and compare the checkpoints. We'll compare the model_index.json first while ignoring the keys starting with '_'
-        config_dicts = []
-        for pretrained_model_name_or_path in pretrained_model_name_or_path_list:
-            if not os.path.isdir(pretrained_model_name_or_path):
-                config_dict = DiffusionPipeline.get_config_dict(
-                    pretrained_model_name_or_path,
-                    cache_dir=cache_dir,
-                    resume_download=resume_download,
-                    force_download=force_download,
-                    proxies=proxies,
-                    local_files_only=local_files_only,
-                    use_auth_token=use_auth_token,
-                    revision=revision,
-                )
-            config_dicts.append(config_dict)
-
-        comparison_result = True
-        for idx in range(1, len(config_dicts)):
-            comparison_result &= self._compare_model_configs(config_dicts[idx - 1], config_dicts[idx])
-            if not force and comparison_result is False:
-                raise ValueError("Incompatible checkpoints. Please check model_index.json for the models.")
-                print(config_dicts[0], config_dicts[1])
-        print("Compatible model_index.json files found")
-        # Step 2: Basic Validation has succeeded. Let's download the models and save them into our local files.
-        cached_folders = []
-        for pretrained_model_name_or_path, config_dict in zip(pretrained_model_name_or_path_list, config_dicts):
-            folder_names = [k for k in config_dict.keys() if not k.startswith("_")]
-            allow_patterns = [os.path.join(k, "*") for k in folder_names]
-            allow_patterns += [
-                WEIGHTS_NAME,
-                SCHEDULER_CONFIG_NAME,
-                CONFIG_NAME,
-                ONNX_WEIGHTS_NAME,
-                DiffusionPipeline.config_name,
-            ]
-            requested_pipeline_class = config_dict.get("_class_name")
-            user_agent = {"diffusers": __version__, "pipeline_class": requested_pipeline_class}
-
-            cached_folder = snapshot_download(
-                pretrained_model_name_or_path,
-                cache_dir=cache_dir,
-                resume_download=resume_download,
-                proxies=proxies,
-                local_files_only=local_files_only,
-                use_auth_token=use_auth_token,
-                revision=revision,
-                allow_patterns=allow_patterns,
-                user_agent=user_agent,
-            )
-            print("Cached Folder", cached_folder)
-            cached_folders.append(cached_folder)
-
-        # Step 3:-
-        # Load the first checkpoint as a diffusion pipeline and modify it's module state_dict in place
-        final_pipe = DiffusionPipeline.from_pretrained(
-            cached_folders[0], torch_dtype=torch_dtype, device_map=device_map
-        )
-
-        checkpoint_path_2 = None
-        if len(cached_folders) > 2:
-            checkpoint_path_2 = os.path.join(cached_folders[2])
-
-        if interp == "sigmoid":
-            theta_func = CheckpointMergerPipeline.sigmoid
-        elif interp == "inv_sigmoid":
-            theta_func = CheckpointMergerPipeline.inv_sigmoid
-        elif interp == "add_diff":
-            theta_func = CheckpointMergerPipeline.add_difference
-        else:
-            theta_func = CheckpointMergerPipeline.weighted_sum
-
-        # Find each module's state dict.
-        for attr in final_pipe.config.keys():
-            if not attr.startswith("_"):
-                checkpoint_path_1 = os.path.join(cached_folders[1], attr)
-                if os.path.exists(checkpoint_path_1):
-                    files = glob.glob(os.path.join(checkpoint_path_1, "*.bin"))
-                    checkpoint_path_1 = files[0] if len(files) > 0 else None
-                if checkpoint_path_2 is not None and os.path.exists(checkpoint_path_2):
-                    files = glob.glob(os.path.join(checkpoint_path_2, "*.bin"))
-                    checkpoint_path_2 = files[0] if len(files) > 0 else None
-                # For an attr if both checkpoint_path_1 and 2 are None, ignore.
-                # If atleast one is present, deal with it according to interp method, of course only if the state_dict keys match.
-                if checkpoint_path_1 is None and checkpoint_path_2 is None:
-                    print("SKIPPING ATTR ", attr)
-                    continue
-                try:
-                    module = getattr(final_pipe, attr)
-                    theta_0 = getattr(module, "state_dict")
-                    theta_0 = theta_0()
-
-                    update_theta_0 = getattr(module, "load_state_dict")
-                    theta_1 = torch.load(checkpoint_path_1)
-
-                    theta_2 = torch.load(checkpoint_path_2) if checkpoint_path_2 else None
-
-                    if not theta_0.keys() == theta_1.keys():
-                        print("SKIPPING ATTR ", attr, " DUE TO MISMATCH")
-                        continue
-                    if theta_2 and not theta_1.keys() == theta_2.keys():
-                        print("SKIPPING ATTR ", attr, " DUE TO MISMATCH")
-                except:
-                    print("SKIPPING ATTR ", attr)
-                    continue
-                print("Found dicts for")
-                print(attr)
-                print(checkpoint_path_1)
-                print(checkpoint_path_2)
-
-                for key in theta_0.keys():
-                    if theta_2:
-                        theta_0[key] = theta_func(theta_0[key], theta_1[key], theta_2[key], alpha)
-                    else:
-                        theta_0[key] = theta_func(theta_0[key], theta_1[key], None, alpha)
-
-                del theta_1
-                del theta_2
-                update_theta_0(theta_0)
-
-                del theta_0
-                print("Diffusion pipeline successfully updated with merged weights")
-
-        return final_pipe
-
-    @staticmethod
-    def weighted_sum(theta0, theta1, theta2, alpha):
-        return ((1 - alpha) * theta0) + (alpha * theta1)
-
-    # Smoothstep (https://en.wikipedia.org/wiki/Smoothstep)
-    @staticmethod
-    def sigmoid(theta0, theta1, theta2, alpha):
-        alpha = alpha * alpha * (3 - (2 * alpha))
-        return theta0 + ((theta1 - theta0) * alpha)
-
-    # Inverse Smoothstep (https://en.wikipedia.org/wiki/Smoothstep)
-    @staticmethod
-    def inv_sigmoid(theta0, theta1, theta2, alpha):
-        import math
-
-        alpha = 0.5 - math.sin(math.asin(1.0 - 2.0 * alpha) / 3.0)
-        return theta0 + ((theta1 - theta0) * alpha)
-
-    @staticmethod
-    def add_difference(theta0, theta1, theta2, alpha):
-        return theta0 + (theta1 - theta2) * (1.0 - alpha)
--- a/examples/community/clip_guided_stable_diffusion.py
+++ b/examples/community/clip_guided_stable_diffusion.py
@@ -5,14 +5,7 @@ import torch
 from torch import nn
 from torch.nn import functional as F

-from diffusers import (
-    AutoencoderKL,
-    DDIMScheduler,
-    DiffusionPipeline,
-    LMSDiscreteScheduler,
-    PNDMScheduler,
-    UNet2DConditionModel,
-)
+from diffusers import AutoencoderKL, DiffusionPipeline, LMSDiscreteScheduler, PNDMScheduler, UNet2DConditionModel
 from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import StableDiffusionPipelineOutput
 from torchvision import transforms
 from transformers import CLIPFeatureExtractor, CLIPModel, CLIPTextModel, CLIPTokenizer
@@ -63,7 +56,7 @@ class CLIPGuidedStableDiffusion(DiffusionPipeline):
        clip_model: CLIPModel,
        tokenizer: CLIPTokenizer,
        unet: UNet2DConditionModel,
-        scheduler: Union[PNDMScheduler, LMSDiscreteScheduler, DDIMScheduler],
+        scheduler: Union[PNDMScheduler, LMSDiscreteScheduler],
        feature_extractor: CLIPFeatureExtractor,
    ):
        super().__init__()
@@ -78,12 +71,7 @@ class CLIPGuidedStableDiffusion(DiffusionPipeline):
        )

        self.normalize = transforms.Normalize(mean=feature_extractor.image_mean, std=feature_extractor.image_std)
-        cut_out_size = (
-            feature_extractor.size
-            if isinstance(feature_extractor.size, int)
-            else feature_extractor.size["shortest_edge"]
-        )
-        self.make_cutouts = MakeCutouts(cut_out_size)
+        self.make_cutouts = MakeCutouts(feature_extractor.size)

        set_requires_grad(self.text_encoder, False)
        set_requires_grad(self.clip_model, False)
@@ -135,7 +123,7 @@ class CLIPGuidedStableDiffusion(DiffusionPipeline):
        # predict the noise residual
        noise_pred = self.unet(latent_model_input, timestep, encoder_hidden_states=text_embeddings).sample

-        if isinstance(self.scheduler, (PNDMScheduler, DDIMScheduler)):
+        if isinstance(self.scheduler, PNDMScheduler):
            alpha_prod_t = self.scheduler.alphas_cumprod[timestep]
            beta_prod_t = 1 - alpha_prod_t
            # compute predicted original sample from predicted noise also called
@@ -188,7 +176,6 @@ class CLIPGuidedStableDiffusion(DiffusionPipeline):
        num_inference_steps: Optional[int] = 50,
        guidance_scale: Optional[float] = 7.5,
        num_images_per_prompt: Optional[int] = 1,
-        eta: float = 0.0,
        clip_guidance_scale: Optional[float] = 100,
        clip_prompt: Optional[Union[str, List[str]]] = None,
        num_cutouts: Optional[int] = 4,
@@ -288,20 +275,6 @@ class CLIPGuidedStableDiffusion(DiffusionPipeline):
        # scale the initial noise by the standard deviation required by the scheduler
        latents = latents * self.scheduler.init_noise_sigma

-        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
-        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
-        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
-        # and should be between [0, 1]
-        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
-        extra_step_kwargs = {}
-        if accepts_eta:
-            extra_step_kwargs["eta"] = eta
-
-        # check if the scheduler accepts generator
-        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
-        if accepts_generator:
-            extra_step_kwargs["generator"] = generator
-
        for i, t in enumerate(self.progress_bar(timesteps_tensor)):
            # expand the latents if we are doing classifier free guidance
            latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
@@ -333,7 +306,7 @@ class CLIPGuidedStableDiffusion(DiffusionPipeline):
                )

            # compute the previous noisy sample x_t -> x_t-1
-            latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
+            latents = self.scheduler.step(noise_pred, t, latents).prev_sample

        # scale and decode the image latents with vae
        latents = 1 / 0.18215 * latents
--- a/examples/community/composable_stable_diffusion.py
+++ b/examples/community/composable_stable_diffusion.py
@@ -32,7 +32,7 @@ class ComposableStableDiffusionPipeline(DiffusionPipeline):
            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
        unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
        scheduler ([`SchedulerMixin`]):
-            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of
            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
        safety_checker ([`StableDiffusionSafetyChecker`]):
            Classification module that estimates whether generated images could be considered offsensive or harmful.
--- a/examples/community/imagic_stable_diffusion.py
+++ b/examples/community/imagic_stable_diffusion.py
@@ -1,501 +0,0 @@
-"""
-    modeled after the textual_inversion.py / train_dreambooth.py and the work
-    of justinpinkney here: https://github.com/justinpinkney/stable-diffusion/blob/main/notebooks/imagic.ipynb
-"""
-import inspect
-import warnings
-from typing import List, Optional, Union
-
-import numpy as np
-import torch
-import torch.nn.functional as F
-
-import PIL
-from accelerate import Accelerator
-from diffusers.models import AutoencoderKL, UNet2DConditionModel
-from diffusers.pipeline_utils import DiffusionPipeline
-from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
-from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
-from diffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
-from diffusers.utils import deprecate, logging
-
-# TODO: remove and import from diffusers.utils when the new version of diffusers is released
-from packaging import version
-from tqdm.auto import tqdm
-from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
-
-
-if version.parse(version.parse(PIL.__version__).base_version) >= version.parse("9.1.0"):
-    PIL_INTERPOLATION = {
-        "linear": PIL.Image.Resampling.BILINEAR,
-        "bilinear": PIL.Image.Resampling.BILINEAR,
-        "bicubic": PIL.Image.Resampling.BICUBIC,
-        "lanczos": PIL.Image.Resampling.LANCZOS,
-        "nearest": PIL.Image.Resampling.NEAREST,
-    }
-else:
-    PIL_INTERPOLATION = {
-        "linear": PIL.Image.LINEAR,
-        "bilinear": PIL.Image.BILINEAR,
-        "bicubic": PIL.Image.BICUBIC,
-        "lanczos": PIL.Image.LANCZOS,
-        "nearest": PIL.Image.NEAREST,
-    }
-# ------------------------------------------------------------------------------
-
-logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
-
-
-def preprocess(image):
-    w, h = image.size
-    w, h = map(lambda x: x - x % 32, (w, h))  # resize to integer multiple of 32
-    image = image.resize((w, h), resample=PIL_INTERPOLATION["lanczos"])
-    image = np.array(image).astype(np.float32) / 255.0
-    image = image[None].transpose(0, 3, 1, 2)
-    image = torch.from_numpy(image)
-    return 2.0 * image - 1.0
-
-
-class ImagicStableDiffusionPipeline(DiffusionPipeline):
-    r"""
-    Pipeline for imagic image editing.
-    See paper here: https://arxiv.org/pdf/2210.09276.pdf
-
-    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
-    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
-    Args:
-        vae ([`AutoencoderKL`]):
-            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
-        text_encoder ([`CLIPTextModel`]):
-            Frozen text-encoder. Stable Diffusion uses the text portion of
-            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
-            the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
-        tokenizer (`CLIPTokenizer`):
-            Tokenizer of class
-            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
-        unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
-        scheduler ([`SchedulerMixin`]):
-            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
-            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
-        safety_checker ([`StableDiffusionSafetyChecker`]):
-            Classification module that estimates whether generated images could be considered offsensive or harmful.
-            Please, refer to the [model card](https://huggingface.co/CompVis/stable-diffusion-v1-4) for details.
-        feature_extractor ([`CLIPFeatureExtractor`]):
-            Model that extracts features from generated images to be used as inputs for the `safety_checker`.
-    """
-
-    def __init__(
-        self,
-        vae: AutoencoderKL,
-        text_encoder: CLIPTextModel,
-        tokenizer: CLIPTokenizer,
-        unet: UNet2DConditionModel,
-        scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler],
-        safety_checker: StableDiffusionSafetyChecker,
-        feature_extractor: CLIPFeatureExtractor,
-    ):
-        super().__init__()
-        self.register_modules(
-            vae=vae,
-            text_encoder=text_encoder,
-            tokenizer=tokenizer,
-            unet=unet,
-            scheduler=scheduler,
-            safety_checker=safety_checker,
-            feature_extractor=feature_extractor,
-        )
-
-    def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"):
-        r"""
-        Enable sliced attention computation.
-        When this option is enabled, the attention module will split the input tensor in slices, to compute attention
-        in several steps. This is useful to save some memory in exchange for a small speed decrease.
-        Args:
-            slice_size (`str` or `int`, *optional*, defaults to `"auto"`):
-                When `"auto"`, halves the input to the attention heads, so attention will be computed in two steps. If
-                a number is provided, uses as many slices as `attention_head_dim // slice_size`. In this case,
-                `attention_head_dim` must be a multiple of `slice_size`.
-        """
-        if slice_size == "auto":
-            # half the attention head size is usually a good trade-off between
-            # speed and memory
-            slice_size = self.unet.config.attention_head_dim // 2
-        self.unet.set_attention_slice(slice_size)
-
-    def disable_attention_slicing(self):
-        r"""
-        Disable sliced attention computation. If `enable_attention_slicing` was previously invoked, this method will go
-        back to computing attention in one step.
-        """
-        # set slice_size = `None` to disable `attention slicing`
-        self.enable_attention_slicing(None)
-
-    def train(
-        self,
-        prompt: Union[str, List[str]],
-        image: Union[torch.FloatTensor, PIL.Image.Image],
-        height: Optional[int] = 512,
-        width: Optional[int] = 512,
-        generator: Optional[torch.Generator] = None,
-        embedding_learning_rate: float = 0.001,
-        diffusion_model_learning_rate: float = 2e-6,
-        text_embedding_optimization_steps: int = 500,
-        model_fine_tuning_optimization_steps: int = 1000,
-        **kwargs,
-    ):
-        r"""
-        Function invoked when calling the pipeline for generation.
-        Args:
-            prompt (`str` or `List[str]`):
-                The prompt or prompts to guide the image generation.
-            height (`int`, *optional*, defaults to 512):
-                The height in pixels of the generated image.
-            width (`int`, *optional*, defaults to 512):
-                The width in pixels of the generated image.
-            num_inference_steps (`int`, *optional*, defaults to 50):
-                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
-                expense of slower inference.
-            guidance_scale (`float`, *optional*, defaults to 7.5):
-                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
-                `guidance_scale` is defined as `w` of equation 2. of [Imagen
-                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
-                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
-                usually at the expense of lower image quality.
-            eta (`float`, *optional*, defaults to 0.0):
-                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
-                [`schedulers.DDIMScheduler`], will be ignored for others.
-            generator (`torch.Generator`, *optional*):
-                A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
-                deterministic.
-            latents (`torch.FloatTensor`, *optional*):
-                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
-                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
-                tensor will ge generated by sampling using the supplied random `generator`.
-            output_type (`str`, *optional*, defaults to `"pil"`):
-                The output format of the generate image. Choose between
-                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `nd.array`.
-            return_dict (`bool`, *optional*, defaults to `True`):
-                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
-                plain tuple.
-        Returns:
-            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
-            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
-            When returning a tuple, the first element is a list with the generated images, and the second element is a
-            list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
-            (nsfw) content, according to the `safety_checker`.
-        """
-        message = "Please use `image` instead of `init_image`."
-        init_image = deprecate("init_image", "0.12.0", message, take_from=kwargs)
-        image = init_image or image
-
-        accelerator = Accelerator(
-            gradient_accumulation_steps=1,
-            mixed_precision="fp16",
-        )
-
-        if "torch_device" in kwargs:
-            device = kwargs.pop("torch_device")
-            warnings.warn(
-                "`torch_device` is deprecated as an input argument to `__call__` and will be removed in v0.3.0."
-                " Consider using `pipe.to(torch_device)` instead."
-            )
-
-            if device is None:
-                device = "cuda" if torch.cuda.is_available() else "cpu"
-            self.to(device)
-
-        if height % 8 != 0 or width % 8 != 0:
-            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
-
-        # Freeze vae and unet
-        self.vae.requires_grad_(False)
-        self.unet.requires_grad_(False)
-        self.text_encoder.requires_grad_(False)
-        self.unet.eval()
-        self.vae.eval()
-        self.text_encoder.eval()
-
-        if accelerator.is_main_process:
-            accelerator.init_trackers(
-                "imagic",
-                config={
-                    "embedding_learning_rate": embedding_learning_rate,
-                    "text_embedding_optimization_steps": text_embedding_optimization_steps,
-                },
-            )
-
-        # get text embeddings for prompt
-        text_input = self.tokenizer(
-            prompt,
-            padding="max_length",
-            max_length=self.tokenizer.model_max_length,
-            truncaton=True,
-            return_tensors="pt",
-        )
-        text_embeddings = torch.nn.Parameter(
-            self.text_encoder(text_input.input_ids.to(self.device))[0], requires_grad=True
-        )
-        text_embeddings = text_embeddings.detach()
-        text_embeddings.requires_grad_()
-        text_embeddings_orig = text_embeddings.clone()
-
-        # Initialize the optimizer
-        optimizer = torch.optim.Adam(
-            [text_embeddings],  # only optimize the embeddings
-            lr=embedding_learning_rate,
-        )
-
-        if isinstance(image, PIL.Image.Image):
-            image = preprocess(image)
-
-        latents_dtype = text_embeddings.dtype
-        image = image.to(device=self.device, dtype=latents_dtype)
-        init_latent_image_dist = self.vae.encode(image).latent_dist
-        image_latents = init_latent_image_dist.sample(generator=generator)
-        image_latents = 0.18215 * image_latents
-
-        progress_bar = tqdm(range(text_embedding_optimization_steps), disable=not accelerator.is_local_main_process)
-        progress_bar.set_description("Steps")
-
-        global_step = 0
-
-        logger.info("First optimizing the text embedding to better reconstruct the init image")
-        for _ in range(text_embedding_optimization_steps):
-            with accelerator.accumulate(text_embeddings):
-                # Sample noise that we'll add to the latents
-                noise = torch.randn(image_latents.shape).to(image_latents.device)
-                timesteps = torch.randint(1000, (1,), device=image_latents.device)
-
-                # Add noise to the latents according to the noise magnitude at each timestep
-                # (this is the forward diffusion process)
-                noisy_latents = self.scheduler.add_noise(image_latents, noise, timesteps)
-
-                # Predict the noise residual
-                noise_pred = self.unet(noisy_latents, timesteps, text_embeddings).sample
-
-                loss = F.mse_loss(noise_pred, noise, reduction="none").mean([1, 2, 3]).mean()
-                accelerator.backward(loss)
-
-                optimizer.step()
-                optimizer.zero_grad()
-
-            # Checks if the accelerator has performed an optimization step behind the scenes
-            if accelerator.sync_gradients:
-                progress_bar.update(1)
-                global_step += 1
-
-            logs = {"loss": loss.detach().item()}  # , "lr": lr_scheduler.get_last_lr()[0]}
-            progress_bar.set_postfix(**logs)
-            accelerator.log(logs, step=global_step)
-
-        accelerator.wait_for_everyone()
-
-        text_embeddings.requires_grad_(False)
-
-        # Now we fine tune the unet to better reconstruct the image
-        self.unet.requires_grad_(True)
-        self.unet.train()
-        optimizer = torch.optim.Adam(
-            self.unet.parameters(),  # only optimize unet
-            lr=diffusion_model_learning_rate,
-        )
-        progress_bar = tqdm(range(model_fine_tuning_optimization_steps), disable=not accelerator.is_local_main_process)
-
-        logger.info("Next fine tuning the entire model to better reconstruct the init image")
-        for _ in range(model_fine_tuning_optimization_steps):
-            with accelerator.accumulate(self.unet.parameters()):
-                # Sample noise that we'll add to the latents
-                noise = torch.randn(image_latents.shape).to(image_latents.device)
-                timesteps = torch.randint(1000, (1,), device=image_latents.device)
-
-                # Add noise to the latents according to the noise magnitude at each timestep
-                # (this is the forward diffusion process)
-                noisy_latents = self.scheduler.add_noise(image_latents, noise, timesteps)
-
-                # Predict the noise residual
-                noise_pred = self.unet(noisy_latents, timesteps, text_embeddings).sample
-
-                loss = F.mse_loss(noise_pred, noise, reduction="none").mean([1, 2, 3]).mean()
-                accelerator.backward(loss)
-
-                optimizer.step()
-                optimizer.zero_grad()
-
-            # Checks if the accelerator has performed an optimization step behind the scenes
-            if accelerator.sync_gradients:
-                progress_bar.update(1)
-                global_step += 1
-
-            logs = {"loss": loss.detach().item()}  # , "lr": lr_scheduler.get_last_lr()[0]}
-            progress_bar.set_postfix(**logs)
-            accelerator.log(logs, step=global_step)
-
-        accelerator.wait_for_everyone()
-        self.text_embeddings_orig = text_embeddings_orig
-        self.text_embeddings = text_embeddings
-
-    @torch.no_grad()
-    def __call__(
-        self,
-        alpha: float = 1.2,
-        height: Optional[int] = 512,
-        width: Optional[int] = 512,
-        num_inference_steps: Optional[int] = 50,
-        generator: Optional[torch.Generator] = None,
-        output_type: Optional[str] = "pil",
-        return_dict: bool = True,
-        guidance_scale: float = 7.5,
-        eta: float = 0.0,
-        **kwargs,
-    ):
-        r"""
-        Generate a single image from the embeddings stored on the pipeline as `self.text_embeddings_orig` and
-        `self.text_embeddings`.
-
-        This method takes no prompt: the conditioning is the blend
-        `alpha * self.text_embeddings_orig + (1 - alpha) * self.text_embeddings`. The initial latents are always
-        sampled inside this method with a batch size of 1.
-
-        Args:
-            alpha (`float`, *optional*, defaults to 1.2):
-                Blend weight applied to `self.text_embeddings_orig`; `self.text_embeddings` is weighted by
-                `1 - alpha`.
-            height (`int`, *optional*, defaults to 512):
-                The height in pixels of the generated image. Must be divisible by 8.
-            width (`int`, *optional*, defaults to 512):
-                The width in pixels of the generated image. Must be divisible by 8.
-            num_inference_steps (`int`, *optional*, defaults to 50):
-                The number of denoising steps passed to `self.scheduler.set_timesteps`.
-            generator (`torch.Generator`, *optional*):
-                A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) used when
-                sampling the initial latents, to make generation deterministic.
-            output_type (`str`, *optional*, defaults to `"pil"`):
-                When `"pil"`, the result is converted with `self.numpy_to_pil`; for any other value the images are
-                returned without that conversion (a NumPy array when no safety checker is configured).
-            return_dict (`bool`, *optional*, defaults to `True`):
-                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
-                plain tuple.
-            guidance_scale (`float`, *optional*, defaults to 7.5):
-                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
-                `guidance_scale` is defined as `w` of equation 2. of [Imagen
-                Paper](https://arxiv.org/pdf/2205.11487.pdf). Classifier-free guidance is only performed when
-                `guidance_scale > 1`; the unconditional branch is then computed from the empty prompt `""`.
-            eta (`float`, *optional*, defaults to 0.0):
-                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. It is forwarded
-                to `self.scheduler.step` only when that method's signature has an `eta` parameter.
-            **kwargs:
-                Accepted but not read anywhere in this method.
-
-        Returns:
-            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
-            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a
-            `tuple` `(image, has_nsfw_concept)`. `has_nsfw_concept` is whatever `self.safety_checker` returns as its
-            second value, or `None` when `self.safety_checker` is `None`.
-
-        Raises:
-            ValueError: If `height` or `width` is not divisible by 8, or if `self.text_embeddings` or
-                `self.text_embeddings_orig` is `None`.
-        """
-        if height % 8 != 0 or width % 8 != 0:
-            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
-        # NOTE(review): these checks only help if both attributes are initialised to `None` somewhere outside
-        # this method; otherwise the attribute access itself raises AttributeError - confirm.
-        if self.text_embeddings is None:
-            raise ValueError("Please run the pipe.train() before trying to generate an image.")
-        if self.text_embeddings_orig is None:
-            raise ValueError("Please run the pipe.train() before trying to generate an image.")
-
-        # blend the two stored embeddings; with alpha > 1 the weight on `self.text_embeddings` is negative
-        text_embeddings = alpha * self.text_embeddings_orig + (1 - alpha) * self.text_embeddings
-
-        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
-        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
-        # corresponds to doing no classifier free guidance.
-        do_classifier_free_guidance = guidance_scale > 1.0
-        # get unconditional embeddings for classifier free guidance
-        if do_classifier_free_guidance:
-            uncond_tokens = [""]
-            max_length = self.tokenizer.model_max_length
-            uncond_input = self.tokenizer(
-                uncond_tokens,
-                padding="max_length",
-                max_length=max_length,
-                truncation=True,
-                return_tensors="pt",
-            )
-            uncond_embeddings = self.text_encoder(uncond_input.input_ids.to(self.device))[0]
-
-            # reshape to a batch of one: (1, seq_len, -1)
-            seq_len = uncond_embeddings.shape[1]
-            uncond_embeddings = uncond_embeddings.view(1, seq_len, -1)
-
-            # Classifier free guidance needs both an unconditional and a conditional prediction.
-            # Here we concatenate the unconditional and text embeddings into a single batch
-            # so that one unet forward pass produces both.
-            text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
-
-        # sample the initial random noise (this method has no `latents` argument, so it is always generated here)
-
-        # Unlike in other pipelines, latents need to be generated in the target device
-        # for 1-to-1 results reproducibility with the CompVis implementation.
-        # However this currently doesn't work in `mps`.
-        latents_shape = (1, self.unet.in_channels, height // 8, width // 8)
-        latents_dtype = text_embeddings.dtype
-        if self.device.type == "mps":
-            # randn does not exist on mps, so sample on cpu and move the result to the device
-            latents = torch.randn(latents_shape, generator=generator, device="cpu", dtype=latents_dtype).to(
-                self.device
-            )
-        else:
-            latents = torch.randn(latents_shape, generator=generator, device=self.device, dtype=latents_dtype)
-
-        # set timesteps
-        self.scheduler.set_timesteps(num_inference_steps)
-
-        # Some schedulers like PNDM have timesteps as arrays
-        # It's more optimized to move all timesteps to correct device beforehand
-        timesteps_tensor = self.scheduler.timesteps.to(self.device)
-
-        # scale the initial noise by the standard deviation required by the scheduler
-        latents = latents * self.scheduler.init_noise_sigma
-
-        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
-        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
-        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
-        # and should be between [0, 1]
-        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
-        extra_step_kwargs = {}
-        if accepts_eta:
-            extra_step_kwargs["eta"] = eta
-
-        for i, t in enumerate(self.progress_bar(timesteps_tensor)):
-            # expand the latents if we are doing classifier free guidance
-            latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
-            latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
-
-            # predict the noise residual
-            noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample
-
-            # perform guidance: the first half of the batch is the unconditional prediction, the second the
-            # conditional one (same order as the `torch.cat` of the embeddings above)
-            if do_classifier_free_guidance:
-                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
-                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
-
-            # compute the previous noisy sample x_t -> x_t-1
-            latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
-
-        # undo the 0.18215 latent scaling before decoding with the vae
-        latents = 1 / 0.18215 * latents
-        image = self.vae.decode(latents).sample
-
-        # map the decoded values from [-1, 1] to [0, 1] and clip anything outside that range
-        image = (image / 2 + 0.5).clamp(0, 1)
-
-        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
-        # (the permute moves the channel axis last before converting to numpy)
-        image = image.cpu().permute(0, 2, 3, 1).float().numpy()
-
-        if self.safety_checker is not None:
-            safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(
-                self.device
-            )
-            image, has_nsfw_concept = self.safety_checker(
-                images=image, clip_input=safety_checker_input.pixel_values.to(text_embeddings.dtype)
-            )
-        else:
-            has_nsfw_concept = None
-
-        if output_type == "pil":
-            image = self.numpy_to_pil(image)
-
-        if not return_dict:
-            return (image, has_nsfw_concept)
-
-        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
--- a/examples/community/img2img_inpainting.py
+++ b/examples/community/img2img_inpainting.py
@@ -1,463 +0,0 @@
-import inspect
-from typing import Callable, List, Optional, Tuple, Union
-
-import numpy as np
-import torch
-
-import PIL
-from diffusers.configuration_utils import FrozenDict
-from diffusers.models import AutoencoderKL, UNet2DConditionModel
-from diffusers.pipeline_utils import DiffusionPipeline
-from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
-from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
-from diffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
-from diffusers.utils import deprecate, logging
-from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
-
-
-logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
-
-
-def prepare_mask_and_masked_image(image, mask):
-    """
-    Build the binary mask tensor and the masked image tensor used for inpainting.
-
-    Args:
-        image: Object whose `convert("RGB")` result `np.array` turns into a height x width x channel array
-            (a PIL image in this file).
-        mask: Object whose `convert("L")` result `np.array` turns into a height x width array.
-
-    Returns:
-        A `(mask, masked_image)` tuple of torch tensors. `mask` has two leading singleton axes and holds 1.0 where
-        the scaled mask value is >= 0.5 and 0.0 elsewhere. `masked_image` is the image rescaled with
-        `x / 127.5 - 1.0`, in batch x channel x height x width layout, multiplied by `mask < 0.5` so that the
-        masked-in pixels become zero.
-    """
-    # height x width x channel -> 1 x channel x height x width
-    pixels = np.array(image.convert("RGB"))
-    pixels = np.transpose(pixels[None], (0, 3, 1, 2))
-    image_tensor = torch.from_numpy(pixels).to(dtype=torch.float32) / 127.5 - 1.0
-
-    # scale the single-channel mask by 1/255, then threshold it at 0.5 into {0, 1}
-    luminance = np.array(mask.convert("L")).astype(np.float32) / 255.0
-    binary = (luminance >= 0.5).astype(np.float32)[None, None]
-    mask_tensor = torch.from_numpy(binary)
-
-    # keep only the pixels that are not selected by the mask
-    keep = mask_tensor < 0.5
-    return mask_tensor, image_tensor * keep
-
-
-def check_size(image, height, width):
-    """
-    Validate that `image` has exactly the requested spatial size.
-
-    Args:
-        image: A `torch.Tensor` whose last two axes are read as (height, width), or a `PIL.Image.Image` whose
-            `size` is read as (width, height).
-        height: Expected height.
-        width: Expected width.
-
-    Raises:
-        TypeError: If `image` is neither a `torch.Tensor` nor a `PIL.Image.Image`. (Previously this case fell
-            through with `h`/`w` unbound and surfaced as an `UnboundLocalError`.)
-        ValueError: If the size of `image` differs from `height` x `width`.
-    """
-    if isinstance(image, torch.Tensor):
-        # only the two trailing axes matter; any leading batch/channel axes are ignored
-        *_, h, w = image.shape
-    elif isinstance(image, PIL.Image.Image):
-        # PIL reports size as (width, height)
-        w, h = image.size
-    else:
-        raise TypeError(f"`image` has to be a `PIL.Image.Image` or `torch.Tensor` but is {type(image)}")
-
-    if h != height or w != width:
-        raise ValueError(f"Image size should be {height}x{width}, but got {h}x{w}")
-
-
-def overlay_inner_image(image, inner_image, paste_offset: Tuple[int, int] = (0, 0)):
-    """
-    Paste `inner_image` onto an RGB copy of `image`, using the alpha channel of `inner_image` as the paste mask.
-
-    Args:
-        image: Background image; it is converted to RGB first and the paste is applied to that converted image.
-        inner_image: Image to overlay; it is converted to RGBA and also passed as the mask argument of `paste`.
-        paste_offset: Two-element offset forwarded to `paste` as the target position. Defaults to `(0, 0)`.
-
-    Returns:
-        The RGB image with `inner_image` pasted onto it.
-    """
-    overlay = inner_image.convert("RGBA")
-    canvas = image.convert("RGB")
-
-    # passing the RGBA overlay as the mask as well makes `paste` blend with its alpha band
-    canvas.paste(overlay, paste_offset, overlay)
-
-    # `canvas` is already RGB here, so no second conversion is needed
-    return canvas
-
-
-class ImageToImageInpaintingPipeline(DiffusionPipeline):
-    r"""
-    Pipeline for text-guided image-to-image inpainting using Stable Diffusion. *This is an experimental feature*.
-
-    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
-    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
-
-    Args:
-        vae ([`AutoencoderKL`]):
-            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
-        text_encoder ([`CLIPTextModel`]):
-            Frozen text-encoder. Stable Diffusion uses the text portion of
-            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
-            the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
-        tokenizer (`CLIPTokenizer`):
-            Tokenizer of class
-            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
-        unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
-        scheduler ([`SchedulerMixin`]):
-            A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of
-            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
-        safety_checker ([`StableDiffusionSafetyChecker`]):
-            Classification module that estimates whether generated images could be considered offensive or harmful.
-            Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details.
-        feature_extractor ([`CLIPFeatureExtractor`]):
-            Model that extracts features from generated images to be used as inputs for the `safety_checker`.
-    """
-
-    def __init__(
-        self,
-        vae: AutoencoderKL,
-        text_encoder: CLIPTextModel,
-        tokenizer: CLIPTokenizer,
-        unet: UNet2DConditionModel,
-        scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler],
-        safety_checker: StableDiffusionSafetyChecker,
-        feature_extractor: CLIPFeatureExtractor,
-    ):
-        # Store the pipeline components. A scheduler config whose `steps_offset` is not 1 is patched in place
-        # (with a deprecation notice), and a warning is logged when the safety checker is disabled.
-        super().__init__()
-
-        scheduler_cfg = scheduler.config
-        offset_is_outdated = hasattr(scheduler_cfg, "steps_offset") and scheduler_cfg.steps_offset != 1
-        if offset_is_outdated:
-            outdated_notice = (
-                f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
-                f" should be set to 1 instead of {scheduler_cfg.steps_offset}. Please make sure "
-                "to update the config accordingly as leaving `steps_offset` might led to incorrect results"
-                " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
-                " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
-                " file"
-            )
-            deprecate("steps_offset!=1", "1.0.0", outdated_notice, standard_warn=False)
-            # rebuild the config with the corrected offset and swap it into the scheduler
-            patched_cfg = dict(scheduler_cfg)
-            patched_cfg.update(steps_offset=1)
-            scheduler._internal_dict = FrozenDict(patched_cfg)
-
-        if safety_checker is None:
-            logger.warning(
-                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
-                " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
-                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
-                " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
-                " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
-                " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
-            )
-
-        components = dict(
-            vae=vae,
-            text_encoder=text_encoder,
-            tokenizer=tokenizer,
-            unet=unet,
-            scheduler=scheduler,
-            safety_checker=safety_checker,
-            feature_extractor=feature_extractor,
-        )
-        self.register_modules(**components)
-
-    def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"):
-        r"""
-        Turn on sliced attention computation in the unet.
-
-        With slicing enabled, the attention module splits its input into slices and computes attention in several
-        steps, saving some memory in exchange for a small speed decrease.
-
-        Args:
-            slice_size (`str` or `int`, *optional*, defaults to `"auto"`):
-                `"auto"` resolves to `self.unet.config.attention_head_dim // 2`, so attention is computed in two
-                steps. If a number is provided, uses as many slices as `attention_head_dim // slice_size`; in that
-                case `attention_head_dim` must be a multiple of `slice_size`. Any other value is passed to the unet
-                as given.
-        """
-        # half the attention head size is usually a good trade-off between speed and memory
-        resolved_size = self.unet.config.attention_head_dim // 2 if slice_size == "auto" else slice_size
-        self.unet.set_attention_slice(resolved_size)
-
-    def disable_attention_slicing(self):
-        r"""
-        Turn sliced attention computation off again.
-
-        Delegates to `enable_attention_slicing` with a slice size of `None`, which is forwarded unchanged to the
-        unet, so attention goes back to being computed in one step.
-        """
-        # a slice size of `None` is the "slicing disabled" setting
-        self.enable_attention_slicing(slice_size=None)
-
-    @torch.no_grad()
-    def __call__(
-        self,
-        prompt: Union[str, List[str]],
-        image: Union[torch.FloatTensor, PIL.Image.Image],
-        inner_image: Union[torch.FloatTensor, PIL.Image.Image],
-        mask_image: Union[torch.FloatTensor, PIL.Image.Image],
-        height: int = 512,
-        width: int = 512,
-        num_inference_steps: int = 50,
-        guidance_scale: float = 7.5,
-        negative_prompt: Optional[Union[str, List[str]]] = None,
-        num_images_per_prompt: Optional[int] = 1,
-        eta: float = 0.0,
-        generator: Optional[torch.Generator] = None,
-        latents: Optional[torch.FloatTensor] = None,
-        output_type: Optional[str] = "pil",
-        return_dict: bool = True,
-        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
-        callback_steps: Optional[int] = 1,
-        **kwargs,
-    ):
-        r"""
-        Function invoked when calling the pipeline for generation.
-
-        Args:
-            prompt (`str` or `List[str]`):
-                The prompt or prompts to guide the image generation.
-            image (`torch.Tensor` or `PIL.Image.Image`):
-                `Image`, or tensor representing an image batch which will be inpainted, *i.e.* parts of the image will
-                be masked out with `mask_image` and repainted according to `prompt`.
-            inner_image (`torch.Tensor` or `PIL.Image.Image`):
-                `Image`, or tensor representing an image batch which will be overlayed onto `image`. Non-transparent
-                regions of `inner_image` must fit inside white pixels in `mask_image`. Expects four channels, with
-                the last channel representing the alpha channel, which will be used to blend `inner_image` with
-                `image`. If not provided, it will be forcibly cast to RGBA.
-            mask_image (`PIL.Image.Image`):
-                `Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be
-                repainted, while black pixels will be preserved. If `mask_image` is a PIL image, it will be converted
-                to a single channel (luminance) before use. If it's a tensor, it should contain one color channel (L)
-                instead of 3, so the expected shape would be `(B, H, W, 1)`.
-            height (`int`, *optional*, defaults to 512):
-                The height in pixels of the generated image.
-            width (`int`, *optional*, defaults to 512):
-                The width in pixels of the generated image.
-            num_inference_steps (`int`, *optional*, defaults to 50):
-                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
-                expense of slower inference.
-            guidance_scale (`float`, *optional*, defaults to 7.5):
-                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
-                `guidance_scale` is defined as `w` of equation 2. of [Imagen
-                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
-                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
-                usually at the expense of lower image quality.
-            negative_prompt (`str` or `List[str]`, *optional*):
-                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
-                if `guidance_scale` is less than `1`).
-            num_images_per_prompt (`int`, *optional*, defaults to 1):
-                The number of images to generate per prompt.
-            eta (`float`, *optional*, defaults to 0.0):
-                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
-                [`schedulers.DDIMScheduler`], will be ignored for others.
-            generator (`torch.Generator`, *optional*):
-                A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
-                deterministic.
-            latents (`torch.FloatTensor`, *optional*):
-                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
-                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
-                tensor will ge generated by sampling using the supplied random `generator`.
-            output_type (`str`, *optional*, defaults to `"pil"`):
-                The output format of the generate image. Choose between
-                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
-            return_dict (`bool`, *optional*, defaults to `True`):
-                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
-                plain tuple.
-            callback (`Callable`, *optional*):
-                A function that will be called every `callback_steps` steps during inference. The function will be
-                called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
-            callback_steps (`int`, *optional*, defaults to 1):
-                The frequency at which the `callback` function will be called. If not specified, the callback will be
-                called at every step.
-
-        Returns:
-            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
-            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
-            When returning a tuple, the first element is a list with the generated images, and the second element is a
-            list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
-            (nsfw) content, according to the `safety_checker`.
-        """
-
-        if isinstance(prompt, str):
-            batch_size = 1
-        elif isinstance(prompt, list):
-            batch_size = len(prompt)
-        else:
-            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
-
-        if height % 8 != 0 or width % 8 != 0:
-            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
-
-        if (callback_steps is None) or (
-            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
-        ):
-            raise ValueError(
-                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
-                f" {type(callback_steps)}."
-            )
-
-        # check if input sizes are correct
-        check_size(image, height, width)
-        check_size(inner_image, height, width)
-        check_size(mask_image, height, width)
-
-        # get prompt text embeddings
-        text_inputs = self.tokenizer(
-            prompt,
-            padding="max_length",
-            max_length=self.tokenizer.model_max_length,
-            return_tensors="pt",
-        )
-        text_input_ids = text_inputs.input_ids
-
-        if text_input_ids.shape[-1] > self.tokenizer.model_max_length:
-            removed_text = self.tokenizer.batch_decode(text_input_ids[:, self.tokenizer.model_max_length :])
-            logger.warning(
-                "The following part of your input was truncated because CLIP can only handle sequences up to"
-                f" {self.tokenizer.model_max_length} tokens: {removed_text}"
-            )
-            text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length]
-        text_embeddings = self.text_encoder(text_input_ids.to(self.device))[0]
-
-        # duplicate text embeddings for each generation per prompt, using mps friendly method
-        bs_embed, seq_len, _ = text_embeddings.shape
-        text_embeddings = text_embeddings.repeat(1, num_images_per_prompt, 1)
-        text_embeddings = text_embeddings.view(bs_embed * num_images_per_prompt, seq_len, -1)
-
-        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
-        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
-        # corresponds to doing no classifier free guidance.
-        do_classifier_free_guidance = guidance_scale > 1.0
-        # get unconditional embeddings for classifier free guidance
-        if do_classifier_free_guidance:
-            uncond_tokens: List[str]
-            if negative_prompt is None:
-                uncond_tokens = [""]
-            elif type(prompt) is not type(negative_prompt):
-                raise TypeError(
-                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
-                    f" {type(prompt)}."
-                )
-            elif isinstance(negative_prompt, str):
-                uncond_tokens = [negative_prompt]
-            elif batch_size != len(negative_prompt):
-                raise ValueError(
-                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
-                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
-                    " the batch size of `prompt`."
-                )
-            else:
-                uncond_tokens = negative_prompt
-
-            max_length = text_input_ids.shape[-1]
-            uncond_input = self.tokenizer(
-                uncond_tokens,
-                padding="max_length",
-                max_length=max_length,
-                truncation=True,
-                return_tensors="pt",
-            )
-            uncond_embeddings = self.text_encoder(uncond_input.input_ids.to(self.device))[0]
-
-            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
-            seq_len = uncond_embeddings.shape[1]
-            uncond_embeddings = uncond_embeddings.repeat(batch_size, num_images_per_prompt, 1)
-            uncond_embeddings = uncond_embeddings.view(batch_size * num_images_per_prompt, seq_len, -1)
-
-            # For classifier free guidance, we need to do two forward passes.
-            # Here we concatenate the unconditional and text embeddings into a single batch
-            # to avoid doing two forward passes
-            text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
-
-        # get the initial random noise unless the user supplied it
-        # Unlike in other pipelines, latents need to be generated in the target device
-        # for 1-to-1 results reproducibility with the CompVis implementation.
-        # However this currently doesn't work in `mps`.
-        num_channels_latents = self.vae.config.latent_channels
-        latents_shape = (batch_size * num_images_per_prompt, num_channels_latents, height // 8, width // 8)
-        latents_dtype = text_embeddings.dtype
-        if latents is None:
-            if self.device.type == "mps":
-                # randn does not exist on mps
-                latents = torch.randn(latents_shape, generator=generator, device="cpu", dtype=latents_dtype).to(
-                    self.device
-                )
-            else:
-                latents = torch.randn(latents_shape, generator=generator, device=self.device, dtype=latents_dtype)
-        else:
-            if latents.shape != latents_shape:
-                raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}")
-            latents = latents.to(self.device)
-
-        # overlay the inner image
-        image = overlay_inner_image(image, inner_image)
-
-        # prepare mask and masked_image
-        mask, masked_image = prepare_mask_and_masked_image(image, mask_image)
-        mask = mask.to(device=self.device, dtype=text_embeddings.dtype)
-        masked_image = masked_image.to(device=self.device, dtype=text_embeddings.dtype)
-
-        # resize the mask to latents shape as we concatenate the mask to the latents
-        mask = torch.nn.functional.interpolate(mask, size=(height // 8, width // 8))
-
-        # encode the mask image into latents space so we can concatenate it to the latents
-        masked_image_latents = self.vae.encode(masked_image).latent_dist.sample(generator=generator)
-        masked_image_latents = 0.18215 * masked_image_latents
-
-        # duplicate mask and masked_image_latents for each generation per prompt, using mps friendly method
-        mask = mask.repeat(batch_size * num_images_per_prompt, 1, 1, 1)
-        masked_image_latents = masked_image_latents.repeat(batch_size * num_images_per_prompt, 1, 1, 1)
-
-        mask = torch.cat([mask] * 2) if do_classifier_free_guidance else mask
-        masked_image_latents = (
-            torch.cat([masked_image_latents] * 2) if do_classifier_free_guidance else masked_image_latents
-        )
-
-        num_channels_mask = mask.shape[1]
-        num_channels_masked_image = masked_image_latents.shape[1]
-
-        if num_channels_latents + num_channels_mask + num_channels_masked_image != self.unet.config.in_channels:
-            raise ValueError(
-                f"Incorrect configuration settings! The config of `pipeline.unet`: {self.unet.config} expects"
-                f" {self.unet.config.in_channels} but received `num_channels_latents`: {num_channels_latents} +"
-                f" `num_channels_mask`: {num_channels_mask} + `num_channels_masked_image`: {num_channels_masked_image}"
-                f" = {num_channels_latents+num_channels_masked_image+num_channels_mask}. Please verify the config of"
-                " `pipeline.unet` or your `mask_image` or `image` input."
-            )
-
-        # set timesteps
-        self.scheduler.set_timesteps(num_inference_steps)
-
-        # Some schedulers like PNDM have timesteps as arrays
-        # It's more optimized to move all timesteps to correct device beforehand
-        timesteps_tensor = self.scheduler.timesteps.to(self.device)
-
-        # scale the initial noise by the standard deviation required by the scheduler
-        latents = latents * self.scheduler.init_noise_sigma
-
-        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
-        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
-        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
-        # and should be between [0, 1]
-        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
-        extra_step_kwargs = {}
-        if accepts_eta:
-            extra_step_kwargs["eta"] = eta
-
-        for i, t in enumerate(self.progress_bar(timesteps_tensor)):
-            # expand the latents if we are doing classifier free guidance
-            latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
-
-            # concat latents, mask, masked_image_latents in the channel dimension
-            latent_model_input = torch.cat([latent_model_input, mask, masked_image_latents], dim=1)
-
-            latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
-
-            # predict the noise residual
-            noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample
-
-            # perform guidance
-            if do_classifier_free_guidance:
-                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
-                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
-
-            # compute the previous noisy sample x_t -> x_t-1
-            latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
-
-            # call the callback, if provided
-            if callback is not None and i % callback_steps == 0:
-                callback(i, t, latents)
-
-        latents = 1 / 0.18215 * latents
-        image = self.vae.decode(latents).sample
-
-        image = (image / 2 + 0.5).clamp(0, 1)
-
-        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
-        image = image.cpu().permute(0, 2, 3, 1).float().numpy()
-
-        if self.safety_checker is not None:
-            safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(
-                self.device
-            )
-            image, has_nsfw_concept = self.safety_checker(
-                images=image, clip_input=safety_checker_input.pixel_values.to(text_embeddings.dtype)
-            )
-        else:
-            has_nsfw_concept = None
-
-        if output_type == "pil":
-            image = self.numpy_to_pil(image)
-
-        if not return_dict:
-            return (image, has_nsfw_concept)
-
-        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
--- a/examples/community/interpolate_stable_diffusion.py
+++ b/examples/community/interpolate_stable_diffusion.py
@@ -65,7 +65,7 @@ class StableDiffusionWalkPipeline(DiffusionPipeline):
            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
        unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
        scheduler ([`SchedulerMixin`]):
-            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of
            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
        safety_checker ([`StableDiffusionSafetyChecker`]):
            Classification module that estimates whether generated images could be considered offensive or harmful.
@@ -101,7 +101,7 @@ class StableDiffusionWalkPipeline(DiffusionPipeline):
            scheduler._internal_dict = FrozenDict(new_config)

        if safety_checker is None:
-            logger.warning(
+            logger.warn(
                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
                " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
@@ -278,7 +278,7 @@ class StableDiffusionWalkPipeline(DiffusionPipeline):
        if do_classifier_free_guidance:
            uncond_tokens: List[str]
            if negative_prompt is None:
-                uncond_tokens = [""] * batch_size
+                uncond_tokens = [""]
            elif type(prompt) is not type(negative_prompt):
                raise TypeError(
                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
@@ -307,7 +307,7 @@ class StableDiffusionWalkPipeline(DiffusionPipeline):

            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
            seq_len = uncond_embeddings.shape[1]
-            uncond_embeddings = uncond_embeddings.repeat(1, num_images_per_prompt, 1)
+            uncond_embeddings = uncond_embeddings.repeat(batch_size, num_images_per_prompt, 1)
            uncond_embeddings = uncond_embeddings.view(batch_size * num_images_per_prompt, seq_len, -1)

            # For classifier free guidance, we need to do two forward passes.
--- a/examples/community/lpw_stable_diffusion.py
+++ b/examples/community/lpw_stable_diffusion.py
@@ -6,10 +6,13 @@ import numpy as np
 import torch

 import PIL
-from diffusers import SchedulerMixin, StableDiffusionPipeline
+from diffusers.configuration_utils import FrozenDict
 from diffusers.models import AutoencoderKL, UNet2DConditionModel
-from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput, StableDiffusionSafetyChecker
-from diffusers.utils import PIL_INTERPOLATION, deprecate, logging
+from diffusers.pipeline_utils import DiffusionPipeline
+from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
+from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
+from diffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
+from diffusers.utils import deprecate, logging
 from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer


@@ -121,7 +124,7 @@ def parse_prompt_attention(text):
    return res


-def get_prompts_with_weights(pipe: StableDiffusionPipeline, prompt: List[str], max_length: int):
+def get_prompts_with_weights(pipe: DiffusionPipeline, prompt: List[str], max_length: int):
    r"""
    Tokenize a list of prompts and return its tokens with weights of each token.

@@ -182,7 +185,7 @@ def pad_tokens_and_weights(tokens, weights, max_length, bos, eos, no_boseos_midd


 def get_unweighted_text_embeddings(
-    pipe: StableDiffusionPipeline,
+    pipe: DiffusionPipeline,
    text_input: torch.Tensor,
    chunk_length: int,
    no_boseos_middle: Optional[bool] = True,
@@ -222,10 +225,10 @@ def get_unweighted_text_embeddings(


 def get_weighted_text_embeddings(
-    pipe: StableDiffusionPipeline,
+    pipe: DiffusionPipeline,
    prompt: Union[str, List[str]],
    uncond_prompt: Optional[Union[str, List[str]]] = None,
-    max_embeddings_multiples: Optional[int] = 3,
+    max_embeddings_multiples: Optional[int] = 1,
    no_boseos_middle: Optional[bool] = False,
    skip_parsing: Optional[bool] = False,
    skip_weighting: Optional[bool] = False,
@@ -239,14 +242,14 @@ def get_weighted_text_embeddings(
    Also, to regularize of the embedding, the weighted embedding would be scaled to preserve the original mean.

    Args:
-        pipe (`StableDiffusionPipeline`):
+        pipe (`DiffusionPipeline`):
            Pipe to provide access to the tokenizer and the text encoder.
        prompt (`str` or `List[str]`):
            The prompt or prompts to guide the image generation.
        uncond_prompt (`str` or `List[str]`):
            The unconditional prompt or prompts for guide the image generation. If unconditional prompt
            is provided, the embeddings of prompt and uncond_prompt are concatenated.
-        max_embeddings_multiples (`int`, *optional*, defaults to `3`):
+        max_embeddings_multiples (`int`, *optional*, defaults to `1`):
            The max multiple length of prompt embeddings compared to the max output length of text encoder.
        no_boseos_middle (`bool`, *optional*, defaults to `False`):
            If the length of text token is multiples of the capacity of text encoder, whether reserve the starting and
@@ -337,15 +340,13 @@ def get_weighted_text_embeddings(
    # assign weights to the prompts and normalize in the sense of mean
    # TODO: should we normalize by chunk or in a whole (current implementation)?
    if (not skip_parsing) and (not skip_weighting):
-        previous_mean = text_embeddings.float().mean(axis=[-2, -1]).to(text_embeddings.dtype)
+        previous_mean = text_embeddings.mean(axis=[-2, -1])
        text_embeddings *= prompt_weights.unsqueeze(-1)
-        current_mean = text_embeddings.float().mean(axis=[-2, -1]).to(text_embeddings.dtype)
-        text_embeddings *= (previous_mean / current_mean).unsqueeze(-1).unsqueeze(-1)
+        text_embeddings *= (previous_mean / text_embeddings.mean(axis=[-2, -1])).unsqueeze(-1).unsqueeze(-1)
        if uncond_prompt is not None:
-            previous_mean = uncond_embeddings.float().mean(axis=[-2, -1]).to(uncond_embeddings.dtype)
+            previous_mean = uncond_embeddings.mean(axis=[-2, -1])
            uncond_embeddings *= uncond_weights.unsqueeze(-1)
-            current_mean = uncond_embeddings.float().mean(axis=[-2, -1]).to(uncond_embeddings.dtype)
-            uncond_embeddings *= (previous_mean / current_mean).unsqueeze(-1).unsqueeze(-1)
+            uncond_embeddings *= (previous_mean / uncond_embeddings.mean(axis=[-2, -1])).unsqueeze(-1).unsqueeze(-1)

    if uncond_prompt is not None:
        return text_embeddings, uncond_embeddings
@@ -355,18 +356,18 @@ def get_weighted_text_embeddings(
 def preprocess_image(image):
    w, h = image.size
    w, h = map(lambda x: x - x % 32, (w, h))  # resize to integer multiple of 32
-    image = image.resize((w, h), resample=PIL_INTERPOLATION["lanczos"])
+    image = image.resize((w, h), resample=PIL.Image.LANCZOS)
    image = np.array(image).astype(np.float32) / 255.0
    image = image[None].transpose(0, 3, 1, 2)
    image = torch.from_numpy(image)
    return 2.0 * image - 1.0


-def preprocess_mask(mask, scale_factor=8):
+def preprocess_mask(mask):
    mask = mask.convert("L")
    w, h = mask.size
    w, h = map(lambda x: x - x % 32, (w, h))  # resize to integer multiple of 32
-    mask = mask.resize((w // scale_factor, h // scale_factor), resample=PIL_INTERPOLATION["nearest"])
+    mask = mask.resize((w // 8, h // 8), resample=PIL.Image.NEAREST)
    mask = np.array(mask).astype(np.float32) / 255.0
    mask = np.tile(mask, (4, 1, 1))
    mask = mask[None].transpose(0, 1, 2, 3)  # what does this step do?
@@ -375,7 +376,7 @@ def preprocess_mask(mask, scale_factor=8):
    return mask


-class StableDiffusionLongPromptWeightingPipeline(StableDiffusionPipeline):
+class StableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
    r"""
    Pipeline for text-to-image generation using Stable Diffusion without tokens length limit, and support parsing
    weighting in prompt.
@@ -395,7 +396,7 @@ class StableDiffusionLongPromptWeightingPipeline(StableDiffusionPipeline):
            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
        unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
        scheduler ([`SchedulerMixin`]):
-            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of
            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
        safety_checker ([`StableDiffusionSafetyChecker`]):
            Classification module that estimates whether generated images could be considered offensive or harmful.
@@ -410,12 +411,37 @@ class StableDiffusionLongPromptWeightingPipeline(StableDiffusionPipeline):
        text_encoder: CLIPTextModel,
        tokenizer: CLIPTokenizer,
        unet: UNet2DConditionModel,
-        scheduler: SchedulerMixin,
+        scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler],
        safety_checker: StableDiffusionSafetyChecker,
        feature_extractor: CLIPFeatureExtractor,
-        requires_safety_checker: bool = True,
    ):
-        super().__init__(
+        super().__init__()
+
+        if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1:
+            deprecation_message = (
+                f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
+                f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
+                "to update the config accordingly as leaving `steps_offset` might led to incorrect results"
+                " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
+                " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
+                " file"
+            )
+            deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(scheduler.config)
+            new_config["steps_offset"] = 1
+            scheduler._internal_dict = FrozenDict(new_config)
+
+        if safety_checker is None:
+            logger.warn(
+                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
+                " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
+                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
+                " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
+                " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
+                " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
+            )
+
+        self.register_modules(
            vae=vae,
            text_encoder=text_encoder,
            tokenizer=tokenizer,
@@ -423,178 +449,41 @@ class StableDiffusionLongPromptWeightingPipeline(StableDiffusionPipeline):
            scheduler=scheduler,
            safety_checker=safety_checker,
            feature_extractor=feature_extractor,
-            requires_safety_checker=requires_safety_checker,
        )

-    def _encode_prompt(
-        self,
-        prompt,
-        device,
-        num_images_per_prompt,
-        do_classifier_free_guidance,
-        negative_prompt,
-        max_embeddings_multiples,
-    ):
+    def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"):
        r"""
-        Encodes the prompt into text encoder hidden states.
+        Enable sliced attention computation.
+
+        When this option is enabled, the attention module will split the input tensor in slices, to compute attention
+        in several steps. This is useful to save some memory in exchange for a small speed decrease.

        Args:
-            prompt (`str` or `list(int)`):
-                prompt to be encoded
-            device: (`torch.device`):
-                torch device
-            num_images_per_prompt (`int`):
-                number of images that should be generated per prompt
-            do_classifier_free_guidance (`bool`):
-                whether to use classifier free guidance or not
-            negative_prompt (`str` or `List[str]`):
-                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
-                if `guidance_scale` is less than `1`).
-            max_embeddings_multiples (`int`, *optional*, defaults to `3`):
-                The max multiple length of prompt embeddings compared to the max output length of text encoder.
+            slice_size (`str` or `int`, *optional*, defaults to `"auto"`):
+                When `"auto"`, halves the input to the attention heads, so attention will be computed in two steps. If
+                a number is provided, uses as many slices as `attention_head_dim // slice_size`. In this case,
+                `attention_head_dim` must be a multiple of `slice_size`.
        """
-        batch_size = len(prompt) if isinstance(prompt, list) else 1
+        if slice_size == "auto":
+            # half the attention head size is usually a good trade-off between
+            # speed and memory
+            slice_size = self.unet.config.attention_head_dim // 2
+        self.unet.set_attention_slice(slice_size)

-        if negative_prompt is None:
-            negative_prompt = [""] * batch_size
-        elif isinstance(negative_prompt, str):
-            negative_prompt = [negative_prompt] * batch_size
-        if batch_size != len(negative_prompt):
-            raise ValueError(
-                f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
-                f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
-                " the batch size of `prompt`."
-            )
-
-        text_embeddings, uncond_embeddings = get_weighted_text_embeddings(
-            pipe=self,
-            prompt=prompt,
-            uncond_prompt=negative_prompt if do_classifier_free_guidance else None,
-            max_embeddings_multiples=max_embeddings_multiples,
-        )
-        bs_embed, seq_len, _ = text_embeddings.shape
-        text_embeddings = text_embeddings.repeat(1, num_images_per_prompt, 1)
-        text_embeddings = text_embeddings.view(bs_embed * num_images_per_prompt, seq_len, -1)
-
-        if do_classifier_free_guidance:
-            bs_embed, seq_len, _ = uncond_embeddings.shape
-            uncond_embeddings = uncond_embeddings.repeat(1, num_images_per_prompt, 1)
-            uncond_embeddings = uncond_embeddings.view(bs_embed * num_images_per_prompt, seq_len, -1)
-            text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
-
-        return text_embeddings
-
-    def check_inputs(self, prompt, height, width, strength, callback_steps):
-        if not isinstance(prompt, str) and not isinstance(prompt, list):
-            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
-
-        if strength < 0 or strength > 1:
-            raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}")
-
-        if height % 8 != 0 or width % 8 != 0:
-            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
-
-        if (callback_steps is None) or (
-            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
-        ):
-            raise ValueError(
-                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
-                f" {type(callback_steps)}."
-            )
-
-    def get_timesteps(self, num_inference_steps, strength, device, is_text2img):
-        if is_text2img:
-            return self.scheduler.timesteps.to(device), num_inference_steps
-        else:
-            # get the original timestep using init_timestep
-            offset = self.scheduler.config.get("steps_offset", 0)
-            init_timestep = int(num_inference_steps * strength) + offset
-            init_timestep = min(init_timestep, num_inference_steps)
-
-            t_start = max(num_inference_steps - init_timestep + offset, 0)
-            timesteps = self.scheduler.timesteps[t_start:].to(device)
-            return timesteps, num_inference_steps - t_start
-
-    def run_safety_checker(self, image, device, dtype):
-        if self.safety_checker is not None:
-            safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(device)
-            image, has_nsfw_concept = self.safety_checker(
-                images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
-            )
-        else:
-            has_nsfw_concept = None
-        return image, has_nsfw_concept
-
-    def decode_latents(self, latents):
-        latents = 1 / 0.18215 * latents
-        image = self.vae.decode(latents).sample
-        image = (image / 2 + 0.5).clamp(0, 1)
-        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloa16
-        image = image.cpu().permute(0, 2, 3, 1).float().numpy()
-        return image
-
-    def prepare_extra_step_kwargs(self, generator, eta):
-        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
-        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
-        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
-        # and should be between [0, 1]
-
-        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
-        extra_step_kwargs = {}
-        if accepts_eta:
-            extra_step_kwargs["eta"] = eta
-
-        # check if the scheduler accepts generator
-        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
-        if accepts_generator:
-            extra_step_kwargs["generator"] = generator
-        return extra_step_kwargs
-
-    def prepare_latents(self, image, timestep, batch_size, height, width, dtype, device, generator, latents=None):
-        if image is None:
-            shape = (
-                batch_size,
-                self.unet.in_channels,
-                height // self.vae_scale_factor,
-                width // self.vae_scale_factor,
-            )
-
-            if latents is None:
-                if device.type == "mps":
-                    # randn does not work reproducibly on mps
-                    latents = torch.randn(shape, generator=generator, device="cpu", dtype=dtype).to(device)
-                else:
-                    latents = torch.randn(shape, generator=generator, device=device, dtype=dtype)
-            else:
-                if latents.shape != shape:
-                    raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}")
-                latents = latents.to(device)
-
-            # scale the initial noise by the standard deviation required by the scheduler
-            latents = latents * self.scheduler.init_noise_sigma
-            return latents, None, None
-        else:
-            init_latent_dist = self.vae.encode(image).latent_dist
-            init_latents = init_latent_dist.sample(generator=generator)
-            init_latents = 0.18215 * init_latents
-            init_latents = torch.cat([init_latents] * batch_size, dim=0)
-            init_latents_orig = init_latents
-            shape = init_latents.shape
-
-            # add noise to latents using the timesteps
-            if device.type == "mps":
-                noise = torch.randn(shape, generator=generator, device="cpu", dtype=dtype).to(device)
-            else:
-                noise = torch.randn(shape, generator=generator, device=device, dtype=dtype)
-            latents = self.scheduler.add_noise(init_latents, noise, timestep)
-            return latents, init_latents_orig, noise
+    def disable_attention_slicing(self):
+        r"""
+        Disable sliced attention computation. If `enable_attention_slicing` was previously invoked, this method will go
+        back to computing attention in one step.
+        """
+        # set slice_size = `None` to disable `attention slicing`
+        self.enable_attention_slicing(None)

    @torch.no_grad()
    def __call__(
        self,
        prompt: Union[str, List[str]],
        negative_prompt: Optional[Union[str, List[str]]] = None,
-        image: Union[torch.FloatTensor, PIL.Image.Image] = None,
+        init_image: Union[torch.FloatTensor, PIL.Image.Image] = None,
        mask_image: Union[torch.FloatTensor, PIL.Image.Image] = None,
        height: int = 512,
        width: int = 512,
@@ -609,7 +498,6 @@ class StableDiffusionLongPromptWeightingPipeline(StableDiffusionPipeline):
        output_type: Optional[str] = "pil",
        return_dict: bool = True,
        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
-        is_cancelled_callback: Optional[Callable[[], bool]] = None,
        callback_steps: Optional[int] = 1,
        **kwargs,
    ):
@@ -622,11 +510,11 @@ class StableDiffusionLongPromptWeightingPipeline(StableDiffusionPipeline):
            negative_prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
                if `guidance_scale` is less than `1`).
-            image (`torch.FloatTensor` or `PIL.Image.Image`):
+            init_image (`torch.FloatTensor` or `PIL.Image.Image`):
                `Image`, or tensor representing an image batch, that will be used as the starting point for the
                process.
            mask_image (`torch.FloatTensor` or `PIL.Image.Image`):
-                `Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be
+                `Image`, or tensor representing an image batch, to mask `init_image`. White pixels in the mask will be
                replaced by noise and therefore repainted, while black pixels will be preserved. If `mask_image` is a
                PIL image, it will be converted to a single channel (luminance) before use. If it's a tensor, it should
                contain one color channel (L) instead of 3, so the expected shape would be `(B, H, W, 1)`.
@@ -644,11 +532,11 @@ class StableDiffusionLongPromptWeightingPipeline(StableDiffusionPipeline):
                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
                usually at the expense of lower image quality.
            strength (`float`, *optional*, defaults to 0.8):
-                Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1.
-                `image` will be used as a starting point, adding more noise to it the larger the `strength`. The
+                Conceptually, indicates how much to transform the reference `init_image`. Must be between 0 and 1.
+                `init_image` will be used as a starting point, adding more noise to it the larger the `strength`. The
                number of denoising steps depends on the amount of noise initially added. When `strength` is 1, added
                noise will be maximum and the denoising process will run for the full number of iterations specified in
-                `num_inference_steps`. A value of 1, therefore, essentially ignores `image`.
+                `num_inference_steps`. A value of 1, therefore, essentially ignores `init_image`.
            num_images_per_prompt (`int`, *optional*, defaults to 1):
                The number of images to generate per prompt.
            eta (`float`, *optional*, defaults to 0.0):
@@ -672,130 +560,230 @@ class StableDiffusionLongPromptWeightingPipeline(StableDiffusionPipeline):
            callback (`Callable`, *optional*):
                A function that will be called every `callback_steps` steps during inference. The function will be
                called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
-            is_cancelled_callback (`Callable`, *optional*):
-                A function that will be called every `callback_steps` steps during inference. If the function returns
-                `True`, the inference will be cancelled.
            callback_steps (`int`, *optional*, defaults to 1):
                The frequency at which the `callback` function will be called. If not specified, the callback will be
                called at every step.

        Returns:
-            `None` if cancelled by `is_cancelled_callback`,
            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
            When returning a tuple, the first element is a list with the generated images, and the second element is a
            list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
            (nsfw) content, according to the `safety_checker`.
        """
-        message = "Please use `image` instead of `init_image`."
-        init_image = deprecate("init_image", "0.12.0", message, take_from=kwargs)
-        image = init_image or image

-        # 0. Default height and width to unet
-        height = height or self.unet.config.sample_size * self.vae_scale_factor
-        width = width or self.unet.config.sample_size * self.vae_scale_factor
+        if isinstance(prompt, str):
+            batch_size = 1
+            prompt = [prompt]
+        elif isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")

-        # 1. Check inputs. Raise error if not correct
-        self.check_inputs(prompt, height, width, strength, callback_steps)
+        if strength < 0 or strength > 1:
+            raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}")
+
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if (callback_steps is None) or (
+            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+        ):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+        # get prompt text embeddings

-        # 2. Define call parameters
-        batch_size = 1 if isinstance(prompt, str) else len(prompt)
-        device = self._execution_device
        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
        # corresponds to doing no classifier free guidance.
        do_classifier_free_guidance = guidance_scale > 1.0
+        # get unconditional embeddings for classifier free guidance
+        if negative_prompt is None:
+            negative_prompt = [""] * batch_size
+        elif isinstance(negative_prompt, str):
+            negative_prompt = [negative_prompt] * batch_size
+        if batch_size != len(negative_prompt):
+            raise ValueError(
+                f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                " the batch size of `prompt`."
+            )

-        # 3. Encode input prompt
-        text_embeddings = self._encode_prompt(
-            prompt,
-            device,
-            num_images_per_prompt,
-            do_classifier_free_guidance,
-            negative_prompt,
-            max_embeddings_multiples,
+        text_embeddings, uncond_embeddings = get_weighted_text_embeddings(
+            pipe=self,
+            prompt=prompt,
+            uncond_prompt=negative_prompt if do_classifier_free_guidance else None,
+            max_embeddings_multiples=max_embeddings_multiples,
+            **kwargs,
        )
-        dtype = text_embeddings.dtype
+        bs_embed, seq_len, _ = text_embeddings.shape
+        text_embeddings = text_embeddings.repeat(1, num_images_per_prompt, 1)
+        text_embeddings = text_embeddings.view(bs_embed * num_images_per_prompt, seq_len, -1)

-        # 4. Preprocess image and mask
-        if isinstance(image, PIL.Image.Image):
-            image = preprocess_image(image)
-        if image is not None:
-            image = image.to(device=self.device, dtype=dtype)
-        if isinstance(mask_image, PIL.Image.Image):
-            mask_image = preprocess_mask(mask_image, self.vae_scale_factor)
-        if mask_image is not None:
-            mask = mask_image.to(device=self.device, dtype=dtype)
-            mask = torch.cat([mask] * batch_size * num_images_per_prompt)
+        if do_classifier_free_guidance:
+            bs_embed, seq_len, _ = uncond_embeddings.shape
+            uncond_embeddings = uncond_embeddings.repeat(1, num_images_per_prompt, 1)
+            uncond_embeddings = uncond_embeddings.view(bs_embed * num_images_per_prompt, seq_len, -1)
+            text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
+
+        # set timesteps
+        self.scheduler.set_timesteps(num_inference_steps)
+
+        latents_dtype = text_embeddings.dtype
+        init_latents_orig = None
+        mask = None
+        noise = None
+
+        if init_image is None:
+            # get the initial random noise unless the user supplied it
+
+            # Unlike in other pipelines, latents need to be generated in the target device
+            # for 1-to-1 results reproducibility with the CompVis implementation.
+            # However this currently doesn't work in `mps`.
+            latents_shape = (
+                batch_size * num_images_per_prompt,
+                self.unet.in_channels,
+                height // 8,
+                width // 8,
+            )
+
+            if latents is None:
+                if self.device.type == "mps":
+                    # randn does not exist on mps
+                    latents = torch.randn(
+                        latents_shape,
+                        generator=generator,
+                        device="cpu",
+                        dtype=latents_dtype,
+                    ).to(self.device)
+                else:
+                    latents = torch.randn(
+                        latents_shape,
+                        generator=generator,
+                        device=self.device,
+                        dtype=latents_dtype,
+                    )
+            else:
+                if latents.shape != latents_shape:
+                    raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}")
+                latents = latents.to(self.device)
+
+            timesteps = self.scheduler.timesteps.to(self.device)
+
+            # scale the initial noise by the standard deviation required by the scheduler
+            latents = latents * self.scheduler.init_noise_sigma
        else:
-            mask = None
+            if isinstance(init_image, PIL.Image.Image):
+                init_image = preprocess_image(init_image)
+            # encode the init image into latents and scale the latents
+            init_image = init_image.to(device=self.device, dtype=latents_dtype)
+            init_latent_dist = self.vae.encode(init_image).latent_dist
+            init_latents = init_latent_dist.sample(generator=generator)
+            init_latents = 0.18215 * init_latents
+            init_latents = torch.cat([init_latents] * batch_size * num_images_per_prompt, dim=0)
+            init_latents_orig = init_latents

-        # 5. set timesteps
-        self.scheduler.set_timesteps(num_inference_steps, device=device)
-        timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device, image is None)
-        latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
+            # preprocess mask
+            if mask_image is not None:
+                if isinstance(mask_image, PIL.Image.Image):
+                    mask_image = preprocess_mask(mask_image)
+                mask_image = mask_image.to(device=self.device, dtype=latents_dtype)
+                mask = torch.cat([mask_image] * batch_size * num_images_per_prompt)

-        # 6. Prepare latent variables
-        latents, init_latents_orig, noise = self.prepare_latents(
-            image,
-            latent_timestep,
-            batch_size * num_images_per_prompt,
-            height,
-            width,
-            dtype,
-            device,
-            generator,
-            latents,
-        )
+                # check sizes
+                if not mask.shape == init_latents.shape:
+                    raise ValueError("The mask and init_image should be the same size!")

-        # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
-        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+            # get the original timestep using init_timestep
+            offset = self.scheduler.config.get("steps_offset", 0)
+            init_timestep = int(num_inference_steps * strength) + offset
+            init_timestep = min(init_timestep, num_inference_steps)

-        # 8. Denoising loop
-        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
-        with self.progress_bar(total=num_inference_steps) as progress_bar:
-            for i, t in enumerate(timesteps):
-                # expand the latents if we are doing classifier free guidance
-                latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
-                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+            timesteps = self.scheduler.timesteps[-init_timestep]
+            timesteps = torch.tensor([timesteps] * batch_size * num_images_per_prompt, device=self.device)

-                # predict the noise residual
-                noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample
+            # add noise to latents using the timesteps
+            if self.device.type == "mps":
+                # randn does not exist on mps
+                noise = torch.randn(
+                    init_latents.shape,
+                    generator=generator,
+                    device="cpu",
+                    dtype=latents_dtype,
+                ).to(self.device)
+            else:
+                noise = torch.randn(
+                    init_latents.shape,
+                    generator=generator,
+                    device=self.device,
+                    dtype=latents_dtype,
+                )
+            latents = self.scheduler.add_noise(init_latents, noise, timesteps)

-                # perform guidance
-                if do_classifier_free_guidance:
-                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
-                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+            t_start = max(num_inference_steps - init_timestep + offset, 0)
+            timesteps = self.scheduler.timesteps[t_start:].to(self.device)

-                # compute the previous noisy sample x_t -> x_t-1
-                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta

-                if mask is not None:
-                    # masking
-                    init_latents_proper = self.scheduler.add_noise(init_latents_orig, noise, torch.tensor([t]))
-                    latents = (init_latents_proper * mask) + (latents * (1 - mask))
+        for i, t in enumerate(self.progress_bar(timesteps)):
+            # expand the latents if we are doing classifier free guidance
+            latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+            latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)

-                # call the callback, if provided
-                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
-                    progress_bar.update()
-                    if i % callback_steps == 0:
-                        if callback is not None:
-                            callback(i, t, latents)
-                        if is_cancelled_callback is not None and is_cancelled_callback():
-                            return None
+            # predict the noise residual
+            noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample

-        # 9. Post-processing
-        image = self.decode_latents(latents)
+            # perform guidance
+            if do_classifier_free_guidance:
+                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)

-        # 10. Run safety checker
-        image, has_nsfw_concept = self.run_safety_checker(image, device, text_embeddings.dtype)
+            # compute the previous noisy sample x_t -> x_t-1
+            latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
+
+            if mask is not None:
+                # masking
+                init_latents_proper = self.scheduler.add_noise(init_latents_orig, noise, torch.tensor([t]))
+                latents = (init_latents_proper * mask) + (latents * (1 - mask))
+
+            # call the callback, if provided
+            if callback is not None and i % callback_steps == 0:
+                callback(i, t, latents)
+
+        latents = 1 / 0.18215 * latents
+        image = self.vae.decode(latents).sample
+
+        image = (image / 2 + 0.5).clamp(0, 1)
+
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
+        image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+
+        if self.safety_checker is not None:
+            safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(
+                self.device
+            )
+            image, has_nsfw_concept = self.safety_checker(
+                images=image,
+                clip_input=safety_checker_input.pixel_values.to(text_embeddings.dtype),
+            )
+        else:
+            has_nsfw_concept = None

-        # 11. Convert to PIL
        if output_type == "pil":
            image = self.numpy_to_pil(image)

        if not return_dict:
-            return image, has_nsfw_concept
+            return (image, has_nsfw_concept)

        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)

@@ -815,7 +803,6 @@ class StableDiffusionLongPromptWeightingPipeline(StableDiffusionPipeline):
        output_type: Optional[str] = "pil",
        return_dict: bool = True,
        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
-        is_cancelled_callback: Optional[Callable[[], bool]] = None,
        callback_steps: Optional[int] = 1,
        **kwargs,
    ):
@@ -863,9 +850,6 @@ class StableDiffusionLongPromptWeightingPipeline(StableDiffusionPipeline):
            callback (`Callable`, *optional*):
                A function that will be called every `callback_steps` steps during inference. The function will be
                called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
-            is_cancelled_callback (`Callable`, *optional*):
-                A function that will be called every `callback_steps` steps during inference. If the function returns
-                `True`, the inference will be cancelled.
            callback_steps (`int`, *optional*, defaults to 1):
                The frequency at which the `callback` function will be called. If not specified, the callback will be
                called at every step.
@@ -891,14 +875,13 @@ class StableDiffusionLongPromptWeightingPipeline(StableDiffusionPipeline):
            output_type=output_type,
            return_dict=return_dict,
            callback=callback,
-            is_cancelled_callback=is_cancelled_callback,
            callback_steps=callback_steps,
            **kwargs,
        )

    def img2img(
        self,
-        image: Union[torch.FloatTensor, PIL.Image.Image],
+        init_image: Union[torch.FloatTensor, PIL.Image.Image],
        prompt: Union[str, List[str]],
        negative_prompt: Optional[Union[str, List[str]]] = None,
        strength: float = 0.8,
@@ -911,14 +894,13 @@ class StableDiffusionLongPromptWeightingPipeline(StableDiffusionPipeline):
        output_type: Optional[str] = "pil",
        return_dict: bool = True,
        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
-        is_cancelled_callback: Optional[Callable[[], bool]] = None,
        callback_steps: Optional[int] = 1,
        **kwargs,
    ):
        r"""
        Function for image-to-image generation.
        Args:
-            image (`torch.FloatTensor` or `PIL.Image.Image`):
+            init_image (`torch.FloatTensor` or `PIL.Image.Image`):
                `Image`, or tensor representing an image batch, that will be used as the starting point for the
                process.
            prompt (`str` or `List[str]`):
@@ -927,11 +909,11 @@ class StableDiffusionLongPromptWeightingPipeline(StableDiffusionPipeline):
                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
                if `guidance_scale` is less than `1`).
            strength (`float`, *optional*, defaults to 0.8):
-                Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1.
-                `image` will be used as a starting point, adding more noise to it the larger the `strength`. The
+                Conceptually, indicates how much to transform the reference `init_image`. Must be between 0 and 1.
+                `init_image` will be used as a starting point, adding more noise to it the larger the `strength`. The
                number of denoising steps depends on the amount of noise initially added. When `strength` is 1, added
                noise will be maximum and the denoising process will run for the full number of iterations specified in
-                `num_inference_steps`. A value of 1, therefore, essentially ignores `image`.
+                `num_inference_steps`. A value of 1, therefore, essentially ignores `init_image`.
            num_inference_steps (`int`, *optional*, defaults to 50):
                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                expense of slower inference. This parameter will be modulated by `strength`.
@@ -960,9 +942,6 @@ class StableDiffusionLongPromptWeightingPipeline(StableDiffusionPipeline):
            callback (`Callable`, *optional*):
                A function that will be called every `callback_steps` steps during inference. The function will be
                called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
-            is_cancelled_callback (`Callable`, *optional*):
-                A function that will be called every `callback_steps` steps during inference. If the function returns
-                `True`, the inference will be cancelled.
            callback_steps (`int`, *optional*, defaults to 1):
                The frequency at which the `callback` function will be called. If not specified, the callback will be
                called at every step.
@@ -976,7 +955,7 @@ class StableDiffusionLongPromptWeightingPipeline(StableDiffusionPipeline):
        return self.__call__(
            prompt=prompt,
            negative_prompt=negative_prompt,
-            image=image,
+            init_image=init_image,
            num_inference_steps=num_inference_steps,
            guidance_scale=guidance_scale,
            strength=strength,
@@ -987,14 +966,13 @@ class StableDiffusionLongPromptWeightingPipeline(StableDiffusionPipeline):
            output_type=output_type,
            return_dict=return_dict,
            callback=callback,
-            is_cancelled_callback=is_cancelled_callback,
            callback_steps=callback_steps,
            **kwargs,
        )

    def inpaint(
        self,
-        image: Union[torch.FloatTensor, PIL.Image.Image],
+        init_image: Union[torch.FloatTensor, PIL.Image.Image],
        mask_image: Union[torch.FloatTensor, PIL.Image.Image],
        prompt: Union[str, List[str]],
        negative_prompt: Optional[Union[str, List[str]]] = None,
@@ -1008,18 +986,17 @@ class StableDiffusionLongPromptWeightingPipeline(StableDiffusionPipeline):
        output_type: Optional[str] = "pil",
        return_dict: bool = True,
        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
-        is_cancelled_callback: Optional[Callable[[], bool]] = None,
        callback_steps: Optional[int] = 1,
        **kwargs,
    ):
        r"""
        Function for inpaint.
        Args:
-            image (`torch.FloatTensor` or `PIL.Image.Image`):
+            init_image (`torch.FloatTensor` or `PIL.Image.Image`):
                `Image`, or tensor representing an image batch, that will be used as the starting point for the
                process. This is the image whose masked region will be inpainted.
            mask_image (`torch.FloatTensor` or `PIL.Image.Image`):
-                `Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be
+                `Image`, or tensor representing an image batch, to mask `init_image`. White pixels in the mask will be
                replaced by noise and therefore repainted, while black pixels will be preserved. If `mask_image` is a
                PIL image, it will be converted to a single channel (luminance) before use. If it's a tensor, it should
                contain one color channel (L) instead of 3, so the expected shape would be `(B, H, W, 1)`.
@@ -1031,7 +1008,7 @@ class StableDiffusionLongPromptWeightingPipeline(StableDiffusionPipeline):
            strength (`float`, *optional*, defaults to 0.8):
                Conceptually, indicates how much to inpaint the masked area. Must be between 0 and 1. When `strength`
                is 1, the denoising process will be run on the masked area for the full number of iterations specified
-                in `num_inference_steps`. `image` will be used as a reference for the masked area, adding more
+                in `num_inference_steps`. `init_image` will be used as a reference for the masked area, adding more
                noise to that region the larger the `strength`. If `strength` is 0, no inpainting will occur.
            num_inference_steps (`int`, *optional*, defaults to 50):
                The reference number of denoising steps. More denoising steps usually lead to a higher quality image at
@@ -1061,9 +1038,6 @@ class StableDiffusionLongPromptWeightingPipeline(StableDiffusionPipeline):
            callback (`Callable`, *optional*):
                A function that will be called every `callback_steps` steps during inference. The function will be
                called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
-            is_cancelled_callback (`Callable`, *optional*):
-                A function that will be called every `callback_steps` steps during inference. If the function returns
-                `True`, the inference will be cancelled.
            callback_steps (`int`, *optional*, defaults to 1):
                The frequency at which the `callback` function will be called. If not specified, the callback will be
                called at every step.
@@ -1077,7 +1051,7 @@ class StableDiffusionLongPromptWeightingPipeline(StableDiffusionPipeline):
        return self.__call__(
            prompt=prompt,
            negative_prompt=negative_prompt,
-            image=image,
+            init_image=init_image,
            mask_image=mask_image,
            num_inference_steps=num_inference_steps,
            guidance_scale=guidance_scale,
@@ -1089,7 +1063,6 @@ class StableDiffusionLongPromptWeightingPipeline(StableDiffusionPipeline):
            output_type=output_type,
            return_dict=return_dict,
            callback=callback,
-            is_cancelled_callback=is_cancelled_callback,
            callback_steps=callback_steps,
            **kwargs,
        )
--- a/examples/community/lpw_stable_diffusion_onnx.py
+++ b/examples/community/lpw_stable_diffusion_onnx.py
@@ -6,10 +6,11 @@ import numpy as np
 import torch

 import PIL
-from diffusers import OnnxStableDiffusionPipeline, SchedulerMixin
-from diffusers.onnx_utils import ORT_TO_NP_TYPE, OnnxRuntimeModel
+from diffusers.onnx_utils import OnnxRuntimeModel
+from diffusers.pipeline_utils import DiffusionPipeline
 from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
-from diffusers.utils import PIL_INTERPOLATION, deprecate, logging
+from diffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
+from diffusers.utils import logging
 from transformers import CLIPFeatureExtractor, CLIPTokenizer


@@ -240,7 +241,7 @@ def get_weighted_text_embeddings(
    Also, to regularize of the embedding, the weighted embedding would be scaled to preserve the original mean.

    Args:
-        pipe (`OnnxStableDiffusionPipeline`):
+        pipe (`DiffusionPipeline`):
            Pipe to provide access to the tokenizer and the text encoder.
        prompt (`str` or `List[str]`):
            The prompt or prompts to guide the image generation.
@@ -364,17 +365,17 @@ def get_weighted_text_embeddings(
 def preprocess_image(image):
    w, h = image.size
    w, h = map(lambda x: x - x % 32, (w, h))  # resize to integer multiple of 32
-    image = image.resize((w, h), resample=PIL_INTERPOLATION["lanczos"])
+    image = image.resize((w, h), resample=PIL.Image.LANCZOS)
    image = np.array(image).astype(np.float32) / 255.0
    image = image[None].transpose(0, 3, 1, 2)
    return 2.0 * image - 1.0


-def preprocess_mask(mask, scale_factor=8):
+def preprocess_mask(mask):
    mask = mask.convert("L")
    w, h = mask.size
    w, h = map(lambda x: x - x % 32, (w, h))  # resize to integer multiple of 32
-    mask = mask.resize((w // scale_factor, h // scale_factor), resample=PIL_INTERPOLATION["nearest"])
+    mask = mask.resize((w // 8, h // 8), resample=PIL.Image.NEAREST)
    mask = np.array(mask).astype(np.float32) / 255.0
    mask = np.tile(mask, (4, 1, 1))
    mask = mask[None].transpose(0, 1, 2, 3)  # what does this step do?
@@ -382,7 +383,7 @@ def preprocess_mask(mask, scale_factor=8):
    return mask


-class OnnxStableDiffusionLongPromptWeightingPipeline(OnnxStableDiffusionPipeline):
+class OnnxStableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
    r"""
    Pipeline for text-to-image generation using Stable Diffusion without tokens length limit, and support parsing
    weighting in prompt.
@@ -398,12 +399,12 @@ class OnnxStableDiffusionLongPromptWeightingPipeline(OnnxStableDiffusionPipeline
        text_encoder: OnnxRuntimeModel,
        tokenizer: CLIPTokenizer,
        unet: OnnxRuntimeModel,
-        scheduler: SchedulerMixin,
+        scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler],
        safety_checker: OnnxRuntimeModel,
        feature_extractor: CLIPFeatureExtractor,
-        requires_safety_checker: bool = True,
    ):
-        super().__init__(
+        super().__init__()
+        self.register_modules(
            vae_encoder=vae_encoder,
            vae_decoder=vae_decoder,
            text_encoder=text_encoder,
@@ -412,177 +413,14 @@ class OnnxStableDiffusionLongPromptWeightingPipeline(OnnxStableDiffusionPipeline
            scheduler=scheduler,
            safety_checker=safety_checker,
            feature_extractor=feature_extractor,
-            requires_safety_checker=requires_safety_checker,
        )
-        self.unet_in_channels = 4
-        self.vae_scale_factor = 8
-
-    def _encode_prompt(
-        self,
-        prompt,
-        num_images_per_prompt,
-        do_classifier_free_guidance,
-        negative_prompt,
-        max_embeddings_multiples,
-    ):
-        r"""
-        Encodes the prompt into text encoder hidden states.
-
-        Args:
-            prompt (`str` or `list(int)`):
-                prompt to be encoded
-            num_images_per_prompt (`int`):
-                number of images that should be generated per prompt
-            do_classifier_free_guidance (`bool`):
-                whether to use classifier free guidance or not
-            negative_prompt (`str` or `List[str]`):
-                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
-                if `guidance_scale` is less than `1`).
-            max_embeddings_multiples (`int`, *optional*, defaults to `3`):
-                The max multiple length of prompt embeddings compared to the max output length of text encoder.
-        """
-        batch_size = len(prompt) if isinstance(prompt, list) else 1
-
-        if negative_prompt is None:
-            negative_prompt = [""] * batch_size
-        elif isinstance(negative_prompt, str):
-            negative_prompt = [negative_prompt] * batch_size
-        if batch_size != len(negative_prompt):
-            raise ValueError(
-                f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
-                f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
-                " the batch size of `prompt`."
-            )
-
-        text_embeddings, uncond_embeddings = get_weighted_text_embeddings(
-            pipe=self,
-            prompt=prompt,
-            uncond_prompt=negative_prompt if do_classifier_free_guidance else None,
-            max_embeddings_multiples=max_embeddings_multiples,
-        )
-
-        text_embeddings = text_embeddings.repeat(num_images_per_prompt, 0)
-        if do_classifier_free_guidance:
-            uncond_embeddings = uncond_embeddings.repeat(num_images_per_prompt, 0)
-            text_embeddings = np.concatenate([uncond_embeddings, text_embeddings])
-
-        return text_embeddings
-
-    def check_inputs(self, prompt, height, width, strength, callback_steps):
-        if not isinstance(prompt, str) and not isinstance(prompt, list):
-            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
-
-        if strength < 0 or strength > 1:
-            raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}")
-
-        if height % 8 != 0 or width % 8 != 0:
-            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
-
-        if (callback_steps is None) or (
-            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
-        ):
-            raise ValueError(
-                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
-                f" {type(callback_steps)}."
-            )
-
-    def get_timesteps(self, num_inference_steps, strength, is_text2img):
-        if is_text2img:
-            return self.scheduler.timesteps, num_inference_steps
-        else:
-            # get the original timestep using init_timestep
-            offset = self.scheduler.config.get("steps_offset", 0)
-            init_timestep = int(num_inference_steps * strength) + offset
-            init_timestep = min(init_timestep, num_inference_steps)
-
-            t_start = max(num_inference_steps - init_timestep + offset, 0)
-            timesteps = self.scheduler.timesteps[t_start:]
-            return timesteps, num_inference_steps - t_start
-
-    def run_safety_checker(self, image):
-        if self.safety_checker is not None:
-            safety_checker_input = self.feature_extractor(
-                self.numpy_to_pil(image), return_tensors="np"
-            ).pixel_values.astype(image.dtype)
-            # There will throw an error if use safety_checker directly and batchsize>1
-            images, has_nsfw_concept = [], []
-            for i in range(image.shape[0]):
-                image_i, has_nsfw_concept_i = self.safety_checker(
-                    clip_input=safety_checker_input[i : i + 1], images=image[i : i + 1]
-                )
-                images.append(image_i)
-                has_nsfw_concept.append(has_nsfw_concept_i[0])
-            image = np.concatenate(images)
-        else:
-            has_nsfw_concept = None
-        return image, has_nsfw_concept
-
-    def decode_latents(self, latents):
-        latents = 1 / 0.18215 * latents
-        # image = self.vae_decoder(latent_sample=latents)[0]
-        # it seems likes there is a strange result for using half-precision vae decoder if batchsize>1
-        image = np.concatenate(
-            [self.vae_decoder(latent_sample=latents[i : i + 1])[0] for i in range(latents.shape[0])]
-        )
-        image = np.clip(image / 2 + 0.5, 0, 1)
-        image = image.transpose((0, 2, 3, 1))
-        return image
-
-    def prepare_extra_step_kwargs(self, generator, eta):
-        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
-        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
-        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
-        # and should be between [0, 1]
-
-        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
-        extra_step_kwargs = {}
-        if accepts_eta:
-            extra_step_kwargs["eta"] = eta
-
-        # check if the scheduler accepts generator
-        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
-        if accepts_generator:
-            extra_step_kwargs["generator"] = generator
-        return extra_step_kwargs
-
-    def prepare_latents(self, image, timestep, batch_size, height, width, dtype, generator, latents=None):
-        if image is None:
-            shape = (
-                batch_size,
-                self.unet_in_channels,
-                height // self.vae_scale_factor,
-                width // self.vae_scale_factor,
-            )
-
-            if latents is None:
-                latents = torch.randn(shape, generator=generator, device="cpu").numpy().astype(dtype)
-            else:
-                if latents.shape != shape:
-                    raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}")
-
-            # scale the initial noise by the standard deviation required by the scheduler
-            latents = (torch.from_numpy(latents) * self.scheduler.init_noise_sigma).numpy()
-            return latents, None, None
-        else:
-            init_latents = self.vae_encoder(sample=image)[0]
-            init_latents = 0.18215 * init_latents
-            init_latents = np.concatenate([init_latents] * batch_size, axis=0)
-            init_latents_orig = init_latents
-            shape = init_latents.shape
-
-            # add noise to latents using the timesteps
-            noise = torch.randn(shape, generator=generator, device="cpu").numpy().astype(dtype)
-            latents = self.scheduler.add_noise(
-                torch.from_numpy(init_latents), torch.from_numpy(noise), timestep
-            ).numpy()
-            return latents, init_latents_orig, noise

    @torch.no_grad()
    def __call__(
        self,
        prompt: Union[str, List[str]],
        negative_prompt: Optional[Union[str, List[str]]] = None,
-        image: Union[np.ndarray, PIL.Image.Image] = None,
+        init_image: Union[np.ndarray, PIL.Image.Image] = None,
        mask_image: Union[np.ndarray, PIL.Image.Image] = None,
        height: int = 512,
        width: int = 512,
@@ -591,13 +429,12 @@ class OnnxStableDiffusionLongPromptWeightingPipeline(OnnxStableDiffusionPipeline
        strength: float = 0.8,
        num_images_per_prompt: Optional[int] = 1,
        eta: float = 0.0,
-        generator: Optional[torch.Generator] = None,
+        generator: Optional[np.random.RandomState] = None,
        latents: Optional[np.ndarray] = None,
        max_embeddings_multiples: Optional[int] = 3,
        output_type: Optional[str] = "pil",
        return_dict: bool = True,
        callback: Optional[Callable[[int, int, np.ndarray], None]] = None,
-        is_cancelled_callback: Optional[Callable[[], bool]] = None,
        callback_steps: Optional[int] = 1,
        **kwargs,
    ):
@@ -610,11 +447,11 @@ class OnnxStableDiffusionLongPromptWeightingPipeline(OnnxStableDiffusionPipeline
            negative_prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
                if `guidance_scale` is less than `1`).
-            image (`np.ndarray` or `PIL.Image.Image`):
+            init_image (`np.ndarray` or `PIL.Image.Image`):
                `Image`, or tensor representing an image batch, that will be used as the starting point for the
                process.
            mask_image (`np.ndarray` or `PIL.Image.Image`):
-                `Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be
+                `Image`, or tensor representing an image batch, to mask `init_image`. White pixels in the mask will be
                replaced by noise and therefore repainted, while black pixels will be preserved. If `mask_image` is a
                PIL image, it will be converted to a single channel (luminance) before use. If it's a tensor, it should
                contain one color channel (L) instead of 3, so the expected shape would be `(B, H, W, 1)`.
@@ -632,19 +469,18 @@ class OnnxStableDiffusionLongPromptWeightingPipeline(OnnxStableDiffusionPipeline
                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
                usually at the expense of lower image quality.
            strength (`float`, *optional*, defaults to 0.8):
-                Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1.
-                `image` will be used as a starting point, adding more noise to it the larger the `strength`. The
+                Conceptually, indicates how much to transform the reference `init_image`. Must be between 0 and 1.
+                `init_image` will be used as a starting point, adding more noise to it the larger the `strength`. The
                number of denoising steps depends on the amount of noise initially added. When `strength` is 1, added
                noise will be maximum and the denoising process will run for the full number of iterations specified in
-                `num_inference_steps`. A value of 1, therefore, essentially ignores `image`.
+                `num_inference_steps`. A value of 1, therefore, essentially ignores `init_image`.
            num_images_per_prompt (`int`, *optional*, defaults to 1):
                The number of images to generate per prompt.
            eta (`float`, *optional*, defaults to 0.0):
                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
                [`schedulers.DDIMScheduler`], will be ignored for others.
-            generator (`torch.Generator`, *optional*):
-                A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
-                deterministic.
+            generator (`np.random.RandomState`, *optional*):
+                A np.random.RandomState to make generation deterministic.
            latents (`np.ndarray`, *optional*):
                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
@@ -660,142 +496,213 @@ class OnnxStableDiffusionLongPromptWeightingPipeline(OnnxStableDiffusionPipeline
            callback (`Callable`, *optional*):
                A function that will be called every `callback_steps` steps during inference. The function will be
                called with the following arguments: `callback(step: int, timestep: int, latents: np.ndarray)`.
-            is_cancelled_callback (`Callable`, *optional*):
-                A function that will be called every `callback_steps` steps during inference. If the function returns
-                `True`, the inference will be cancelled.
            callback_steps (`int`, *optional*, defaults to 1):
                The frequency at which the `callback` function will be called. If not specified, the callback will be
                called at every step.

        Returns:
-            `None` if cancelled by `is_cancelled_callback`,
            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
            When returning a tuple, the first element is a list with the generated images, and the second element is a
            list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
            (nsfw) content, according to the `safety_checker`.
        """
-        message = "Please use `image` instead of `init_image`."
-        init_image = deprecate("init_image", "0.12.0", message, take_from=kwargs)
-        image = init_image or image

-        # 0. Default height and width to unet
-        height = height or self.unet.config.sample_size * self.vae_scale_factor
-        width = width or self.unet.config.sample_size * self.vae_scale_factor
+        if isinstance(prompt, str):
+            batch_size = 1
+            prompt = [prompt]
+        elif isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")

-        # 1. Check inputs. Raise error if not correct
-        self.check_inputs(prompt, height, width, strength, callback_steps)
+        if strength < 0 or strength > 1:
+            raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}")
+
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if (callback_steps is None) or (
+            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+        ):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+        # get prompt text embeddings

-        # 2. Define call parameters
-        batch_size = 1 if isinstance(prompt, str) else len(prompt)
        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
        # corresponds to doing no classifier free guidance.
        do_classifier_free_guidance = guidance_scale > 1.0
+        # get unconditional embeddings for classifier free guidance
+        if negative_prompt is None:
+            negative_prompt = [""] * batch_size
+        elif isinstance(negative_prompt, str):
+            negative_prompt = [negative_prompt] * batch_size
+        if batch_size != len(negative_prompt):
+            raise ValueError(
+                f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                " the batch size of `prompt`."
+            )

-        # 3. Encode input prompt
-        text_embeddings = self._encode_prompt(
-            prompt,
-            num_images_per_prompt,
-            do_classifier_free_guidance,
-            negative_prompt,
-            max_embeddings_multiples,
+        if generator is None:
+            generator = np.random
+
+        text_embeddings, uncond_embeddings = get_weighted_text_embeddings(
+            pipe=self,
+            prompt=prompt,
+            uncond_prompt=negative_prompt if do_classifier_free_guidance else None,
+            max_embeddings_multiples=max_embeddings_multiples,
+            **kwargs,
        )
-        dtype = text_embeddings.dtype

-        # 4. Preprocess image and mask
-        if isinstance(image, PIL.Image.Image):
-            image = preprocess_image(image)
-        if image is not None:
-            image = image.astype(dtype)
-        if isinstance(mask_image, PIL.Image.Image):
-            mask_image = preprocess_mask(mask_image, self.vae_scale_factor)
-        if mask_image is not None:
-            mask = mask_image.astype(dtype)
-            mask = np.concatenate([mask] * batch_size * num_images_per_prompt)
-        else:
-            mask = None
+        text_embeddings = text_embeddings.repeat(num_images_per_prompt, 0)
+        if do_classifier_free_guidance:
+            uncond_embeddings = uncond_embeddings.repeat(num_images_per_prompt, 0)
+            text_embeddings = np.concatenate([uncond_embeddings, text_embeddings])

-        # 5. set timesteps
+        # set timesteps
        self.scheduler.set_timesteps(num_inference_steps)
-        timestep_dtype = next(
-            (input.type for input in self.unet.model.get_inputs() if input.name == "timestep"), "tensor(float)"
-        )
-        timestep_dtype = ORT_TO_NP_TYPE[timestep_dtype]
-        timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, image is None)
-        latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)

-        # 6. Prepare latent variables
-        latents, init_latents_orig, noise = self.prepare_latents(
-            image,
-            latent_timestep,
-            batch_size * num_images_per_prompt,
-            height,
-            width,
-            dtype,
-            generator,
-            latents,
-        )
+        latents_dtype = text_embeddings.dtype
+        init_latents_orig = None
+        mask = None
+        noise = None

-        # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
-        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+        if init_image is None:
+            latents_shape = (
+                batch_size * num_images_per_prompt,
+                4,
+                height // 8,
+                width // 8,
+            )

-        # 8. Denoising loop
-        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
-        with self.progress_bar(total=num_inference_steps) as progress_bar:
-            for i, t in enumerate(timesteps):
-                # expand the latents if we are doing classifier free guidance
-                latent_model_input = np.concatenate([latents] * 2) if do_classifier_free_guidance else latents
-                latent_model_input = self.scheduler.scale_model_input(torch.from_numpy(latent_model_input), t)
-                latent_model_input = latent_model_input.numpy()
+            if latents is None:
+                latents = generator.randn(*latents_shape).astype(latents_dtype)
+            elif latents.shape != latents_shape:
+                raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}")

-                # predict the noise residual
-                noise_pred = self.unet(
-                    sample=latent_model_input,
-                    timestep=np.array([t], dtype=timestep_dtype),
-                    encoder_hidden_states=text_embeddings,
+            timesteps = self.scheduler.timesteps.to(self.device)
+
+            # scale the initial noise by the standard deviation required by the scheduler
+            latents = latents * self.scheduler.init_noise_sigma
+        else:
+            if isinstance(init_image, PIL.Image.Image):
+                init_image = preprocess_image(init_image)
+            # encode the init image into latents and scale the latents
+            init_image = init_image.astype(latents_dtype)
+            init_latents = self.vae_encoder(sample=init_image)[0]
+            init_latents = 0.18215 * init_latents
+            init_latents = np.concatenate([init_latents] * batch_size * num_images_per_prompt)
+            init_latents_orig = init_latents
+
+            # preprocess mask
+            if mask_image is not None:
+                if isinstance(mask_image, PIL.Image.Image):
+                    mask_image = preprocess_mask(mask_image)
+                mask_image = mask_image.astype(latents_dtype)
+                mask = np.concatenate([mask_image] * batch_size * num_images_per_prompt)
+
+                # check sizes
+                if not mask.shape == init_latents.shape:
+                    print(mask.shape, init_latents.shape)
+                    raise ValueError("The mask and init_image should be the same size!")
+
+            # get the original timestep using init_timestep
+            offset = self.scheduler.config.get("steps_offset", 0)
+            init_timestep = int(num_inference_steps * strength) + offset
+            init_timestep = min(init_timestep, num_inference_steps)
+
+            timesteps = self.scheduler.timesteps[-init_timestep]
+            timesteps = torch.tensor([timesteps] * batch_size * num_images_per_prompt)
+
+            # add noise to latents using the timesteps
+            noise = generator.randn(*init_latents.shape).astype(latents_dtype)
+            latents = self.scheduler.add_noise(
+                torch.from_numpy(init_latents), torch.from_numpy(noise), timesteps
+            ).numpy()
+
+            t_start = max(num_inference_steps - init_timestep + offset, 0)
+            timesteps = self.scheduler.timesteps[t_start:]
+
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        for i, t in enumerate(self.progress_bar(timesteps)):
+            # expand the latents if we are doing classifier free guidance
+            latent_model_input = np.concatenate([latents] * 2) if do_classifier_free_guidance else latents
+            latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+            # predict the noise residual
+            noise_pred = self.unet(
+                sample=latent_model_input,
+                timestep=np.array([t]),
+                encoder_hidden_states=text_embeddings,
+            )
+            noise_pred = noise_pred[0]
+
+            # perform guidance
+            if do_classifier_free_guidance:
+                noise_pred_uncond, noise_pred_text = np.split(noise_pred, 2)
+                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+            # compute the previous noisy sample x_t -> x_t-1
+            latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample.numpy()
+
+            if mask is not None:
+                # masking
+                init_latents_proper = self.scheduler.add_noise(
+                    torch.from_numpy(init_latents_orig),
+                    torch.from_numpy(noise),
+                    torch.tensor([t]),
+                ).numpy()
+                latents = (init_latents_proper * mask) + (latents * (1 - mask))
+
+            # call the callback, if provided
+            if callback is not None and i % callback_steps == 0:
+                callback(i, t, latents)
+
+        latents = 1 / 0.18215 * latents
+        # image = self.vae_decoder(latent_sample=latents)[0]
+        # there seems to be a problem with using a half-precision vae decoder if batchsize>1
+        image = []
+        for i in range(latents.shape[0]):
+            image.append(self.vae_decoder(latent_sample=latents[i : i + 1])[0])
+        image = np.concatenate(image)
+
+        image = np.clip(image / 2 + 0.5, 0, 1)
+        image = image.transpose((0, 2, 3, 1))
+
+        if self.safety_checker is not None:
+            safety_checker_input = self.feature_extractor(
+                self.numpy_to_pil(image), return_tensors="np"
+            ).pixel_values.astype(image.dtype)
+            # An error is thrown if safety_checker is used directly with batchsize>1
+            images, has_nsfw_concept = [], []
+            for i in range(image.shape[0]):
+                image_i, has_nsfw_concept_i = self.safety_checker(
+                    clip_input=safety_checker_input[i : i + 1], images=image[i : i + 1]
                )
-                noise_pred = noise_pred[0]
+                images.append(image_i)
+                has_nsfw_concept.append(has_nsfw_concept_i)
+            image = np.concatenate(images)
+        else:
+            has_nsfw_concept = None

-                # perform guidance
-                if do_classifier_free_guidance:
-                    noise_pred_uncond, noise_pred_text = np.split(noise_pred, 2)
-                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
-
-                # compute the previous noisy sample x_t -> x_t-1
-                scheduler_output = self.scheduler.step(
-                    torch.from_numpy(noise_pred), t, torch.from_numpy(latents), **extra_step_kwargs
-                )
-                latents = scheduler_output.prev_sample.numpy()
-
-                if mask is not None:
-                    # masking
-                    init_latents_proper = self.scheduler.add_noise(
-                        torch.from_numpy(init_latents_orig),
-                        torch.from_numpy(noise),
-                        t,
-                    ).numpy()
-                    latents = (init_latents_proper * mask) + (latents * (1 - mask))
-
-                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
-                    progress_bar.update()
-                    if i % callback_steps == 0:
-                        if callback is not None:
-                            callback(i, t, latents)
-                        if is_cancelled_callback is not None and is_cancelled_callback():
-                            return None
-        # 9. Post-processing
-        image = self.decode_latents(latents)
-
-        # 10. Run safety checker
-        image, has_nsfw_concept = self.run_safety_checker(image)
-
-        # 11. Convert to PIL
        if output_type == "pil":
            image = self.numpy_to_pil(image)

        if not return_dict:
-            return image, has_nsfw_concept
+            return (image, has_nsfw_concept)

        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)

@@ -809,7 +716,7 @@ class OnnxStableDiffusionLongPromptWeightingPipeline(OnnxStableDiffusionPipeline
        guidance_scale: float = 7.5,
        num_images_per_prompt: Optional[int] = 1,
        eta: float = 0.0,
-        generator: Optional[torch.Generator] = None,
+        generator: Optional[np.random.RandomState] = None,
        latents: Optional[np.ndarray] = None,
        max_embeddings_multiples: Optional[int] = 3,
        output_type: Optional[str] = "pil",
@@ -844,9 +751,8 @@ class OnnxStableDiffusionLongPromptWeightingPipeline(OnnxStableDiffusionPipeline
            eta (`float`, *optional*, defaults to 0.0):
                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
                [`schedulers.DDIMScheduler`], will be ignored for others.
-            generator (`torch.Generator`, *optional*):
-                A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
-                deterministic.
+            generator (`np.random.RandomState`, *optional*):
+                A np.random.RandomState to make generation deterministic.
            latents (`np.ndarray`, *optional*):
                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
@@ -893,7 +799,7 @@ class OnnxStableDiffusionLongPromptWeightingPipeline(OnnxStableDiffusionPipeline

    def img2img(
        self,
-        image: Union[np.ndarray, PIL.Image.Image],
+        init_image: Union[np.ndarray, PIL.Image.Image],
        prompt: Union[str, List[str]],
        negative_prompt: Optional[Union[str, List[str]]] = None,
        strength: float = 0.8,
@@ -901,7 +807,7 @@ class OnnxStableDiffusionLongPromptWeightingPipeline(OnnxStableDiffusionPipeline
        guidance_scale: Optional[float] = 7.5,
        num_images_per_prompt: Optional[int] = 1,
        eta: Optional[float] = 0.0,
-        generator: Optional[torch.Generator] = None,
+        generator: Optional[np.random.RandomState] = None,
        max_embeddings_multiples: Optional[int] = 3,
        output_type: Optional[str] = "pil",
        return_dict: bool = True,
@@ -912,7 +818,7 @@ class OnnxStableDiffusionLongPromptWeightingPipeline(OnnxStableDiffusionPipeline
        r"""
        Function for image-to-image generation.
        Args:
-            image (`np.ndarray` or `PIL.Image.Image`):
+            init_image (`np.ndarray` or `PIL.Image.Image`):
                `Image`, or ndarray representing an image batch, that will be used as the starting point for the
                process.
            prompt (`str` or `List[str]`):
@@ -921,11 +827,11 @@ class OnnxStableDiffusionLongPromptWeightingPipeline(OnnxStableDiffusionPipeline
                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
                if `guidance_scale` is less than `1`).
            strength (`float`, *optional*, defaults to 0.8):
-                Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1.
-                `image` will be used as a starting point, adding more noise to it the larger the `strength`. The
+                Conceptually, indicates how much to transform the reference `init_image`. Must be between 0 and 1.
+                `init_image` will be used as a starting point, adding more noise to it the larger the `strength`. The
                number of denoising steps depends on the amount of noise initially added. When `strength` is 1, added
                noise will be maximum and the denoising process will run for the full number of iterations specified in
-                `num_inference_steps`. A value of 1, therefore, essentially ignores `image`.
+                `num_inference_steps`. A value of 1, therefore, essentially ignores `init_image`.
            num_inference_steps (`int`, *optional*, defaults to 50):
                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                expense of slower inference. This parameter will be modulated by `strength`.
@@ -940,9 +846,8 @@ class OnnxStableDiffusionLongPromptWeightingPipeline(OnnxStableDiffusionPipeline
            eta (`float`, *optional*, defaults to 0.0):
                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
                [`schedulers.DDIMScheduler`], will be ignored for others.
-            generator (`torch.Generator`, *optional*):
-                A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
-                deterministic.
+            generator (`np.random.RandomState`, *optional*):
+                A np.random.RandomState to make generation deterministic.
            max_embeddings_multiples (`int`, *optional*, defaults to `3`):
                The max multiple length of prompt embeddings compared to the max output length of text encoder.
            output_type (`str`, *optional*, defaults to `"pil"`):
@@ -967,7 +872,7 @@ class OnnxStableDiffusionLongPromptWeightingPipeline(OnnxStableDiffusionPipeline
        return self.__call__(
            prompt=prompt,
            negative_prompt=negative_prompt,
-            image=image,
+            init_image=init_image,
            num_inference_steps=num_inference_steps,
            guidance_scale=guidance_scale,
            strength=strength,
@@ -984,7 +889,7 @@ class OnnxStableDiffusionLongPromptWeightingPipeline(OnnxStableDiffusionPipeline

    def inpaint(
        self,
-        image: Union[np.ndarray, PIL.Image.Image],
+        init_image: Union[np.ndarray, PIL.Image.Image],
        mask_image: Union[np.ndarray, PIL.Image.Image],
        prompt: Union[str, List[str]],
        negative_prompt: Optional[Union[str, List[str]]] = None,
@@ -993,7 +898,7 @@ class OnnxStableDiffusionLongPromptWeightingPipeline(OnnxStableDiffusionPipeline
        guidance_scale: Optional[float] = 7.5,
        num_images_per_prompt: Optional[int] = 1,
        eta: Optional[float] = 0.0,
-        generator: Optional[torch.Generator] = None,
+        generator: Optional[np.random.RandomState] = None,
        max_embeddings_multiples: Optional[int] = 3,
        output_type: Optional[str] = "pil",
        return_dict: bool = True,
@@ -1004,11 +909,11 @@ class OnnxStableDiffusionLongPromptWeightingPipeline(OnnxStableDiffusionPipeline
        r"""
        Function for inpaint.
        Args:
-            image (`np.ndarray` or `PIL.Image.Image`):
+            init_image (`np.ndarray` or `PIL.Image.Image`):
                `Image`, or tensor representing an image batch, that will be used as the starting point for the
                process. This is the image whose masked region will be inpainted.
            mask_image (`np.ndarray` or `PIL.Image.Image`):
-                `Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be
+                `Image`, or tensor representing an image batch, to mask `init_image`. White pixels in the mask will be
                replaced by noise and therefore repainted, while black pixels will be preserved. If `mask_image` is a
                PIL image, it will be converted to a single channel (luminance) before use. If it's a tensor, it should
                contain one color channel (L) instead of 3, so the expected shape would be `(B, H, W, 1)`.
@@ -1020,7 +925,7 @@ class OnnxStableDiffusionLongPromptWeightingPipeline(OnnxStableDiffusionPipeline
            strength (`float`, *optional*, defaults to 0.8):
                Conceptually, indicates how much to inpaint the masked area. Must be between 0 and 1. When `strength`
                is 1, the denoising process will be run on the masked area for the full number of iterations specified
-                in `num_inference_steps`. `image` will be used as a reference for the masked area, adding more
+                in `num_inference_steps`. `init_image` will be used as a reference for the masked area, adding more
                noise to that region the larger the `strength`. If `strength` is 0, no inpainting will occur.
            num_inference_steps (`int`, *optional*, defaults to 50):
                The reference number of denoising steps. More denoising steps usually lead to a higher quality image at
@@ -1036,9 +941,8 @@ class OnnxStableDiffusionLongPromptWeightingPipeline(OnnxStableDiffusionPipeline
            eta (`float`, *optional*, defaults to 0.0):
                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
                [`schedulers.DDIMScheduler`], will be ignored for others.
-            generator (`torch.Generator`, *optional*):
-                A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
-                deterministic.
+            generator (`np.random.RandomState`, *optional*):
+                A np.random.RandomState to make generation deterministic.
            max_embeddings_multiples (`int`, *optional*, defaults to `3`):
                The max multiple length of prompt embeddings compared to the max output length of text encoder.
            output_type (`str`, *optional*, defaults to `"pil"`):
@@ -1063,7 +967,7 @@ class OnnxStableDiffusionLongPromptWeightingPipeline(OnnxStableDiffusionPipeline
        return self.__call__(
            prompt=prompt,
            negative_prompt=negative_prompt,
-            image=image,
+            init_image=init_image,
            mask_image=mask_image,
            num_inference_steps=num_inference_steps,
            guidance_scale=guidance_scale,
--- a/examples/community/multilingual_stable_diffusion.py
+++ b/examples/community/multilingual_stable_diffusion.py
@@ -1,436 +0,0 @@
-import inspect
-from typing import Callable, List, Optional, Union
-
-import torch
-
-from diffusers.configuration_utils import FrozenDict
-from diffusers.models import AutoencoderKL, UNet2DConditionModel
-from diffusers.pipeline_utils import DiffusionPipeline
-from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
-from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
-from diffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
-from diffusers.utils import deprecate, logging
-from transformers import (
-    CLIPFeatureExtractor,
-    CLIPTextModel,
-    CLIPTokenizer,
-    MBart50TokenizerFast,
-    MBartForConditionalGeneration,
-    pipeline,
-)
-
-
-logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
-
-
-def detect_language(pipe, prompt, batch_size):
-    """helper function to detect language(s) of prompt"""
-
-    if batch_size == 1:
-        preds = pipe(prompt, top_k=1, truncation=True, max_length=128)
-        return preds[0]["label"]
-    else:
-        detected_languages = []
-        for p in prompt:
-            preds = pipe(p, top_k=1, truncation=True, max_length=128)
-            detected_languages.append(preds[0]["label"])
-
-        return detected_languages
-
-
-def translate_prompt(prompt, translation_tokenizer, translation_model, device):
-    """helper function to translate prompt to English"""
-
-    encoded_prompt = translation_tokenizer(prompt, return_tensors="pt").to(device)
-    generated_tokens = translation_model.generate(**encoded_prompt, max_new_tokens=1000)
-    en_trans = translation_tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
-
-    return en_trans[0]
-
-
-class MultilingualStableDiffusion(DiffusionPipeline):
-    r"""
-    Pipeline for text-to-image generation using Stable Diffusion in different languages.
-
-    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
-    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
-
-    Args:
-        detection_pipeline ([`pipeline`]):
-            Transformers pipeline to detect prompt's language.
-        translation_model ([`MBartForConditionalGeneration`]):
-            Model to translate prompt to English, if necessary. Please refer to the
-            [model card](https://huggingface.co/docs/transformers/model_doc/mbart) for details.
-        translation_tokenizer ([`MBart50TokenizerFast`]):
-            Tokenizer of the translation model.
-        vae ([`AutoencoderKL`]):
-            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
-        text_encoder ([`CLIPTextModel`]):
-            Frozen text-encoder. Stable Diffusion uses the text portion of
-            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
-            the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
-        tokenizer (`CLIPTokenizer`):
-            Tokenizer of class
-            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
-        unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
-        scheduler ([`SchedulerMixin`]):
-            A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of
-            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
-        safety_checker ([`StableDiffusionSafetyChecker`]):
-            Classification module that estimates whether generated images could be considered offensive or harmful.
-            Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details.
-        feature_extractor ([`CLIPFeatureExtractor`]):
-            Model that extracts features from generated images to be used as inputs for the `safety_checker`.
-    """
-
-    def __init__(
-        self,
-        detection_pipeline: pipeline,
-        translation_model: MBartForConditionalGeneration,
-        translation_tokenizer: MBart50TokenizerFast,
-        vae: AutoencoderKL,
-        text_encoder: CLIPTextModel,
-        tokenizer: CLIPTokenizer,
-        unet: UNet2DConditionModel,
-        scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler],
-        safety_checker: StableDiffusionSafetyChecker,
-        feature_extractor: CLIPFeatureExtractor,
-    ):
-        super().__init__()
-
-        if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1:
-            deprecation_message = (
-                f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
-                f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
-                "to update the config accordingly as leaving `steps_offset` might led to incorrect results"
-                " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
-                " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
-                " file"
-            )
-            deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False)
-            new_config = dict(scheduler.config)
-            new_config["steps_offset"] = 1
-            scheduler._internal_dict = FrozenDict(new_config)
-
-        if safety_checker is None:
-            logger.warning(
-                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
-                " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
-                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
-                " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
-                " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
-                " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
-            )
-
-        self.register_modules(
-            detection_pipeline=detection_pipeline,
-            translation_model=translation_model,
-            translation_tokenizer=translation_tokenizer,
-            vae=vae,
-            text_encoder=text_encoder,
-            tokenizer=tokenizer,
-            unet=unet,
-            scheduler=scheduler,
-            safety_checker=safety_checker,
-            feature_extractor=feature_extractor,
-        )
-
-    def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"):
-        r"""
-        Enable sliced attention computation.
-
-        When this option is enabled, the attention module will split the input tensor in slices, to compute attention
-        in several steps. This is useful to save some memory in exchange for a small speed decrease.
-
-        Args:
-            slice_size (`str` or `int`, *optional*, defaults to `"auto"`):
-                When `"auto"`, halves the input to the attention heads, so attention will be computed in two steps. If
-                a number is provided, uses as many slices as `attention_head_dim // slice_size`. In this case,
-                `attention_head_dim` must be a multiple of `slice_size`.
-        """
-        if slice_size == "auto":
-            # half the attention head size is usually a good trade-off between
-            # speed and memory
-            slice_size = self.unet.config.attention_head_dim // 2
-        self.unet.set_attention_slice(slice_size)
-
-    def disable_attention_slicing(self):
-        r"""
-        Disable sliced attention computation. If `enable_attention_slicing` was previously invoked, this method will go
-        back to computing attention in one step.
-        """
-        # set slice_size = `None` to disable `attention slicing`
-        self.enable_attention_slicing(None)
-
-    @torch.no_grad()
-    def __call__(
-        self,
-        prompt: Union[str, List[str]],
-        height: int = 512,
-        width: int = 512,
-        num_inference_steps: int = 50,
-        guidance_scale: float = 7.5,
-        negative_prompt: Optional[Union[str, List[str]]] = None,
-        num_images_per_prompt: Optional[int] = 1,
-        eta: float = 0.0,
-        generator: Optional[torch.Generator] = None,
-        latents: Optional[torch.FloatTensor] = None,
-        output_type: Optional[str] = "pil",
-        return_dict: bool = True,
-        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
-        callback_steps: Optional[int] = 1,
-        **kwargs,
-    ):
-        r"""
-        Function invoked when calling the pipeline for generation.
-
-        Args:
-            prompt (`str` or `List[str]`):
-                The prompt or prompts to guide the image generation. Can be in different languages.
-            height (`int`, *optional*, defaults to 512):
-                The height in pixels of the generated image.
-            width (`int`, *optional*, defaults to 512):
-                The width in pixels of the generated image.
-            num_inference_steps (`int`, *optional*, defaults to 50):
-                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
-                expense of slower inference.
-            guidance_scale (`float`, *optional*, defaults to 7.5):
-                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
-                `guidance_scale` is defined as `w` of equation 2. of [Imagen
-                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
-                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
-                usually at the expense of lower image quality.
-            negative_prompt (`str` or `List[str]`, *optional*):
-                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
-                if `guidance_scale` is less than `1`).
-            num_images_per_prompt (`int`, *optional*, defaults to 1):
-                The number of images to generate per prompt.
-            eta (`float`, *optional*, defaults to 0.0):
-                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
-                [`schedulers.DDIMScheduler`], will be ignored for others.
-            generator (`torch.Generator`, *optional*):
-                A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
-                deterministic.
-            latents (`torch.FloatTensor`, *optional*):
-                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
-                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
-                tensor will ge generated by sampling using the supplied random `generator`.
-            output_type (`str`, *optional*, defaults to `"pil"`):
-                The output format of the generate image. Choose between
-                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
-            return_dict (`bool`, *optional*, defaults to `True`):
-                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
-                plain tuple.
-            callback (`Callable`, *optional*):
-                A function that will be called every `callback_steps` steps during inference. The function will be
-                called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
-            callback_steps (`int`, *optional*, defaults to 1):
-                The frequency at which the `callback` function will be called. If not specified, the callback will be
-                called at every step.
-
-        Returns:
-            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
-            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
-            When returning a tuple, the first element is a list with the generated images, and the second element is a
-            list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
-            (nsfw) content, according to the `safety_checker`.
-        """
-        if isinstance(prompt, str):
-            batch_size = 1
-        elif isinstance(prompt, list):
-            batch_size = len(prompt)
-        else:
-            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
-
-        if height % 8 != 0 or width % 8 != 0:
-            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
-
-        if (callback_steps is None) or (
-            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
-        ):
-            raise ValueError(
-                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
-                f" {type(callback_steps)}."
-            )
-
-        # detect language and translate if necessary
-        prompt_language = detect_language(self.detection_pipeline, prompt, batch_size)
-        if batch_size == 1 and prompt_language != "en":
-            prompt = translate_prompt(prompt, self.translation_tokenizer, self.translation_model, self.device)
-
-        if isinstance(prompt, list):
-            for index in range(batch_size):
-                if prompt_language[index] != "en":
-                    p = translate_prompt(
-                        prompt[index], self.translation_tokenizer, self.translation_model, self.device
-                    )
-                    prompt[index] = p
-
-        # get prompt text embeddings
-        text_inputs = self.tokenizer(
-            prompt,
-            padding="max_length",
-            max_length=self.tokenizer.model_max_length,
-            return_tensors="pt",
-        )
-        text_input_ids = text_inputs.input_ids
-
-        if text_input_ids.shape[-1] > self.tokenizer.model_max_length:
-            removed_text = self.tokenizer.batch_decode(text_input_ids[:, self.tokenizer.model_max_length :])
-            logger.warning(
-                "The following part of your input was truncated because CLIP can only handle sequences up to"
-                f" {self.tokenizer.model_max_length} tokens: {removed_text}"
-            )
-            text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length]
-        text_embeddings = self.text_encoder(text_input_ids.to(self.device))[0]
-
-        # duplicate text embeddings for each generation per prompt, using mps friendly method
-        bs_embed, seq_len, _ = text_embeddings.shape
-        text_embeddings = text_embeddings.repeat(1, num_images_per_prompt, 1)
-        text_embeddings = text_embeddings.view(bs_embed * num_images_per_prompt, seq_len, -1)
-
-        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
-        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
-        # corresponds to doing no classifier free guidance.
-        do_classifier_free_guidance = guidance_scale > 1.0
-        # get unconditional embeddings for classifier free guidance
-        if do_classifier_free_guidance:
-            uncond_tokens: List[str]
-            if negative_prompt is None:
-                uncond_tokens = [""] * batch_size
-            elif type(prompt) is not type(negative_prompt):
-                raise TypeError(
-                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
-                    f" {type(prompt)}."
-                )
-            elif isinstance(negative_prompt, str):
-                # detect language and translate it if necessary
-                negative_prompt_language = detect_language(self.detection_pipeline, negative_prompt, batch_size)
-                if negative_prompt_language != "en":
-                    negative_prompt = translate_prompt(
-                        negative_prompt, self.translation_tokenizer, self.translation_model, self.device
-                    )
-                if isinstance(negative_prompt, str):
-                    uncond_tokens = [negative_prompt]
-            elif batch_size != len(negative_prompt):
-                raise ValueError(
-                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
-                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
-                    " the batch size of `prompt`."
-                )
-            else:
-                # detect language and translate it if necessary
-                if isinstance(negative_prompt, list):
-                    negative_prompt_languages = detect_language(self.detection_pipeline, negative_prompt, batch_size)
-                    for index in range(batch_size):
-                        if negative_prompt_languages[index] != "en":
-                            p = translate_prompt(
-                                negative_prompt[index], self.translation_tokenizer, self.translation_model, self.device
-                            )
-                            negative_prompt[index] = p
-                uncond_tokens = negative_prompt
-
-            max_length = text_input_ids.shape[-1]
-            uncond_input = self.tokenizer(
-                uncond_tokens,
-                padding="max_length",
-                max_length=max_length,
-                truncation=True,
-                return_tensors="pt",
-            )
-            uncond_embeddings = self.text_encoder(uncond_input.input_ids.to(self.device))[0]
-
-            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
-            seq_len = uncond_embeddings.shape[1]
-            uncond_embeddings = uncond_embeddings.repeat(1, num_images_per_prompt, 1)
-            uncond_embeddings = uncond_embeddings.view(batch_size * num_images_per_prompt, seq_len, -1)
-
-            # For classifier free guidance, we need to do two forward passes.
-            # Here we concatenate the unconditional and text embeddings into a single batch
-            # to avoid doing two forward passes
-            text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
-
-        # get the initial random noise unless the user supplied it
-
-        # Unlike in other pipelines, latents need to be generated in the target device
-        # for 1-to-1 results reproducibility with the CompVis implementation.
-        # However this currently doesn't work in `mps`.
-        latents_shape = (batch_size * num_images_per_prompt, self.unet.in_channels, height // 8, width // 8)
-        latents_dtype = text_embeddings.dtype
-        if latents is None:
-            if self.device.type == "mps":
-                # randn does not work reproducibly on mps
-                latents = torch.randn(latents_shape, generator=generator, device="cpu", dtype=latents_dtype).to(
-                    self.device
-                )
-            else:
-                latents = torch.randn(latents_shape, generator=generator, device=self.device, dtype=latents_dtype)
-        else:
-            if latents.shape != latents_shape:
-                raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}")
-            latents = latents.to(self.device)
-
-        # set timesteps
-        self.scheduler.set_timesteps(num_inference_steps)
-
-        # Some schedulers like PNDM have timesteps as arrays
-        # It's more optimized to move all timesteps to correct device beforehand
-        timesteps_tensor = self.scheduler.timesteps.to(self.device)
-
-        # scale the initial noise by the standard deviation required by the scheduler
-        latents = latents * self.scheduler.init_noise_sigma
-
-        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
-        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
-        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
-        # and should be between [0, 1]
-        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
-        extra_step_kwargs = {}
-        if accepts_eta:
-            extra_step_kwargs["eta"] = eta
-
-        for i, t in enumerate(self.progress_bar(timesteps_tensor)):
-            # expand the latents if we are doing classifier free guidance
-            latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
-            latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
-
-            # predict the noise residual
-            noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample
-
-            # perform guidance
-            if do_classifier_free_guidance:
-                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
-                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
-
-            # compute the previous noisy sample x_t -> x_t-1
-            latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
-
-            # call the callback, if provided
-            if callback is not None and i % callback_steps == 0:
-                callback(i, t, latents)
-
-        latents = 1 / 0.18215 * latents
-        image = self.vae.decode(latents).sample
-
-        image = (image / 2 + 0.5).clamp(0, 1)
-
-        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloa16
-        image = image.cpu().permute(0, 2, 3, 1).float().numpy()
-
-        if self.safety_checker is not None:
-            safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(
-                self.device
-            )
-            image, has_nsfw_concept = self.safety_checker(
-                images=image, clip_input=safety_checker_input.pixel_values.to(text_embeddings.dtype)
-            )
-        else:
-            has_nsfw_concept = None
-
-        if output_type == "pil":
-            image = self.numpy_to_pil(image)
-
-        if not return_dict:
-            return (image, has_nsfw_concept)
-
-        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
--- a/examples/community/sd_text2img_k_diffusion.py
+++ b/examples/community/sd_text2img_k_diffusion.py
@@ -1,476 +0,0 @@
-# Copyright 2022 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import importlib
-import warnings
-from typing import Callable, List, Optional, Union
-
-import torch
-
-from diffusers import LMSDiscreteScheduler
-from diffusers.pipeline_utils import DiffusionPipeline
-from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
-from diffusers.utils import is_accelerate_available, logging
-from k_diffusion.external import CompVisDenoiser, CompVisVDenoiser
-
-
-logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
-
-
-class ModelWrapper:
-    def __init__(self, model, alphas_cumprod):
-        self.model = model
-        self.alphas_cumprod = alphas_cumprod
-
-    def apply_model(self, *args, **kwargs):
-        if len(args) == 3:
-            encoder_hidden_states = args[-1]
-            args = args[:2]
-        if kwargs.get("cond", None) is not None:
-            encoder_hidden_states = kwargs.pop("cond")
-        return self.model(*args, encoder_hidden_states=encoder_hidden_states, **kwargs).sample
-
-
-class StableDiffusionPipeline(DiffusionPipeline):
-    r"""
-    Pipeline for text-to-image generation using Stable Diffusion.
-
-    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
-    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
-
-    Args:
-        vae ([`AutoencoderKL`]):
-            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
-        text_encoder ([`CLIPTextModel`]):
-            Frozen text-encoder. Stable Diffusion uses the text portion of
-            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
-            the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
-        tokenizer (`CLIPTokenizer`):
-            Tokenizer of class
-            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
-        unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
-        scheduler ([`SchedulerMixin`]):
-            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
-            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
-        safety_checker ([`StableDiffusionSafetyChecker`]):
-            Classification module that estimates whether generated images could be considered offensive or harmful.
-            Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details.
-        feature_extractor ([`CLIPFeatureExtractor`]):
-            Model that extracts features from generated images to be used as inputs for the `safety_checker`.
-    """
-    _optional_components = ["safety_checker", "feature_extractor"]
-
-    def __init__(
-        self,
-        vae,
-        text_encoder,
-        tokenizer,
-        unet,
-        scheduler,
-        safety_checker,
-        feature_extractor,
-    ):
-        super().__init__()
-
-        if safety_checker is None:
-            logger.warning(
-                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
-                " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
-                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
-                " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
-                " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
-                " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
-            )
-
-        # get correct sigmas from LMS
-        scheduler = LMSDiscreteScheduler.from_config(scheduler.config)
-        self.register_modules(
-            vae=vae,
-            text_encoder=text_encoder,
-            tokenizer=tokenizer,
-            unet=unet,
-            scheduler=scheduler,
-            safety_checker=safety_checker,
-            feature_extractor=feature_extractor,
-        )
-
-        model = ModelWrapper(unet, scheduler.alphas_cumprod)
-        if scheduler.prediction_type == "v_prediction":
-            self.k_diffusion_model = CompVisVDenoiser(model)
-        else:
-            self.k_diffusion_model = CompVisDenoiser(model)
-
-    def set_sampler(self, scheduler_type: str):
-        warnings.warn("The `set_sampler` method is deprecated, please use `set_scheduler` instead.")
-        return self.set_scheduler(scheduler_type)
-
-    def set_scheduler(self, scheduler_type: str):
-        library = importlib.import_module("k_diffusion")
-        sampling = getattr(library, "sampling")
-        self.sampler = getattr(sampling, scheduler_type)
-
-    def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"):
-        r"""
-        Enable sliced attention computation.
-
-        When this option is enabled, the attention module will split the input tensor in slices, to compute attention
-        in several steps. This is useful to save some memory in exchange for a small speed decrease.
-
-        Args:
-            slice_size (`str` or `int`, *optional*, defaults to `"auto"`):
-                When `"auto"`, halves the input to the attention heads, so attention will be computed in two steps. If
-                a number is provided, uses as many slices as `attention_head_dim // slice_size`. In this case,
-                `attention_head_dim` must be a multiple of `slice_size`.
-        """
-        if slice_size == "auto":
-            # half the attention head size is usually a good trade-off between
-            # speed and memory
-            slice_size = self.unet.config.attention_head_dim // 2
-        self.unet.set_attention_slice(slice_size)
-
-    def disable_attention_slicing(self):
-        r"""
-        Disable sliced attention computation. If `enable_attention_slicing` was previously invoked, this method will go
-        back to computing attention in one step.
-        """
-        # set slice_size = `None` to disable `attention slicing`
-        self.enable_attention_slicing(None)
-
-    def enable_sequential_cpu_offload(self, gpu_id=0):
-        r"""
-        Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet,
-        text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a
-        `torch.device('meta') and loaded to GPU only when their specific submodule has its `forward` method called.
-        """
-        if is_accelerate_available():
-            from accelerate import cpu_offload
-        else:
-            raise ImportError("Please install accelerate via `pip install accelerate`")
-
-        device = torch.device(f"cuda:{gpu_id}")
-
-        for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae, self.safety_checker]:
-            if cpu_offloaded_model is not None:
-                cpu_offload(cpu_offloaded_model, device)
-
-    @property
-    def _execution_device(self):
-        r"""
-        Returns the device on which the pipeline's models will be executed. After calling
-        `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module
-        hooks.
-        """
-        if self.device != torch.device("meta") or not hasattr(self.unet, "_hf_hook"):
-            return self.device
-        for module in self.unet.modules():
-            if (
-                hasattr(module, "_hf_hook")
-                and hasattr(module._hf_hook, "execution_device")
-                and module._hf_hook.execution_device is not None
-            ):
-                return torch.device(module._hf_hook.execution_device)
-        return self.device
-
-    def _encode_prompt(self, prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt):
-        r"""
-        Encodes the prompt into text encoder hidden states.
-
-        Args:
-            prompt (`str` or `list(int)`):
-                prompt to be encoded
-            device: (`torch.device`):
-                torch device
-            num_images_per_prompt (`int`):
-                number of images that should be generated per prompt
-            do_classifier_free_guidance (`bool`):
-                whether to use classifier free guidance or not
-            negative_prompt (`str` or `List[str]`):
-                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
-                if `guidance_scale` is less than `1`).
-        """
-        batch_size = len(prompt) if isinstance(prompt, list) else 1
-
-        text_inputs = self.tokenizer(
-            prompt,
-            padding="max_length",
-            max_length=self.tokenizer.model_max_length,
-            truncation=True,
-            return_tensors="pt",
-        )
-        text_input_ids = text_inputs.input_ids
-        untruncated_ids = self.tokenizer(prompt, padding="max_length", return_tensors="pt").input_ids
-
-        if not torch.equal(text_input_ids, untruncated_ids):
-            removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1])
-            logger.warning(
-                "The following part of your input was truncated because CLIP can only handle sequences up to"
-                f" {self.tokenizer.model_max_length} tokens: {removed_text}"
-            )
-
-        if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
-            attention_mask = text_inputs.attention_mask.to(device)
-        else:
-            attention_mask = None
-
-        text_embeddings = self.text_encoder(
-            text_input_ids.to(device),
-            attention_mask=attention_mask,
-        )
-        text_embeddings = text_embeddings[0]
-
-        # duplicate text embeddings for each generation per prompt, using mps friendly method
-        bs_embed, seq_len, _ = text_embeddings.shape
-        text_embeddings = text_embeddings.repeat(1, num_images_per_prompt, 1)
-        text_embeddings = text_embeddings.view(bs_embed * num_images_per_prompt, seq_len, -1)
-
-        # get unconditional embeddings for classifier free guidance
-        if do_classifier_free_guidance:
-            uncond_tokens: List[str]
-            if negative_prompt is None:
-                uncond_tokens = [""] * batch_size
-            elif type(prompt) is not type(negative_prompt):
-                raise TypeError(
-                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
-                    f" {type(prompt)}."
-                )
-            elif isinstance(negative_prompt, str):
-                uncond_tokens = [negative_prompt]
-            elif batch_size != len(negative_prompt):
-                raise ValueError(
-                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
-                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
-                    " the batch size of `prompt`."
-                )
-            else:
-                uncond_tokens = negative_prompt
-
-            max_length = text_input_ids.shape[-1]
-            uncond_input = self.tokenizer(
-                uncond_tokens,
-                padding="max_length",
-                max_length=max_length,
-                truncation=True,
-                return_tensors="pt",
-            )
-
-            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
-                attention_mask = uncond_input.attention_mask.to(device)
-            else:
-                attention_mask = None
-
-            uncond_embeddings = self.text_encoder(
-                uncond_input.input_ids.to(device),
-                attention_mask=attention_mask,
-            )
-            uncond_embeddings = uncond_embeddings[0]
-
-            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
-            seq_len = uncond_embeddings.shape[1]
-            uncond_embeddings = uncond_embeddings.repeat(1, num_images_per_prompt, 1)
-            uncond_embeddings = uncond_embeddings.view(batch_size * num_images_per_prompt, seq_len, -1)
-
-            # For classifier free guidance, we need to do two forward passes.
-            # Here we concatenate the unconditional and text embeddings into a single batch
-            # to avoid doing two forward passes
-            text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
-
-        return text_embeddings
-
-    def run_safety_checker(self, image, device, dtype):
-        if self.safety_checker is not None:
-            safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(device)
-            image, has_nsfw_concept = self.safety_checker(
-                images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
-            )
-        else:
-            has_nsfw_concept = None
-        return image, has_nsfw_concept
-
-    def decode_latents(self, latents):
-        latents = 1 / 0.18215 * latents
-        image = self.vae.decode(latents).sample
-        image = (image / 2 + 0.5).clamp(0, 1)
-        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloa16
-        image = image.cpu().permute(0, 2, 3, 1).float().numpy()
-        return image
-
-    def check_inputs(self, prompt, height, width, callback_steps):
-        if not isinstance(prompt, str) and not isinstance(prompt, list):
-            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
-
-        if height % 8 != 0 or width % 8 != 0:
-            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
-
-        if (callback_steps is None) or (
-            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
-        ):
-            raise ValueError(
-                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
-                f" {type(callback_steps)}."
-            )
-
-    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
-        shape = (batch_size, num_channels_latents, height // 8, width // 8)
-        if latents is None:
-            if device.type == "mps":
-                # randn does not work reproducibly on mps
-                latents = torch.randn(shape, generator=generator, device="cpu", dtype=dtype).to(device)
-            else:
-                latents = torch.randn(shape, generator=generator, device=device, dtype=dtype)
-        else:
-            if latents.shape != shape:
-                raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}")
-            latents = latents.to(device)
-
-        # scale the initial noise by the standard deviation required by the scheduler
-        return latents
-
-    @torch.no_grad()
-    def __call__(
-        self,
-        prompt: Union[str, List[str]],
-        height: int = 512,
-        width: int = 512,
-        num_inference_steps: int = 50,
-        guidance_scale: float = 7.5,
-        negative_prompt: Optional[Union[str, List[str]]] = None,
-        num_images_per_prompt: Optional[int] = 1,
-        eta: float = 0.0,
-        generator: Optional[torch.Generator] = None,
-        latents: Optional[torch.FloatTensor] = None,
-        output_type: Optional[str] = "pil",
-        return_dict: bool = True,
-        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
-        callback_steps: Optional[int] = 1,
-        **kwargs,
-    ):
-        r"""
-        Function invoked when calling the pipeline for generation.
-
-        Args:
-            prompt (`str` or `List[str]`):
-                The prompt or prompts to guide the image generation.
-            height (`int`, *optional*, defaults to 512):
-                The height in pixels of the generated image.
-            width (`int`, *optional*, defaults to 512):
-                The width in pixels of the generated image.
-            num_inference_steps (`int`, *optional*, defaults to 50):
-                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
-                expense of slower inference.
-            guidance_scale (`float`, *optional*, defaults to 7.5):
-                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
-                `guidance_scale` is defined as `w` of equation 2. of [Imagen
-                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
-                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
-                usually at the expense of lower image quality.
-            negative_prompt (`str` or `List[str]`, *optional*):
-                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
-                if `guidance_scale` is less than `1`).
-            num_images_per_prompt (`int`, *optional*, defaults to 1):
-                The number of images to generate per prompt.
-            eta (`float`, *optional*, defaults to 0.0):
-                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
-                [`schedulers.DDIMScheduler`], will be ignored for others.
-            generator (`torch.Generator`, *optional*):
-                A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
-                deterministic.
-            latents (`torch.FloatTensor`, *optional*):
-                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
-                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
-                tensor will ge generated by sampling using the supplied random `generator`.
-            output_type (`str`, *optional*, defaults to `"pil"`):
-                The output format of the generate image. Choose between
-                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
-            return_dict (`bool`, *optional*, defaults to `True`):
-                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
-                plain tuple.
-            callback (`Callable`, *optional*):
-                A function that will be called every `callback_steps` steps during inference. The function will be
-                called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
-            callback_steps (`int`, *optional*, defaults to 1):
-                The frequency at which the `callback` function will be called. If not specified, the callback will be
-                called at every step.
-
-        Returns:
-            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
-            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
-            When returning a tuple, the first element is a list with the generated images, and the second element is a
-            list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
-            (nsfw) content, according to the `safety_checker`.
-        """
-
-        # 1. Check inputs. Raise error if not correct
-        self.check_inputs(prompt, height, width, callback_steps)
-
-        # 2. Define call parameters
-        batch_size = 1 if isinstance(prompt, str) else len(prompt)
-        device = self._execution_device
-        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
-        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
-        # corresponds to doing no classifier free guidance.
-        do_classifier_free_guidance = True
-        if guidance_scale <= 1.0:
-            raise ValueError("has to use guidance_scale")
-
-        # 3. Encode input prompt
-        text_embeddings = self._encode_prompt(
-            prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt
-        )
-
-        # 4. Prepare timesteps
-        self.scheduler.set_timesteps(num_inference_steps, device=text_embeddings.device)
-        sigmas = self.scheduler.sigmas
-        sigmas = sigmas.to(text_embeddings.dtype)
-
-        # 5. Prepare latent variables
-        num_channels_latents = self.unet.in_channels
-        latents = self.prepare_latents(
-            batch_size * num_images_per_prompt,
-            num_channels_latents,
-            height,
-            width,
-            text_embeddings.dtype,
-            device,
-            generator,
-            latents,
-        )
-        latents = latents * sigmas[0]
-        self.k_diffusion_model.sigmas = self.k_diffusion_model.sigmas.to(latents.device)
-        self.k_diffusion_model.log_sigmas = self.k_diffusion_model.log_sigmas.to(latents.device)
-
-        def model_fn(x, t):
-            latent_model_input = torch.cat([x] * 2)
-
-            noise_pred = self.k_diffusion_model(latent_model_input, t, cond=text_embeddings)
-
-            noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
-            noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
-            return noise_pred
-
-        latents = self.sampler(model_fn, latents, sigmas)
-
-        # 8. Post-processing
-        image = self.decode_latents(latents)
-
-        # 9. Run safety checker
-        image, has_nsfw_concept = self.run_safety_checker(image, device, text_embeddings.dtype)
-
-        # 10. Convert to PIL
-        if output_type == "pil":
-            image = self.numpy_to_pil(image)
-
-        if not return_dict:
-            return (image, has_nsfw_concept)
-
-        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
--- a/examples/community/seed_resize_stable_diffusion.py
+++ b/examples/community/seed_resize_stable_diffusion.py
@@ -37,7 +37,7 @@ class SeedResizeStableDiffusionPipeline(DiffusionPipeline):
            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
        unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
        scheduler ([`SchedulerMixin`]):
-            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of
            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
        safety_checker ([`StableDiffusionSafetyChecker`]):
            Classification module that estimates whether generated images could be considered offensive or harmful.
--- a/examples/community/speech_to_image_diffusion.py
+++ b/examples/community/speech_to_image_diffusion.py
@@ -42,7 +42,7 @@ class SpeechToImagePipeline(DiffusionPipeline):
        super().__init__()

        if safety_checker is None:
-            logger.warning(
+            logger.warn(
                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
                " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
@@ -148,7 +148,7 @@ class SpeechToImagePipeline(DiffusionPipeline):
        if do_classifier_free_guidance:
            uncond_tokens: List[str]
            if negative_prompt is None:
-                uncond_tokens = [""] * batch_size
+                uncond_tokens = [""]
            elif type(prompt) is not type(negative_prompt):
                raise TypeError(
                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
@@ -177,7 +177,7 @@ class SpeechToImagePipeline(DiffusionPipeline):

            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
            seq_len = uncond_embeddings.shape[1]
-            uncond_embeddings = uncond_embeddings.repeat(1, num_images_per_prompt, 1)
+            uncond_embeddings = uncond_embeddings.repeat(batch_size, num_images_per_prompt, 1)
            uncond_embeddings = uncond_embeddings.view(batch_size * num_images_per_prompt, seq_len, -1)

            # For classifier free guidance, we need to do two forward passes.
--- a/examples/community/stable_diffusion_mega.py
+++ b/examples/community/stable_diffusion_mega.py
@@ -42,7 +42,7 @@ class StableDiffusionMegaPipeline(DiffusionPipeline):
            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
        unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
        scheduler ([`SchedulerMixin`]):
-            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of
            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
        safety_checker ([`StableDiffusionMegaSafetyChecker`]):
            Classification module that estimates whether generated images could be considered offensive or harmful.
@@ -50,7 +50,6 @@ class StableDiffusionMegaPipeline(DiffusionPipeline):
        feature_extractor ([`CLIPFeatureExtractor`]):
            Model that extracts features from generated images to be used as inputs for the `safety_checker`.
    """
-    _optional_components = ["safety_checker", "feature_extractor"]

    def __init__(
        self,
@@ -61,7 +60,6 @@ class StableDiffusionMegaPipeline(DiffusionPipeline):
        scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler],
        safety_checker: StableDiffusionSafetyChecker,
        feature_extractor: CLIPFeatureExtractor,
-        requires_safety_checker: bool = True,
    ):
        super().__init__()
        if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1:
@@ -87,7 +85,6 @@ class StableDiffusionMegaPipeline(DiffusionPipeline):
            safety_checker=safety_checker,
            feature_extractor=feature_extractor,
        )
-        self.register_to_config(requires_safety_checker=requires_safety_checker)

    @property
    def components(self) -> Dict[str, Any]:
@@ -124,7 +121,7 @@ class StableDiffusionMegaPipeline(DiffusionPipeline):
    def inpaint(
        self,
        prompt: Union[str, List[str]],
-        image: Union[torch.FloatTensor, PIL.Image.Image],
+        init_image: Union[torch.FloatTensor, PIL.Image.Image],
        mask_image: Union[torch.FloatTensor, PIL.Image.Image],
        strength: float = 0.8,
        num_inference_steps: Optional[int] = 50,
@@ -141,7 +138,7 @@ class StableDiffusionMegaPipeline(DiffusionPipeline):
        # For more information on how this function works, please see: https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion#diffusers.StableDiffusionImg2ImgPipeline
        return StableDiffusionInpaintPipelineLegacy(**self.components)(
            prompt=prompt,
-            image=image,
+            init_image=init_image,
            mask_image=mask_image,
            strength=strength,
            num_inference_steps=num_inference_steps,
@@ -159,7 +156,7 @@ class StableDiffusionMegaPipeline(DiffusionPipeline):
    def img2img(
        self,
        prompt: Union[str, List[str]],
-        image: Union[torch.FloatTensor, PIL.Image.Image],
+        init_image: Union[torch.FloatTensor, PIL.Image.Image],
        strength: float = 0.8,
        num_inference_steps: Optional[int] = 50,
        guidance_scale: Optional[float] = 7.5,
@@ -176,7 +173,7 @@ class StableDiffusionMegaPipeline(DiffusionPipeline):
        # For more information on how this function works, please see: https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion#diffusers.StableDiffusionImg2ImgPipeline
        return StableDiffusionImg2ImgPipeline(**self.components)(
            prompt=prompt,
-            image=image,
+            init_image=init_image,
            strength=strength,
            num_inference_steps=num_inference_steps,
            guidance_scale=guidance_scale,
--- a/examples/community/text_inpainting.py
+++ b/examples/community/text_inpainting.py
@@ -1,302 +0,0 @@
-from typing import Callable, List, Optional, Union
-
-import torch
-
-import PIL
-from diffusers.configuration_utils import FrozenDict
-from diffusers.models import AutoencoderKL, UNet2DConditionModel
-from diffusers.pipeline_utils import DiffusionPipeline
-from diffusers.pipelines.stable_diffusion import StableDiffusionInpaintPipeline
-from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
-from diffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
-from diffusers.utils import deprecate, is_accelerate_available, logging
-from transformers import (
-    CLIPFeatureExtractor,
-    CLIPSegForImageSegmentation,
-    CLIPSegProcessor,
-    CLIPTextModel,
-    CLIPTokenizer,
-)
-
-
-logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
-
-
-class TextInpainting(DiffusionPipeline):
-    r"""
-    Pipeline for text based inpainting using Stable Diffusion.
-    Uses CLIPSeg to get a mask from the given text, then calls the Inpainting pipeline with the generated mask
-
-    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
-    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
-
-    Args:
-        segmentation_model ([`CLIPSegForImageSegmentation`]):
-            CLIPSeg Model to generate mask from the given text. Please refer to the [model card]() for details.
-        segmentation_processor ([`CLIPSegProcessor`]):
-            CLIPSeg processor to get image, text features to translate prompt to English, if necessary. Please refer to the
-            [model card](https://huggingface.co/docs/transformers/model_doc/clipseg) for details.
-        vae ([`AutoencoderKL`]):
-            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
-        text_encoder ([`CLIPTextModel`]):
-            Frozen text-encoder. Stable Diffusion uses the text portion of
-            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
-            the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
-        tokenizer (`CLIPTokenizer`):
-            Tokenizer of class
-            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
-        unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
-        scheduler ([`SchedulerMixin`]):
-            A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of
-            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
-        safety_checker ([`StableDiffusionSafetyChecker`]):
-            Classification module that estimates whether generated images could be considered offensive or harmful.
-            Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details.
-        feature_extractor ([`CLIPFeatureExtractor`]):
-            Model that extracts features from generated images to be used as inputs for the `safety_checker`.
-    """
-
-    def __init__(
-        self,
-        segmentation_model: CLIPSegForImageSegmentation,
-        segmentation_processor: CLIPSegProcessor,
-        vae: AutoencoderKL,
-        text_encoder: CLIPTextModel,
-        tokenizer: CLIPTokenizer,
-        unet: UNet2DConditionModel,
-        scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler],
-        safety_checker: StableDiffusionSafetyChecker,
-        feature_extractor: CLIPFeatureExtractor,
-    ):
-        super().__init__()
-
-        if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1:
-            deprecation_message = (
-                f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
-                f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
-                "to update the config accordingly as leaving `steps_offset` might led to incorrect results"
-                " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
-                " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
-                " file"
-            )
-            deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False)
-            new_config = dict(scheduler.config)
-            new_config["steps_offset"] = 1
-            scheduler._internal_dict = FrozenDict(new_config)
-
-        if hasattr(scheduler.config, "skip_prk_steps") and scheduler.config.skip_prk_steps is False:
-            deprecation_message = (
-                f"The configuration file of this scheduler: {scheduler} has not set the configuration"
-                " `skip_prk_steps`. `skip_prk_steps` should be set to True in the configuration file. Please make"
-                " sure to update the config accordingly as not setting `skip_prk_steps` in the config might lead to"
-                " incorrect results in future versions. If you have downloaded this checkpoint from the Hugging Face"
-                " Hub, it would be very nice if you could open a Pull request for the"
-                " `scheduler/scheduler_config.json` file"
-            )
-            deprecate("skip_prk_steps not set", "1.0.0", deprecation_message, standard_warn=False)
-            new_config = dict(scheduler.config)
-            new_config["skip_prk_steps"] = True
-            scheduler._internal_dict = FrozenDict(new_config)
-
-        if safety_checker is None:
-            logger.warning(
-                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
-                " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
-                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
-                " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
-                " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
-                " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
-            )
-
-        self.register_modules(
-            segmentation_model=segmentation_model,
-            segmentation_processor=segmentation_processor,
-            vae=vae,
-            text_encoder=text_encoder,
-            tokenizer=tokenizer,
-            unet=unet,
-            scheduler=scheduler,
-            safety_checker=safety_checker,
-            feature_extractor=feature_extractor,
-        )
-
-    def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"):
-        r"""
-        Enable sliced attention computation.
-
-        When this option is enabled, the attention module will split the input tensor in slices, to compute attention
-        in several steps. This is useful to save some memory in exchange for a small speed decrease.
-
-        Args:
-            slice_size (`str` or `int`, *optional*, defaults to `"auto"`):
-                When `"auto"`, halves the input to the attention heads, so attention will be computed in two steps. If
-                a number is provided, uses as many slices as `attention_head_dim // slice_size`. In this case,
-                `attention_head_dim` must be a multiple of `slice_size`.
-        """
-        if slice_size == "auto":
-            # half the attention head size is usually a good trade-off between
-            # speed and memory
-            slice_size = self.unet.config.attention_head_dim // 2
-        self.unet.set_attention_slice(slice_size)
-
-    def disable_attention_slicing(self):
-        r"""
-        Disable sliced attention computation. If `enable_attention_slicing` was previously invoked, this method will go
-        back to computing attention in one step.
-        """
-        # set slice_size = `None` to disable `attention slicing`
-        self.enable_attention_slicing(None)
-
-    def enable_sequential_cpu_offload(self):
-        r"""
-        Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet,
-        text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a
-        `torch.device('meta') and loaded to GPU only when their specific submodule has its `forward` method called.
-        """
-        if is_accelerate_available():
-            from accelerate import cpu_offload
-        else:
-            raise ImportError("Please install accelerate via `pip install accelerate`")
-
-        device = torch.device("cuda")
-
-        for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae, self.safety_checker]:
-            if cpu_offloaded_model is not None:
-                cpu_offload(cpu_offloaded_model, device)
-
-    @property
-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._execution_device
-    def _execution_device(self):
-        r"""
-        Returns the device on which the pipeline's models will be executed. After calling
-        `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module
-        hooks.
-        """
-        if self.device != torch.device("meta") or not hasattr(self.unet, "_hf_hook"):
-            return self.device
-        for module in self.unet.modules():
-            if (
-                hasattr(module, "_hf_hook")
-                and hasattr(module._hf_hook, "execution_device")
-                and module._hf_hook.execution_device is not None
-            ):
-                return torch.device(module._hf_hook.execution_device)
-        return self.device
-
-    @torch.no_grad()
-    def __call__(
-        self,
-        prompt: Union[str, List[str]],
-        image: Union[torch.FloatTensor, PIL.Image.Image],
-        text: str,
-        height: int = 512,
-        width: int = 512,
-        num_inference_steps: int = 50,
-        guidance_scale: float = 7.5,
-        negative_prompt: Optional[Union[str, List[str]]] = None,
-        num_images_per_prompt: Optional[int] = 1,
-        eta: float = 0.0,
-        generator: Optional[torch.Generator] = None,
-        latents: Optional[torch.FloatTensor] = None,
-        output_type: Optional[str] = "pil",
-        return_dict: bool = True,
-        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
-        callback_steps: Optional[int] = 1,
-        **kwargs,
-    ):
-        r"""
-        Function invoked when calling the pipeline for generation.
-
-        Args:
-            prompt (`str` or `List[str]`):
-                The prompt or prompts to guide the image generation.
-            image (`PIL.Image.Image`):
-                `Image`, or tensor representing an image batch which will be inpainted, *i.e.* parts of the image will
-                be masked out with `mask_image` and repainted according to `prompt`.
-            text (`str``):
-                The text to use to generate the mask.
-            height (`int`, *optional*, defaults to 512):
-                The height in pixels of the generated image.
-            width (`int`, *optional*, defaults to 512):
-                The width in pixels of the generated image.
-            num_inference_steps (`int`, *optional*, defaults to 50):
-                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
-                expense of slower inference.
-            guidance_scale (`float`, *optional*, defaults to 7.5):
-                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
-                `guidance_scale` is defined as `w` of equation 2. of [Imagen
-                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
-                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
-                usually at the expense of lower image quality.
-            negative_prompt (`str` or `List[str]`, *optional*):
-                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
-                if `guidance_scale` is less than `1`).
-            num_images_per_prompt (`int`, *optional*, defaults to 1):
-                The number of images to generate per prompt.
-            eta (`float`, *optional*, defaults to 0.0):
-                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
-                [`schedulers.DDIMScheduler`], will be ignored for others.
-            generator (`torch.Generator`, *optional*):
-                A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
-                deterministic.
-            latents (`torch.FloatTensor`, *optional*):
-                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
-                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
-                tensor will ge generated by sampling using the supplied random `generator`.
-            output_type (`str`, *optional*, defaults to `"pil"`):
-                The output format of the generate image. Choose between
-                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
-            return_dict (`bool`, *optional*, defaults to `True`):
-                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
-                plain tuple.
-            callback (`Callable`, *optional*):
-                A function that will be called every `callback_steps` steps during inference. The function will be
-                called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
-            callback_steps (`int`, *optional*, defaults to 1):
-                The frequency at which the `callback` function will be called. If not specified, the callback will be
-                called at every step.
-
-        Returns:
-            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
-            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
-            When returning a tuple, the first element is a list with the generated images, and the second element is a
-            list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
-            (nsfw) content, according to the `safety_checker`.
-        """
-
-        # We use the input text to generate the mask
-        inputs = self.segmentation_processor(
-            text=[text], images=[image], padding="max_length", return_tensors="pt"
-        ).to(self.device)
-        outputs = self.segmentation_model(**inputs)
-        mask = torch.sigmoid(outputs.logits).cpu().detach().unsqueeze(-1).numpy()
-        mask_pil = self.numpy_to_pil(mask)[0].resize(image.size)
-
-        # Run inpainting pipeline with the generated mask
-        inpainting_pipeline = StableDiffusionInpaintPipeline(
-            vae=self.vae,
-            text_encoder=self.text_encoder,
-            tokenizer=self.tokenizer,
-            unet=self.unet,
-            scheduler=self.scheduler,
-            safety_checker=self.safety_checker,
-            feature_extractor=self.feature_extractor,
-        )
-        return inpainting_pipeline(
-            prompt=prompt,
-            image=image,
-            mask_image=mask_pil,
-            height=height,
-            width=width,
-            num_inference_steps=num_inference_steps,
-            guidance_scale=guidance_scale,
-            negative_prompt=negative_prompt,
-            num_images_per_prompt=num_images_per_prompt,
-            eta=eta,
-            generator=generator,
-            latents=latents,
-            output_type=output_type,
-            return_dict=return_dict,
-            callback=callback,
-            callback_steps=callback_steps,
-        )
--- a/examples/community/wildcard_stable_diffusion.py
+++ b/examples/community/wildcard_stable_diffusion.py
@@ -99,7 +99,7 @@ class WildcardStableDiffusionPipeline(DiffusionPipeline):
            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
        unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
        scheduler ([`SchedulerMixin`]):
-            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of
            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
        safety_checker ([`StableDiffusionSafetyChecker`]):
            Classification module that estimates whether generated images could be considered offensive or harmful.
@@ -135,7 +135,7 @@ class WildcardStableDiffusionPipeline(DiffusionPipeline):
            scheduler._internal_dict = FrozenDict(new_config)

        if safety_checker is None:
-            logger.warning(
+            logger.warn(
                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
                " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
@@ -295,7 +295,7 @@ class WildcardStableDiffusionPipeline(DiffusionPipeline):
        if do_classifier_free_guidance:
            uncond_tokens: List[str]
            if negative_prompt is None:
-                uncond_tokens = [""] * batch_size
+                uncond_tokens = [""]
            elif type(prompt) is not type(negative_prompt):
                raise TypeError(
                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
@@ -324,7 +324,7 @@ class WildcardStableDiffusionPipeline(DiffusionPipeline):

            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
            seq_len = uncond_embeddings.shape[1]
-            uncond_embeddings = uncond_embeddings.repeat(1, num_images_per_prompt, 1)
+            uncond_embeddings = uncond_embeddings.repeat(batch_size, num_images_per_prompt, 1)
            uncond_embeddings = uncond_embeddings.view(batch_size * num_images_per_prompt, seq_len, -1)

            # For classifier free guidance, we need to do two forward passes.
--- a/examples/dreambooth/README.md
+++ b/examples/dreambooth/README.md
@@ -9,18 +9,8 @@ The `train_dreambooth.py` script shows how to implement the training procedure a

 Before running the scripts, make sure to install the library's training dependencies:

-**Important**
-
-To make sure you can successfully run the latest versions of the example scripts, we highly recommend **installing from source** and keeping the install up to date as we update the example scripts frequently and install some example-specific requirements. To do this, execute the following steps in a new virtual environment:
 ```bash
-git clone https://github.com/huggingface/diffusers
-cd diffusers
-pip install -e .
-```
-
-Then cd in the example folder and run
-```bash
-pip install -r requirements.txt
+pip install -U -r requirements.txt
 ```

 And initialize an [🤗Accelerate](https://github.com/huggingface/accelerate/) environment with:
@@ -29,19 +19,6 @@ And initialize an [🤗Accelerate](https://github.com/huggingface/accelerate/) e
 accelerate config
 ```

-Or for a default accelerate configuration without answering questions about your environment
-
-```bash
-accelerate config default
-```
-
-Or if your environment doesn't support an interactive shell e.g. a notebook
-
-```python
-from accelerate.utils import write_basic_config
-write_basic_config()
-```
-
 ### Dog toy example

 You need to accept the model license before downloading or using the weights. In this example we'll use model version `v1-4`, so you'll need to visit [its card](https://huggingface.co/CompVis/stable-diffusion-v1-4), read the license and tick the checkbox if you agree. 
@@ -62,8 +39,6 @@ Now let's get our dataset. Download images from [here](https://drive.google.com/

 And launch the training using

-**___Note: Change the `resolution` to 768 if you are using the [stable-diffusion-2](https://huggingface.co/stabilityai/stable-diffusion-2) 768x768 model.___**
-
 ```bash
 export MODEL_NAME="CompVis/stable-diffusion-v1-4"
 export INSTANCE_DIR="path-to-instance-images"
@@ -86,7 +61,7 @@ accelerate launch train_dreambooth.py \
 ### Training with prior-preservation loss

 Prior-preservation is used to avoid overfitting and language-drift. Refer to the paper to learn more about it. For prior-preservation we first generate images using the model with a class prompt and then use those during training along with our data.
-According to the paper, it's recommended to generate `num_epochs * num_samples` images for prior-preservation. 200-300 works well for most cases. The `num_class_images` flag sets the number of images to generate with the class prompt. You can place existing images in `class_data_dir`, and the training script will generate any additional images so that `num_class_images` are present in `class_data_dir` during training time.
+According to the paper, it's recommended to generate `num_epochs * num_samples` images for prior-preservation. 200-300 works well for most cases.

 ```bash
 export MODEL_NAME="CompVis/stable-diffusion-v1-4"
@@ -117,7 +92,7 @@ accelerate launch train_dreambooth.py \

 With the help of gradient checkpointing and the 8-bit optimizer from bitsandbytes it's possible to run train dreambooth on a 16GB GPU.

-To install `bitandbytes` please refer to this [readme](https://github.com/TimDettmers/bitsandbytes#requirements--installation).
+Install `bitsandbytes` with `pip install bitsandbytes`

 ```bash
 export MODEL_NAME="CompVis/stable-diffusion-v1-4"
@@ -166,7 +141,7 @@ export INSTANCE_DIR="path-to-instance-images"
 export CLASS_DIR="path-to-class-images"
 export OUTPUT_DIR="path-to-save-model"

-accelerate launch --mixed_precision="fp16" train_dreambooth.py \
+accelerate launch train_dreambooth.py \
  --pretrained_model_name_or_path=$MODEL_NAME \
  --instance_data_dir=$INSTANCE_DIR \
  --class_data_dir=$CLASS_DIR \
@@ -182,7 +157,8 @@ accelerate launch --mixed_precision="fp16" train_dreambooth.py \
  --lr_scheduler="constant" \
  --lr_warmup_steps=0 \
  --num_class_images=200 \
-  --max_train_steps=800
+  --max_train_steps=800 \
+  --mixed_precision=fp16
 ```

 ### Fine-tune text encoder with the UNet.
@@ -209,7 +185,7 @@ accelerate launch train_dreambooth.py \
  --class_prompt="a photo of dog" \
  --resolution=512 \
  --train_batch_size=1 \
-  --use_8bit_adam \
+  --use_8bit_adam \
  --gradient_checkpointing \
  --learning_rate=2e-6 \
  --lr_scheduler="constant" \
@@ -218,17 +194,6 @@ accelerate launch train_dreambooth.py \
  --max_train_steps=800
 ```

-### Using DreamBooth for other pipelines than Stable Diffusion
-
-Altdiffusion also support dreambooth now, the runing comman is basically the same as abouve, all you need to do is replace the `MODEL_NAME` like this:
-One can now simply change the `pretrained_model_name_or_path` to another architecture such as [`AltDiffusion`](https://huggingface.co/docs/diffusers/api/pipelines/alt_diffusion).
-
-```
-export MODEL_NAME="CompVis/stable-diffusion-v1-4" --> export MODEL_NAME="BAAI/AltDiffusion-m9"
-or
-export MODEL_NAME="CompVis/stable-diffusion-v1-4" --> export MODEL_NAME="BAAI/AltDiffusion"
-```
-
 ### Inference

 Once you have trained a model using above command, the inference can be done simply using the `StableDiffusionPipeline`. Make sure to include the `identifier`(e.g. sks in above example) in your prompt.
@@ -326,98 +291,4 @@ python train_dreambooth_flax.py \
  --learning_rate=2e-6 \
  --num_class_images=200 \
  --max_train_steps=800
-```
-
-### Training with prior-preservation loss
-
-Prior-preservation is used to avoid overfitting and language-drift. Refer to the paper to learn more about it. For prior-preservation we first generate images using the model with a class prompt and then use those during training along with our data.
-According to the paper, it's recommended to generate `num_epochs * num_samples` images for prior-preservation. 200-300 works well for most cases.
-
-```bash
-export MODEL_NAME="runwayml/stable-diffusion-inpainting"
-export INSTANCE_DIR="path-to-instance-images"
-export CLASS_DIR="path-to-class-images"
-export OUTPUT_DIR="path-to-save-model"
-
-accelerate launch train_dreambooth_inpaint.py \
-  --pretrained_model_name_or_path=$MODEL_NAME  \
-  --instance_data_dir=$INSTANCE_DIR \
-  --class_data_dir=$CLASS_DIR \
-  --output_dir=$OUTPUT_DIR \
-  --with_prior_preservation --prior_loss_weight=1.0 \
-  --instance_prompt="a photo of sks dog" \
-  --class_prompt="a photo of dog" \
-  --resolution=512 \
-  --train_batch_size=1 \
-  --gradient_accumulation_steps=1 \
-  --learning_rate=5e-6 \
-  --lr_scheduler="constant" \
-  --lr_warmup_steps=0 \
-  --num_class_images=200 \
-  --max_train_steps=800
-```
-
-
-### Training with gradient checkpointing and 8-bit optimizer:
-
-With the help of gradient checkpointing and the 8-bit optimizer from bitsandbytes it's possible to run train dreambooth on a 16GB GPU.
-
-To install `bitandbytes` please refer to this [readme](https://github.com/TimDettmers/bitsandbytes#requirements--installation).
-
-```bash
-export MODEL_NAME="runwayml/stable-diffusion-inpainting"
-export INSTANCE_DIR="path-to-instance-images"
-export CLASS_DIR="path-to-class-images"
-export OUTPUT_DIR="path-to-save-model"
-
-accelerate launch train_dreambooth_inpaint.py \
-  --pretrained_model_name_or_path=$MODEL_NAME  \
-  --instance_data_dir=$INSTANCE_DIR \
-  --class_data_dir=$CLASS_DIR \
-  --output_dir=$OUTPUT_DIR \
-  --with_prior_preservation --prior_loss_weight=1.0 \
-  --instance_prompt="a photo of sks dog" \
-  --class_prompt="a photo of dog" \
-  --resolution=512 \
-  --train_batch_size=1 \
-  --gradient_accumulation_steps=2 --gradient_checkpointing \
-  --use_8bit_adam \
-  --learning_rate=5e-6 \
-  --lr_scheduler="constant" \
-  --lr_warmup_steps=0 \
-  --num_class_images=200 \
-  --max_train_steps=800
-```
-
-### Fine-tune text encoder with the UNet.
-
-The script also allows to fine-tune the `text_encoder` along with the `unet`. It's been observed experimentally that fine-tuning `text_encoder` gives much better results especially on faces. 
-Pass the `--train_text_encoder` argument to the script to enable training `text_encoder`.
-
-___Note: Training text encoder requires more memory, with this option the training won't fit on 16GB GPU. It needs at least 24GB VRAM.___
-
-```bash
-export MODEL_NAME="runwayml/stable-diffusion-inpainting"
-export INSTANCE_DIR="path-to-instance-images"
-export CLASS_DIR="path-to-class-images"
-export OUTPUT_DIR="path-to-save-model"
-
-accelerate launch train_dreambooth_inpaint.py \
-  --pretrained_model_name_or_path=$MODEL_NAME  \
-  --train_text_encoder \
-  --instance_data_dir=$INSTANCE_DIR \
-  --class_data_dir=$CLASS_DIR \
-  --output_dir=$OUTPUT_DIR \
-  --with_prior_preservation --prior_loss_weight=1.0 \
-  --instance_prompt="a photo of sks dog" \
-  --class_prompt="a photo of dog" \
-  --resolution=512 \
-  --train_batch_size=1 \
-  --use_8bit_adam \
-  --gradient_checkpointing \
-  --learning_rate=2e-6 \
-  --lr_scheduler="constant" \
-  --lr_warmup_steps=0 \
-  --num_class_images=200 \
-  --max_train_steps=800
-```
+```
--- a/examples/dreambooth/requirements.txt
+++ b/examples/dreambooth/requirements.txt
@@ -1,3 +1,4 @@
+diffusers>=0.5.0
 accelerate
 torchvision
 transformers>=4.21.0
--- a/examples/dreambooth/requirements_flax.txt
+++ b/examples/dreambooth/requirements_flax.txt
@@ -1,3 +1,4 @@
+diffusers>=0.5.1
 transformers>=4.21.0
 flax
 optax
--- a/examples/dreambooth/train_dreambooth.py
+++ b/examples/dreambooth/train_dreambooth.py
@@ -14,42 +14,18 @@ from torch.utils.data import Dataset
 from accelerate import Accelerator
 from accelerate.logging import get_logger
 from accelerate.utils import set_seed
-from diffusers import AutoencoderKL, DDPMScheduler, DiffusionPipeline, UNet2DConditionModel
+from diffusers import AutoencoderKL, DDPMScheduler, StableDiffusionPipeline, UNet2DConditionModel
 from diffusers.optimization import get_scheduler
-from diffusers.utils import check_min_version
 from huggingface_hub import HfFolder, Repository, whoami
 from PIL import Image
 from torchvision import transforms
 from tqdm.auto import tqdm
-from transformers import AutoTokenizer, PretrainedConfig
+from transformers import CLIPTextModel, CLIPTokenizer


-# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.10.0.dev0")
-
 logger = get_logger(__name__)


-def import_model_class_from_model_name_or_path(pretrained_model_name_or_path: str, revision: str):
-    text_encoder_config = PretrainedConfig.from_pretrained(
-        pretrained_model_name_or_path,
-        subfolder="text_encoder",
-        revision=revision,
-    )
-    model_class = text_encoder_config.architectures[0]
-
-    if model_class == "CLIPTextModel":
-        from transformers import CLIPTextModel
-
-        return CLIPTextModel
-    elif model_class == "RobertaSeriesModelWithTransformation":
-        from diffusers.pipelines.alt_diffusion.modeling_roberta_series import RobertaSeriesModelWithTransformation
-
-        return RobertaSeriesModelWithTransformation
-    else:
-        raise ValueError(f"{model_class} is not supported.")
-
-
 def parse_args(input_args=None):
    parser = argparse.ArgumentParser(description="Simple example of a training script.")
    parser.add_argument(
@@ -90,7 +66,6 @@ def parse_args(input_args=None):
        "--instance_prompt",
        type=str,
        default=None,
-        required=True,
        help="The prompt with identifier specifying the instance",
    )
    parser.add_argument(
@@ -111,8 +86,8 @@ def parse_args(input_args=None):
        type=int,
        default=100,
        help=(
-            "Minimal class images for prior preservation loss. If there are not enough images already present in"
-            " class_data_dir, additional images will be sampled with class_prompt."
+            "Minimal class images for prior preservation loss. If not have enough images, additional images will be"
+            " sampled with class_prompt."
        ),
    )
    parser.add_argument(
@@ -148,7 +123,6 @@ def parse_args(input_args=None):
        default=None,
        help="Total number of training steps to perform.  If provided, overrides num_train_epochs.",
    )
-    parser.add_argument("--save_steps", type=int, default=500, help="Save checkpoint every X updates steps.")
    parser.add_argument(
        "--gradient_accumulation_steps",
        type=int,
@@ -212,12 +186,12 @@ def parse_args(input_args=None):
    parser.add_argument(
        "--mixed_precision",
        type=str,
-        default=None,
+        default="no",
        choices=["no", "fp16", "bf16"],
        help=(
-            "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >="
-            " 1.10.and an Nvidia Ampere GPU.  Default to the value of accelerate config of the current system or the"
-            " flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config."
+            "Whether to use mixed precision. Choose"
+            "between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >= 1.10."
+            "and an Nvidia Ampere GPU."
        ),
    )
    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
@@ -231,16 +205,14 @@ def parse_args(input_args=None):
    if env_local_rank != -1 and env_local_rank != args.local_rank:
        args.local_rank = env_local_rank

+    if args.instance_data_dir is None:
+        raise ValueError("You must specify a train data directory.")
+
    if args.with_prior_preservation:
        if args.class_data_dir is None:
            raise ValueError("You must specify a data directory for class images.")
        if args.class_prompt is None:
            raise ValueError("You must specify prompt for class images.")
-    else:
-        if args.class_data_dir is not None:
-            logger.warning("You need not use --class_data_dir without --with_prior_preservation.")
-        if args.class_prompt is not None:
-            logger.warning("You need not use --class_prompt without --with_prior_preservation.")

    return args

@@ -304,10 +276,9 @@ class DreamBoothDataset(Dataset):
        example["instance_images"] = self.image_transforms(instance_image)
        example["instance_prompt_ids"] = self.tokenizer(
            self.instance_prompt,
+            padding="do_not_pad",
            truncation=True,
-            padding="max_length",
            max_length=self.tokenizer.model_max_length,
-            return_tensors="pt",
        ).input_ids

        if self.class_data_root:
@@ -317,37 +288,14 @@ class DreamBoothDataset(Dataset):
            example["class_images"] = self.image_transforms(class_image)
            example["class_prompt_ids"] = self.tokenizer(
                self.class_prompt,
+                padding="do_not_pad",
                truncation=True,
-                padding="max_length",
                max_length=self.tokenizer.model_max_length,
-                return_tensors="pt",
            ).input_ids

        return example


-def collate_fn(examples, with_prior_preservation=False):
-    input_ids = [example["instance_prompt_ids"] for example in examples]
-    pixel_values = [example["instance_images"] for example in examples]
-
-    # Concat class and instance examples for prior preservation.
-    # We do this to avoid doing two forward passes.
-    if with_prior_preservation:
-        input_ids += [example["class_prompt_ids"] for example in examples]
-        pixel_values += [example["class_images"] for example in examples]
-
-    pixel_values = torch.stack(pixel_values)
-    pixel_values = pixel_values.to(memory_format=torch.contiguous_format).float()
-
-    input_ids = torch.cat(input_ids, dim=0)
-
-    batch = {
-        "input_ids": input_ids,
-        "pixel_values": pixel_values,
-    }
-    return batch
-
-
 class PromptDataset(Dataset):
    "A simple dataset to prepare the prompts to generate class images on multiple GPUs."

@@ -405,7 +353,7 @@ def main(args):

        if cur_class_images < args.num_class_images:
            torch_dtype = torch.float16 if accelerator.device.type == "cuda" else torch.float32
-            pipeline = DiffusionPipeline.from_pretrained(
+            pipeline = StableDiffusionPipeline.from_pretrained(
                args.pretrained_model_name_or_path,
                torch_dtype=torch_dtype,
                safety_checker=None,
@@ -455,24 +403,19 @@ def main(args):

    # Load the tokenizer
    if args.tokenizer_name:
-        tokenizer = AutoTokenizer.from_pretrained(
+        tokenizer = CLIPTokenizer.from_pretrained(
            args.tokenizer_name,
            revision=args.revision,
-            use_fast=False,
        )
    elif args.pretrained_model_name_or_path:
-        tokenizer = AutoTokenizer.from_pretrained(
+        tokenizer = CLIPTokenizer.from_pretrained(
            args.pretrained_model_name_or_path,
            subfolder="tokenizer",
            revision=args.revision,
-            use_fast=False,
        )

-    # import correct text encoder class
-    text_encoder_cls = import_model_class_from_model_name_or_path(args.pretrained_model_name_or_path, args.revision)
-
    # Load models and create wrapper for stable diffusion
-    text_encoder = text_encoder_cls.from_pretrained(
+    text_encoder = CLIPTextModel.from_pretrained(
        args.pretrained_model_name_or_path,
        subfolder="text_encoder",
        revision=args.revision,
@@ -526,7 +469,9 @@ def main(args):
        eps=args.adam_epsilon,
    )

-    noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")
+    noise_scheduler = DDPMScheduler(
+        beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", num_train_timesteps=1000
+    )

    train_dataset = DreamBoothDataset(
        instance_data_root=args.instance_data_dir,
@@ -538,12 +483,29 @@ def main(args):
        center_crop=args.center_crop,
    )

+    def collate_fn(examples):
+        input_ids = [example["instance_prompt_ids"] for example in examples]
+        pixel_values = [example["instance_images"] for example in examples]
+
+        # Concat class and instance examples for prior preservation.
+        # We do this to avoid doing two forward passes.
+        if args.with_prior_preservation:
+            input_ids += [example["class_prompt_ids"] for example in examples]
+            pixel_values += [example["class_images"] for example in examples]
+
+        pixel_values = torch.stack(pixel_values)
+        pixel_values = pixel_values.to(memory_format=torch.contiguous_format).float()
+
+        input_ids = tokenizer.pad({"input_ids": input_ids}, padding=True, return_tensors="pt").input_ids
+
+        batch = {
+            "input_ids": input_ids,
+            "pixel_values": pixel_values,
+        }
+        return batch
+
    train_dataloader = torch.utils.data.DataLoader(
-        train_dataset,
-        batch_size=args.train_batch_size,
-        shuffle=True,
-        collate_fn=lambda examples: collate_fn(examples, args.with_prior_preservation),
-        num_workers=1,
+        train_dataset, batch_size=args.train_batch_size, shuffle=True, collate_fn=collate_fn, num_workers=1
    )

    # Scheduler and math around the number of training steps.
@@ -570,9 +532,9 @@ def main(args):
        )

    weight_dtype = torch.float32
-    if accelerator.mixed_precision == "fp16":
+    if args.mixed_precision == "fp16":
        weight_dtype = torch.float16
-    elif accelerator.mixed_precision == "bf16":
+    elif args.mixed_precision == "bf16":
        weight_dtype = torch.bfloat16

    # Move text_encode and vae to gpu.
@@ -635,31 +597,23 @@ def main(args):
                encoder_hidden_states = text_encoder(batch["input_ids"])[0]

                # Predict the noise residual
-                model_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample
-
-                # Get the target for loss depending on the prediction type
-                if noise_scheduler.config.prediction_type == "epsilon":
-                    target = noise
-                elif noise_scheduler.config.prediction_type == "v_prediction":
-                    target = noise_scheduler.get_velocity(latents, noise, timesteps)
-                else:
-                    raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}")
+                noise_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample

                if args.with_prior_preservation:
-                    # Chunk the noise and model_pred into two parts and compute the loss on each part separately.
-                    model_pred, model_pred_prior = torch.chunk(model_pred, 2, dim=0)
-                    target, target_prior = torch.chunk(target, 2, dim=0)
+                    # Chunk the noise and noise_pred into two parts and compute the loss on each part separately.
+                    noise_pred, noise_pred_prior = torch.chunk(noise_pred, 2, dim=0)
+                    noise, noise_prior = torch.chunk(noise, 2, dim=0)

                    # Compute instance loss
-                    loss = F.mse_loss(model_pred.float(), target.float(), reduction="none").mean([1, 2, 3]).mean()
+                    loss = F.mse_loss(noise_pred.float(), noise.float(), reduction="none").mean([1, 2, 3]).mean()

                    # Compute prior loss
-                    prior_loss = F.mse_loss(model_pred_prior.float(), target_prior.float(), reduction="mean")
+                    prior_loss = F.mse_loss(noise_pred_prior.float(), noise_prior.float(), reduction="mean")

                    # Add the prior loss to the instance loss.
                    loss = loss + args.prior_loss_weight * prior_loss
                else:
-                    loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean")
+                    loss = F.mse_loss(noise_pred.float(), noise.float(), reduction="mean")

                accelerator.backward(loss)
                if accelerator.sync_gradients:
@@ -678,17 +632,6 @@ def main(args):
                progress_bar.update(1)
                global_step += 1

-                if global_step % args.save_steps == 0:
-                    if accelerator.is_main_process:
-                        pipeline = DiffusionPipeline.from_pretrained(
-                            args.pretrained_model_name_or_path,
-                            unet=accelerator.unwrap_model(unet),
-                            text_encoder=accelerator.unwrap_model(text_encoder),
-                            revision=args.revision,
-                        )
-                        save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}")
-                        pipeline.save_pretrained(save_path)
-
            logs = {"loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]}
            progress_bar.set_postfix(**logs)
            accelerator.log(logs, step=global_step)
@@ -700,7 +643,7 @@ def main(args):

    # Create the pipeline using using the trained modules and save it.
    if accelerator.is_main_process:
-        pipeline = DiffusionPipeline.from_pretrained(
+        pipeline = StableDiffusionPipeline.from_pretrained(
            args.pretrained_model_name_or_path,
            unet=accelerator.unwrap_model(unet),
            text_encoder=accelerator.unwrap_model(text_encoder),
--- a/examples/dreambooth/train_dreambooth_flax.py
+++ b/examples/dreambooth/train_dreambooth_flax.py
@@ -23,7 +23,6 @@ from diffusers import (
    FlaxUNet2DConditionModel,
 )
 from diffusers.pipelines.stable_diffusion import FlaxStableDiffusionSafetyChecker
-from diffusers.utils import check_min_version
 from flax import jax_utils
 from flax.training import train_state
 from flax.training.common_utils import shard
@@ -34,9 +33,6 @@ from tqdm.auto import tqdm
 from transformers import CLIPFeatureExtractor, CLIPTokenizer, FlaxCLIPTextModel, set_seed


-# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.10.0.dev0")
-
 logger = logging.getLogger(__name__)


@@ -93,8 +89,8 @@ def parse_args():
        type=int,
        default=100,
        help=(
-            "Minimal class images for prior preservation loss. If there are not enough images already present in"
-            " class_data_dir, additional images will be sampled with class_prompt."
+            "Minimal class images for prior preservation loss. If not have enough images, additional images will be"
+            " sampled with class_prompt."
        ),
    )
    parser.add_argument(
@@ -331,6 +327,22 @@ def main():
    if args.seed is not None:
        set_seed(args.seed)

+    if jax.process_index() == 0:
+        if args.push_to_hub:
+            if args.hub_model_id is None:
+                repo_name = get_full_repo_name(Path(args.output_dir).name, token=args.hub_token)
+            else:
+                repo_name = args.hub_model_id
+            repo = Repository(args.output_dir, clone_from=repo_name)
+
+            with open(os.path.join(args.output_dir, ".gitignore"), "w+") as gitignore:
+                if "step_*" not in gitignore:
+                    gitignore.write("step_*\n")
+                if "epoch_*" not in gitignore:
+                    gitignore.write("epoch_*\n")
+        elif args.output_dir is not None:
+            os.makedirs(args.output_dir, exist_ok=True)
+
    rng = jax.random.PRNGKey(args.seed)

    if args.with_prior_preservation:
@@ -349,8 +361,7 @@ def main():
            logger.info(f"Number of class images to sample: {num_new_images}.")

            sample_dataset = PromptDataset(args.class_prompt, num_new_images)
-            total_sample_batch_size = args.sample_batch_size * jax.local_device_count()
-            sample_dataloader = torch.utils.data.DataLoader(sample_dataset, batch_size=total_sample_batch_size)
+            sample_dataloader = torch.utils.data.DataLoader(sample_dataset, batch_size=args.sample_batch_size)

            for example in tqdm(
                sample_dataloader, desc="Generating class images", disable=not jax.process_index() == 0
@@ -440,9 +451,7 @@ def main():
        weight_dtype = jnp.bfloat16

    # Load models and create wrapper for stable diffusion
-    text_encoder = FlaxCLIPTextModel.from_pretrained(
-        args.pretrained_model_name_or_path, subfolder="text_encoder", dtype=weight_dtype
-    )
+    text_encoder = FlaxCLIPTextModel.from_pretrained("openai/clip-vit-large-patch14", dtype=weight_dtype)
    vae, vae_params = FlaxAutoencoderKL.from_pretrained(
        args.pretrained_model_name_or_path, subfolder="vae", dtype=weight_dtype
    )
--- a/examples/research_projects/README.md
+++ b/examples/research_projects/README.md
@@ -1,14 +0,0 @@
-# Research projects
-
-This folder contains various research projects using 🧨 Diffusers. 
-They are not really maintained by the core maintainers of this library and often require a specific version of Diffusers that is indicated in the requirements file of each folder. 
-Updating them to the most recent version of the library will require some work.
-
-To use any of them, just run the command
-
-```
-pip install -r requirements.txt
-```
-inside the folder of your choice.
-
-If you need help with any of those, please open an issue where you directly ping the author(s), as indicated at the top of the README of each folder.
--- a/examples/research_projects/dreambooth_inpaint/README.md
+++ b/examples/research_projects/dreambooth_inpaint/README.md
@@ -1,26 +0,0 @@
-# Dreambooth for the inpainting model
-
-This script was added by @thedarkzeno .
-
-Please note that this script is not actively maintained, you can open an issue and tag @thedarkzeno or @patil-suraj though.
-
-```bash
-export MODEL_NAME="runwayml/stable-diffusion-inpainting"
-export INSTANCE_DIR="path-to-instance-images"
-export OUTPUT_DIR="path-to-save-model"
-
-accelerate launch train_dreambooth_inpaint.py \
-  --pretrained_model_name_or_path=$MODEL_NAME  \
-  --instance_data_dir=$INSTANCE_DIR \
-  --output_dir=$OUTPUT_DIR \
-  --instance_prompt="a photo of sks dog" \
-  --resolution=512 \
-  --train_batch_size=1 \
-  --gradient_accumulation_steps=1 \
-  --learning_rate=5e-6 \
-  --lr_scheduler="constant" \
-  --lr_warmup_steps=0 \
-  --max_train_steps=400
-```
-
-The script is also compatible with prior preservation loss and gradient checkpointing
--- a/examples/research_projects/dreambooth_inpaint/requirements.txt
+++ b/examples/research_projects/dreambooth_inpaint/requirements.txt
@@ -1,7 +0,0 @@
-diffusers==0.9.0
-accelerate
-torchvision
-transformers>=4.21.0
-ftfy
-tensorboard
-modelcards
--- a/examples/research_projects/dreambooth_inpaint/train_dreambooth_inpaint.py
+++ b/examples/research_projects/dreambooth_inpaint/train_dreambooth_inpaint.py
@@ -1,747 +0,0 @@
-import argparse
-import hashlib
-import itertools
-import math
-import os
-import random
-from pathlib import Path
-from typing import Optional
-
-import numpy as np
-import torch
-import torch.nn.functional as F
-import torch.utils.checkpoint
-from torch.utils.data import Dataset
-
-from accelerate import Accelerator
-from accelerate.logging import get_logger
-from accelerate.utils import set_seed
-from diffusers import (
-    AutoencoderKL,
-    DDPMScheduler,
-    StableDiffusionInpaintPipeline,
-    StableDiffusionPipeline,
-    UNet2DConditionModel,
-)
-from diffusers.optimization import get_scheduler
-from diffusers.utils import check_min_version
-from huggingface_hub import HfFolder, Repository, whoami
-from PIL import Image, ImageDraw
-from torchvision import transforms
-from tqdm.auto import tqdm
-from transformers import CLIPTextModel, CLIPTokenizer
-
-
-# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.10.0.dev0")
-
-logger = get_logger(__name__)
-
-
-def prepare_mask_and_masked_image(image, mask):
-    image = np.array(image.convert("RGB"))
-    image = image[None].transpose(0, 3, 1, 2)
-    image = torch.from_numpy(image).to(dtype=torch.float32) / 127.5 - 1.0
-
-    mask = np.array(mask.convert("L"))
-    mask = mask.astype(np.float32) / 255.0
-    mask = mask[None, None]
-    mask[mask < 0.5] = 0
-    mask[mask >= 0.5] = 1
-    mask = torch.from_numpy(mask)
-
-    masked_image = image * (mask < 0.5)
-
-    return mask, masked_image
-
-
-# generate random masks
-def random_mask(im_shape, ratio=1, mask_full_image=False):
-    mask = Image.new("L", im_shape, 0)
-    draw = ImageDraw.Draw(mask)
-    size = (random.randint(0, int(im_shape[0] * ratio)), random.randint(0, int(im_shape[1] * ratio)))
-    # use this to always mask the whole image
-    if mask_full_image:
-        size = (int(im_shape[0] * ratio), int(im_shape[1] * ratio))
-    limits = (im_shape[0] - size[0] // 2, im_shape[1] - size[1] // 2)
-    center = (random.randint(size[0] // 2, limits[0]), random.randint(size[1] // 2, limits[1]))
-    draw_type = random.randint(0, 1)
-    if draw_type == 0 or mask_full_image:
-        draw.rectangle(
-            (center[0] - size[0] // 2, center[1] - size[1] // 2, center[0] + size[0] // 2, center[1] + size[1] // 2),
-            fill=255,
-        )
-    else:
-        draw.ellipse(
-            (center[0] - size[0] // 2, center[1] - size[1] // 2, center[0] + size[0] // 2, center[1] + size[1] // 2),
-            fill=255,
-        )
-
-    return mask
-
-
-def parse_args():
-    parser = argparse.ArgumentParser(description="Simple example of a training script.")
-    parser.add_argument(
-        "--pretrained_model_name_or_path",
-        type=str,
-        default=None,
-        required=True,
-        help="Path to pretrained model or model identifier from huggingface.co/models.",
-    )
-    parser.add_argument(
-        "--tokenizer_name",
-        type=str,
-        default=None,
-        help="Pretrained tokenizer name or path if not the same as model_name",
-    )
-    parser.add_argument(
-        "--instance_data_dir",
-        type=str,
-        default=None,
-        required=True,
-        help="A folder containing the training data of instance images.",
-    )
-    parser.add_argument(
-        "--class_data_dir",
-        type=str,
-        default=None,
-        required=False,
-        help="A folder containing the training data of class images.",
-    )
-    parser.add_argument(
-        "--instance_prompt",
-        type=str,
-        default=None,
-        help="The prompt with identifier specifying the instance",
-    )
-    parser.add_argument(
-        "--class_prompt",
-        type=str,
-        default=None,
-        help="The prompt to specify images in the same class as provided instance images.",
-    )
-    parser.add_argument(
-        "--with_prior_preservation",
-        default=False,
-        action="store_true",
-        help="Flag to add prior preservation loss.",
-    )
-    parser.add_argument("--prior_loss_weight", type=float, default=1.0, help="The weight of prior preservation loss.")
-    parser.add_argument(
-        "--num_class_images",
-        type=int,
-        default=100,
-        help=(
-            "Minimal class images for prior preservation loss. If not have enough images, additional images will be"
-            " sampled with class_prompt."
-        ),
-    )
-    parser.add_argument(
-        "--output_dir",
-        type=str,
-        default="text-inversion-model",
-        help="The output directory where the model predictions and checkpoints will be written.",
-    )
-    parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
-    parser.add_argument(
-        "--resolution",
-        type=int,
-        default=512,
-        help=(
-            "The resolution for input images, all the images in the train/validation dataset will be resized to this"
-            " resolution"
-        ),
-    )
-    parser.add_argument(
-        "--center_crop", action="store_true", help="Whether to center crop images before resizing to resolution"
-    )
-    parser.add_argument("--train_text_encoder", action="store_true", help="Whether to train the text encoder")
-    parser.add_argument(
-        "--train_batch_size", type=int, default=4, help="Batch size (per device) for the training dataloader."
-    )
-    parser.add_argument(
-        "--sample_batch_size", type=int, default=4, help="Batch size (per device) for sampling images."
-    )
-    parser.add_argument("--num_train_epochs", type=int, default=1)
-    parser.add_argument(
-        "--max_train_steps",
-        type=int,
-        default=None,
-        help="Total number of training steps to perform.  If provided, overrides num_train_epochs.",
-    )
-    parser.add_argument(
-        "--gradient_accumulation_steps",
-        type=int,
-        default=1,
-        help="Number of updates steps to accumulate before performing a backward/update pass.",
-    )
-    parser.add_argument(
-        "--gradient_checkpointing",
-        action="store_true",
-        help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.",
-    )
-    parser.add_argument(
-        "--learning_rate",
-        type=float,
-        default=5e-6,
-        help="Initial learning rate (after the potential warmup period) to use.",
-    )
-    parser.add_argument(
-        "--scale_lr",
-        action="store_true",
-        default=False,
-        help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.",
-    )
-    parser.add_argument(
-        "--lr_scheduler",
-        type=str,
-        default="constant",
-        help=(
-            'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
-            ' "constant", "constant_with_warmup"]'
-        ),
-    )
-    parser.add_argument(
-        "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler."
-    )
-    parser.add_argument(
-        "--use_8bit_adam", action="store_true", help="Whether or not to use 8-bit Adam from bitsandbytes."
-    )
-    parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.")
-    parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.")
-    parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.")
-    parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer")
-    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
-    parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
-    parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.")
-    parser.add_argument(
-        "--hub_model_id",
-        type=str,
-        default=None,
-        help="The name of the repository to keep in sync with the local `output_dir`.",
-    )
-    parser.add_argument(
-        "--logging_dir",
-        type=str,
-        default="logs",
-        help=(
-            "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
-            " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."
-        ),
-    )
-    parser.add_argument(
-        "--mixed_precision",
-        type=str,
-        default="no",
-        choices=["no", "fp16", "bf16"],
-        help=(
-            "Whether to use mixed precision. Choose"
-            "between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >= 1.10."
-            "and an Nvidia Ampere GPU."
-        ),
-    )
-    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
-
-    args = parser.parse_args()
-    env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
-    if env_local_rank != -1 and env_local_rank != args.local_rank:
-        args.local_rank = env_local_rank
-
-    if args.instance_data_dir is None:
-        raise ValueError("You must specify a train data directory.")
-
-    if args.with_prior_preservation:
-        if args.class_data_dir is None:
-            raise ValueError("You must specify a data directory for class images.")
-        if args.class_prompt is None:
-            raise ValueError("You must specify prompt for class images.")
-
-    return args
-
-
-class DreamBoothDataset(Dataset):
-    """
-    A dataset to prepare the instance and class images with the prompts for fine-tuning the model.
-    It pre-processes the images and the tokenizes prompts.
-    """
-
-    def __init__(
-        self,
-        instance_data_root,
-        instance_prompt,
-        tokenizer,
-        class_data_root=None,
-        class_prompt=None,
-        size=512,
-        center_crop=False,
-    ):
-        self.size = size
-        self.center_crop = center_crop
-        self.tokenizer = tokenizer
-
-        self.instance_data_root = Path(instance_data_root)
-        if not self.instance_data_root.exists():
-            raise ValueError("Instance images root doesn't exists.")
-
-        self.instance_images_path = list(Path(instance_data_root).iterdir())
-        self.num_instance_images = len(self.instance_images_path)
-        self.instance_prompt = instance_prompt
-        self._length = self.num_instance_images
-
-        if class_data_root is not None:
-            self.class_data_root = Path(class_data_root)
-            self.class_data_root.mkdir(parents=True, exist_ok=True)
-            self.class_images_path = list(self.class_data_root.iterdir())
-            self.num_class_images = len(self.class_images_path)
-            self._length = max(self.num_class_images, self.num_instance_images)
-            self.class_prompt = class_prompt
-        else:
-            self.class_data_root = None
-
-        self.image_transforms_resize_and_crop = transforms.Compose(
-            [
-                transforms.Resize(size, interpolation=transforms.InterpolationMode.BILINEAR),
-                transforms.CenterCrop(size) if center_crop else transforms.RandomCrop(size),
-            ]
-        )
-
-        self.image_transforms = transforms.Compose(
-            [
-                transforms.ToTensor(),
-                transforms.Normalize([0.5], [0.5]),
-            ]
-        )
-
-    def __len__(self):
-        return self._length
-
-    def __getitem__(self, index):
-        example = {}
-        instance_image = Image.open(self.instance_images_path[index % self.num_instance_images])
-        if not instance_image.mode == "RGB":
-            instance_image = instance_image.convert("RGB")
-        instance_image = self.image_transforms_resize_and_crop(instance_image)
-
-        example["PIL_images"] = instance_image
-        example["instance_images"] = self.image_transforms(instance_image)
-
-        example["instance_prompt_ids"] = self.tokenizer(
-            self.instance_prompt,
-            padding="do_not_pad",
-            truncation=True,
-            max_length=self.tokenizer.model_max_length,
-        ).input_ids
-
-        if self.class_data_root:
-            class_image = Image.open(self.class_images_path[index % self.num_class_images])
-            if not class_image.mode == "RGB":
-                class_image = class_image.convert("RGB")
-            class_image = self.image_transforms_resize_and_crop(class_image)
-            example["class_images"] = self.image_transforms(class_image)
-            example["class_PIL_images"] = class_image
-            example["class_prompt_ids"] = self.tokenizer(
-                self.class_prompt,
-                padding="do_not_pad",
-                truncation=True,
-                max_length=self.tokenizer.model_max_length,
-            ).input_ids
-
-        return example
-
-
-class PromptDataset(Dataset):
-    "A simple dataset to prepare the prompts to generate class images on multiple GPUs."
-
-    def __init__(self, prompt, num_samples):
-        self.prompt = prompt
-        self.num_samples = num_samples
-
-    def __len__(self):
-        return self.num_samples
-
-    def __getitem__(self, index):
-        example = {}
-        example["prompt"] = self.prompt
-        example["index"] = index
-        return example
-
-
-def get_full_repo_name(model_id: str, organization: Optional[str] = None, token: Optional[str] = None):
-    if token is None:
-        token = HfFolder.get_token()
-    if organization is None:
-        username = whoami(token)["name"]
-        return f"{username}/{model_id}"
-    else:
-        return f"{organization}/{model_id}"
-
-
-def main():
-    args = parse_args()
-    logging_dir = Path(args.output_dir, args.logging_dir)
-
-    accelerator = Accelerator(
-        gradient_accumulation_steps=args.gradient_accumulation_steps,
-        mixed_precision=args.mixed_precision,
-        log_with="tensorboard",
-        logging_dir=logging_dir,
-    )
-
-    # Currently, it's not possible to do gradient accumulation when training two models with accelerate.accumulate
-    # This will be enabled soon in accelerate. For now, we don't allow gradient accumulation when training two models.
-    # TODO (patil-suraj): Remove this check when gradient accumulation with two models is enabled in accelerate.
-    if args.train_text_encoder and args.gradient_accumulation_steps > 1 and accelerator.num_processes > 1:
-        raise ValueError(
-            "Gradient accumulation is not supported when training the text encoder in distributed training. "
-            "Please set gradient_accumulation_steps to 1. This feature will be supported in the future."
-        )
-
-    if args.seed is not None:
-        set_seed(args.seed)
-
-    if args.with_prior_preservation:
-        class_images_dir = Path(args.class_data_dir)
-        if not class_images_dir.exists():
-            class_images_dir.mkdir(parents=True)
-        cur_class_images = len(list(class_images_dir.iterdir()))
-
-        if cur_class_images < args.num_class_images:
-            torch_dtype = torch.float16 if accelerator.device.type == "cuda" else torch.float32
-            pipeline = StableDiffusionInpaintPipeline.from_pretrained(
-                args.pretrained_model_name_or_path, torch_dtype=torch_dtype, safety_checker=None
-            )
-            pipeline.set_progress_bar_config(disable=True)
-
-            num_new_images = args.num_class_images - cur_class_images
-            logger.info(f"Number of class images to sample: {num_new_images}.")
-
-            sample_dataset = PromptDataset(args.class_prompt, num_new_images)
-            sample_dataloader = torch.utils.data.DataLoader(
-                sample_dataset, batch_size=args.sample_batch_size, num_workers=1
-            )
-
-            sample_dataloader = accelerator.prepare(sample_dataloader)
-            pipeline.to(accelerator.device)
-            transform_to_pil = transforms.ToPILImage()
-            for example in tqdm(
-                sample_dataloader, desc="Generating class images", disable=not accelerator.is_local_main_process
-            ):
-                bsz = len(example["prompt"])
-                fake_images = torch.rand((3, args.resolution, args.resolution))
-                transform_to_pil = transforms.ToPILImage()
-                fake_pil_images = transform_to_pil(fake_images)
-
-                fake_mask = random_mask((args.resolution, args.resolution), ratio=1, mask_full_image=True)
-
-                images = pipeline(prompt=example["prompt"], mask_image=fake_mask, image=fake_pil_images).images
-
-                for i, image in enumerate(images):
-                    hash_image = hashlib.sha1(image.tobytes()).hexdigest()
-                    image_filename = class_images_dir / f"{example['index'][i] + cur_class_images}-{hash_image}.jpg"
-                    image.save(image_filename)
-
-            del pipeline
-            if torch.cuda.is_available():
-                torch.cuda.empty_cache()
-
-    # Handle the repository creation
-    if accelerator.is_main_process:
-        if args.push_to_hub:
-            if args.hub_model_id is None:
-                repo_name = get_full_repo_name(Path(args.output_dir).name, token=args.hub_token)
-            else:
-                repo_name = args.hub_model_id
-            repo = Repository(args.output_dir, clone_from=repo_name)
-
-            with open(os.path.join(args.output_dir, ".gitignore"), "w+") as gitignore:
-                if "step_*" not in gitignore:
-                    gitignore.write("step_*\n")
-                if "epoch_*" not in gitignore:
-                    gitignore.write("epoch_*\n")
-        elif args.output_dir is not None:
-            os.makedirs(args.output_dir, exist_ok=True)
-
-    # Load the tokenizer
-    if args.tokenizer_name:
-        tokenizer = CLIPTokenizer.from_pretrained(args.tokenizer_name)
-    elif args.pretrained_model_name_or_path:
-        tokenizer = CLIPTokenizer.from_pretrained(args.pretrained_model_name_or_path, subfolder="tokenizer")
-
-    # Load models and create wrapper for stable diffusion
-    text_encoder = CLIPTextModel.from_pretrained(args.pretrained_model_name_or_path, subfolder="text_encoder")
-    vae = AutoencoderKL.from_pretrained(args.pretrained_model_name_or_path, subfolder="vae")
-    unet = UNet2DConditionModel.from_pretrained(args.pretrained_model_name_or_path, subfolder="unet")
-
-    vae.requires_grad_(False)
-    if not args.train_text_encoder:
-        text_encoder.requires_grad_(False)
-
-    if args.gradient_checkpointing:
-        unet.enable_gradient_checkpointing()
-        if args.train_text_encoder:
-            text_encoder.gradient_checkpointing_enable()
-
-    if args.scale_lr:
-        args.learning_rate = (
-            args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * accelerator.num_processes
-        )
-
-    # Use 8-bit Adam for lower memory usage or to fine-tune the model in 16GB GPUs
-    if args.use_8bit_adam:
-        try:
-            import bitsandbytes as bnb
-        except ImportError:
-            raise ImportError(
-                "To use 8-bit Adam, please install the bitsandbytes library: `pip install bitsandbytes`."
-            )
-
-        optimizer_class = bnb.optim.AdamW8bit
-    else:
-        optimizer_class = torch.optim.AdamW
-
-    params_to_optimize = (
-        itertools.chain(unet.parameters(), text_encoder.parameters()) if args.train_text_encoder else unet.parameters()
-    )
-    optimizer = optimizer_class(
-        params_to_optimize,
-        lr=args.learning_rate,
-        betas=(args.adam_beta1, args.adam_beta2),
-        weight_decay=args.adam_weight_decay,
-        eps=args.adam_epsilon,
-    )
-
-    noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")
-
-    train_dataset = DreamBoothDataset(
-        instance_data_root=args.instance_data_dir,
-        instance_prompt=args.instance_prompt,
-        class_data_root=args.class_data_dir if args.with_prior_preservation else None,
-        class_prompt=args.class_prompt,
-        tokenizer=tokenizer,
-        size=args.resolution,
-        center_crop=args.center_crop,
-    )
-
-    def collate_fn(examples):
-        input_ids = [example["instance_prompt_ids"] for example in examples]
-        pixel_values = [example["instance_images"] for example in examples]
-
-        # Concat class and instance examples for prior preservation.
-        # We do this to avoid doing two forward passes.
-        if args.with_prior_preservation:
-            input_ids += [example["class_prompt_ids"] for example in examples]
-            pixel_values += [example["class_images"] for example in examples]
-            pior_pil = [example["class_PIL_images"] for example in examples]
-
-        masks = []
-        masked_images = []
-        for example in examples:
-            pil_image = example["PIL_images"]
-            # generate a random mask
-            mask = random_mask(pil_image.size, 1, False)
-            # prepare mask and masked image
-            mask, masked_image = prepare_mask_and_masked_image(pil_image, mask)
-
-            masks.append(mask)
-            masked_images.append(masked_image)
-
-        if args.with_prior_preservation:
-            for pil_image in pior_pil:
-                # generate a random mask
-                mask = random_mask(pil_image.size, 1, False)
-                # prepare mask and masked image
-                mask, masked_image = prepare_mask_and_masked_image(pil_image, mask)
-
-                masks.append(mask)
-                masked_images.append(masked_image)
-
-        pixel_values = torch.stack(pixel_values)
-        pixel_values = pixel_values.to(memory_format=torch.contiguous_format).float()
-
-        input_ids = tokenizer.pad({"input_ids": input_ids}, padding=True, return_tensors="pt").input_ids
-        masks = torch.stack(masks)
-        masked_images = torch.stack(masked_images)
-        batch = {"input_ids": input_ids, "pixel_values": pixel_values, "masks": masks, "masked_images": masked_images}
-        return batch
-
-    train_dataloader = torch.utils.data.DataLoader(
-        train_dataset, batch_size=args.train_batch_size, shuffle=True, collate_fn=collate_fn
-    )
-
-    # Scheduler and math around the number of training steps.
-    overrode_max_train_steps = False
-    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
-    if args.max_train_steps is None:
-        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
-        overrode_max_train_steps = True
-
-    lr_scheduler = get_scheduler(
-        args.lr_scheduler,
-        optimizer=optimizer,
-        num_warmup_steps=args.lr_warmup_steps * args.gradient_accumulation_steps,
-        num_training_steps=args.max_train_steps * args.gradient_accumulation_steps,
-    )
-
-    if args.train_text_encoder:
-        unet, text_encoder, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
-            unet, text_encoder, optimizer, train_dataloader, lr_scheduler
-        )
-    else:
-        unet, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
-            unet, optimizer, train_dataloader, lr_scheduler
-        )
-
-    weight_dtype = torch.float32
-    if args.mixed_precision == "fp16":
-        weight_dtype = torch.float16
-    elif args.mixed_precision == "bf16":
-        weight_dtype = torch.bfloat16
-
-    # Move text_encode and vae to gpu.
-    # For mixed precision training we cast the text_encoder and vae weights to half-precision
-    # as these models are only used for inference, keeping weights in full precision is not required.
-    vae.to(accelerator.device, dtype=weight_dtype)
-    if not args.train_text_encoder:
-        text_encoder.to(accelerator.device, dtype=weight_dtype)
-
-    # We need to recalculate our total training steps as the size of the training dataloader may have changed.
-    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
-    if overrode_max_train_steps:
-        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
-    # Afterwards we recalculate our number of training epochs
-    args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
-
-    # We need to initialize the trackers we use, and also store our configuration.
-    # The trackers initializes automatically on the main process.
-    if accelerator.is_main_process:
-        accelerator.init_trackers("dreambooth", config=vars(args))
-
-    # Train!
-    total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
-
-    logger.info("***** Running training *****")
-    logger.info(f"  Num examples = {len(train_dataset)}")
-    logger.info(f"  Num batches each epoch = {len(train_dataloader)}")
-    logger.info(f"  Num Epochs = {args.num_train_epochs}")
-    logger.info(f"  Instantaneous batch size per device = {args.train_batch_size}")
-    logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
-    logger.info(f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
-    logger.info(f"  Total optimization steps = {args.max_train_steps}")
-    # Only show the progress bar once on each machine.
-    progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process)
-    progress_bar.set_description("Steps")
-    global_step = 0
-
-    for epoch in range(args.num_train_epochs):
-        unet.train()
-        for step, batch in enumerate(train_dataloader):
-            with accelerator.accumulate(unet):
-                # Convert images to latent space
-
-                latents = vae.encode(batch["pixel_values"].to(dtype=weight_dtype)).latent_dist.sample()
-                latents = latents * 0.18215
-
-                # Convert masked images to latent space
-                masked_latents = vae.encode(
-                    batch["masked_images"].reshape(batch["pixel_values"].shape).to(dtype=weight_dtype)
-                ).latent_dist.sample()
-                masked_latents = masked_latents * 0.18215
-
-                masks = batch["masks"]
-                # resize the mask to latents shape as we concatenate the mask to the latents
-                mask = torch.stack(
-                    [
-                        torch.nn.functional.interpolate(mask, size=(args.resolution // 8, args.resolution // 8))
-                        for mask in masks
-                    ]
-                )
-                mask = mask.reshape(-1, 1, args.resolution // 8, args.resolution // 8)
-
-                # Sample noise that we'll add to the latents
-                noise = torch.randn_like(latents)
-                bsz = latents.shape[0]
-                # Sample a random timestep for each image
-                timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps, (bsz,), device=latents.device)
-                timesteps = timesteps.long()
-
-                # Add noise to the latents according to the noise magnitude at each timestep
-                # (this is the forward diffusion process)
-                noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)
-
-                # concatenate the noised latents with the mask and the masked latents
-                latent_model_input = torch.cat([noisy_latents, mask, masked_latents], dim=1)
-
-                # Get the text embedding for conditioning
-                encoder_hidden_states = text_encoder(batch["input_ids"])[0]
-
-                # Predict the noise residual
-                noise_pred = unet(latent_model_input, timesteps, encoder_hidden_states).sample
-
-                # Get the target for loss depending on the prediction type
-                if noise_scheduler.config.prediction_type == "epsilon":
-                    target = noise
-                elif noise_scheduler.config.prediction_type == "v_prediction":
-                    target = noise_scheduler.get_velocity(latents, noise, timesteps)
-                else:
-                    raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}")
-
-                if args.with_prior_preservation:
-                    # Chunk the noise and noise_pred into two parts and compute the loss on each part separately.
-                    noise_pred, noise_pred_prior = torch.chunk(noise_pred, 2, dim=0)
-                    target, target_prior = torch.chunk(target, 2, dim=0)
-
-                    # Compute instance loss
-                    loss = F.mse_loss(noise_pred.float(), target.float(), reduction="none").mean([1, 2, 3]).mean()
-
-                    # Compute prior loss
-                    prior_loss = F.mse_loss(noise_pred_prior.float(), target_prior.float(), reduction="mean")
-
-                    # Add the prior loss to the instance loss.
-                    loss = loss + args.prior_loss_weight * prior_loss
-                else:
-                    loss = F.mse_loss(noise_pred.float(), target.float(), reduction="mean")
-
-                accelerator.backward(loss)
-                if accelerator.sync_gradients:
-                    params_to_clip = (
-                        itertools.chain(unet.parameters(), text_encoder.parameters())
-                        if args.train_text_encoder
-                        else unet.parameters()
-                    )
-                    accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm)
-                optimizer.step()
-                lr_scheduler.step()
-                optimizer.zero_grad()
-
-            # Checks if the accelerator has performed an optimization step behind the scenes
-            if accelerator.sync_gradients:
-                progress_bar.update(1)
-                global_step += 1
-
-            logs = {"loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]}
-            progress_bar.set_postfix(**logs)
-            accelerator.log(logs, step=global_step)
-
-            if global_step >= args.max_train_steps:
-                break
-
-        accelerator.wait_for_everyone()
-
-    # Create the pipeline using using the trained modules and save it.
-    if accelerator.is_main_process:
-        pipeline = StableDiffusionPipeline.from_pretrained(
-            args.pretrained_model_name_or_path,
-            unet=accelerator.unwrap_model(unet),
-            text_encoder=accelerator.unwrap_model(text_encoder),
-        )
-        pipeline.save_pretrained(args.output_dir)
-
-        if args.push_to_hub:
-            repo.push_to_hub(commit_message="End of training", blocking=False, auto_lfs_prune=True)
-
-    accelerator.end_training()
-
-
-if __name__ == "__main__":
-    main()
--- a/examples/rl/README.md
+++ b/examples/rl/README.md
@@ -1,22 +0,0 @@
-# Overview
-
-These examples show how to run [Diffuser](https://arxiv.org/abs/2205.09991) in Diffusers. 
-There are two ways to use the script, `run_diffuser_locomotion.py`.
-
-The key option is a change of the variable `n_guide_steps`. 
-When `n_guide_steps=0`, the trajectories are sampled from the diffusion model, but not fine-tuned to maximize reward in the environment.
-By default, `n_guide_steps=2` to match the original implementation.
- 
-
-You will need some RL specific requirements to run the examples:
-
-```
-pip install -f https://download.pytorch.org/whl/torch_stable.html \
-                free-mujoco-py \
-                einops \
-                gym==0.24.1 \
-                protobuf==3.20.1 \
-                git+https://github.com/rail-berkeley/d4rl.git \
-                mediapy \
-                Pillow==9.0.0
-```
--- a/examples/rl/run_diffuser_locomotion.py
+++ b/examples/rl/run_diffuser_locomotion.py
@@ -1,59 +0,0 @@
-import d4rl  # noqa
-import gym
-import tqdm
-from diffusers.experimental import ValueGuidedRLPipeline
-
-
-config = dict(
-    n_samples=64,
-    horizon=32,
-    num_inference_steps=20,
-    n_guide_steps=2,  # can set to 0 for faster sampling, does not use value network
-    scale_grad_by_std=True,
-    scale=0.1,
-    eta=0.0,
-    t_grad_cutoff=2,
-    device="cpu",
-)
-
-
-if __name__ == "__main__":
-    env_name = "hopper-medium-v2"
-    env = gym.make(env_name)
-
-    pipeline = ValueGuidedRLPipeline.from_pretrained(
-        "bglick13/hopper-medium-v2-value-function-hor32",
-        env=env,
-    )
-
-    env.seed(0)
-    obs = env.reset()
-    total_reward = 0
-    total_score = 0
-    T = 1000
-    rollout = [obs.copy()]
-    try:
-        for t in tqdm.tqdm(range(T)):
-            # call the policy
-            denorm_actions = pipeline(obs, planning_horizon=32)
-
-            # execute action in environment
-            next_observation, reward, terminal, _ = env.step(denorm_actions)
-            score = env.get_normalized_score(total_reward)
-
-            # update return
-            total_reward += reward
-            total_score += score
-            print(
-                f"Step: {t}, Reward: {reward}, Total Reward: {total_reward}, Score: {score}, Total Score:"
-                f" {total_score}"
-            )
-
-            # save observations for rendering
-            rollout.append(next_observation.copy())
-
-            obs = next_observation
-    except KeyboardInterrupt:
-        pass
-
-    print(f"Total reward: {total_reward}")
--- a/examples/text_to_image/README.md
+++ b/examples/text_to_image/README.md
@@ -12,18 +12,9 @@ ___This script is experimental. The script fine-tunes the whole model and often

 Before running the scripts, make sure to install the library's training dependencies:

-**Important**
-
-To make sure you can successfully run the latest versions of the example scripts, we highly recommend **installing from source** and keeping the install up to date as we update the example scripts frequently and install some example-specific requirements. To do this, execute the following steps in a new virtual environment:
 ```bash
-git clone https://github.com/huggingface/diffusers
-cd diffusers
-pip install .
-```
-
-Then cd in the example folder  and run
-```bash
-pip install -r requirements.txt
+pip install git+https://github.com/huggingface/diffusers.git
+pip install -U -r requirements.txt
 ```

 And initialize an [🤗Accelerate](https://github.com/huggingface/accelerate/) environment with:
@@ -51,13 +42,11 @@ If you have already cloned the repo, then you won't need to go through these ste
 #### Hardware
 With `gradient_checkpointing` and `mixed_precision` it should be possible to fine tune the model on a single 24GB GPU. For higher `batch_size` and faster training it's better to use GPUs with >30GB memory.

-**___Note: Change the `resolution` to 768 if you are using the [stable-diffusion-2](https://huggingface.co/stabilityai/stable-diffusion-2) 768x768 model.___**
-
 ```bash
 export MODEL_NAME="CompVis/stable-diffusion-v1-4"
 export dataset_name="lambdalabs/pokemon-blip-captions"

-accelerate launch --mixed_precision="fp16"  train_text_to_image.py \
+accelerate launch train_text_to_image.py \
  --pretrained_model_name_or_path=$MODEL_NAME \
  --dataset_name=$dataset_name \
  --use_ema \
@@ -65,6 +54,7 @@ accelerate launch --mixed_precision="fp16"  train_text_to_image.py \
  --train_batch_size=1 \
  --gradient_accumulation_steps=4 \
  --gradient_checkpointing \
+  --mixed_precision="fp16" \
  --max_train_steps=15000 \
  --learning_rate=1e-05 \
  --max_grad_norm=1 \
@@ -80,7 +70,7 @@ If you wish to use custom loading logic, you should modify the script, we have l
 export MODEL_NAME="CompVis/stable-diffusion-v1-4"
 export TRAIN_DIR="path_to_your_dataset"

-accelerate launch --mixed_precision="fp16" train_text_to_image.py \
+accelerate launch train_text_to_image.py \
  --pretrained_model_name_or_path=$MODEL_NAME \
  --train_data_dir=$TRAIN_DIR \
  --use_ema \
@@ -88,6 +78,7 @@ accelerate launch --mixed_precision="fp16" train_text_to_image.py \
  --train_batch_size=1 \
  --gradient_accumulation_steps=4 \
  --gradient_checkpointing \
+  --mixed_precision="fp16" \
  --max_train_steps=15000 \
  --learning_rate=1e-05 \
  --max_grad_norm=1 \
--- a/examples/text_to_image/requirements.txt
+++ b/examples/text_to_image/requirements.txt
@@ -1,7 +1,7 @@
+diffusers==0.4.1
 accelerate
 torchvision
 transformers>=4.21.0
-datasets
 ftfy
 tensorboard
 modelcards
--- a/examples/text_to_image/requirements_flax.txt
+++ b/examples/text_to_image/requirements_flax.txt
@@ -1,5 +1,5 @@
+diffusers>=0.5.1
 transformers>=4.21.0
-datasets
 flax
 optax
 torch
--- a/examples/text_to_image/train_text_to_image.py
+++ b/examples/text_to_image/train_text_to_image.py
@@ -15,18 +15,15 @@ from accelerate import Accelerator
 from accelerate.logging import get_logger
 from accelerate.utils import set_seed
 from datasets import load_dataset
-from diffusers import AutoencoderKL, DDPMScheduler, StableDiffusionPipeline, UNet2DConditionModel
+from diffusers import AutoencoderKL, DDPMScheduler, PNDMScheduler, StableDiffusionPipeline, UNet2DConditionModel
 from diffusers.optimization import get_scheduler
-from diffusers.utils import check_min_version
+from diffusers.pipelines.stable_diffusion import StableDiffusionSafetyChecker
 from huggingface_hub import HfFolder, Repository, whoami
 from torchvision import transforms
 from tqdm.auto import tqdm
-from transformers import CLIPTextModel, CLIPTokenizer
+from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer


-# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.10.0.dev0")
-
 logger = get_logger(__name__)


@@ -39,13 +36,6 @@ def parse_args():
        required=True,
        help="Path to pretrained model or model identifier from huggingface.co/models.",
    )
-    parser.add_argument(
-        "--revision",
-        type=str,
-        default=None,
-        required=False,
-        help="Revision of pretrained model identifier from huggingface.co/models.",
-    )
    parser.add_argument(
        "--dataset_name",
        type=str,
@@ -196,12 +186,12 @@ def parse_args():
    parser.add_argument(
        "--mixed_precision",
        type=str,
-        default=None,
+        default="no",
        choices=["no", "fp16", "bf16"],
        help=(
-            "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >="
-            " 1.10.and an Nvidia Ampere GPU.  Default to the value of accelerate config of the current system or the"
-            " flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config."
+            "Whether to use mixed precision. Choose "
+            "between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >= 1.10 "
+            "and an Nvidia Ampere GPU."
        ),
    )
    parser.add_argument(
@@ -345,24 +335,10 @@ def main():
            os.makedirs(args.output_dir, exist_ok=True)

    # Load models and create wrapper for stable diffusion
-    tokenizer = CLIPTokenizer.from_pretrained(
-        args.pretrained_model_name_or_path, subfolder="tokenizer", revision=args.revision
-    )
-    text_encoder = CLIPTextModel.from_pretrained(
-        args.pretrained_model_name_or_path,
-        subfolder="text_encoder",
-        revision=args.revision,
-    )
-    vae = AutoencoderKL.from_pretrained(
-        args.pretrained_model_name_or_path,
-        subfolder="vae",
-        revision=args.revision,
-    )
-    unet = UNet2DConditionModel.from_pretrained(
-        args.pretrained_model_name_or_path,
-        subfolder="unet",
-        revision=args.revision,
-    )
+    tokenizer = CLIPTokenizer.from_pretrained(args.pretrained_model_name_or_path, subfolder="tokenizer")
+    text_encoder = CLIPTextModel.from_pretrained(args.pretrained_model_name_or_path, subfolder="text_encoder")
+    vae = AutoencoderKL.from_pretrained(args.pretrained_model_name_or_path, subfolder="vae")
+    unet = UNet2DConditionModel.from_pretrained(args.pretrained_model_name_or_path, subfolder="unet")

    # Freeze vae and text_encoder
    vae.requires_grad_(False)
@@ -396,7 +372,11 @@ def main():
        weight_decay=args.adam_weight_decay,
        eps=args.adam_epsilon,
    )
-    noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")
+
+    # TODO (patil-suraj): load scheduler using args
+    noise_scheduler = DDPMScheduler(
+        beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", num_train_timesteps=1000
+    )

    # Get the datasets: you can either provide your own training and evaluation files (see below)
    # or specify a Dataset from the hub (the dataset will be downloaded automatically from the datasets Hub).
@@ -520,9 +500,9 @@ def main():
    )

    weight_dtype = torch.float32
-    if accelerator.mixed_precision == "fp16":
+    if args.mixed_precision == "fp16":
        weight_dtype = torch.float16
-    elif accelerator.mixed_precision == "bf16":
+    elif args.mixed_precision == "bf16":
        weight_dtype = torch.bfloat16

    # Move text_encode and vae to gpu.
@@ -586,17 +566,9 @@ def main():
                # Get the text embedding for conditioning
                encoder_hidden_states = text_encoder(batch["input_ids"])[0]

-                # Get the target for loss depending on the prediction type
-                if noise_scheduler.config.prediction_type == "epsilon":
-                    target = noise
-                elif noise_scheduler.config.prediction_type == "v_prediction":
-                    target = noise_scheduler.get_velocity(latents, noise, timesteps)
-                else:
-                    raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}")
-
                # Predict the noise residual and compute loss
-                model_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample
-                loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean")
+                noise_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample
+                loss = F.mse_loss(noise_pred.float(), noise.float(), reduction="mean")

                # Gather the losses across all processes for logging (if we use distributed training).
                avg_loss = accelerator.gather(loss.repeat(args.train_batch_size)).mean()
@@ -632,12 +604,16 @@ def main():
        if args.use_ema:
            ema_unet.copy_to(unet.parameters())

-        pipeline = StableDiffusionPipeline.from_pretrained(
-            args.pretrained_model_name_or_path,
+        pipeline = StableDiffusionPipeline(
            text_encoder=text_encoder,
            vae=vae,
            unet=unet,
-            revision=args.revision,
+            tokenizer=tokenizer,
+            scheduler=PNDMScheduler(
+                beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", skip_prk_steps=True
+            ),
+            safety_checker=StableDiffusionSafetyChecker.from_pretrained("CompVis/stable-diffusion-safety-checker"),
+            feature_extractor=CLIPFeatureExtractor.from_pretrained("openai/clip-vit-base-patch32"),
        )
        pipeline.save_pretrained(args.output_dir)

--- a/examples/text_to_image/train_text_to_image_flax.py
+++ b/examples/text_to_image/train_text_to_image_flax.py
@@ -23,7 +23,6 @@ from diffusers import (
    FlaxUNet2DConditionModel,
 )
 from diffusers.pipelines.stable_diffusion import FlaxStableDiffusionSafetyChecker
-from diffusers.utils import check_min_version
 from flax import jax_utils
 from flax.training import train_state
 from flax.training.common_utils import shard
@@ -33,9 +32,6 @@ from tqdm.auto import tqdm
 from transformers import CLIPFeatureExtractor, CLIPTokenizer, FlaxCLIPTextModel, set_seed


-# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.10.0.dev0")
-
 logger = logging.getLogger(__name__)


@@ -383,9 +379,7 @@ def main():

    # Load models and create wrapper for stable diffusion
    tokenizer = CLIPTokenizer.from_pretrained(args.pretrained_model_name_or_path, subfolder="tokenizer")
-    text_encoder = FlaxCLIPTextModel.from_pretrained(
-        args.pretrained_model_name_or_path, subfolder="text_encoder", dtype=weight_dtype
-    )
+    text_encoder = FlaxCLIPTextModel.from_pretrained("openai/clip-vit-large-patch14", dtype=weight_dtype)
    vae, vae_params = FlaxAutoencoderKL.from_pretrained(
        args.pretrained_model_name_or_path, subfolder="vae", dtype=weight_dtype
    )
--- a/examples/textual_inversion/README.md
+++ b/examples/textual_inversion/README.md
@@ -16,18 +16,8 @@ Colab for inference

 Before running the scripts, make sure to install the library's training dependencies:

-**Important**
-
-To make sure you can successfully run the latest versions of the example scripts, we highly recommend **installing from source** and keeping the install up to date as we update the example scripts frequently and install some example-specific requirements. To do this, execute the following steps in a new virtual environment:
 ```bash
-git clone https://github.com/huggingface/diffusers
-cd diffusers
-pip install .
-```
-
-Then cd in the example folder  and run
-```bash
-pip install -r requirements.txt
+pip install diffusers"[training]" accelerate "transformers>=4.21.0"
 ```

 And initialize an [🤗Accelerate](https://github.com/huggingface/accelerate/) environment with:
@@ -39,7 +29,7 @@ accelerate config

 ### Cat toy example

-You need to accept the model license before downloading or using the weights. In this example we'll use model version `v1-5`, so you'll need to visit [its card](https://huggingface.co/runwayml/stable-diffusion-v1-5), read the license and tick the checkbox if you agree. 
+You need to accept the model license before downloading or using the weights. In this example we'll use model version `v1-4`, so you'll need to visit [its card](https://huggingface.co/CompVis/stable-diffusion-v1-4), read the license and tick the checkbox if you agree. 

 You have to be a registered user in 🤗 Hugging Face Hub, and you'll also need to use an access token for the code to work. For more information on access tokens, please refer to [this section of the documentation](https://huggingface.co/docs/hub/security-tokens).

@@ -57,8 +47,6 @@ Now let's get our dataset.Download 3-4 images from [here](https://drive.google.c

 And launch the training using

-**___Note: Change the `resolution` to 768 if you are using the [stable-diffusion-2](https://huggingface.co/stabilityai/stable-diffusion-2) 768x768 model.___**
-
 ```bash
 export MODEL_NAME="runwayml/stable-diffusion-v1-5"
 export DATA_DIR="path-to-dir-containing-images"
@@ -123,4 +111,4 @@ python textual_inversion_flax.py \
  --learning_rate=5.0e-04 --scale_lr \
  --output_dir="textual_inversion_cat"
 ```
-It should be at least 70% faster than the PyTorch script with the same configuration.
+It should be at least 70% faster than the PyTorch script with the same configuration.
--- a/examples/textual_inversion/requirements.txt
+++ b/examples/textual_inversion/requirements.txt
@@ -1,6 +1,3 @@
 accelerate
 torchvision
 transformers>=4.21.0
-ftfy
-tensorboard
-modelcards
--- a/examples/textual_inversion/requirements_flax.txt
+++ b/examples/textual_inversion/requirements_flax.txt
@@ -1,3 +1,4 @@
+diffusers>=0.5.1
 transformers>=4.21.0
 flax
 optax
--- a/examples/textual_inversion/textual_inversion.py
+++ b/examples/textual_inversion/textual_inversion.py
@@ -19,48 +19,21 @@ from accelerate.utils import set_seed
 from diffusers import AutoencoderKL, DDPMScheduler, PNDMScheduler, StableDiffusionPipeline, UNet2DConditionModel
 from diffusers.optimization import get_scheduler
 from diffusers.pipelines.stable_diffusion import StableDiffusionSafetyChecker
-from diffusers.utils import check_min_version
 from huggingface_hub import HfFolder, Repository, whoami
-
-# TODO: remove and import from diffusers.utils when the new version of diffusers is released
-from packaging import version
 from PIL import Image
 from torchvision import transforms
 from tqdm.auto import tqdm
 from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer


-if version.parse(version.parse(PIL.__version__).base_version) >= version.parse("9.1.0"):
-    PIL_INTERPOLATION = {
-        "linear": PIL.Image.Resampling.BILINEAR,
-        "bilinear": PIL.Image.Resampling.BILINEAR,
-        "bicubic": PIL.Image.Resampling.BICUBIC,
-        "lanczos": PIL.Image.Resampling.LANCZOS,
-        "nearest": PIL.Image.Resampling.NEAREST,
-    }
-else:
-    PIL_INTERPOLATION = {
-        "linear": PIL.Image.LINEAR,
-        "bilinear": PIL.Image.BILINEAR,
-        "bicubic": PIL.Image.BICUBIC,
-        "lanczos": PIL.Image.LANCZOS,
-        "nearest": PIL.Image.NEAREST,
-    }
-# ------------------------------------------------------------------------------
-
-
-# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.10.0.dev0")
-
-
 logger = get_logger(__name__)


-def save_progress(text_encoder, placeholder_token_id, accelerator, args, save_path):
+def save_progress(text_encoder, placeholder_token_id, accelerator, args):
    logger.info("Saving embeddings")
    learned_embeds = accelerator.unwrap_model(text_encoder).get_input_embeddings().weight[placeholder_token_id]
    learned_embeds_dict = {args.placeholder_token: learned_embeds.detach().cpu()}
-    torch.save(learned_embeds_dict, save_path)
+    torch.save(learned_embeds_dict, os.path.join(args.output_dir, "learned_embeds.bin"))


 def parse_args():
@@ -71,12 +44,6 @@ def parse_args():
        default=500,
        help="Save learned_embeds.bin every X updates steps.",
    )
-    parser.add_argument(
-        "--only_save_embeds",
-        action="store_true",
-        default=False,
-        help="Save only the embeddings for the new concept.",
-    )
    parser.add_argument(
        "--pretrained_model_name_or_path",
        type=str,
@@ -84,13 +51,6 @@ def parse_args():
        required=True,
        help="Path to pretrained model or model identifier from huggingface.co/models.",
    )
-    parser.add_argument(
-        "--revision",
-        type=str,
-        default=None,
-        required=False,
-        help="Revision of pretrained model identifier from huggingface.co/models.",
-    )
    parser.add_argument(
        "--tokenizer_name",
        type=str,
@@ -300,10 +260,10 @@ class TextualInversionDataset(Dataset):
            self._length = self.num_images * repeats

        self.interpolation = {
-            "linear": PIL_INTERPOLATION["linear"],
-            "bilinear": PIL_INTERPOLATION["bilinear"],
-            "bicubic": PIL_INTERPOLATION["bicubic"],
-            "lanczos": PIL_INTERPOLATION["lanczos"],
+            "linear": PIL.Image.LINEAR,
+            "bilinear": PIL.Image.BILINEAR,
+            "bicubic": PIL.Image.BICUBIC,
+            "lanczos": PIL.Image.LANCZOS,
        }[interpolation]

        self.templates = imagenet_style_templates_small if learnable_property == "style" else imagenet_templates_small
@@ -423,21 +383,9 @@ def main():
    placeholder_token_id = tokenizer.convert_tokens_to_ids(args.placeholder_token)

    # Load models and create wrapper for stable diffusion
-    text_encoder = CLIPTextModel.from_pretrained(
-        args.pretrained_model_name_or_path,
-        subfolder="text_encoder",
-        revision=args.revision,
-    )
-    vae = AutoencoderKL.from_pretrained(
-        args.pretrained_model_name_or_path,
-        subfolder="vae",
-        revision=args.revision,
-    )
-    unet = UNet2DConditionModel.from_pretrained(
-        args.pretrained_model_name_or_path,
-        subfolder="unet",
-        revision=args.revision,
-    )
+    text_encoder = CLIPTextModel.from_pretrained(args.pretrained_model_name_or_path, subfolder="text_encoder")
+    vae = AutoencoderKL.from_pretrained(args.pretrained_model_name_or_path, subfolder="vae")
+    unet = UNet2DConditionModel.from_pretrained(args.pretrained_model_name_or_path, subfolder="unet")

    # Resize the token embeddings as we are adding new special tokens to the tokenizer
    text_encoder.resize_token_embeddings(len(tokenizer))
@@ -471,7 +419,13 @@ def main():
        eps=args.adam_epsilon,
    )

-    noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")
+    # TODO (patil-suraj): load scheduler using args
+    noise_scheduler = DDPMScheduler(
+        beta_start=0.00085,
+        beta_end=0.012,
+        beta_schedule="scaled_linear",
+        num_train_timesteps=1000,
+    )

    train_dataset = TextualInversionDataset(
        data_root=args.train_data_dir,
@@ -562,17 +516,9 @@ def main():
                encoder_hidden_states = text_encoder(batch["input_ids"])[0]

                # Predict the noise residual
-                model_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample
+                noise_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample

-                # Get the target for loss depending on the prediction type
-                if noise_scheduler.config.prediction_type == "epsilon":
-                    target = noise
-                elif noise_scheduler.config.prediction_type == "v_prediction":
-                    target = noise_scheduler.get_velocity(latents, noise, timesteps)
-                else:
-                    raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}")
-
-                loss = F.mse_loss(model_pred, target, reduction="none").mean([1, 2, 3]).mean()
+                loss = F.mse_loss(noise_pred, noise, reduction="none").mean([1, 2, 3]).mean()
                accelerator.backward(loss)

                # Zero out the gradients for all token embeddings except the newly added
@@ -594,8 +540,7 @@ def main():
                progress_bar.update(1)
                global_step += 1
                if global_step % args.save_steps == 0:
-                    save_path = os.path.join(args.output_dir, f"learned_embeds-steps-{global_step}.bin")
-                    save_progress(text_encoder, placeholder_token_id, accelerator, args, save_path)
+                    save_progress(text_encoder, placeholder_token_id, accelerator, args)

            logs = {"loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]}
            progress_bar.set_postfix(**logs)
@@ -608,25 +553,20 @@ def main():

    # Create the pipeline using using the trained modules and save it.
    if accelerator.is_main_process:
-        if args.push_to_hub and args.only_save_embeds:
-            logger.warn("Enabling full model saving because --push_to_hub=True was specified.")
-            save_full_model = True
-        else:
-            save_full_model = not args.only_save_embeds
-        if save_full_model:
-            pipeline = StableDiffusionPipeline(
-                text_encoder=accelerator.unwrap_model(text_encoder),
-                vae=vae,
-                unet=unet,
-                tokenizer=tokenizer,
-                scheduler=PNDMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler"),
-                safety_checker=StableDiffusionSafetyChecker.from_pretrained("CompVis/stable-diffusion-safety-checker"),
-                feature_extractor=CLIPFeatureExtractor.from_pretrained("openai/clip-vit-base-patch32"),
-            )
-            pipeline.save_pretrained(args.output_dir)
-        # Save the newly trained embeddings
-        save_path = os.path.join(args.output_dir, "learned_embeds.bin")
-        save_progress(text_encoder, placeholder_token_id, accelerator, args, save_path)
+        pipeline = StableDiffusionPipeline(
+            text_encoder=accelerator.unwrap_model(text_encoder),
+            vae=vae,
+            unet=unet,
+            tokenizer=tokenizer,
+            scheduler=PNDMScheduler(
+                beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", skip_prk_steps=True
+            ),
+            safety_checker=StableDiffusionSafetyChecker.from_pretrained("CompVis/stable-diffusion-safety-checker"),
+            feature_extractor=CLIPFeatureExtractor.from_pretrained("openai/clip-vit-base-patch32"),
+        )
+        pipeline.save_pretrained(args.output_dir)
+        # Also save the newly trained embeddings
+        save_progress(text_encoder, placeholder_token_id, accelerator, args)

        if args.push_to_hub:
            repo.push_to_hub(commit_message="End of training", blocking=False, auto_lfs_prune=True)
--- a/examples/textual_inversion/textual_inversion_flax.py
+++ b/examples/textual_inversion/textual_inversion_flax.py
@@ -24,41 +24,16 @@ from diffusers import (
    FlaxUNet2DConditionModel,
 )
 from diffusers.pipelines.stable_diffusion import FlaxStableDiffusionSafetyChecker
-from diffusers.utils import check_min_version
 from flax import jax_utils
 from flax.training import train_state
 from flax.training.common_utils import shard
 from huggingface_hub import HfFolder, Repository, whoami
-
-# TODO: remove and import from diffusers.utils when the new version of diffusers is released
-from packaging import version
 from PIL import Image
 from torchvision import transforms
 from tqdm.auto import tqdm
 from transformers import CLIPFeatureExtractor, CLIPTokenizer, FlaxCLIPTextModel, set_seed


-if version.parse(version.parse(PIL.__version__).base_version) >= version.parse("9.1.0"):
-    PIL_INTERPOLATION = {
-        "linear": PIL.Image.Resampling.BILINEAR,
-        "bilinear": PIL.Image.Resampling.BILINEAR,
-        "bicubic": PIL.Image.Resampling.BICUBIC,
-        "lanczos": PIL.Image.Resampling.LANCZOS,
-        "nearest": PIL.Image.Resampling.NEAREST,
-    }
-else:
-    PIL_INTERPOLATION = {
-        "linear": PIL.Image.LINEAR,
-        "bilinear": PIL.Image.BILINEAR,
-        "bicubic": PIL.Image.BICUBIC,
-        "lanczos": PIL.Image.LANCZOS,
-        "nearest": PIL.Image.NEAREST,
-    }
-# ------------------------------------------------------------------------------
-
-# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.10.0.dev0")
-
 logger = logging.getLogger(__name__)


@@ -271,10 +246,10 @@ class TextualInversionDataset(Dataset):
            self._length = self.num_images * repeats

        self.interpolation = {
-            "linear": PIL_INTERPOLATION["linear"],
-            "bilinear": PIL_INTERPOLATION["bilinear"],
-            "bicubic": PIL_INTERPOLATION["bicubic"],
-            "lanczos": PIL_INTERPOLATION["lanczos"],
+            "linear": PIL.Image.LINEAR,
+            "bilinear": PIL.Image.BILINEAR,
+            "bicubic": PIL.Image.BICUBIC,
+            "lanczos": PIL.Image.LANCZOS,
        }[interpolation]

        self.templates = imagenet_style_templates_small if learnable_property == "style" else imagenet_templates_small
@@ -416,7 +391,7 @@ def main():
    placeholder_token_id = tokenizer.convert_tokens_to_ids(args.placeholder_token)

    # Load models and create wrapper for stable diffusion
-    text_encoder = FlaxCLIPTextModel.from_pretrained(args.pretrained_model_name_or_path, subfolder="text_encoder")
+    text_encoder = FlaxCLIPTextModel.from_pretrained("openai/clip-vit-large-patch14")
    vae, vae_params = FlaxAutoencoderKL.from_pretrained(args.pretrained_model_name_or_path, subfolder="vae")
    unet, unet_params = FlaxUNet2DConditionModel.from_pretrained(args.pretrained_model_name_or_path, subfolder="unet")

--- a/examples/unconditional_image_generation/README.md
+++ b/examples/unconditional_image_generation/README.md
@@ -6,21 +6,10 @@ Creating a training image set is [described in a different document](https://hug

 Before running the scripts, make sure to install the library's training dependencies:

-**Important**
-
-To make sure you can successfully run the latest versions of the example scripts, we highly recommend **installing from source** and keeping the install up to date as we update the example scripts frequently and install some example-specific requirements. To do this, execute the following steps in a new virtual environment:
 ```bash
-git clone https://github.com/huggingface/diffusers
-cd diffusers
-pip install .
+pip install "diffusers[training]" accelerate datasets tensorboard
 ```

-Then cd in the example folder  and run
-```bash
-pip install -r requirements.txt
-```
-
-
 And initialize an [🤗Accelerate](https://github.com/huggingface/accelerate/) environment with:

 ```bash
@@ -138,24 +127,3 @@ dataset.push_to_hub("name_of_your_dataset", private=True)
 and that's it! You can now train your model by simply setting the `--dataset_name` argument to the name of your dataset on the hub.

 More on this can also be found in [this blog post](https://huggingface.co/blog/image-search-datasets).
-
-#### Use ONNXRuntime to accelerate training
-
-In order to leverage onnxruntime to accelerate training, please use train_unconditional_ort.py
-
-The command to train a DDPM UNet model on the Oxford Flowers dataset with onnxruntime:
-
-```bash
-accelerate launch train_unconditional_ort.py \
-  --dataset_name="huggan/flowers-102-categories" \
-  --resolution=64 \
-  --output_dir="ddpm-ema-flowers-64" \
-  --train_batch_size=16 \
-  --num_epochs=1 \
-  --gradient_accumulation_steps=1 \
-  --learning_rate=1e-4 \
-  --lr_warmup_steps=500 \
-  --mixed_precision=fp16
-  ```
-
-Please contact Prathik Rao (prathikr), Sunghoon Choi (hanbitmyths), Ashwini Khade (askhade), or Peng Wang (pengwa) on github with any questions.
--- a/examples/unconditional_image_generation/train_unconditional.py
+++ b/examples/unconditional_image_generation/train_unconditional.py
@@ -1,5 +1,4 @@
 import argparse
-import inspect
 import math
 import os
 from pathlib import Path
@@ -14,7 +13,6 @@ from datasets import load_dataset
 from diffusers import DDPMPipeline, DDPMScheduler, UNet2DModel
 from diffusers.optimization import get_scheduler
 from diffusers.training_utils import EMAModel
-from diffusers.utils import check_min_version
 from huggingface_hub import HfFolder, Repository, whoami
 from torchvision.transforms import (
    CenterCrop,
@@ -28,31 +26,9 @@ from torchvision.transforms import (
 from tqdm.auto import tqdm


-# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.10.0.dev0")
-
-
 logger = get_logger(__name__)


-def _extract_into_tensor(arr, timesteps, broadcast_shape):
-    """
-    Extract values from a 1-D numpy array for a batch of indices.
-
-    :param arr: the 1-D numpy array.
-    :param timesteps: a tensor of indices into the array to extract.
-    :param broadcast_shape: a larger shape of K dimensions with the batch
-                            dimension equal to the length of timesteps.
-    :return: a tensor of shape [batch_size, 1, ...] where the shape has K dims.
-    """
-    if not isinstance(arr, torch.Tensor):
-        arr = torch.from_numpy(arr)
-    res = arr[timesteps].float().to(timesteps.device)
-    while len(res.shape) < len(broadcast_shape):
-        res = res[..., None]
-    return res.expand(broadcast_shape)
-
-
 def parse_args():
    parser = argparse.ArgumentParser(description="Simple example of a training script.")
    parser.add_argument(
@@ -195,17 +171,6 @@ def parse_args():
        ),
    )

-    parser.add_argument(
-        "--prediction_type",
-        type=str,
-        default="epsilon",
-        choices=["epsilon", "sample"],
-        help="Whether the model should predict the 'epsilon'/noise error or directly the reconstructed image 'x0'.",
-    )
-
-    parser.add_argument("--ddpm_num_steps", type=int, default=1000)
-    parser.add_argument("--ddpm_beta_schedule", type=str, default="linear")
-
    args = parser.parse_args()
    env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
    if env_local_rank != -1 and env_local_rank != args.local_rank:
@@ -259,17 +224,7 @@ def main(args):
            "UpBlock2D",
        ),
    )
-    accepts_prediction_type = "prediction_type" in set(inspect.signature(DDPMScheduler.__init__).parameters.keys())
-
-    if accepts_prediction_type:
-        noise_scheduler = DDPMScheduler(
-            num_train_timesteps=args.ddpm_num_steps,
-            beta_schedule=args.ddpm_beta_schedule,
-            prediction_type=args.prediction_type,
-        )
-    else:
-        noise_scheduler = DDPMScheduler(num_train_timesteps=args.ddpm_num_steps, beta_schedule=args.ddpm_beta_schedule)
-
+    noise_scheduler = DDPMScheduler(num_train_timesteps=1000)
    optimizer = torch.optim.AdamW(
        model.parameters(),
        lr=args.learning_rate,
@@ -302,8 +257,6 @@ def main(args):
        images = [augmentations(image.convert("RGB")) for image in examples["image"]]
        return {"input": images}

-    logger.info(f"Dataset size: {len(dataset)}")
-
    dataset.set_transform(transforms)
    train_dataloader = torch.utils.data.DataLoader(
        dataset, batch_size=args.train_batch_size, shuffle=True, num_workers=args.dataloader_num_workers
@@ -322,12 +275,7 @@ def main(args):

    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)

-    ema_model = EMAModel(
-        accelerator.unwrap_model(model),
-        inv_gamma=args.ema_inv_gamma,
-        power=args.ema_power,
-        max_value=args.ema_max_decay,
-    )
+    ema_model = EMAModel(model, inv_gamma=args.ema_inv_gamma, power=args.ema_power, max_value=args.ema_max_decay)

    # Handle the repository creation
    if accelerator.is_main_process:
@@ -371,22 +319,8 @@ def main(args):

            with accelerator.accumulate(model):
                # Predict the noise residual
-                model_output = model(noisy_images, timesteps).sample
-
-                if args.prediction_type == "epsilon":
-                    loss = F.mse_loss(model_output, noise)  # this could have different weights!
-                elif args.prediction_type == "sample":
-                    alpha_t = _extract_into_tensor(
-                        noise_scheduler.alphas_cumprod, timesteps, (clean_images.shape[0], 1, 1, 1)
-                    )
-                    snr_weights = alpha_t / (1 - alpha_t)
-                    loss = snr_weights * F.mse_loss(
-                        model_output, clean_images, reduction="none"
-                    )  # use SNR weighting from distillation paper
-                    loss = loss.mean()
-                else:
-                    raise ValueError(f"Unsupported prediction type: {args.prediction_type}")
-
+                noise_pred = model(noisy_images, timesteps).sample
+                loss = F.mse_loss(noise_pred, noise)
                accelerator.backward(loss)

                if accelerator.sync_gradients:
@@ -419,13 +353,9 @@ def main(args):
                    scheduler=noise_scheduler,
                )

-                generator = torch.Generator(device=pipeline.device).manual_seed(0)
+                generator = torch.manual_seed(0)
                # run pipeline in inference (sample random noise and denoise)
-                images = pipeline(
-                    generator=generator,
-                    batch_size=args.eval_batch_size,
-                    output_type="numpy",
-                ).images
+                images = pipeline(generator=generator, batch_size=args.eval_batch_size, output_type="numpy").images

                # denormalize the images and save to tensorboard
                images_processed = (images * 255).round().astype("uint8")
--- a/examples/unconditional_image_generation/train_unconditional_ort.py
+++ b/examples/unconditional_image_generation/train_unconditional_ort.py
@@ -1,280 +0,0 @@
-import argparse
-import math
-import os
-from pathlib import Path
-from typing import Optional
-
-import torch
-import torch.nn.functional as F
-
-from accelerate import Accelerator
-from accelerate.logging import get_logger
-from datasets import load_dataset
-from diffusers import DDPMPipeline, DDPMScheduler, UNet2DModel
-from diffusers.optimization import get_scheduler
-from diffusers.training_utils import EMAModel
-from diffusers.utils import check_min_version
-from huggingface_hub import HfFolder, Repository, whoami
-from onnxruntime.training.ortmodule import ORTModule
-from torchvision.transforms import (
-    CenterCrop,
-    Compose,
-    InterpolationMode,
-    Normalize,
-    RandomHorizontalFlip,
-    Resize,
-    ToTensor,
-)
-from tqdm.auto import tqdm
-
-
-# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.10.0.dev0")
-
-logger = get_logger(__name__)
-
-
-def get_full_repo_name(model_id: str, organization: Optional[str] = None, token: Optional[str] = None):
-    if token is None:
-        token = HfFolder.get_token()
-    if organization is None:
-        username = whoami(token)["name"]
-        return f"{username}/{model_id}"
-    else:
-        return f"{organization}/{model_id}"
-
-
-def main(args):
-    logging_dir = os.path.join(args.output_dir, args.logging_dir)
-    accelerator = Accelerator(
-        gradient_accumulation_steps=args.gradient_accumulation_steps,
-        mixed_precision=args.mixed_precision,
-        log_with="tensorboard",
-        logging_dir=logging_dir,
-    )
-
-    model = UNet2DModel(
-        sample_size=args.resolution,
-        in_channels=3,
-        out_channels=3,
-        layers_per_block=2,
-        block_out_channels=(128, 128, 256, 256, 512, 512),
-        down_block_types=(
-            "DownBlock2D",
-            "DownBlock2D",
-            "DownBlock2D",
-            "DownBlock2D",
-            "AttnDownBlock2D",
-            "DownBlock2D",
-        ),
-        up_block_types=(
-            "UpBlock2D",
-            "AttnUpBlock2D",
-            "UpBlock2D",
-            "UpBlock2D",
-            "UpBlock2D",
-            "UpBlock2D",
-        ),
-    )
-    model = ORTModule(model)
-    noise_scheduler = DDPMScheduler(num_train_timesteps=1000, tensor_format="pt")
-    optimizer = torch.optim.AdamW(
-        model.parameters(),
-        lr=args.learning_rate,
-        betas=(args.adam_beta1, args.adam_beta2),
-        weight_decay=args.adam_weight_decay,
-        eps=args.adam_epsilon,
-    )
-
-    augmentations = Compose(
-        [
-            Resize(args.resolution, interpolation=InterpolationMode.BILINEAR),
-            CenterCrop(args.resolution),
-            RandomHorizontalFlip(),
-            ToTensor(),
-            Normalize([0.5], [0.5]),
-        ]
-    )
-
-    if args.dataset_name is not None:
-        dataset = load_dataset(
-            args.dataset_name,
-            args.dataset_config_name,
-            cache_dir=args.cache_dir,
-            use_auth_token=True if args.use_auth_token else None,
-            split="train",
-        )
-    else:
-        dataset = load_dataset("imagefolder", data_dir=args.train_data_dir, cache_dir=args.cache_dir, split="train")
-
-    def transforms(examples):
-        images = [augmentations(image.convert("RGB")) for image in examples["image"]]
-        return {"input": images}
-
-    dataset.set_transform(transforms)
-    train_dataloader = torch.utils.data.DataLoader(dataset, batch_size=args.train_batch_size, shuffle=True)
-
-    lr_scheduler = get_scheduler(
-        args.lr_scheduler,
-        optimizer=optimizer,
-        num_warmup_steps=args.lr_warmup_steps,
-        num_training_steps=(len(train_dataloader) * args.num_epochs) // args.gradient_accumulation_steps,
-    )
-
-    model, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
-        model, optimizer, train_dataloader, lr_scheduler
-    )
-
-    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
-
-    ema_model = EMAModel(model, inv_gamma=args.ema_inv_gamma, power=args.ema_power, max_value=args.ema_max_decay)
-
-    # Handle the repository creation
-    if accelerator.is_main_process:
-        if args.push_to_hub:
-            if args.hub_model_id is None:
-                repo_name = get_full_repo_name(Path(args.output_dir).name, token=args.hub_token)
-            else:
-                repo_name = args.hub_model_id
-            repo = Repository(args.output_dir, clone_from=repo_name)
-
-            with open(os.path.join(args.output_dir, ".gitignore"), "w+") as gitignore:
-                if "step_*" not in gitignore:
-                    gitignore.write("step_*\n")
-                if "epoch_*" not in gitignore:
-                    gitignore.write("epoch_*\n")
-        elif args.output_dir is not None:
-            os.makedirs(args.output_dir, exist_ok=True)
-
-    if accelerator.is_main_process:
-        run = os.path.split(__file__)[-1].split(".")[0]
-        accelerator.init_trackers(run)
-
-    global_step = 0
-    for epoch in range(args.num_epochs):
-        model.train()
-        progress_bar = tqdm(total=num_update_steps_per_epoch, disable=not accelerator.is_local_main_process)
-        progress_bar.set_description(f"Epoch {epoch}")
-        for step, batch in enumerate(train_dataloader):
-            clean_images = batch["input"]
-            # Sample noise that we'll add to the images
-            noise = torch.randn(clean_images.shape).to(clean_images.device)
-            bsz = clean_images.shape[0]
-            # Sample a random timestep for each image
-            timesteps = torch.randint(
-                0, noise_scheduler.config.num_train_timesteps, (bsz,), device=clean_images.device
-            ).long()
-
-            # Add noise to the clean images according to the noise magnitude at each timestep
-            # (this is the forward diffusion process)
-            noisy_images = noise_scheduler.add_noise(clean_images, noise, timesteps)
-
-            with accelerator.accumulate(model):
-                # Predict the noise residual
-                noise_pred = model(noisy_images, timesteps, return_dict=True)[0]
-                loss = F.mse_loss(noise_pred, noise)
-                accelerator.backward(loss)
-
-                accelerator.clip_grad_norm_(model.parameters(), 1.0)
-                optimizer.step()
-                lr_scheduler.step()
-                if args.use_ema:
-                    ema_model.step(model)
-                optimizer.zero_grad()
-
-            # Checks if the accelerator has performed an optimization step behind the scenes
-            if accelerator.sync_gradients:
-                progress_bar.update(1)
-                global_step += 1
-
-            logs = {"loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0], "step": global_step}
-            if args.use_ema:
-                logs["ema_decay"] = ema_model.decay
-            progress_bar.set_postfix(**logs)
-            accelerator.log(logs, step=global_step)
-        progress_bar.close()
-
-        accelerator.wait_for_everyone()
-
-        # Generate sample images for visual inspection
-        if accelerator.is_main_process:
-            if epoch % args.save_images_epochs == 0 or epoch == args.num_epochs - 1:
-                pipeline = DDPMPipeline(
-                    unet=accelerator.unwrap_model(ema_model.averaged_model if args.use_ema else model),
-                    scheduler=noise_scheduler,
-                )
-
-                generator = torch.manual_seed(0)
-                # run pipeline in inference (sample random noise and denoise)
-                images = pipeline(generator=generator, batch_size=args.eval_batch_size, output_type="numpy").images
-
-                # denormalize the images and save to tensorboard
-                images_processed = (images * 255).round().astype("uint8")
-                accelerator.trackers[0].writer.add_images(
-                    "test_samples", images_processed.transpose(0, 3, 1, 2), epoch
-                )
-
-            if epoch % args.save_model_epochs == 0 or epoch == args.num_epochs - 1:
-                # save the model
-                pipeline.save_pretrained(args.output_dir)
-                if args.push_to_hub:
-                    repo.push_to_hub(commit_message=f"Epoch {epoch}", blocking=False)
-        accelerator.wait_for_everyone()
-
-    accelerator.end_training()
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Simple example of a training script.")
-    parser.add_argument("--local_rank", type=int, default=-1)
-    parser.add_argument("--dataset_name", type=str, default=None)
-    parser.add_argument("--dataset_config_name", type=str, default=None)
-    parser.add_argument("--train_data_dir", type=str, default=None, help="A folder containing the training data.")
-    parser.add_argument("--output_dir", type=str, default="ddpm-model-64")
-    parser.add_argument("--overwrite_output_dir", action="store_true")
-    parser.add_argument("--cache_dir", type=str, default=None)
-    parser.add_argument("--resolution", type=int, default=64)
-    parser.add_argument("--train_batch_size", type=int, default=16)
-    parser.add_argument("--eval_batch_size", type=int, default=16)
-    parser.add_argument("--num_epochs", type=int, default=100)
-    parser.add_argument("--save_images_epochs", type=int, default=10)
-    parser.add_argument("--save_model_epochs", type=int, default=10)
-    parser.add_argument("--gradient_accumulation_steps", type=int, default=1)
-    parser.add_argument("--learning_rate", type=float, default=1e-4)
-    parser.add_argument("--lr_scheduler", type=str, default="cosine")
-    parser.add_argument("--lr_warmup_steps", type=int, default=500)
-    parser.add_argument("--adam_beta1", type=float, default=0.95)
-    parser.add_argument("--adam_beta2", type=float, default=0.999)
-    parser.add_argument("--adam_weight_decay", type=float, default=1e-6)
-    parser.add_argument("--adam_epsilon", type=float, default=1e-08)
-    parser.add_argument("--use_ema", action="store_true", default=True)
-    parser.add_argument("--ema_inv_gamma", type=float, default=1.0)
-    parser.add_argument("--ema_power", type=float, default=3 / 4)
-    parser.add_argument("--ema_max_decay", type=float, default=0.9999)
-    parser.add_argument("--push_to_hub", action="store_true")
-    parser.add_argument("--use_auth_token", action="store_true")
-    parser.add_argument("--hub_token", type=str, default=None)
-    parser.add_argument("--hub_model_id", type=str, default=None)
-    parser.add_argument("--hub_private_repo", action="store_true")
-    parser.add_argument("--logging_dir", type=str, default="logs")
-    parser.add_argument(
-        "--mixed_precision",
-        type=str,
-        default="no",
-        choices=["no", "fp16", "bf16"],
-        help=(
-            "Whether to use mixed precision. Choose"
-            "between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >= 1.10."
-            "and an Nvidia Ampere GPU."
-        ),
-    )
-
-    args = parser.parse_args()
-    env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
-    if env_local_rank != -1 and env_local_rank != args.local_rank:
-        args.local_rank = env_local_rank
-
-    if args.dataset_name is None and args.train_data_dir is None:
-        raise ValueError("You must specify either a dataset name from the hub or a train data directory.")
-
-    main(args)
--- a/scripts/convert_ldm_original_checkpoint_to_diffusers.py
+++ b/scripts/convert_ldm_original_checkpoint_to_diffusers.py
@@ -112,9 +112,9 @@ def assign_to_checkpoint(
            continue

        # Global renaming happens here
-        new_path = new_path.replace("middle_block.0", "mid_block.resnets.0")
-        new_path = new_path.replace("middle_block.1", "mid_block.attentions.0")
-        new_path = new_path.replace("middle_block.2", "mid_block.resnets.1")
+        new_path = new_path.replace("middle_block.0", "mid.resnets.0")
+        new_path = new_path.replace("middle_block.1", "mid.attentions.0")
+        new_path = new_path.replace("middle_block.2", "mid.resnets.1")

        if additional_replacements is not None:
            for replacement in additional_replacements:
@@ -175,16 +175,15 @@ def convert_ldm_checkpoint(checkpoint, config):
        attentions = [key for key in input_blocks[i] if f"input_blocks.{i}.1" in key]

        if f"input_blocks.{i}.0.op.weight" in checkpoint:
-            new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.weight"] = checkpoint[
+            new_checkpoint[f"downsample_blocks.{block_id}.downsamplers.0.conv.weight"] = checkpoint[
                f"input_blocks.{i}.0.op.weight"
            ]
-            new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.bias"] = checkpoint[
+            new_checkpoint[f"downsample_blocks.{block_id}.downsamplers.0.conv.bias"] = checkpoint[
                f"input_blocks.{i}.0.op.bias"
            ]
-            continue

        paths = renew_resnet_paths(resnets)
-        meta_path = {"old": f"input_blocks.{i}.0", "new": f"down_blocks.{block_id}.resnets.{layer_in_block_id}"}
+        meta_path = {"old": f"input_blocks.{i}.0", "new": f"downsample_blocks.{block_id}.resnets.{layer_in_block_id}"}
        resnet_op = {"old": "resnets.2.op", "new": "downsamplers.0.op"}
        assign_to_checkpoint(
            paths, new_checkpoint, checkpoint, additional_replacements=[meta_path, resnet_op], config=config
@@ -194,18 +193,18 @@ def convert_ldm_checkpoint(checkpoint, config):
            paths = renew_attention_paths(attentions)
            meta_path = {
                "old": f"input_blocks.{i}.1",
-                "new": f"down_blocks.{block_id}.attentions.{layer_in_block_id}",
+                "new": f"downsample_blocks.{block_id}.attentions.{layer_in_block_id}",
            }
            to_split = {
                f"input_blocks.{i}.1.qkv.bias": {
-                    "key": f"down_blocks.{block_id}.attentions.{layer_in_block_id}.key.bias",
-                    "query": f"down_blocks.{block_id}.attentions.{layer_in_block_id}.query.bias",
-                    "value": f"down_blocks.{block_id}.attentions.{layer_in_block_id}.value.bias",
+                    "key": f"downsample_blocks.{block_id}.attentions.{layer_in_block_id}.key.bias",
+                    "query": f"downsample_blocks.{block_id}.attentions.{layer_in_block_id}.query.bias",
+                    "value": f"downsample_blocks.{block_id}.attentions.{layer_in_block_id}.value.bias",
                },
                f"input_blocks.{i}.1.qkv.weight": {
-                    "key": f"down_blocks.{block_id}.attentions.{layer_in_block_id}.key.weight",
-                    "query": f"down_blocks.{block_id}.attentions.{layer_in_block_id}.query.weight",
-                    "value": f"down_blocks.{block_id}.attentions.{layer_in_block_id}.value.weight",
+                    "key": f"downsample_blocks.{block_id}.attentions.{layer_in_block_id}.key.weight",
+                    "query": f"downsample_blocks.{block_id}.attentions.{layer_in_block_id}.query.weight",
+                    "value": f"downsample_blocks.{block_id}.attentions.{layer_in_block_id}.value.weight",
                },
            }
            assign_to_checkpoint(
--- a/scripts/convert_models_diffuser_to_diffusers.py
+++ b/scripts/convert_models_diffuser_to_diffusers.py
@@ -1,100 +0,0 @@
-import json
-import os
-
-import torch
-
-from diffusers import UNet1DModel
-
-
-os.makedirs("hub/hopper-medium-v2/unet/hor32", exist_ok=True)
-os.makedirs("hub/hopper-medium-v2/unet/hor128", exist_ok=True)
-
-os.makedirs("hub/hopper-medium-v2/value_function", exist_ok=True)
-
-
-def unet(hor):
-    if hor == 128:
-        down_block_types = ("DownResnetBlock1D", "DownResnetBlock1D", "DownResnetBlock1D")
-        block_out_channels = (32, 128, 256)
-        up_block_types = ("UpResnetBlock1D", "UpResnetBlock1D")
-
-    elif hor == 32:
-        down_block_types = ("DownResnetBlock1D", "DownResnetBlock1D", "DownResnetBlock1D", "DownResnetBlock1D")
-        block_out_channels = (32, 64, 128, 256)
-        up_block_types = ("UpResnetBlock1D", "UpResnetBlock1D", "UpResnetBlock1D")
-    model = torch.load(f"/Users/bglickenhaus/Documents/diffuser/temporal_unet-hopper-mediumv2-hor{hor}.torch")
-    state_dict = model.state_dict()
-    config = dict(
-        down_block_types=down_block_types,
-        block_out_channels=block_out_channels,
-        up_block_types=up_block_types,
-        layers_per_block=1,
-        use_timestep_embedding=True,
-        out_block_type="OutConv1DBlock",
-        norm_num_groups=8,
-        downsample_each_block=False,
-        in_channels=14,
-        out_channels=14,
-        extra_in_channels=0,
-        time_embedding_type="positional",
-        flip_sin_to_cos=False,
-        freq_shift=1,
-        sample_size=65536,
-        mid_block_type="MidResTemporalBlock1D",
-        act_fn="mish",
-    )
-    hf_value_function = UNet1DModel(**config)
-    print(f"length of state dict: {len(state_dict.keys())}")
-    print(f"length of value function dict: {len(hf_value_function.state_dict().keys())}")
-    mapping = dict((k, hfk) for k, hfk in zip(model.state_dict().keys(), hf_value_function.state_dict().keys()))
-    for k, v in mapping.items():
-        state_dict[v] = state_dict.pop(k)
-    hf_value_function.load_state_dict(state_dict)
-
-    torch.save(hf_value_function.state_dict(), f"hub/hopper-medium-v2/unet/hor{hor}/diffusion_pytorch_model.bin")
-    with open(f"hub/hopper-medium-v2/unet/hor{hor}/config.json", "w") as f:
-        json.dump(config, f)
-
-
-def value_function():
-    config = dict(
-        in_channels=14,
-        down_block_types=("DownResnetBlock1D", "DownResnetBlock1D", "DownResnetBlock1D", "DownResnetBlock1D"),
-        up_block_types=(),
-        out_block_type="ValueFunction",
-        mid_block_type="ValueFunctionMidBlock1D",
-        block_out_channels=(32, 64, 128, 256),
-        layers_per_block=1,
-        downsample_each_block=True,
-        sample_size=65536,
-        out_channels=14,
-        extra_in_channels=0,
-        time_embedding_type="positional",
-        use_timestep_embedding=True,
-        flip_sin_to_cos=False,
-        freq_shift=1,
-        norm_num_groups=8,
-        act_fn="mish",
-    )
-
-    model = torch.load("/Users/bglickenhaus/Documents/diffuser/value_function-hopper-mediumv2-hor32.torch")
-    state_dict = model
-    hf_value_function = UNet1DModel(**config)
-    print(f"length of state dict: {len(state_dict.keys())}")
-    print(f"length of value function dict: {len(hf_value_function.state_dict().keys())}")
-
-    mapping = dict((k, hfk) for k, hfk in zip(state_dict.keys(), hf_value_function.state_dict().keys()))
-    for k, v in mapping.items():
-        state_dict[v] = state_dict.pop(k)
-
-    hf_value_function.load_state_dict(state_dict)
-
-    torch.save(hf_value_function.state_dict(), "hub/hopper-medium-v2/value_function/diffusion_pytorch_model.bin")
-    with open("hub/hopper-medium-v2/value_function/config.json", "w") as f:
-        json.dump(config, f)
-
-
-if __name__ == "__main__":
-    unet(32)
-    # unet(128)
-    value_function()
--- a/scripts/convert_original_stable_diffusion_to_diffusers.py
+++ b/scripts/convert_original_stable_diffusion_to_diffusers.py
@@ -30,10 +30,6 @@ except ImportError:
 from diffusers import (
    AutoencoderKL,
    DDIMScheduler,
-    DPMSolverMultistepScheduler,
-    EulerAncestralDiscreteScheduler,
-    EulerDiscreteScheduler,
-    HeunDiscreteScheduler,
    LDMTextToImagePipeline,
    LMSDiscreteScheduler,
    PNDMScheduler,
@@ -41,9 +37,8 @@ from diffusers import (
    UNet2DConditionModel,
 )
 from diffusers.pipelines.latent_diffusion.pipeline_latent_diffusion import LDMBertConfig, LDMBertModel
-from diffusers.pipelines.paint_by_example import PaintByExampleImageEncoder, PaintByExamplePipeline
 from diffusers.pipelines.stable_diffusion import StableDiffusionSafetyChecker
-from transformers import AutoFeatureExtractor, BertTokenizerFast, CLIPTextModel, CLIPTokenizer, CLIPVisionConfig
+from transformers import AutoFeatureExtractor, BertTokenizerFast, CLIPTextModel, CLIPTokenizer


 def shave_segments(path, n_shave_prefix_segments=1):
@@ -209,12 +204,11 @@ def conv_attn_to_linear(checkpoint):
                checkpoint[key] = checkpoint[key][:, :, 0]


-def create_unet_diffusers_config(original_config, image_size: int):
+def create_unet_diffusers_config(original_config):
    """
    Creates a config for the diffusers based on the config of the LDM model.
    """
    unet_params = original_config.model.params.unet_config.params
-    vae_params = original_config.model.params.first_stage_config.params.ddconfig

    block_out_channels = [unet_params.model_channels * mult for mult in unet_params.channel_mult]

@@ -232,19 +226,8 @@ def create_unet_diffusers_config(original_config, image_size: int):
        up_block_types.append(block_type)
        resolution //= 2

-    vae_scale_factor = 2 ** (len(vae_params.ch_mult) - 1)
-
-    head_dim = unet_params.num_heads if "num_heads" in unet_params else None
-    use_linear_projection = (
-        unet_params.use_linear_in_transformer if "use_linear_in_transformer" in unet_params else False
-    )
-    if use_linear_projection:
-        # stable diffusion 2-base-512 and 2-768
-        if head_dim is None:
-            head_dim = [5, 10, 20, 20]
-
    config = dict(
-        sample_size=image_size // vae_scale_factor,
+        sample_size=unet_params.image_size,
        in_channels=unet_params.in_channels,
        out_channels=unet_params.out_channels,
        down_block_types=tuple(down_block_types),
@@ -252,14 +235,13 @@ def create_unet_diffusers_config(original_config, image_size: int):
        block_out_channels=tuple(block_out_channels),
        layers_per_block=unet_params.num_res_blocks,
        cross_attention_dim=unet_params.context_dim,
-        attention_head_dim=head_dim,
-        use_linear_projection=use_linear_projection,
+        attention_head_dim=unet_params.num_heads,
    )

    return config


-def create_vae_diffusers_config(original_config, image_size: int):
+def create_vae_diffusers_config(original_config):
    """
    Creates a config for the diffusers based on the config of the LDM model.
    """
@@ -271,7 +253,7 @@ def create_vae_diffusers_config(original_config, image_size: int):
    up_block_types = ["UpDecoderBlock2D"] * len(block_out_channels)

    config = dict(
-        sample_size=image_size,
+        sample_size=vae_params.resolution,
        in_channels=vae_params.in_channels,
        out_channels=vae_params.out_ch,
        down_block_types=tuple(down_block_types),
@@ -648,89 +630,6 @@ def convert_ldm_clip_checkpoint(checkpoint):
    return text_model


-def convert_paint_by_example_checkpoint(checkpoint):
-    config = CLIPVisionConfig.from_pretrained("openai/clip-vit-large-patch14")
-    model = PaintByExampleImageEncoder(config)
-
-    keys = list(checkpoint.keys())
-
-    text_model_dict = {}
-
-    for key in keys:
-        if key.startswith("cond_stage_model.transformer"):
-            text_model_dict[key[len("cond_stage_model.transformer.") :]] = checkpoint[key]
-
-    # load clip vision
-    model.model.load_state_dict(text_model_dict)
-
-    # load mapper
-    keys_mapper = {
-        k[len("cond_stage_model.mapper.res") :]: v
-        for k, v in checkpoint.items()
-        if k.startswith("cond_stage_model.mapper")
-    }
-
-    MAPPING = {
-        "attn.c_qkv": ["attn1.to_q", "attn1.to_k", "attn1.to_v"],
-        "attn.c_proj": ["attn1.to_out.0"],
-        "ln_1": ["norm1"],
-        "ln_2": ["norm3"],
-        "mlp.c_fc": ["ff.net.0.proj"],
-        "mlp.c_proj": ["ff.net.2"],
-    }
-
-    mapped_weights = {}
-    for key, value in keys_mapper.items():
-        prefix = key[: len("blocks.i")]
-        suffix = key.split(prefix)[-1].split(".")[-1]
-        name = key.split(prefix)[-1].split(suffix)[0][1:-1]
-        mapped_names = MAPPING[name]
-
-        num_splits = len(mapped_names)
-        for i, mapped_name in enumerate(mapped_names):
-            new_name = ".".join([prefix, mapped_name, suffix])
-            shape = value.shape[0] // num_splits
-            mapped_weights[new_name] = value[i * shape : (i + 1) * shape]
-
-    model.mapper.load_state_dict(mapped_weights)
-
-    # load final layer norm
-    model.final_layer_norm.load_state_dict(
-        {
-            "bias": checkpoint["cond_stage_model.final_ln.bias"],
-            "weight": checkpoint["cond_stage_model.final_ln.weight"],
-        }
-    )
-
-    # load final proj
-    model.proj_out.load_state_dict(
-        {
-            "bias": checkpoint["proj_out.bias"],
-            "weight": checkpoint["proj_out.weight"],
-        }
-    )
-
-    # load uncond vector
-    model.uncond_vector.data = torch.nn.Parameter(checkpoint["learnable_vector"])
-    return model
-
-
-def convert_open_clip_checkpoint(checkpoint):
-    text_model = CLIPTextModel.from_pretrained("stabilityai/stable-diffusion-2", subfolder="text_encoder")
-
-    # SKIP for now - need openclip -> HF conversion script here
-    #    keys = list(checkpoint.keys())
-    #
-    #    text_model_dict = {}
-    #    for key in keys:
-    #        if key.startswith("cond_stage_model.model.transformer"):
-    #            text_model_dict[key[len("cond_stage_model.model.transformer.") :]] = checkpoint[key]
-    #
-    #    text_model.load_state_dict(text_model_dict)
-
-    return text_model
-
-
 if __name__ == "__main__":
    parser = argparse.ArgumentParser()

@@ -744,41 +643,11 @@ if __name__ == "__main__":
        type=str,
        help="The YAML config file corresponding to the original architecture.",
    )
-    parser.add_argument(
-        "--num_in_channels",
-        default=None,
-        type=int,
-        help="The number of input channels. If `None` number of input channels will be automatically inferred.",
-    )
    parser.add_argument(
        "--scheduler_type",
        default="pndm",
        type=str,
-        help="Type of scheduler to use. Should be one of ['pndm', 'lms', 'ddim', 'euler', 'euler-ancest', 'dpm']",
-    )
-    parser.add_argument(
-        "--pipeline_type",
-        default=None,
-        type=str,
-        help="The pipeline type. If `None` pipeline will be automatically inferred.",
-    )
-    parser.add_argument(
-        "--image_size",
-        default=None,
-        type=int,
-        help=(
-            "The image size that the model was trained on. Use 512 for Stable Diffusion v1.X and Stable Siffusion v2"
-            " Base. Use 768 for Stable Diffusion v2."
-        ),
-    )
-    parser.add_argument(
-        "--prediction_type",
-        default=None,
-        type=str,
-        help=(
-            "The prediction type that the model was trained on. Use 'epsilon' for Stable Diffusion v1.X and Stable"
-            " Siffusion v2 Base. Use 'v-prediction' for Stable Diffusion v2."
-        ),
+        help="Type of scheduler to use. Should be one of ['pndm', 'lms', 'ddim']",
    )
    parser.add_argument(
        "--extract_ema",
@@ -790,135 +659,63 @@ if __name__ == "__main__":
        ),
    )
    parser.add_argument("--dump_path", default=None, type=str, required=True, help="Path to the output model.")
+
    args = parser.parse_args()

-    image_size = args.image_size
-    prediction_type = args.prediction_type
-
-    checkpoint = torch.load(args.checkpoint_path)
-    global_step = checkpoint["global_step"]
-    checkpoint = checkpoint["state_dict"]
-
    if args.original_config_file is None:
-        key_name = "model.diffusion_model.input_blocks.2.1.transformer_blocks.0.attn2.to_k.weight"
-
-        if key_name in checkpoint and checkpoint[key_name].shape[-1] == 1024:
-            # model_type = "v2"
-            os.system(
-                "wget https://raw.githubusercontent.com/Stability-AI/stablediffusion/main/configs/stable-diffusion/v2-inference-v.yaml"
-            )
-            args.original_config_file = "./v2-inference-v.yaml"
-        else:
-            # model_type = "v1"
-            os.system(
-                "wget https://raw.githubusercontent.com/CompVis/stable-diffusion/main/configs/stable-diffusion/v1-inference.yaml"
-            )
-            args.original_config_file = "./v1-inference.yaml"
+        os.system(
+            "wget https://raw.githubusercontent.com/CompVis/stable-diffusion/main/configs/stable-diffusion/v1-inference.yaml"
+        )
+        args.original_config_file = "./v1-inference.yaml"

    original_config = OmegaConf.load(args.original_config_file)

-    if args.num_in_channels is not None:
-        original_config["model"]["params"]["unet_config"]["params"]["in_channels"] = args.num_in_channels
-
-    if (
-        "parameterization" in original_config["model"]["params"]
-        and original_config["model"]["params"]["parameterization"] == "v"
-    ):
-        if prediction_type is None:
-            # NOTE: For stable diffusion 2 base it is recommended to pass `prediction_type=="epsilon"`
-            # as it relies on a brittle global step parameter here
-            prediction_type = "epsilon" if global_step == 875000 else "v_prediction"
-        if image_size is None:
-            # NOTE: For stable diffusion 2 base one has to pass `image_size==512`
-            # as it relies on a brittle global step parameter here
-            image_size = 512 if global_step == 875000 else 768
-    else:
-        if prediction_type is None:
-            prediction_type = "epsilon"
-        if image_size is None:
-            image_size = 512
+    checkpoint = torch.load(args.checkpoint_path)
+    checkpoint = checkpoint["state_dict"]

    num_train_timesteps = original_config.model.params.timesteps
    beta_start = original_config.model.params.linear_start
    beta_end = original_config.model.params.linear_end
-
-    scheduler = DDIMScheduler(
-        beta_end=beta_end,
-        beta_schedule="scaled_linear",
-        beta_start=beta_start,
-        num_train_timesteps=num_train_timesteps,
-        steps_offset=1,
-        clip_sample=False,
-        set_alpha_to_one=False,
-        prediction_type=prediction_type,
-    )
    if args.scheduler_type == "pndm":
-        config = dict(scheduler.config)
-        config["skip_prk_steps"] = True
-        scheduler = PNDMScheduler.from_config(config)
+        scheduler = PNDMScheduler(
+            beta_end=beta_end,
+            beta_schedule="scaled_linear",
+            beta_start=beta_start,
+            num_train_timesteps=num_train_timesteps,
+            skip_prk_steps=True,
+        )
    elif args.scheduler_type == "lms":
-        scheduler = LMSDiscreteScheduler.from_config(scheduler.config)
-    elif args.scheduler_type == "heun":
-        scheduler = HeunDiscreteScheduler.from_config(scheduler.config)
-    elif args.scheduler_type == "euler":
-        scheduler = EulerDiscreteScheduler.from_config(scheduler.config)
-    elif args.scheduler_type == "euler-ancestral":
-        scheduler = EulerAncestralDiscreteScheduler.from_config(scheduler.config)
-    elif args.scheduler_type == "dpm":
-        scheduler = DPMSolverMultistepScheduler.from_config(scheduler.config)
+        scheduler = LMSDiscreteScheduler(beta_start=beta_start, beta_end=beta_end, beta_schedule="scaled_linear")
    elif args.scheduler_type == "ddim":
-        scheduler = scheduler
+        scheduler = DDIMScheduler(
+            beta_start=beta_start,
+            beta_end=beta_end,
+            beta_schedule="scaled_linear",
+            clip_sample=False,
+            set_alpha_to_one=False,
+        )
    else:
        raise ValueError(f"Scheduler of type {args.scheduler_type} doesn't exist!")

    # Convert the UNet2DConditionModel model.
-    unet_config = create_unet_diffusers_config(original_config, image_size=image_size)
-    unet = UNet2DConditionModel(**unet_config)
-
+    unet_config = create_unet_diffusers_config(original_config)
    converted_unet_checkpoint = convert_ldm_unet_checkpoint(
        checkpoint, unet_config, path=args.checkpoint_path, extract_ema=args.extract_ema
    )

+    unet = UNet2DConditionModel(**unet_config)
    unet.load_state_dict(converted_unet_checkpoint)

    # Convert the VAE model.
-    vae_config = create_vae_diffusers_config(original_config, image_size=image_size)
+    vae_config = create_vae_diffusers_config(original_config)
    converted_vae_checkpoint = convert_ldm_vae_checkpoint(checkpoint, vae_config)

    vae = AutoencoderKL(**vae_config)
    vae.load_state_dict(converted_vae_checkpoint)

    # Convert the text model.
-    model_type = args.pipeline_type
-    if model_type is None:
-        model_type = original_config.model.params.cond_stage_config.target.split(".")[-1]
-
-    if model_type == "FrozenOpenCLIPEmbedder":
-        text_model = convert_open_clip_checkpoint(checkpoint)
-        tokenizer = CLIPTokenizer.from_pretrained("stabilityai/stable-diffusion-2", subfolder="tokenizer")
-        pipe = StableDiffusionPipeline(
-            vae=vae,
-            text_encoder=text_model,
-            tokenizer=tokenizer,
-            unet=unet,
-            scheduler=scheduler,
-            safety_checker=None,
-            feature_extractor=None,
-            requires_safety_checker=False,
-        )
-    elif model_type == "PaintByExample":
-        vision_model = convert_paint_by_example_checkpoint(checkpoint)
-        tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
-        feature_extractor = AutoFeatureExtractor.from_pretrained("CompVis/stable-diffusion-safety-checker")
-        pipe = PaintByExamplePipeline(
-            vae=vae,
-            image_encoder=vision_model,
-            unet=unet,
-            scheduler=scheduler,
-            safety_checker=None,
-            feature_extractor=feature_extractor,
-        )
-    elif model_type == "FrozenCLIPEmbedder":
+    text_model_type = original_config.model.params.cond_stage_config.target.split(".")[-1]
+    if text_model_type == "FrozenCLIPEmbedder":
        text_model = convert_ldm_clip_checkpoint(checkpoint)
        tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
        safety_checker = StableDiffusionSafetyChecker.from_pretrained("CompVis/stable-diffusion-safety-checker")
--- a/scripts/convert_stable_diffusion_checkpoint_to_onnx.py
+++ b/scripts/convert_stable_diffusion_checkpoint_to_onnx.py
@@ -81,8 +81,6 @@ def convert_models(model_path: str, output_path: str, opset: int, fp16: bool = F
    output_path = Path(output_path)

    # TEXT ENCODER
-    num_tokens = pipeline.text_encoder.config.max_position_embeddings
-    text_hidden_size = pipeline.text_encoder.config.hidden_size
    text_input = pipeline.tokenizer(
        "A sample prompt",
        padding="max_length",
@@ -105,15 +103,13 @@ def convert_models(model_path: str, output_path: str, opset: int, fp16: bool = F
    del pipeline.text_encoder

    # UNET
-    unet_in_channels = pipeline.unet.config.in_channels
-    unet_sample_size = pipeline.unet.config.sample_size
    unet_path = output_path / "unet" / "model.onnx"
    onnx_export(
        pipeline.unet,
        model_args=(
-            torch.randn(2, unet_in_channels, unet_sample_size, unet_sample_size).to(device=device, dtype=dtype),
-            torch.randn(2).to(device=device, dtype=dtype),
-            torch.randn(2, num_tokens, text_hidden_size).to(device=device, dtype=dtype),
+            torch.randn(2, pipeline.unet.in_channels, 64, 64).to(device=device, dtype=dtype),
+            torch.LongTensor([0, 1]).to(device=device),
+            torch.randn(2, 77, 768).to(device=device, dtype=dtype),
            False,
        ),
        output_path=unet_path,
@@ -146,16 +142,11 @@ def convert_models(model_path: str, output_path: str, opset: int, fp16: bool = F

    # VAE ENCODER
    vae_encoder = pipeline.vae
-    vae_in_channels = vae_encoder.config.in_channels
-    vae_sample_size = vae_encoder.config.sample_size
    # need to get the raw tensor output (sample) from the encoder
    vae_encoder.forward = lambda sample, return_dict: vae_encoder.encode(sample, return_dict)[0].sample()
    onnx_export(
        vae_encoder,
-        model_args=(
-            torch.randn(1, vae_in_channels, vae_sample_size, vae_sample_size).to(device=device, dtype=dtype),
-            False,
-        ),
+        model_args=(torch.randn(1, 3, 512, 512).to(device=device, dtype=dtype), False),
        output_path=output_path / "vae_encoder" / "model.onnx",
        ordered_input_names=["sample", "return_dict"],
        output_names=["latent_sample"],
@@ -167,16 +158,11 @@ def convert_models(model_path: str, output_path: str, opset: int, fp16: bool = F

    # VAE DECODER
    vae_decoder = pipeline.vae
-    vae_latent_channels = vae_decoder.config.latent_channels
-    vae_out_channels = vae_decoder.config.out_channels
    # forward only through the decoder part
    vae_decoder.forward = vae_encoder.decode
    onnx_export(
        vae_decoder,
-        model_args=(
-            torch.randn(1, vae_latent_channels, unet_sample_size, unet_sample_size).to(device=device, dtype=dtype),
-            False,
-        ),
+        model_args=(torch.randn(1, 4, 64, 64).to(device=device, dtype=dtype), False),
        output_path=output_path / "vae_decoder" / "model.onnx",
        ordered_input_names=["latent_sample", "return_dict"],
        output_names=["sample"],
@@ -188,37 +174,24 @@ def convert_models(model_path: str, output_path: str, opset: int, fp16: bool = F
    del pipeline.vae

    # SAFETY CHECKER
-    if pipeline.safety_checker is not None:
-        safety_checker = pipeline.safety_checker
-        clip_num_channels = safety_checker.config.vision_config.num_channels
-        clip_image_size = safety_checker.config.vision_config.image_size
-        safety_checker.forward = safety_checker.forward_onnx
-        onnx_export(
-            pipeline.safety_checker,
-            model_args=(
-                torch.randn(
-                    1,
-                    clip_num_channels,
-                    clip_image_size,
-                    clip_image_size,
-                ).to(device=device, dtype=dtype),
-                torch.randn(1, vae_sample_size, vae_sample_size, vae_out_channels).to(device=device, dtype=dtype),
-            ),
-            output_path=output_path / "safety_checker" / "model.onnx",
-            ordered_input_names=["clip_input", "images"],
-            output_names=["out_images", "has_nsfw_concepts"],
-            dynamic_axes={
-                "clip_input": {0: "batch", 1: "channels", 2: "height", 3: "width"},
-                "images": {0: "batch", 1: "height", 2: "width", 3: "channels"},
-            },
-            opset=opset,
-        )
-        del pipeline.safety_checker
-        safety_checker = OnnxRuntimeModel.from_pretrained(output_path / "safety_checker")
-        feature_extractor = pipeline.feature_extractor
-    else:
-        safety_checker = None
-        feature_extractor = None
+    safety_checker = pipeline.safety_checker
+    safety_checker.forward = safety_checker.forward_onnx
+    onnx_export(
+        pipeline.safety_checker,
+        model_args=(
+            torch.randn(1, 3, 224, 224).to(device=device, dtype=dtype),
+            torch.randn(1, 512, 512, 3).to(device=device, dtype=dtype),
+        ),
+        output_path=output_path / "safety_checker" / "model.onnx",
+        ordered_input_names=["clip_input", "images"],
+        output_names=["out_images", "has_nsfw_concepts"],
+        dynamic_axes={
+            "clip_input": {0: "batch", 1: "channels", 2: "height", 3: "width"},
+            "images": {0: "batch", 1: "height", 2: "width", 3: "channels"},
+        },
+        opset=opset,
+    )
+    del pipeline.safety_checker

    onnx_pipeline = OnnxStableDiffusionPipeline(
        vae_encoder=OnnxRuntimeModel.from_pretrained(output_path / "vae_encoder"),
@@ -227,9 +200,8 @@ def convert_models(model_path: str, output_path: str, opset: int, fp16: bool = F
        tokenizer=pipeline.tokenizer,
        unet=OnnxRuntimeModel.from_pretrained(output_path / "unet"),
        scheduler=pipeline.scheduler,
-        safety_checker=safety_checker,
-        feature_extractor=feature_extractor,
-        requires_safety_checker=safety_checker is not None,
+        safety_checker=OnnxRuntimeModel.from_pretrained(output_path / "safety_checker"),
+        feature_extractor=pipeline.feature_extractor,
    )

    onnx_pipeline.save_pretrained(output_path)
--- a/scripts/convert_versatile_diffusion_to_diffusers.py
+++ b/scripts/convert_versatile_diffusion_to_diffusers.py
@@ -1,791 +0,0 @@
-# coding=utf-8
-# Copyright 2022 The HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" Conversion script for the Versatile Stable Diffusion checkpoints. """
-
-import argparse
-from argparse import Namespace
-
-import torch
-
-from diffusers import (
-    AutoencoderKL,
-    DDIMScheduler,
-    DPMSolverMultistepScheduler,
-    EulerAncestralDiscreteScheduler,
-    EulerDiscreteScheduler,
-    LMSDiscreteScheduler,
-    PNDMScheduler,
-    UNet2DConditionModel,
-    VersatileDiffusionPipeline,
-)
-from diffusers.pipelines.versatile_diffusion.modeling_text_unet import UNetFlatConditionModel
-from transformers import (
-    CLIPFeatureExtractor,
-    CLIPTextModelWithProjection,
-    CLIPTokenizer,
-    CLIPVisionModelWithProjection,
-)
-
-
-SCHEDULER_CONFIG = Namespace(
-    **{
-        "beta_linear_start": 0.00085,
-        "beta_linear_end": 0.012,
-        "timesteps": 1000,
-        "scale_factor": 0.18215,
-    }
-)
-
-IMAGE_UNET_CONFIG = Namespace(
-    **{
-        "input_channels": 4,
-        "model_channels": 320,
-        "output_channels": 4,
-        "num_noattn_blocks": [2, 2, 2, 2],
-        "channel_mult": [1, 2, 4, 4],
-        "with_attn": [True, True, True, False],
-        "num_heads": 8,
-        "context_dim": 768,
-        "use_checkpoint": True,
-    }
-)
-
-TEXT_UNET_CONFIG = Namespace(
-    **{
-        "input_channels": 768,
-        "model_channels": 320,
-        "output_channels": 768,
-        "num_noattn_blocks": [2, 2, 2, 2],
-        "channel_mult": [1, 2, 4, 4],
-        "second_dim": [4, 4, 4, 4],
-        "with_attn": [True, True, True, False],
-        "num_heads": 8,
-        "context_dim": 768,
-        "use_checkpoint": True,
-    }
-)
-
-AUTOENCODER_CONFIG = Namespace(
-    **{
-        "double_z": True,
-        "z_channels": 4,
-        "resolution": 256,
-        "in_channels": 3,
-        "out_ch": 3,
-        "ch": 128,
-        "ch_mult": [1, 2, 4, 4],
-        "num_res_blocks": 2,
-        "attn_resolutions": [],
-        "dropout": 0.0,
-    }
-)
-
-
-def shave_segments(path, n_shave_prefix_segments=1):
-    """
-    Removes segments. Positive values shave the first segments, negative shave the last segments.
-    """
-    if n_shave_prefix_segments >= 0:
-        return ".".join(path.split(".")[n_shave_prefix_segments:])
-    else:
-        return ".".join(path.split(".")[:n_shave_prefix_segments])
-
-
-def renew_resnet_paths(old_list, n_shave_prefix_segments=0):
-    """
-    Updates paths inside resnets to the new naming scheme (local renaming)
-    """
-    mapping = []
-    for old_item in old_list:
-        new_item = old_item.replace("in_layers.0", "norm1")
-        new_item = new_item.replace("in_layers.2", "conv1")
-
-        new_item = new_item.replace("out_layers.0", "norm2")
-        new_item = new_item.replace("out_layers.3", "conv2")
-
-        new_item = new_item.replace("emb_layers.1", "time_emb_proj")
-        new_item = new_item.replace("skip_connection", "conv_shortcut")
-
-        new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)
-
-        mapping.append({"old": old_item, "new": new_item})
-
-    return mapping
-
-
-def renew_vae_resnet_paths(old_list, n_shave_prefix_segments=0):
-    """
-    Updates paths inside resnets to the new naming scheme (local renaming)
-    """
-    mapping = []
-    for old_item in old_list:
-        new_item = old_item
-
-        new_item = new_item.replace("nin_shortcut", "conv_shortcut")
-        new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)
-
-        mapping.append({"old": old_item, "new": new_item})
-
-    return mapping
-
-
-def renew_attention_paths(old_list, n_shave_prefix_segments=0):
-    """
-    Updates paths inside attentions to the new naming scheme (local renaming)
-    """
-    mapping = []
-    for old_item in old_list:
-        new_item = old_item
-
-        #         new_item = new_item.replace('norm.weight', 'group_norm.weight')
-        #         new_item = new_item.replace('norm.bias', 'group_norm.bias')
-
-        #         new_item = new_item.replace('proj_out.weight', 'proj_attn.weight')
-        #         new_item = new_item.replace('proj_out.bias', 'proj_attn.bias')
-
-        #         new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)
-
-        mapping.append({"old": old_item, "new": new_item})
-
-    return mapping
-
-
-def renew_vae_attention_paths(old_list, n_shave_prefix_segments=0):
-    """
-    Updates paths inside attentions to the new naming scheme (local renaming)
-    """
-    mapping = []
-    for old_item in old_list:
-        new_item = old_item
-
-        new_item = new_item.replace("norm.weight", "group_norm.weight")
-        new_item = new_item.replace("norm.bias", "group_norm.bias")
-
-        new_item = new_item.replace("q.weight", "query.weight")
-        new_item = new_item.replace("q.bias", "query.bias")
-
-        new_item = new_item.replace("k.weight", "key.weight")
-        new_item = new_item.replace("k.bias", "key.bias")
-
-        new_item = new_item.replace("v.weight", "value.weight")
-        new_item = new_item.replace("v.bias", "value.bias")
-
-        new_item = new_item.replace("proj_out.weight", "proj_attn.weight")
-        new_item = new_item.replace("proj_out.bias", "proj_attn.bias")
-
-        new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)
-
-        mapping.append({"old": old_item, "new": new_item})
-
-    return mapping
-
-
-def assign_to_checkpoint(
-    paths, checkpoint, old_checkpoint, attention_paths_to_split=None, additional_replacements=None, config=None
-):
-    """
-    This does the final conversion step: take locally converted weights and apply a global renaming
-    to them. It splits attention layers, and takes into account additional replacements
-    that may arise.
-
-    Assigns the weights to the new checkpoint.
-    """
-    assert isinstance(paths, list), "Paths should be a list of dicts containing 'old' and 'new' keys."
-
-    # Splits the attention layers into three variables.
-    if attention_paths_to_split is not None:
-        for path, path_map in attention_paths_to_split.items():
-            old_tensor = old_checkpoint[path]
-            channels = old_tensor.shape[0] // 3
-
-            target_shape = (-1, channels) if len(old_tensor.shape) == 3 else (-1)
-
-            num_heads = old_tensor.shape[0] // config["num_head_channels"] // 3
-
-            old_tensor = old_tensor.reshape((num_heads, 3 * channels // num_heads) + old_tensor.shape[1:])
-            query, key, value = old_tensor.split(channels // num_heads, dim=1)
-
-            checkpoint[path_map["query"]] = query.reshape(target_shape)
-            checkpoint[path_map["key"]] = key.reshape(target_shape)
-            checkpoint[path_map["value"]] = value.reshape(target_shape)
-
-    for path in paths:
-        new_path = path["new"]
-
-        # These have already been assigned
-        if attention_paths_to_split is not None and new_path in attention_paths_to_split:
-            continue
-
-        # Global renaming happens here
-        new_path = new_path.replace("middle_block.0", "mid_block.resnets.0")
-        new_path = new_path.replace("middle_block.1", "mid_block.attentions.0")
-        new_path = new_path.replace("middle_block.2", "mid_block.resnets.1")
-
-        if additional_replacements is not None:
-            for replacement in additional_replacements:
-                new_path = new_path.replace(replacement["old"], replacement["new"])
-
-        # proj_attn.weight has to be converted from conv 1D to linear
-        if "proj_attn.weight" in new_path:
-            checkpoint[new_path] = old_checkpoint[path["old"]][:, :, 0]
-        elif path["old"] in old_checkpoint:
-            checkpoint[new_path] = old_checkpoint[path["old"]]
-
-
-def conv_attn_to_linear(checkpoint):
-    keys = list(checkpoint.keys())
-    attn_keys = ["query.weight", "key.weight", "value.weight"]
-    for key in keys:
-        if ".".join(key.split(".")[-2:]) in attn_keys:
-            if checkpoint[key].ndim > 2:
-                checkpoint[key] = checkpoint[key][:, :, 0, 0]
-        elif "proj_attn.weight" in key:
-            if checkpoint[key].ndim > 2:
-                checkpoint[key] = checkpoint[key][:, :, 0]
-
-
-def create_image_unet_diffusers_config(unet_params):
-    """
-    Creates a config for the diffusers based on the config of the VD model.
-    """
-
-    block_out_channels = [unet_params.model_channels * mult for mult in unet_params.channel_mult]
-
-    down_block_types = []
-    resolution = 1
-    for i in range(len(block_out_channels)):
-        block_type = "CrossAttnDownBlock2D" if unet_params.with_attn[i] else "DownBlock2D"
-        down_block_types.append(block_type)
-        if i != len(block_out_channels) - 1:
-            resolution *= 2
-
-    up_block_types = []
-    for i in range(len(block_out_channels)):
-        block_type = "CrossAttnUpBlock2D" if unet_params.with_attn[-i - 1] else "UpBlock2D"
-        up_block_types.append(block_type)
-        resolution //= 2
-
-    if not all(n == unet_params.num_noattn_blocks[0] for n in unet_params.num_noattn_blocks):
-        raise ValueError("Not all num_res_blocks are equal, which is not supported in this script.")
-
-    config = dict(
-        sample_size=None,
-        in_channels=unet_params.input_channels,
-        out_channels=unet_params.output_channels,
-        down_block_types=tuple(down_block_types),
-        up_block_types=tuple(up_block_types),
-        block_out_channels=tuple(block_out_channels),
-        layers_per_block=unet_params.num_noattn_blocks[0],
-        cross_attention_dim=unet_params.context_dim,
-        attention_head_dim=unet_params.num_heads,
-    )
-
-    return config
-
-
-def create_text_unet_diffusers_config(unet_params):
-    """
-    Creates a config for the diffusers based on the config of the VD model.
-    """
-
-    block_out_channels = [unet_params.model_channels * mult for mult in unet_params.channel_mult]
-
-    down_block_types = []
-    resolution = 1
-    for i in range(len(block_out_channels)):
-        block_type = "CrossAttnDownBlockFlat" if unet_params.with_attn[i] else "DownBlockFlat"
-        down_block_types.append(block_type)
-        if i != len(block_out_channels) - 1:
-            resolution *= 2
-
-    up_block_types = []
-    for i in range(len(block_out_channels)):
-        block_type = "CrossAttnUpBlockFlat" if unet_params.with_attn[-i - 1] else "UpBlockFlat"
-        up_block_types.append(block_type)
-        resolution //= 2
-
-    if not all(n == unet_params.num_noattn_blocks[0] for n in unet_params.num_noattn_blocks):
-        raise ValueError("Not all num_res_blocks are equal, which is not supported in this script.")
-
-    config = dict(
-        sample_size=None,
-        in_channels=(unet_params.input_channels, 1, 1),
-        out_channels=(unet_params.output_channels, 1, 1),
-        down_block_types=tuple(down_block_types),
-        up_block_types=tuple(up_block_types),
-        block_out_channels=tuple(block_out_channels),
-        layers_per_block=unet_params.num_noattn_blocks[0],
-        cross_attention_dim=unet_params.context_dim,
-        attention_head_dim=unet_params.num_heads,
-    )
-
-    return config
-
-
-def create_vae_diffusers_config(vae_params):
-    """
-    Creates a config for the diffusers based on the config of the VD model.
-    """
-
-    block_out_channels = [vae_params.ch * mult for mult in vae_params.ch_mult]
-    down_block_types = ["DownEncoderBlock2D"] * len(block_out_channels)
-    up_block_types = ["UpDecoderBlock2D"] * len(block_out_channels)
-
-    config = dict(
-        sample_size=vae_params.resolution,
-        in_channels=vae_params.in_channels,
-        out_channels=vae_params.out_ch,
-        down_block_types=tuple(down_block_types),
-        up_block_types=tuple(up_block_types),
-        block_out_channels=tuple(block_out_channels),
-        latent_channels=vae_params.z_channels,
-        layers_per_block=vae_params.num_res_blocks,
-    )
-    return config
-
-
-def create_diffusers_scheduler(original_config):
-    schedular = DDIMScheduler(
-        num_train_timesteps=original_config.model.params.timesteps,
-        beta_start=original_config.model.params.linear_start,
-        beta_end=original_config.model.params.linear_end,
-        beta_schedule="scaled_linear",
-    )
-    return schedular
-
-
-def convert_vd_unet_checkpoint(checkpoint, config, unet_key, extract_ema=False):
-    """
-    Takes a state dict and a config, and returns a converted checkpoint.
-    """
-
-    # extract state_dict for UNet
-    unet_state_dict = {}
-    keys = list(checkpoint.keys())
-
-    # at least a 100 parameters have to start with `model_ema` in order for the checkpoint to be EMA
-    if sum(k.startswith("model_ema") for k in keys) > 100:
-        print("Checkpoint has both EMA and non-EMA weights.")
-        if extract_ema:
-            print(
-                "In this conversion only the EMA weights are extracted. If you want to instead extract the non-EMA"
-                " weights (useful to continue fine-tuning), please make sure to remove the `--extract_ema` flag."
-            )
-            for key in keys:
-                if key.startswith("model.diffusion_model"):
-                    flat_ema_key = "model_ema." + "".join(key.split(".")[1:])
-                    unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(flat_ema_key)
-        else:
-            print(
-                "In this conversion only the non-EMA weights are extracted. If you want to instead extract the EMA"
-                " weights (usually better for inference), please make sure to add the `--extract_ema` flag."
-            )
-
-    for key in keys:
-        if key.startswith(unet_key):
-            unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(key)
-
-    new_checkpoint = {}
-
-    new_checkpoint["time_embedding.linear_1.weight"] = checkpoint["model.diffusion_model.time_embed.0.weight"]
-    new_checkpoint["time_embedding.linear_1.bias"] = checkpoint["model.diffusion_model.time_embed.0.bias"]
-    new_checkpoint["time_embedding.linear_2.weight"] = checkpoint["model.diffusion_model.time_embed.2.weight"]
-    new_checkpoint["time_embedding.linear_2.bias"] = checkpoint["model.diffusion_model.time_embed.2.bias"]
-
-    new_checkpoint["conv_in.weight"] = unet_state_dict["input_blocks.0.0.weight"]
-    new_checkpoint["conv_in.bias"] = unet_state_dict["input_blocks.0.0.bias"]
-
-    new_checkpoint["conv_norm_out.weight"] = unet_state_dict["out.0.weight"]
-    new_checkpoint["conv_norm_out.bias"] = unet_state_dict["out.0.bias"]
-    new_checkpoint["conv_out.weight"] = unet_state_dict["out.2.weight"]
-    new_checkpoint["conv_out.bias"] = unet_state_dict["out.2.bias"]
-
-    # Retrieves the keys for the input blocks only
-    num_input_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "input_blocks" in layer})
-    input_blocks = {
-        layer_id: [key for key in unet_state_dict if f"input_blocks.{layer_id}" in key]
-        for layer_id in range(num_input_blocks)
-    }
-
-    # Retrieves the keys for the middle blocks only
-    num_middle_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "middle_block" in layer})
-    middle_blocks = {
-        layer_id: [key for key in unet_state_dict if f"middle_block.{layer_id}" in key]
-        for layer_id in range(num_middle_blocks)
-    }
-
-    # Retrieves the keys for the output blocks only
-    num_output_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "output_blocks" in layer})
-    output_blocks = {
-        layer_id: [key for key in unet_state_dict if f"output_blocks.{layer_id}" in key]
-        for layer_id in range(num_output_blocks)
-    }
-
-    for i in range(1, num_input_blocks):
-        block_id = (i - 1) // (config["layers_per_block"] + 1)
-        layer_in_block_id = (i - 1) % (config["layers_per_block"] + 1)
-
-        resnets = [
-            key for key in input_blocks[i] if f"input_blocks.{i}.0" in key and f"input_blocks.{i}.0.op" not in key
-        ]
-        attentions = [key for key in input_blocks[i] if f"input_blocks.{i}.1" in key]
-
-        if f"input_blocks.{i}.0.op.weight" in unet_state_dict:
-            new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.weight"] = unet_state_dict.pop(
-                f"input_blocks.{i}.0.op.weight"
-            )
-            new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.bias"] = unet_state_dict.pop(
-                f"input_blocks.{i}.0.op.bias"
-            )
-        elif f"input_blocks.{i}.0.weight" in unet_state_dict:
-            # text_unet uses linear layers in place of downsamplers
-            shape = unet_state_dict[f"input_blocks.{i}.0.weight"].shape
-            if shape[0] != shape[1]:
-                continue
-            new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.weight"] = unet_state_dict.pop(
-                f"input_blocks.{i}.0.weight"
-            )
-            new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.bias"] = unet_state_dict.pop(
-                f"input_blocks.{i}.0.bias"
-            )
-
-        paths = renew_resnet_paths(resnets)
-        meta_path = {"old": f"input_blocks.{i}.0", "new": f"down_blocks.{block_id}.resnets.{layer_in_block_id}"}
-        assign_to_checkpoint(
-            paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
-        )
-
-        if len(attentions):
-            paths = renew_attention_paths(attentions)
-            meta_path = {"old": f"input_blocks.{i}.1", "new": f"down_blocks.{block_id}.attentions.{layer_in_block_id}"}
-            assign_to_checkpoint(
-                paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
-            )
-
-    resnet_0 = middle_blocks[0]
-    attentions = middle_blocks[1]
-    resnet_1 = middle_blocks[2]
-
-    resnet_0_paths = renew_resnet_paths(resnet_0)
-    assign_to_checkpoint(resnet_0_paths, new_checkpoint, unet_state_dict, config=config)
-
-    resnet_1_paths = renew_resnet_paths(resnet_1)
-    assign_to_checkpoint(resnet_1_paths, new_checkpoint, unet_state_dict, config=config)
-
-    attentions_paths = renew_attention_paths(attentions)
-    meta_path = {"old": "middle_block.1", "new": "mid_block.attentions.0"}
-    assign_to_checkpoint(
-        attentions_paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
-    )
-
-    for i in range(num_output_blocks):
-        block_id = i // (config["layers_per_block"] + 1)
-        layer_in_block_id = i % (config["layers_per_block"] + 1)
-        output_block_layers = [shave_segments(name, 2) for name in output_blocks[i]]
-        output_block_list = {}
-
-        for layer in output_block_layers:
-            layer_id, layer_name = layer.split(".")[0], shave_segments(layer, 1)
-            if layer_id in output_block_list:
-                output_block_list[layer_id].append(layer_name)
-            else:
-                output_block_list[layer_id] = [layer_name]
-
-        if len(output_block_list) > 1:
-            resnets = [key for key in output_blocks[i] if f"output_blocks.{i}.0" in key]
-            attentions = [key for key in output_blocks[i] if f"output_blocks.{i}.1" in key]
-
-            paths = renew_resnet_paths(resnets)
-
-            meta_path = {"old": f"output_blocks.{i}.0", "new": f"up_blocks.{block_id}.resnets.{layer_in_block_id}"}
-            assign_to_checkpoint(
-                paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
-            )
-
-            if ["conv.weight", "conv.bias"] in output_block_list.values():
-                index = list(output_block_list.values()).index(["conv.weight", "conv.bias"])
-                new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.weight"] = unet_state_dict[
-                    f"output_blocks.{i}.{index}.conv.weight"
-                ]
-                new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.bias"] = unet_state_dict[
-                    f"output_blocks.{i}.{index}.conv.bias"
-                ]
-                # Clear attentions as they have been attributed above.
-                if len(attentions) == 2:
-                    attentions = []
-            elif f"output_blocks.{i}.1.weight" in unet_state_dict:
-                # text_unet uses linear layers in place of upsamplers
-                shape = unet_state_dict[f"output_blocks.{i}.1.weight"].shape
-                if shape[0] != shape[1]:
-                    continue
-                new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.weight"] = unet_state_dict.pop(
-                    f"output_blocks.{i}.1.weight"
-                )
-                new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.bias"] = unet_state_dict.pop(
-                    f"output_blocks.{i}.1.bias"
-                )
-                # Clear attentions as they have been attributed above.
-                if len(attentions) == 2:
-                    attentions = []
-            elif f"output_blocks.{i}.2.weight" in unet_state_dict:
-                # text_unet uses linear layers in place of upsamplers
-                shape = unet_state_dict[f"output_blocks.{i}.2.weight"].shape
-                if shape[0] != shape[1]:
-                    continue
-                new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.weight"] = unet_state_dict.pop(
-                    f"output_blocks.{i}.2.weight"
-                )
-                new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.bias"] = unet_state_dict.pop(
-                    f"output_blocks.{i}.2.bias"
-                )
-
-            if len(attentions):
-                paths = renew_attention_paths(attentions)
-                meta_path = {
-                    "old": f"output_blocks.{i}.1",
-                    "new": f"up_blocks.{block_id}.attentions.{layer_in_block_id}",
-                }
-                assign_to_checkpoint(
-                    paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
-                )
-        else:
-            resnet_0_paths = renew_resnet_paths(output_block_layers, n_shave_prefix_segments=1)
-            for path in resnet_0_paths:
-                old_path = ".".join(["output_blocks", str(i), path["old"]])
-                new_path = ".".join(["up_blocks", str(block_id), "resnets", str(layer_in_block_id), path["new"]])
-
-                new_checkpoint[new_path] = unet_state_dict[old_path]
-
-    return new_checkpoint
-
-
-def convert_vd_vae_checkpoint(checkpoint, config):
-    # extract state dict for VAE
-    vae_state_dict = {}
-    keys = list(checkpoint.keys())
-    for key in keys:
-        vae_state_dict[key] = checkpoint.get(key)
-
-    new_checkpoint = {}
-
-    new_checkpoint["encoder.conv_in.weight"] = vae_state_dict["encoder.conv_in.weight"]
-    new_checkpoint["encoder.conv_in.bias"] = vae_state_dict["encoder.conv_in.bias"]
-    new_checkpoint["encoder.conv_out.weight"] = vae_state_dict["encoder.conv_out.weight"]
-    new_checkpoint["encoder.conv_out.bias"] = vae_state_dict["encoder.conv_out.bias"]
-    new_checkpoint["encoder.conv_norm_out.weight"] = vae_state_dict["encoder.norm_out.weight"]
-    new_checkpoint["encoder.conv_norm_out.bias"] = vae_state_dict["encoder.norm_out.bias"]
-
-    new_checkpoint["decoder.conv_in.weight"] = vae_state_dict["decoder.conv_in.weight"]
-    new_checkpoint["decoder.conv_in.bias"] = vae_state_dict["decoder.conv_in.bias"]
-    new_checkpoint["decoder.conv_out.weight"] = vae_state_dict["decoder.conv_out.weight"]
-    new_checkpoint["decoder.conv_out.bias"] = vae_state_dict["decoder.conv_out.bias"]
-    new_checkpoint["decoder.conv_norm_out.weight"] = vae_state_dict["decoder.norm_out.weight"]
-    new_checkpoint["decoder.conv_norm_out.bias"] = vae_state_dict["decoder.norm_out.bias"]
-
-    new_checkpoint["quant_conv.weight"] = vae_state_dict["quant_conv.weight"]
-    new_checkpoint["quant_conv.bias"] = vae_state_dict["quant_conv.bias"]
-    new_checkpoint["post_quant_conv.weight"] = vae_state_dict["post_quant_conv.weight"]
-    new_checkpoint["post_quant_conv.bias"] = vae_state_dict["post_quant_conv.bias"]
-
-    # Retrieves the keys for the encoder down blocks only
-    num_down_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "encoder.down" in layer})
-    down_blocks = {
-        layer_id: [key for key in vae_state_dict if f"down.{layer_id}" in key] for layer_id in range(num_down_blocks)
-    }
-
-    # Retrieves the keys for the decoder up blocks only
-    num_up_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "decoder.up" in layer})
-    up_blocks = {
-        layer_id: [key for key in vae_state_dict if f"up.{layer_id}" in key] for layer_id in range(num_up_blocks)
-    }
-
-    for i in range(num_down_blocks):
-        resnets = [key for key in down_blocks[i] if f"down.{i}" in key and f"down.{i}.downsample" not in key]
-
-        if f"encoder.down.{i}.downsample.conv.weight" in vae_state_dict:
-            new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.weight"] = vae_state_dict.pop(
-                f"encoder.down.{i}.downsample.conv.weight"
-            )
-            new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.bias"] = vae_state_dict.pop(
-                f"encoder.down.{i}.downsample.conv.bias"
-            )
-
-        paths = renew_vae_resnet_paths(resnets)
-        meta_path = {"old": f"down.{i}.block", "new": f"down_blocks.{i}.resnets"}
-        assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
-
-    mid_resnets = [key for key in vae_state_dict if "encoder.mid.block" in key]
-    num_mid_res_blocks = 2
-    for i in range(1, num_mid_res_blocks + 1):
-        resnets = [key for key in mid_resnets if f"encoder.mid.block_{i}" in key]
-
-        paths = renew_vae_resnet_paths(resnets)
-        meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"}
-        assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
-
-    mid_attentions = [key for key in vae_state_dict if "encoder.mid.attn" in key]
-    paths = renew_vae_attention_paths(mid_attentions)
-    meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"}
-    assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
-    conv_attn_to_linear(new_checkpoint)
-
-    for i in range(num_up_blocks):
-        block_id = num_up_blocks - 1 - i
-        resnets = [
-            key for key in up_blocks[block_id] if f"up.{block_id}" in key and f"up.{block_id}.upsample" not in key
-        ]
-
-        if f"decoder.up.{block_id}.upsample.conv.weight" in vae_state_dict:
-            new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.weight"] = vae_state_dict[
-                f"decoder.up.{block_id}.upsample.conv.weight"
-            ]
-            new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.bias"] = vae_state_dict[
-                f"decoder.up.{block_id}.upsample.conv.bias"
-            ]
-
-        paths = renew_vae_resnet_paths(resnets)
-        meta_path = {"old": f"up.{block_id}.block", "new": f"up_blocks.{i}.resnets"}
-        assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
-
-    mid_resnets = [key for key in vae_state_dict if "decoder.mid.block" in key]
-    num_mid_res_blocks = 2
-    for i in range(1, num_mid_res_blocks + 1):
-        resnets = [key for key in mid_resnets if f"decoder.mid.block_{i}" in key]
-
-        paths = renew_vae_resnet_paths(resnets)
-        meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"}
-        assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
-
-    mid_attentions = [key for key in vae_state_dict if "decoder.mid.attn" in key]
-    paths = renew_vae_attention_paths(mid_attentions)
-    meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"}
-    assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
-    conv_attn_to_linear(new_checkpoint)
-    return new_checkpoint
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-
-    parser.add_argument(
-        "--unet_checkpoint_path", default=None, type=str, required=False, help="Path to the checkpoint to convert."
-    )
-    parser.add_argument(
-        "--vae_checkpoint_path", default=None, type=str, required=False, help="Path to the checkpoint to convert."
-    )
-    parser.add_argument(
-        "--optimus_checkpoint_path", default=None, type=str, required=False, help="Path to the checkpoint to convert."
-    )
-    parser.add_argument(
-        "--scheduler_type",
-        default="pndm",
-        type=str,
-        help="Type of scheduler to use. Should be one of ['pndm', 'lms', 'ddim', 'euler', 'euler-ancest', 'dpm']",
-    )
-    parser.add_argument(
-        "--extract_ema",
-        action="store_true",
-        help=(
-            "Only relevant for checkpoints that have both EMA and non-EMA weights. Whether to extract the EMA weights"
-            " or not. Defaults to `False`. Add `--extract_ema` to extract the EMA weights. EMA weights usually yield"
-            " higher quality images for inference. Non-EMA weights are usually better to continue fine-tuning."
-        ),
-    )
-    parser.add_argument("--dump_path", default=None, type=str, required=True, help="Path to the output model.")
-
-    args = parser.parse_args()
-
-    scheduler_config = SCHEDULER_CONFIG
-
-    num_train_timesteps = scheduler_config.timesteps
-    beta_start = scheduler_config.beta_linear_start
-    beta_end = scheduler_config.beta_linear_end
-    if args.scheduler_type == "pndm":
-        scheduler = PNDMScheduler(
-            beta_end=beta_end,
-            beta_schedule="scaled_linear",
-            beta_start=beta_start,
-            num_train_timesteps=num_train_timesteps,
-            skip_prk_steps=True,
-            steps_offset=1,
-        )
-    elif args.scheduler_type == "lms":
-        scheduler = LMSDiscreteScheduler(beta_start=beta_start, beta_end=beta_end, beta_schedule="scaled_linear")
-    elif args.scheduler_type == "euler":
-        scheduler = EulerDiscreteScheduler(beta_start=beta_start, beta_end=beta_end, beta_schedule="scaled_linear")
-    elif args.scheduler_type == "euler-ancestral":
-        scheduler = EulerAncestralDiscreteScheduler(
-            beta_start=beta_start, beta_end=beta_end, beta_schedule="scaled_linear"
-        )
-    elif args.scheduler_type == "dpm":
-        scheduler = DPMSolverMultistepScheduler(
-            beta_start=beta_start, beta_end=beta_end, beta_schedule="scaled_linear"
-        )
-    elif args.scheduler_type == "ddim":
-        scheduler = DDIMScheduler(
-            beta_start=beta_start,
-            beta_end=beta_end,
-            beta_schedule="scaled_linear",
-            clip_sample=False,
-            set_alpha_to_one=False,
-            steps_offset=1,
-        )
-    else:
-        raise ValueError(f"Scheduler of type {args.scheduler_type} doesn't exist!")
-
-    # Convert the UNet2DConditionModel models.
-    if args.unet_checkpoint_path is not None:
-        # image UNet
-        image_unet_config = create_image_unet_diffusers_config(IMAGE_UNET_CONFIG)
-        checkpoint = torch.load(args.unet_checkpoint_path)
-        converted_image_unet_checkpoint = convert_vd_unet_checkpoint(
-            checkpoint, image_unet_config, unet_key="model.diffusion_model.unet_image.", extract_ema=args.extract_ema
-        )
-        image_unet = UNet2DConditionModel(**image_unet_config)
-        image_unet.load_state_dict(converted_image_unet_checkpoint)
-
-        # text UNet
-        text_unet_config = create_text_unet_diffusers_config(TEXT_UNET_CONFIG)
-        converted_text_unet_checkpoint = convert_vd_unet_checkpoint(
-            checkpoint, text_unet_config, unet_key="model.diffusion_model.unet_text.", extract_ema=args.extract_ema
-        )
-        text_unet = UNetFlatConditionModel(**text_unet_config)
-        text_unet.load_state_dict(converted_text_unet_checkpoint)
-
-    # Convert the VAE model.
-    if args.vae_checkpoint_path is not None:
-        vae_config = create_vae_diffusers_config(AUTOENCODER_CONFIG)
-        checkpoint = torch.load(args.vae_checkpoint_path)
-        converted_vae_checkpoint = convert_vd_vae_checkpoint(checkpoint, vae_config)
-
-        vae = AutoencoderKL(**vae_config)
-        vae.load_state_dict(converted_vae_checkpoint)
-
-    tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
-    image_feature_extractor = CLIPFeatureExtractor.from_pretrained("openai/clip-vit-large-patch14")
-    text_encoder = CLIPTextModelWithProjection.from_pretrained("openai/clip-vit-large-patch14")
-    image_encoder = CLIPVisionModelWithProjection.from_pretrained("openai/clip-vit-large-patch14")
-
-    pipe = VersatileDiffusionPipeline(
-        scheduler=scheduler,
-        tokenizer=tokenizer,
-        image_feature_extractor=image_feature_extractor,
-        text_encoder=text_encoder,
-        image_encoder=image_encoder,
-        image_unet=image_unet,
-        text_unet=text_unet,
-        vae=vae,
-    )
-    pipe.save_pretrained(args.dump_path)
--- a/scripts/convert_vq_diffusion_to_diffusers.py
+++ b/scripts/convert_vq_diffusion_to_diffusers.py
@@ -1,925 +0,0 @@
-"""
-This script ports models from VQ-diffusion (https://github.com/microsoft/VQ-Diffusion) to diffusers.
-
-It currently only supports porting the ITHQ dataset.
-
-ITHQ dataset:
-```sh
-# From the root directory of diffusers.
-
-# Download the VQVAE checkpoint
-$ wget https://facevcstandard.blob.core.windows.net/v-zhictang/Improved-VQ-Diffusion_model_release/ithq_vqvae.pth?sv=2020-10-02&st=2022-05-30T15%3A17%3A18Z&se=2030-05-31T15%3A17%3A00Z&sr=b&sp=r&sig=1jVavHFPpUjDs%2FTO1V3PTezaNbPp2Nx8MxiWI7y6fEY%3D -O ithq_vqvae.pth
-
-# Download the VQVAE config
-# NOTE that in VQ-diffusion the documented file is `configs/ithq.yaml` but the target class
-# `image_synthesis.modeling.codecs.image_codec.ema_vqvae.PatchVQVAE`
-# loads `OUTPUT/pretrained_model/taming_dvae/config.yaml`
-$ wget https://raw.githubusercontent.com/microsoft/VQ-Diffusion/main/OUTPUT/pretrained_model/taming_dvae/config.yaml -O ithq_vqvae.yaml
-
-# Download the main model checkpoint
-$ wget https://facevcstandard.blob.core.windows.net/v-zhictang/Improved-VQ-Diffusion_model_release/ithq_learnable.pth?sv=2020-10-02&st=2022-05-30T10%3A22%3A06Z&se=2030-05-31T10%3A22%3A00Z&sr=b&sp=r&sig=GOE%2Bza02%2FPnGxYVOOPtwrTR4RA3%2F5NVgMxdW4kjaEZ8%3D -O ithq_learnable.pth
-
-# Download the main model config
-$ wget https://raw.githubusercontent.com/microsoft/VQ-Diffusion/main/configs/ithq.yaml -O ithq.yaml
-
-# run the convert script
-$ python ./scripts/convert_vq_diffusion_to_diffusers.py \
-    --checkpoint_path ./ithq_learnable.pth \
-    --original_config_file ./ithq.yaml \
-    --vqvae_checkpoint_path ./ithq_vqvae.pth \
-    --vqvae_original_config_file ./ithq_vqvae.yaml \
-    --dump_path <path to save pre-trained `VQDiffusionPipeline`>
-```
-"""
-
-import argparse
-import tempfile
-
-import torch
-
-import yaml
-from accelerate import init_empty_weights, load_checkpoint_and_dispatch
-from diffusers import Transformer2DModel, VQDiffusionPipeline, VQDiffusionScheduler, VQModel
-from diffusers.pipelines.vq_diffusion.pipeline_vq_diffusion import LearnedClassifierFreeSamplingEmbeddings
-from transformers import CLIPTextModel, CLIPTokenizer
-from yaml.loader import FullLoader
-
-
-try:
-    from omegaconf import OmegaConf
-except ImportError:
-    raise ImportError(
-        "OmegaConf is required to convert the VQ Diffusion checkpoints. Please install it with `pip install"
-        " OmegaConf`."
-    )
-
-# vqvae model
-
-PORTED_VQVAES = ["image_synthesis.modeling.codecs.image_codec.patch_vqgan.PatchVQGAN"]
-
-
-def vqvae_model_from_original_config(original_config):
-    assert original_config.target in PORTED_VQVAES, f"{original_config.target} has not yet been ported to diffusers."
-
-    original_config = original_config.params
-
-    original_encoder_config = original_config.encoder_config.params
-    original_decoder_config = original_config.decoder_config.params
-
-    in_channels = original_encoder_config.in_channels
-    out_channels = original_decoder_config.out_ch
-
-    down_block_types = get_down_block_types(original_encoder_config)
-    up_block_types = get_up_block_types(original_decoder_config)
-
-    assert original_encoder_config.ch == original_decoder_config.ch
-    assert original_encoder_config.ch_mult == original_decoder_config.ch_mult
-    block_out_channels = tuple(
-        [original_encoder_config.ch * a_ch_mult for a_ch_mult in original_encoder_config.ch_mult]
-    )
-
-    assert original_encoder_config.num_res_blocks == original_decoder_config.num_res_blocks
-    layers_per_block = original_encoder_config.num_res_blocks
-
-    assert original_encoder_config.z_channels == original_decoder_config.z_channels
-    latent_channels = original_encoder_config.z_channels
-
-    num_vq_embeddings = original_config.n_embed
-
-    # Hard coded value for ResnetBlock.GoupNorm(num_groups) in VQ-diffusion
-    norm_num_groups = 32
-
-    e_dim = original_config.embed_dim
-
-    model = VQModel(
-        in_channels=in_channels,
-        out_channels=out_channels,
-        down_block_types=down_block_types,
-        up_block_types=up_block_types,
-        block_out_channels=block_out_channels,
-        layers_per_block=layers_per_block,
-        latent_channels=latent_channels,
-        num_vq_embeddings=num_vq_embeddings,
-        norm_num_groups=norm_num_groups,
-        vq_embed_dim=e_dim,
-    )
-
-    return model
-
-
-def get_down_block_types(original_encoder_config):
-    attn_resolutions = coerce_attn_resolutions(original_encoder_config.attn_resolutions)
-    num_resolutions = len(original_encoder_config.ch_mult)
-    resolution = coerce_resolution(original_encoder_config.resolution)
-
-    curr_res = resolution
-    down_block_types = []
-
-    for _ in range(num_resolutions):
-        if curr_res in attn_resolutions:
-            down_block_type = "AttnDownEncoderBlock2D"
-        else:
-            down_block_type = "DownEncoderBlock2D"
-
-        down_block_types.append(down_block_type)
-
-        curr_res = [r // 2 for r in curr_res]
-
-    return down_block_types
-
-
-def get_up_block_types(original_decoder_config):
-    attn_resolutions = coerce_attn_resolutions(original_decoder_config.attn_resolutions)
-    num_resolutions = len(original_decoder_config.ch_mult)
-    resolution = coerce_resolution(original_decoder_config.resolution)
-
-    curr_res = [r // 2 ** (num_resolutions - 1) for r in resolution]
-    up_block_types = []
-
-    for _ in reversed(range(num_resolutions)):
-        if curr_res in attn_resolutions:
-            up_block_type = "AttnUpDecoderBlock2D"
-        else:
-            up_block_type = "UpDecoderBlock2D"
-
-        up_block_types.append(up_block_type)
-
-        curr_res = [r * 2 for r in curr_res]
-
-    return up_block_types
-
-
-def coerce_attn_resolutions(attn_resolutions):
-    attn_resolutions = OmegaConf.to_object(attn_resolutions)
-    attn_resolutions_ = []
-    for ar in attn_resolutions:
-        if isinstance(ar, (list, tuple)):
-            attn_resolutions_.append(list(ar))
-        else:
-            attn_resolutions_.append([ar, ar])
-    return attn_resolutions_
-
-
-def coerce_resolution(resolution):
-    resolution = OmegaConf.to_object(resolution)
-    if isinstance(resolution, int):
-        resolution = [resolution, resolution]  # H, W
-    elif isinstance(resolution, (tuple, list)):
-        resolution = list(resolution)
-    else:
-        raise ValueError("Unknown type of resolution:", resolution)
-    return resolution
-
-
-# done vqvae model
-
-# vqvae checkpoint
-
-
-def vqvae_original_checkpoint_to_diffusers_checkpoint(model, checkpoint):
-    diffusers_checkpoint = {}
-
-    diffusers_checkpoint.update(vqvae_encoder_to_diffusers_checkpoint(model, checkpoint))
-
-    # quant_conv
-
-    diffusers_checkpoint.update(
-        {
-            "quant_conv.weight": checkpoint["quant_conv.weight"],
-            "quant_conv.bias": checkpoint["quant_conv.bias"],
-        }
-    )
-
-    # quantize
-    diffusers_checkpoint.update({"quantize.embedding.weight": checkpoint["quantize.embedding"]})
-
-    # post_quant_conv
-    diffusers_checkpoint.update(
-        {
-            "post_quant_conv.weight": checkpoint["post_quant_conv.weight"],
-            "post_quant_conv.bias": checkpoint["post_quant_conv.bias"],
-        }
-    )
-
-    # decoder
-    diffusers_checkpoint.update(vqvae_decoder_to_diffusers_checkpoint(model, checkpoint))
-
-    return diffusers_checkpoint
-
-
-def vqvae_encoder_to_diffusers_checkpoint(model, checkpoint):
-    diffusers_checkpoint = {}
-
-    # conv_in
-    diffusers_checkpoint.update(
-        {
-            "encoder.conv_in.weight": checkpoint["encoder.conv_in.weight"],
-            "encoder.conv_in.bias": checkpoint["encoder.conv_in.bias"],
-        }
-    )
-
-    # down_blocks
-    for down_block_idx, down_block in enumerate(model.encoder.down_blocks):
-        diffusers_down_block_prefix = f"encoder.down_blocks.{down_block_idx}"
-        down_block_prefix = f"encoder.down.{down_block_idx}"
-
-        # resnets
-        for resnet_idx, resnet in enumerate(down_block.resnets):
-            diffusers_resnet_prefix = f"{diffusers_down_block_prefix}.resnets.{resnet_idx}"
-            resnet_prefix = f"{down_block_prefix}.block.{resnet_idx}"
-
-            diffusers_checkpoint.update(
-                vqvae_resnet_to_diffusers_checkpoint(
-                    resnet, checkpoint, diffusers_resnet_prefix=diffusers_resnet_prefix, resnet_prefix=resnet_prefix
-                )
-            )
-
-        # downsample
-
-        # do not include the downsample when on the last down block
-        # There is no downsample on the last down block
-        if down_block_idx != len(model.encoder.down_blocks) - 1:
-            # There's a single downsample in the original checkpoint but a list of downsamples
-            # in the diffusers model.
-            diffusers_downsample_prefix = f"{diffusers_down_block_prefix}.downsamplers.0.conv"
-            downsample_prefix = f"{down_block_prefix}.downsample.conv"
-            diffusers_checkpoint.update(
-                {
-                    f"{diffusers_downsample_prefix}.weight": checkpoint[f"{downsample_prefix}.weight"],
-                    f"{diffusers_downsample_prefix}.bias": checkpoint[f"{downsample_prefix}.bias"],
-                }
-            )
-
-        # attentions
-
-        if hasattr(down_block, "attentions"):
-            for attention_idx, _ in enumerate(down_block.attentions):
-                diffusers_attention_prefix = f"{diffusers_down_block_prefix}.attentions.{attention_idx}"
-                attention_prefix = f"{down_block_prefix}.attn.{attention_idx}"
-                diffusers_checkpoint.update(
-                    vqvae_attention_to_diffusers_checkpoint(
-                        checkpoint,
-                        diffusers_attention_prefix=diffusers_attention_prefix,
-                        attention_prefix=attention_prefix,
-                    )
-                )
-
-    # mid block
-
-    # mid block attentions
-
-    # There is a single hardcoded attention block in the middle of the VQ-diffusion encoder
-    diffusers_attention_prefix = "encoder.mid_block.attentions.0"
-    attention_prefix = "encoder.mid.attn_1"
-    diffusers_checkpoint.update(
-        vqvae_attention_to_diffusers_checkpoint(
-            checkpoint, diffusers_attention_prefix=diffusers_attention_prefix, attention_prefix=attention_prefix
-        )
-    )
-
-    # mid block resnets
-
-    for diffusers_resnet_idx, resnet in enumerate(model.encoder.mid_block.resnets):
-        diffusers_resnet_prefix = f"encoder.mid_block.resnets.{diffusers_resnet_idx}"
-
-        # the hardcoded prefixes to `block_` are 1 and 2
-        orig_resnet_idx = diffusers_resnet_idx + 1
-        # There are two hardcoded resnets in the middle of the VQ-diffusion encoder
-        resnet_prefix = f"encoder.mid.block_{orig_resnet_idx}"
-
-        diffusers_checkpoint.update(
-            vqvae_resnet_to_diffusers_checkpoint(
-                resnet, checkpoint, diffusers_resnet_prefix=diffusers_resnet_prefix, resnet_prefix=resnet_prefix
-            )
-        )
-
-    diffusers_checkpoint.update(
-        {
-            # conv_norm_out
-            "encoder.conv_norm_out.weight": checkpoint["encoder.norm_out.weight"],
-            "encoder.conv_norm_out.bias": checkpoint["encoder.norm_out.bias"],
-            # conv_out
-            "encoder.conv_out.weight": checkpoint["encoder.conv_out.weight"],
-            "encoder.conv_out.bias": checkpoint["encoder.conv_out.bias"],
-        }
-    )
-
-    return diffusers_checkpoint
-
-
-def vqvae_decoder_to_diffusers_checkpoint(model, checkpoint):
-    diffusers_checkpoint = {}
-
-    # conv in
-    diffusers_checkpoint.update(
-        {
-            "decoder.conv_in.weight": checkpoint["decoder.conv_in.weight"],
-            "decoder.conv_in.bias": checkpoint["decoder.conv_in.bias"],
-        }
-    )
-
-    # up_blocks
-
-    for diffusers_up_block_idx, up_block in enumerate(model.decoder.up_blocks):
-        # up_blocks are stored in reverse order in the VQ-diffusion checkpoint
-        orig_up_block_idx = len(model.decoder.up_blocks) - 1 - diffusers_up_block_idx
-
-        diffusers_up_block_prefix = f"decoder.up_blocks.{diffusers_up_block_idx}"
-        up_block_prefix = f"decoder.up.{orig_up_block_idx}"
-
-        # resnets
-        for resnet_idx, resnet in enumerate(up_block.resnets):
-            diffusers_resnet_prefix = f"{diffusers_up_block_prefix}.resnets.{resnet_idx}"
-            resnet_prefix = f"{up_block_prefix}.block.{resnet_idx}"
-
-            diffusers_checkpoint.update(
-                vqvae_resnet_to_diffusers_checkpoint(
-                    resnet, checkpoint, diffusers_resnet_prefix=diffusers_resnet_prefix, resnet_prefix=resnet_prefix
-                )
-            )
-
-        # upsample
-
-        # there is no up sample on the last up block
-        if diffusers_up_block_idx != len(model.decoder.up_blocks) - 1:
-            # There's a single upsample in the VQ-diffusion checkpoint but a list of downsamples
-            # in the diffusers model.
-            diffusers_downsample_prefix = f"{diffusers_up_block_prefix}.upsamplers.0.conv"
-            downsample_prefix = f"{up_block_prefix}.upsample.conv"
-            diffusers_checkpoint.update(
-                {
-                    f"{diffusers_downsample_prefix}.weight": checkpoint[f"{downsample_prefix}.weight"],
-                    f"{diffusers_downsample_prefix}.bias": checkpoint[f"{downsample_prefix}.bias"],
-                }
-            )
-
-        # attentions
-
-        if hasattr(up_block, "attentions"):
-            for attention_idx, _ in enumerate(up_block.attentions):
-                diffusers_attention_prefix = f"{diffusers_up_block_prefix}.attentions.{attention_idx}"
-                attention_prefix = f"{up_block_prefix}.attn.{attention_idx}"
-                diffusers_checkpoint.update(
-                    vqvae_attention_to_diffusers_checkpoint(
-                        checkpoint,
-                        diffusers_attention_prefix=diffusers_attention_prefix,
-                        attention_prefix=attention_prefix,
-                    )
-                )
-
-    # mid block
-
-    # mid block attentions
-
-    # There is a single hardcoded attention block in the middle of the VQ-diffusion decoder
-    diffusers_attention_prefix = "decoder.mid_block.attentions.0"
-    attention_prefix = "decoder.mid.attn_1"
-    diffusers_checkpoint.update(
-        vqvae_attention_to_diffusers_checkpoint(
-            checkpoint, diffusers_attention_prefix=diffusers_attention_prefix, attention_prefix=attention_prefix
-        )
-    )
-
-    # mid block resnets
-
-    for diffusers_resnet_idx, resnet in enumerate(model.encoder.mid_block.resnets):
-        diffusers_resnet_prefix = f"decoder.mid_block.resnets.{diffusers_resnet_idx}"
-
-        # the hardcoded prefixes to `block_` are 1 and 2
-        orig_resnet_idx = diffusers_resnet_idx + 1
-        # There are two hardcoded resnets in the middle of the VQ-diffusion decoder
-        resnet_prefix = f"decoder.mid.block_{orig_resnet_idx}"
-
-        diffusers_checkpoint.update(
-            vqvae_resnet_to_diffusers_checkpoint(
-                resnet, checkpoint, diffusers_resnet_prefix=diffusers_resnet_prefix, resnet_prefix=resnet_prefix
-            )
-        )
-
-    diffusers_checkpoint.update(
-        {
-            # conv_norm_out
-            "decoder.conv_norm_out.weight": checkpoint["decoder.norm_out.weight"],
-            "decoder.conv_norm_out.bias": checkpoint["decoder.norm_out.bias"],
-            # conv_out
-            "decoder.conv_out.weight": checkpoint["decoder.conv_out.weight"],
-            "decoder.conv_out.bias": checkpoint["decoder.conv_out.bias"],
-        }
-    )
-
-    return diffusers_checkpoint
-
-
-def vqvae_resnet_to_diffusers_checkpoint(resnet, checkpoint, *, diffusers_resnet_prefix, resnet_prefix):
-    rv = {
-        # norm1
-        f"{diffusers_resnet_prefix}.norm1.weight": checkpoint[f"{resnet_prefix}.norm1.weight"],
-        f"{diffusers_resnet_prefix}.norm1.bias": checkpoint[f"{resnet_prefix}.norm1.bias"],
-        # conv1
-        f"{diffusers_resnet_prefix}.conv1.weight": checkpoint[f"{resnet_prefix}.conv1.weight"],
-        f"{diffusers_resnet_prefix}.conv1.bias": checkpoint[f"{resnet_prefix}.conv1.bias"],
-        # norm2
-        f"{diffusers_resnet_prefix}.norm2.weight": checkpoint[f"{resnet_prefix}.norm2.weight"],
-        f"{diffusers_resnet_prefix}.norm2.bias": checkpoint[f"{resnet_prefix}.norm2.bias"],
-        # conv2
-        f"{diffusers_resnet_prefix}.conv2.weight": checkpoint[f"{resnet_prefix}.conv2.weight"],
-        f"{diffusers_resnet_prefix}.conv2.bias": checkpoint[f"{resnet_prefix}.conv2.bias"],
-    }
-
-    if resnet.conv_shortcut is not None:
-        rv.update(
-            {
-                f"{diffusers_resnet_prefix}.conv_shortcut.weight": checkpoint[f"{resnet_prefix}.nin_shortcut.weight"],
-                f"{diffusers_resnet_prefix}.conv_shortcut.bias": checkpoint[f"{resnet_prefix}.nin_shortcut.bias"],
-            }
-        )
-
-    return rv
-
-
-def vqvae_attention_to_diffusers_checkpoint(checkpoint, *, diffusers_attention_prefix, attention_prefix):
-    return {
-        # group_norm
-        f"{diffusers_attention_prefix}.group_norm.weight": checkpoint[f"{attention_prefix}.norm.weight"],
-        f"{diffusers_attention_prefix}.group_norm.bias": checkpoint[f"{attention_prefix}.norm.bias"],
-        # query
-        f"{diffusers_attention_prefix}.query.weight": checkpoint[f"{attention_prefix}.q.weight"][:, :, 0, 0],
-        f"{diffusers_attention_prefix}.query.bias": checkpoint[f"{attention_prefix}.q.bias"],
-        # key
-        f"{diffusers_attention_prefix}.key.weight": checkpoint[f"{attention_prefix}.k.weight"][:, :, 0, 0],
-        f"{diffusers_attention_prefix}.key.bias": checkpoint[f"{attention_prefix}.k.bias"],
-        # value
-        f"{diffusers_attention_prefix}.value.weight": checkpoint[f"{attention_prefix}.v.weight"][:, :, 0, 0],
-        f"{diffusers_attention_prefix}.value.bias": checkpoint[f"{attention_prefix}.v.bias"],
-        # proj_attn
-        f"{diffusers_attention_prefix}.proj_attn.weight": checkpoint[f"{attention_prefix}.proj_out.weight"][
-            :, :, 0, 0
-        ],
-        f"{diffusers_attention_prefix}.proj_attn.bias": checkpoint[f"{attention_prefix}.proj_out.bias"],
-    }
-
-
-# done vqvae checkpoint
-
-# transformer model
-
-PORTED_DIFFUSIONS = ["image_synthesis.modeling.transformers.diffusion_transformer.DiffusionTransformer"]
-PORTED_TRANSFORMERS = ["image_synthesis.modeling.transformers.transformer_utils.Text2ImageTransformer"]
-PORTED_CONTENT_EMBEDDINGS = ["image_synthesis.modeling.embeddings.dalle_mask_image_embedding.DalleMaskImageEmbedding"]
-
-
-def transformer_model_from_original_config(
-    original_diffusion_config, original_transformer_config, original_content_embedding_config
-):
-    assert (
-        original_diffusion_config.target in PORTED_DIFFUSIONS
-    ), f"{original_diffusion_config.target} has not yet been ported to diffusers."
-    assert (
-        original_transformer_config.target in PORTED_TRANSFORMERS
-    ), f"{original_transformer_config.target} has not yet been ported to diffusers."
-    assert (
-        original_content_embedding_config.target in PORTED_CONTENT_EMBEDDINGS
-    ), f"{original_content_embedding_config.target} has not yet been ported to diffusers."
-
-    original_diffusion_config = original_diffusion_config.params
-    original_transformer_config = original_transformer_config.params
-    original_content_embedding_config = original_content_embedding_config.params
-
-    inner_dim = original_transformer_config["n_embd"]
-
-    n_heads = original_transformer_config["n_head"]
-
-    # VQ-Diffusion gives dimension of the multi-headed attention layers as the
-    # number of attention heads times the sequence length (the dimension) of a
-    # single head. We want to specify our attention blocks with those values
-    # specified separately
-    assert inner_dim % n_heads == 0
-    d_head = inner_dim // n_heads
-
-    depth = original_transformer_config["n_layer"]
-    context_dim = original_transformer_config["condition_dim"]
-
-    num_embed = original_content_embedding_config["num_embed"]
-    # the number of embeddings in the transformer includes the mask embedding.
-    # the content embedding (the vqvae) does not include the mask embedding.
-    num_embed = num_embed + 1
-
-    height = original_transformer_config["content_spatial_size"][0]
-    width = original_transformer_config["content_spatial_size"][1]
-
-    assert width == height, "width has to be equal to height"
-    dropout = original_transformer_config["resid_pdrop"]
-    num_embeds_ada_norm = original_diffusion_config["diffusion_step"]
-
-    model_kwargs = {
-        "attention_bias": True,
-        "cross_attention_dim": context_dim,
-        "attention_head_dim": d_head,
-        "num_layers": depth,
-        "dropout": dropout,
-        "num_attention_heads": n_heads,
-        "num_vector_embeds": num_embed,
-        "num_embeds_ada_norm": num_embeds_ada_norm,
-        "norm_num_groups": 32,
-        "sample_size": width,
-        "activation_fn": "geglu-approximate",
-    }
-
-    model = Transformer2DModel(**model_kwargs)
-    return model
-
-
-# done transformer model
-
-# transformer checkpoint
-
-
-def transformer_original_checkpoint_to_diffusers_checkpoint(model, checkpoint):
-    diffusers_checkpoint = {}
-
-    transformer_prefix = "transformer.transformer"
-
-    diffusers_latent_image_embedding_prefix = "latent_image_embedding"
-    latent_image_embedding_prefix = f"{transformer_prefix}.content_emb"
-
-    # DalleMaskImageEmbedding
-    diffusers_checkpoint.update(
-        {
-            f"{diffusers_latent_image_embedding_prefix}.emb.weight": checkpoint[
-                f"{latent_image_embedding_prefix}.emb.weight"
-            ],
-            f"{diffusers_latent_image_embedding_prefix}.height_emb.weight": checkpoint[
-                f"{latent_image_embedding_prefix}.height_emb.weight"
-            ],
-            f"{diffusers_latent_image_embedding_prefix}.width_emb.weight": checkpoint[
-                f"{latent_image_embedding_prefix}.width_emb.weight"
-            ],
-        }
-    )
-
-    # transformer blocks
-    for transformer_block_idx, transformer_block in enumerate(model.transformer_blocks):
-        diffusers_transformer_block_prefix = f"transformer_blocks.{transformer_block_idx}"
-        transformer_block_prefix = f"{transformer_prefix}.blocks.{transformer_block_idx}"
-
-        # ada norm block
-        diffusers_ada_norm_prefix = f"{diffusers_transformer_block_prefix}.norm1"
-        ada_norm_prefix = f"{transformer_block_prefix}.ln1"
-
-        diffusers_checkpoint.update(
-            transformer_ada_norm_to_diffusers_checkpoint(
-                checkpoint, diffusers_ada_norm_prefix=diffusers_ada_norm_prefix, ada_norm_prefix=ada_norm_prefix
-            )
-        )
-
-        # attention block
-        diffusers_attention_prefix = f"{diffusers_transformer_block_prefix}.attn1"
-        attention_prefix = f"{transformer_block_prefix}.attn1"
-
-        diffusers_checkpoint.update(
-            transformer_attention_to_diffusers_checkpoint(
-                checkpoint, diffusers_attention_prefix=diffusers_attention_prefix, attention_prefix=attention_prefix
-            )
-        )
-
-        # ada norm block
-        diffusers_ada_norm_prefix = f"{diffusers_transformer_block_prefix}.norm2"
-        ada_norm_prefix = f"{transformer_block_prefix}.ln1_1"
-
-        diffusers_checkpoint.update(
-            transformer_ada_norm_to_diffusers_checkpoint(
-                checkpoint, diffusers_ada_norm_prefix=diffusers_ada_norm_prefix, ada_norm_prefix=ada_norm_prefix
-            )
-        )
-
-        # attention block
-        diffusers_attention_prefix = f"{diffusers_transformer_block_prefix}.attn2"
-        attention_prefix = f"{transformer_block_prefix}.attn2"
-
-        diffusers_checkpoint.update(
-            transformer_attention_to_diffusers_checkpoint(
-                checkpoint, diffusers_attention_prefix=diffusers_attention_prefix, attention_prefix=attention_prefix
-            )
-        )
-
-        # norm block
-        diffusers_norm_block_prefix = f"{diffusers_transformer_block_prefix}.norm3"
-        norm_block_prefix = f"{transformer_block_prefix}.ln2"
-
-        diffusers_checkpoint.update(
-            {
-                f"{diffusers_norm_block_prefix}.weight": checkpoint[f"{norm_block_prefix}.weight"],
-                f"{diffusers_norm_block_prefix}.bias": checkpoint[f"{norm_block_prefix}.bias"],
-            }
-        )
-
-        # feedforward block
-        diffusers_feedforward_prefix = f"{diffusers_transformer_block_prefix}.ff"
-        feedforward_prefix = f"{transformer_block_prefix}.mlp"
-
-        diffusers_checkpoint.update(
-            transformer_feedforward_to_diffusers_checkpoint(
-                checkpoint,
-                diffusers_feedforward_prefix=diffusers_feedforward_prefix,
-                feedforward_prefix=feedforward_prefix,
-            )
-        )
-
-    # to logits
-
-    diffusers_norm_out_prefix = "norm_out"
-    norm_out_prefix = f"{transformer_prefix}.to_logits.0"
-
-    diffusers_checkpoint.update(
-        {
-            f"{diffusers_norm_out_prefix}.weight": checkpoint[f"{norm_out_prefix}.weight"],
-            f"{diffusers_norm_out_prefix}.bias": checkpoint[f"{norm_out_prefix}.bias"],
-        }
-    )
-
-    diffusers_out_prefix = "out"
-    out_prefix = f"{transformer_prefix}.to_logits.1"
-
-    diffusers_checkpoint.update(
-        {
-            f"{diffusers_out_prefix}.weight": checkpoint[f"{out_prefix}.weight"],
-            f"{diffusers_out_prefix}.bias": checkpoint[f"{out_prefix}.bias"],
-        }
-    )
-
-    return diffusers_checkpoint
-
-
-def transformer_ada_norm_to_diffusers_checkpoint(checkpoint, *, diffusers_ada_norm_prefix, ada_norm_prefix):
-    return {
-        f"{diffusers_ada_norm_prefix}.emb.weight": checkpoint[f"{ada_norm_prefix}.emb.weight"],
-        f"{diffusers_ada_norm_prefix}.linear.weight": checkpoint[f"{ada_norm_prefix}.linear.weight"],
-        f"{diffusers_ada_norm_prefix}.linear.bias": checkpoint[f"{ada_norm_prefix}.linear.bias"],
-    }
-
-
-def transformer_attention_to_diffusers_checkpoint(checkpoint, *, diffusers_attention_prefix, attention_prefix):
-    return {
-        # key
-        f"{diffusers_attention_prefix}.to_k.weight": checkpoint[f"{attention_prefix}.key.weight"],
-        f"{diffusers_attention_prefix}.to_k.bias": checkpoint[f"{attention_prefix}.key.bias"],
-        # query
-        f"{diffusers_attention_prefix}.to_q.weight": checkpoint[f"{attention_prefix}.query.weight"],
-        f"{diffusers_attention_prefix}.to_q.bias": checkpoint[f"{attention_prefix}.query.bias"],
-        # value
-        f"{diffusers_attention_prefix}.to_v.weight": checkpoint[f"{attention_prefix}.value.weight"],
-        f"{diffusers_attention_prefix}.to_v.bias": checkpoint[f"{attention_prefix}.value.bias"],
-        # linear out
-        f"{diffusers_attention_prefix}.to_out.0.weight": checkpoint[f"{attention_prefix}.proj.weight"],
-        f"{diffusers_attention_prefix}.to_out.0.bias": checkpoint[f"{attention_prefix}.proj.bias"],
-    }
-
-
-def transformer_feedforward_to_diffusers_checkpoint(checkpoint, *, diffusers_feedforward_prefix, feedforward_prefix):
-    return {
-        f"{diffusers_feedforward_prefix}.net.0.proj.weight": checkpoint[f"{feedforward_prefix}.0.weight"],
-        f"{diffusers_feedforward_prefix}.net.0.proj.bias": checkpoint[f"{feedforward_prefix}.0.bias"],
-        f"{diffusers_feedforward_prefix}.net.2.weight": checkpoint[f"{feedforward_prefix}.2.weight"],
-        f"{diffusers_feedforward_prefix}.net.2.bias": checkpoint[f"{feedforward_prefix}.2.bias"],
-    }
-
-
-# done transformer checkpoint
-
-
-def read_config_file(filename):
-    # The yaml file contains annotations that certain values should
-    # loaded as tuples. By default, OmegaConf will panic when reading
-    # these. Instead, we can manually read the yaml with the FullLoader and then
-    # construct the OmegaConf object.
-    with open(filename) as f:
-        original_config = yaml.load(f, FullLoader)
-
-    return OmegaConf.create(original_config)
-
-
-# We take separate arguments for the vqvae because the ITHQ vqvae config file
-# is separate from the config file for the rest of the model.
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-
-    parser.add_argument(
-        "--vqvae_checkpoint_path",
-        default=None,
-        type=str,
-        required=True,
-        help="Path to the vqvae checkpoint to convert.",
-    )
-
-    parser.add_argument(
-        "--vqvae_original_config_file",
-        default=None,
-        type=str,
-        required=True,
-        help="The YAML config file corresponding to the original architecture for the vqvae.",
-    )
-
-    parser.add_argument(
-        "--checkpoint_path", default=None, type=str, required=True, help="Path to the checkpoint to convert."
-    )
-
-    parser.add_argument(
-        "--original_config_file",
-        default=None,
-        type=str,
-        required=True,
-        help="The YAML config file corresponding to the original architecture.",
-    )
-
-    parser.add_argument("--dump_path", default=None, type=str, required=True, help="Path to the output model.")
-
-    parser.add_argument(
-        "--checkpoint_load_device",
-        default="cpu",
-        type=str,
-        required=False,
-        help="The device passed to `map_location` when loading checkpoints.",
-    )
-
-    # See link for how ema weights are always selected
-    # https://github.com/microsoft/VQ-Diffusion/blob/3c98e77f721db7c787b76304fa2c96a36c7b00af/inference_VQ_Diffusion.py#L65
-    parser.add_argument(
-        "--no_use_ema",
-        action="store_true",
-        required=False,
-        help=(
-            "Set to not use the ema weights from the original VQ-Diffusion checkpoint. You probably do not want to set"
-            " it as the original VQ-Diffusion always uses the ema weights when loading models."
-        ),
-    )
-
-    args = parser.parse_args()
-
-    use_ema = not args.no_use_ema
-
-    print(f"loading checkpoints to {args.checkpoint_load_device}")
-
-    checkpoint_map_location = torch.device(args.checkpoint_load_device)
-
-    # vqvae_model
-
-    print(f"loading vqvae, config: {args.vqvae_original_config_file}, checkpoint: {args.vqvae_checkpoint_path}")
-
-    vqvae_original_config = read_config_file(args.vqvae_original_config_file).model
-    vqvae_checkpoint = torch.load(args.vqvae_checkpoint_path, map_location=checkpoint_map_location)["model"]
-
-    with init_empty_weights():
-        vqvae_model = vqvae_model_from_original_config(vqvae_original_config)
-
-    vqvae_diffusers_checkpoint = vqvae_original_checkpoint_to_diffusers_checkpoint(vqvae_model, vqvae_checkpoint)
-
-    with tempfile.NamedTemporaryFile() as vqvae_diffusers_checkpoint_file:
-        torch.save(vqvae_diffusers_checkpoint, vqvae_diffusers_checkpoint_file.name)
-        del vqvae_diffusers_checkpoint
-        del vqvae_checkpoint
-        load_checkpoint_and_dispatch(vqvae_model, vqvae_diffusers_checkpoint_file.name, device_map="auto")
-
-    print("done loading vqvae")
-
-    # done vqvae_model
-
-    # transformer_model
-
-    print(
-        f"loading transformer, config: {args.original_config_file}, checkpoint: {args.checkpoint_path}, use ema:"
-        f" {use_ema}"
-    )
-
-    original_config = read_config_file(args.original_config_file).model
-
-    diffusion_config = original_config.params.diffusion_config
-    transformer_config = original_config.params.diffusion_config.params.transformer_config
-    content_embedding_config = original_config.params.diffusion_config.params.content_emb_config
-
-    pre_checkpoint = torch.load(args.checkpoint_path, map_location=checkpoint_map_location)
-
-    if use_ema:
-        if "ema" in pre_checkpoint:
-            checkpoint = {}
-            for k, v in pre_checkpoint["model"].items():
-                checkpoint[k] = v
-
-            for k, v in pre_checkpoint["ema"].items():
-                # The ema weights are only used on the transformer. To mimic their key as if they came
-                # from the state_dict for the top level model, we prefix with an additional "transformer."
-                # See the source linked in the args.use_ema config for more information.
-                checkpoint[f"transformer.{k}"] = v
-        else:
-            print("attempted to load ema weights but no ema weights are specified in the loaded checkpoint.")
-            checkpoint = pre_checkpoint["model"]
-    else:
-        checkpoint = pre_checkpoint["model"]
-
-    del pre_checkpoint
-
-    with init_empty_weights():
-        transformer_model = transformer_model_from_original_config(
-            diffusion_config, transformer_config, content_embedding_config
-        )
-
-    diffusers_transformer_checkpoint = transformer_original_checkpoint_to_diffusers_checkpoint(
-        transformer_model, checkpoint
-    )
-
-    # classifier free sampling embeddings interlude
-
-    # The learned embeddings are stored on the transformer in the original VQ-diffusion. We store them on a separate
-    # model, so we pull them off the checkpoint before the checkpoint is deleted.
-
-    learnable_classifier_free_sampling_embeddings = diffusion_config.params.learnable_cf
-
-    if learnable_classifier_free_sampling_embeddings:
-        learned_classifier_free_sampling_embeddings_embeddings = checkpoint["transformer.empty_text_embed"]
-    else:
-        learned_classifier_free_sampling_embeddings_embeddings = None
-
-    # done classifier free sampling embeddings interlude
-
-    with tempfile.NamedTemporaryFile() as diffusers_transformer_checkpoint_file:
-        torch.save(diffusers_transformer_checkpoint, diffusers_transformer_checkpoint_file.name)
-        del diffusers_transformer_checkpoint
-        del checkpoint
-        load_checkpoint_and_dispatch(transformer_model, diffusers_transformer_checkpoint_file.name, device_map="auto")
-
-    print("done loading transformer")
-
-    # done transformer_model
-
-    # text encoder
-
-    print("loading CLIP text encoder")
-
-    clip_name = "openai/clip-vit-base-patch32"
-
-    # The original VQ-Diffusion specifies the pad value by the int used in the
-    # returned tokens. Each model uses `0` as the pad value. The transformers clip api
-    # specifies the pad value via the token before it has been tokenized. The `!` pad
-    # token is the same as padding with the `0` pad value.
-    pad_token = "!"
-
-    tokenizer_model = CLIPTokenizer.from_pretrained(clip_name, pad_token=pad_token, device_map="auto")
-
-    assert tokenizer_model.convert_tokens_to_ids(pad_token) == 0
-
-    text_encoder_model = CLIPTextModel.from_pretrained(
-        clip_name,
-        # `CLIPTextModel` does not support device_map="auto"
-        # device_map="auto"
-    )
-
-    print("done loading CLIP text encoder")
-
-    # done text encoder
-
-    # scheduler
-
-    scheduler_model = VQDiffusionScheduler(
-        # the scheduler has the same number of embeddings as the transformer
-        num_vec_classes=transformer_model.num_vector_embeds
-    )
-
-    # done scheduler
-
-    # learned classifier free sampling embeddings
-
-    with init_empty_weights():
-        learned_classifier_free_sampling_embeddings_model = LearnedClassifierFreeSamplingEmbeddings(
-            learnable_classifier_free_sampling_embeddings,
-            hidden_size=text_encoder_model.config.hidden_size,
-            length=tokenizer_model.model_max_length,
-        )
-
-    learned_classifier_free_sampling_checkpoint = {
-        "embeddings": learned_classifier_free_sampling_embeddings_embeddings.float()
-    }
-
-    with tempfile.NamedTemporaryFile() as learned_classifier_free_sampling_checkpoint_file:
-        torch.save(learned_classifier_free_sampling_checkpoint, learned_classifier_free_sampling_checkpoint_file.name)
-        del learned_classifier_free_sampling_checkpoint
-        del learned_classifier_free_sampling_embeddings_embeddings
-        load_checkpoint_and_dispatch(
-            learned_classifier_free_sampling_embeddings_model,
-            learned_classifier_free_sampling_checkpoint_file.name,
-            device_map="auto",
-        )
-
-    # done learned classifier free sampling embeddings
-
-    print(f"saving VQ diffusion model, path: {args.dump_path}")
-
-    pipe = VQDiffusionPipeline(
-        vqvae=vqvae_model,
-        transformer=transformer_model,
-        tokenizer=tokenizer_model,
-        text_encoder=text_encoder_model,
-        learned_classifier_free_sampling_embeddings=learned_classifier_free_sampling_embeddings_model,
-        scheduler=scheduler_model,
-    )
-    pipe.save_pretrained(args.dump_path)
-
-    print("done writing VQ diffusion model")
--- a/setup.py
+++ b/setup.py
@@ -78,7 +78,7 @@ from setuptools import find_packages, setup
 # 1. all dependencies should be listed here with their version requirements if any
 # 2. once modified, run: `make deps_table_update` to update src/diffusers/dependency_versions_table.py
 _deps = [
-    "Pillow",  # keep the PIL.Image.Resampling deprecation away
+    "Pillow<10.0",  # keep the PIL.Image.Resampling deprecation away
    "accelerate>=0.11.0",
    "black==22.8",
    "datasets",
@@ -89,18 +89,15 @@ _deps = [
    "huggingface-hub>=0.10.0",
    "importlib_metadata",
    "isort>=5.5.4",
-    "jax>=0.2.8,!=0.3.2",
-    "jaxlib>=0.1.65",
-    "k-diffusion",
-    "librosa",
+    "jax>=0.2.8,!=0.3.2,<=0.3.6",
+    "jaxlib>=0.1.65,<=0.3.6",
    "modelcards>=0.1.4",
    "numpy",
+    "onnxruntime",
    "parameterized",
    "pytest",
    "pytest-timeout",
    "pytest-xdist",
-    "safetensors",
-    "sentencepiece>=0.1.91,!=0.1.92",
    "scipy",
    "regex!=2019.12.17",
    "requests",
@@ -182,20 +179,18 @@ extras["quality"] = deps_list("black", "isort", "flake8", "hf-doc-builder")
 extras["docs"] = deps_list("hf-doc-builder")
 extras["training"] = deps_list("accelerate", "datasets", "tensorboard", "modelcards")
 extras["test"] = deps_list(
+    "accelerate",
    "datasets",
-    "k-diffusion",
-    "librosa",
+    "onnxruntime",
    "parameterized",
    "pytest",
    "pytest-timeout",
    "pytest-xdist",
-    "safetensors",
-    "sentencepiece",
    "scipy",
    "torchvision",
-    "transformers",
+    "transformers"
 )
-extras["torch"] = deps_list("torch", "accelerate")
+extras["torch"] = deps_list("torch")

 if os.name == "nt":  # windows
    extras["flax"] = []  # jax is not supported on windows
@@ -218,7 +213,7 @@ install_requires = [

 setup(
    name="diffusers",
-    version="0.10.0.dev0",  # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots)
+    version="0.7.0.dev0",  # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots)
    description="Diffusers",
    long_description=open("README.md", "r", encoding="utf-8").read(),
    long_description_content_type="text/markdown",
--- a/Show More
+++ b/Show More