Release: 0.6.0

Initial docs update for new in-painting pipeline (#910)
Docs update for new in-painting pipeline. Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
2025-12-07 04:54:47 +08:00 · 2022-10-19 17:38:55 +02:00 · 2022-10-19 17:31:23 +02:00 · 2022-10-19 17:03:31 +02:00 · 2022-10-19 16:36:51 +02:00 · 2022-10-19 16:11:50 +02:00
204 changed files with 27532 additions and 4208 deletions
--- a/.github/ISSUE_TEMPLATE/bug-report.yml
+++ b/.github/ISSUE_TEMPLATE/bug-report.yml
@@ -30,8 +30,7 @@ body:
    id: system-info
    attributes:
      label: System Info
-      description: Please share your system info with us,
-      render: shell
-      placeholder: diffusers version, Python Version, etc
+      description: Please share your system info with us. You can run the command `diffusers-cli env` and copy-paste its output below.
+      placeholder: diffusers version, platform, python version, ...
    validations:
      required: true
--- a/.github/ISSUE_TEMPLATE/feedback.md
+++ b/.github/ISSUE_TEMPLATE/feedback.md
@@ -0,0 +1,12 @@
+---
+name: "💬 Feedback about API Design"
+about: Give feedback about the current API design
+title: ''
+labels: ''
+assignees: ''
+
+---
+
+**What API design would you like to have changed or added to the library? Why?**
+
+**What use case would this enable or better enable? Can you give us a code example?**
--- a/.github/ISSUE_TEMPLATE/new-model-addition.yml
+++ b/.github/ISSUE_TEMPLATE/new-model-addition.yml
@@ -0,0 +1,31 @@
+name: "\U0001F31F New model/pipeline/scheduler addition"
+description: Submit a proposal/request to implement a new diffusion model / pipeline / scheduler
+labels: [ "New model/pipeline/scheduler" ]
+
+body:
+  - type: textarea
+    id: description-request
+    validations:
+      required: true
+    attributes:
+      label: Model/Pipeline/Scheduler description
+      description: |
+        Put any and all important information relative to the model/pipeline/scheduler
+
+  - type: checkboxes
+    id: information-tasks
+    attributes:
+      label: Open source status
+      description: |
+          Please note that if the model implementation isn't available or if the weights aren't open-source, we are less likely to implement it in `diffusers`.
+      options:
+        - label: "The model implementation is available"
+        - label: "The model weights are available (Only relevant if addition is not a scheduler)."
+
+  - type: textarea
+    id: additional-info
+    attributes:
+      label: Provide useful links for the implementation
+      description: |
+        Please provide information regarding the implementation, the weights, and the authors.
+        Please mention the authors by @gh-username if you're aware of their usernames.
--- a/.github/actions/setup-miniconda/action.yml
+++ b/.github/actions/setup-miniconda/action.yml
@@ -0,0 +1,146 @@
+name: Set up conda environment for testing
+
+description: Sets up miniconda in your ${RUNNER_TEMP} environment and gives you the ${CONDA_RUN} environment variable so you don't have to worry about polluting non-ephemeral runners anymore
+
+inputs:
+  python-version:
+    description: Python version to install in the conda environment
+    required: false
+    type: string
+    default: "3.9"
+  miniconda-version:
+    description: Miniconda version to install
+    required: false
+    type: string
+    default: "4.12.0"
+  environment-file:
+    description: Environment file to install dependencies from
+    required: false
+    type: string
+    default: ""
+
+runs:
+  using: composite
+  steps:
+      # Use the same trick from https://github.com/marketplace/actions/setup-miniconda
+      # to refresh the cache daily (optional). `today` is exposed via $GITHUB_OUTPUT.
+      - name: Get date
+        id: get-date
+        shell: bash
+        run: echo "today=$(/bin/date -u '+%Y%m%d')d" >> "${GITHUB_OUTPUT}"
+      - name: Setup miniconda cache
+        id: miniconda-cache
+        uses: actions/cache@v2
+        with:
+          path: ${{ runner.temp }}/miniconda
+          key: miniconda-${{ runner.os }}-${{ runner.arch }}-${{ inputs.python-version }}-${{ steps.get-date.outputs.today }}
+      - name: Install miniconda (${{ inputs.miniconda-version }})
+        if: steps.miniconda-cache.outputs.cache-hit != 'true'
+        env:
+          MINICONDA_VERSION: ${{ inputs.miniconda-version }}
+        shell: bash -l {0}
+        run: |
+          MINICONDA_INSTALL_PATH="${RUNNER_TEMP}/miniconda"
+          mkdir -p "${MINICONDA_INSTALL_PATH}"
+          case ${RUNNER_OS}-${RUNNER_ARCH} in
+            Linux-X64)
+              MINICONDA_ARCH="Linux-x86_64"
+              ;;
+            macOS-ARM64)
+              MINICONDA_ARCH="MacOSX-arm64"
+              ;;
+            macOS-X64)
+              MINICONDA_ARCH="MacOSX-x86_64"
+              ;;
+            *)
+            echo "::error::Platform ${RUNNER_OS}-${RUNNER_ARCH} currently unsupported using this action"
+              exit 1
+              ;;
+          esac
+          MINICONDA_URL="https://repo.anaconda.com/miniconda/Miniconda3-py39_${MINICONDA_VERSION}-${MINICONDA_ARCH}.sh"
+          curl -fsSL "${MINICONDA_URL}" -o "${MINICONDA_INSTALL_PATH}/miniconda.sh"
+          bash "${MINICONDA_INSTALL_PATH}/miniconda.sh" -b -u -p "${MINICONDA_INSTALL_PATH}"
+          rm -rf "${MINICONDA_INSTALL_PATH}/miniconda.sh"
+      - name: Update GitHub path to include miniconda install
+        shell: bash
+        run: |
+          MINICONDA_INSTALL_PATH="${RUNNER_TEMP}/miniconda"
+          echo "${MINICONDA_INSTALL_PATH}/bin" >> $GITHUB_PATH
+      - name: Setup miniconda env cache (with env file)
+        id: miniconda-env-cache-env-file
+        if: runner.os == 'macOS' && inputs.environment-file != ''  # bare expression: mixing expression markers with literal text yields an always-truthy string
+        uses: actions/cache@v2
+        with:
+          path: ${{ runner.temp }}/conda-python-${{ inputs.python-version }}
+          key: miniconda-env-${{ runner.os }}-${{ runner.arch }}-${{ inputs.python-version }}-${{ steps.get-date.outputs.today }}-${{ hashFiles(inputs.environment-file) }}
+      - name: Setup miniconda env cache (without env file)
+        id: miniconda-env-cache
+        if: runner.os == 'macOS' && inputs.environment-file == ''  # bare expression: mixing expression markers with literal text yields an always-truthy string
+        uses: actions/cache@v2
+        with:
+          path: ${{ runner.temp }}/conda-python-${{ inputs.python-version }}
+          key: miniconda-env-${{ runner.os }}-${{ runner.arch }}-${{ inputs.python-version }}-${{ steps.get-date.outputs.today }}
+      - name: Setup conda environment with python (v${{ inputs.python-version }})
+        if: steps.miniconda-env-cache-env-file.outputs.cache-hit != 'true' && steps.miniconda-env-cache.outputs.cache-hit != 'true'
+        shell: bash
+        env:
+          PYTHON_VERSION: ${{ inputs.python-version }}
+          ENV_FILE: ${{ inputs.environment-file }}
+        run: |
+          CONDA_BASE_ENV="${RUNNER_TEMP}/conda-python-${PYTHON_VERSION}"
+          ENV_FILE_FLAG=""
+          if [[ -f "${ENV_FILE}" ]]; then
+            ENV_FILE_FLAG="--file ${ENV_FILE}"
+          elif [[ -n "${ENV_FILE}" ]]; then
+            echo "::warning::Specified env file (${ENV_FILE}) not found, not going to include it"
+          fi
+          conda create \
+            --yes \
+            --prefix "${CONDA_BASE_ENV}" \
+            "python=${PYTHON_VERSION}" \
+            ${ENV_FILE_FLAG} \
+            cmake=3.22 \
+            conda-build=3.21 \
+            ninja=1.10 \
+            pkg-config=0.29 \
+            wheel=0.37
+      - name: Clone the base conda environment and update GitHub env
+        shell: bash
+        env:
+          PYTHON_VERSION: ${{ inputs.python-version }}
+          CONDA_BASE_ENV: ${{ runner.temp }}/conda-python-${{ inputs.python-version }}
+        run: |
+          CONDA_ENV="${RUNNER_TEMP}/conda_environment_${GITHUB_RUN_ID}"
+          conda create \
+            --yes \
+            --prefix "${CONDA_ENV}" \
+            --clone "${CONDA_BASE_ENV}"
+          # TODO: conda-build could not be cloned because it hardcodes the path, so it
+          # could not be cached
+          conda install --yes -p ${CONDA_ENV} conda-build=3.21
+          echo "CONDA_ENV=${CONDA_ENV}" >> "${GITHUB_ENV}"
+          echo "CONDA_RUN=conda run -p ${CONDA_ENV} --no-capture-output" >> "${GITHUB_ENV}"
+          echo "CONDA_BUILD=conda run -p ${CONDA_ENV} conda-build" >> "${GITHUB_ENV}"
+          echo "CONDA_INSTALL=conda install -p ${CONDA_ENV}" >> "${GITHUB_ENV}"
+      - name: Get disk space usage and throw an error for low disk space
+        shell: bash
+        run: |
+          echo "Print the available disk space for manual inspection"
+          df -h
+          # Set the minimum requirement space to 4GB
+          MINIMUM_AVAILABLE_SPACE_IN_GB=4
+          MINIMUM_AVAILABLE_SPACE_IN_KB=$(($MINIMUM_AVAILABLE_SPACE_IN_GB * 1024 * 1024))
+          # Use KB to avoid floating point warning like 3.1GB
+          df -k | tr -s ' ' | cut -d' ' -f 4,9 | while read -r LINE;
+          do
+            AVAIL=$(echo $LINE | cut -f1 -d' ')
+            MOUNT=$(echo $LINE | cut -f2 -d' ')
+            if [ "$MOUNT" = "/" ]; then
+              if [ "$AVAIL" -lt "$MINIMUM_AVAILABLE_SPACE_IN_KB" ]; then
+                echo "There is only ${AVAIL}KB free space left in $MOUNT, which is less than the minimum requirement of ${MINIMUM_AVAILABLE_SPACE_IN_KB}KB. Please help create an issue to PyTorch Release Engineering via https://github.com/pytorch/test-infra/issues and provide the link to the workflow run."
+                exit 1;
+              else
+                echo "There is ${AVAIL}KB free space left in $MOUNT, continue"
+              fi
+            fi
+          done
--- a/.github/workflows/pr_quality.yml
+++ b/.github/workflows/pr_quality.yml
@@ -0,0 +1,33 @@
+name: Run code quality checks
+
+on:
+  pull_request:
+    branches:
+      - main
+  push:
+    branches:
+      - main
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
+  cancel-in-progress: true
+
+jobs:
+  check_code_quality:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: "3.7"
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install .[quality]
+      - name: Check quality
+        run: |
+          black  --check --preview examples tests src utils scripts
+          isort --check-only examples tests src utils scripts
+          flake8 examples tests src utils scripts
+          doc-builder style src/diffusers docs/source --max_len 119 --check_only --path_to_docs docs/source
--- a/.github/workflows/pr_tests.yml
+++ b/.github/workflows/pr_tests.yml
@@ -0,0 +1,103 @@
+name: Run fast tests
+
+on:
+  pull_request:
+    branches:
+      - main
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
+  cancel-in-progress: true
+
+env:
+  OMP_NUM_THREADS: 8
+  MKL_NUM_THREADS: 8
+  PYTEST_TIMEOUT: 60
+  MPS_TORCH_VERSION: 1.13.0
+
+jobs:
+  run_tests_cpu:
+    name: CPU tests on Ubuntu
+    runs-on: [ self-hosted, docker-gpu ]
+    container:
+      image: python:3.7
+      options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/
+
+    steps:
+    - name: Checkout diffusers
+      uses: actions/checkout@v3
+      with:
+        fetch-depth: 2
+
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        python -m pip install torch --extra-index-url https://download.pytorch.org/whl/cpu
+        python -m pip install -e .[quality,test]
+
+    - name: Environment
+      run: |
+        python utils/print_env.py
+
+    - name: Run all fast tests on CPU
+      run: |
+        python -m pytest -n 2 --max-worker-restart=0 --dist=loadfile -s -v --make-reports=tests_torch_cpu tests/
+
+    - name: Failure short reports
+      if: ${{ failure() }}
+      run: cat reports/tests_torch_cpu_failures_short.txt
+
+    - name: Test suite reports artifacts
+      if: ${{ always() }}
+      uses: actions/upload-artifact@v2
+      with:
+        name: pr_torch_cpu_test_reports
+        path: reports
+
+  run_tests_apple_m1:
+    name: MPS tests on Apple M1
+    runs-on: [ self-hosted, apple-m1 ]
+
+    steps:
+    - name: Checkout diffusers
+      uses: actions/checkout@v3
+      with:
+        fetch-depth: 2
+
+    - name: Clean checkout
+      shell: arch -arch arm64 bash {0}
+      run: |
+        git clean -fxd
+
+    - name: Setup miniconda
+      uses: ./.github/actions/setup-miniconda
+      with:
+        python-version: "3.9"  # quoted: a version is a string, not a float
+
+    - name: Install dependencies
+      shell: arch -arch arm64 bash {0}
+      run: |
+        ${CONDA_RUN} python -m pip install --upgrade pip
+        ${CONDA_RUN} python -m pip install -e .[quality,test]
+        ${CONDA_RUN} python -m pip install --pre torch==${MPS_TORCH_VERSION} --extra-index-url https://download.pytorch.org/whl/test/cpu
+
+    - name: Environment
+      shell: arch -arch arm64 bash {0}
+      run: |
+        ${CONDA_RUN} python utils/print_env.py
+
+    - name: Run all fast tests on MPS
+      shell: arch -arch arm64 bash {0}
+      run: |
+        ${CONDA_RUN} python -m pytest -n 4 --max-worker-restart=0 --dist=loadfile -s -v --make-reports=tests_torch_mps tests/
+
+    - name: Failure short reports
+      if: ${{ failure() }}
+      run: cat reports/tests_torch_mps_failures_short.txt
+
+    - name: Test suite reports artifacts
+      if: ${{ always() }}
+      uses: actions/upload-artifact@v2
+      with:
+        name: pr_torch_mps_test_reports
+        path: reports
--- a/.github/workflows/push_tests.yml
+++ b/.github/workflows/push_tests.yml
@@ -0,0 +1,106 @@
+name: Run all tests
+
+on:
+  push:
+    branches:
+      - main
+
+env:
+  HF_HOME: /mnt/cache
+  OMP_NUM_THREADS: 8
+  MKL_NUM_THREADS: 8
+  PYTEST_TIMEOUT: 1000
+  RUN_SLOW: "yes"  # quoted so YAML 1.1 loaders keep the string instead of a boolean
+
+jobs:
+  run_tests_single_gpu:
+    name: Diffusers tests
+    runs-on: [ self-hosted, docker-gpu, single-gpu ]
+    container:
+      image: nvcr.io/nvidia/pytorch:22.07-py3
+      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache
+
+    steps:
+    - name: Checkout diffusers
+      uses: actions/checkout@v3
+      with:
+        fetch-depth: 2
+
+    - name: NVIDIA-SMI
+      run: |
+        nvidia-smi
+
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        python -m pip uninstall -y torch torchvision torchtext
+        python -m pip install torch --extra-index-url https://download.pytorch.org/whl/cu116
+        python -m pip install -e .[quality,test]
+
+    - name: Environment
+      run: |
+        python utils/print_env.py
+
+    - name: Run all (incl. slow) tests on GPU
+      env:
+        HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
+      run: |
+        python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile -s -v --make-reports=tests_torch_gpu tests/
+
+    - name: Failure short reports
+      if: ${{ failure() }}
+      run: cat reports/tests_torch_gpu_failures_short.txt
+
+    - name: Test suite reports artifacts
+      if: ${{ always() }}
+      uses: actions/upload-artifact@v2
+      with:
+        name: torch_test_reports
+        path: reports
+
+
+
+  run_examples_single_gpu:
+    name: Examples tests
+    runs-on: [ self-hosted, docker-gpu, single-gpu ]
+    container:
+      image: nvcr.io/nvidia/pytorch:22.07-py3
+      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache
+
+    steps:
+    - name: Checkout diffusers
+      uses: actions/checkout@v3
+      with:
+        fetch-depth: 2
+
+    - name: NVIDIA-SMI
+      run: |
+        nvidia-smi
+
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        python -m pip uninstall -y torch torchvision torchtext
+        python -m pip install torch --extra-index-url https://download.pytorch.org/whl/cu116
+        python -m pip install -e .[quality,test,training]
+
+    - name: Environment
+      run: |
+        python utils/print_env.py
+
+    - name: Run example tests on GPU
+      env:
+        HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
+      run: |
+        python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile -s -v --make-reports=examples_torch_gpu examples/
+
+    - name: Failure short reports
+      if: ${{ failure() }}
+      run: cat reports/examples_torch_gpu_failures_short.txt
+
+    - name: Test suite reports artifacts
+      if: ${{ always() }}
+      uses: actions/upload-artifact@v2
+      with:
+        name: examples_test_reports
+        path: reports
--- a/.github/workflows/stale.yml
+++ b/.github/workflows/stale.yml
@@ -0,0 +1,27 @@
+name: Stale Bot
+
+on:
+  schedule:
+    - cron: "0 15 * * *"
+
+jobs:
+  close_stale_issues:
+    name: Close Stale Issues
+    if: github.repository == 'huggingface/diffusers'
+    runs-on: ubuntu-latest
+    env:
+      GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+    steps:
+    - uses: actions/checkout@v2
+
+    - name: Setup Python
+      uses: actions/setup-python@v1
+      with:
+        python-version: "3.7"  # quoted: a version is a string, not a float
+
+    - name: Install requirements
+      run: |
+        pip install PyGithub
+    - name: Close stale issues
+      run: |
+        python utils/stale.py
--- a/.github/workflows/typos.yml
+++ b/.github/workflows/typos.yml
@@ -0,0 +1,14 @@
+name: Check typos
+
+on:
+  workflow_dispatch:
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+
+    steps:
+      - uses: actions/checkout@v3
+
+      - name: typos-action
+        uses: crate-ci/typos@v1.12.4
--- a/CODE_OF_CONDUCT.md
+++ b/CODE_OF_CONDUCT.md
@@ -0,0 +1,129 @@
+
+# Contributor Covenant Code of Conduct
+
+## Our Pledge
+
+We as members, contributors, and leaders pledge to make participation in our
+community a harassment-free experience for everyone, regardless of age, body
+size, visible or invisible disability, ethnicity, sex characteristics, gender
+identity and expression, level of experience, education, socio-economic status,
+nationality, personal appearance, race, religion, or sexual identity
+and orientation.
+
+We pledge to act and interact in ways that contribute to an open, welcoming,
+diverse, inclusive, and healthy community.
+
+## Our Standards
+
+Examples of behavior that contributes to a positive environment for our
+community include:
+
+* Demonstrating empathy and kindness toward other people
+* Being respectful of differing opinions, viewpoints, and experiences
+* Giving and gracefully accepting constructive feedback
+* Accepting responsibility and apologizing to those affected by our mistakes,
+  and learning from the experience
+* Focusing on what is best not just for us as individuals, but for the
+  overall community
+
+Examples of unacceptable behavior include:
+
+* The use of sexualized language or imagery, and sexual attention or
+  advances of any kind
+* Trolling, insulting or derogatory comments, and personal or political attacks
+* Public or private harassment
+* Publishing others' private information, such as a physical or email
+  address, without their explicit permission
+* Other conduct which could reasonably be considered inappropriate in a
+  professional setting
+
+## Enforcement Responsibilities
+
+Community leaders are responsible for clarifying and enforcing our standards of
+acceptable behavior and will take appropriate and fair corrective action in
+response to any behavior that they deem inappropriate, threatening, offensive,
+or harmful.
+
+Community leaders have the right and responsibility to remove, edit, or reject
+comments, commits, code, wiki edits, issues, and other contributions that are
+not aligned to this Code of Conduct, and will communicate reasons for moderation
+decisions when appropriate.
+
+## Scope
+
+This Code of Conduct applies within all community spaces, and also applies when
+an individual is officially representing the community in public spaces.
+Examples of representing our community include using an official e-mail address,
+posting via an official social media account, or acting as an appointed
+representative at an online or offline event.
+
+## Enforcement
+
+Instances of abusive, harassing, or otherwise unacceptable behavior may be
+reported to the community leaders responsible for enforcement at
+feedback@huggingface.co.
+All complaints will be reviewed and investigated promptly and fairly.
+
+All community leaders are obligated to respect the privacy and security of the
+reporter of any incident.
+
+## Enforcement Guidelines
+
+Community leaders will follow these Community Impact Guidelines in determining
+the consequences for any action they deem in violation of this Code of Conduct:
+
+### 1. Correction
+
+**Community Impact**: Use of inappropriate language or other behavior deemed
+unprofessional or unwelcome in the community.
+
+**Consequence**: A private, written warning from community leaders, providing
+clarity around the nature of the violation and an explanation of why the
+behavior was inappropriate. A public apology may be requested.
+
+### 2. Warning
+
+**Community Impact**: A violation through a single incident or series
+of actions.
+
+**Consequence**: A warning with consequences for continued behavior. No
+interaction with the people involved, including unsolicited interaction with
+those enforcing the Code of Conduct, for a specified period of time. This
+includes avoiding interactions in community spaces as well as external channels
+like social media. Violating these terms may lead to a temporary or
+permanent ban.
+
+### 3. Temporary Ban
+
+**Community Impact**: A serious violation of community standards, including
+sustained inappropriate behavior.
+
+**Consequence**: A temporary ban from any sort of interaction or public
+communication with the community for a specified period of time. No public or
+private interaction with the people involved, including unsolicited interaction
+with those enforcing the Code of Conduct, is allowed during this period.
+Violating these terms may lead to a permanent ban.
+
+### 4. Permanent Ban
+
+**Community Impact**: Demonstrating a pattern of violation of community
+standards, including sustained inappropriate behavior,  harassment of an
+individual, or aggression toward or disparagement of classes of individuals.
+
+**Consequence**: A permanent ban from any sort of public interaction within
+the community.
+
+## Attribution
+
+This Code of Conduct is adapted from the [Contributor Covenant][homepage],
+version 2.0, available at
+https://www.contributor-covenant.org/version/2/0/code_of_conduct.html.
+
+Community Impact Guidelines were inspired by [Mozilla's code of conduct
+enforcement ladder](https://github.com/mozilla/diversity).
+
+[homepage]: https://www.contributor-covenant.org
+
+For answers to common questions about this code of conduct, see the FAQ at
+https://www.contributor-covenant.org/faq. Translations are available at
+https://www.contributor-covenant.org/translations.
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -0,0 +1,294 @@
+<!---
+Copyright 2022 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+# How to contribute to diffusers?
+
+Everyone is welcome to contribute, and we value everybody's contribution. Code
+is thus not the only way to help the community. Answering questions, helping
+others, reaching out and improving the documentation are immensely valuable to
+the community.
+
+It also helps us if you spread the word: reference the library from blog posts
+on the awesome projects it made possible, shout out on Twitter every time it has
+helped you, or simply star the repo to say "thank you".
+
+Whichever way you choose to contribute, please be mindful to respect our
+[code of conduct](https://github.com/huggingface/diffusers/blob/main/CODE_OF_CONDUCT.md).
+
+## You can contribute in so many ways!
+
+There are 4 ways you can contribute to diffusers:
+* Fixing outstanding issues with the existing code;
+* Implementing [new diffusion pipelines](https://github.com/huggingface/diffusers/tree/main/src/diffusers/pipelines#contribution), [new schedulers](https://github.com/huggingface/diffusers/tree/main/src/diffusers/schedulers) or [new models](https://github.com/huggingface/diffusers/tree/main/src/diffusers/models)
+* [Contributing to the examples](https://github.com/huggingface/diffusers/tree/main/examples) or to the documentation;
+* Submitting issues related to bugs or desired new features.
+
+In particular there is a special [Good First Issue](https://github.com/huggingface/diffusers/contribute) listing. 
+It will give you a list of open Issues that are open to anybody to work on. Just comment in the issue that you'd like to work on it. 
+In that same listing you will also find some Issues with `Good Second Issue` label. These are
+typically slightly more complicated than the Issues with just `Good First Issue` label. But if you
+feel you know what you're doing, go for it.
+
+*All are equally valuable to the community.*
+
+## Submitting a new issue or feature request
+
+Do your best to follow these guidelines when submitting an issue or a feature
+request. It will make it easier for us to come back to you quickly and with good
+feedback.
+
+### Did you find a bug?
+
+The 🧨 Diffusers library is robust and reliable thanks to the users who notify us of
+the problems they encounter. So thank you for reporting an issue.
+
+First, we would really appreciate it if you could **make sure the bug was not
+already reported** (use the search bar on GitHub under Issues).
+
+### Do you want to implement a new diffusion pipeline / diffusion model?
+
+Awesome! Please provide the following information:
+
+* Short description of the diffusion pipeline and link to the paper;
+* Link to the implementation if it is open-source;
+* Link to the model weights if they are available.
+
+If you are willing to contribute the model yourself, let us know so we can best
+guide you.
+
+### Do you want a new feature (that is not a model)?
+
+A world-class feature request addresses the following points:
+
+1. Motivation first:
+  * Is it related to a problem/frustration with the library? If so, please explain
+    why. Providing a code snippet that demonstrates the problem is best.
+  * Is it related to something you would need for a project? We'd love to hear
+    about it!
+  * Is it something you worked on and think could benefit the community?
+    Awesome! Tell us what problem it solved for you.
+2. Write a *full paragraph* describing the feature;
+3. Provide a **code snippet** that demonstrates its future use;
+4. In case this is related to a paper, please attach a link;
+5. Attach any additional information (drawings, screenshots, etc.) you think may help.
+
+If your issue is well written we're already 80% of the way there by the time you
+post it.
+
+## Start contributing! (Pull Requests)
+
+Before writing code, we strongly advise you to search through the existing PRs or
+issues to make sure that nobody is already working on the same thing. If you are
+unsure, it is always a good idea to open an issue to get some feedback.
+
+You will need basic `git` proficiency to be able to contribute to
+🧨 Diffusers. `git` is not the easiest tool to use but it has the greatest
+manual. Type `git --help` in a shell and enjoy. If you prefer books, [Pro
+Git](https://git-scm.com/book/en/v2) is a very good reference.
+
+Follow these steps to start contributing ([supported Python versions](https://github.com/huggingface/diffusers/blob/main/setup.py#L426)):
+
+1. Fork the [repository](https://github.com/huggingface/diffusers) by
+   clicking on the 'Fork' button on the repository's page. This creates a copy of the code
+   under your GitHub user account.
+
+2. Clone your fork to your local disk, and add the base repository as a remote:
+
+   ```bash
+   $ git clone git@github.com:<your Github handle>/diffusers.git
+   $ cd diffusers
+   $ git remote add upstream https://github.com/huggingface/diffusers.git
+   ```
+
+3. Create a new branch to hold your development changes:
+
+   ```bash
+   $ git checkout -b a-descriptive-name-for-my-changes
+   ```
+
+   **Do not** work on the `main` branch.
+
+4. Set up a development environment by running the following command in a virtual environment:
+
+   ```bash
+   $ pip install -e ".[dev]"
+   ```
+
+   (If diffusers was already installed in the virtual environment, remove
+   it with `pip uninstall diffusers` before reinstalling it in editable
+   mode with the `-e` flag.)
+
+   To run the full test suite, you might need the additional dependency on `transformers` and `datasets` which requires a separate source
+   install:
+
+   ```bash
+   $ git clone https://github.com/huggingface/transformers
+   $ cd transformers
+   $ pip install -e .
+   ```
+
+   ```bash
+   $ git clone https://github.com/huggingface/datasets
+   $ cd datasets
+   $ pip install -e .
+   ```
+
+   If you have already cloned that repo, you might need to `git pull` to get the most recent changes in the `datasets`
+   library.
+
+5. Develop the features on your branch.
+
+   As you work on the features, you should make sure that the test suite
+   passes. You should run the tests impacted by your changes like this:
+
+   ```bash
+   $ pytest tests/<TEST_TO_RUN>.py
+   ```
+
+   You can also run the full suite with the following command, but it takes
+   a beefy machine to produce a result in a decent amount of time now that
+   Diffusers has grown a lot. Here is the command for it:
+
+   ```bash
+   $ make test
+   ```
+
+   For more information about tests, check out the
+   [dedicated documentation](https://huggingface.co/docs/diffusers/testing)
+
+   🧨 Diffusers relies on `black` and `isort` to format its source code
+   consistently. After you make changes, apply the automatic style corrections, together with
+   the code verifications that can't be fully automated, in one go with:
+
+   ```bash
+   $ make style
+   ```
+
+   🧨 Diffusers also uses `flake8` and a few custom scripts to check for coding mistakes. Quality
+   control runs in CI, however you can also run the same checks with:
+
+   ```bash
+   $ make quality
+   ```
+
+   Once you're happy with your changes, add changed files using `git add` and
+   make a commit with `git commit` to record your changes locally:
+
+   ```bash
+   $ git add modified_file.py
+   $ git commit
+   ```
+
+   It is a good idea to sync your copy of the code with the original
+   repository regularly. This way you can quickly account for changes:
+
+   ```bash
+   $ git fetch upstream
+   $ git rebase upstream/main
+   ```
+
+   Push the changes to your account using:
+
+   ```bash
+   $ git push -u origin a-descriptive-name-for-my-changes
+   ```
+
+6. Once you are satisfied (**and the checklist below is happy too**), go to the
+   webpage of your fork on GitHub. Click on 'Pull request' to send your changes
+   to the project maintainers for review.
+
+7. It's ok if maintainers ask you for changes. It happens to core contributors
+   too! So everyone can see the changes in the Pull request, work in your local
+   branch and push the changes to your fork. They will automatically appear in
+   the pull request.
+
+
+### Checklist
+
+1. The title of your pull request should be a summary of its contribution;
+2. If your pull request addresses an issue, please mention the issue number in
+   the pull request description to make sure they are linked (and people
+   consulting the issue know you are working on it);
+3. To indicate a work in progress please prefix the title with `[WIP]`. These
+   are useful to avoid duplicated work, and to differentiate it from PRs ready
+   to be merged;
+4. Make sure existing tests pass;
+5. Add high-coverage tests. No quality testing = no merge.
+   - If you are adding new `@slow` tests, make sure they pass using
+     `RUN_SLOW=1 python -m pytest tests/test_my_new_model.py`.
+   - If you are adding a new tokenizer, write tests, and make sure
+     `RUN_SLOW=1 python -m pytest tests/test_tokenization_{your_model_name}.py` passes.
+   The pull-request CI runs only the fast tests; the slow tests run on GitHub Actions for every push to `main`.
+6. All public methods must have informative docstrings that work nicely with sphinx. See `modeling_bert.py` for an
+   example.
+7. Due to the rapidly growing repository, it is important to make sure that no files that would significantly weigh down the repository are added. This includes images, videos and other non-text files. We prefer to leverage a hf.co hosted `dataset` like
+   the ones hosted on [`hf-internal-testing`](https://huggingface.co/hf-internal-testing) in which to place these files and reference 
+   them by URL. We recommend putting them in the following dataset: [huggingface/documentation-images](https://huggingface.co/datasets/huggingface/documentation-images).
+   If an external contribution, feel free to add the images to your PR and ask a Hugging Face member to migrate your images
+   to this dataset.
+
+### Tests
+
+An extensive test suite is included to test the library behavior and several examples. Library tests can be found in
+the [tests folder](https://github.com/huggingface/diffusers/tree/main/tests).
+
+We like `pytest` and `pytest-xdist` because it's faster. From the root of the
+repository, here's how to run tests with `pytest` for the library:
+
+```bash
+$ python -m pytest -n auto --dist=loadfile -s -v ./tests/
+```
+
+In fact, that's how `make test` is implemented (sans the `pip install` line)!
+
+You can specify a smaller set of tests in order to test only the feature
+you're working on.
+
+By default, slow tests are skipped. Set the `RUN_SLOW` environment variable to
+`yes` to run them. This will download many gigabytes of models — make sure you
+have enough disk space and a good Internet connection, or a lot of patience!
+
+```bash
+$ RUN_SLOW=yes python -m pytest -n auto --dist=loadfile -s -v ./tests/
+```
+
+This means `unittest` is fully supported. Here's how to run tests with
+`unittest`:
+
+```bash
+$ python -m unittest discover -s tests -t . -v
+$ python -m unittest discover -s examples -t examples -v
+```
+
+
+### Style guide
+
+For documentation strings, 🧨 Diffusers follows the [google style](https://google.github.io/styleguide/pyguide.html).
+
+**This guide was heavily inspired by the awesome [scikit-learn guide to contributing](https://github.com/scikit-learn/scikit-learn/blob/main/CONTRIBUTING.md).**
+
+### Syncing forked main with upstream (HuggingFace) main
+
+To avoid pinging the upstream repository which adds reference notes to each upstream PR and sends unnecessary notifications to the developers involved in these PRs,
+when syncing the main branch of a forked repository, please, follow these steps:
+1. When possible, avoid syncing with the upstream using a branch and PR on the forked repository. Instead merge directly into the forked main.
+2. If a PR is absolutely necessary, use the following steps after checking out your branch:
+```
+$ git checkout -b your-branch-for-syncing
+$ git pull --squash --no-commit upstream main
+$ git commit -m '<your message without GitHub references>'
+$ git push --set-upstream origin your-branch-for-syncing
+```
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -1 +1,2 @@
+include LICENSE
 include src/diffusers/utils/model_card_template.md
--- a/2
+++ b/2
@@ -3,7 +3,7 @@
 # make sure to test the local checkout in scripts and not the pre-installed one (don't use quotes!)
 export PYTHONPATH = src

-check_dirs := examples tests src utils
+check_dirs := examples scripts src tests utils

 modified_only_fixup:
 	$(eval modified_py_files := $(shell python utils/get_modified_files.py $(check_dirs)))
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
 <p align="center">
    <br>
-    <img src="docs/source/imgs/diffusers_library.jpg" width="400"/>
+    <img src="https://github.com/huggingface/diffusers/raw/main/docs/source/imgs/diffusers_library.jpg" width="400"/>
    <br>
 <p>
 <p align="center">
@@ -20,31 +20,116 @@ as a modular toolbox for inference and training of diffusion models.

 More precisely, 🤗 Diffusers offers:

- State-of-the-art diffusion pipelines that can be run in inference with just a couple of lines of code (see [src/diffusers/pipelines](https://github.com/huggingface/diffusers/tree/main/src/diffusers/pipelines)).
- Various noise schedulers that can be used interchangeably for the prefered speed vs. quality trade-off in inference (see [src/diffusers/schedulers](https://github.com/huggingface/diffusers/tree/main/src/diffusers/schedulers)).
+- State-of-the-art diffusion pipelines that can be run in inference with just a couple of lines of code (see [src/diffusers/pipelines](https://github.com/huggingface/diffusers/tree/main/src/diffusers/pipelines)). Check [this overview](https://github.com/huggingface/diffusers/tree/main/src/diffusers/pipelines/README.md#pipelines-summary) to see all supported pipelines and their corresponding official papers.
+- Various noise schedulers that can be used interchangeably for the preferred speed vs. quality trade-off in inference (see [src/diffusers/schedulers](https://github.com/huggingface/diffusers/tree/main/src/diffusers/schedulers)).
 - Multiple types of models, such as UNet, can be used as building blocks in an end-to-end diffusion system (see [src/diffusers/models](https://github.com/huggingface/diffusers/tree/main/src/diffusers/models)).
- Training examples to show how to train the most popular diffusion models (see [examples](https://github.com/huggingface/diffusers/tree/main/examples)).
+- Training examples to show how to train the most popular diffusion model tasks (see [examples](https://github.com/huggingface/diffusers/tree/main/examples), *e.g.* [unconditional-image-generation](https://github.com/huggingface/diffusers/tree/main/examples/unconditional_image_generation)).
+
+## Installation
+
+**With `pip`**
+    
+```bash
+pip install --upgrade diffusers
+```
+
+**With `conda`**
+
+```sh
+conda install -c conda-forge diffusers
+```
+
+**Apple Silicon (M1/M2) support**
+
+Please, refer to [the documentation](https://huggingface.co/docs/diffusers/optimization/mps).
+
+## Contributing
+
+We ❤️  contributions from the open-source community! 
+If you want to contribute to this library, please check out our [Contribution guide](https://github.com/huggingface/diffusers/blob/main/CONTRIBUTING.md).
+You can look out for [issues](https://github.com/huggingface/diffusers/issues) you'd like to tackle to contribute to the library.
+- See [Good first issues](https://github.com/huggingface/diffusers/issues?q=is%3Aopen+is%3Aissue+label%3A%22good+first+issue%22) for general opportunities to contribute
+- See [New model/pipeline](https://github.com/huggingface/diffusers/issues?q=is%3Aopen+is%3Aissue+label%3A%22New+pipeline%2Fmodel%22) to contribute exciting new diffusion models / diffusion pipelines
+- See [New scheduler](https://github.com/huggingface/diffusers/issues?q=is%3Aopen+is%3Aissue+label%3A%22New+scheduler%22)
+
+Also, say 👋 in our public Discord channel <a href="https://discord.gg/G7tWnz98XR"><img alt="Join us on Discord" src="https://img.shields.io/discord/823813159592001537?color=5865F2&logo=discord&logoColor=white"></a>. We discuss the hottest trends about diffusion models, help each other with contributions, personal projects or
+just hang out ☕.

 ## Quickstart

 In order to get started, we recommend taking a look at two notebooks:

- The [Getting started with Diffusers](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/diffusers_intro.ipynb) notebook, which showcases an end-to-end example of usage for diffusion models, schedulers and pipelines.
+- The [Getting started with Diffusers](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/diffusers_intro.ipynb) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/diffusers_intro.ipynb) notebook, which showcases an end-to-end example of usage for diffusion models, schedulers and pipelines.
  Take a look at this notebook to learn how to use the pipeline abstraction, which takes care of everything (model, scheduler, noise handling) for you, and also to understand each independent building block in the library.
- The [Training a diffusers model](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/training_example.ipynb) notebook summarizes diffuser model training methods. This notebook takes a step-by-step approach to training your
-  diffuser model on an image dataset, with explanatory graphics.
+- The [Training a diffusers model](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/training_example.ipynb) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/training_example.ipynb) notebook summarizes diffusion model training methods. This notebook takes a step-by-step approach to training your
+  diffusion models on an image dataset, with explanatory graphics.
  
-## **New 🎨🎨🎨** Stable Diffusion is now fully compatible with `diffusers`! 
+## **New** Stable Diffusion is now fully compatible with `diffusers`!  

 Stable Diffusion is a text-to-image latent diffusion model created by the researchers and engineers from [CompVis](https://github.com/CompVis), [Stability AI](https://stability.ai/) and [LAION](https://laion.ai/). It's trained on 512x512 images from a subset of the [LAION-5B](https://laion.ai/blog/laion-5b/) database. This model uses a frozen CLIP ViT-L/14 text encoder to condition the model on text prompts. With its 860M UNet and 123M text encoder, the model is relatively lightweight and runs on a GPU with at least 10GB VRAM.
 See the [model card](https://huggingface.co/CompVis/stable-diffusion) for more information.

-**The Stable Diffusion weights are currently only available to universities, academics, research institutions and independent researchers. Please request access applying to <a href="https://stability.ai/academia-access-form" target="_blank">this</a> form**
+You need to accept the model license before downloading or using the Stable Diffusion weights. Please, visit the [model card](https://huggingface.co/CompVis/stable-diffusion-v1-4), read the license and tick the checkbox if you agree. You have to be a registered user in 🤗 Hugging Face Hub, and you'll also need to use an access token for the code to work. For more information on access tokens, please refer to [this section](https://huggingface.co/docs/hub/security-tokens) of the documentation.

-```py
+
+### Text-to-Image generation with Stable Diffusion
+
+We recommend using the model in [half-precision (`fp16`)](https://pytorch.org/blog/accelerating-training-on-nvidia-gpus-with-pytorch-automatic-mixed-precision/) as it almost always gives the same results as full
+precision while being roughly twice as fast and requiring half the amount of GPU RAM.
+
+```python
 # make sure you're logged in with `huggingface-cli login`
-from torch import autocast
-from diffusers import StableDiffusionPipeline, LMSDiscreteScheduler
+from diffusers import StableDiffusionPipeline
+
+pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16, revision="fp16")
+pipe = pipe.to("cuda")
+
+prompt = "a photo of an astronaut riding a horse on mars"
+image = pipe(prompt).images[0]  
+```
+
+**Note**: If you don't want to use the token, you can also simply download the model weights
+(after having [accepted the license](https://huggingface.co/CompVis/stable-diffusion-v1-4)) and pass
+the path to the local folder to the `StableDiffusionPipeline`.
+
+```
+git lfs install
+git clone https://huggingface.co/CompVis/stable-diffusion-v1-4
+```
+
+Assuming the folder is stored locally under `./stable-diffusion-v1-4`, you can also run stable diffusion
+without requiring an authentication token:
+
+```python
+pipe = StableDiffusionPipeline.from_pretrained("./stable-diffusion-v1-4")
+pipe = pipe.to("cuda")
+
+prompt = "a photo of an astronaut riding a horse on mars"
+image = pipe(prompt).images[0]  
+```
+
+If you are limited by GPU memory, you might want to consider chunking the attention computation in addition 
+to using `fp16`.
+The following snippet should use less than 4GB of VRAM.
+
+```python
+pipe = StableDiffusionPipeline.from_pretrained(
+    "CompVis/stable-diffusion-v1-4", 
+    revision="fp16", 
+    torch_dtype=torch.float16,
+)
+pipe = pipe.to("cuda")
+
+prompt = "a photo of an astronaut riding a horse on mars"
+pipe.enable_attention_slicing()
+image = pipe(prompt).images[0]  
+```
+
+If you wish to use a different scheduler, you can simply instantiate
+it before the pipeline and pass it to `from_pretrained`.
+    
+```python
+from diffusers import LMSDiscreteScheduler

 lms = LMSDiscreteScheduler(
    beta_start=0.00085, 
@@ -53,41 +138,145 @@ lms = LMSDiscreteScheduler(
 )

 pipe = StableDiffusionPipeline.from_pretrained(
-    "CompVis/stable-diffusion-v1-3-diffusers", 
+    "CompVis/stable-diffusion-v1-4", 
+    revision="fp16", 
+    torch_dtype=torch.float16,
    scheduler=lms,
-    use_auth_token=True
-)  
+)
+pipe = pipe.to("cuda")

 prompt = "a photo of an astronaut riding a horse on mars"
-with autocast("cuda"):
-    image = pipe(prompt)["sample"][0]  
+image = pipe(prompt).images[0]  
    
 image.save("astronaut_rides_horse.png")
 ```

-For more details, check out [the Stable Diffusion notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/stable_diffusion.ipynb)
+If you want to run Stable Diffusion on CPU or you want to have maximum precision on GPU, 
+please run the model in the default *full-precision* setting:
+
+```python
+# make sure you're logged in with `huggingface-cli login`
+from diffusers import StableDiffusionPipeline
+
+pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4")
+
+# disable the following line if you run on CPU
+pipe = pipe.to("cuda")
+
+prompt = "a photo of an astronaut riding a horse on mars"
+image = pipe(prompt).images[0]  
+    
+image.save("astronaut_rides_horse.png")
+```
+
+### Image-to-Image text-guided generation with Stable Diffusion
+
+The `StableDiffusionImg2ImgPipeline` lets you pass a text prompt and an initial image to condition the generation of new images.
+
+```python
+import requests
+import torch
+from PIL import Image
+from io import BytesIO
+
+from diffusers import StableDiffusionImg2ImgPipeline
+
+# load the pipeline
+device = "cuda"
+model_id_or_path = "CompVis/stable-diffusion-v1-4"
+pipe = StableDiffusionImg2ImgPipeline.from_pretrained(
+    model_id_or_path,
+    revision="fp16", 
+    torch_dtype=torch.float16,
+)
+# or download via git clone https://huggingface.co/CompVis/stable-diffusion-v1-4
+# and pass `model_id_or_path="./stable-diffusion-v1-4"`.
+pipe = pipe.to(device)
+
+# let's download an initial image
+url = "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/assets/stable-samples/img2img/sketch-mountains-input.jpg"
+
+response = requests.get(url)
+init_image = Image.open(BytesIO(response.content)).convert("RGB")
+init_image = init_image.resize((768, 512))
+
+prompt = "A fantasy landscape, trending on artstation"
+
+images = pipe(prompt=prompt, init_image=init_image, strength=0.75, guidance_scale=7.5).images
+
+images[0].save("fantasy_landscape.png")
+```
+You can also run this example on colab [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/image_2_image_using_diffusers.ipynb)
+
+### In-painting using Stable Diffusion
+
+The `StableDiffusionInpaintPipeline` lets you edit specific parts of an image by providing a mask and a text prompt. It uses a model optimized for this particular task, whose license you need to accept before use.
+
+Please visit the [model card](https://huggingface.co/runwayml/stable-diffusion-inpainting), read the license carefully and tick the checkbox if you agree. Note that this is an additional license: you need to accept it even if you accepted the text-to-image Stable Diffusion license in the past. You have to be a registered user in 🤗 Hugging Face Hub, and you'll also need to use an access token for the code to work. For more information on access tokens, please refer to [this section](https://huggingface.co/docs/hub/security-tokens) of the documentation.
+
+
+```python
+import PIL
+import requests
+import torch
+from io import BytesIO
+
+from diffusers import StableDiffusionInpaintPipeline
+
+def download_image(url):
+    response = requests.get(url)
+    return PIL.Image.open(BytesIO(response.content)).convert("RGB")
+
+img_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo.png"
+mask_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo_mask.png"
+
+init_image = download_image(img_url).resize((512, 512))
+mask_image = download_image(mask_url).resize((512, 512))
+
+pipe = StableDiffusionInpaintPipeline.from_pretrained(
+    "runwayml/stable-diffusion-inpainting",
+    revision="fp16",
+    torch_dtype=torch.float16,
+)
+pipe = pipe.to("cuda")
+
+prompt = "Face of a yellow cat, high resolution, sitting on a park bench"
+image = pipe(prompt=prompt, image=init_image, mask_image=mask_image).images[0]
+```
+
+### Tweak prompts reusing seeds and latents
+
+You can generate your own latents to reproduce results, or tweak your prompt on a specific result you liked. [This notebook](https://github.com/pcuenca/diffusers-examples/blob/main/notebooks/stable-diffusion-seeds.ipynb) shows how to do it step by step. You can also run it in Google Colab [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/pcuenca/diffusers-examples/blob/main/notebooks/stable-diffusion-seeds.ipynb).
+
+
+For more details, check out [the Stable Diffusion notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/stable_diffusion.ipynb) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/stable_diffusion.ipynb)
 and have a look into the [release notes](https://github.com/huggingface/diffusers/releases/tag/v0.2.0).
  
 ## Examples

+There are many ways to try running Diffusers! Here we outline code-focused tools (primarily using `DiffusionPipeline`s and Google Colab) and interactive web-tools.
+
+### Running Code
+
 If you want to run the code yourself 💻, you can try out:
 - [Text-to-Image Latent Diffusion](https://huggingface.co/CompVis/ldm-text2im-large-256)
 ```python
 # !pip install diffusers transformers
 from diffusers import DiffusionPipeline

+device = "cuda"
 model_id = "CompVis/ldm-text2im-large-256"

 # load model and scheduler
 ldm = DiffusionPipeline.from_pretrained(model_id)
+ldm = ldm.to(device)

 # run pipeline in inference (sample random noise and denoise)
 prompt = "A painting of a squirrel eating a burger"
-images = ldm([prompt], num_inference_steps=50, eta=0.3, guidance_scale=6)["sample"]
+image = ldm([prompt], num_inference_steps=50, eta=0.3, guidance_scale=6).images[0]

-# save images
-for idx, image in enumerate(images):
-    image.save(f"squirrel-{idx}.png")
+# save image
+image.save("squirrel.png")
 ```
 - [Unconditional Diffusion with discrete scheduler](https://huggingface.co/google/ddpm-celebahq-256)
 ```python
@@ -95,25 +284,34 @@ for idx, image in enumerate(images):
 from diffusers import DDPMPipeline, DDIMPipeline, PNDMPipeline

 model_id = "google/ddpm-celebahq-256"
+device = "cuda"

 # load model and scheduler
 ddpm = DDPMPipeline.from_pretrained(model_id)  # you can replace DDPMPipeline with DDIMPipeline or PNDMPipeline for faster inference
+ddpm.to(device)

 # run pipeline in inference (sample random noise and denoise)
-image = ddpm()["sample"]
+image = ddpm().images[0]

 # save image
-image[0].save("ddpm_generated_image.png")
+image.save("ddpm_generated_image.png")
 ```
 - [Unconditional Latent Diffusion](https://huggingface.co/CompVis/ldm-celebahq-256)
- [Unconditional Diffusion with continous scheduler](https://huggingface.co/google/ncsnpp-ffhq-1024)
+- [Unconditional Diffusion with continuous scheduler](https://huggingface.co/google/ncsnpp-ffhq-1024)

+**Other Notebooks**:
+* [Image-to-image generation with Stable Diffusion](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/image_2_image_using_diffusers.ipynb) ![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)
+* [Tweak images via repeated Stable Diffusion seeds](https://colab.research.google.com/github/pcuenca/diffusers-examples/blob/main/notebooks/stable-diffusion-seeds.ipynb) ![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)
+
+### Web Demos
 If you just want to play around with some web demos, you can try out the following 🚀 Spaces:
 | Model                          	| Hugging Face Spaces                                                                                                                                               	|
 |--------------------------------	|-------------------------------------------------------------------------------------------------------------------------------------------------------------------	|
 | Text-to-Image Latent Diffusion 	| [![Hugging Face Spaces](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue)](https://huggingface.co/spaces/CompVis/text2img-latent-diffusion) 	|
 | Faces generator                	| [![Hugging Face Spaces](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue)](https://huggingface.co/spaces/CompVis/celeba-latent-diffusion)    	|
 | DDPM with different schedulers 	| [![Hugging Face Spaces](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue)](https://huggingface.co/spaces/fusing/celeba-diffusion)           	|
+| Conditional generation from sketch  	| [![Hugging Face Spaces](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue)](https://huggingface.co/spaces/huggingface/diffuse-the-rest)           	|
+| Composable diffusion | [![Hugging Face Spaces](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue)](https://huggingface.co/spaces/Shuang59/Composable-Diffusion)           	|

 ## Definitions

@@ -148,24 +346,10 @@ The class provides functionality to compute previous image according to alpha, b
    
 ## Philosophy

- Readability and clarity is prefered over highly optimized code. A strong importance is put on providing readable, intuitive and elementary code design. *E.g.*, the provided [schedulers](https://github.com/huggingface/diffusers/tree/main/src/diffusers/schedulers) are separated from the provided [models](https://github.com/huggingface/diffusers/tree/main/src/diffusers/models) and provide well-commented code that can be read alongside the original paper.
- Diffusers is **modality independent** and focuses on providing pretrained models and tools to build systems that generate **continous outputs**, *e.g.* vision and audio.
+- Readability and clarity is preferred over highly optimized code. A strong importance is put on providing readable, intuitive and elementary code design. *E.g.*, the provided [schedulers](https://github.com/huggingface/diffusers/tree/main/src/diffusers/schedulers) are separated from the provided [models](https://github.com/huggingface/diffusers/tree/main/src/diffusers/models) and provide well-commented code that can be read alongside the original paper.
+- Diffusers is **modality independent** and focuses on providing pretrained models and tools to build systems that generate **continuous outputs**, *e.g.* vision and audio.
 - Diffusion models and schedulers are provided as concise, elementary building blocks. In contrast, diffusion pipelines are a collection of end-to-end diffusion systems that can be used out-of-the-box, should stay as close as possible to their original implementation and can include components of another library, such as text-encoders. Examples for diffusion pipelines are [Glide](https://github.com/openai/glide-text2im) and [Latent Diffusion](https://github.com/CompVis/latent-diffusion).

-## Installation
-
-**With `pip`**
-    
-```bash
-pip install --upgrade diffusers  # should install diffusers 0.2.1
-```
-
-**With `conda`**
-
-```sh
-conda install -c conda-forge diffusers
-```
-
 ## In the works

 For the first release, 🤗 Diffusers focuses on text-to-image diffusion techniques. However, diffusers can be used for much more than that! Over the upcoming releases, we'll be focusing on:
@@ -193,3 +377,16 @@ This library concretizes previous work by many different authors and would not h
 - @yang-song's Score-VE and Score-VP implementations, available [here](https://github.com/yang-song/score_sde_pytorch)

 We also want to thank @heejkoo for the very helpful overview of papers, code and resources on diffusion models, available [here](https://github.com/heejkoo/Awesome-Diffusion-Models) as well as @crowsonkb and @rromb for useful discussions and insights.
+
+## Citation
+
+```bibtex
+@misc{von-platen-etal-2022-diffusers,
+  author = {Patrick von Platen and Suraj Patil and Anton Lozhkov and Pedro Cuenca and Nathan Lambert and Kashif Rasul and Mishig Davaadorj and Thomas Wolf},
+  title = {Diffusers: State-of-the-art diffusion models},
+  year = {2022},
+  publisher = {GitHub},
+  journal = {GitHub repository},
+  howpublished = {\url{https://github.com/huggingface/diffusers}}
+}
+```
--- a/_typos.toml
+++ b/_typos.toml
@@ -0,0 +1,13 @@
+# Files for typos
+# Instruction:  https://github.com/marketplace/actions/typos-action#getting-started
+
+[default.extend-identifiers]
+
+[default.extend-words]
+NIN="NIN" # NIN is used in scripts/convert_ncsnpp_original_checkpoint_to_diffusers.py
+nd="np" # nd may be np (numpy)
+parms="parms" # parms is used in scripts/convert_original_stable_diffusion_to_diffusers.py
+
+
+[files]
+extend-exclude = ["_typos.toml"]
--- a/docs/source/_toctree.yml
+++ b/docs/source/_toctree.yml
@@ -1,40 +1,94 @@
 - sections:
  - local: index
-    title: 🧨 Diffusers
+    title: "🧨 Diffusers"
  - local: quicktour
-    title: Quicktour
-  - local: philosophy
-    title: Philosophy
-  title: Get started
+    title: "Quicktour"
+  - local: installation
+    title: "Installation"
+  title: "Get started"
 - sections:
  - sections:
-    - local: examples/diffusers_for_vision
-      title: Diffusers for Vision
-    - local: examples/diffusers_for_audio
-      title: Diffusers for Audio
-    - local: examples/diffusers_for_other
-      title: Diffusers for Other Modalities
-    title: Examples
-  title: Using Diffusers
+    - local: using-diffusers/loading
+      title: "Loading Pipelines, Models, and Schedulers"
+    - local: using-diffusers/configuration
+      title: "Configuring Pipelines, Models, and Schedulers"
+    - local: using-diffusers/custom_pipelines
+      title: "Loading and Creating Custom Pipelines"
+    title: "Loading"
+  - sections:
+    - local: using-diffusers/unconditional_image_generation
+      title: "Unconditional Image Generation"
+    - local: using-diffusers/conditional_image_generation
+      title: "Text-to-Image Generation"
+    - local: using-diffusers/img2img
+      title: "Text-Guided Image-to-Image"
+    - local: using-diffusers/inpaint
+      title: "Text-Guided Image-Inpainting"
+    - local: using-diffusers/custom
+      title: "Create a custom pipeline"
+    title: "Pipelines for Inference"
+  title: "Using Diffusers"
+- sections:
+  - local: optimization/fp16
+    title: "Memory and Speed"
+  - local: optimization/onnx
+    title: "ONNX"
+  - local: optimization/open_vino
+    title: "Open Vino"
+  - local: optimization/mps
+    title: "MPS"
+  title: "Optimization/Special Hardware"
+- sections:
+  - local: training/overview
+    title: "Overview"
+  - local: training/unconditional_training
+    title: "Unconditional Image Generation"
+  - local: training/text_inversion
+    title: "Text Inversion"
+  - local: training/text2image
+    title: "Text-to-image"
+  title: "Training"
+- sections:
+  - local: conceptual/stable_diffusion
+    title: "Stable Diffusion"
+  - local: conceptual/philosophy
+    title: "Philosophy"
+  - local: conceptual/contribution
+    title: "How to contribute?"
+  title: "Conceptual Guides"
 - sections:
  - sections:
-    - local: pipelines
-      title: Pipelines
-    - local: schedulers
-      title: Schedulers
-    - local: models
-      title: Models
-    title: Main Classes
+    - local: api/models
+      title: "Models"
+    - local: api/schedulers
+      title: "Schedulers"
+    - local: api/diffusion_pipeline
+      title: "Diffusion Pipeline"
+    - local: api/logging
+      title: "Logging"
+    - local: api/configuration
+      title: "Configuration"
+    - local: api/outputs
+      title: "Outputs"
+    title: "Main Classes"
  - sections:
-    - local: pipelines/glide
-      title: "Glide"
-    title: Pipelines
-  - sections:
-    - local: schedulers/ddpm
+    - local: api/pipelines/overview
+      title: "Overview"
+    - local: api/pipelines/ddim
+      title: "DDIM"
+    - local: api/pipelines/ddpm
      title: "DDPM"
-    title: Schedulers
-  - sections:
-    - local: models/unet
-      title: "Unet"
-    title: Models
-  title: API
+    - local: api/pipelines/latent_diffusion
+      title: "Latent Diffusion"
+    - local: api/pipelines/latent_diffusion_uncond
+      title: "Unconditional Latent Diffusion"
+    - local: api/pipelines/pndm
+      title: "PNDM"
+    - local: api/pipelines/score_sde_ve
+      title: "Score SDE VE"
+    - local: api/pipelines/stable_diffusion
+      title: "Stable Diffusion"
+    - local: api/pipelines/stochastic_karras_ve
+      title: "Stochastic Karras VE"
+    title: "Pipelines"
+  title: "API"
--- a/docs/source/api/configuration.mdx
+++ b/docs/source/api/configuration.mdx
@@ -0,0 +1,23 @@
+<!--Copyright 2022 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Configuration
+
+In Diffusers, schedulers of type [`schedulers.scheduling_utils.SchedulerMixin`], and models of type [`ModelMixin`] inherit from [`ConfigMixin`] which conveniently takes care of storing all parameters that are 
+passed to the respective `__init__` methods in a JSON-configuration file.
+
+TODO(PVP) - add example and better info here
+
+## ConfigMixin
+[[autodoc]] ConfigMixin
+	- from_config
+	- save_config
--- a/docs/source/api/diffusion_pipeline.mdx
+++ b/docs/source/api/diffusion_pipeline.mdx
@@ -0,0 +1,39 @@
+<!--Copyright 2022 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Pipelines
+
+The [`DiffusionPipeline`] is the easiest way to load any pretrained diffusion pipeline from the [Hub](https://huggingface.co/models?library=diffusers) and to use it in inference.
+
+<Tip>
+	
+	One should not use the Diffusion Pipeline class for training or fine-tuning a diffusion model. Individual
+	components of diffusion pipelines are usually trained individually, so we suggest working directly
+	with [`UNet2DModel`] and [`UNet2DConditionModel`].
+
+</Tip>
+
+Any diffusion pipeline that is loaded with [`~DiffusionPipeline.from_pretrained`] will automatically 
+detect the pipeline type, *e.g.* [`StableDiffusionPipeline`] and consequently load each component of the 
+pipeline and pass them into the `__init__` function of the pipeline, *e.g.* [`~StableDiffusionPipeline.__init__`].
+
+Any pipeline object can be saved locally with [`~DiffusionPipeline.save_pretrained`].
+
+## DiffusionPipeline
+[[autodoc]] DiffusionPipeline
+	- from_pretrained
+	- save_pretrained
+
+## ImagePipelineOutput
+By default diffusion pipelines return an object of class
+
+[[autodoc]] pipeline_utils.ImagePipelineOutput
--- a/docs/source/api/logging.mdx
+++ b/docs/source/api/logging.mdx
@@ -0,0 +1,98 @@
+<!--Copyright 2020 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Logging
+
+🧨 Diffusers has a centralized logging system, so that you can set up the verbosity of the library easily.
+
+Currently the default verbosity of the library is `WARNING`.
+
+To change the level of verbosity, just use one of the direct setters. For instance, here is how to change the verbosity
+to the INFO level.
+
+```python
+import diffusers
+
+diffusers.logging.set_verbosity_info()
+```
+
+You can also use the environment variable `DIFFUSERS_VERBOSITY` to override the default verbosity. You can set it
+to one of the following: `debug`, `info`, `warning`, `error`, `critical`. For example:
+
+```bash
+DIFFUSERS_VERBOSITY=error ./myprogram.py
+```
+
+Additionally, some `warnings` can be disabled by setting the environment variable
+`DIFFUSERS_NO_ADVISORY_WARNINGS` to a true value, like *1*. This will disable any warning that is logged using
+[`logger.warning_advice`]. For example:
+
+```bash
+DIFFUSERS_NO_ADVISORY_WARNINGS=1 ./myprogram.py
+```
+
+Here is an example of how to use the same logger as the library in your own module or script:
+
+```python
+from diffusers.utils import logging
+
+logging.set_verbosity_info()
+logger = logging.get_logger("diffusers")
+logger.info("INFO")
+logger.warning("WARN")
+```
+
+
+All the methods of this logging module are documented below. The main ones are
+[`logging.get_verbosity`] to get the current level of verbosity in the logger and
+[`logging.set_verbosity`] to set the verbosity to the level of your choice. In order (from the least
+verbose to the most verbose), those levels (with their corresponding int values in parentheses) are:
+
+- `diffusers.logging.CRITICAL` or `diffusers.logging.FATAL` (int value, 50): only report the most
+  critical errors.
+- `diffusers.logging.ERROR` (int value, 40): only report errors.
+- `diffusers.logging.WARNING` or `diffusers.logging.WARN` (int value, 30): only report errors and
+  warnings. This is the default level used by the library.
+- `diffusers.logging.INFO` (int value, 20): report errors, warnings and basic information.
+- `diffusers.logging.DEBUG` (int value, 10): report all information.
+
+By default, `tqdm` progress bars will be displayed during model download. [`logging.disable_progress_bar`] and [`logging.enable_progress_bar`] can be used to suppress or unsuppress this behavior.
+
+## Base setters
+
+[[autodoc]] logging.set_verbosity_error
+
+[[autodoc]] logging.set_verbosity_warning
+
+[[autodoc]] logging.set_verbosity_info
+
+[[autodoc]] logging.set_verbosity_debug
+
+## Other functions
+
+[[autodoc]] logging.get_verbosity
+
+[[autodoc]] logging.set_verbosity
+
+[[autodoc]] logging.get_logger
+
+[[autodoc]] logging.enable_default_handler
+
+[[autodoc]] logging.disable_default_handler
+
+[[autodoc]] logging.enable_explicit_format
+
+[[autodoc]] logging.reset_format
+
+[[autodoc]] logging.enable_progress_bar
+
+[[autodoc]] logging.disable_progress_bar
--- a/docs/source/api/models.mdx
+++ b/docs/source/api/models.mdx
@@ -16,13 +16,50 @@ Diffusers contains pretrained models for popular algorithms and modules for crea
 The primary function of these models is to denoise an input sample, by modeling the distribution $p_\theta(\mathbf{x}_{t-1}|\mathbf{x}_t)$.
 The models are built on the base class ['ModelMixin'] that is a `torch.nn.module` with basic functionality for saving and loading models both locally and from the HuggingFace hub.

-## API
+## ModelMixin
+[[autodoc]] ModelMixin

-Models should provide the `def forward` function and initialization of the model.
-All saving, loading, and utilities should be in the base ['ModelMixin'] class.
+## UNet2DOutput
+[[autodoc]] models.unet_2d.UNet2DOutput

-## Examples
+## UNet2DModel
+[[autodoc]] UNet2DModel

- The ['UNetModel'] was proposed in [TODO](https://arxiv.org/) and has been used in paper1, paper2, paper3.
- Extensions of the ['UNetModel'] include the ['UNetGlideModel'] that uses attention and timestep embeddings for the [GLIDE](https://arxiv.org/abs/2112.10741) paper, the ['UNetGradTTS'] model from this [paper](https://arxiv.org/abs/2105.06337) for text-to-speech, ['UNetLDMModel'] for latent-diffusion models in this [paper](https://arxiv.org/abs/2112.10752), and the ['TemporalUNet'] used for time-series prediciton in this reinforcement learning [paper](https://arxiv.org/abs/2205.09991).
- TODO: mention VAE / SDE score estimation
+## UNet2DConditionOutput
+[[autodoc]] models.unet_2d_condition.UNet2DConditionOutput
+
+## UNet2DConditionModel
+[[autodoc]] UNet2DConditionModel
+
+## DecoderOutput
+[[autodoc]] models.vae.DecoderOutput
+
+## VQEncoderOutput
+[[autodoc]] models.vae.VQEncoderOutput
+
+## VQModel
+[[autodoc]] VQModel
+
+## AutoencoderKLOutput
+[[autodoc]] models.vae.AutoencoderKLOutput
+
+## AutoencoderKL
+[[autodoc]] AutoencoderKL
+
+## FlaxModelMixin
+[[autodoc]] FlaxModelMixin
+
+## FlaxUNet2DConditionOutput
+[[autodoc]] models.unet_2d_condition_flax.FlaxUNet2DConditionOutput
+
+## FlaxUNet2DConditionModel
+[[autodoc]] FlaxUNet2DConditionModel
+
+## FlaxDecoderOutput
+[[autodoc]] models.vae_flax.FlaxDecoderOutput
+
+## FlaxAutoencoderKLOutput
+[[autodoc]] models.vae_flax.FlaxAutoencoderKLOutput
+
+## FlaxAutoencoderKL
+[[autodoc]] FlaxAutoencoderKL
--- a/docs/source/api/outputs.mdx
+++ b/docs/source/api/outputs.mdx
@@ -0,0 +1,55 @@
+<!--Copyright 2022 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# BaseOutputs
+
+All models have outputs that are instances of subclasses of [`~utils.BaseOutput`]. Those are
+data structures containing all the information returned by the model, but that can also be used as tuples or
+dictionaries.
+
+Let's see how this looks in an example:
+
+```python
+from diffusers import DDIMPipeline
+
+pipeline = DDIMPipeline.from_pretrained("google/ddpm-cifar10-32")
+outputs = pipeline()
+```
+
+The `outputs` object is a [`~pipeline_utils.ImagePipelineOutput`]. As we can see in the
+documentation of that class below, this means it has an `images` attribute.
+
+You can access each attribute as you would usually do, and if that attribute has not been returned by the model, you will get `None`:
+
+```python
+outputs.images
+```
+
+or via keyword lookup
+
+```python
+outputs["images"]
+```
+
+When considering our `outputs` object as a tuple, it only considers the attributes that don't have `None` values.
+Here for instance, we could retrieve images via indexing:
+
+```python
+outputs[:1]
+```
+
+which will return the tuple `(outputs.images,)` for instance.
+
+## BaseOutput
+
+[[autodoc]] utils.BaseOutput
+    - to_tuple
--- a/docs/source/api/pipelines/ddim.mdx
+++ b/docs/source/api/pipelines/ddim.mdx
@@ -0,0 +1,22 @@
+# DDIM
+
+## Overview
+
+[Denoising Diffusion Implicit Models](https://arxiv.org/abs/2010.02502) (DDIM) by Jiaming Song, Chenlin Meng and Stefano Ermon.
+
+The abstract of the paper is the following:
+
+Denoising diffusion probabilistic models (DDPMs) have achieved high quality image generation without adversarial training, yet they require simulating a Markov chain for many steps to produce a sample. To accelerate sampling, we present denoising diffusion implicit models (DDIMs), a more efficient class of iterative implicit probabilistic models with the same training procedure as DDPMs. In DDPMs, the generative process is defined as the reverse of a Markovian diffusion process. We construct a class of non-Markovian diffusion processes that lead to the same training objective, but whose reverse process can be much faster to sample from. We empirically demonstrate that DDIMs can produce high quality samples 10× to 50× faster in terms of wall-clock time compared to DDPMs, allow us to trade off computation for sample quality, and can perform semantically meaningful image interpolation directly in the latent space.
+
+The original codebase of this paper can be found [here](https://github.com/ermongroup/ddim).
+
+## Available Pipelines:
+
+| Pipeline | Tasks | Colab
+|---|---|:---:|
+| [pipeline_ddim.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/ddim/pipeline_ddim.py) | *Unconditional Image Generation* | - |
+
+
+## DDIMPipeline
+[[autodoc]] DDIMPipeline
+    - __call__
--- a/docs/source/api/pipelines/ddpm.mdx
+++ b/docs/source/api/pipelines/ddpm.mdx
@@ -0,0 +1,24 @@
+# DDPM
+
+## Overview
+
+[Denoising Diffusion Probabilistic Models](https://arxiv.org/abs/2006.11239) 
+ (DDPM) by Jonathan Ho, Ajay Jain and Pieter Abbeel proposes the diffusion based model of the same name, but in the context of the 🤗 Diffusers library, DDPM refers to the discrete denoising scheduler from the paper as well as the pipeline.
+
+The abstract of the paper is the following:
+
+We present high quality image synthesis results using diffusion probabilistic models, a class of latent variable models inspired by considerations from nonequilibrium thermodynamics. Our best results are obtained by training on a weighted variational bound designed according to a novel connection between diffusion probabilistic models and denoising score matching with Langevin dynamics, and our models naturally admit a progressive lossy decompression scheme that can be interpreted as a generalization of autoregressive decoding. On the unconditional CIFAR10 dataset, we obtain an Inception score of 9.46 and a state-of-the-art FID score of 3.17. On 256x256 LSUN, we obtain sample quality similar to ProgressiveGAN.
+
+The original codebase of this paper can be found [here](https://github.com/hojonathanho/diffusion).
+
+
+## Available Pipelines:
+
+| Pipeline | Tasks | Colab
+|---|---|:---:|
+| [pipeline_ddpm.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/ddpm/pipeline_ddpm.py) | *Unconditional Image Generation* | - |
+
+
+## DDPMPipeline
+[[autodoc]] DDPMPipeline
+    - __call__
--- a/docs/source/api/pipelines/latent_diffusion.mdx
+++ b/docs/source/api/pipelines/latent_diffusion.mdx
@@ -0,0 +1,30 @@
+# Latent Diffusion
+
+## Overview
+
+Latent Diffusion was proposed in [High-Resolution Image Synthesis with Latent Diffusion Models](https://arxiv.org/abs/2112.10752) by Robin Rombach, Andreas Blattmann, Dominik Lorenz, Patrick Esser, Björn Ommer.
+
+The abstract of the paper is the following:
+
+*By decomposing the image formation process into a sequential application of denoising autoencoders, diffusion models (DMs) achieve state-of-the-art synthesis results on image data and beyond. Additionally, their formulation allows for a guiding mechanism to control the image generation process without retraining. However, since these models typically operate directly in pixel space, optimization of powerful DMs often consumes hundreds of GPU days and inference is expensive due to sequential evaluations. To enable DM training on limited computational resources while retaining their quality and flexibility, we apply them in the latent space of powerful pretrained autoencoders. In contrast to previous work, training diffusion models on such a representation allows for the first time to reach a near-optimal point between complexity reduction and detail preservation, greatly boosting visual fidelity. By introducing cross-attention layers into the model architecture, we turn diffusion models into powerful and flexible generators for general conditioning inputs such as text or bounding boxes and high-resolution synthesis becomes possible in a convolutional manner. Our latent diffusion models (LDMs) achieve a new state of the art for image inpainting and highly competitive performance on various tasks, including unconditional image generation, semantic scene synthesis, and super-resolution, while significantly reducing computational requirements compared to pixel-based DMs.*
+
+The original codebase can be found [here](https://github.com/CompVis/latent-diffusion).
+
+## Tips:
+
+- 
+- 
+- 
+
+## Available Pipelines:
+
+| Pipeline | Tasks | Colab
+|---|---|:---:|
+| [pipeline_latent_diffusion.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py) | *Text-to-Image Generation* | - |
+
+## Examples:
+
+
+## LDMTextToImagePipeline
+[[autodoc]] pipelines.latent_diffusion.pipeline_latent_diffusion.LDMTextToImagePipeline
+    - __call__
--- a/docs/source/api/pipelines/latent_diffusion_uncond.mdx
+++ b/docs/source/api/pipelines/latent_diffusion_uncond.mdx
@@ -0,0 +1,29 @@
+# Unconditional Latent Diffusion
+
+## Overview
+
+Unconditional Latent Diffusion was proposed in [High-Resolution Image Synthesis with Latent Diffusion Models](https://arxiv.org/abs/2112.10752) by Robin Rombach, Andreas Blattmann, Dominik Lorenz, Patrick Esser, Björn Ommer.
+
+The abstract of the paper is the following:
+
+*By decomposing the image formation process into a sequential application of denoising autoencoders, diffusion models (DMs) achieve state-of-the-art synthesis results on image data and beyond. Additionally, their formulation allows for a guiding mechanism to control the image generation process without retraining. However, since these models typically operate directly in pixel space, optimization of powerful DMs often consumes hundreds of GPU days and inference is expensive due to sequential evaluations. To enable DM training on limited computational resources while retaining their quality and flexibility, we apply them in the latent space of powerful pretrained autoencoders. In contrast to previous work, training diffusion models on such a representation allows for the first time to reach a near-optimal point between complexity reduction and detail preservation, greatly boosting visual fidelity. By introducing cross-attention layers into the model architecture, we turn diffusion models into powerful and flexible generators for general conditioning inputs such as text or bounding boxes and high-resolution synthesis becomes possible in a convolutional manner. Our latent diffusion models (LDMs) achieve a new state of the art for image inpainting and highly competitive performance on various tasks, including unconditional image generation, semantic scene synthesis, and super-resolution, while significantly reducing computational requirements compared to pixel-based DMs.*
+
+The original codebase can be found [here](https://github.com/CompVis/latent-diffusion).
+
+## Tips:
+
+- 
+- 
+- 
+
+## Available Pipelines:
+
+| Pipeline | Tasks | Colab
+|---|---|:---:|
+| [pipeline_latent_diffusion_uncond.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/latent_diffusion_uncond/pipeline_latent_diffusion_uncond.py) | *Unconditional Image Generation* | - |
+
+## Examples:
+
+## LDMPipeline
+[[autodoc]] LDMPipeline
+    - __call__
--- a/docs/source/api/pipelines/overview.mdx
+++ b/docs/source/api/pipelines/overview.mdx
@@ -0,0 +1,184 @@
+<!--Copyright 2022 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Pipelines
+
+Pipelines provide a simple way to run state-of-the-art diffusion models in inference.
+Most diffusion systems consist of multiple independently-trained models and highly adaptable scheduler 
+components - all of which are needed to have a functioning end-to-end diffusion system.
+
+As an example, [Stable Diffusion](https://huggingface.co/blog/stable_diffusion) has three independently trained models:
+- [Autoencoder](./api/models#vae)
+- [Conditional Unet](./api/models#UNet2DConditionModel)
+- [CLIP text encoder](https://huggingface.co/docs/transformers/v4.21.2/en/model_doc/clip#transformers.CLIPTextModel)
+- a scheduler component, [scheduler](./api/scheduler#pndm), 
+- a [CLIPFeatureExtractor](https://huggingface.co/docs/transformers/v4.21.2/en/model_doc/clip#transformers.CLIPFeatureExtractor),
+- as well as a [safety checker](./stable_diffusion#safety_checker).
+All of these components are necessary to run stable diffusion in inference even though they were trained 
+or created independently from each other.
+
+To that end, we strive to offer all open-sourced, state-of-the-art diffusion systems under a unified API.
+More specifically, we strive to provide pipelines that
+- 1. can load the officially published weights and yield 1-to-1 the same outputs as the original implementation according to the corresponding paper (*e.g.* [LatentDiffusionPipeline](https://github.com/huggingface/diffusers/tree/main/src/diffusers/pipelines/latent_diffusion), uses the officially released weights of [High-Resolution Image Synthesis with Latent Diffusion Models](https://arxiv.org/abs/2112.10752)),
+- 2. have a simple user interface to run the model in inference (see the [Pipelines API](#pipelines-api) section), 
+- 3. are easy to understand with code that is self-explanatory and can be read along-side the official paper (see [Pipelines summary](#pipelines-summary)),
+- 4. can easily be contributed by the community (see the [Contribution](#contribution) section).
+
+**Note** that pipelines do not (and should not) offer any training functionality. 
+If you are looking for *official* training examples, please have a look at [examples](https://github.com/huggingface/diffusers/tree/main/examples).
+
+## 🧨 Diffusers Summary
+
+The following table summarizes all officially supported pipelines, their corresponding paper, and if 
+available a colab notebook to directly try them out.
+
+| Pipeline | Paper | Tasks | Colab
+|---|---|:---:|:---:|
+| [ddpm](./ddpm) | [**Denoising Diffusion Probabilistic Models**](https://arxiv.org/abs/2006.11239) | Unconditional Image Generation |
+| [ddim](./ddim) | [**Denoising Diffusion Implicit Models**](https://arxiv.org/abs/2010.02502) | Unconditional Image Generation | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/training_example.ipynb)
+| [latent_diffusion](./latent_diffusion) | [**High-Resolution Image Synthesis with Latent Diffusion Models**](https://arxiv.org/abs/2112.10752)| Text-to-Image Generation | 
+| [latent_diffusion_uncond](./latent_diffusion_uncond) | [**High-Resolution Image Synthesis with Latent Diffusion Models**](https://arxiv.org/abs/2112.10752) | Unconditional Image Generation | 
+| [pndm](./pndm) | [**Pseudo Numerical Methods for Diffusion Models on Manifolds**](https://arxiv.org/abs/2202.09778) | Unconditional Image Generation | 
+| [score_sde_ve](./score_sde_ve) | [**Score-Based Generative Modeling through Stochastic Differential Equations**](https://openreview.net/forum?id=PxTIG12RRHS) | Unconditional Image Generation | 
+| [score_sde_vp](./score_sde_vp) | [**Score-Based Generative Modeling through Stochastic Differential Equations**](https://openreview.net/forum?id=PxTIG12RRHS) | Unconditional Image Generation | 
+| [stable_diffusion](./stable_diffusion) | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release) | Text-to-Image Generation | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/training_example.ipynb)
+| [stable_diffusion](./stable_diffusion) | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release) | Image-to-Image Text-Guided Generation | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/image_2_image_using_diffusers.ipynb)
+| [stable_diffusion](./stable_diffusion) | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release) | Text-Guided Image Inpainting | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/in_painting_with_stable_diffusion_using_diffusers.ipynb)
+| [stochastic_karras_ve](./stochastic_karras_ve) | [**Elucidating the Design Space of Diffusion-Based Generative Models**](https://arxiv.org/abs/2206.00364) | Unconditional Image Generation | 
+
+**Note**: Pipelines are simple examples of how to play around with the diffusion systems as described in the corresponding papers. 
+
+However, most of them can be adapted to use different scheduler components or even different model components. Some pipeline examples are shown in the [Examples](#examples) below.
+
+## Pipelines API
+
+Diffusion models often consist of multiple independently-trained models or other previously existing components. 
+
+
+Each model has been trained independently on a different task and the scheduler can easily be swapped out and replaced with a different one. 
+However, we want to be able to easily load all components and use them in inference - even if one component, *e.g.* CLIP's text encoder, originates from a different library, such as [Transformers](https://github.com/huggingface/transformers). To that end, all pipelines provide the following functionality:
+
+- [`from_pretrained` method](../diffusion_pipeline) that accepts a Hugging Face Hub repository id, *e.g.* [CompVis/stable-diffusion-v1-4](https://huggingface.co/CompVis/stable-diffusion-v1-4) or a path to a local directory, *e.g.*
+"./stable-diffusion". To correctly retrieve which models and components should be loaded, one has to provide a `model_index.json` file, *e.g.* [CompVis/stable-diffusion-v1-4/model_index.json](https://huggingface.co/CompVis/stable-diffusion-v1-4/blob/main/model_index.json), which defines all components that should be 
+loaded into the pipelines. More specifically, for each model/component one needs to define the format `<name>: ["<library>", "<class name>"]`. `<name>` is the attribute name given to the loaded instance of `<class name>` which can be found in the library or pipeline folder called `"<library>"`.
+- [`save_pretrained`](../diffusion_pipeline) that accepts a local path, *e.g.* `./stable-diffusion` under which all models/components of the pipeline will be saved. For each component/model a folder is created inside the local path that is named after the given attribute name, *e.g.* `./stable-diffusion/unet`.
+In addition, a `model_index.json` file is created at the root of the local path, *e.g.* `./stable-diffusion/model_index.json` so that the complete pipeline can again be instantiated
+from the local path.
+- [`to`](../diffusion_pipeline) which accepts a `string` or `torch.device` to move all models that are of type `torch.nn.Module` to the passed device. The behavior is fully analogous to [PyTorch's `to` method](https://pytorch.org/docs/stable/generated/torch.nn.Module.html#torch.nn.Module.to).
+- [`__call__`] method to use the pipeline in inference. `__call__` defines inference logic of the pipeline and should ideally encompass all aspects of it, from pre-processing to forwarding tensors to the different models and schedulers, as well as post-processing. The API of the `__call__` method can strongly vary from pipeline to pipeline. *E.g.* a text-to-image pipeline, such as [`StableDiffusionPipeline`](./stable_diffusion) should accept among other things the text prompt to generate the image. A pure image generation pipeline, such as [DDPMPipeline](https://github.com/huggingface/diffusers/tree/main/src/diffusers/pipelines/ddpm) on the other hand can be run without providing any inputs. To better understand what inputs can be adapted for 
+each pipeline, one should look directly into the respective pipeline.
+
+**Note**: All pipelines have PyTorch's autograd disabled by decorating the `__call__` method with a [`torch.no_grad`](https://pytorch.org/docs/stable/generated/torch.no_grad.html) decorator because pipelines should
+not be used for training. If you want to store the gradients during the forward pass, we recommend writing your own pipeline, see also our [community-examples](https://github.com/huggingface/diffusers/tree/main/examples/community).
+
+## Contribution
+
+We are more than happy about any contribution to the officially supported pipelines 🤗. We aspire
+for all of our pipelines to be **self-contained**, **easy-to-tweak**, **beginner-friendly** and for **one-purpose-only**.
+
+- **Self-contained**: A pipeline shall be as self-contained as possible. More specifically, this means that all functionality should be either directly defined in the pipeline file itself, should be inherited from (and only from) the [`DiffusionPipeline` class](../diffusion_pipeline) or be directly attached to the model and scheduler components of the pipeline.
+- **Easy-to-use**: Pipelines should be extremely easy to use - one should be able to load the pipeline and 
+use it for its designated task, *e.g.* text-to-image generation, in just a couple of lines of code. Most 
+logic including pre-processing, an unrolled diffusion loop, and post-processing should all happen inside the `__call__` method.
+- **Easy-to-tweak**: Certain pipelines will not be able to handle all use cases and tasks that you might like them to. If you want to use a certain pipeline for a specific use case that is not yet supported, you might have to copy the pipeline file and tweak the code to your needs. We try to make the pipeline code as readable as possible so that each part –from pre-processing to diffusing to post-processing– can easily be adapted. If you would like the community to benefit from your customized pipeline, we would love to see a contribution to our [community-examples](https://github.com/huggingface/diffusers/tree/main/examples/community). If you feel that an important pipeline should be part of the official pipelines but isn't, a contribution to the [official pipelines](./overview) would be even better.
+- **One-purpose-only**: Pipelines should be used for one task and one task only. Even if two tasks are very similar from a modeling point of view, *e.g.* image2image translation and in-painting, pipelines shall be used for one task only to keep them *easy-to-tweak* and *readable*.
+
+## Examples
+
+### Text-to-Image generation with Stable Diffusion
+
+```python
+# make sure you're logged in with `huggingface-cli login`
+from diffusers import StableDiffusionPipeline, LMSDiscreteScheduler
+
+pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4")
+pipe = pipe.to("cuda")
+
+prompt = "a photo of an astronaut riding a horse on mars"
+image = pipe(prompt).images[0]
+
+image.save("astronaut_rides_horse.png")
+```
+
+### Image-to-Image text-guided generation with Stable Diffusion
+
+The `StableDiffusionImg2ImgPipeline` lets you pass a text prompt and an initial image to condition the generation of new images.
+
+```python
+import requests
+import torch
+from PIL import Image
+from io import BytesIO
+
+from diffusers import StableDiffusionImg2ImgPipeline
+
+# load the pipeline
+device = "cuda"
+pipe = StableDiffusionImg2ImgPipeline.from_pretrained(
+    "CompVis/stable-diffusion-v1-4", revision="fp16", torch_dtype=torch.float16
+).to(device)
+
+# let's download an initial image
+url = "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/assets/stable-samples/img2img/sketch-mountains-input.jpg"
+
+response = requests.get(url)
+init_image = Image.open(BytesIO(response.content)).convert("RGB")
+init_image = init_image.resize((768, 512))
+
+prompt = "A fantasy landscape, trending on artstation"
+
+images = pipe(prompt=prompt, init_image=init_image, strength=0.75, guidance_scale=7.5).images
+
+images[0].save("fantasy_landscape.png")
+```
+You can also run this example on colab [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/image_2_image_using_diffusers.ipynb)
+
+### Tweak prompts reusing seeds and latents
+
+You can generate your own latents to reproduce results, or tweak your prompt on a specific result you liked. [This notebook](https://github.com/pcuenca/diffusers-examples/blob/main/notebooks/stable-diffusion-seeds.ipynb) shows how to do it step by step. You can also run it in Google Colab [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/pcuenca/diffusers-examples/blob/main/notebooks/stable-diffusion-seeds.ipynb).
+
+
+### In-painting using Stable Diffusion
+
+The `StableDiffusionInpaintPipeline` lets you edit specific parts of an image by providing a mask and text prompt.
+
+```python
+import PIL
+import requests
+import torch
+from io import BytesIO
+
+from diffusers import StableDiffusionInpaintPipeline
+
+
+def download_image(url):
+    response = requests.get(url)
+    return PIL.Image.open(BytesIO(response.content)).convert("RGB")
+
+
+img_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo.png"
+mask_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo_mask.png"
+
+init_image = download_image(img_url).resize((512, 512))
+mask_image = download_image(mask_url).resize((512, 512))
+
+pipe = StableDiffusionInpaintPipeline.from_pretrained(
+    "runwayml/stable-diffusion-inpainting",
+    revision="fp16",
+    torch_dtype=torch.float16,
+)
+pipe = pipe.to("cuda")
+
+prompt = "Face of a yellow cat, high resolution, sitting on a park bench"
+image = pipe(prompt=prompt, image=init_image, mask_image=mask_image).images[0]
+```
+
+You can also run this example on colab [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/in_painting_with_stable_diffusion_using_diffusers.ipynb)
--- a/docs/source/api/pipelines/pndm.mdx
+++ b/docs/source/api/pipelines/pndm.mdx
@@ -0,0 +1,23 @@
+# PNDM
+
+## Overview
+
+[Pseudo Numerical methods for Diffusion Models on manifolds](https://arxiv.org/abs/2202.09778) (PNDM) by Luping Liu, Yi Ren, Zhijie Lin and Zhou Zhao.
+
+The abstract of the paper is the following:
+
+Denoising Diffusion Probabilistic Models (DDPMs) can generate high-quality samples such as image and audio samples. However, DDPMs require hundreds to thousands of iterations to produce final samples. Several prior works have successfully accelerated DDPMs through adjusting the variance schedule (e.g., Improved Denoising Diffusion Probabilistic Models) or the denoising equation (e.g., Denoising Diffusion Implicit Models (DDIMs)). However, these acceleration methods cannot maintain the quality of samples and even introduce new noise at a high speedup rate, which limit their practicability. To accelerate the inference process while keeping the sample quality, we provide a fresh perspective that DDPMs should be treated as solving differential equations on manifolds. Under such a perspective, we propose pseudo numerical methods for diffusion models (PNDMs). Specifically, we figure out how to solve differential equations on manifolds and show that DDIMs are simple cases of pseudo numerical methods. We change several classical numerical methods to corresponding pseudo numerical methods and find that the pseudo linear multi-step method is the best in most situations. According to our experiments, by directly using pre-trained models on Cifar10, CelebA and LSUN, PNDMs can generate higher quality synthetic images with only 50 steps compared with 1000-step DDIMs (20x speedup), significantly outperform DDIMs with 250 steps (by around 0.4 in FID) and have good generalization on different variance schedules. 
+
+The original codebase can be found [here](https://github.com/luping-liu/PNDM).
+
+## Available Pipelines:
+
+| Pipeline | Tasks | Colab
+|---|---|:---:|
+| [pipeline_pndm.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/pndm/pipeline_pndm.py) | *Unconditional Image Generation* | - |
+
+
+## PNDMPipeline
+[[autodoc]] pipelines.pndm.pipeline_pndm.PNDMPipeline
+    - __call__
+
--- a/docs/source/api/pipelines/score_sde_ve.mdx
+++ b/docs/source/api/pipelines/score_sde_ve.mdx
@@ -0,0 +1,24 @@
+# Score SDE VE
+
+## Overview
+
+[Score-Based Generative Modeling through Stochastic Differential Equations](https://arxiv.org/abs/2011.13456) (Score SDE) by Yang Song, Jascha Sohl-Dickstein, Diederik P. Kingma, Abhishek Kumar, Stefano Ermon and Ben Poole.
+
+The abstract of the paper is the following:
+
+Creating noise from data is easy; creating data from noise is generative modeling. We present a stochastic differential equation (SDE) that smoothly transforms a complex data distribution to a known prior distribution by slowly injecting noise, and a corresponding reverse-time SDE that transforms the prior distribution back into the data distribution by slowly removing the noise. Crucially, the reverse-time SDE depends only on the time-dependent gradient field (a.k.a., score) of the perturbed data distribution. By leveraging advances in score-based generative modeling, we can accurately estimate these scores with neural networks, and use numerical SDE solvers to generate samples. We show that this framework encapsulates previous approaches in score-based generative modeling and diffusion probabilistic modeling, allowing for new sampling procedures and new modeling capabilities. In particular, we introduce a predictor-corrector framework to correct errors in the evolution of the discretized reverse-time SDE. We also derive an equivalent neural ODE that samples from the same distribution as the SDE, but additionally enables exact likelihood computation, and improved sampling efficiency. In addition, we provide a new way to solve inverse problems with score-based models, as demonstrated with experiments on class-conditional generation, image inpainting, and colorization. Combined with multiple architectural improvements, we achieve record-breaking performance for unconditional image generation on CIFAR-10 with an Inception score of 9.89 and FID of 2.20, a competitive likelihood of 2.99 bits/dim, and demonstrate high fidelity generation of 1024 x 1024 images for the first time from a score-based generative model.
+
+The original codebase can be found [here](https://github.com/yang-song/score_sde_pytorch).
+
+This pipeline implements the Variance Exploding (VE) variant of the method.
+
+## Available Pipelines:
+
+| Pipeline | Tasks | Colab
+|---|---|:---:|
+| [pipeline_score_sde_ve.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/score_sde_ve/pipeline_score_sde_ve.py) | *Unconditional Image Generation* | - |
+
+## ScoreSdeVePipeline
+[[autodoc]] ScoreSdeVePipeline
+    - __call__
+
--- a/docs/source/api/pipelines/stable_diffusion.mdx
+++ b/docs/source/api/pipelines/stable_diffusion.mdx
@@ -0,0 +1,39 @@
+# Stable diffusion pipelines
+
+Stable Diffusion is a text-to-image _latent diffusion_ model created by the researchers and engineers from [CompVis](https://github.com/CompVis), [Stability AI](https://stability.ai/) and [LAION](https://laion.ai/). It's trained on 512x512 images from a subset of the [LAION-5B](https://laion.ai/blog/laion-5b/) dataset. This model uses a frozen CLIP ViT-L/14 text encoder to condition the model on text prompts. With its 860M UNet and 123M text encoder, the model is relatively lightweight and can run on consumer GPUs.
+
+Latent diffusion is the research on top of which Stable Diffusion was built. It was proposed in [High-Resolution Image Synthesis with Latent Diffusion Models](https://arxiv.org/abs/2112.10752) by Robin Rombach, Andreas Blattmann, Dominik Lorenz, Patrick Esser, Björn Ommer. You can learn more details about it in the [specific pipeline for latent diffusion](pipelines/latent_diffusion) that is part of 🤗 Diffusers.
+
+For more details about how Stable Diffusion works and how it differs from the base latent diffusion model, please refer to the official [launch announcement post](https://stability.ai/blog/stable-diffusion-announcement) and [this section of our own blog post](https://huggingface.co/blog/stable_diffusion#how-does-stable-diffusion-work).
+
+*Tips*:
+- To tweak your prompts on a specific result you liked, you can generate your own latents, as demonstrated in the following notebook: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/pcuenca/diffusers-examples/blob/main/notebooks/stable-diffusion-seeds.ipynb)
+
+*Overview*:
+
+| Pipeline | Tasks | Colab | Demo
+|---|---|:---:|:---:|
+| [pipeline_stable_diffusion.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py) | *Text-to-Image Generation* | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/stable_diffusion.ipynb) | [🤗 Stable Diffusion](https://huggingface.co/spaces/stabilityai/stable-diffusion)
+| [pipeline_stable_diffusion_img2img.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py) | *Image-to-Image Text-Guided Generation* | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/image_2_image_using_diffusers.ipynb) | [🤗 Diffuse the Rest](https://huggingface.co/spaces/huggingface/diffuse-the-rest)
+| [pipeline_stable_diffusion_inpaint.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py) | **Experimental** – *Text-Guided Image Inpainting* | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/in_painting_with_stable_diffusion_using_diffusers.ipynb) | Coming soon
+
+## StableDiffusionPipelineOutput
+[[autodoc]] pipelines.stable_diffusion.StableDiffusionPipelineOutput
+
+## StableDiffusionPipeline
+[[autodoc]] StableDiffusionPipeline
+	- __call__
+	- enable_attention_slicing
+	- disable_attention_slicing
+
+## StableDiffusionImg2ImgPipeline
+[[autodoc]] StableDiffusionImg2ImgPipeline
+	- __call__
+	- enable_attention_slicing
+	- disable_attention_slicing
+
+## StableDiffusionInpaintPipeline
+[[autodoc]] StableDiffusionInpaintPipeline
+	- __call__
+	- enable_attention_slicing
+	- disable_attention_slicing
--- a/docs/source/api/pipelines/stochastic_karras_ve.mdx
+++ b/docs/source/api/pipelines/stochastic_karras_ve.mdx
@@ -0,0 +1,23 @@
+# Stochastic Karras VE
+
+## Overview
+
+[Elucidating the Design Space of Diffusion-Based Generative Models](https://arxiv.org/abs/2206.00364) by Tero Karras, Miika Aittala, Timo Aila and Samuli Laine.
+
+The abstract of the paper is the following:
+
+We argue that the theory and practice of diffusion-based generative models are currently unnecessarily convoluted and seek to remedy the situation by presenting a design space that clearly separates the concrete design choices. This lets us identify several changes to both the sampling and training processes, as well as preconditioning of the score networks. Together, our improvements yield new state-of-the-art FID of 1.79 for CIFAR-10 in a class-conditional setting and 1.97 in an unconditional setting, with much faster sampling (35 network evaluations per image) than prior designs. To further demonstrate their modular nature, we show that our design changes dramatically improve both the efficiency and quality obtainable with pre-trained score networks from previous work, including improving the FID of an existing ImageNet-64 model from 2.07 to near-SOTA 1.55.
+
+This pipeline implements the stochastic sampling tailored to the Variance-Exploding (VE) models.
+
+
+## Available Pipelines:
+
+| Pipeline | Tasks | Colab
+|---|---|:---:|
+| [pipeline_stochastic_karras_ve.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stochastic_karras_ve/pipeline_stochastic_karras_ve.py) | *Unconditional Image Generation* | - |
+
+
+## KarrasVePipeline
+[[autodoc]] KarrasVePipeline
+    - __call__
--- a/docs/source/api/schedulers.mdx
+++ b/docs/source/api/schedulers.mdx
@@ -0,0 +1,108 @@
+<!--Copyright 2022 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Schedulers
+
+Diffusers contains multiple pre-built schedule functions for the diffusion process.
+
+## What is a scheduler?
+
+The schedule functions, denoted *Schedulers* in the library, take in the output of a trained model, a sample which the diffusion process is iterating on, and a timestep to return a denoised sample.
+
+- Schedulers define the methodology for iteratively adding noise to an image or for updating a sample based on model outputs.
+    - adding noise in different manners represents the algorithmic processes used to train a diffusion model by adding noise to images.
+    - for inference, the scheduler defines how to update a sample based on an output from a pretrained model.
+- Schedulers are often defined by a *noise schedule* and an *update rule* to solve the differential equation.
+
+### Discrete versus continuous schedulers
+
+All schedulers take in a timestep to predict the updated version of the sample being diffused.
+The timesteps dictate where in the diffusion process the step is, where data is generated by iterating forward in time and inference is executed by propagating backwards through timesteps.
+Different algorithms use timesteps that can be either discrete (accepting `int` inputs), such as the [`DDPMScheduler`] or [`PNDMScheduler`], or continuous (accepting `float` inputs), such as the score-based schedulers [`ScoreSdeVeScheduler`] or [`ScoreSdeVpScheduler`].
+
+## Designing Re-usable schedulers
+
+The core design principle behind the schedule functions is to be model, system, and framework independent.
+This allows for rapid experimentation and cleaner abstractions in the code, where the model prediction is separated from the sample update.
+To this end, the design of schedulers is such that:
+
+- Schedulers can be used interchangeably between diffusion models in inference to find the preferred trade-off between speed and generation quality.
+- Schedulers are currently by default in PyTorch, but are designed to be framework independent (partial Jax support currently exists).
+
+
+## API
+
+The core API for any new scheduler must follow a limited structure.
+- Schedulers should provide one or more `def step(...)` functions that should be called to update the generated sample iteratively.
+- Schedulers should provide a `set_timesteps(...)` method that configures the parameters of a schedule function for a specific inference task.
+- Schedulers should be framework-specific.
+
+The base class [`SchedulerMixin`] implements low level utilities used by multiple schedulers.
+
+### SchedulerMixin
+[[autodoc]] SchedulerMixin
+
+### SchedulerOutput
+The class [`SchedulerOutput`] contains the outputs from any scheduler's `step(...)` call.
+
+[[autodoc]] schedulers.scheduling_utils.SchedulerOutput
+
+### Implemented Schedulers
+
+#### Denoising diffusion implicit models (DDIM)
+
+Original paper can be found [here](https://arxiv.org/abs/2010.02502).
+
+[[autodoc]] DDIMScheduler
+
+#### Denoising diffusion probabilistic models (DDPM)
+
+Original paper can be found [here](https://arxiv.org/abs/2006.11239).
+
+[[autodoc]] DDPMScheduler
+
+#### Variance exploding, stochastic sampling from Karras et al.
+
+Original paper can be found [here](https://arxiv.org/abs/2206.00364).
+
+[[autodoc]] KarrasVeScheduler
+
+#### Linear multistep scheduler for discrete beta schedules
+
+Original implementation can be found [here](https://github.com/crowsonkb/k-diffusion/blob/481677d114f6ea445aa009cf5bd7a9cdee909e47/k_diffusion/sampling.py#L181).
+
+
+[[autodoc]] LMSDiscreteScheduler
+
+#### Pseudo numerical methods for diffusion models (PNDM)
+
+Original implementation can be found [here](https://github.com/luping-liu/PNDM).
+
+[[autodoc]] PNDMScheduler
+
+#### Variance exploding stochastic differential equation (SDE) scheduler
+
+Original paper can be found [here](https://arxiv.org/abs/2011.13456).
+
+[[autodoc]] ScoreSdeVeScheduler
+
+#### Variance preserving stochastic differential equation (SDE) scheduler
+
+Original paper can be found [here](https://arxiv.org/abs/2011.13456).
+
+<Tip warning={true}>
+
+Score SDE-VP is under construction.
+
+</Tip>
+
+[[autodoc]] schedulers.scheduling_sde_vp.ScoreSdeVpScheduler
--- a/docs/source/conceptual/contribution.mdx
+++ b/docs/source/conceptual/contribution.mdx
@@ -0,0 +1,291 @@
+<!--Copyright 2022 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# How to contribute to Diffusers 🧨
+
+We ❤️ contributions from the open-source community! Everyone is welcome, and all types of participation –not just code– are valued and appreciated. Answering questions, helping others, reaching out and improving the documentation are all immensely valuable to the community, so don't be afraid and get involved if you're up for it!
+
+It also helps us if you spread the word: reference the library from blog posts
+on the awesome projects it made possible, shout out on Twitter every time it has
+helped you, or simply star the repo to say "thank you".
+
+We encourage everyone to start by saying 👋 in our public Discord channel. We discuss the hottest trends about diffusion models, ask questions, show-off personal projects, help each other with contributions, or just hang out ☕. <a href="https://discord.gg/G7tWnz98XR"><img alt="Join us on Discord" src="https://img.shields.io/discord/823813159592001537?color=5865F2&logo=discord&logoColor=white"></a>
+
+Whichever way you choose to contribute, we strive to be part of an open, welcoming and kind community. Please, read our [code of conduct](https://github.com/huggingface/diffusers/blob/main/CODE_OF_CONDUCT.md) and be mindful to respect it during your interactions.
+
+
+## Overview
+
+You can contribute in so many ways! Just to name a few:
+
+* Fixing outstanding issues with the existing code.
+* Implementing [new diffusion pipelines](https://github.com/huggingface/diffusers/tree/main/src/diffusers/pipelines#contribution), [new schedulers](https://github.com/huggingface/diffusers/tree/main/src/diffusers/schedulers) or [new models](https://github.com/huggingface/diffusers/tree/main/src/diffusers/models).
+* [Contributing to the examples](https://github.com/huggingface/diffusers/tree/main/examples).
+* [Contributing to the documentation](https://github.com/huggingface/diffusers/tree/main/docs/source).
+* Submitting issues related to bugs or desired new features.
+
+*All are equally valuable to the community.*
+
+### Browse GitHub issues for suggestions
+
+If you need inspiration, you can look out for [issues](https://github.com/huggingface/diffusers/issues) you'd like to tackle to contribute to the library. There are a few filters that can be helpful:
+
+- See [Good first issues](https://github.com/huggingface/diffusers/issues?q=is%3Aopen+is%3Aissue+label%3A%22good+first+issue%22) for general opportunities to contribute and to get started with the codebase.
+- See [New pipeline/model](https://github.com/huggingface/diffusers/issues?q=is%3Aopen+is%3Aissue+label%3A%22New+pipeline%2Fmodel%22) to contribute exciting new diffusion models or diffusion pipelines.
+- See [New scheduler](https://github.com/huggingface/diffusers/issues?q=is%3Aopen+is%3Aissue+label%3A%22New+scheduler%22) to work on new samplers and schedulers.
+
+
+## Submitting a new issue or feature request
+
+Do your best to follow these guidelines when submitting an issue or a feature
+request. It will make it easier for us to come back to you quickly and with good
+feedback.
+
+### Did you find a bug?
+
+The 🧨 Diffusers library is robust and reliable thanks to the users who notify us of
+the problems they encounter. So thank you for reporting an issue.
+
+First, we would really appreciate it if you could **make sure the bug was not
+already reported** (use the search bar on GitHub under Issues).
+
+### Do you want to implement a new diffusion pipeline / diffusion model?
+
+Awesome! Please provide the following information:
+
+* Short description of the diffusion pipeline and link to the paper;
+* Link to the implementation if it is open-source;
+* Link to the model weights if they are available.
+
+If you are willing to contribute the model yourself, let us know so we can best
+guide you.
+
+### Do you want a new feature (that is not a model)?
+
+A world-class feature request addresses the following points:
+
+1. Motivation first:
+  * Is it related to a problem/frustration with the library? If so, please explain
+    why. Providing a code snippet that demonstrates the problem is best.
+  * Is it related to something you would need for a project? We'd love to hear
+    about it!
+  * Is it something you worked on and think could benefit the community?
+    Awesome! Tell us what problem it solved for you.
+2. Write a *full paragraph* describing the feature;
+3. Provide a **code snippet** that demonstrates its future use;
+4. In case this is related to a paper, please attach a link;
+5. Attach any additional information (drawings, screenshots, etc.) you think may help.
+
+If your issue is well written we're already 80% of the way there by the time you
+post it.
+
+## Start contributing! (Pull Requests)
+
+Before writing code, we strongly advise you to search through the existing PRs or
+issues to make sure that nobody is already working on the same thing. If you are
+unsure, it is always a good idea to open an issue to get some feedback.
+
+You will need basic `git` proficiency to be able to contribute to
+🧨 Diffusers. `git` is not the easiest tool to use but it has the greatest
+manual. Type `git --help` in a shell and enjoy. If you prefer books, [Pro
+Git](https://git-scm.com/book/en/v2) is a very good reference.
+
+Follow these steps to start contributing ([supported Python versions](https://github.com/huggingface/diffusers/blob/main/setup.py#L212)):
+
+1. Fork the [repository](https://github.com/huggingface/diffusers) by
+   clicking on the 'Fork' button on the repository's page. This creates a copy of the code
+   under your GitHub user account.
+
+2. Clone your fork to your local disk, and add the base repository as a remote:
+
+   ```bash
+   $ git clone git@github.com:<your Github handle>/diffusers.git
+   $ cd diffusers
+   $ git remote add upstream https://github.com/huggingface/diffusers.git
+   ```
+
+3. Create a new branch to hold your development changes:
+
+   ```bash
+   $ git checkout -b a-descriptive-name-for-my-changes
+   ```
+
+   **Do not** work on the `main` branch.
+
+4. Set up a development environment by running the following command in a virtual environment:
+
+   ```bash
+   $ pip install -e ".[dev]"
+   ```
+
+   (If Diffusers was already installed in the virtual environment, remove
+   it with `pip uninstall diffusers` before reinstalling it in editable
+   mode with the `-e` flag.)
+
+   To run the full test suite, you might need the additional dependencies `transformers` and `datasets`, each of which requires a separate source
+   install:
+
+   ```bash
+   $ git clone https://github.com/huggingface/transformers
+   $ cd transformers
+   $ pip install -e .
+   ```
+
+   ```bash
+   $ git clone https://github.com/huggingface/datasets
+   $ cd datasets
+   $ pip install -e .
+   ```
+
+   If you have already cloned that repo, you might need to `git pull` to get the most recent changes in the `datasets`
+   library.
+
+5. Develop the features on your branch.
+
+   As you work on the features, you should make sure that the test suite
+   passes. You should run the tests impacted by your changes like this:
+
+   ```bash
+   $ pytest tests/<TEST_TO_RUN>.py
+   ```
+
+   You can also run the full suite with the following command, but it takes
+   a beefy machine to produce a result in a decent amount of time now that
+   Diffusers has grown a lot. Here is the command for it:
+
+   ```bash
+   $ make test
+   ```
+
+   For more information about tests, check out the
+   [dedicated documentation](https://huggingface.co/docs/diffusers/testing).
+
+   🧨 Diffusers relies on `black` and `isort` to format its source code
+   consistently. After you make changes, apply automatic style corrections and code verifications
+   that can't be automated in one go with:
+
+   ```bash
+   $ make style
+   ```
+
+   🧨 Diffusers also uses `flake8` and a few custom scripts to check for coding mistakes. Quality
+   control runs in CI, however you can also run the same checks with:
+
+   ```bash
+   $ make quality
+   ```
+
+   Once you're happy with your changes, add changed files using `git add` and
+   make a commit with `git commit` to record your changes locally:
+
+   ```bash
+   $ git add modified_file.py
+   $ git commit
+   ```
+
+   It is a good idea to sync your copy of the code with the original
+   repository regularly. This way you can quickly account for changes:
+
+   ```bash
+   $ git fetch upstream
+   $ git rebase upstream/main
+   ```
+
+   Push the changes to your account using:
+
+   ```bash
+   $ git push -u origin a-descriptive-name-for-my-changes
+   ```
+
+6. Once you are satisfied (**and the checklist below is happy too**), go to the
+   webpage of your fork on GitHub. Click on 'Pull request' to send your changes
+   to the project maintainers for review.
+
+7. It's ok if maintainers ask you for changes. It happens to core contributors
+   too! So everyone can see the changes in the Pull request, work in your local
+   branch and push the changes to your fork. They will automatically appear in
+   the pull request.
+
+
+### Checklist
+
+1. The title of your pull request should be a summary of its contribution;
+2. If your pull request addresses an issue, please mention the issue number in
+   the pull request description to make sure they are linked (and people
+   consulting the issue know you are working on it);
+3. To indicate a work in progress please prefix the title with `[WIP]`. These
+   are useful to avoid duplicated work, and to differentiate it from PRs ready
+   to be merged;
+4. Make sure existing tests pass;
+5. Add high-coverage tests. No quality testing = no merge.
+   - If you are adding new `@slow` tests, make sure they pass using
+     `RUN_SLOW=1 python -m pytest tests/test_my_new_model.py`.
+   - If you are adding a new tokenizer, write tests, and make sure
+     `RUN_SLOW=1 python -m pytest tests/test_tokenization_{your_model_name}.py` passes.
+   CircleCI does not run the slow tests, but GitHub actions does every night!
+6. All public methods must have informative docstrings that work nicely with sphinx. See [`pipeline_latent_diffusion.py`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py) for an example.
+7. Due to the rapidly growing repository, it is important to make sure that no files that would significantly weigh down the repository are added. This includes images, videos and other non-text files. We prefer to leverage a hf.co hosted `dataset`, like
+   the ones hosted on [`hf-internal-testing`](https://huggingface.co/hf-internal-testing) or [huggingface/documentation-images](https://huggingface.co/datasets/huggingface/documentation-images), in which to place these files and reference them by URL.
+   If an external contribution, feel free to add the images to your PR and ask a Hugging Face member to migrate your images
+   to this dataset.
+
+### Tests
+
+An extensive test suite is included to test the library behavior and several examples. Library tests can be found in
+the [tests folder](https://github.com/huggingface/diffusers/tree/main/tests).
+
+We like `pytest` and `pytest-xdist` because it's faster. From the root of the
+repository, here's how to run tests with `pytest` for the library:
+
+```bash
+$ python -m pytest -n auto --dist=loadfile -s -v ./tests/
+```
+
+In fact, that's how `make test` is implemented!
+
+You can specify a smaller set of tests in order to test only the feature
+you're working on.
+
+By default, slow tests are skipped. Set the `RUN_SLOW` environment variable to
+`yes` to run them. This will download many gigabytes of models — make sure you
+have enough disk space and a good Internet connection, or a lot of patience!
+
+```bash
+$ RUN_SLOW=yes python -m pytest -n auto --dist=loadfile -s -v ./tests/
+```
+
+`unittest` is fully supported, here's how to run tests with it:
+
+```bash
+$ python -m unittest discover -s tests -t . -v
+$ python -m unittest discover -s examples -t examples -v
+```
+
+### Syncing forked main with upstream (HuggingFace) main
+
+To avoid pinging the upstream repository which adds reference notes to each upstream PR and sends unnecessary notifications to the developers involved in these PRs,
+when syncing the main branch of a forked repository, please, follow these steps:
+1. When possible, avoid syncing with the upstream using a branch and PR on the forked repository. Instead, merge directly into the forked main.
+2. If a PR is absolutely necessary, use the following steps after checking out your branch:
+```
+$ git checkout -b your-branch-for-syncing
+$ git pull --squash --no-commit upstream main
+$ git commit -m '<your message without GitHub references>'
+$ git push --set-upstream origin your-branch-for-syncing
+```
+
+### Style guide
+
+For documentation strings, 🧨 Diffusers follows the [google style](https://google.github.io/styleguide/pyguide.html).
+
+
+**This guide was heavily inspired by the awesome [scikit-learn guide to contributing](https://github.com/scikit-learn/scikit-learn/blob/main/CONTRIBUTING.md).**
--- a/docs/source/conceptual/philosophy.mdx
+++ b/docs/source/conceptual/philosophy.mdx
@@ -0,0 +1,17 @@
+<!--Copyright 2022 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Philosophy
+
+- Readability and clarity are preferred over highly optimized code. A strong importance is put on providing readable, intuitive and elementary code design. *E.g.*, the provided [schedulers](https://github.com/huggingface/diffusers/tree/main/src/diffusers/schedulers) are separated from the provided [models](https://github.com/huggingface/diffusers/tree/main/src/diffusers/models) and use well-commented code that can be read alongside the original paper.
+- Diffusers is **modality independent** and focuses on providing pretrained models and tools to build systems that generate **continuous outputs**, *e.g.* vision and audio. This is one of the guiding goals even if the initial pipelines are devoted to vision tasks.
+- Diffusion models and schedulers are provided as concise, elementary building blocks. In contrast, diffusion pipelines are a collection of end-to-end diffusion systems that can be used out-of-the-box, should stay as close as possible to their original implementations and can include components of other libraries, such as text encoders. Examples of diffusion pipelines are [Glide](https://github.com/openai/glide-text2im), [Latent Diffusion](https://github.com/CompVis/latent-diffusion) and [Stable Diffusion](https://github.com/compvis/stable-diffusion).
--- a/docs/source/conceptual/stable_diffusion.mdx
+++ b/docs/source/conceptual/stable_diffusion.mdx
@@ -0,0 +1,17 @@
+<!--Copyright 2022 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Stable Diffusion
+
+Under construction 🚧
+
+For now, please visit this [very detailed blog post](https://huggingface.co/blog/stable_diffusion).
--- a/docs/source/examples/diffusers_for_other.mdx
+++ b/docs/source/examples/diffusers_for_other.mdx
@@ -1,20 +0,0 @@
-<!--Copyright 2022 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-->
-
-# Diffusers for other modalities
-
-Diffusers offers support to other modalities than vision and audio.
-Currently, some examples include:
- [Diffuser](https://diffusion-planning.github.io/) for planning in reinforcement learning (currenlty only inference): [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1TmBmlYeKUZSkUZoJqfBmaicVTKx6nN1R?usp=sharing)
-
-If you are interested in contributing to under-construction examples, you can explore:
- [GeoDiff](https://github.com/MinkaiXu/GeoDiff) for generating 3D configurations of molecule diagrams [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1pLYYWQhdLuv1q-JtEHGZybxp2RBF8gPs?usp=sharing).
--- a/docs/source/examples/diffusers_for_vision.mdx
+++ b/docs/source/examples/diffusers_for_vision.mdx
@@ -1,150 +0,0 @@
-<!--Copyright 2022 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-->
-
-# Diffusers for vision
-
-## Direct image generation
-
-#### **Example image generation with PNDM**
-
-```python
-from diffusers import PNDM, UNetModel, PNDMScheduler
-import PIL.Image
-import numpy as np
-import torch
-
-model_id = "fusing/ddim-celeba-hq"
-
-model = UNetModel.from_pretrained(model_id)
-scheduler = PNDMScheduler()
-
-# load model and scheduler
-pndm = PNDM(unet=model, noise_scheduler=scheduler)
-
-# run pipeline in inference (sample random noise and denoise)
-with torch.no_grad():
-    image = pndm()
-
-# process image to PIL
-image_processed = image.cpu().permute(0, 2, 3, 1)
-image_processed = (image_processed + 1.0) / 2
-image_processed = torch.clamp(image_processed, 0.0, 1.0)
-image_processed = image_processed * 255
-image_processed = image_processed.numpy().astype(np.uint8)
-image_pil = PIL.Image.fromarray(image_processed[0])
-
-# save image
-image_pil.save("test.png")
-```
-
-#### **Example 1024x1024 image generation with SDE VE**
-
-See [paper](https://arxiv.org/abs/2011.13456) for more information on SDE VE.
-
-```python
-from diffusers import DiffusionPipeline
-import torch
-import PIL.Image
-import numpy as np
-
-torch.manual_seed(32)
-
-score_sde_sv = DiffusionPipeline.from_pretrained("fusing/ffhq_ncsnpp")
-
-# Note this might take up to 3 minutes on a GPU
-image = score_sde_sv(num_inference_steps=2000)
-
-image = image.permute(0, 2, 3, 1).cpu().numpy()
-image = np.clip(image * 255, 0, 255).astype(np.uint8)
-image_pil = PIL.Image.fromarray(image[0])
-
-# save image
-image_pil.save("test.png")
-```
-#### **Example 32x32 image generation with SDE VP**
-
-See [paper](https://arxiv.org/abs/2011.13456) for more information on SDE VE.
-
-```python
-from diffusers import DiffusionPipeline
-import torch
-import PIL.Image
-import numpy as np
-
-torch.manual_seed(32)
-
-score_sde_sv = DiffusionPipeline.from_pretrained("fusing/cifar10-ddpmpp-deep-vp")
-
-# Note this might take up to 3 minutes on a GPU
-image = score_sde_sv(num_inference_steps=1000)
-
-image = image.permute(0, 2, 3, 1).cpu().numpy()
-image = np.clip(image * 255, 0, 255).astype(np.uint8)
-image_pil = PIL.Image.fromarray(image[0])
-
-# save image
-image_pil.save("test.png")
-```
-
-
-#### **Text to Image generation with Latent Diffusion**
-
-_Note: To use latent diffusion install transformers from [this branch](https://github.com/patil-suraj/transformers/tree/ldm-bert)._
-
-```python
-from diffusers import DiffusionPipeline
-
-ldm = DiffusionPipeline.from_pretrained("fusing/latent-diffusion-text2im-large")
-
-generator = torch.manual_seed(42)
-
-prompt = "A painting of a squirrel eating a burger"
-image = ldm([prompt], generator=generator, eta=0.3, guidance_scale=6.0, num_inference_steps=50)
-
-image_processed = image.cpu().permute(0, 2, 3, 1)
-image_processed = image_processed * 255.0
-image_processed = image_processed.numpy().astype(np.uint8)
-image_pil = PIL.Image.fromarray(image_processed[0])
-
-# save image
-image_pil.save("test.png")
-```
-
-
-## Text to image generation
-
-```python
-import torch
-from diffusers import BDDMPipeline, GradTTSPipeline
-
-torch_device = "cuda"
-
-# load grad tts and bddm pipelines
-grad_tts = GradTTSPipeline.from_pretrained("fusing/grad-tts-libri-tts")
-bddm = BDDMPipeline.from_pretrained("fusing/diffwave-vocoder-ljspeech")
-
-text = "Hello world, I missed you so much."
-
-# generate mel spectograms using text
-mel_spec = grad_tts(text, torch_device=torch_device)
-
-#  generate the speech by passing mel spectograms to BDDMPipeline pipeline
-generator = torch.manual_seed(42)
-audio = bddm(mel_spec, generator, torch_device=torch_device)
-
-# save generated audio
-from scipy.io.wavfile import write as wavwrite
-
-sampling_rate = 22050
-wavwrite("generated_audio.wav", sampling_rate, audio.squeeze().cpu().numpy())
-```
-
--- a/docs/source/index.mdx
+++ b/docs/source/index.mdx
@@ -18,93 +18,32 @@ specific language governing permissions and limitations under the License.

 # 🧨 Diffusers

-
-🤗 Diffusers provides pretrained diffusion models across multiple modalities, such as vision and audio, and serves
-as a modular toolbox for inference and training of diffusion models.
+🤗 Diffusers provides pretrained vision diffusion models, and serves as a modular toolbox for inference and training.

 More precisely, 🤗 Diffusers offers:

- State-of-the-art diffusion pipelines that can be run in inference with just a couple of lines of code (see [src/diffusers/pipelines](https://github.com/huggingface/diffusers/tree/main/src/diffusers/pipelines)).
- Various noise schedulers that can be used interchangeably for the prefered speed vs. quality trade-off in inference (see [src/diffusers/schedulers](https://github.com/huggingface/diffusers/tree/main/src/diffusers/schedulers)).
- Multiple types of models, such as UNet, that can be used as building blocks in an end-to-end diffusion system (see [src/diffusers/models](https://github.com/huggingface/diffusers/tree/main/src/diffusers/models)).
- Training examples to show how to train the most popular diffusion models (see [examples](https://github.com/huggingface/diffusers/tree/main/examples)).
+- State-of-the-art diffusion pipelines that can be run in inference with just a couple of lines of code (see [**Using Diffusers**](./using-diffusers/conditional_image_generation)) or have a look at [**Pipelines**](#pipelines) to get an overview of all supported pipelines and their corresponding papers.
+- Various noise schedulers that can be used interchangeably for the preferred speed vs. quality trade-off in inference. For more information see [**Schedulers**](./api/schedulers).
+- Multiple types of models, such as UNet, that can be used as building blocks in an end-to-end diffusion system. See [**Models**](./api/models) for more details.
+- Training examples to show how to train the most popular diffusion model tasks. For more information see [**Training**](./training/overview).

-# Installation
+## 🧨 Diffusers Pipelines

-Install Diffusers for with PyTorch. Support for other libraries will come in the future
+The following table summarizes all officially supported pipelines, their corresponding paper, and, if
+available, a Colab notebook to directly try them out.

-🤗 Diffusers is tested on Python 3.6+, and PyTorch 1.4.0+.
-
-## Install with pip
-
-You should install 🤗 Diffusers in a [virtual environment](https://docs.python.org/3/library/venv.html).
-If you're unfamiliar with Python virtual environments, take a look at this [guide](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/).
-A virtual environment makes it easier to manage different projects, and avoid compatibility issues between dependencies.
-
-Start by creating a virtual environment in your project directory:
-
-```bash
-python -m venv .env
-```
-
-Activate the virtual environment:
-
-```bash
-source .env/bin/activate
-```
-
-Now you're ready to install 🤗 Diffusers with the following command:
-
-```bash
-pip install diffusers
-```
-
-## Install from source
-
-Install 🤗 Diffusers from source with the following command:
-
-```bash
-pip install git+https://github.com/huggingface/diffusers
-```
-
-This command installs the bleeding edge `main` version rather than the latest `stable` version.
-The `main` version is useful for staying up-to-date with the latest developments.
-For instance, if a bug has been fixed since the last official release but a new release hasn't been rolled out yet.
-However, this means the `main` version may not always be stable.
-We strive to keep the `main` version operational, and most issues are usually resolved within a few hours or a day.
-If you run into a problem, please open an [Issue](https://github.com/huggingface/transformers/issues) so we can fix it even sooner!
-
-## Editable install
-
-You will need an editable install if you'd like to:
-
-* Use the `main` version of the source code.
-* Contribute to 🤗 Diffusers and need to test changes in the code.
-
-Clone the repository and install 🤗 Diffusers with the following commands:
-
-```bash
-git clone https://github.com/huggingface/diffusers.git
-cd transformers
-pip install -e .
-```
-
-These commands will link the folder you cloned the repository to and your Python library paths.
-Python will now look inside the folder you cloned to in addition to the normal library paths.
-For example, if your Python packages are typically installed in `~/anaconda3/envs/main/lib/python3.7/site-packages/`, Python will also search the folder you cloned to: `~/diffusers/`.
-
-<Tip warning={true}>
-
-You must keep the `diffusers` folder if you want to keep using the library.
-
-</Tip>
-
-Now you can easily update your clone to the latest version of 🤗 Diffusers with the following command:
-
-```bash
-cd ~/diffusers/
-git pull
-```
-
-Your Python environment will find the `main` version of 🤗 Diffuers on the next run.
+| Pipeline | Paper | Tasks | Colab |
+|---|---|:---:|:---:|
+| [ddpm](./api/pipelines/ddpm) | [**Denoising Diffusion Probabilistic Models**](https://arxiv.org/abs/2006.11239) | Unconditional Image Generation |
+| [ddim](./api/pipelines/ddim) | [**Denoising Diffusion Implicit Models**](https://arxiv.org/abs/2010.02502) | Unconditional Image Generation |
+| [latent_diffusion](./api/pipelines/latent_diffusion) | [**High-Resolution Image Synthesis with Latent Diffusion Models**](https://arxiv.org/abs/2112.10752)| Text-to-Image Generation | 
+| [latent_diffusion_uncond](./api/pipelines/latent_diffusion_uncond) | [**High-Resolution Image Synthesis with Latent Diffusion Models**](https://arxiv.org/abs/2112.10752) | Unconditional Image Generation | 
+| [pndm](./api/pipelines/pndm) | [**Pseudo Numerical Methods for Diffusion Models on Manifolds**](https://arxiv.org/abs/2202.09778) | Unconditional Image Generation | 
+| [score_sde_ve](./api/pipelines/score_sde_ve) | [**Score-Based Generative Modeling through Stochastic Differential Equations**](https://openreview.net/forum?id=PxTIG12RRHS) | Unconditional Image Generation | 
+| [score_sde_vp](./api/pipelines/score_sde_vp) | [**Score-Based Generative Modeling through Stochastic Differential Equations**](https://openreview.net/forum?id=PxTIG12RRHS) | Unconditional Image Generation | 
+| [stable_diffusion](./api/pipelines/stable_diffusion) | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release) | Text-to-Image Generation | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/stable_diffusion.ipynb)
+| [stable_diffusion](./api/pipelines/stable_diffusion) | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release) | Image-to-Image Text-Guided Generation | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/image_2_image_using_diffusers.ipynb)
+| [stable_diffusion](./api/pipelines/stable_diffusion) | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release) | Text-Guided Image Inpainting | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/in_painting_with_stable_diffusion_using_diffusers.ipynb)
+| [stochastic_karras_ve](./api/pipelines/stochastic_karras_ve) | [**Elucidating the Design Space of Diffusion-Based Generative Models**](https://arxiv.org/abs/2206.00364) | Unconditional Image Generation | 

+**Note**: Pipelines are simple examples of how to play around with the diffusion systems as described in the corresponding papers. 
--- a/docs/source/installation.mdx
+++ b/docs/source/installation.mdx
@@ -0,0 +1,90 @@
+<!--Copyright 2022 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Installation
+
+Install 🤗 Diffusers with PyTorch. Support for other libraries will come in the future.
+
+🤗 Diffusers is tested on Python 3.7+, and PyTorch 1.7.0+.
+
+## Install with pip
+
+You should install 🤗 Diffusers in a [virtual environment](https://docs.python.org/3/library/venv.html).
+If you're unfamiliar with Python virtual environments, take a look at this [guide](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/).
+A virtual environment makes it easier to manage different projects, and avoid compatibility issues between dependencies.
+
+Start by creating a virtual environment in your project directory:
+
+```bash
+python -m venv .env
+```
+
+Activate the virtual environment:
+
+```bash
+source .env/bin/activate
+```
+
+Now you're ready to install 🤗 Diffusers with the following command:
+
+```bash
+pip install diffusers
+```
+
+## Install from source
+
+Install 🤗 Diffusers from source with the following command:
+
+```bash
+pip install git+https://github.com/huggingface/diffusers
+```
+
+This command installs the bleeding edge `main` version rather than the latest `stable` version.
+The `main` version is useful for staying up-to-date with the latest developments.
+For instance, if a bug has been fixed since the last official release but a new release hasn't been rolled out yet.
+However, this means the `main` version may not always be stable.
+We strive to keep the `main` version operational, and most issues are usually resolved within a few hours or a day.
+If you run into a problem, please open an [Issue](https://github.com/huggingface/diffusers/issues) so we can fix it even sooner!
+
+## Editable install
+
+You will need an editable install if you'd like to:
+
+* Use the `main` version of the source code.
+* Contribute to 🤗 Diffusers and need to test changes in the code.
+
+Clone the repository and install 🤗 Diffusers with the following commands:
+
+```bash
+git clone https://github.com/huggingface/diffusers.git
+cd diffusers
+pip install -e .
+```
+
+These commands will link the folder you cloned the repository to and your Python library paths.
+Python will now look inside the folder you cloned to in addition to the normal library paths.
+For example, if your Python packages are typically installed in `~/anaconda3/envs/main/lib/python3.7/site-packages/`, Python will also search the folder you cloned to: `~/diffusers/`.
+
+<Tip warning={true}>
+
+You must keep the `diffusers` folder if you want to keep using the library.
+
+</Tip>
+
+Now you can easily update your clone to the latest version of 🤗 Diffusers with the following command:
+
+```bash
+cd ~/diffusers/
+git pull
+```
+
+Your Python environment will find the `main` version of 🤗 Diffusers on the next run.
--- a/docs/source/models/unet.mdx
+++ b/docs/source/models/unet.mdx
@@ -1,4 +0,0 @@
-# UNet
-
-The UNet is an example often used in diffusion models.
-It was originally published [here](https://www.google.com).
--- a/docs/source/optimization/fp16.mdx
+++ b/docs/source/optimization/fp16.mdx
@@ -0,0 +1,242 @@
+<!--Copyright 2022 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Memory and speed
+
+We present some techniques and ideas to optimize 🤗 Diffusers _inference_ for memory or speed.
+
+
+|                  | Latency | Speedup |
+|------------------|---------|---------|
+| original         | 9.50s   | x1      |
+| cuDNN auto-tuner | 9.37s   | x1.01   |
+| autocast (fp16)  | 5.47s   | x1.74   |
+| fp16             | 3.61s   | x2.63   |
+| channels last    | 3.30s   | x2.88   |
+| traced UNet      | 3.21s   | x2.96   |
+
+<em>obtained on NVIDIA TITAN RTX by generating a single image of size 512x512 from the prompt "a photo of an astronaut riding a horse on mars" with 50 DDIM steps.</em>
+
+## Enable cuDNN auto-tuner
+
+[NVIDIA cuDNN](https://developer.nvidia.com/cudnn) supports many algorithms to compute a convolution. Autotuner runs a short benchmark and selects the kernel with the best performance on a given hardware for a given input size.
+
+Since we’re using **convolutional networks** (other types currently not supported), we can enable cuDNN autotuner before launching the inference by setting:
+
+```python
+import torch
+
+torch.backends.cudnn.benchmark = True
+```
+
+### Use tf32 instead of fp32 (on Ampere and later CUDA devices)
+
+On Ampere and later CUDA devices matrix multiplications and convolutions can use the TensorFloat32 (TF32) mode for faster but slightly less accurate computations. By default PyTorch enables TF32 mode for convolutions but not matrix multiplications, and unless a network requires full float32 precision we recommend enabling this setting for matrix multiplications, too. It can significantly speed up computations with typically negligible loss of numerical accuracy. You can read more about it [here](https://huggingface.co/docs/transformers/v4.18.0/en/performance#tf32). All you need to do is to add this before your inference:
+
+```python
+import torch
+
+torch.backends.cuda.matmul.allow_tf32 = True
+```
+
+## Automatic mixed precision (AMP)
+
+If you use a CUDA GPU, you can take advantage of `torch.autocast` to perform inference roughly twice as fast at the cost of slightly lower precision. All you need to do is put your inference call inside an `autocast` context manager. The following example shows how to do it using Stable Diffusion text-to-image generation as an example:
+
+```Python
+from torch import autocast
+from diffusers import StableDiffusionPipeline
+
+pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4")
+pipe = pipe.to("cuda")
+
+prompt = "a photo of an astronaut riding a horse on mars"
+with autocast("cuda"):
+    image = pipe(prompt).images[0]  
+```
+
+Despite the precision loss, in our experience the final image results look the same as the `float32` versions. Feel free to experiment and report back!
+
+## Half precision weights
+
+To save more GPU memory and get even more speed, you can load and run the model weights directly in half precision. This involves loading the float16 version of the weights, which was saved to a branch named `fp16`, and telling PyTorch to use the `float16` type when loading them:
+
+```Python
+pipe = StableDiffusionPipeline.from_pretrained(
+    "CompVis/stable-diffusion-v1-4",
+    revision="fp16",
+    torch_dtype=torch.float16,
+)
+pipe = pipe.to("cuda")
+
+prompt = "a photo of an astronaut riding a horse on mars"
+image = pipe(prompt).images[0]  
+```
+
+## Sliced attention for additional memory savings
+
+For additional memory savings, you can use a sliced version of attention that performs the computation in steps instead of all at once.
+
+<Tip>
+Attention slicing is useful even if a batch size of just 1 is used - as long as the model uses more than one attention head. If there is more than one attention head the *QK^T* attention matrix can be computed sequentially for each head which can save a significant amount of memory.
+</Tip>
+
+To perform the attention computation sequentially over each head, you only need to invoke [`~StableDiffusionPipeline.enable_attention_slicing`] in your pipeline before inference, like here:
+
+```Python
+import torch
+from diffusers import StableDiffusionPipeline
+
+pipe = StableDiffusionPipeline.from_pretrained(
+    "CompVis/stable-diffusion-v1-4",
+    revision="fp16",
+    torch_dtype=torch.float16,
+)
+pipe = pipe.to("cuda")
+
+prompt = "a photo of an astronaut riding a horse on mars"
+pipe.enable_attention_slicing()
+image = pipe(prompt).images[0]  
+```
+
+There's a small performance penalty of about 10% slower inference times, but this method allows you to use Stable Diffusion in as little as 3.2 GB of VRAM!
+
+## Using Channels Last memory format
+
+Channels last memory format is an alternative way of ordering NCHW tensors in memory while preserving the dimension ordering. Channels last tensors are ordered in such a way that channels become the densest dimension (aka storing images pixel-per-pixel). Since not all operators currently support the channels last format, it may result in worse performance, so it's better to try it and see if it works for your model.
+
+For example, in order to set the UNet model in our pipeline to use channels last format, we can use the following:
+
+```python
+print(pipe.unet.conv_out.state_dict()["weight"].stride())  # (2880, 9, 3, 1)
+pipe.unet.to(memory_format=torch.channels_last)  # in-place operation
+print(
+    pipe.unet.conv_out.state_dict()["weight"].stride()
+)  # (2880, 1, 960, 320) having a stride of 1 for the 2nd dimension proves that it works
+```
+
+## Tracing
+
+Tracing runs an example input tensor through your model, and captures the operations that are invoked as that input makes its way through the model's layers so that an executable or `ScriptFunction` is returned that will be optimized using just-in-time compilation.
+
+To trace our UNet model, we can use the following:
+
+```python
+import time
+import torch
+from diffusers import StableDiffusionPipeline
+import functools
+
+# torch disable grad
+torch.set_grad_enabled(False)
+
+# set variables
+n_experiments = 2
+unet_runs_per_experiment = 50
+
+# load inputs
+def generate_inputs():
+    sample = torch.randn(2, 4, 64, 64).half().cuda()
+    timestep = torch.rand(1).half().cuda() * 999
+    encoder_hidden_states = torch.randn(2, 77, 768).half().cuda()
+    return sample, timestep, encoder_hidden_states
+
+
+pipe = StableDiffusionPipeline.from_pretrained(
+    "CompVis/stable-diffusion-v1-4",
+    revision="fp16",
+    torch_dtype=torch.float16,
+).to("cuda")
+unet = pipe.unet
+unet.eval()
+unet.to(memory_format=torch.channels_last)  # use channels_last memory format
+unet.forward = functools.partial(unet.forward, return_dict=False)  # set return_dict=False as default
+
+# warmup
+for _ in range(3):
+    with torch.inference_mode():
+        inputs = generate_inputs()
+        orig_output = unet(*inputs)
+
+# trace
+print("tracing..")
+unet_traced = torch.jit.trace(unet, inputs)
+unet_traced.eval()
+print("done tracing")
+
+
+# warmup and optimize graph
+for _ in range(5):
+    with torch.inference_mode():
+        inputs = generate_inputs()
+        orig_output = unet_traced(*inputs)
+
+
+# benchmarking
+with torch.inference_mode():
+    for _ in range(n_experiments):
+        torch.cuda.synchronize()
+        start_time = time.time()
+        for _ in range(unet_runs_per_experiment):
+            orig_output = unet_traced(*inputs)
+        torch.cuda.synchronize()
+        print(f"unet traced inference took {time.time() - start_time:.2f} seconds")
+    for _ in range(n_experiments):
+        torch.cuda.synchronize()
+        start_time = time.time()
+        for _ in range(unet_runs_per_experiment):
+            orig_output = unet(*inputs)
+        torch.cuda.synchronize()
+        print(f"unet inference took {time.time() - start_time:.2f} seconds")
+
+# save the model
+unet_traced.save("unet_traced.pt")
+```
+
+Then we can replace the `unet` attribute of the pipeline with the traced model, as follows:
+
+```python
+from diffusers import StableDiffusionPipeline
+import torch
+from dataclasses import dataclass
+
+
+@dataclass
+class UNet2DConditionOutput:
+    sample: torch.FloatTensor
+
+
+pipe = StableDiffusionPipeline.from_pretrained(
+    "CompVis/stable-diffusion-v1-4",
+    revision="fp16",
+    torch_dtype=torch.float16,
+).to("cuda")
+
+# use jitted unet
+unet_traced = torch.jit.load("unet_traced.pt")
+# del pipe.unet
+class TracedUNet(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.in_channels = pipe.unet.in_channels
+        self.device = pipe.unet.device
+
+    def forward(self, latent_model_input, t, encoder_hidden_states):
+        sample = unet_traced(latent_model_input, t, encoder_hidden_states)[0]
+        return UNet2DConditionOutput(sample=sample)
+
+
+pipe.unet = TracedUNet()
+
+with torch.inference_mode():
+    image = pipe([prompt] * 1, num_inference_steps=50).images[0]
+```
--- a/docs/source/optimization/mps.mdx
+++ b/docs/source/optimization/mps.mdx
@@ -0,0 +1,58 @@
+<!--Copyright 2022 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# How to use Stable Diffusion in Apple Silicon (M1/M2)
+
+🤗 Diffusers is compatible with Apple silicon for Stable Diffusion inference, using the PyTorch `mps` device. These are the steps you need to follow to use your M1 or M2 computer with Stable Diffusion.
+
+## Requirements
+
+- Mac computer with Apple silicon (M1/M2) hardware.
+- macOS 12.3 or later.
+- arm64 version of Python.
+- PyTorch [Preview (Nightly)](https://pytorch.org/get-started/locally/), version `1.14.0.dev20221007` or later.
+
+## Inference Pipeline
+
+The snippet below demonstrates how to use the `mps` backend using the familiar `to()` interface to move the Stable Diffusion pipeline to your M1 or M2 device.
+
+We recommend "priming" the pipeline with an additional one-time pass through it. This is a temporary workaround for a weird issue we have detected: the first inference pass produces slightly different results than subsequent ones. You only need to do this pass once, and it's ok to use just one inference step and discard the result.
+
+```python
+# make sure you're logged in with `huggingface-cli login`
+from diffusers import StableDiffusionPipeline
+
+pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4")
+pipe = pipe.to("mps")
+
+prompt = "a photo of an astronaut riding a horse on mars"
+
+# First-time "warmup" pass (see explanation above)
+_ = pipe(prompt, num_inference_steps=1)
+
+# Results match those from the CPU device after the warmup pass.
+image = pipe(prompt).images[0]
+```
+
+## Known Issues
+
+- As mentioned above, we are investigating a strange [first-time inference issue](https://github.com/huggingface/diffusers/issues/372).
+- Generating multiple prompts in a batch [crashes or doesn't work reliably](https://github.com/huggingface/diffusers/issues/363). We believe this might be related to the [`mps` backend in PyTorch](https://github.com/pytorch/pytorch/issues/84039#issuecomment-1237735249), but we need to investigate in more depth. For now, we recommend iterating instead of batching.
+
+## Performance
+
+These are the results we got on an M1 Max MacBook Pro with 64 GB of RAM, running macOS Ventura Version 13.0 Beta (22A5331f). We performed Stable Diffusion text-to-image generation of the same prompt for 50 inference steps, using a guidance scale of 7.5.
+
+| Device | Steps | Time    |
+|--------|-------|---------|
+| CPU    | 50    | 213.46s |
+| MPS    | 50    | 30.81s  |
--- a/docs/source/optimization/onnx.mdx
+++ b/docs/source/optimization/onnx.mdx
@@ -0,0 +1,42 @@
+<!--Copyright 2022 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+
+# How to use the ONNX Runtime for inference
+
+🤗 Diffusers provides a Stable Diffusion pipeline compatible with the ONNX Runtime. This allows you to run Stable Diffusion on any hardware that supports ONNX (including CPUs), and where an accelerated version of PyTorch is not available.
+
+## Installation
+
+- TODO
+
+## Stable Diffusion Inference
+
+The snippet below demonstrates how to use the ONNX runtime. You need to use `StableDiffusionOnnxPipeline` instead of `StableDiffusionPipeline`. You also need to download the weights from the `onnx` branch of the repository, and indicate the runtime provider you want to use.
+
+```python
+# make sure you're logged in with `huggingface-cli login`
+from diffusers import StableDiffusionOnnxPipeline
+
+pipe = StableDiffusionOnnxPipeline.from_pretrained(
+    "CompVis/stable-diffusion-v1-4",
+    revision="onnx",
+    provider="CUDAExecutionProvider",
+)
+
+prompt = "a photo of an astronaut riding a horse on mars"
+image = pipe(prompt).images[0]
+```
+
+## Known Issues
+
+- Generating multiple prompts in a batch seems to take too much memory. While we look into it, you may need to iterate instead of batching.
--- a/docs/source/examples/diffusers_for_audio.mdx
+++ b/docs/source/examples/diffusers_for_audio.mdx
@@ -10,4 +10,6 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
 specific language governing permissions and limitations under the License.
 -->

-# Diffusers for audio
+# OpenVINO
+
+Under construction 🚧
--- a/docs/source/philosophy.mdx
+++ b/docs/source/philosophy.mdx
@@ -1,17 +0,0 @@
-<!--Copyright 2022 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-->
-
-# Philosophy
-
- Readability and clarity is prefered over highly optimized code. A strong importance is put on providing readable, intuitive and elementary code design. *E.g.*, the provided [schedulers](https://github.com/huggingface/diffusers/tree/main/src/diffusers/schedulers) are separated from the provided [models](https://github.com/huggingface/diffusers/tree/main/src/diffusers/models) and provide well-commented code that can be read alongside the original paper.
- Diffusers is **modality independent** and focusses on providing pretrained models and tools to build systems that generate **continous outputs**, *e.g.* vision and audio.
- Diffusion models and schedulers are provided as consise, elementary building blocks whereas diffusion pipelines are a collection of end-to-end diffusion systems that can be used out-of-the-box, should stay as close as possible to their original implementation and can include components of other library, such as text-encoders. Examples for diffusion pipelines are [Glide](https://github.com/openai/glide-text2im) and [Latent Diffusion](https://github.com/CompVis/latent-diffusion).
--- a/docs/source/pipelines.mdx
+++ b/docs/source/pipelines.mdx
@@ -1,31 +0,0 @@
-<!--Copyright 2022 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-->
-
-# Pipelines
-
- Pipelines are a collection of end-to-end diffusion systems that can be used out-of-the-box
- Pipelines should stay as close as possible to their original implementation
- Pipelines can include components of other library, such as text-encoders.
-
-## API
-
-TODO(Patrick, Anton, Suraj)
-
-## Examples
-
- DDPM for unconditional image generation in [pipeline_ddpm](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/pipeline_ddpm.py).
- DDIM for unconditional image generation in [pipeline_ddim](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/pipeline_ddim.py).
- PNDM for unconditional image generation in [pipeline_pndm](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/pipeline_pndm.py).
- Latent diffusion for text to image generation / conditional image generation in [pipeline_latent_diffusion](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/pipeline_latent_diffusion.py).
- Glide for text to image generation / conditional image generation in [pipeline_glide](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/pipeline_glide.py).
- BDDMPipeline for spectrogram-to-sound vocoding in [pipeline_bddm](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/pipeline_bddm.py).
- Grad-TTS for text to audio generation / conditional audio generation in [pipeline_grad_tts](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/pipeline_grad_tts.py).
--- a/docs/source/pipelines/glide.mdx
+++ b/docs/source/pipelines/glide.mdx
@@ -1 +0,0 @@
-# GLIDE MODEL
--- a/docs/source/quicktour.mdx
+++ b/docs/source/quicktour.mdx
@@ -10,23 +10,137 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
 specific language governing permissions and limitations under the License.
 -->

-
-
 # Quicktour

-Start using Diffusers🧨 quickly!
-To start, use the [`DiffusionPipeline`] for quick inference and sample generations!
+Get up and running with 🧨 Diffusers quickly!
+Whether you're a developer or an everyday user, this quick tour will help you get started and show you how to use [`DiffusionPipeline`] for inference.

-```
-pip install diffusers
+Before you begin, make sure you have all the necessary libraries installed:
+
+```bash
+pip install --upgrade diffusers
 ```

-## Main classes
+## DiffusionPipeline

-### Models
+The [`DiffusionPipeline`] is the easiest way to use a pre-trained diffusion system for inference. You can use the [`DiffusionPipeline`] out-of-the-box for many tasks across different modalities. Take a look at the table below for some supported tasks:

-### Schedulers
+| **Task**                     | **Description**                                                                                              | **Pipeline**
+|------------------------------|--------------------------------------------------------------------------------------------------------------|-----------------|
+| Unconditional Image Generation          | generate an image from gaussian noise | [unconditional_image_generation](./using-diffusers/unconditional_image_generation) |
+| Text-Guided Image Generation | generate an image given a text prompt | [conditional_image_generation](./using-diffusers/conditional_image_generation) |
+| Text-Guided Image-to-Image Translation     | generate an image given an original image and a text prompt | [img2img](./using-diffusers/img2img) |
+| Text-Guided Image-Inpainting          | fill the masked part of an image given the image, the mask and a text prompt | [inpaint](./using-diffusers/inpaint) |

-### Pipeliens
+For more detailed information on how diffusion pipelines function for the different tasks, please have a look at the [**Using Diffusers**](./using-diffusers/overview) section.

+As an example, start by creating an instance of [`DiffusionPipeline`] and specify which pipeline checkpoint you would like to download.
+You can use the [`DiffusionPipeline`] for any [Diffusers' checkpoint](https://huggingface.co/models?library=diffusers&sort=downloads).
+In this guide though, you'll use [`DiffusionPipeline`] for text-to-image generation with [Latent Diffusion](https://huggingface.co/CompVis/ldm-text2im-large-256):

+```python
+>>> from diffusers import DiffusionPipeline
+
+>>> generator = DiffusionPipeline.from_pretrained("CompVis/ldm-text2im-large-256")
+```
+
+The [`DiffusionPipeline`] downloads and caches all modeling, tokenization, and scheduling components. 
+Because the model consists of roughly 1.4 billion parameters, we strongly recommend running it on GPU.
+You can move the generator object to GPU, just like you would in PyTorch.
+
+```python
+>>> generator.to("cuda")
+```
+
+Now you can use the `generator` on your text prompt:
+
+```python
+>>> image = generator("An image of a squirrel in Picasso style").images[0]
+```
+
+The output is by default wrapped into a [PIL Image object](https://pillow.readthedocs.io/en/stable/reference/Image.html?highlight=image#the-image-class).
+
+You can save the image by simply calling:
+
+```python
+>>> image.save("image_of_squirrel_painting.png")
+```
+
+More advanced models, like [Stable Diffusion](https://huggingface.co/CompVis/stable-diffusion), require you to accept a [license](https://huggingface.co/spaces/CompVis/stable-diffusion-license) before running the model.
+This is due to the improved image generation capabilities of the model and the potentially harmful content that could be produced with it.
+Long story short: Head over to your stable diffusion model of choice, *e.g.* [`CompVis/stable-diffusion-v1-4`](https://huggingface.co/CompVis/stable-diffusion-v1-4), read through the license and click-accept to get 
+access to the model. 
+You have to be a registered user in 🤗 Hugging Face Hub, and you'll also need to use an access token for the code to work. For more information on access tokens, please refer to [this section of the documentation](https://huggingface.co/docs/hub/security-tokens).
+Having "click-accepted" the license, you can save your token:
+
+```python
+AUTH_TOKEN = "<please-fill-with-your-token>"
+```
+
+You can then load [`CompVis/stable-diffusion-v1-4`](https://huggingface.co/CompVis/stable-diffusion-v1-4) 
+just like before, except that now you need to pass your `AUTH_TOKEN`:
+
+```python
+>>> from diffusers import DiffusionPipeline
+
+>>> generator = DiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", use_auth_token=AUTH_TOKEN)
+```
+
+If you do not pass your authentication token you will see that the diffusion system will not be correctly 
+downloaded. Forcing the user to pass an authentication token ensures that it can be verified that the 
+user has indeed read and accepted the license, which also means that an internet connection is required.
+
+**Note**: If you do not want to be forced to pass an authentication token, you can also simply download 
+the weights locally via:
+
+```
+git lfs install
+git clone https://huggingface.co/CompVis/stable-diffusion-v1-4
+```
+
+and then load locally saved weights into the pipeline. This way, you do not need to pass an authentication
+token. Assuming that `"./stable-diffusion-v1-4"` is the local path to the cloned stable-diffusion-v1-4 repo,
+you can also load the pipeline as follows:
+
+```python
+>>> generator = DiffusionPipeline.from_pretrained("./stable-diffusion-v1-4")
+```
+
+Running the pipeline is then identical to the code above as it's the same model architecture.
+
+```python
+>>> generator.to("cuda")
+>>> image = generator("An image of a squirrel in Picasso style").images[0]
+>>> image.save("image_of_squirrel_painting.png")
+```
+
+Diffusion systems can be used with multiple different [schedulers](./api/schedulers) each with their
+pros and cons. By default, Stable Diffusion runs with [`PNDMScheduler`], but it's very simple to 
+use a different scheduler. *E.g.* if you would instead like to use the [`LMSDiscreteScheduler`] scheduler,
+you could use it as follows:
+
+```python
+>>> from diffusers import LMSDiscreteScheduler, StableDiffusionPipeline
+
+>>> scheduler = LMSDiscreteScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear")
+
+>>> generator = StableDiffusionPipeline.from_pretrained(
+...     "CompVis/stable-diffusion-v1-4", scheduler=scheduler, use_auth_token=AUTH_TOKEN
+... )
+```
+
+[Stability AI's](https://stability.ai/) Stable Diffusion model is an impressive image generation model
+and can do much more than just generating images from text. We have dedicated a whole documentation page,
+just for Stable Diffusion [here](./conceptual/stable_diffusion).
+
+If you want to know how to optimize Stable Diffusion to use less memory, to run at higher inference speeds, or to run on specific hardware such as Mac or with [ONNX Runtime](https://onnxruntime.ai/), please have a look at our
+optimization pages:
+
+- [Optimized PyTorch on GPU](./optimization/fp16)
+- [Mac OS with PyTorch](./optimization/mps)
+- [ONNX](./optimization/onnx)
+- [OpenVINO](./optimization/open_vino)
+
+If you want to fine-tune or train your diffusion model, please have a look at the [**training section**](./training/overview).
+
+Finally, please be considerate when distributing generated images publicly 🤗.
--- a/docs/source/schedulers.mdx
+++ b/docs/source/schedulers.mdx
@@ -1,33 +0,0 @@
-<!--Copyright 2022 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-->
-
-# Schedulers
-
-The base class ['SchedulerMixin'] implements low level utilities used by multiple schedulers.
-At a high level:
- Schedulers are the algorithms to use diffusion models in inference as well as for training. They include the noise schedules and define algorithm-specific diffusion steps.
- Schedulers can be used interchangable between diffusion models in inference to find the preferred tradef-off between speed and generation quality.
- Schedulers are available in numpy, but can easily be transformed into PyTorch.
-
-## API
-
- Schedulers should provide one or more `def step(...)` functions that should be called iteratively to unroll the diffusion loop during
-the forward pass.
- Schedulers should be framework-agonstic, but provide a simple functionality to convert the scheduler into a specific framework, such as PyTorch
-with a `set_format(...)` method.
-
-## Examples
-
- The ['DDPMScheduler'] was proposed in [Denoising Diffusion Probabilistic Models](https://arxiv.org/abs/2006.11239) and can be found in [scheduling_ddpm.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/schedulers/scheduling_ddpm.py).
-An example of how to use this scheduler can be found in [pipeline_ddpm.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/pipeline_ddpm.py).
- The ['DDIMScheduler'] was proposed in [Denoising Diffusion Implicit Models](https://arxiv.org/abs/2010.02502) and can be found in [scheduling_ddim.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/schedulers/scheduling_ddim.py). An example of how to use this scheduler can be found in [pipeline_ddim.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/pipeline_ddim.py).
- The ['PNDMScheduler'] was proposed in [Pseudo Numerical Methods for Diffusion Models on Manifolds](https://arxiv.org/abs/2202.09778) and can be found in [scheduling_pndm.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/schedulers/scheduling_pndm.py). An example of how to use this scheduler can be found in [pipeline_pndm.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/pipeline_pndm.py).
--- a/docs/source/schedulers/ddpm.mdx
+++ b/docs/source/schedulers/ddpm.mdx
@@ -1,3 +0,0 @@
-# DDPM
-
-DDPM is a scheduler.
--- a/docs/source/training/overview.mdx
+++ b/docs/source/training/overview.mdx
@@ -0,0 +1,69 @@
+<!--Copyright 2022 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# 🧨 Diffusers Training Examples
+
+Diffusers examples are a collection of scripts to demonstrate how to effectively use the `diffusers` library
+for a variety of use cases.
+
+**Note**: If you are looking for **official** examples on how to use `diffusers` for inference, 
+please have a look at [src/diffusers/pipelines](https://github.com/huggingface/diffusers/tree/main/src/diffusers/pipelines).
+
+Our examples aspire to be **self-contained**, **easy-to-tweak**, **beginner-friendly** and for **one-purpose-only**.
+More specifically, this means:
+
+- **Self-contained**: An example script shall only depend on "pip-install-able" Python packages that can be found in a `requirements.txt` file. Example scripts shall **not** depend on any local files. This means that one can simply download an example script, *e.g.* [train_unconditional.py](https://github.com/huggingface/diffusers/blob/main/examples/unconditional_image_generation/train_unconditional.py), install the required dependencies, *e.g.* [requirements.txt](https://github.com/huggingface/diffusers/blob/main/examples/unconditional_image_generation/requirements.txt) and execute the example script.
+- **Easy-to-tweak**: While we strive to present as many use cases as possible, the example scripts are just that - examples. It is expected that they won't work out-of-the-box on your specific problem and that you will be required to change a few lines of code to adapt them to your needs. To help you with that, most of the examples fully expose the preprocessing of the data and the training loop to allow you to tweak and edit them as required.
+- **Beginner-friendly**: We do not aim for providing state-of-the-art training scripts for the newest models, but rather examples that can be used as a way to better understand diffusion models and how to use them with the `diffusers` library. We often purposefully leave out certain state-of-the-art methods if we consider them too complex for beginners.
+- **One-purpose-only**: Examples should show one task and one task only. Even if a task is from a modeling 
+point of view very similar, *e.g.* image super-resolution and image modification tend to use the same model and training method, we want examples to showcase only one task to keep them as readable and easy-to-understand as possible.
+
+We provide **official** examples that cover the most popular tasks of diffusion models.
+*Official* examples are **actively** maintained by the `diffusers` maintainers and we try to rigorously follow our example philosophy as defined above. 
+If you feel like another important example should exist, we are more than happy to welcome a [Feature Request](https://github.com/huggingface/diffusers/issues/new?assignees=&labels=&template=feature_request.md&title=) or directly a [Pull Request](https://github.com/huggingface/diffusers/compare) from you!
+
+Training examples show how to pretrain or fine-tune diffusion models for a variety of tasks. Currently we support:
+
+- [Unconditional Training](./unconditional_training)
+- [Text-to-Image Training](./text2image)
+- [Textual Inversion](./text_inversion)
+
+
+| Task | 🤗 Accelerate | 🤗 Datasets | Colab
+|---|---|:---:|:---:|
+| [**Unconditional Image Generation**](./unconditional_training) | ✅ | ✅ | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/training_example.ipynb)
+| [**Text-to-Image**](./text2image) | - | - | 
+| [**Textual Inversion**](./text_inversion) | ✅ | ✅ | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/sd_textual_inversion_training.ipynb)
+
+## Community
+
+In addition, we provide **community** examples, which are examples added and maintained by our community.
+Community examples can consist of both *training* examples and *inference* pipelines.
+For such examples, we are more lenient regarding the philosophy defined above and also cannot guarantee to provide maintenance for every issue.
+Examples that are useful for the community, but are either not yet deemed popular or not yet following our above philosophy should go into the [community examples](https://github.com/huggingface/diffusers/tree/main/examples/community) folder. The community folder therefore includes training examples and inference pipelines.
+**Note**: Community examples can be a [great first contribution](https://github.com/huggingface/diffusers/issues?q=is%3Aopen+is%3Aissue+label%3A%22good+first+issue%22) to show to the community how you like to use `diffusers` 🪄.
+
+## Important note
+
+To make sure you can successfully run the latest versions of the example scripts, you have to **install the library from source** and install some example-specific requirements. To do this, execute the following steps in a new virtual environment:
+
+```bash
+git clone https://github.com/huggingface/diffusers
+cd diffusers
+pip install .
+```
+
+Then `cd` into the example folder of your choice and run:
+
+```bash
+pip install -r requirements.txt
+```
--- a/docs/source/training/text2image.mdx
+++ b/docs/source/training/text2image.mdx
@@ -0,0 +1,16 @@
+<!--Copyright 2022 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+
+# Text-to-Image Training
+
+Under construction 🚧
--- a/docs/source/training/text_inversion.mdx
+++ b/docs/source/training/text_inversion.mdx
@@ -0,0 +1,122 @@
+<!--Copyright 2022 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+
+
+# Textual Inversion
+
+Textual Inversion is a technique for capturing novel concepts from a small number of example images in a way that can later be used to control text-to-image pipelines. It does so by learning new 'words' in the embedding space of the pipeline's text encoder. These special words can then be used within text prompts to achieve very fine-grained control of the resulting images. 
+
+![Textual Inversion example](https://textual-inversion.github.io/static/images/editing/colorful_teapot.JPG)
+_By using just 3-5 images you can teach new concepts to a model such as Stable Diffusion for personalized image generation ([image source](https://github.com/rinongal/textual_inversion))._
+
+This technique was introduced in [An Image is Worth One Word: Personalizing Text-to-Image Generation using Textual Inversion](https://arxiv.org/abs/2208.01618). The paper demonstrated the concept using a [latent diffusion model](https://github.com/CompVis/latent-diffusion) but the idea has since been applied to other variants such as [Stable Diffusion](https://huggingface.co/docs/diffusers/main/en/conceptual/stable_diffusion).
+
+
+## How It Works
+
+![Diagram from the paper showing overview](https://textual-inversion.github.io/static/images/training/training.JPG)
+_Architecture Overview from the [textual inversion blog post](https://textual-inversion.github.io/)_
+
+Before a text prompt can be used in a diffusion model, it must first be processed into a numerical representation. This typically involves tokenizing the text, converting each token to an embedding and then feeding those embeddings through a model (typically a transformer) whose output will be used as the conditioning for the diffusion model. 
+
+Textual inversion learns a new token embedding (v* in the diagram above). A prompt (that includes a token which will be mapped to this new embedding) is used in conjunction with a noised version of one or more training images as inputs to the generator model, which attempts to predict the denoised version of the image. The embedding is optimized based on how well the model does at this task - an embedding that better captures the object or style shown by the training images will give more useful information to the diffusion model and thus result in a lower denoising loss. After many steps (typically several thousand) with a variety of prompt and image variants the learned embedding should hopefully capture the essence of the new concept being taught.
+
+## Usage
+
+To train your own textual inversions, see the [example script here](https://github.com/huggingface/diffusers/tree/main/examples/textual_inversion). 
+
+There is also a notebook for training:
+[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/sd_textual_inversion_training.ipynb)
+
+And one for inference:
+[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/stable_conceptualizer_inference.ipynb)
+
+In addition to using concepts you have trained yourself, there is a community-created collection of trained textual inversions in the new [Stable Diffusion public concepts library](https://huggingface.co/sd-concepts-library) which you can also use from the inference notebook above. Over time this will hopefully grow into a useful resource as more examples are added.
+
+## Example: Running locally 
+
+The `textual_inversion.py` script [here](https://github.com/huggingface/diffusers/blob/main/examples/textual_inversion) shows how to implement the training procedure and adapt it for stable diffusion.
+
+### Installing the dependencies
+
+Before running the scripts, make sure to install the library's training dependencies:
+
+```bash
+pip install diffusers[training] accelerate transformers
+```
+
+And initialize an [🤗Accelerate](https://github.com/huggingface/accelerate/) environment with:
+
+```bash
+accelerate config
+```
+
+
+### Cat toy example
+
+You need to accept the model license before downloading or using the weights. In this example we'll use model version `v1-4`, so you'll need to visit [its card](https://huggingface.co/CompVis/stable-diffusion-v1-4), read the license and tick the checkbox if you agree. 
+
+You have to be a registered user in 🤗 Hugging Face Hub, and you'll also need to use an access token for the code to work. For more information on access tokens, please refer to [this section of the documentation](https://huggingface.co/docs/hub/security-tokens).
+
+Run the following command to authenticate your token
+
+```bash
+huggingface-cli login
+```
+
+If you have already cloned the repo, then you won't need to go through these steps.
+
+<br>
+
+Now let's get our dataset. Download 3-4 images from [here](https://drive.google.com/drive/folders/1fmJMs25nxS_rSNqS5hTcRdLem_YQXbq5) and save them in a directory. This will be our training data.
+
+And launch the training using
+
+```bash
+export MODEL_NAME="CompVis/stable-diffusion-v1-4"
+export DATA_DIR="path-to-dir-containing-images"
+
+accelerate launch textual_inversion.py \
+  --pretrained_model_name_or_path=$MODEL_NAME \
+  --train_data_dir=$DATA_DIR \
+  --learnable_property="object" \
+  --placeholder_token="<cat-toy>" --initializer_token="toy" \
+  --resolution=512 \
+  --train_batch_size=1 \
+  --gradient_accumulation_steps=4 \
+  --max_train_steps=3000 \
+  --learning_rate=5.0e-04 --scale_lr \
+  --lr_scheduler="constant" \
+  --lr_warmup_steps=0 \
+  --output_dir="textual_inversion_cat"
+```
+
+A full training run takes ~1 hour on one V100 GPU.
+
+
+### Inference
+
+Once you have trained a model using the above command, inference can be done simply using the `StableDiffusionPipeline`. Make sure to include the `placeholder_token` in your prompt.
+
+```python
+from diffusers import StableDiffusionPipeline
+
+model_id = "path-to-your-trained-model"
+pipe = StableDiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16).to("cuda")
+
+prompt = "A <cat-toy> backpack"
+
+image = pipe(prompt, num_inference_steps=50, guidance_scale=7.5).images[0]
+
+image.save("cat-backpack.png")
+```
--- a/docs/source/training/unconditional_training.mdx
+++ b/docs/source/training/unconditional_training.mdx
@@ -0,0 +1,149 @@
+<!--Copyright 2022 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Unconditional Image-Generation
+
+In this section, we explain how one can train an unconditional image generation diffusion 
+model. "Unconditional" because the model is not conditioned on any context to generate an image - once trained the model will simply generate images that resemble its training data 
+distribution.
+
+## Installing the dependencies
+
+Before running the scripts, make sure to install the library's training dependencies:
+
+```bash
+pip install diffusers[training] accelerate datasets
+```
+
+And initialize an [🤗Accelerate](https://github.com/huggingface/accelerate/) environment with:
+
+```bash
+accelerate config
+```
+
+## Unconditional Flowers  
+
+The command to train a DDPM UNet model on the Oxford Flowers dataset:
+
+```bash
+accelerate launch train_unconditional.py \
+  --dataset_name="huggan/flowers-102-categories" \
+  --resolution=64 \
+  --output_dir="ddpm-ema-flowers-64" \
+  --train_batch_size=16 \
+  --num_epochs=100 \
+  --gradient_accumulation_steps=1 \
+  --learning_rate=1e-4 \
+  --lr_warmup_steps=500 \
+  --mixed_precision=no \
+  --push_to_hub
+```
+An example trained model: https://huggingface.co/anton-l/ddpm-ema-flowers-64
+
+A full training run takes 2 hours on 4xV100 GPUs.
+
+<img src="https://user-images.githubusercontent.com/26864830/180248660-a0b143d0-b89a-42c5-8656-2ebf6ece7e52.png" width="700" />
+
+## Unconditional Pokemon 
+
+The command to train a DDPM UNet model on the Pokemon dataset:
+
+```bash
+accelerate launch train_unconditional.py \
+  --dataset_name="huggan/pokemon" \
+  --resolution=64 \
+  --output_dir="ddpm-ema-pokemon-64" \
+  --train_batch_size=16 \
+  --num_epochs=100 \
+  --gradient_accumulation_steps=1 \
+  --learning_rate=1e-4 \
+  --lr_warmup_steps=500 \
+  --mixed_precision=no \
+  --push_to_hub
+```
+An example trained model: https://huggingface.co/anton-l/ddpm-ema-pokemon-64
+
+A full training run takes 2 hours on 4xV100 GPUs.
+
+<img src="https://user-images.githubusercontent.com/26864830/180248200-928953b4-db38-48db-b0c6-8b740fe6786f.png" width="700" />
+
+
+## Using your own data
+
+To use your own dataset, there are 2 ways:
+- you can either provide your own folder as `--train_data_dir`
+- or you can upload your dataset to the hub (possibly as a private repo, if you prefer so), and simply pass the `--dataset_name` argument.
+
+**Note**: If you want to create your own training dataset please have a look at [this document](https://huggingface.co/docs/datasets/image_process#image-datasets).
+
+Below, we explain both in more detail.
+
+### Provide the dataset as a folder
+
+If you provide your own folders with images, the script expects the following directory structure:
+
+```bash
+data_dir/xxx.png
+data_dir/xxy.png
+data_dir/[...]/xxz.png
+```
+
+In other words, the script will take care of gathering all images inside the folder. You can then run the script like this:
+
+```bash
+accelerate launch train_unconditional.py \
+    --train_data_dir <path-to-train-directory> \
+    <other-arguments>
+```
+
+Internally, the script will use the [`ImageFolder`](https://huggingface.co/docs/datasets/v2.0.0/en/image_process#imagefolder) feature which will automatically turn the folders into 🤗 Dataset objects.
+
+### Upload your data to the hub, as a (possibly private) repo
+
+It's very easy (and convenient) to upload your image dataset to the hub using the [`ImageFolder`](https://huggingface.co/docs/datasets/v2.0.0/en/image_process#imagefolder) feature available in 🤗 Datasets. Simply do the following:
+
+```python
+from datasets import load_dataset
+
+# example 1: local folder
+dataset = load_dataset("imagefolder", data_dir="path_to_your_folder")
+
+# example 2: local files (supported formats are tar, gzip, zip, xz, rar, zstd)
+dataset = load_dataset("imagefolder", data_files="path_to_zip_file")
+
+# example 3: remote files (supported formats are tar, gzip, zip, xz, rar, zstd)
+dataset = load_dataset(
+    "imagefolder",
+    data_files="https://download.microsoft.com/download/3/E/1/3E1C3F21-ECDB-4869-8368-6DEBA77B919F/kagglecatsanddogs_3367a.zip",
+)
+
+# example 4: providing several splits
+dataset = load_dataset(
+    "imagefolder", data_files={"train": ["path/to/file1", "path/to/file2"], "test": ["path/to/file3", "path/to/file4"]}
+)
+```
+
+`ImageFolder` will create an `image` column containing the PIL-encoded images.
+
+Next, push it to the hub!
+
+```python
+# assuming you have run the huggingface-cli login command in a terminal
+dataset.push_to_hub("name_of_your_dataset")
+
+# if you want to push to a private repo, simply pass private=True:
+dataset.push_to_hub("name_of_your_dataset", private=True)
+```
+
+and that's it! You can now train your model by simply setting the `--dataset_name` argument to the name of your dataset on the hub.
+
+More on this can also be found in [this blog post](https://huggingface.co/blog/image-search-datasets).
--- a/docs/source/using-diffusers/conditional_image_generation.mdx
+++ b/docs/source/using-diffusers/conditional_image_generation.mdx
@@ -0,0 +1,48 @@
+<!--Copyright 2022 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Conditional Image Generation
+
+The [`DiffusionPipeline`] is the easiest way to use a pre-trained diffusion system for inference.
+
+Start by creating an instance of [`DiffusionPipeline`] and specify which pipeline checkpoint you would like to download.
+You can use the [`DiffusionPipeline`] for any [Diffusers' checkpoint](https://huggingface.co/models?library=diffusers&sort=downloads).
+In this guide though, you'll use [`DiffusionPipeline`] for text-to-image generation with [Latent Diffusion](https://huggingface.co/CompVis/ldm-text2im-large-256):
+
+```python
+>>> from diffusers import DiffusionPipeline
+
+>>> generator = DiffusionPipeline.from_pretrained("CompVis/ldm-text2im-large-256")
+```
+The [`DiffusionPipeline`] downloads and caches all modeling, tokenization, and scheduling components. 
+Because the model consists of roughly 1.4 billion parameters, we strongly recommend running it on GPU.
+You can move the generator object to GPU, just like you would in PyTorch.
+
+```python
+>>> generator.to("cuda")
+```
+
+Now you can use the `generator` on your text prompt:
+
+```python
+>>> image = generator("An image of a squirrel in Picasso style").images[0]
+```
+
+The output is by default wrapped into a [PIL Image object](https://pillow.readthedocs.io/en/stable/reference/Image.html?highlight=image#the-image-class).
+
+You can save the image by simply calling:
+
+```python
+>>> image.save("image_of_squirrel_painting.png")
+```
+
+
--- a/docs/source/using-diffusers/configuration.mdx
+++ b/docs/source/using-diffusers/configuration.mdx
@@ -0,0 +1,32 @@
+<!--Copyright 2022 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+
+
+# Quicktour
+
+Start using Diffusers🧨 quickly!
+To start, use the [`DiffusionPipeline`] for quick inference and sample generations!
+
+```
+pip install diffusers
+```
+
+## Main classes
+
+### Models
+
+### Schedulers
+
+### Pipelines
+
+
--- a/docs/source/using-diffusers/custom.mdx
+++ b/docs/source/using-diffusers/custom.mdx
@@ -0,0 +1,15 @@
+<!--Copyright 2022 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Custom Pipeline
+
+Under construction 🚧
--- a/docs/source/using-diffusers/custom_pipelines.mdx
+++ b/docs/source/using-diffusers/custom_pipelines.mdx
@@ -0,0 +1,121 @@
+<!--Copyright 2022 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Custom Pipelines
+
+Diffusers allows you to conveniently load any custom pipeline from the Hugging Face Hub as well as any [official community pipeline](https://github.com/huggingface/diffusers/tree/main/examples/community) 
+via the [`DiffusionPipeline`] class.
+
+## Loading custom pipelines from the Hub
+
+Custom pipelines can be easily loaded from any model repository on the Hub that defines a diffusion pipeline in a `pipeline.py` file.
+Let's load a dummy pipeline from [hf-internal-testing/diffusers-dummy-pipeline](https://huggingface.co/hf-internal-testing/diffusers-dummy-pipeline).
+
+All you need to do is pass the custom pipeline repo id with the `custom_pipeline` argument alongside the repo from where you wish to load the pipeline modules.
+
+```python
+from diffusers import DiffusionPipeline
+
+pipeline = DiffusionPipeline.from_pretrained(
+    "google/ddpm-cifar10-32", custom_pipeline="hf-internal-testing/diffusers-dummy-pipeline"
+)
+```
+
+This will load the custom pipeline as defined in the [model repository](https://huggingface.co/hf-internal-testing/diffusers-dummy-pipeline/blob/main/pipeline.py).
+
+<Tip warning={true} >
+
+By loading a custom pipeline from the Hugging Face Hub, you are trusting that the code you are loading 
+is safe 🔒. Make sure to check out the code online before loading & running it automatically.
+
+</Tip>
+
+## Loading official community pipelines
+
+Community pipelines are summarized in the [community examples folder](https://github.com/huggingface/diffusers/tree/main/examples/community).
+
+Similarly, you need to pass both the *repo id* from where you wish to load the weights as well as the `custom_pipeline` argument. Here the `custom_pipeline` argument should consist simply of the filename of the community pipeline excluding the `.py` suffix, *e.g.* `clip_guided_stable_diffusion`.
+
+Since community pipelines are often more complex, one can mix loading weights from an official *repo id*
+and passing pipeline modules directly.
+
+```python
+from diffusers import DiffusionPipeline
+from transformers import CLIPFeatureExtractor, CLIPModel
+
+clip_model_id = "laion/CLIP-ViT-B-32-laion2B-s34B-b79K"
+
+feature_extractor = CLIPFeatureExtractor.from_pretrained(clip_model_id)
+clip_model = CLIPModel.from_pretrained(clip_model_id)
+
+pipeline = DiffusionPipeline.from_pretrained(
+    "CompVis/stable-diffusion-v1-4",
+    custom_pipeline="clip_guided_stable_diffusion",
+    clip_model=clip_model,
+    feature_extractor=feature_extractor,
+)
+```
+
+## Adding custom pipelines to the Hub
+
+To add a custom pipeline to the Hub, all you need to do is to define a pipeline class that inherits 
+from [`DiffusionPipeline`] in a `pipeline.py` file.
+Make sure that the whole pipeline is encapsulated within a single class and that the `pipeline.py` file
+has only one such class.
+
+Let's quickly define an example pipeline.
+
+
+```python
+import torch
+from diffusers import DiffusionPipeline
+
+
+class MyPipeline(DiffusionPipeline):
+    def __init__(self, unet, scheduler):
+        super().__init__()
+
+    self.register_modules(unet=unet, scheduler=scheduler)
+
+    @torch.no_grad()
+    def __call__(self, batch_size: int = 1, num_inference_steps: int = 50):
+        # Sample gaussian noise to begin loop
+        image = torch.randn((batch_size, self.unet.in_channels, self.unet.sample_size, self.unet.sample_size))
+
+        image = image.to(self.device)
+
+        # set step values
+        self.scheduler.set_timesteps(num_inference_steps)
+
+        for t in self.progress_bar(self.scheduler.timesteps):
+            # 1. predict noise model_output
+            model_output = self.unet(image, t).sample
+
+            # 2. predict previous mean of image x_t-1 and add variance depending on eta
+            # eta corresponds to η in paper and should be between [0, 1]
+            # do x_t -> x_t-1
+            image = self.scheduler.step(model_output, t, image, eta).prev_sample
+
+        image = (image / 2 + 0.5).clamp(0, 1)
+        image = image.cpu().permute(0, 2, 3, 1).numpy()
+
+        return image
+```
+
+Now you can upload this short file under the name `pipeline.py` in your preferred [model repository](https://huggingface.co/docs/hub/models-uploading). For Stable Diffusion pipelines, you may also [join the community organisation for shared pipelines](https://huggingface.co/organizations/sd-diffusers-pipelines-library/share/BUPyDUuHcciGTOKaExlqtfFcyCZsVFdrjr) to upload yours.
+Finally, we can load the custom pipeline by passing the name of the model repository that hosts it, *e.g.* `patrickvonplaten/my_custom_pipeline`, alongside the model repository from where we want to load the `unet` and `scheduler` components.
+
+```python
+my_pipeline = DiffusionPipeline.from_pretrained(
+    "google/ddpm-cifar10-32", custom_pipeline="patrickvonplaten/my_custom_pipeline"
+)
+```
--- a/docs/source/using-diffusers/img2img.mdx
+++ b/docs/source/using-diffusers/img2img.mdx
@@ -0,0 +1,45 @@
+<!--Copyright 2022 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Text-Guided Image-to-Image Generation
+
+The [`StableDiffusionImg2ImgPipeline`] lets you pass a text prompt and an initial image to condition the generation of new images.
+
+```python
+import torch
+import requests
+from PIL import Image
+from io import BytesIO
+
+from diffusers import StableDiffusionImg2ImgPipeline
+
+# load the pipeline
+device = "cuda"
+pipe = StableDiffusionImg2ImgPipeline.from_pretrained(
+    "CompVis/stable-diffusion-v1-4", revision="fp16", torch_dtype=torch.float16
+).to(device)
+
+# let's download an initial image
+url = "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/assets/stable-samples/img2img/sketch-mountains-input.jpg"
+
+response = requests.get(url)
+init_image = Image.open(BytesIO(response.content)).convert("RGB")
+init_image = init_image.resize((768, 512))
+
+prompt = "A fantasy landscape, trending on artstation"
+
+images = pipe(prompt=prompt, init_image=init_image, strength=0.75, guidance_scale=7.5).images
+
+images[0].save("fantasy_landscape.png")
+```
+You can also run this example on colab [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/image_2_image_using_diffusers.ipynb)
+
--- a/docs/source/using-diffusers/inpaint.mdx
+++ b/docs/source/using-diffusers/inpaint.mdx
@@ -0,0 +1,63 @@
+<!--Copyright 2022 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Text-Guided Image-Inpainting
+
+The [`StableDiffusionInpaintPipeline`] lets you edit specific parts of an image by providing a mask and a text prompt. It uses a version of Stable Diffusion specifically trained for in-painting tasks.
+
+<Tip warning={true}>
+Note that this model is distributed separately from the regular Stable Diffusion model, so you have to accept its license even if you accepted the Stable Diffusion one in the past.
+
+Please visit the [model card](https://huggingface.co/runwayml/stable-diffusion-inpainting), read the license carefully and tick the checkbox if you agree. You have to be a registered user on the 🤗 Hugging Face Hub, and you'll also need to use an access token for the code to work. For more information on access tokens, please refer to [this section](https://huggingface.co/docs/hub/security-tokens) of the documentation.
+</Tip>
+
+```python
+import PIL
+import requests
+import torch
+from io import BytesIO
+
+from diffusers import StableDiffusionInpaintPipeline
+
+
+def download_image(url):
+    response = requests.get(url)
+    return PIL.Image.open(BytesIO(response.content)).convert("RGB")
+
+
+img_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo.png"
+mask_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo_mask.png"
+
+init_image = download_image(img_url).resize((512, 512))
+mask_image = download_image(mask_url).resize((512, 512))
+
+pipe = StableDiffusionInpaintPipeline.from_pretrained(
+    "runwayml/stable-diffusion-inpainting",
+    revision="fp16",
+    torch_dtype=torch.float16,
+)
+pipe = pipe.to("cuda")
+
+prompt = "Face of a yellow cat, high resolution, sitting on a park bench"
+image = pipe(prompt=prompt, image=init_image, mask_image=mask_image).images[0]
+```
+
+`image`          | `mask_image` | `prompt` | **Output** |
+:-------------------------:|:-------------------------:|:-------------------------:|-------------------------:|
+<img src="https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo.png" alt="drawing" width="250"/> | <img src="https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo_mask.png" alt="drawing" width="250"/> | ***Face of a yellow cat, high resolution, sitting on a park bench*** | <img src="https://huggingface.co/datasets/patrickvonplaten/images/resolve/main/test.png" alt="drawing" width="250"/> |
+
+
+You can also run this example on colab [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/in_painting_with_stable_diffusion_using_diffusers.ipynb)
+
+<Tip warning={true}>
+A previous experimental implementation of in-painting used a different, lower-quality process. To ensure backwards compatibility, loading a pretrained pipeline that doesn't contain the new model will still apply the old in-painting method.
+</Tip>
--- a/docs/source/using-diffusers/loading.mdx
+++ b/docs/source/using-diffusers/loading.mdx
@@ -0,0 +1,15 @@
+<!--Copyright 2022 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Loading 
+
+Under construction 🚧
--- a/docs/source/using-diffusers/unconditional_image_generation.mdx
+++ b/docs/source/using-diffusers/unconditional_image_generation.mdx
@@ -0,0 +1,52 @@
+<!--Copyright 2022 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+
+
+# Unconditional Image Generation
+
+The [`DiffusionPipeline`] is the easiest way to use a pre-trained diffusion system for inference.
+
+Start by creating an instance of [`DiffusionPipeline`] and specify which pipeline checkpoint you would like to download.
+You can use the [`DiffusionPipeline`] for any [Diffusers' checkpoint](https://huggingface.co/models?library=diffusers&sort=downloads).
+In this guide though, you'll use [`DiffusionPipeline`] for unconditional image generation with [DDPM](https://arxiv.org/abs/2006.11239):
+
+```python
+>>> from diffusers import DiffusionPipeline
+
+>>> generator = DiffusionPipeline.from_pretrained("google/ddpm-celebahq-256")
+```
+The [`DiffusionPipeline`] downloads and caches all modeling, tokenization, and scheduling components. 
+Because generating an image requires many sequential denoising steps through the model, we strongly recommend running it on GPU.
+You can move the generator object to GPU, just like you would in PyTorch.
+
+```python
+>>> generator.to("cuda")
+```
+
+Now you can use the `generator` to generate an image:
+
+```python
+>>> image = generator().images[0]
+```
+
+The output is by default wrapped into a [PIL Image object](https://pillow.readthedocs.io/en/stable/reference/Image.html?highlight=image#the-image-class).
+
+You can save the image by simply calling:
+
+```python
+>>> image.save("generated_image.png")
+```
+
+
+
+
--- a/examples/README.md
+++ b/examples/README.md
@@ -1,129 +1,62 @@
-## Training examples
+<!---
+Copyright 2022 The HuggingFace Team. All rights reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at

-Creating a training image set is [described in a different document](https://huggingface.co/docs/datasets/image_process#image-datasets).
+    http://www.apache.org/licenses/LICENSE-2.0

-### Installing the dependencies
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->

-Before running the scipts, make sure to install the library's training dependencies:
+# 🧨 Diffusers Examples

+Diffusers examples are a collection of scripts to demonstrate how to effectively use the `diffusers` library
+for a variety of use cases.
+
+**Note**: If you are looking for **official** examples on how to use `diffusers` for inference, 
+please have a look at [src/diffusers/pipelines](https://github.com/huggingface/diffusers/tree/main/src/diffusers/pipelines)
+
+Our examples aspire to be **self-contained**, **easy-to-tweak**, **beginner-friendly** and for **one-purpose-only**.
+More specifically, this means:
+
+- **Self-contained**: An example script shall only depend on "pip-install-able" Python packages that can be found in a `requirements.txt` file. Example scripts shall **not** depend on any local files. This means that one can simply download an example script, *e.g.* [train_unconditional.py](https://github.com/huggingface/diffusers/blob/main/examples/unconditional_image_generation/train_unconditional.py), install the required dependencies, *e.g.* [requirements.txt](https://github.com/huggingface/diffusers/blob/main/examples/unconditional_image_generation/requirements.txt) and execute the example script.
+- **Easy-to-tweak**: While we strive to present as many use cases as possible, the example scripts are just that - examples. It is expected that they won't work out-of-the box on your specific problem and that you will be required to change a few lines of code to adapt them to your needs. To help you with that, most of the examples fully expose the preprocessing of the data and the training loop to allow you to tweak and edit them as required.
+- **Beginner-friendly**: We do not aim for providing state-of-the-art training scripts for the newest models, but rather examples that can be used as a way to better understand diffusion models and how to use them with the `diffusers` library. We often purposefully leave out certain state-of-the-art methods if we consider them too complex for beginners.
+- **One-purpose-only**: Examples should show one task and one task only. Even if a task is from a modeling 
+point of view very similar, *e.g.* image super-resolution and image modification tend to use the same model and training method, we want examples to showcase only one task to keep them as readable and easy-to-understand as possible.
+
+We provide **official** examples that cover the most popular tasks of diffusion models.
+*Official* examples are **actively** maintained by the `diffusers` maintainers and we try to rigorously follow our example philosophy as defined above. 
+If you feel like another important example should exist, we are more than happy to welcome a [Feature Request](https://github.com/huggingface/diffusers/issues/new?assignees=&labels=&template=feature_request.md&title=) or directly a [Pull Request](https://github.com/huggingface/diffusers/compare) from you!
+
+Training examples show how to pretrain or fine-tune diffusion models for a variety of tasks. Currently we support:
+
+| Task | 🤗 Accelerate | 🤗 Datasets | Colab
+|---|---|:---:|:---:|
+| [**Unconditional Image Generation**](https://github.com/huggingface/diffusers/blob/main/examples/unconditional_image_generation/train_unconditional.py) | ✅ | ✅ | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/training_example.ipynb)
+
+## Community
+
+In addition, we provide **community** examples, which are examples added and maintained by our community.
+Community examples can consist of both *training* examples or *inference* pipelines.
+For such examples, we are more lenient regarding the philosophy defined above and also cannot guarantee to provide maintenance for every issue.
+Examples that are useful for the community, but are either not yet deemed popular or not yet following our above philosophy should go into the [community examples](https://github.com/huggingface/diffusers/tree/main/examples/community) folder. The community folder therefore includes training examples and inference pipelines.
+**Note**: Community examples can be a [great first contribution](https://github.com/huggingface/diffusers/issues?q=is%3Aopen+is%3Aissue+label%3A%22good+first+issue%22) to show to the community how you like to use `diffusers` 🪄.
+
+## Important note
+
+To make sure you can successfully run the latest versions of the example scripts, you have to **install the library from source** and install some example-specific requirements. To do this, execute the following steps in a new virtual environment:
 ```bash
-pip install diffusers[training] accelerate datasets
+git clone https://github.com/huggingface/diffusers
+cd diffusers
+pip install .
 ```
-
-And initialize an [🤗Accelerate](https://github.com/huggingface/accelerate/) environment with:
-
+Then `cd` into the example folder of your choice and run:
 ```bash
-accelerate config
+pip install -r requirements.txt
 ```
-
-### Unconditional Flowers  
-
-The command to train a DDPM UNet model on the Oxford Flowers dataset:
-
-```bash
-accelerate launch train_unconditional.py \
-  --dataset_name="huggan/flowers-102-categories" \
-  --resolution=64 \
-  --output_dir="ddpm-ema-flowers-64" \
-  --train_batch_size=16 \
-  --num_epochs=100 \
-  --gradient_accumulation_steps=1 \
-  --learning_rate=1e-4 \
-  --lr_warmup_steps=500 \
-  --mixed_precision=no \
-  --push_to_hub
-```
-An example trained model: https://huggingface.co/anton-l/ddpm-ema-flowers-64
-
-A full training run takes 2 hours on 4xV100 GPUs.
-
-<img src="https://user-images.githubusercontent.com/26864830/180248660-a0b143d0-b89a-42c5-8656-2ebf6ece7e52.png" width="700" />
-
-
-### Unconditional Pokemon 
-
-The command to train a DDPM UNet model on the Pokemon dataset:
-
-```bash
-accelerate launch train_unconditional.py \
-  --dataset_name="huggan/pokemon" \
-  --resolution=64 \
-  --output_dir="ddpm-ema-pokemon-64" \
-  --train_batch_size=16 \
-  --num_epochs=100 \
-  --gradient_accumulation_steps=1 \
-  --learning_rate=1e-4 \
-  --lr_warmup_steps=500 \
-  --mixed_precision=no \
-  --push_to_hub
-```
-An example trained model: https://huggingface.co/anton-l/ddpm-ema-pokemon-64
-
-A full training run takes 2 hours on 4xV100 GPUs.
-
-<img src="https://user-images.githubusercontent.com/26864830/180248200-928953b4-db38-48db-b0c6-8b740fe6786f.png" width="700" />
-
-
-### Using your own data
-
-To use your own dataset, there are 2 ways:
- you can either provide your own folder as `--train_data_dir`
- or you can upload your dataset to the hub (possibly as a private repo, if you prefer so), and simply pass the `--dataset_name` argument.
-
-Below, we explain both in more detail.
-
-#### Provide the dataset as a folder
-
-If you provide your own folders with images, the script expects the following directory structure:
-
-```bash
-data_dir/xxx.png
-data_dir/xxy.png
-data_dir/[...]/xxz.png
-```
-
-In other words, the script will take care of gathering all images inside the folder. You can then run the script like this:
-
-```bash
-accelerate launch train_unconditional.py \
-    --train_data_dir <path-to-train-directory> \
-    <other-arguments>
-```
-
-Internally, the script will use the [`ImageFolder`](https://huggingface.co/docs/datasets/v2.0.0/en/image_process#imagefolder) feature which will automatically turn the folders into 🤗 Dataset objects.
-
-#### Upload your data to the hub, as a (possibly private) repo
-
-It's very easy (and convenient) to upload your image dataset to the hub using the [`ImageFolder`](https://huggingface.co/docs/datasets/v2.0.0/en/image_process#imagefolder) feature available in 🤗 Datasets. Simply do the following:
-
-```python
-from datasets import load_dataset
-
-# example 1: local folder
-dataset = load_dataset("imagefolder", data_dir="path_to_your_folder")
-
-# example 2: local files (suppoted formats are tar, gzip, zip, xz, rar, zstd)
-dataset = load_dataset("imagefolder", data_files="path_to_zip_file")
-
-# example 3: remote files (supported formats are tar, gzip, zip, xz, rar, zstd)
-dataset = load_dataset("imagefolder", data_files="https://download.microsoft.com/download/3/E/1/3E1C3F21-ECDB-4869-8368-6DEBA77B919F/kagglecatsanddogs_3367a.zip")
-
-# example 4: providing several splits
-dataset = load_dataset("imagefolder", data_files={"train": ["path/to/file1", "path/to/file2"], "test": ["path/to/file3", "path/to/file4"]})
-```
-
-`ImageFolder` will create an `image` column containing the PIL-encoded images.
-
-Next, push it to the hub!
-
-```python
-# assuming you have ran the huggingface-cli login command in a terminal
-dataset.push_to_hub("name_of_your_dataset")
-
-# if you want to push to a private repo, simply pass private=True:
-dataset.push_to_hub("name_of_your_dataset", private=True)
-```
-
-and that's it! You can now train your model by simply setting the `--dataset_name` argument to the name of your dataset on the hub.
-
-More on this can also be found in [this blog post](https://huggingface.co/blog/image-search-datasets).
--- a/examples/community/README.md
+++ b/examples/community/README.md
@@ -0,0 +1,170 @@
+# Community Examples
+
+> **For more information about community pipelines, please have a look at [this issue](https://github.com/huggingface/diffusers/issues/841).**
+
+**Community** examples consist of both inference and training examples that have been added by the community.
+Please have a look at the following table to get an overview of all community examples. Click on the **Code Example** to get a copy-and-paste ready code example that you can try out.
+If a community example doesn't work as expected, please open an issue and ping its author.
+
+| Example   |      Description      |      Code Example |  Colab |    Author |
+|:----------|:----------------------|:-----------------|:-------------|----------:|
+| CLIP Guided Stable Diffusion | Doing CLIP guidance for text to image generation with Stable Diffusion| [CLIP Guided Stable Diffusion](#clip-guided-stable-diffusion) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/CLIP_Guided_Stable_diffusion_with_diffusers.ipynb) | [Suraj Patil](https://github.com/patil-suraj/) | 
+| One Step U-Net (Dummy) | Example showcasing how to use Community Pipelines (see https://github.com/huggingface/diffusers/issues/841) | [One Step U-Net](#one-step-unet) | - | [Patrick von Platen](https://github.com/patrickvonplaten/) |
+| Stable Diffusion Interpolation | Interpolate the latent space of Stable Diffusion between different prompts/seeds | [Stable Diffusion Interpolation](#stable-diffusion-interpolation) | - | [Nate Raw](https://github.com/nateraw/) |
+| Stable Diffusion Mega | **One** Stable Diffusion Pipeline with all functionalities of [Text2Image](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py), [Image2Image](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py) and [Inpainting](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py) | [Stable Diffusion Mega](#stable-diffusion-mega) | - | [Patrick von Platen](https://github.com/patrickvonplaten/) |
+
+To load a custom pipeline, pass the `custom_pipeline` argument to `DiffusionPipeline.from_pretrained`, set to the name of one of the files in `diffusers/examples/community` (without the `.py` extension). Feel free to send a PR with your own pipelines, we will merge them quickly.
+```py
+pipe = DiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", custom_pipeline="filename_in_the_community_folder")
+```
+
+## Example usages
+
+### CLIP Guided Stable Diffusion
+
+CLIP guided stable diffusion can help to generate more realistic images 
+by guiding stable diffusion at every denoising step with an additional CLIP model.
+
+The following code requires roughly 12GB of GPU RAM.
+
+```python
+from diffusers import DiffusionPipeline
+from transformers import CLIPFeatureExtractor, CLIPModel
+import torch
+
+
+feature_extractor = CLIPFeatureExtractor.from_pretrained("laion/CLIP-ViT-B-32-laion2B-s34B-b79K")
+clip_model = CLIPModel.from_pretrained("laion/CLIP-ViT-B-32-laion2B-s34B-b79K", torch_dtype=torch.float16)
+
+
+guided_pipeline = DiffusionPipeline.from_pretrained(
+    "CompVis/stable-diffusion-v1-4",
+    custom_pipeline="clip_guided_stable_diffusion",
+    clip_model=clip_model,
+    feature_extractor=feature_extractor,
+    revision="fp16",
+    torch_dtype=torch.float16,
+)
+guided_pipeline.enable_attention_slicing()
+guided_pipeline = guided_pipeline.to("cuda")
+
+prompt = "fantasy book cover, full moon, fantasy forest landscape, golden vector elements, fantasy magic, dark light night, intricate, elegant, sharp focus, illustration, highly detailed, digital painting, concept art, matte, art by WLOP and Artgerm and Albert Bierstadt, masterpiece"
+
+generator = torch.Generator(device="cuda").manual_seed(0)
+images = []
+for i in range(4):
+    image = guided_pipeline(
+        prompt,
+        num_inference_steps=50,
+        guidance_scale=7.5,
+        clip_guidance_scale=100,
+        num_cutouts=4,
+        use_cutouts=False,
+        generator=generator,
+    ).images[0]
+    images.append(image)
+    
+# save images locally
+for i, img in enumerate(images):
+    img.save(f"./clip_guided_sd/image_{i}.png")
+```
+
+The `images` list contains PIL images that can be saved locally or displayed directly in a Google Colab.
+Generated images tend to be of higher quality than when using Stable Diffusion natively. E.g. the above script generates the following images:
+
+![clip_guidance](https://huggingface.co/datasets/patrickvonplaten/images/resolve/main/clip_guidance/merged_clip_guidance.jpg).
+
+### One Step Unet
+
+The dummy "one-step-unet" can be run as follows:
+
+```python
+from diffusers import DiffusionPipeline
+
+pipe = DiffusionPipeline.from_pretrained("google/ddpm-cifar10-32", custom_pipeline="one_step_unet")
+pipe()
+```
+
+**Note**: This community pipeline is not useful as a feature, but rather just serves as an example of how community pipelines can be added (see https://github.com/huggingface/diffusers/issues/841).
+
+### Stable Diffusion Interpolation
+
+The following code can be run on a GPU of at least 8GB VRAM and should take approximately 5 minutes.
+
+```python
+from diffusers import DiffusionPipeline
+import torch
+
+pipe = DiffusionPipeline.from_pretrained(
+    "CompVis/stable-diffusion-v1-4",
+    revision='fp16',
+    torch_dtype=torch.float16,
+    safety_checker=None,  # Very important for videos...lots of false positives while interpolating
+    custom_pipeline="interpolate_stable_diffusion",
+).to('cuda')
+pipe.enable_attention_slicing()
+
+frame_filepaths = pipe.walk(
+    prompts=['a dog', 'a cat', 'a horse'],
+    seeds=[42, 1337, 1234],
+    num_interpolation_steps=16,
+    output_dir='./dreams',
+    batch_size=4,
+    height=512,
+    width=512,
+    guidance_scale=8.5,
+    num_inference_steps=50,
+)
+```
+
+The `walk(...)` function returns a list of the images saved under the folder defined in `output_dir`. You can use these images to create videos of stable diffusion.
+
+> **Please have a look at https://github.com/nateraw/stable-diffusion-videos for more in-detail information on how to create videos using stable diffusion as well as more feature-complete functionality.**
+
+### Stable Diffusion Mega
+
+The Stable Diffusion Mega Pipeline lets you use the main use cases of the stable diffusion pipeline in a single class.
+
+```python
+#!/usr/bin/env python3
+from diffusers import DiffusionPipeline
+import PIL
+import requests
+from io import BytesIO
+import torch
+
+
+def download_image(url):
+    response = requests.get(url)
+    return PIL.Image.open(BytesIO(response.content)).convert("RGB")
+
+pipe = DiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", custom_pipeline="stable_diffusion_mega", torch_dtype=torch.float16, revision="fp16")
+pipe.to("cuda")
+pipe.enable_attention_slicing()
+
+
+### Text-to-Image
+
+images = pipe.text2img("An astronaut riding a horse").images
+
+### Image-to-Image
+
+init_image = download_image("https://raw.githubusercontent.com/CompVis/stable-diffusion/main/assets/stable-samples/img2img/sketch-mountains-input.jpg")
+
+prompt = "A fantasy landscape, trending on artstation"
+
+images = pipe.img2img(prompt=prompt, init_image=init_image, strength=0.75, guidance_scale=7.5).images
+
+### Inpainting
+
+img_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo.png"
+mask_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo_mask.png"
+init_image = download_image(img_url).resize((512, 512))
+mask_image = download_image(mask_url).resize((512, 512))
+
+prompt = "a cat sitting on a bench"
+images = pipe.inpaint(prompt=prompt, init_image=init_image, mask_image=mask_image, strength=0.75).images
+```
+
+As shown above, this single pipeline can run "text-to-image", "image-to-image", and "inpainting".
+
--- a/examples/community/clip_guided_stable_diffusion.py
+++ b/examples/community/clip_guided_stable_diffusion.py
@@ -0,0 +1,324 @@
+import inspect
+from typing import List, Optional, Union
+
+import torch
+from torch import nn
+from torch.nn import functional as F
+
+from diffusers import AutoencoderKL, DiffusionPipeline, LMSDiscreteScheduler, PNDMScheduler, UNet2DConditionModel
+from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import StableDiffusionPipelineOutput
+from torchvision import transforms
+from transformers import CLIPFeatureExtractor, CLIPModel, CLIPTextModel, CLIPTokenizer
+
+
+class MakeCutouts(nn.Module):
+    """Draws random square crops from an image batch and pools each one to `cut_size`."""
+
+    def __init__(self, cut_size, cut_power=1.0):
+        super().__init__()
+
+        self.cut_size = cut_size
+        self.cut_power = cut_power
+
+    def forward(self, pixel_values, num_cutouts):
+        """Return `num_cutouts` random crops of `pixel_values`, concatenated along dim 0."""
+        height, width = pixel_values.shape[2:4]
+        largest = min(width, height)
+        smallest = min(width, height, self.cut_size)
+        span = largest - smallest
+
+        def one_cutout():
+            # The draw order (size, x offset, y offset) is kept so seeded runs reproduce.
+            scale = torch.rand([]) ** self.cut_power
+            side = int(scale * span + smallest)
+            left = torch.randint(0, width - side + 1, ())
+            top = torch.randint(0, height - side + 1, ())
+            crop = pixel_values[:, :, top : top + side, left : left + side]
+            return F.adaptive_avg_pool2d(crop, self.cut_size)
+
+        return torch.cat([one_cutout() for _ in range(num_cutouts)])
+
+
+def spherical_dist_loss(x, y):
+    """L2-normalize `x` and `y` along the last dim and return `2 * arcsin(||x - y|| / 2) ** 2`."""
+    unit_x = F.normalize(x, dim=-1)
+    unit_y = F.normalize(y, dim=-1)
+    half_chord = (unit_x - unit_y).norm(dim=-1).div(2)
+    return half_chord.arcsin().pow(2).mul(2)
+
+
+def set_requires_grad(model, value):
+    """Set `requires_grad` to `value` on every parameter of `model`."""
+    for weight in model.parameters():
+        weight.requires_grad = value
+
+
+class CLIPGuidedStableDiffusion(DiffusionPipeline):
+    """CLIP guided stable diffusion based on the amazing repo by @crowsonkb and @Jack000
+    - https://github.com/Jack000/glid-3-xl
+    - https://github.dev/crowsonkb/k-diffusion
+    """
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        clip_model: CLIPModel,
+        tokenizer: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        scheduler: Union[PNDMScheduler, LMSDiscreteScheduler],
+        feature_extractor: CLIPFeatureExtractor,
+    ):
+        """Register the pipeline components and build the CLIP image preprocessing helpers.
+
+        The text encoder and the CLIP model have `requires_grad` switched off on all parameters.
+        """
+        super().__init__()
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            clip_model=clip_model,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+            feature_extractor=feature_extractor,
+        )
+
+        # Image preprocessing for CLIP is driven by the feature extractor's own statistics and size.
+        clip_input_size = feature_extractor.size
+        self.normalize = transforms.Normalize(mean=feature_extractor.image_mean, std=feature_extractor.image_std)
+        self.make_cutouts = MakeCutouts(clip_input_size)
+
+        for frozen_module in (self.text_encoder, self.clip_model):
+            set_requires_grad(frozen_module, False)
+
+    def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"):
+        """Turn on sliced attention in the UNet.
+
+        `"auto"` resolves to half of `unet.config.attention_head_dim`; any other value is handed to
+        `unet.set_attention_slice` unchanged.
+        """
+        # half the attention head size is usually a good trade-off between speed and memory
+        resolved = self.unet.config.attention_head_dim // 2 if slice_size == "auto" else slice_size
+        self.unet.set_attention_slice(resolved)
+
+    def disable_attention_slicing(self):
+        """Turn sliced attention off by calling `enable_attention_slicing` with `None`."""
+        self.enable_attention_slicing(None)
+
+    def freeze_vae(self):
+        """Set `requires_grad=False` on every VAE parameter."""
+        set_requires_grad(self.vae, False)
+
+    def unfreeze_vae(self):
+        """Set `requires_grad=True` on every VAE parameter."""
+        set_requires_grad(self.vae, True)
+
+    def freeze_unet(self):
+        """Set `requires_grad=False` on every UNet parameter."""
+        set_requires_grad(self.unet, False)
+
+    def unfreeze_unet(self):
+        """Set `requires_grad=True` on every UNet parameter."""
+        set_requires_grad(self.unet, True)
+
+    @torch.enable_grad()
+    def cond_fn(
+        self,
+        latents,
+        timestep,
+        index,
+        text_embeddings,
+        noise_pred_original,
+        text_embeddings_clip,
+        clip_guidance_scale,
+        num_cutouts,
+        use_cutouts=True,
+    ):
+        """Compute the CLIP-guidance correction for a single denoising step.
+
+        Re-runs the UNet on `latents` with gradients enabled, decodes an image estimate with the VAE,
+        measures its spherical distance to `text_embeddings_clip` in CLIP feature space, and
+        differentiates that loss with respect to `latents`.
+
+        Args:
+            latents: Current latents; a detached copy that requires grad is used internally.
+            timestep: Passed to the UNet; with `PNDMScheduler` it also indexes `scheduler.alphas_cumprod`.
+            index: Position in `scheduler.sigmas`; only read for `LMSDiscreteScheduler`.
+            text_embeddings: Encoder hidden states handed to the UNet.
+            noise_pred_original: Noise prediction computed by the caller; the returned prediction is based on it.
+            text_embeddings_clip: CLIP text features the image features are compared against.
+            clip_guidance_scale: Multiplier applied to the CLIP loss.
+            num_cutouts: Number of random crops taken when `use_cutouts` is true.
+            use_cutouts: Crop with `make_cutouts` when true, otherwise resize to `feature_extractor.size`.
+
+        Returns:
+            Tuple `(noise_pred, latents)`. With `LMSDiscreteScheduler` the latents are shifted by
+            `grads * sigma**2` and `noise_pred_original` is returned unchanged; with `PNDMScheduler` the
+            prediction is `noise_pred_original - sqrt(beta_prod_t) * grads` and the latents are the
+            detached copy.
+
+        Raises:
+            ValueError: If the scheduler is neither `PNDMScheduler` nor `LMSDiscreteScheduler`.
+        """
+        # `__call__` runs under `torch.no_grad()`, so gradients are re-enabled by the decorator above
+        # and the latents are turned into a fresh leaf to differentiate against.
+        latents = latents.detach().requires_grad_()
+
+        if isinstance(self.scheduler, LMSDiscreteScheduler):
+            sigma = self.scheduler.sigmas[index]
+            # the model input needs to be scaled to match the continuous ODE formulation in K-LMS
+            latent_model_input = latents / ((sigma**2 + 1) ** 0.5)
+        else:
+            latent_model_input = latents
+
+        # predict the noise residual
+        noise_pred = self.unet(latent_model_input, timestep, encoder_hidden_states=text_embeddings).sample
+
+        if isinstance(self.scheduler, PNDMScheduler):
+            alpha_prod_t = self.scheduler.alphas_cumprod[timestep]
+            beta_prod_t = 1 - alpha_prod_t
+            # compute predicted original sample from predicted noise also called
+            # "predicted x_0" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf
+            pred_original_sample = (latents - beta_prod_t ** (0.5) * noise_pred) / alpha_prod_t ** (0.5)
+
+            # blend the x_0 estimate with the current latents, weighted by sqrt(beta_prod_t)
+            fac = torch.sqrt(beta_prod_t)
+            sample = pred_original_sample * (fac) + latents * (1 - fac)
+        elif isinstance(self.scheduler, LMSDiscreteScheduler):
+            sigma = self.scheduler.sigmas[index]
+            sample = latents - sigma * noise_pred
+        else:
+            raise ValueError(f"scheduler type {type(self.scheduler)} not supported")
+
+        # same latent scaling and [0, 1] mapping that `__call__` applies before its final decode
+        sample = 1 / 0.18215 * sample
+        image = self.vae.decode(sample).sample
+        image = (image / 2 + 0.5).clamp(0, 1)
+
+        if use_cutouts:
+            image = self.make_cutouts(image, num_cutouts)
+        else:
+            image = transforms.Resize(self.feature_extractor.size)(image)
+        image = self.normalize(image).to(latents.dtype)
+
+        image_embeddings_clip = self.clip_model.get_image_features(image)
+        image_embeddings_clip = image_embeddings_clip / image_embeddings_clip.norm(p=2, dim=-1, keepdim=True)
+
+        if use_cutouts:
+            dists = spherical_dist_loss(image_embeddings_clip, text_embeddings_clip)
+            # regroup the per-crop distances as (num_cutouts, batch, -1) before reducing
+            dists = dists.view([num_cutouts, sample.shape[0], -1])
+            loss = dists.sum(2).mean(0).sum() * clip_guidance_scale
+        else:
+            loss = spherical_dist_loss(image_embeddings_clip, text_embeddings_clip).mean() * clip_guidance_scale
+
+        # negative gradient: the direction in latent space that lowers the CLIP loss
+        grads = -torch.autograd.grad(loss, latents)[0]
+
+        if isinstance(self.scheduler, LMSDiscreteScheduler):
+            latents = latents.detach() + grads * (sigma**2)
+            noise_pred = noise_pred_original
+        else:
+            noise_pred = noise_pred_original - torch.sqrt(beta_prod_t) * grads
+        return noise_pred, latents
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        prompt: Union[str, List[str]],
+        height: Optional[int] = 512,
+        width: Optional[int] = 512,
+        num_inference_steps: Optional[int] = 50,
+        guidance_scale: Optional[float] = 7.5,
+        num_images_per_prompt: Optional[int] = 1,
+        clip_guidance_scale: Optional[float] = 100,
+        clip_prompt: Optional[Union[str, List[str]]] = None,
+        num_cutouts: Optional[int] = 4,
+        use_cutouts: Optional[bool] = True,
+        generator: Optional[torch.Generator] = None,
+        latents: Optional[torch.FloatTensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+    ):
+        """Generate images from `prompt`, optionally steering each denoising step with CLIP guidance.
+
+        Args:
+            prompt: A prompt or list of prompts; the list length sets the batch size.
+            height: Image height in pixels; must be divisible by 8.
+            width: Image width in pixels; must be divisible by 8.
+            num_inference_steps: Number of steps passed to `scheduler.set_timesteps`.
+            guidance_scale: Classifier-free guidance weight; guidance is applied only when it is `> 1.0`.
+            num_images_per_prompt: How many times each prompt's embeddings are repeated.
+            clip_guidance_scale: Multiplier on the CLIP loss; CLIP guidance is skipped unless it is `> 0`.
+            clip_prompt: Optional separate prompt for the CLIP text features; defaults to `prompt`.
+            num_cutouts: Number of random crops used by `cond_fn` when `use_cutouts` is true.
+            use_cutouts: Forwarded to `cond_fn`; selects random crops instead of a plain resize.
+            generator: Optional `torch.Generator` used when sampling the initial latents.
+            latents: Optional pre-generated latents; their shape must match the expected latent shape.
+            output_type: `"pil"` converts the result with `numpy_to_pil`; otherwise a NumPy array is returned.
+            return_dict: When false, return the tuple `(image, None)` instead of an output object.
+
+        Returns:
+            `StableDiffusionPipelineOutput` with `nsfw_content_detected=None`, or `(image, None)` when
+            `return_dict` is false.
+
+        Raises:
+            ValueError: If `prompt` is neither `str` nor `list`, if `height`/`width` are not divisible
+                by 8, or if supplied `latents` have an unexpected shape.
+        """
+        if isinstance(prompt, str):
+            batch_size = 1
+        elif isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        # get prompt text embeddings
+        text_input = self.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=self.tokenizer.model_max_length,
+            truncation=True,
+            return_tensors="pt",
+        )
+        text_embeddings = self.text_encoder(text_input.input_ids.to(self.device))[0]
+        # duplicate text embeddings for each generation per prompt
+        text_embeddings = text_embeddings.repeat_interleave(num_images_per_prompt, dim=0)
+
+        if clip_guidance_scale > 0:
+            if clip_prompt is not None:
+                clip_text_input = self.tokenizer(
+                    clip_prompt,
+                    padding="max_length",
+                    max_length=self.tokenizer.model_max_length,
+                    truncation=True,
+                    return_tensors="pt",
+                ).input_ids.to(self.device)
+            else:
+                clip_text_input = text_input.input_ids.to(self.device)
+            text_embeddings_clip = self.clip_model.get_text_features(clip_text_input)
+            text_embeddings_clip = text_embeddings_clip / text_embeddings_clip.norm(p=2, dim=-1, keepdim=True)
+            # duplicate text embeddings clip for each generation per prompt
+            text_embeddings_clip = text_embeddings_clip.repeat_interleave(num_images_per_prompt, dim=0)
+
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance:
+            max_length = text_input.input_ids.shape[-1]
+            # One empty prompt per input prompt, so the unconditional half of the batch has the same
+            # size as the conditional half and lines up with the doubled latents below.
+            uncond_input = self.tokenizer(
+                [""] * batch_size, padding="max_length", max_length=max_length, return_tensors="pt"
+            )
+            uncond_embeddings = self.text_encoder(uncond_input.input_ids.to(self.device))[0]
+            # duplicate unconditional embeddings for each generation per prompt
+            uncond_embeddings = uncond_embeddings.repeat_interleave(num_images_per_prompt, dim=0)
+
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and text embeddings into a single batch
+            # to avoid doing two forward passes
+            text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
+
+        # get the initial random noise unless the user supplied it
+
+        # Unlike in other pipelines, latents need to be generated in the target device
+        # for 1-to-1 results reproducibility with the CompVis implementation.
+        # However this currently doesn't work in `mps`.
+        latents_shape = (batch_size * num_images_per_prompt, self.unet.in_channels, height // 8, width // 8)
+        latents_dtype = text_embeddings.dtype
+        if latents is None:
+            if self.device.type == "mps":
+                # randn does not exist on mps
+                latents = torch.randn(latents_shape, generator=generator, device="cpu", dtype=latents_dtype).to(
+                    self.device
+                )
+            else:
+                latents = torch.randn(latents_shape, generator=generator, device=self.device, dtype=latents_dtype)
+        else:
+            if latents.shape != latents_shape:
+                raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}")
+            latents = latents.to(self.device)
+
+        # set timesteps
+        accepts_offset = "offset" in set(inspect.signature(self.scheduler.set_timesteps).parameters.keys())
+        extra_set_kwargs = {}
+        if accepts_offset:
+            extra_set_kwargs["offset"] = 1
+
+        self.scheduler.set_timesteps(num_inference_steps, **extra_set_kwargs)
+
+        # Some schedulers like PNDM have timesteps as arrays
+        # It's more optimized to move all timesteps to correct device beforehand
+        timesteps_tensor = self.scheduler.timesteps.to(self.device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+
+        for i, t in enumerate(self.progress_bar(timesteps_tensor)):
+            # expand the latents if we are doing classifier free guidance
+            latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+            latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+            # predict the noise residual
+            noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample
+
+            # perform classifier free guidance
+            if do_classifier_free_guidance:
+                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+            # perform clip guidance
+            if clip_guidance_scale > 0:
+                # only the conditional half of the embeddings is used for the guidance pass
+                text_embeddings_for_guidance = (
+                    text_embeddings.chunk(2)[1] if do_classifier_free_guidance else text_embeddings
+                )
+                noise_pred, latents = self.cond_fn(
+                    latents,
+                    t,
+                    i,
+                    text_embeddings_for_guidance,
+                    noise_pred,
+                    text_embeddings_clip,
+                    clip_guidance_scale,
+                    num_cutouts,
+                    use_cutouts,
+                )
+
+            # compute the previous noisy sample x_t -> x_t-1
+            latents = self.scheduler.step(noise_pred, t, latents).prev_sample
+
+        # scale and decode the image latents with vae
+        latents = 1 / 0.18215 * latents
+        image = self.vae.decode(latents).sample
+
+        image = (image / 2 + 0.5).clamp(0, 1)
+        image = image.cpu().permute(0, 2, 3, 1).numpy()
+
+        if output_type == "pil":
+            image = self.numpy_to_pil(image)
+
+        if not return_dict:
+            return (image, None)
+
+        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=None)
--- a/examples/community/interpolate_stable_diffusion.py
+++ b/examples/community/interpolate_stable_diffusion.py
@@ -0,0 +1,524 @@
+import inspect
+import time
+from pathlib import Path
+from typing import Callable, List, Optional, Union
+
+import numpy as np
+import torch
+
+from diffusers.configuration_utils import FrozenDict
+from diffusers.models import AutoencoderKL, UNet2DConditionModel
+from diffusers.pipeline_utils import DiffusionPipeline
+from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
+from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
+from diffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
+from diffusers.utils import deprecate, logging
+from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+def slerp(t, v0, v1, DOT_THRESHOLD=0.9995):
+    """Spherically interpolate between two arrays `v0` and `v1`.
+
+    Args:
+        t: Interpolation weight; `0` yields `v0` and `1` yields `v1`.
+        v0: Start array, either a NumPy array or a torch tensor.
+        v1: End array, of the same kind as `v0`.
+        DOT_THRESHOLD: When the absolute cosine between `v0` and `v1` exceeds this value the
+            arrays are treated as (anti-)parallel and plain linear interpolation is used.
+
+    Returns:
+        The interpolated array. Torch inputs are computed through NumPy on the CPU and the result
+        is moved back to the device of `v0`; NumPy inputs return a NumPy array.
+    """
+
+    # Always bind the flag so NumPy inputs no longer hit an unbound local at the end.
+    inputs_are_torch = not isinstance(v0, np.ndarray)
+    if inputs_are_torch:
+        input_device = v0.device
+        v0 = v0.cpu().numpy()
+        v1 = v1.cpu().numpy()
+
+    # cosine of the angle between the two (flattened) arrays
+    dot = np.sum(v0 * v1 / (np.linalg.norm(v0) * np.linalg.norm(v1)))
+    if np.abs(dot) > DOT_THRESHOLD:
+        v2 = (1 - t) * v0 + t * v1
+    else:
+        theta_0 = np.arccos(dot)
+        sin_theta_0 = np.sin(theta_0)
+        theta_t = theta_0 * t
+        sin_theta_t = np.sin(theta_t)
+        s0 = np.sin(theta_0 - theta_t) / sin_theta_0
+        s1 = sin_theta_t / sin_theta_0
+        v2 = s0 * v0 + s1 * v1
+
+    if inputs_are_torch:
+        v2 = torch.from_numpy(v2).to(input_device)
+
+    return v2
+
+
+class StableDiffusionWalkPipeline(DiffusionPipeline):
+    r"""
+    Pipeline for text-to-image generation using Stable Diffusion.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        text_encoder ([`CLIPTextModel`]):
+            Frozen text-encoder. Stable Diffusion uses the text portion of
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
+            the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
+        tokenizer (`CLIPTokenizer`):
+            Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+        safety_checker ([`StableDiffusionSafetyChecker`]):
+            Classification module that estimates whether generated images could be considered offensive or harmful.
+            Please, refer to the [model card](https://huggingface.co/CompVis/stable-diffusion-v1-4) for details.
+        feature_extractor ([`CLIPFeatureExtractor`]):
+            Model that extracts features from generated images to be used as inputs for the `safety_checker`.
+    """
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler],
+        safety_checker: StableDiffusionSafetyChecker,
+        feature_extractor: CLIPFeatureExtractor,
+    ):
+        """Validate the scheduler config, warn about a missing safety checker and register all modules.
+
+        A scheduler whose config has `steps_offset != 1` triggers a deprecation message and has its
+        internal config replaced by a copy with `steps_offset` set to 1.
+        """
+        super().__init__()
+
+        if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1:
+            deprecation_message = (
+                f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
+                f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
+                "to update the config accordingly as leaving `steps_offset` might led to incorrect results"
+                " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
+                " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
+                " file"
+            )
+            deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False)
+            # patch the outdated value in place so the rest of the pipeline sees `steps_offset == 1`
+            new_config = dict(scheduler.config)
+            new_config["steps_offset"] = 1
+            scheduler._internal_dict = FrozenDict(new_config)
+
+        if safety_checker is None:
+            # `warning` is used instead of the deprecated `warn` alias
+            logger.warning(
+                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
+                " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
+                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
+                " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
+                " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
+                " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
+            )
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+        )
+
+    def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"):
+        r"""
+        Turn on sliced attention in the UNet, trading a small amount of speed for lower memory use.
+
+        Args:
+            slice_size (`str` or `int`, *optional*, defaults to `"auto"`):
+                `"auto"` resolves to half of `unet.config.attention_head_dim`. Any other value is passed to
+                `unet.set_attention_slice` unchanged.
+        """
+        # half the attention head size is usually a good trade-off between speed and memory
+        resolved = self.unet.config.attention_head_dim // 2 if slice_size == "auto" else slice_size
+        self.unet.set_attention_slice(resolved)
+
+    def disable_attention_slicing(self):
+        r"""
+        Disable sliced attention computation. If `enable_attention_slicing` was previously invoked, this method will go
+        back to computing attention in one step. Implemented by calling `enable_attention_slicing(None)`.
+        """
+        # set slice_size = `None` to disable `attention slicing`
+        self.enable_attention_slicing(None)
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        prompt: Optional[Union[str, List[str]]] = None,
+        height: int = 512,
+        width: int = 512,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 7.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[torch.Generator] = None,
+        latents: Optional[torch.FloatTensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+        callback_steps: Optional[int] = 1,
+        text_embeddings: Optional[torch.FloatTensor] = None,
+        **kwargs,
+    ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*, defaults to `None`):
+                The prompt or prompts to guide the image generation. If not provided, `text_embeddings` is required.
+            height (`int`, *optional*, defaults to 512):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to 512):
+                The width in pixels of the generated image.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `guidance_scale` is less than `1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`torch.Generator`, *optional*):
+                A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
+                deterministic.
+            latents (`torch.FloatTensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            callback (`Callable`, *optional*):
+                A function that will be called every `callback_steps` steps during inference. The function will be
+                called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function will be called. If not specified, the callback will be
+                called at every step.
+            text_embeddings (`torch.FloatTensor`, *optional*, defaults to `None`):
+                Pre-generated text embeddings to be used as inputs for image generation. Can be used in place of
+                `prompt` to avoid re-computing the embeddings. If not provided, the embeddings will be generated from
+                the supplied `prompt`.
+
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
+            When returning a tuple, the first element is a list with the generated images, and the second element is a
+            list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
+            (nsfw) content, according to the `safety_checker`.
+        """
+
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if (callback_steps is None) or (
+            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+        ):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+        if text_embeddings is None:
+            if isinstance(prompt, str):
+                batch_size = 1
+            elif isinstance(prompt, list):
+                batch_size = len(prompt)
+            else:
+                raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+            # get prompt text embeddings
+            text_inputs = self.tokenizer(
+                prompt,
+                padding="max_length",
+                max_length=self.tokenizer.model_max_length,
+                return_tensors="pt",
+            )
+            text_input_ids = text_inputs.input_ids
+
+            if text_input_ids.shape[-1] > self.tokenizer.model_max_length:
+                removed_text = self.tokenizer.batch_decode(text_input_ids[:, self.tokenizer.model_max_length :])
+                print(
+                    "The following part of your input was truncated because CLIP can only handle sequences up to"
+                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+                )
+                text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length]
+            text_embeddings = self.text_encoder(text_input_ids.to(self.device))[0]
+        else:
+            batch_size = text_embeddings.shape[0]
+
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        bs_embed, seq_len, _ = text_embeddings.shape
+        text_embeddings = text_embeddings.repeat(1, num_images_per_prompt, 1)
+        text_embeddings = text_embeddings.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""]
+            elif type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            max_length = self.tokenizer.model_max_length
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            uncond_embeddings = self.text_encoder(uncond_input.input_ids.to(self.device))[0]
+
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = uncond_embeddings.shape[1]
+            uncond_embeddings = uncond_embeddings.repeat(batch_size, num_images_per_prompt, 1)
+            uncond_embeddings = uncond_embeddings.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and text embeddings into a single batch
+            # to avoid doing two forward passes
+            text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
+
+        # get the initial random noise unless the user supplied it
+
+        # Unlike in other pipelines, latents need to be generated in the target device
+        # for 1-to-1 results reproducibility with the CompVis implementation.
+        # However this currently doesn't work in `mps`.
+        latents_shape = (batch_size * num_images_per_prompt, self.unet.in_channels, height // 8, width // 8)
+        latents_dtype = text_embeddings.dtype
+        if latents is None:
+            if self.device.type == "mps":
+                # randn does not exist on mps
+                latents = torch.randn(latents_shape, generator=generator, device="cpu", dtype=latents_dtype).to(
+                    self.device
+                )
+            else:
+                latents = torch.randn(latents_shape, generator=generator, device=self.device, dtype=latents_dtype)
+        else:
+            if latents.shape != latents_shape:
+                raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}")
+            latents = latents.to(self.device)
+
+        # set timesteps
+        self.scheduler.set_timesteps(num_inference_steps)
+
+        # Some schedulers like PNDM have timesteps as arrays
+        # It's more optimized to move all timesteps to correct device beforehand
+        timesteps_tensor = self.scheduler.timesteps.to(self.device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        for i, t in enumerate(self.progress_bar(timesteps_tensor)):
+            # expand the latents if we are doing classifier free guidance
+            latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+            latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+            # predict the noise residual
+            noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample
+
+            # perform guidance
+            if do_classifier_free_guidance:
+                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+            # compute the previous noisy sample x_t -> x_t-1
+            latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
+
+            # call the callback, if provided
+            if callback is not None and i % callback_steps == 0:
+                callback(i, t, latents)
+
+        latents = 1 / 0.18215 * latents
+        image = self.vae.decode(latents).sample
+
+        image = (image / 2 + 0.5).clamp(0, 1)
+
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloa16
+        image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+
+        if self.safety_checker is not None:
+            safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(
+                self.device
+            )
+            image, has_nsfw_concept = self.safety_checker(
+                images=image, clip_input=safety_checker_input.pixel_values.to(text_embeddings.dtype)
+            )
+        else:
+            has_nsfw_concept = None
+
+        if output_type == "pil":
+            image = self.numpy_to_pil(image)
+
+        if not return_dict:
+            return (image, has_nsfw_concept)
+
+        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
+
+    def embed_text(self, text):
+        """Tokenize `text` and return the first output of the text encoder.
+
+        The input is padded and truncated to `self.tokenizer.model_max_length` tokens, and the encoder is run
+        without gradient tracking on `self.device`.
+        """
+        tokenizer_kwargs = dict(
+            padding="max_length",
+            max_length=self.tokenizer.model_max_length,
+            truncation=True,
+            return_tensors="pt",
+        )
+        token_ids = self.tokenizer(text, **tokenizer_kwargs).input_ids
+        with torch.no_grad():
+            return self.text_encoder(token_ids.to(self.device))[0]
+
+    def get_noise(self, seed, dtype=torch.float32, height=512, width=512):
+        """Return the Gaussian noise latents that correspond to `seed`.
+
+        Args:
+            seed (`int`): Seed used for the random generator, so the same seed always yields the same noise.
+            dtype (`torch.dtype`, *optional*, defaults to `torch.float32`): Data type of the returned tensor.
+            height (`int`, *optional*, defaults to 512): Image height in pixels; the latent height is `height // 8`.
+            width (`int`, *optional*, defaults to 512): Image width in pixels; the latent width is `width // 8`.
+
+        Returns:
+            `torch.Tensor` of shape `(1, self.unet.in_channels, height // 8, width // 8)` located on `self.device`.
+        """
+        noise_shape = (1, self.unet.in_channels, height // 8, width // 8)
+        if self.device.type == "mps":
+            # Same fallback as in `__call__`: sample on the CPU and move the result to `mps` afterwards.
+            cpu_generator = torch.Generator(device="cpu").manual_seed(seed)
+            return torch.randn(noise_shape, generator=cpu_generator, device="cpu", dtype=dtype).to(self.device)
+
+        generator = torch.Generator(device=self.device).manual_seed(seed)
+        return torch.randn(noise_shape, generator=generator, device=self.device, dtype=dtype)
+
+    def walk(
+        self,
+        prompts: List[str],
+        seeds: List[int],
+        num_interpolation_steps: Optional[int] = 6,
+        output_dir: Optional[str] = "./dreams",
+        name: Optional[str] = None,
+        batch_size: Optional[int] = 1,
+        height: Optional[int] = 512,
+        width: Optional[int] = 512,
+        guidance_scale: Optional[float] = 7.5,
+        num_inference_steps: Optional[int] = 50,
+        eta: Optional[float] = 0.0,
+    ) -> List[str]:
+        """
+        Walks through a series of prompts and seeds, interpolating between them and saving the results to disk.
+
+        Args:
+            prompts (`List[str]`):
+                List of prompts to generate images for.
+            seeds (`List[int]`):
+                List of seeds corresponding to provided prompts. Must be the same length as prompts.
+            num_interpolation_steps (`int`, *optional*, defaults to 6):
+                Number of interpolation steps to take between prompts.
+            output_dir (`str`, *optional*, defaults to `./dreams`):
+                Directory to save the generated images to.
+            name (`str`, *optional*, defaults to `None`):
+                Subdirectory of `output_dir` to save the generated images to. If `None`, the name will
+                be the current time.
+            batch_size (`int`, *optional*, defaults to 1):
+                Number of images to generate at once.
+            height (`int`, *optional*, defaults to 512):
+                Height of the generated images.
+            width (`int`, *optional*, defaults to 512):
+                Width of the generated images.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+
+        Returns:
+            `List[str]`: List of paths to the generated images.
+
+        Raises:
+            `ValueError`: If `prompts` and `seeds` do not have the same length.
+        """
+        if len(prompts) != len(seeds):
+            raise ValueError(
+                f"Number of prompts and seeds must be equal. Got {len(prompts)} prompts and {len(seeds)} seeds"
+            )
+
+        # Frames are written to `<output_dir>/<name>`; the sub-directory defaults to a timestamp.
+        name = name or time.strftime("%Y%m%d-%H%M%S")
+        save_path = Path(output_dir) / name
+        save_path.mkdir(exist_ok=True, parents=True)
+
+        frame_idx = 0
+        frame_filepaths = []
+        # Iterate over consecutive (prompt, seed) pairs: each pair defines one interpolation segment.
+        for prompt_a, prompt_b, seed_a, seed_b in zip(prompts, prompts[1:], seeds, seeds[1:]):
+            # Embed Text
+            embed_a = self.embed_text(prompt_a)
+            embed_b = self.embed_text(prompt_b)
+
+            # Get Noise (created with the dtype of the text embeddings)
+            noise_dtype = embed_a.dtype
+            noise_a = self.get_noise(seed_a, noise_dtype, height, width)
+            noise_b = self.get_noise(seed_b, noise_dtype, height, width)
+
+            noise_batch, embeds_batch = None, None
+            T = np.linspace(0.0, 1.0, num_interpolation_steps)
+            for i, t in enumerate(T):
+                # The noise is interpolated with `slerp`, the text embeddings linearly.
+                noise = slerp(float(t), noise_a, noise_b)
+                embed = torch.lerp(embed_a, embed_b, t)
+
+                noise_batch = noise if noise_batch is None else torch.cat([noise_batch, noise], dim=0)
+                embeds_batch = embed if embeds_batch is None else torch.cat([embeds_batch, embed], dim=0)
+
+                # Run the pipeline once the batch is full or the last interpolation step has been reached.
+                batch_is_ready = embeds_batch.shape[0] == batch_size or i + 1 == T.shape[0]
+                if batch_is_ready:
+                    outputs = self(
+                        latents=noise_batch,
+                        text_embeddings=embeds_batch,
+                        height=height,
+                        width=width,
+                        guidance_scale=guidance_scale,
+                        eta=eta,
+                        num_inference_steps=num_inference_steps,
+                    )
+                    noise_batch, embeds_batch = None, None
+
+                    for image in outputs["images"]:
+                        frame_filepath = str(save_path / f"frame_{frame_idx:06d}.png")
+                        image.save(frame_filepath)
+                        frame_filepaths.append(frame_filepath)
+                        frame_idx += 1
+        return frame_filepaths
--- a/examples/community/one_step_unet.py
+++ b/examples/community/one_step_unet.py
@@ -0,0 +1,22 @@
+#!/usr/bin/env python3
+import torch
+
+from diffusers import DiffusionPipeline
+
+
+class UnetSchedulerOneForwardPipeline(DiffusionPipeline):
+    """Pipeline that performs one UNet forward pass and one scheduler step on random noise."""
+
+    def __init__(self, unet, scheduler):
+        super().__init__()
+
+        # Register both components on the pipeline.
+        self.register_modules(unet=unet, scheduler=scheduler)
+
+    def __call__(self):
+        """Sample random noise, run the UNet at timestep 1 and return the scheduler's `prev_sample`."""
+        noise_shape = (1, self.unet.in_channels, self.unet.sample_size, self.unet.sample_size)
+        sample = torch.randn(noise_shape)
+        step = 1
+
+        unet_prediction = self.unet(sample, step).sample
+        return self.scheduler.step(unet_prediction, step, sample).prev_sample
--- a/examples/community/stable_diffusion_mega.py
+++ b/examples/community/stable_diffusion_mega.py
@@ -0,0 +1,224 @@
+from typing import Any, Callable, Dict, List, Optional, Union
+
+import torch
+
+import PIL.Image
+from diffusers import (
+    AutoencoderKL,
+    DDIMScheduler,
+    DiffusionPipeline,
+    LMSDiscreteScheduler,
+    PNDMScheduler,
+    StableDiffusionImg2ImgPipeline,
+    StableDiffusionInpaintPipelineLegacy,
+    StableDiffusionPipeline,
+    UNet2DConditionModel,
+)
+from diffusers.configuration_utils import FrozenDict
+from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
+from diffusers.utils import deprecate, logging
+from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+class StableDiffusionMegaPipeline(DiffusionPipeline):
+    r"""
+    Pipeline that exposes text-to-image (`text2img`), image-to-image (`img2img`) and inpainting (`inpaint`)
+    generation with Stable Diffusion, re-using the same registered components for all three tasks.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        text_encoder ([`CLIPTextModel`]):
+            Frozen text-encoder. Stable Diffusion uses the text portion of
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
+            the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
+        tokenizer (`CLIPTokenizer`):
+            Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+        safety_checker ([`StableDiffusionSafetyChecker`]):
+            Classification module that estimates whether generated images could be considered offensive or harmful.
+            Please, refer to the [model card](https://huggingface.co/CompVis/stable-diffusion-v1-4) for details.
+        feature_extractor ([`CLIPFeatureExtractor`]):
+            Model that extracts features from generated images to be used as inputs for the `safety_checker`.
+    """
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler],
+        safety_checker: StableDiffusionSafetyChecker,
+        feature_extractor: CLIPFeatureExtractor,
+    ):
+        super().__init__()
+        # Outdated scheduler configs (`steps_offset != 1`) trigger a deprecation warning and are patched in place.
+        if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1:
+            deprecation_message = (
+                f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
+                f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
+                "to update the config accordingly as leaving `steps_offset` might led to incorrect results"
+                " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
+                " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
+                " file"
+            )
+            deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(scheduler.config)
+            new_config["steps_offset"] = 1
+            scheduler._internal_dict = FrozenDict(new_config)
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+        )
+
+    @property
+    def components(self) -> Dict[str, Any]:
+        # Maps every config key that does not start with "_" to the pipeline attribute of the same name. The
+        # result is passed as keyword arguments to the task-specific pipelines below.
+        return {k: getattr(self, k) for k in self.config.keys() if not k.startswith("_")}
+
+    def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"):
+        r"""
+        Enable sliced attention computation.
+
+        When this option is enabled, the attention module will split the input tensor in slices, to compute attention
+        in several steps. This is useful to save some memory in exchange for a small speed decrease.
+
+        Args:
+            slice_size (`str` or `int`, *optional*, defaults to `"auto"`):
+                When `"auto"`, halves the input to the attention heads, so attention will be computed in two steps. If
+                a number is provided, uses as many slices as `attention_head_dim // slice_size`. In this case,
+                `attention_head_dim` must be a multiple of `slice_size`.
+        """
+        if slice_size == "auto":
+            # half the attention head size is usually a good trade-off between
+            # speed and memory
+            slice_size = self.unet.config.attention_head_dim // 2
+        self.unet.set_attention_slice(slice_size)
+
+    def disable_attention_slicing(self):
+        r"""
+        Disable sliced attention computation. If `enable_attention_slicing` was previously invoked, this method will go
+        back to computing attention in one step.
+        """
+        # set slice_size = `None` to disable `attention slicing`
+        self.enable_attention_slicing(None)
+
+    @torch.no_grad()
+    def inpaint(
+        self,
+        prompt: Union[str, List[str]],
+        init_image: Union[torch.FloatTensor, PIL.Image.Image],
+        mask_image: Union[torch.FloatTensor, PIL.Image.Image],
+        strength: float = 0.8,
+        num_inference_steps: Optional[int] = 50,
+        guidance_scale: Optional[float] = 7.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: Optional[float] = 0.0,
+        generator: Optional[torch.Generator] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+        callback_steps: Optional[int] = 1,
+    ):
+        # Builds a `StableDiffusionInpaintPipelineLegacy` from this pipeline's components and forwards all arguments.
+        # For more information on how this function works, please see the documentation of
+        # `StableDiffusionInpaintPipelineLegacy`: https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion
+        return StableDiffusionInpaintPipelineLegacy(**self.components)(
+            prompt=prompt,
+            init_image=init_image,
+            mask_image=mask_image,
+            strength=strength,
+            num_inference_steps=num_inference_steps,
+            guidance_scale=guidance_scale,
+            negative_prompt=negative_prompt,
+            num_images_per_prompt=num_images_per_prompt,
+            eta=eta,
+            generator=generator,
+            output_type=output_type,
+            return_dict=return_dict,
+            callback=callback,
+            # forwarded like in `img2img` and `text2img`; it was previously accepted but silently dropped
+            callback_steps=callback_steps,
+        )
+
+    @torch.no_grad()
+    def img2img(
+        self,
+        prompt: Union[str, List[str]],
+        init_image: Union[torch.FloatTensor, PIL.Image.Image],
+        strength: float = 0.8,
+        num_inference_steps: Optional[int] = 50,
+        guidance_scale: Optional[float] = 7.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: Optional[float] = 0.0,
+        generator: Optional[torch.Generator] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+        callback_steps: Optional[int] = 1,
+        **kwargs,
+    ):
+        # Builds a `StableDiffusionImg2ImgPipeline` from this pipeline's components and forwards the named arguments.
+        # NOTE: extra `**kwargs` are accepted but not forwarded.
+        # For more information on how this function works, please see: https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion#diffusers.StableDiffusionImg2ImgPipeline
+        return StableDiffusionImg2ImgPipeline(**self.components)(
+            prompt=prompt,
+            init_image=init_image,
+            strength=strength,
+            num_inference_steps=num_inference_steps,
+            guidance_scale=guidance_scale,
+            negative_prompt=negative_prompt,
+            num_images_per_prompt=num_images_per_prompt,
+            eta=eta,
+            generator=generator,
+            output_type=output_type,
+            return_dict=return_dict,
+            callback=callback,
+            callback_steps=callback_steps,
+        )
+
+    @torch.no_grad()
+    def text2img(
+        self,
+        prompt: Union[str, List[str]],
+        height: int = 512,
+        width: int = 512,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 7.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[torch.Generator] = None,
+        latents: Optional[torch.FloatTensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+        callback_steps: Optional[int] = 1,
+    ):
+        # Builds a `StableDiffusionPipeline` from this pipeline's components and forwards all arguments.
+        # For more information on how this function works, please see: https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion#diffusers.StableDiffusionPipeline
+        return StableDiffusionPipeline(**self.components)(
+            prompt=prompt,
+            height=height,
+            width=width,
+            num_inference_steps=num_inference_steps,
+            guidance_scale=guidance_scale,
+            negative_prompt=negative_prompt,
+            num_images_per_prompt=num_images_per_prompt,
+            eta=eta,
+            generator=generator,
+            latents=latents,
+            output_type=output_type,
+            return_dict=return_dict,
+            callback=callback,
+            callback_steps=callback_steps,
+        )
--- a/examples/conftest.py
+++ b/examples/conftest.py
@@ -0,0 +1,45 @@
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# tests directory-specific settings - this file is run automatically
+# by pytest before any tests are run
+
+import sys
+import warnings
+from os.path import abspath, dirname, join
+
+
+# allow having multiple repository checkouts and not needing to remember to rerun
+# 'pip install -e .[dev]' when switching between checkouts and running tests.
+# This file lives in `<repo>/examples/`, so the repository root is two directory levels above `__file__`.
+git_repo_path = abspath(join(dirname(dirname(__file__)), "src"))
+sys.path.insert(1, git_repo_path)
+
+
+# silence FutureWarning warnings in tests since often we can't act on them until
+# they become normal warnings - i.e. the tests still need to test the current functionality
+warnings.simplefilter(action="ignore", category=FutureWarning)
+
+
+def pytest_addoption(parser):
+    """Pytest hook that delegates option registration to `pytest_addoption_shared` from `diffusers`."""
+    # Imported inside the hook rather than at module level, i.e. only after the `sys.path` tweak above has run.
+    from diffusers.utils.testing_utils import pytest_addoption_shared
+
+    pytest_addoption_shared(parser)
+
+
+def pytest_terminal_summary(terminalreporter):
+    """Pytest hook that generates reports when the `--make-reports` option has a truthy value.
+
+    The option value is passed on as the `id` of `pytest_terminal_summary_main`.
+    """
+    from diffusers.utils.testing_utils import pytest_terminal_summary_main
+
+    report_id = terminalreporter.config.getoption("--make-reports")
+    if not report_id:
+        return
+    pytest_terminal_summary_main(terminalreporter, id=report_id)
--- a/examples/dreambooth/README.md
+++ b/examples/dreambooth/README.md
@@ -0,0 +1,211 @@
+# DreamBooth training example
+
+[DreamBooth](https://arxiv.org/abs/2208.12242) is a method to personalize text2image models like stable diffusion given just a few (3~5) images of a subject.
+The `train_dreambooth.py` script shows how to implement the training procedure and adapt it for stable diffusion.
+
+
+## Running locally 
+### Installing the dependencies
+
+Before running the scripts, make sure to install the library's training dependencies:
+
+```bash
+pip install git+https://github.com/huggingface/diffusers.git
+pip install -U -r requirements.txt
+```
+
+And initialize an [🤗Accelerate](https://github.com/huggingface/accelerate/) environment with:
+
+```bash
+accelerate config
+```
+
+### Dog toy example
+
+You need to accept the model license before downloading or using the weights. In this example we'll use model version `v1-4`, so you'll need to visit [its card](https://huggingface.co/CompVis/stable-diffusion-v1-4), read the license and tick the checkbox if you agree. 
+
+You have to be a registered user in 🤗 Hugging Face Hub, and you'll also need to use an access token for the code to work. For more information on access tokens, please refer to [this section of the documentation](https://huggingface.co/docs/hub/security-tokens).
+
+Run the following command to authenticate your token
+
+```bash
+huggingface-cli login
+```
+
+If you have already cloned the repo, then you won't need to go through these steps.
+
+<br>
+
+Now let's get our dataset. Download images from [here](https://drive.google.com/drive/folders/1BO_dyz-p65qhBRRMRA4TbZ8qW4rB99JZ) and save them in a directory. This will be our training data.
+
+And launch the training using
+
+```bash
+export MODEL_NAME="CompVis/stable-diffusion-v1-4"
+export INSTANCE_DIR="path-to-instance-images"
+export OUTPUT_DIR="path-to-save-model"
+
+accelerate launch train_dreambooth.py \
+  --pretrained_model_name_or_path=$MODEL_NAME  \
+  --instance_data_dir=$INSTANCE_DIR \
+  --output_dir=$OUTPUT_DIR \
+  --instance_prompt="a photo of sks dog" \
+  --resolution=512 \
+  --train_batch_size=1 \
+  --gradient_accumulation_steps=1 \
+  --learning_rate=5e-6 \
+  --lr_scheduler="constant" \
+  --lr_warmup_steps=0 \
+  --max_train_steps=400
+```
+
+### Training with prior-preservation loss
+
+Prior-preservation is used to avoid overfitting and language-drift. Refer to the paper to learn more about it. For prior-preservation we first generate images using the model with a class prompt and then use those during training along with our data.
+According to the paper, it's recommended to generate `num_epochs * num_samples` images for prior-preservation. 200-300 works well for most cases.
+
+```bash
+export MODEL_NAME="CompVis/stable-diffusion-v1-4"
+export INSTANCE_DIR="path-to-instance-images"
+export CLASS_DIR="path-to-class-images"
+export OUTPUT_DIR="path-to-save-model"
+
+accelerate launch train_dreambooth.py \
+  --pretrained_model_name_or_path=$MODEL_NAME  \
+  --instance_data_dir=$INSTANCE_DIR \
+  --class_data_dir=$CLASS_DIR \
+  --output_dir=$OUTPUT_DIR \
+  --with_prior_preservation --prior_loss_weight=1.0 \
+  --instance_prompt="a photo of sks dog" \
+  --class_prompt="a photo of dog" \
+  --resolution=512 \
+  --train_batch_size=1 \
+  --gradient_accumulation_steps=1 \
+  --learning_rate=5e-6 \
+  --lr_scheduler="constant" \
+  --lr_warmup_steps=0 \
+  --num_class_images=200 \
+  --max_train_steps=800
+```
+
+### Training on a 16GB GPU:
+
+With the help of gradient checkpointing and the 8-bit optimizer from bitsandbytes, it's possible to train dreambooth on a 16GB GPU.
+
+Install `bitsandbytes` with `pip install bitsandbytes`
+
+```bash
+export MODEL_NAME="CompVis/stable-diffusion-v1-4"
+export INSTANCE_DIR="path-to-instance-images"
+export CLASS_DIR="path-to-class-images"
+export OUTPUT_DIR="path-to-save-model"
+
+accelerate launch train_dreambooth.py \
+  --pretrained_model_name_or_path=$MODEL_NAME  \
+  --instance_data_dir=$INSTANCE_DIR \
+  --class_data_dir=$CLASS_DIR \
+  --output_dir=$OUTPUT_DIR \
+  --with_prior_preservation --prior_loss_weight=1.0 \
+  --instance_prompt="a photo of sks dog" \
+  --class_prompt="a photo of dog" \
+  --resolution=512 \
+  --train_batch_size=1 \
+  --gradient_accumulation_steps=2 --gradient_checkpointing \
+  --use_8bit_adam \
+  --learning_rate=5e-6 \
+  --lr_scheduler="constant" \
+  --lr_warmup_steps=0 \
+  --num_class_images=200 \
+  --max_train_steps=800
+```
+
+### Training on a 8 GB GPU:
+
+By using [DeepSpeed](https://www.deepspeed.ai/) it's possible to offload some
+tensors from VRAM to either CPU or NVME, which allows training with less VRAM.
+
+DeepSpeed needs to be enabled with `accelerate config`. During configuration
+answer yes to "Do you want to use DeepSpeed?". With DeepSpeed stage 2, fp16
+mixed precision and offloading both parameters and optimizer state to cpu it's
+possible to train on under 8 GB VRAM with a drawback of requiring significantly
+more RAM (about 25 GB). See [documentation](https://huggingface.co/docs/accelerate/usage_guides/deepspeed) for more DeepSpeed configuration options.
+
+Changing the default Adam optimizer to DeepSpeed's special version of Adam
+`deepspeed.ops.adam.DeepSpeedCPUAdam` gives a substantial speedup but enabling
+it requires CUDA toolchain with the same version as pytorch. 8-bit optimizer
+does not seem to be compatible with DeepSpeed at the moment.
+
+```bash
+export MODEL_NAME="CompVis/stable-diffusion-v1-4"
+export INSTANCE_DIR="path-to-instance-images"
+export CLASS_DIR="path-to-class-images"
+export OUTPUT_DIR="path-to-save-model"
+
+accelerate launch train_dreambooth.py \
+  --pretrained_model_name_or_path=$MODEL_NAME \
+  --instance_data_dir=$INSTANCE_DIR \
+  --class_data_dir=$CLASS_DIR \
+  --output_dir=$OUTPUT_DIR \
+  --with_prior_preservation --prior_loss_weight=1.0 \
+  --instance_prompt="a photo of sks dog" \
+  --class_prompt="a photo of dog" \
+  --resolution=512 \
+  --train_batch_size=1 \
+  --gradient_accumulation_steps=1 --gradient_checkpointing \
+  --learning_rate=5e-6 \
+  --lr_scheduler="constant" \
+  --lr_warmup_steps=0 \
+  --num_class_images=200 \
+  --max_train_steps=800 \
+  --mixed_precision=fp16
+```
+
+### Fine-tune text encoder with the UNet.
+
+The script also allows you to fine-tune the `text_encoder` along with the `unet`. It's been observed experimentally that fine-tuning the `text_encoder` gives much better results, especially on faces.
+Pass the `--train_text_encoder` argument to the script to enable training `text_encoder`.
+
+___Note: Training text encoder requires more memory, with this option the training won't fit on 16GB GPU. It needs at least 24GB VRAM.___
+
+```bash
+export MODEL_NAME="CompVis/stable-diffusion-v1-4"
+export INSTANCE_DIR="path-to-instance-images"
+export CLASS_DIR="path-to-class-images"
+export OUTPUT_DIR="path-to-save-model"
+
+accelerate launch train_dreambooth.py \
+  --pretrained_model_name_or_path=$MODEL_NAME  \
+  --train_text_encoder \
+  --instance_data_dir=$INSTANCE_DIR \
+  --class_data_dir=$CLASS_DIR \
+  --output_dir=$OUTPUT_DIR \
+  --with_prior_preservation --prior_loss_weight=1.0 \
+  --instance_prompt="a photo of sks dog" \
+  --class_prompt="a photo of dog" \
+  --resolution=512 \
+  --train_batch_size=1 \
+  --use_8bit_adam \
+  --gradient_checkpointing \
+  --learning_rate=2e-6 \
+  --lr_scheduler="constant" \
+  --lr_warmup_steps=0 \
+  --num_class_images=200 \
+  --max_train_steps=800
+```
+
+## Inference
+
+Once you have trained a model using the above command, inference can be done simply using the `StableDiffusionPipeline`. Make sure to include the `identifier` (e.g. `sks` in the above example) in your prompt.
+
+```python
+from diffusers import StableDiffusionPipeline
+import torch
+
+model_id = "path-to-your-trained-model"
+pipe = StableDiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16).to("cuda")
+
+prompt = "A photo of sks dog in a bucket"
+image = pipe(prompt, num_inference_steps=50, guidance_scale=7.5).images[0]
+
+image.save("dog-bucket.png")
+```
--- a/examples/dreambooth/requirements.txt
+++ b/examples/dreambooth/requirements.txt
@@ -0,0 +1,6 @@
+accelerate
+torchvision
+transformers>=4.21.0
+ftfy
+tensorboard
+modelcards
--- a/examples/dreambooth/train_dreambooth.py
+++ b/examples/dreambooth/train_dreambooth.py
@@ -0,0 +1,623 @@
+import argparse
+import itertools
+import math
+import os
+from pathlib import Path
+from typing import Optional
+
+import torch
+import torch.nn.functional as F
+import torch.utils.checkpoint
+from torch.utils.data import Dataset
+
+from accelerate import Accelerator
+from accelerate.logging import get_logger
+from accelerate.utils import set_seed
+from diffusers import AutoencoderKL, DDPMScheduler, StableDiffusionPipeline, UNet2DConditionModel
+from diffusers.optimization import get_scheduler
+from huggingface_hub import HfFolder, Repository, whoami
+from PIL import Image
+from torchvision import transforms
+from tqdm.auto import tqdm
+from transformers import CLIPTextModel, CLIPTokenizer
+
+
+logger = get_logger(__name__)
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="Simple example of a training script.")
+    parser.add_argument(
+        "--pretrained_model_name_or_path",
+        type=str,
+        default=None,
+        required=True,
+        help="Path to pretrained model or model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--tokenizer_name",
+        type=str,
+        default=None,
+        help="Pretrained tokenizer name or path if not the same as model_name",
+    )
+    parser.add_argument(
+        "--instance_data_dir",
+        type=str,
+        default=None,
+        required=True,
+        help="A folder containing the training data of instance images.",
+    )
+    parser.add_argument(
+        "--class_data_dir",
+        type=str,
+        default=None,
+        required=False,
+        help="A folder containing the training data of class images.",
+    )
+    parser.add_argument(
+        "--instance_prompt",
+        type=str,
+        default=None,
+        help="The prompt with identifier specifying the instance",
+    )
+    parser.add_argument(
+        "--class_prompt",
+        type=str,
+        default=None,
+        help="The prompt to specify images in the same class as provided instance images.",
+    )
+    parser.add_argument(
+        "--with_prior_preservation",
+        default=False,
+        action="store_true",
+        help="Flag to add prior preservation loss.",
+    )
+    parser.add_argument("--prior_loss_weight", type=float, default=1.0, help="The weight of prior preservation loss.")
+    parser.add_argument(
+        "--num_class_images",
+        type=int,
+        default=100,
+        help=(
+            "Minimal class images for prior preservation loss. If not have enough images, additional images will be"
+            " sampled with class_prompt."
+        ),
+    )
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default="text-inversion-model",
+        help="The output directory where the model predictions and checkpoints will be written.",
+    )
+    parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
+    parser.add_argument(
+        "--resolution",
+        type=int,
+        default=512,
+        help=(
+            "The resolution for input images, all the images in the train/validation dataset will be resized to this"
+            " resolution"
+        ),
+    )
+    parser.add_argument(
+        "--center_crop", action="store_true", help="Whether to center crop images before resizing to resolution"
+    )
+    parser.add_argument("--train_text_encoder", action="store_true", help="Whether to train the text encoder")
+    parser.add_argument(
+        "--train_batch_size", type=int, default=4, help="Batch size (per device) for the training dataloader."
+    )
+    parser.add_argument(
+        "--sample_batch_size", type=int, default=4, help="Batch size (per device) for sampling images."
+    )
+    parser.add_argument("--num_train_epochs", type=int, default=1)
+    parser.add_argument(
+        "--max_train_steps",
+        type=int,
+        default=None,
+        help="Total number of training steps to perform.  If provided, overrides num_train_epochs.",
+    )
+    parser.add_argument(
+        "--gradient_accumulation_steps",
+        type=int,
+        default=1,
+        help="Number of updates steps to accumulate before performing a backward/update pass.",
+    )
+    parser.add_argument(
+        "--gradient_checkpointing",
+        action="store_true",
+        help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.",
+    )
+    parser.add_argument(
+        "--learning_rate",
+        type=float,
+        default=5e-6,
+        help="Initial learning rate (after the potential warmup period) to use.",
+    )
+    parser.add_argument(
+        "--scale_lr",
+        action="store_true",
+        default=False,
+        help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.",
+    )
+    parser.add_argument(
+        "--lr_scheduler",
+        type=str,
+        default="constant",
+        help=(
+            'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
+            ' "constant", "constant_with_warmup"]'
+        ),
+    )
+    parser.add_argument(
+        "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler."
+    )
+    parser.add_argument(
+        "--use_8bit_adam", action="store_true", help="Whether or not to use 8-bit Adam from bitsandbytes."
+    )
+    parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.")
+    parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer")
+    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
+    parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
+    parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.")
+    parser.add_argument(
+        "--hub_model_id",
+        type=str,
+        default=None,
+        help="The name of the repository to keep in sync with the local `output_dir`.",
+    )
+    parser.add_argument(
+        "--logging_dir",
+        type=str,
+        default="logs",
+        help=(
+            "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
+            " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."
+        ),
+    )
+    parser.add_argument(
+        "--mixed_precision",
+        type=str,
+        default="no",
+        choices=["no", "fp16", "bf16"],
+        help=(
+            "Whether to use mixed precision. Choose"
+            "between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >= 1.10."
+            "and an Nvidia Ampere GPU."
+        ),
+    )
+    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
+
+    args = parser.parse_args()
+    env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
+    if env_local_rank != -1 and env_local_rank != args.local_rank:
+        args.local_rank = env_local_rank
+
+    if args.instance_data_dir is None:
+        raise ValueError("You must specify a train data directory.")
+
+    if args.with_prior_preservation:
+        if args.class_data_dir is None:
+            raise ValueError("You must specify a data directory for class images.")
+        if args.class_prompt is None:
+            raise ValueError("You must specify prompt for class images.")
+
+    return args
+
+
+class DreamBoothDataset(Dataset):
+    """
+    A dataset to prepare the instance and class images with the prompts for fine-tuning the model.
+    It pre-processes the images and the tokenizes prompts.
+    """
+
+    def __init__(
+        self,
+        instance_data_root,
+        instance_prompt,
+        tokenizer,
+        class_data_root=None,
+        class_prompt=None,
+        size=512,
+        center_crop=False,
+    ):
+        self.size = size
+        self.center_crop = center_crop
+        self.tokenizer = tokenizer
+
+        self.instance_data_root = Path(instance_data_root)
+        if not self.instance_data_root.exists():
+            raise ValueError("Instance images root doesn't exists.")
+
+        self.instance_images_path = list(Path(instance_data_root).iterdir())
+        self.num_instance_images = len(self.instance_images_path)
+        self.instance_prompt = instance_prompt
+        self._length = self.num_instance_images
+
+        if class_data_root is not None:
+            self.class_data_root = Path(class_data_root)
+            self.class_data_root.mkdir(parents=True, exist_ok=True)
+            self.class_images_path = list(self.class_data_root.iterdir())
+            self.num_class_images = len(self.class_images_path)
+            self._length = max(self.num_class_images, self.num_instance_images)
+            self.class_prompt = class_prompt
+        else:
+            self.class_data_root = None
+
+        self.image_transforms = transforms.Compose(
+            [
+                transforms.Resize(size, interpolation=transforms.InterpolationMode.BILINEAR),
+                transforms.CenterCrop(size) if center_crop else transforms.RandomCrop(size),
+                transforms.ToTensor(),
+                transforms.Normalize([0.5], [0.5]),
+            ]
+        )
+
+    def __len__(self):
+        return self._length
+
+    def __getitem__(self, index):
+        example = {}
+        instance_image = Image.open(self.instance_images_path[index % self.num_instance_images])
+        if not instance_image.mode == "RGB":
+            instance_image = instance_image.convert("RGB")
+        example["instance_images"] = self.image_transforms(instance_image)
+        example["instance_prompt_ids"] = self.tokenizer(
+            self.instance_prompt,
+            padding="do_not_pad",
+            truncation=True,
+            max_length=self.tokenizer.model_max_length,
+        ).input_ids
+
+        if self.class_data_root:
+            class_image = Image.open(self.class_images_path[index % self.num_class_images])
+            if not class_image.mode == "RGB":
+                class_image = class_image.convert("RGB")
+            example["class_images"] = self.image_transforms(class_image)
+            example["class_prompt_ids"] = self.tokenizer(
+                self.class_prompt,
+                padding="do_not_pad",
+                truncation=True,
+                max_length=self.tokenizer.model_max_length,
+            ).input_ids
+
+        return example
+
+
+class PromptDataset(Dataset):
+    "A simple dataset to prepare the prompts to generate class images on multiple GPUs."
+
+    def __init__(self, prompt, num_samples):
+        self.prompt = prompt
+        self.num_samples = num_samples
+
+    def __len__(self):
+        return self.num_samples
+
+    def __getitem__(self, index):
+        example = {}
+        example["prompt"] = self.prompt
+        example["index"] = index
+        return example
+
+
+def get_full_repo_name(model_id: str, organization: Optional[str] = None, token: Optional[str] = None):
+    if token is None:
+        token = HfFolder.get_token()
+    if organization is None:
+        username = whoami(token)["name"]
+        return f"{username}/{model_id}"
+    else:
+        return f"{organization}/{model_id}"
+
+
+def main():
+    args = parse_args()
+    logging_dir = Path(args.output_dir, args.logging_dir)
+
+    accelerator = Accelerator(
+        gradient_accumulation_steps=args.gradient_accumulation_steps,
+        mixed_precision=args.mixed_precision,
+        log_with="tensorboard",
+        logging_dir=logging_dir,
+    )
+
+    # Currently, it's not possible to do gradient accumulation when training two models with accelerate.accumulate
+    # This will be enabled soon in accelerate. For now, we don't allow gradient accumulation when training two models.
+    # TODO (patil-suraj): Remove this check when gradient accumulation with two models is enabled in accelerate.
+    if args.train_text_encoder and args.gradient_accumulation_steps > 1 and accelerator.num_processes > 1:
+        raise ValueError(
+            "Gradient accumulation is not supported when training the text encoder in distributed training. "
+            "Please set gradient_accumulation_steps to 1. This feature will be supported in the future."
+        )
+
+    if args.seed is not None:
+        set_seed(args.seed)
+
+    if args.with_prior_preservation:
+        class_images_dir = Path(args.class_data_dir)
+        if not class_images_dir.exists():
+            class_images_dir.mkdir(parents=True)
+        cur_class_images = len(list(class_images_dir.iterdir()))
+
+        if cur_class_images < args.num_class_images:
+            torch_dtype = torch.float16 if accelerator.device.type == "cuda" else torch.float32
+            pipeline = StableDiffusionPipeline.from_pretrained(
+                args.pretrained_model_name_or_path, torch_dtype=torch_dtype
+            )
+            pipeline.set_progress_bar_config(disable=True)
+
+            num_new_images = args.num_class_images - cur_class_images
+            logger.info(f"Number of class images to sample: {num_new_images}.")
+
+            sample_dataset = PromptDataset(args.class_prompt, num_new_images)
+            sample_dataloader = torch.utils.data.DataLoader(sample_dataset, batch_size=args.sample_batch_size)
+
+            sample_dataloader = accelerator.prepare(sample_dataloader)
+            pipeline.to(accelerator.device)
+
+            for example in tqdm(
+                sample_dataloader, desc="Generating class images", disable=not accelerator.is_local_main_process
+            ):
+                images = pipeline(example["prompt"]).images
+
+                for i, image in enumerate(images):
+                    image.save(class_images_dir / f"{example['index'][i] + cur_class_images}.jpg")
+
+            del pipeline
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
+
+    # Handle the repository creation
+    if accelerator.is_main_process:
+        if args.push_to_hub:
+            if args.hub_model_id is None:
+                repo_name = get_full_repo_name(Path(args.output_dir).name, token=args.hub_token)
+            else:
+                repo_name = args.hub_model_id
+            repo = Repository(args.output_dir, clone_from=repo_name)
+
+            with open(os.path.join(args.output_dir, ".gitignore"), "w+") as gitignore:
+                if "step_*" not in gitignore:
+                    gitignore.write("step_*\n")
+                if "epoch_*" not in gitignore:
+                    gitignore.write("epoch_*\n")
+        elif args.output_dir is not None:
+            os.makedirs(args.output_dir, exist_ok=True)
+
+    # Load the tokenizer
+    if args.tokenizer_name:
+        tokenizer = CLIPTokenizer.from_pretrained(args.tokenizer_name)
+    elif args.pretrained_model_name_or_path:
+        tokenizer = CLIPTokenizer.from_pretrained(args.pretrained_model_name_or_path, subfolder="tokenizer")
+
+    # Load models and create wrapper for stable diffusion
+    text_encoder = CLIPTextModel.from_pretrained(args.pretrained_model_name_or_path, subfolder="text_encoder")
+    vae = AutoencoderKL.from_pretrained(args.pretrained_model_name_or_path, subfolder="vae")
+    unet = UNet2DConditionModel.from_pretrained(args.pretrained_model_name_or_path, subfolder="unet")
+
+    vae.requires_grad_(False)
+    if not args.train_text_encoder:
+        text_encoder.requires_grad_(False)
+
+    if args.gradient_checkpointing:
+        unet.enable_gradient_checkpointing()
+        if args.train_text_encoder:
+            text_encoder.gradient_checkpointing_enable()
+
+    if args.scale_lr:
+        args.learning_rate = (
+            args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * accelerator.num_processes
+        )
+
+    # Use 8-bit Adam for lower memory usage or to fine-tune the model in 16GB GPUs
+    if args.use_8bit_adam:
+        try:
+            import bitsandbytes as bnb
+        except ImportError:
+            raise ImportError(
+                "To use 8-bit Adam, please install the bitsandbytes library: `pip install bitsandbytes`."
+            )
+
+        optimizer_class = bnb.optim.AdamW8bit
+    else:
+        optimizer_class = torch.optim.AdamW
+
+    params_to_optimize = (
+        itertools.chain(unet.parameters(), text_encoder.parameters()) if args.train_text_encoder else unet.parameters()
+    )
+    optimizer = optimizer_class(
+        params_to_optimize,
+        lr=args.learning_rate,
+        betas=(args.adam_beta1, args.adam_beta2),
+        weight_decay=args.adam_weight_decay,
+        eps=args.adam_epsilon,
+    )
+
+    noise_scheduler = DDPMScheduler(
+        beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", num_train_timesteps=1000
+    )
+
+    train_dataset = DreamBoothDataset(
+        instance_data_root=args.instance_data_dir,
+        instance_prompt=args.instance_prompt,
+        class_data_root=args.class_data_dir if args.with_prior_preservation else None,
+        class_prompt=args.class_prompt,
+        tokenizer=tokenizer,
+        size=args.resolution,
+        center_crop=args.center_crop,
+    )
+
+    def collate_fn(examples):
+        input_ids = [example["instance_prompt_ids"] for example in examples]
+        pixel_values = [example["instance_images"] for example in examples]
+
+        # Concat class and instance examples for prior preservation.
+        # We do this to avoid doing two forward passes.
+        if args.with_prior_preservation:
+            input_ids += [example["class_prompt_ids"] for example in examples]
+            pixel_values += [example["class_images"] for example in examples]
+
+        pixel_values = torch.stack(pixel_values)
+        pixel_values = pixel_values.to(memory_format=torch.contiguous_format).float()
+
+        input_ids = tokenizer.pad({"input_ids": input_ids}, padding=True, return_tensors="pt").input_ids
+
+        batch = {
+            "input_ids": input_ids,
+            "pixel_values": pixel_values,
+        }
+        return batch
+
+    train_dataloader = torch.utils.data.DataLoader(
+        train_dataset, batch_size=args.train_batch_size, shuffle=True, collate_fn=collate_fn
+    )
+
+    # Scheduler and math around the number of training steps.
+    overrode_max_train_steps = False
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    if args.max_train_steps is None:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+        overrode_max_train_steps = True
+
+    lr_scheduler = get_scheduler(
+        args.lr_scheduler,
+        optimizer=optimizer,
+        num_warmup_steps=args.lr_warmup_steps * args.gradient_accumulation_steps,
+        num_training_steps=args.max_train_steps * args.gradient_accumulation_steps,
+    )
+
+    if args.train_text_encoder:
+        unet, text_encoder, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
+            unet, text_encoder, optimizer, train_dataloader, lr_scheduler
+        )
+    else:
+        unet, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
+            unet, optimizer, train_dataloader, lr_scheduler
+        )
+
+    weight_dtype = torch.float32
+    if args.mixed_precision == "fp16":
+        weight_dtype = torch.float16
+    elif args.mixed_precision == "bf16":
+        weight_dtype = torch.bfloat16
+
+    # Move text_encode and vae to gpu.
+    # For mixed precision training we cast the text_encoder and vae weights to half-precision
+    # as these models are only used for inference, keeping weights in full precision is not required.
+    vae.to(accelerator.device, dtype=weight_dtype)
+    if not args.train_text_encoder:
+        text_encoder.to(accelerator.device, dtype=weight_dtype)
+
+    # We need to recalculate our total training steps as the size of the training dataloader may have changed.
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    if overrode_max_train_steps:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+    # Afterwards we recalculate our number of training epochs
+    args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
+
+    # We need to initialize the trackers we use, and also store our configuration.
+    # The trackers initializes automatically on the main process.
+    if accelerator.is_main_process:
+        accelerator.init_trackers("dreambooth", config=vars(args))
+
+    # Train!
+    total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
+
+    logger.info("***** Running training *****")
+    logger.info(f"  Num examples = {len(train_dataset)}")
+    logger.info(f"  Num batches each epoch = {len(train_dataloader)}")
+    logger.info(f"  Num Epochs = {args.num_train_epochs}")
+    logger.info(f"  Instantaneous batch size per device = {args.train_batch_size}")
+    logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
+    logger.info(f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
+    logger.info(f"  Total optimization steps = {args.max_train_steps}")
+    # Only show the progress bar once on each machine.
+    progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process)
+    progress_bar.set_description("Steps")
+    global_step = 0
+
+    for epoch in range(args.num_train_epochs):
+        unet.train()
+        for step, batch in enumerate(train_dataloader):
+            with accelerator.accumulate(unet):
+                # Convert images to latent space
+                latents = vae.encode(batch["pixel_values"].to(dtype=weight_dtype)).latent_dist.sample()
+                latents = latents * 0.18215
+
+                # Sample noise that we'll add to the latents
+                noise = torch.randn_like(latents)
+                bsz = latents.shape[0]
+                # Sample a random timestep for each image
+                timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps, (bsz,), device=latents.device)
+                timesteps = timesteps.long()
+
+                # Add noise to the latents according to the noise magnitude at each timestep
+                # (this is the forward diffusion process)
+                noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)
+
+                # Get the text embedding for conditioning
+                encoder_hidden_states = text_encoder(batch["input_ids"])[0]
+
+                # Predict the noise residual
+                noise_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample
+
+                if args.with_prior_preservation:
+                    # Chunk the noise and noise_pred into two parts and compute the loss on each part separately.
+                    noise_pred, noise_pred_prior = torch.chunk(noise_pred, 2, dim=0)
+                    noise, noise_prior = torch.chunk(noise, 2, dim=0)
+
+                    # Compute instance loss
+                    loss = F.mse_loss(noise_pred.float(), noise.float(), reduction="none").mean([1, 2, 3]).mean()
+
+                    # Compute prior loss
+                    prior_loss = F.mse_loss(noise_pred_prior.float(), noise_prior.float(), reduction="mean")
+
+                    # Add the prior loss to the instance loss.
+                    loss = loss + args.prior_loss_weight * prior_loss
+                else:
+                    loss = F.mse_loss(noise_pred.float(), noise.float(), reduction="mean")
+
+                accelerator.backward(loss)
+                if accelerator.sync_gradients:
+                    params_to_clip = (
+                        itertools.chain(unet.parameters(), text_encoder.parameters())
+                        if args.train_text_encoder
+                        else unet.parameters()
+                    )
+                    accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm)
+                optimizer.step()
+                lr_scheduler.step()
+                optimizer.zero_grad()
+
+            # Checks if the accelerator has performed an optimization step behind the scenes
+            if accelerator.sync_gradients:
+                progress_bar.update(1)
+                global_step += 1
+
+            logs = {"loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]}
+            progress_bar.set_postfix(**logs)
+            accelerator.log(logs, step=global_step)
+
+            if global_step >= args.max_train_steps:
+                break
+
+        accelerator.wait_for_everyone()
+
+    # Create the pipeline using using the trained modules and save it.
+    if accelerator.is_main_process:
+        pipeline = StableDiffusionPipeline.from_pretrained(
+            args.pretrained_model_name_or_path,
+            unet=accelerator.unwrap_model(unet),
+            text_encoder=accelerator.unwrap_model(text_encoder),
+        )
+        pipeline.save_pretrained(args.output_dir)
+
+        if args.push_to_hub:
+            repo.push_to_hub(commit_message="End of training", blocking=False, auto_lfs_prune=True)
+
+    accelerator.end_training()
+
+
+if __name__ == "__main__":
+    main()
--- a/examples/inference/README.md
+++ b/examples/inference/README.md
@@ -0,0 +1,8 @@
+# Inference Examples
+
+**The inference examples folder is deprecated and will be removed in a future version**.
+**Officially supported inference examples can be found in the [Pipelines folder](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines)**.
+
+- For `Image-to-Image text-guided generation with Stable Diffusion`, please have a look at the official [Pipeline examples](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines#examples)
+- For `In-painting using Stable Diffusion`, please have a look at the official [Pipeline examples](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines#examples)
+- For `Tweak prompts reusing seeds and latents`, please have a look at the official [Pipeline examples](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines#examples)
--- a/examples/inference/image_to_image.py
+++ b/examples/inference/image_to_image.py
@@ -0,0 +1,9 @@
+import warnings
+
+from diffusers import StableDiffusionImg2ImgPipeline  # noqa F401
+
+
+warnings.warn(
+    "The `image_to_image.py` script is outdated. Please use directly `from diffusers import"
+    " StableDiffusionImg2ImgPipeline` instead."
+)
--- a/examples/inference/inpainting.py
+++ b/examples/inference/inpainting.py
@@ -0,0 +1,9 @@
+import warnings
+
+from diffusers import StableDiffusionInpaintPipeline as StableDiffusionInpaintPipeline  # noqa F401
+
+
+warnings.warn(
+    "The `inpainting.py` script is outdated. Please use directly `from diffusers import"
+    " StableDiffusionInpaintPipeline` instead."
+)
--- a/examples/test_examples.py
+++ b/examples/test_examples.py
@@ -0,0 +1,123 @@
+# coding=utf-8
+# Copyright 2022 HuggingFace Inc..
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import logging
+import os
+import shutil
+import subprocess
+import sys
+import tempfile
+import unittest
+from typing import List
+
+from accelerate.utils import write_basic_config
+from diffusers.utils import slow
+
+
+logging.basicConfig(level=logging.DEBUG)
+
+logger = logging.getLogger()
+
+
+# These utils relate to ensuring the right error message is received when running scripts
+class SubprocessCallException(Exception):
+    pass
+
+
+def run_command(command: List[str], return_stdout=False):
+    """
+    Execute `command` through `subprocess.check_output` with stderr merged into stdout. The decoded output is
+    returned only when `return_stdout` is set; a non-zero exit status is re-raised as `SubprocessCallException`.
+    """
+    try:
+        output = subprocess.check_output(command, stderr=subprocess.STDOUT)
+    except subprocess.CalledProcessError as e:
+        # Surface the captured output of the failed process in the exception message.
+        message = f"Command `{' '.join(command)}` failed with the following error:\n\n{e.output.decode()}"
+        raise SubprocessCallException(message) from e
+    if not return_stdout:
+        return None
+    # Decode only when the captured output actually offers `decode` (i.e. it is bytes-like).
+    return output.decode("utf-8") if hasattr(output, "decode") else output
+
+
+stream_handler = logging.StreamHandler(sys.stdout)
+logger.addHandler(stream_handler)
+
+
+class ExamplesTestsAccelerate(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        super().setUpClass()
+        cls._tmpdir = tempfile.mkdtemp()
+        cls.configPath = os.path.join(cls._tmpdir, "default_config.yml")
+        # Save a config file into the class-wide temp dir and pass it to every `accelerate launch` below.
+        write_basic_config(save_location=cls.configPath)
+        cls._launch_args = ["accelerate", "launch", "--config_file", cls.configPath]
+
+    @classmethod
+    def tearDownClass(cls):
+        super().tearDownClass()
+        shutil.rmtree(cls._tmpdir)
+
+    @slow
+    def test_train_unconditional(self):
+        with tempfile.TemporaryDirectory() as tmpdir:
+            test_args = f"""
+                examples/unconditional_image_generation/train_unconditional.py
+                --dataset_name huggan/few-shot-aurora
+                --resolution 64
+                --output_dir {tmpdir}
+                --train_batch_size 4
+                --num_epochs 1
+                --gradient_accumulation_steps 1
+                --learning_rate 1e-3
+                --lr_warmup_steps 5
+                --mixed_precision fp16
+                """.split()
+            # Run the script through `accelerate launch`; `run_command` raises if the process exits non-zero.
+            run_command(self._launch_args + test_args, return_stdout=True)
+            # save_pretrained smoke test: unet weights and scheduler config must exist in the output dir
+            self.assertTrue(os.path.isfile(os.path.join(tmpdir, "unet", "diffusion_pytorch_model.bin")))
+            self.assertTrue(os.path.isfile(os.path.join(tmpdir, "scheduler", "scheduler_config.json")))
+            # logging test: at least one entry must have been written under logs/train_unconditional
+            self.assertTrue(len(os.listdir(os.path.join(tmpdir, "logs", "train_unconditional"))) > 0)
+
+    @slow
+    def test_textual_inversion(self):
+        with tempfile.TemporaryDirectory() as tmpdir:
+            test_args = f"""
+                examples/textual_inversion/textual_inversion.py
+                --pretrained_model_name_or_path CompVis/stable-diffusion-v1-4
+                --train_data_dir docs/source/imgs
+                --learnable_property object
+                --placeholder_token <cat-toy>
+                --initializer_token toy
+                --resolution 64
+                --train_batch_size 1
+                --gradient_accumulation_steps 2
+                --max_train_steps 10
+                --learning_rate 5.0e-04
+                --scale_lr
+                --lr_scheduler constant
+                --lr_warmup_steps 0
+                --output_dir {tmpdir}
+                --mixed_precision fp16
+                """.split()
+            # Run the script through `accelerate launch`; `run_command` raises if the process exits non-zero.
+            run_command(self._launch_args + test_args)
+            # save_pretrained smoke test: the learned embeddings file must exist in the output dir
+            self.assertTrue(os.path.isfile(os.path.join(tmpdir, "learned_embeds.bin")))
--- a/examples/text_to_image/README.md
+++ b/examples/text_to_image/README.md
@@ -0,0 +1,101 @@
+# Stable Diffusion text-to-image fine-tuning
+
+The `train_text_to_image.py` script shows how to fine-tune a Stable Diffusion model on your own dataset.
+
+___Note___:
+
+___This script is experimental. The script fine-tunes the whole model, and oftentimes the model overfits and runs into issues like catastrophic forgetting. It's recommended to try different hyperparameters to get the best result on your dataset.___
+
+
+## Running locally 
+### Installing the dependencies
+
+Before running the scripts, make sure to install the library's training dependencies:
+
+```bash
+pip install git+https://github.com/huggingface/diffusers.git
+pip install -U -r requirements.txt
+```
+
+And initialize an [🤗Accelerate](https://github.com/huggingface/accelerate/) environment with:
+
+```bash
+accelerate config
+```
+
+### Pokemon example
+
+You need to accept the model license before downloading or using the weights. In this example we'll use model version `v1-4`, so you'll need to visit [its card](https://huggingface.co/CompVis/stable-diffusion-v1-4), read the license and tick the checkbox if you agree. 
+
+You have to be a registered user in 🤗 Hugging Face Hub, and you'll also need to use an access token for the code to work. For more information on access tokens, please refer to [this section of the documentation](https://huggingface.co/docs/hub/security-tokens).
+
+Run the following command to authenticate your token:
+
+```bash
+huggingface-cli login
+```
+
+If you have already cloned the repo, then you won't need to go through these steps.
+
+<br>
+
+#### Hardware
+With `gradient_checkpointing` and `mixed_precision` it should be possible to fine tune the model on a single 24GB GPU. For higher `batch_size` and faster training it's better to use GPUs with >30GB memory.
+
+```bash
+export MODEL_NAME="CompVis/stable-diffusion-v1-4"
+export dataset_name="lambdalabs/pokemon-blip-captions"
+
+accelerate launch train_text_to_image.py \
+  --pretrained_model_name_or_path=$MODEL_NAME \
+  --dataset_name=$dataset_name \
+  --use_ema \
+  --resolution=512 --center_crop --random_flip \
+  --train_batch_size=1 \
+  --gradient_accumulation_steps=4 \
+  --gradient_checkpointing \
+  --mixed_precision="fp16" \
+  --max_train_steps=15000 \
+  --learning_rate=1e-05 \
+  --max_grad_norm=1 \
+  --lr_scheduler="constant" --lr_warmup_steps=0 \
+  --output_dir="sd-pokemon-model" 
+```
+
+
+To run on your own training files, prepare the dataset according to the format required by `datasets`. You can find the instructions for how to do that in this [document](https://huggingface.co/docs/datasets/v2.4.0/en/image_load#imagefolder-with-metadata).
+If you wish to use custom loading logic, you should modify the script; we have left pointers for that in the training script.
+
+```bash
+export MODEL_NAME="CompVis/stable-diffusion-v1-4"
+export TRAIN_DIR="path_to_your_dataset"
+
+accelerate launch train_text_to_image.py \
+  --pretrained_model_name_or_path=$MODEL_NAME \
+  --train_data_dir=$TRAIN_DIR \
+  --use_ema \
+  --resolution=512 --center_crop --random_flip \
+  --train_batch_size=1 \
+  --gradient_accumulation_steps=4 \
+  --gradient_checkpointing \
+  --mixed_precision="fp16" \
+  --max_train_steps=15000 \
+  --learning_rate=1e-05 \
+  --max_grad_norm=1 \
+  --lr_scheduler="constant" --lr_warmup_steps=0 \
+  --output_dir="sd-pokemon-model"
+```
+
+Once the training is finished, the model will be saved in the `output_dir` specified in the command. In this example it's `sd-pokemon-model`. To load the fine-tuned model for inference, just pass that path to `StableDiffusionPipeline`:
+
+
+```python
+import torch
+from diffusers import StableDiffusionPipeline
+model_path = "path_to_saved_model"
+pipe = StableDiffusionPipeline.from_pretrained(model_path, torch_dtype=torch.float16)
+pipe.to("cuda")
+
+image = pipe(prompt="yoda").images[0]
+image.save("yoda-pokemon.png")
+```
--- a/examples/text_to_image/requirements.txt
+++ b/examples/text_to_image/requirements.txt
@@ -0,0 +1,7 @@
+diffusers==0.4.1
+accelerate
+torchvision
+transformers>=4.21.0
+ftfy
+tensorboard
+modelcards
--- a/examples/text_to_image/train_text_to_image.py
+++ b/examples/text_to_image/train_text_to_image.py
@@ -0,0 +1,627 @@
+import argparse
+import logging
+import math
+import os
+import random
+from pathlib import Path
+from typing import Iterable, Optional
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+import torch.utils.checkpoint
+
+from accelerate import Accelerator
+from accelerate.logging import get_logger
+from accelerate.utils import set_seed
+from datasets import load_dataset
+from diffusers import AutoencoderKL, DDPMScheduler, PNDMScheduler, StableDiffusionPipeline, UNet2DConditionModel
+from diffusers.optimization import get_scheduler
+from diffusers.pipelines.stable_diffusion import StableDiffusionSafetyChecker
+from huggingface_hub import HfFolder, Repository, whoami
+from torchvision import transforms
+from tqdm.auto import tqdm
+from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
+
+
+logger = get_logger(__name__)
+
+
+def parse_args():
+    """Parse the command-line arguments, honor the `LOCAL_RANK` env var and validate the data source options."""
+    parser = argparse.ArgumentParser(description="Simple example of a training script.")
+    parser.add_argument(
+        "--pretrained_model_name_or_path",
+        type=str,
+        default=None,
+        required=True,
+        help="Path to pretrained model or model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--dataset_name",
+        type=str,
+        default=None,
+        help=(
+            "The name of the Dataset (from the HuggingFace hub) to train on (could be your own, possibly private,"
+            " dataset). It can also be a path pointing to a local copy of a dataset in your filesystem,"
+            " or to a folder containing files that 🤗 Datasets can understand."
+        ),
+    )
+    parser.add_argument(
+        "--dataset_config_name",
+        type=str,
+        default=None,
+        help="The config of the Dataset, leave as None if there's only one config.",
+    )
+    parser.add_argument(
+        "--train_data_dir",
+        type=str,
+        default=None,
+        help=(
+            "A folder containing the training data. Folder contents must follow the structure described in"
+            " https://huggingface.co/docs/datasets/image_dataset#imagefolder. In particular, a `metadata.jsonl` file"
+            " must exist to provide the captions for the images. Ignored if `dataset_name` is specified."
+        ),
+    )
+    parser.add_argument(
+        "--image_column", type=str, default="image", help="The column of the dataset containing an image."
+    )
+    parser.add_argument(
+        "--caption_column",
+        type=str,
+        default="text",
+        help="The column of the dataset containing a caption or a list of captions.",
+    )
+    parser.add_argument(
+        "--max_train_samples",
+        type=int,
+        default=None,
+        help=(
+            "For debugging purposes or quicker training, truncate the number of training examples to this "
+            "value if set."
+        ),
+    )
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default="sd-model-finetuned",
+        help="The output directory where the model predictions and checkpoints will be written.",
+    )
+    parser.add_argument(
+        "--cache_dir",
+        type=str,
+        default=None,
+        help="The directory where the downloaded models and datasets will be stored.",
+    )
+    parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
+    parser.add_argument(
+        "--resolution",
+        type=int,
+        default=512,
+        help=(
+            "The resolution for input images, all the images in the train/validation dataset will be resized to this"
+            " resolution"
+        ),
+    )
+    parser.add_argument(
+        "--center_crop",
+        action="store_true",
+        help="Whether to center crop images before resizing to resolution (if not set, random crop will be used)",
+    )
+    parser.add_argument(
+        "--random_flip",
+        action="store_true",
+        help="whether to randomly flip images horizontally",
+    )
+    parser.add_argument(
+        "--train_batch_size", type=int, default=16, help="Batch size (per device) for the training dataloader."
+    )
+    parser.add_argument("--num_train_epochs", type=int, default=100)
+    parser.add_argument(
+        "--max_train_steps",
+        type=int,
+        default=None,
+        help="Total number of training steps to perform.  If provided, overrides num_train_epochs.",
+    )
+    parser.add_argument(
+        "--gradient_accumulation_steps",
+        type=int,
+        default=1,
+        help="Number of updates steps to accumulate before performing a backward/update pass.",
+    )
+    parser.add_argument(
+        "--gradient_checkpointing",
+        action="store_true",
+        help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.",
+    )
+    parser.add_argument(
+        "--learning_rate",
+        type=float,
+        default=1e-4,
+        help="Initial learning rate (after the potential warmup period) to use.",
+    )
+    parser.add_argument(
+        "--scale_lr",
+        action="store_true",
+        default=False,
+        help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.",
+    )
+    parser.add_argument(
+        "--lr_scheduler",
+        type=str,
+        default="constant",
+        help=(
+            'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
+            ' "constant", "constant_with_warmup"]'
+        ),
+    )
+    parser.add_argument(
+        "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler."
+    )
+    parser.add_argument(
+        "--use_8bit_adam", action="store_true", help="Whether or not to use 8-bit Adam from bitsandbytes."
+    )
+    parser.add_argument("--use_ema", action="store_true", help="Whether to use EMA model.")
+    parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.")
+    parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer")
+    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
+    parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
+    parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.")
+    parser.add_argument(
+        "--hub_model_id",
+        type=str,
+        default=None,
+        help="The name of the repository to keep in sync with the local `output_dir`.",
+    )
+    parser.add_argument(
+        "--logging_dir",
+        type=str,
+        default="logs",
+        help=(
+            "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory, resolved relative to"
+            " `--output_dir`."
+        ),
+    )
+    parser.add_argument(
+        "--mixed_precision",
+        type=str,
+        default="no",
+        choices=["no", "fp16", "bf16"],
+        help=(
+            "Whether to use mixed precision. Choose"
+            " between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >= 1.10"
+            " and an Nvidia Ampere GPU."
+        ),
+    )
+    parser.add_argument(
+        "--report_to",
+        type=str,
+        default="tensorboard",
+        help=(
+            'The integration to report the results and logs to. Supported platforms are `"tensorboard"`'
+            ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.'
+        ),
+    )
+    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
+
+    args = parser.parse_args()
+    env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
+    if env_local_rank != -1 and env_local_rank != args.local_rank:
+        args.local_rank = env_local_rank
+
+    # Sanity checks
+    if args.dataset_name is None and args.train_data_dir is None:
+        raise ValueError("Need either a dataset name or a training folder.")
+
+    return args
+
+
+def get_full_repo_name(model_id: str, organization: Optional[str] = None, token: Optional[str] = None):
+    """Return `<namespace>/<model_id>`, using `organization` or else the `name` that `whoami(token)` reports."""
+    if token is None:
+        token = HfFolder.get_token()
+    if organization is not None:
+        return f"{organization}/{model_id}"
+    username = whoami(token)["name"]
+    return f"{username}/{model_id}"
+
+
+dataset_name_mapping = {
+    "lambdalabs/pokemon-blip-captions": ("image", "text"),
+}
+
+
+# Adapted from torch-ema https://github.com/fadel/pytorch_ema/blob/master/torch_ema/ema.py#L14
+class EMAModel:
+    """
+    Exponential Moving Average of models weights; `decay` caps the step-dependent averaging coefficient.
+    """
+
+    def __init__(self, parameters: Iterable[torch.nn.Parameter], decay=0.9999):
+        parameters = list(parameters)
+        self.shadow_params = [p.clone().detach() for p in parameters]
+
+        self.decay = decay
+        self.optimization_step = 0
+
+    def get_decay(self, optimization_step):
+        """
+        Return `1 - d`, where `d = min(self.decay, (1 + step) / (10 + step))` is the decay used at this step.
+        """
+        value = (1 + optimization_step) / (10 + optimization_step)
+        return 1 - min(self.decay, value)
+
+    @torch.no_grad()
+    def step(self, parameters):
+        """Move every shadow parameter towards the current value of the matching entry of `parameters`."""
+        parameters = list(parameters)
+        self.optimization_step += 1
+        # Keep the coefficient local: storing it in `self.decay` would clobber the configured upper bound.
+        one_minus_decay = self.get_decay(self.optimization_step)
+
+        for s_param, param in zip(self.shadow_params, parameters):
+            if param.requires_grad:
+                s_param.sub_(one_minus_decay * (s_param - param))
+            else:
+                s_param.copy_(param)
+
+        torch.cuda.empty_cache()
+
+    def copy_to(self, parameters: Iterable[torch.nn.Parameter]) -> None:
+        """
+        Copy current averaged parameters into given collection of parameters.
+
+        Args:
+            parameters: Iterable of `torch.nn.Parameter`; the parameters to be
+                updated with the stored moving averages. They are matched to
+                the shadow parameters by position, so they must be passed in
+                the same order as at construction time.
+        """
+        parameters = list(parameters)
+        for s_param, param in zip(self.shadow_params, parameters):
+            param.data.copy_(s_param.data)
+
+    def to(self, device=None, dtype=None) -> None:
+        r"""Move the shadow parameters to `device`; floating-point ones are also cast to `dtype`.
+
+        Args:
+            device: like `device` argument to `torch.Tensor.to`
+        """
+        # .to() on the tensors handles None correctly
+        self.shadow_params = [
+            p.to(device=device, dtype=dtype) if p.is_floating_point() else p.to(device=device)
+            for p in self.shadow_params
+        ]
+
+
+def main():
+    args = parse_args()
+    logging_dir = os.path.join(args.output_dir, args.logging_dir)
+
+    accelerator = Accelerator(
+        gradient_accumulation_steps=args.gradient_accumulation_steps,
+        mixed_precision=args.mixed_precision,
+        log_with=args.report_to,
+        logging_dir=logging_dir,
+    )
+
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO,
+    )
+
+    # If passed along, set the training seed now.
+    if args.seed is not None:
+        set_seed(args.seed)
+
+    # Handle the repository creation
+    if accelerator.is_main_process:
+        if args.push_to_hub:
+            if args.hub_model_id is None:
+                repo_name = get_full_repo_name(Path(args.output_dir).name, token=args.hub_token)
+            else:
+                repo_name = args.hub_model_id
+            repo = Repository(args.output_dir, clone_from=repo_name)
+
+            with open(os.path.join(args.output_dir, ".gitignore"), "w+") as gitignore:
+                if "step_*" not in gitignore:
+                    gitignore.write("step_*\n")
+                if "epoch_*" not in gitignore:
+                    gitignore.write("epoch_*\n")
+        elif args.output_dir is not None:
+            os.makedirs(args.output_dir, exist_ok=True)
+
+    # Load models and create wrapper for stable diffusion
+    tokenizer = CLIPTokenizer.from_pretrained(args.pretrained_model_name_or_path, subfolder="tokenizer")
+    text_encoder = CLIPTextModel.from_pretrained(args.pretrained_model_name_or_path, subfolder="text_encoder")
+    vae = AutoencoderKL.from_pretrained(args.pretrained_model_name_or_path, subfolder="vae")
+    unet = UNet2DConditionModel.from_pretrained(args.pretrained_model_name_or_path, subfolder="unet")
+
+    # Freeze vae and text_encoder
+    vae.requires_grad_(False)
+    text_encoder.requires_grad_(False)
+
+    if args.gradient_checkpointing:
+        unet.enable_gradient_checkpointing()
+
+    if args.scale_lr:
+        args.learning_rate = (
+            args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * accelerator.num_processes
+        )
+
+    # Initialize the optimizer
+    if args.use_8bit_adam:
+        try:
+            import bitsandbytes as bnb
+        except ImportError:
+            raise ImportError(
+                "Please install bitsandbytes to use 8-bit Adam. You can do so by running `pip install bitsandbytes`"
+            )
+
+        optimizer_cls = bnb.optim.AdamW8bit
+    else:
+        optimizer_cls = torch.optim.AdamW
+
+    optimizer = optimizer_cls(
+        unet.parameters(),
+        lr=args.learning_rate,
+        betas=(args.adam_beta1, args.adam_beta2),
+        weight_decay=args.adam_weight_decay,
+        eps=args.adam_epsilon,
+    )
+
+    # TODO (patil-suraj): load scheduler using args
+    noise_scheduler = DDPMScheduler(
+        beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", num_train_timesteps=1000, tensor_format="pt"
+    )
+
+    # Get the datasets: you can either provide your own training and evaluation files (see below)
+    # or specify a Dataset from the hub (the dataset will be downloaded automatically from the datasets Hub).
+
+    # In distributed training, the load_dataset function guarantees that only one local process can concurrently
+    # download the dataset.
+    if args.dataset_name is not None:
+        # Downloading and loading a dataset from the hub.
+        dataset = load_dataset(
+            args.dataset_name,
+            args.dataset_config_name,
+            cache_dir=args.cache_dir,
+        )
+    else:
+        data_files = {}
+        if args.train_data_dir is not None:
+            data_files["train"] = os.path.join(args.train_data_dir, "**")
+        dataset = load_dataset(
+            "imagefolder",
+            data_files=data_files,
+            cache_dir=args.cache_dir,
+        )
+        # See more about loading custom images at
+        # https://huggingface.co/docs/datasets/v2.4.0/en/image_load#imagefolder
+
+    # Preprocessing the datasets.
+    # We need to tokenize inputs and targets.
+    column_names = dataset["train"].column_names
+
+    # 6. Get the column names for input/target.
+    dataset_columns = dataset_name_mapping.get(args.dataset_name, None)
+    if args.image_column is None:
+        image_column = dataset_columns[0] if dataset_columns is not None else column_names[0]
+    else:
+        image_column = args.image_column
+        if image_column not in column_names:
+            raise ValueError(
+                f"--image_column' value '{args.image_column}' needs to be one of: {', '.join(column_names)}"
+            )
+    if args.caption_column is None:
+        caption_column = dataset_columns[1] if dataset_columns is not None else column_names[1]
+    else:
+        caption_column = args.caption_column
+        if caption_column not in column_names:
+            raise ValueError(
+                f"--caption_column' value '{args.caption_column}' needs to be one of: {', '.join(column_names)}"
+            )
+
+    # Preprocessing the datasets.
+    # We need to tokenize input captions and transform the images.
+    def tokenize_captions(examples, is_train=True):
+        captions = []
+        for caption in examples[caption_column]:
+            if isinstance(caption, str):
+                captions.append(caption)
+            elif isinstance(caption, (list, np.ndarray)):
+                # take a random caption if there are multiple
+                captions.append(random.choice(caption) if is_train else caption[0])
+            else:
+                raise ValueError(
+                    f"Caption column `{caption_column}` should contain either strings or lists of strings."
+                )
+        inputs = tokenizer(captions, max_length=tokenizer.model_max_length, padding="do_not_pad", truncation=True)
+        input_ids = inputs.input_ids
+        return input_ids
+
+    train_transforms = transforms.Compose(
+        [
+            transforms.Resize((args.resolution, args.resolution), interpolation=transforms.InterpolationMode.BILINEAR),
+            transforms.CenterCrop(args.resolution) if args.center_crop else transforms.RandomCrop(args.resolution),
+            transforms.RandomHorizontalFlip() if args.random_flip else transforms.Lambda(lambda x: x),
+            transforms.ToTensor(),
+            transforms.Normalize([0.5], [0.5]),
+        ]
+    )
+
+    def preprocess_train(examples):
+        images = [image.convert("RGB") for image in examples[image_column]]
+        examples["pixel_values"] = [train_transforms(image) for image in images]
+        examples["input_ids"] = tokenize_captions(examples)
+
+        return examples
+
+    with accelerator.main_process_first():
+        if args.max_train_samples is not None:
+            dataset["train"] = dataset["train"].shuffle(seed=args.seed).select(range(args.max_train_samples))
+        # Set the training transforms
+        train_dataset = dataset["train"].with_transform(preprocess_train)
+
+    def collate_fn(examples):
+        pixel_values = torch.stack([example["pixel_values"] for example in examples])
+        pixel_values = pixel_values.to(memory_format=torch.contiguous_format).float()
+        input_ids = [example["input_ids"] for example in examples]
+        padded_tokens = tokenizer.pad({"input_ids": input_ids}, padding=True, return_tensors="pt")
+        return {
+            "pixel_values": pixel_values,
+            "input_ids": padded_tokens.input_ids,
+            "attention_mask": padded_tokens.attention_mask,
+        }
+
+    train_dataloader = torch.utils.data.DataLoader(
+        train_dataset, shuffle=True, collate_fn=collate_fn, batch_size=args.train_batch_size
+    )
+
+    # Scheduler and math around the number of training steps.
+    overrode_max_train_steps = False
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    if args.max_train_steps is None:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+        overrode_max_train_steps = True
+
+    lr_scheduler = get_scheduler(
+        args.lr_scheduler,
+        optimizer=optimizer,
+        num_warmup_steps=args.lr_warmup_steps * args.gradient_accumulation_steps,
+        num_training_steps=args.max_train_steps * args.gradient_accumulation_steps,
+    )
+
+    unet, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
+        unet, optimizer, train_dataloader, lr_scheduler
+    )
+
+    weight_dtype = torch.float32
+    if args.mixed_precision == "fp16":
+        weight_dtype = torch.float16
+    elif args.mixed_precision == "bf16":
+        weight_dtype = torch.bfloat16
+
+    # Move text_encoder and vae to gpu.
+    # For mixed precision training we cast the text_encoder and vae weights to half-precision
+    # as these models are only used for inference, keeping weights in full precision is not required.
+    text_encoder.to(accelerator.device, dtype=weight_dtype)
+    vae.to(accelerator.device, dtype=weight_dtype)
+
+    # Create EMA for the unet.
+    if args.use_ema:
+        ema_unet = EMAModel(unet.parameters())
+
+    # We need to recalculate our total training steps as the size of the training dataloader may have changed.
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    if overrode_max_train_steps:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+    # Afterwards we recalculate our number of training epochs
+    args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
+
+    # We need to initialize the trackers we use, and also store our configuration.
+    # The trackers are initialized automatically on the main process.
+    if accelerator.is_main_process:
+        accelerator.init_trackers("text2image-fine-tune", config=vars(args))
+
+    # Train!
+    total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
+
+    logger.info("***** Running training *****")
+    logger.info(f"  Num examples = {len(train_dataset)}")
+    logger.info(f"  Num Epochs = {args.num_train_epochs}")
+    logger.info(f"  Instantaneous batch size per device = {args.train_batch_size}")
+    logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
+    logger.info(f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
+    logger.info(f"  Total optimization steps = {args.max_train_steps}")
+
+    # Only show the progress bar once on each machine.
+    progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process)
+    progress_bar.set_description("Steps")
+    global_step = 0
+
+    for epoch in range(args.num_train_epochs):
+        unet.train()
+        train_loss = 0.0
+        for step, batch in enumerate(train_dataloader):
+            with accelerator.accumulate(unet):
+                # Convert images to latent space
+                latents = vae.encode(batch["pixel_values"].to(weight_dtype)).latent_dist.sample()
+                latents = latents * 0.18215
+
+                # Sample noise that we'll add to the latents
+                noise = torch.randn_like(latents)
+                bsz = latents.shape[0]
+                # Sample a random timestep for each image
+                timesteps = torch.randint(0, noise_scheduler.num_train_timesteps, (bsz,), device=latents.device)
+                timesteps = timesteps.long()
+
+                # Add noise to the latents according to the noise magnitude at each timestep
+                # (this is the forward diffusion process)
+                noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)
+
+                # Get the text embedding for conditioning
+                encoder_hidden_states = text_encoder(batch["input_ids"])[0]
+
+                # Predict the noise residual and compute loss
+                noise_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample
+                loss = F.mse_loss(noise_pred.float(), noise.float(), reduction="mean")
+
+                # Gather the losses across all processes for logging (if we use distributed training).
+                avg_loss = accelerator.gather(loss.repeat(args.train_batch_size)).mean()
+                train_loss += avg_loss.item() / args.gradient_accumulation_steps
+
+                # Backpropagate
+                accelerator.backward(loss)
+                if accelerator.sync_gradients:
+                    accelerator.clip_grad_norm_(unet.parameters(), args.max_grad_norm)
+                optimizer.step()
+                lr_scheduler.step()
+                optimizer.zero_grad()
+
+            # Checks if the accelerator has performed an optimization step behind the scenes
+            if accelerator.sync_gradients:
+                if args.use_ema:
+                    ema_unet.step(unet.parameters())
+                progress_bar.update(1)
+                global_step += 1
+                accelerator.log({"train_loss": train_loss}, step=global_step)
+                train_loss = 0.0
+
+            logs = {"step_loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]}
+            progress_bar.set_postfix(**logs)
+
+            if global_step >= args.max_train_steps:
+                break
+
+    # Create the pipeline using the trained modules and save it.
+    accelerator.wait_for_everyone()
+    if accelerator.is_main_process:
+        unet = accelerator.unwrap_model(unet)
+        if args.use_ema:
+            ema_unet.copy_to(unet.parameters())
+
+        pipeline = StableDiffusionPipeline(
+            text_encoder=text_encoder,
+            vae=vae,
+            unet=unet,
+            tokenizer=tokenizer,
+            scheduler=PNDMScheduler(
+                beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", skip_prk_steps=True
+            ),
+            safety_checker=StableDiffusionSafetyChecker.from_pretrained("CompVis/stable-diffusion-safety-checker"),
+            feature_extractor=CLIPFeatureExtractor.from_pretrained("openai/clip-vit-base-patch32"),
+        )
+        pipeline.save_pretrained(args.output_dir)
+
+        if args.push_to_hub:
+            repo.push_to_hub(commit_message="End of training", blocking=False, auto_lfs_prune=True)
+
+    accelerator.end_training()
+
+
+# Run the training entry point only when this file is executed as a script.
+if __name__ == "__main__":
+    main()
--- a/examples/textual_inversion/README.md
+++ b/examples/textual_inversion/README.md
@@ -0,0 +1,87 @@
+## Textual Inversion fine-tuning example
+
+[Textual inversion](https://arxiv.org/abs/2208.01618) is a method to personalize text2image models like stable diffusion on your own images using just 3-5 examples.
+The `textual_inversion.py` script shows how to implement the training procedure and adapt it for stable diffusion.
+
+## Running on Colab 
+
+Colab for training 
+[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/sd_textual_inversion_training.ipynb)
+
+Colab for inference
+[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/stable_conceptualizer_inference.ipynb)
+
+## Running locally 
+### Installing the dependencies
+
+Before running the scripts, make sure to install the library's training dependencies:
+
+```bash
+pip install diffusers"[training]" accelerate "transformers>=4.21.0"
+```
+
+And initialize an [🤗Accelerate](https://github.com/huggingface/accelerate/) environment with:
+
+```bash
+accelerate config
+```
+
+
+### Cat toy example
+
+You need to accept the model license before downloading or using the weights. In this example we'll use model version `v1-4`, so you'll need to visit [its card](https://huggingface.co/CompVis/stable-diffusion-v1-4), read the license and tick the checkbox if you agree. 
+
+You have to be a registered user in 🤗 Hugging Face Hub, and you'll also need to use an access token for the code to work. For more information on access tokens, please refer to [this section of the documentation](https://huggingface.co/docs/hub/security-tokens).
+
+Run the following command to authenticate your token
+
+```bash
+huggingface-cli login
+```
+
+If you have already cloned the repo, then you won't need to go through these steps. 
+
+<br>
+
+Now let's get our dataset. Download 3-4 images from [here](https://drive.google.com/drive/folders/1fmJMs25nxS_rSNqS5hTcRdLem_YQXbq5) and save them in a directory. This will be our training data.
+
+And launch the training using
+
+```bash
+export MODEL_NAME="CompVis/stable-diffusion-v1-4"
+export DATA_DIR="path-to-dir-containing-images"
+
+accelerate launch textual_inversion.py \
+  --pretrained_model_name_or_path=$MODEL_NAME \
+  --train_data_dir=$DATA_DIR \
+  --learnable_property="object" \
+  --placeholder_token="<cat-toy>" --initializer_token="toy" \
+  --resolution=512 \
+  --train_batch_size=1 \
+  --gradient_accumulation_steps=4 \
+  --max_train_steps=3000 \
+  --learning_rate=5.0e-04 --scale_lr \
+  --lr_scheduler="constant" \
+  --lr_warmup_steps=0 \
+  --output_dir="textual_inversion_cat"
+```
+
+A full training run takes ~1 hour on one V100 GPU.
+
+
+### Inference
+
+Once you have trained a model using the above command, inference can be done simply using the `StableDiffusionPipeline`. Make sure to include the `placeholder_token` in your prompt.
+
+```python
+import torch
+from diffusers import StableDiffusionPipeline
+
+model_id = "path-to-your-trained-model"
+pipe = StableDiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16).to("cuda")
+
+prompt = "A <cat-toy> backpack"
+
+image = pipe(prompt, num_inference_steps=50, guidance_scale=7.5).images[0]
+
+image.save("cat-backpack.png")
+```
--- a/examples/textual_inversion/requirements.txt
+++ b/examples/textual_inversion/requirements.txt
@@ -0,0 +1,3 @@
+accelerate
+torchvision
+transformers>=4.21.0
--- a/examples/textual_inversion/textual_inversion.py
+++ b/examples/textual_inversion/textual_inversion.py
@@ -0,0 +1,578 @@
+import argparse
+import itertools
+import math
+import os
+import random
+from pathlib import Path
+from typing import Optional
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+import torch.utils.checkpoint
+from torch.utils.data import Dataset
+
+import PIL
+from accelerate import Accelerator
+from accelerate.logging import get_logger
+from accelerate.utils import set_seed
+from diffusers import AutoencoderKL, DDPMScheduler, PNDMScheduler, StableDiffusionPipeline, UNet2DConditionModel
+from diffusers.optimization import get_scheduler
+from diffusers.pipelines.stable_diffusion import StableDiffusionSafetyChecker
+from huggingface_hub import HfFolder, Repository, whoami
+from PIL import Image
+from torchvision import transforms
+from tqdm.auto import tqdm
+from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
+
+
+logger = get_logger(__name__)
+
+
+def save_progress(text_encoder, placeholder_token_id, accelerator, args):
+    """Write the current embedding of the placeholder token to `learned_embeds.bin`.
+
+    The row at `placeholder_token_id` is read from the input-embedding matrix of the unwrapped
+    text encoder, detached, moved to CPU and saved inside `args.output_dir` as a one-entry dict
+    keyed by `args.placeholder_token`.
+    """
+    logger.info("Saving embeddings")
+    embedding_matrix = accelerator.unwrap_model(text_encoder).get_input_embeddings().weight
+    placeholder_embedding = embedding_matrix[placeholder_token_id].detach().cpu()
+    destination = os.path.join(args.output_dir, "learned_embeds.bin")
+    torch.save({args.placeholder_token: placeholder_embedding}, destination)
+
+
+def parse_args():
+    """Parse the command-line options of the textual-inversion training script.
+
+    Returns:
+        argparse.Namespace: the parsed options. `local_rank` is overridden by the `LOCAL_RANK`
+        environment variable when that variable is set and differs from the CLI value.
+
+    Raises:
+        ValueError: if no training data directory is available after parsing.
+    """
+    parser = argparse.ArgumentParser(description="Simple example of a training script.")
+    parser.add_argument(
+        "--save_steps",
+        type=int,
+        default=500,
+        help="Save learned_embeds.bin every X updates steps.",
+    )
+    parser.add_argument(
+        "--pretrained_model_name_or_path",
+        type=str,
+        default=None,
+        required=True,
+        help="Path to pretrained model or model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--tokenizer_name",
+        type=str,
+        default=None,
+        help="Pretrained tokenizer name or path if not the same as model_name",
+    )
+    parser.add_argument(
+        "--train_data_dir", type=str, default=None, required=True, help="A folder containing the training data."
+    )
+    parser.add_argument(
+        "--placeholder_token",
+        type=str,
+        default=None,
+        required=True,
+        help="A token to use as a placeholder for the concept.",
+    )
+    parser.add_argument(
+        "--initializer_token", type=str, default=None, required=True, help="A token to use as initializer word."
+    )
+    parser.add_argument("--learnable_property", type=str, default="object", help="Choose between 'object' and 'style'")
+    parser.add_argument("--repeats", type=int, default=100, help="How many times to repeat the training data.")
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default="text-inversion-model",
+        help="The output directory where the model predictions and checkpoints will be written.",
+    )
+    parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
+    parser.add_argument(
+        "--resolution",
+        type=int,
+        default=512,
+        help=(
+            "The resolution for input images, all the images in the train/validation dataset will be resized to this"
+            " resolution"
+        ),
+    )
+    parser.add_argument(
+        "--center_crop", action="store_true", help="Whether to center crop images before resizing to resolution"
+    )
+    parser.add_argument(
+        "--train_batch_size", type=int, default=16, help="Batch size (per device) for the training dataloader."
+    )
+    parser.add_argument("--num_train_epochs", type=int, default=100)
+    parser.add_argument(
+        "--max_train_steps",
+        type=int,
+        default=5000,
+        help="Total number of training steps to perform.  If provided, overrides num_train_epochs.",
+    )
+    parser.add_argument(
+        "--gradient_accumulation_steps",
+        type=int,
+        default=1,
+        help="Number of updates steps to accumulate before performing a backward/update pass.",
+    )
+    parser.add_argument(
+        "--learning_rate",
+        type=float,
+        default=1e-4,
+        help="Initial learning rate (after the potential warmup period) to use.",
+    )
+    # NOTE(review): `store_true` combined with `default=True` means this option is always on and
+    # passing `--scale_lr` changes nothing. Left as-is to keep current training behaviour; confirm
+    # whether the intended default is False.
+    parser.add_argument(
+        "--scale_lr",
+        action="store_true",
+        default=True,
+        help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.",
+    )
+    parser.add_argument(
+        "--lr_scheduler",
+        type=str,
+        default="constant",
+        help=(
+            'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
+            ' "constant", "constant_with_warmup"]'
+        ),
+    )
+    parser.add_argument(
+        "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler."
+    )
+    parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.")
+    parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer")
+    parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
+    parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.")
+    parser.add_argument(
+        "--hub_model_id",
+        type=str,
+        default=None,
+        help="The name of the repository to keep in sync with the local `output_dir`.",
+    )
+    parser.add_argument(
+        "--logging_dir",
+        type=str,
+        default="logs",
+        help=(
+            "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
+            " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."
+        ),
+    )
+    parser.add_argument(
+        "--mixed_precision",
+        type=str,
+        default="no",
+        choices=["no", "fp16", "bf16"],
+        # Adjacent literals are concatenated verbatim, so each piece needs its own separating space.
+        help=(
+            "Whether to use mixed precision. Choose "
+            "between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >= 1.10 "
+            "and an Nvidia Ampere GPU."
+        ),
+    )
+    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
+
+    args = parser.parse_args()
+
+    # The environment variable, when set, wins over the command-line value.
+    env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
+    if env_local_rank != -1 and env_local_rank != args.local_rank:
+        args.local_rank = env_local_rank
+
+    if args.train_data_dir is None:
+        raise ValueError("You must specify a train data directory.")
+
+    return args
+
+
+# Prompt templates used when learning an object concept. `TextualInversionDataset` picks one at
+# random per sample and fills `{}` with the placeholder token via `str.format`.
+imagenet_templates_small = [
+    "a photo of a {}",
+    "a rendering of a {}",
+    "a cropped photo of the {}",
+    "the photo of a {}",
+    "a photo of a clean {}",
+    "a photo of a dirty {}",
+    "a dark photo of the {}",
+    "a photo of my {}",
+    "a photo of the cool {}",
+    "a close-up photo of a {}",
+    "a bright photo of the {}",
+    "a cropped photo of a {}",
+    "a photo of the {}",
+    "a good photo of the {}",
+    "a photo of one {}",
+    "a close-up photo of the {}",
+    "a rendition of the {}",
+    "a photo of the clean {}",
+    "a rendition of a {}",
+    "a photo of a nice {}",
+    "a good photo of a {}",
+    "a photo of the nice {}",
+    "a photo of the small {}",
+    "a photo of the weird {}",
+    "a photo of the large {}",
+    "a photo of a cool {}",
+    "a photo of a small {}",
+]
+
+# Prompt templates used when learning a style (selected when `learnable_property == "style"`).
+# NOTE(review): "a cropped painting in the style of {}" and "a close-up painting in the style of {}"
+# each appear twice, so `random.choice` draws them twice as often as the others - confirm whether
+# that weighting is intended before deduplicating.
+imagenet_style_templates_small = [
+    "a painting in the style of {}",
+    "a rendering in the style of {}",
+    "a cropped painting in the style of {}",
+    "the painting in the style of {}",
+    "a clean painting in the style of {}",
+    "a dirty painting in the style of {}",
+    "a dark painting in the style of {}",
+    "a picture in the style of {}",
+    "a cool painting in the style of {}",
+    "a close-up painting in the style of {}",
+    "a bright painting in the style of {}",
+    "a cropped painting in the style of {}",
+    "a good painting in the style of {}",
+    "a close-up painting in the style of {}",
+    "a rendition in the style of {}",
+    "a nice painting in the style of {}",
+    "a small painting in the style of {}",
+    "a weird painting in the style of {}",
+    "a large painting in the style of {}",
+]
+
+
+class TextualInversionDataset(Dataset):
+    """Image/prompt dataset for textual-inversion training.
+
+    Every entry found in `data_root` is treated as a training image. Each item pairs one image
+    (optionally center-cropped, resized to `size` x `size`, randomly flipped and scaled to
+    [-1, 1]) with the token ids of a prompt built from a randomly chosen template that contains
+    `placeholder_token`.
+
+    Args:
+        data_root: Directory whose entries are all opened as images.
+        tokenizer: Tokenizer used to encode the prompts; must expose `model_max_length`.
+        learnable_property: `"style"` selects the style templates, any other value the object ones.
+        size: Side length, in pixels, of the square output image.
+        repeats: How many times the image list is repeated when `set == "train"`.
+        interpolation: One of `"linear"`, `"bilinear"`, `"bicubic"` or `"lanczos"`.
+        flip_p: Probability of applying a horizontal flip.
+        set: Split name; only `"train"` applies `repeats`.
+        placeholder_token: String substituted into the prompt templates.
+        center_crop: Whether to crop the largest centered square before resizing.
+    """
+
+    def __init__(
+        self,
+        data_root,
+        tokenizer,
+        learnable_property="object",  # [object, style]
+        size=512,
+        repeats=100,
+        interpolation="bicubic",
+        flip_p=0.5,
+        set="train",
+        placeholder_token="*",
+        center_crop=False,
+    ):
+        self.data_root = data_root
+        self.tokenizer = tokenizer
+        self.learnable_property = learnable_property
+        self.size = size
+        self.placeholder_token = placeholder_token
+        self.center_crop = center_crop
+        self.flip_p = flip_p
+
+        self.image_paths = [os.path.join(self.data_root, file_path) for file_path in os.listdir(self.data_root)]
+
+        self.num_images = len(self.image_paths)
+        self._length = self.num_images
+
+        if set == "train":
+            self._length = self.num_images * repeats
+
+        # `PIL.Image.LINEAR` was only an alias of `PIL.Image.BILINEAR` and was removed in Pillow 10.
+        # Because this dict is built eagerly, referencing the alias would raise AttributeError for
+        # every interpolation choice on newer Pillow, so "linear" maps to BILINEAR directly.
+        self.interpolation = {
+            "linear": PIL.Image.BILINEAR,
+            "bilinear": PIL.Image.BILINEAR,
+            "bicubic": PIL.Image.BICUBIC,
+            "lanczos": PIL.Image.LANCZOS,
+        }[interpolation]
+
+        self.templates = imagenet_style_templates_small if learnable_property == "style" else imagenet_templates_small
+        self.flip_transform = transforms.RandomHorizontalFlip(p=self.flip_p)
+
+    def __len__(self):
+        """Return the number of samples per epoch (image count, times `repeats` for training)."""
+        return self._length
+
+    def __getitem__(self, i):
+        """Return a dict with `input_ids` (prompt token ids) and `pixel_values` (CHW float image)."""
+        example = {}
+
+        # Indices wrap around the image list so that `repeats` can extend the epoch length.
+        # The pixels are copied into a numpy array inside the `with` block so the underlying
+        # file handle is released as soon as the image has been read.
+        with Image.open(self.image_paths[i % self.num_images]) as opened_image:
+            rgb_image = opened_image if opened_image.mode == "RGB" else opened_image.convert("RGB")
+            # default to score-sde preprocessing
+            img = np.array(rgb_image).astype(np.uint8)
+
+        placeholder_string = self.placeholder_token
+        text = random.choice(self.templates).format(placeholder_string)
+
+        example["input_ids"] = self.tokenizer(
+            text,
+            padding="max_length",
+            truncation=True,
+            max_length=self.tokenizer.model_max_length,
+            return_tensors="pt",
+        ).input_ids[0]
+
+        if self.center_crop:
+            # Keep the largest square centered in the image.
+            h, w = img.shape[0], img.shape[1]
+            crop = min(h, w)
+            img = img[(h - crop) // 2 : (h + crop) // 2, (w - crop) // 2 : (w + crop) // 2]
+
+        image = Image.fromarray(img)
+        image = image.resize((self.size, self.size), resample=self.interpolation)
+
+        image = self.flip_transform(image)
+        image = np.array(image).astype(np.uint8)
+        # Map uint8 [0, 255] to float32 [-1, 1].
+        image = (image / 127.5 - 1.0).astype(np.float32)
+
+        # HWC -> CHW
+        example["pixel_values"] = torch.from_numpy(image).permute(2, 0, 1)
+        return example
+
+
+def get_full_repo_name(model_id: str, organization: Optional[str] = None, token: Optional[str] = None):
+    """Build the Hub repository id `<namespace>/<model_id>`.
+
+    The namespace is `organization` when given; otherwise it is the `name` field returned by
+    `whoami` for `token`. A missing `token` is first replaced by the one stored in `HfFolder`.
+    """
+    if token is None:
+        token = HfFolder.get_token()
+    namespace = organization if organization is not None else whoami(token)["name"]
+    return f"{namespace}/{model_id}"
+
+
+def freeze_params(params):
+    """Turn off gradient tracking for every parameter yielded by `params`."""
+    for parameter in params:
+        parameter.requires_grad = False
+
+
+def main():
+    """Train a single new token embedding of the Stable Diffusion text encoder (textual inversion).
+
+    Steps: parse the CLI arguments, set up Accelerate and (optionally) the Hub repository, add the
+    placeholder token to the tokenizer and initialise its embedding from the initializer token,
+    freeze everything except the token-embedding matrix, run the noise-prediction training loop,
+    then save a `StableDiffusionPipeline` and the learned embedding to `args.output_dir`.
+
+    Raises:
+        ValueError: if the placeholder token already exists in the tokenizer, or if the
+            initializer token does not map to exactly one token id.
+    """
+    args = parse_args()
+    logging_dir = os.path.join(args.output_dir, args.logging_dir)
+
+    accelerator = Accelerator(
+        gradient_accumulation_steps=args.gradient_accumulation_steps,
+        mixed_precision=args.mixed_precision,
+        log_with="tensorboard",
+        logging_dir=logging_dir,
+    )
+
+    # If passed along, set the training seed now.
+    if args.seed is not None:
+        set_seed(args.seed)
+
+    # Handle the repository creation
+    if accelerator.is_main_process:
+        if args.push_to_hub:
+            if args.hub_model_id is None:
+                repo_name = get_full_repo_name(Path(args.output_dir).name, token=args.hub_token)
+            else:
+                repo_name = args.hub_model_id
+            repo = Repository(args.output_dir, clone_from=repo_name)
+
+            # Append the checkpoint patterns to .gitignore without discarding what the cloned repo
+            # already lists. (Opening with "w+" would truncate the file, and a membership test on
+            # the file object never matches because lines keep their trailing newline.)
+            gitignore_path = os.path.join(args.output_dir, ".gitignore")
+            existing_gitignore = ""
+            if os.path.exists(gitignore_path):
+                with open(gitignore_path) as gitignore:
+                    existing_gitignore = gitignore.read()
+            known_patterns = existing_gitignore.splitlines()
+            missing_patterns = [pattern for pattern in ("step_*", "epoch_*") if pattern not in known_patterns]
+            if missing_patterns:
+                with open(gitignore_path, "a") as gitignore:
+                    if existing_gitignore and not existing_gitignore.endswith("\n"):
+                        gitignore.write("\n")
+                    for pattern in missing_patterns:
+                        gitignore.write(f"{pattern}\n")
+        elif args.output_dir is not None:
+            os.makedirs(args.output_dir, exist_ok=True)
+
+    # Load the tokenizer and add the placeholder token as an additional special token
+    if args.tokenizer_name:
+        tokenizer = CLIPTokenizer.from_pretrained(args.tokenizer_name)
+    elif args.pretrained_model_name_or_path:
+        tokenizer = CLIPTokenizer.from_pretrained(args.pretrained_model_name_or_path, subfolder="tokenizer")
+
+    # Add the placeholder token in tokenizer
+    num_added_tokens = tokenizer.add_tokens(args.placeholder_token)
+    if num_added_tokens == 0:
+        raise ValueError(
+            f"The tokenizer already contains the token {args.placeholder_token}. Please pass a different"
+            " `placeholder_token` that is not already in the tokenizer."
+        )
+
+    # Convert the initializer_token, placeholder_token to ids
+    token_ids = tokenizer.encode(args.initializer_token, add_special_tokens=False)
+    # The initializer must map to exactly one id: several ids mean a multi-token word, and an empty
+    # result would otherwise surface as an opaque IndexError on the next line.
+    if len(token_ids) != 1:
+        raise ValueError("The initializer token must be a single token.")
+
+    initializer_token_id = token_ids[0]
+    placeholder_token_id = tokenizer.convert_tokens_to_ids(args.placeholder_token)
+
+    # Load models and create wrapper for stable diffusion
+    text_encoder = CLIPTextModel.from_pretrained(args.pretrained_model_name_or_path, subfolder="text_encoder")
+    vae = AutoencoderKL.from_pretrained(args.pretrained_model_name_or_path, subfolder="vae")
+    unet = UNet2DConditionModel.from_pretrained(args.pretrained_model_name_or_path, subfolder="unet")
+
+    # Resize the token embeddings as we are adding new special tokens to the tokenizer
+    text_encoder.resize_token_embeddings(len(tokenizer))
+
+    # Initialise the newly added placeholder token with the embeddings of the initializer token
+    token_embeds = text_encoder.get_input_embeddings().weight.data
+    token_embeds[placeholder_token_id] = token_embeds[initializer_token_id]
+
+    # Freeze vae and unet
+    freeze_params(vae.parameters())
+    freeze_params(unet.parameters())
+    # Freeze all parameters except for the token embeddings in text encoder
+    params_to_freeze = itertools.chain(
+        text_encoder.text_model.encoder.parameters(),
+        text_encoder.text_model.final_layer_norm.parameters(),
+        text_encoder.text_model.embeddings.position_embedding.parameters(),
+    )
+    freeze_params(params_to_freeze)
+
+    if args.scale_lr:
+        args.learning_rate = (
+            args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * accelerator.num_processes
+        )
+
+    # Initialize the optimizer
+    optimizer = torch.optim.AdamW(
+        text_encoder.get_input_embeddings().parameters(),  # only optimize the embeddings
+        lr=args.learning_rate,
+        betas=(args.adam_beta1, args.adam_beta2),
+        weight_decay=args.adam_weight_decay,
+        eps=args.adam_epsilon,
+    )
+
+    # TODO (patil-suraj): load scheduler using args
+    noise_scheduler = DDPMScheduler(
+        beta_start=0.00085,
+        beta_end=0.012,
+        beta_schedule="scaled_linear",
+        num_train_timesteps=1000,
+    )
+
+    train_dataset = TextualInversionDataset(
+        data_root=args.train_data_dir,
+        tokenizer=tokenizer,
+        size=args.resolution,
+        placeholder_token=args.placeholder_token,
+        repeats=args.repeats,
+        learnable_property=args.learnable_property,
+        center_crop=args.center_crop,
+        set="train",
+    )
+    train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=args.train_batch_size, shuffle=True)
+
+    # Scheduler and math around the number of training steps.
+    overrode_max_train_steps = False
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    if args.max_train_steps is None:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+        overrode_max_train_steps = True
+
+    lr_scheduler = get_scheduler(
+        args.lr_scheduler,
+        optimizer=optimizer,
+        num_warmup_steps=args.lr_warmup_steps * args.gradient_accumulation_steps,
+        num_training_steps=args.max_train_steps * args.gradient_accumulation_steps,
+    )
+
+    text_encoder, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
+        text_encoder, optimizer, train_dataloader, lr_scheduler
+    )
+
+    # Move vae and unet to device
+    vae.to(accelerator.device)
+    unet.to(accelerator.device)
+
+    # Keep vae and unet in eval mode as we don't train these
+    vae.eval()
+    unet.eval()
+
+    # Only the placeholder row of the embedding matrix may change. The mask selects every other row
+    # (it is loop-invariant, so it is built once), and the snapshot is used to restore those rows
+    # after each optimizer step.
+    index_no_updates = torch.arange(len(tokenizer)) != placeholder_token_id
+    orig_embeds_params = accelerator.unwrap_model(text_encoder).get_input_embeddings().weight.data.clone()
+
+    # We need to recalculate our total training steps as the size of the training dataloader may have changed.
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    if overrode_max_train_steps:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+    # Afterwards we recalculate our number of training epochs
+    args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
+
+    # We need to initialize the trackers we use, and also store our configuration.
+    # The trackers initializes automatically on the main process.
+    if accelerator.is_main_process:
+        accelerator.init_trackers("textual_inversion", config=vars(args))
+
+    # Train!
+    total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
+
+    logger.info("***** Running training *****")
+    logger.info(f"  Num examples = {len(train_dataset)}")
+    logger.info(f"  Num Epochs = {args.num_train_epochs}")
+    logger.info(f"  Instantaneous batch size per device = {args.train_batch_size}")
+    logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
+    logger.info(f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
+    logger.info(f"  Total optimization steps = {args.max_train_steps}")
+    # Only show the progress bar once on each machine.
+    progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process)
+    progress_bar.set_description("Steps")
+    global_step = 0
+
+    for epoch in range(args.num_train_epochs):
+        text_encoder.train()
+        for step, batch in enumerate(train_dataloader):
+            with accelerator.accumulate(text_encoder):
+                # Convert images to latent space
+                latents = vae.encode(batch["pixel_values"]).latent_dist.sample().detach()
+                latents = latents * 0.18215
+
+                # Sample noise that we'll add to the latents
+                noise = torch.randn(latents.shape).to(latents.device)
+                bsz = latents.shape[0]
+                # Sample a random timestep for each image
+                timesteps = torch.randint(
+                    0, noise_scheduler.config.num_train_timesteps, (bsz,), device=latents.device
+                ).long()
+
+                # Add noise to the latents according to the noise magnitude at each timestep
+                # (this is the forward diffusion process)
+                noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)
+
+                # Get the text embedding for conditioning
+                encoder_hidden_states = text_encoder(batch["input_ids"])[0]
+
+                # Predict the noise residual
+                noise_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample
+
+                loss = F.mse_loss(noise_pred, noise, reduction="none").mean([1, 2, 3]).mean()
+                accelerator.backward(loss)
+
+                # Zero out the gradients for all token embeddings except the newly added
+                # embeddings for the concept, as we only want to optimize the concept embeddings.
+                # `unwrap_model` reaches the underlying module whether or not it has been wrapped
+                # for distributed training (same access path as `save_progress`).
+                embedding_weight = accelerator.unwrap_model(text_encoder).get_input_embeddings().weight
+                grads = embedding_weight.grad
+                grads.data[index_no_updates, :] = grads.data[index_no_updates, :].fill_(0)
+
+                optimizer.step()
+                lr_scheduler.step()
+                optimizer.zero_grad()
+
+                # AdamW applies its decoupled weight decay to the whole embedding matrix even where
+                # the gradient is zero, which would slowly shrink every pretrained token embedding.
+                # Restore all rows except the placeholder one so only the new concept is learned.
+                with torch.no_grad():
+                    embedding_weight[index_no_updates] = orig_embeds_params[index_no_updates]
+
+            # Checks if the accelerator has performed an optimization step behind the scenes
+            if accelerator.sync_gradients:
+                progress_bar.update(1)
+                global_step += 1
+                if global_step % args.save_steps == 0:
+                    save_progress(text_encoder, placeholder_token_id, accelerator, args)
+
+            logs = {"loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]}
+            progress_bar.set_postfix(**logs)
+            accelerator.log(logs, step=global_step)
+
+            if global_step >= args.max_train_steps:
+                break
+
+        accelerator.wait_for_everyone()
+
+    # Create the pipeline using the trained modules and save it.
+    if accelerator.is_main_process:
+        pipeline = StableDiffusionPipeline(
+            text_encoder=accelerator.unwrap_model(text_encoder),
+            vae=vae,
+            unet=unet,
+            tokenizer=tokenizer,
+            scheduler=PNDMScheduler(
+                beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", skip_prk_steps=True
+            ),
+            safety_checker=StableDiffusionSafetyChecker.from_pretrained("CompVis/stable-diffusion-safety-checker"),
+            feature_extractor=CLIPFeatureExtractor.from_pretrained("openai/clip-vit-base-patch32"),
+        )
+        pipeline.save_pretrained(args.output_dir)
+        # Also save the newly trained embeddings
+        save_progress(text_encoder, placeholder_token_id, accelerator, args)
+
+        if args.push_to_hub:
+            repo.push_to_hub(commit_message="End of training", blocking=False, auto_lfs_prune=True)
+
+    accelerator.end_training()
+
+
+# Run the training entry point only when this file is executed as a script.
+if __name__ == "__main__":
+    main()
--- a/examples/unconditional_image_generation/README.md
+++ b/examples/unconditional_image_generation/README.md
@@ -0,0 +1,129 @@
+## Training examples
+
+Creating a training image set is [described in a different document](https://huggingface.co/docs/datasets/image_process#image-datasets).
+
+### Installing the dependencies
+
+Before running the scripts, make sure to install the library's training dependencies:
+
+```bash
+pip install diffusers[training] accelerate datasets
+```
+
+And initialize an [🤗Accelerate](https://github.com/huggingface/accelerate/) environment with:
+
+```bash
+accelerate config
+```
+
+### Unconditional Flowers  
+
+The command to train a DDPM UNet model on the Oxford Flowers dataset:
+
+```bash
+accelerate launch train_unconditional.py \
+  --dataset_name="huggan/flowers-102-categories" \
+  --resolution=64 \
+  --output_dir="ddpm-ema-flowers-64" \
+  --train_batch_size=16 \
+  --num_epochs=100 \
+  --gradient_accumulation_steps=1 \
+  --learning_rate=1e-4 \
+  --lr_warmup_steps=500 \
+  --mixed_precision=no \
+  --push_to_hub
+```
+An example trained model: https://huggingface.co/anton-l/ddpm-ema-flowers-64
+
+A full training run takes 2 hours on 4xV100 GPUs.
+
+<img src="https://user-images.githubusercontent.com/26864830/180248660-a0b143d0-b89a-42c5-8656-2ebf6ece7e52.png" width="700" />
+
+
+### Unconditional Pokemon 
+
+The command to train a DDPM UNet model on the Pokemon dataset:
+
+```bash
+accelerate launch train_unconditional.py \
+  --dataset_name="huggan/pokemon" \
+  --resolution=64 \
+  --output_dir="ddpm-ema-pokemon-64" \
+  --train_batch_size=16 \
+  --num_epochs=100 \
+  --gradient_accumulation_steps=1 \
+  --learning_rate=1e-4 \
+  --lr_warmup_steps=500 \
+  --mixed_precision=no \
+  --push_to_hub
+```
+An example trained model: https://huggingface.co/anton-l/ddpm-ema-pokemon-64
+
+A full training run takes 2 hours on 4xV100 GPUs.
+
+<img src="https://user-images.githubusercontent.com/26864830/180248200-928953b4-db38-48db-b0c6-8b740fe6786f.png" width="700" />
+
+
+### Using your own data
+
+To use your own dataset, there are 2 ways:
+- you can either provide your own folder as `--train_data_dir`
+- or you can upload your dataset to the hub (possibly as a private repo, if you prefer), and simply pass the `--dataset_name` argument.
+
+Below, we explain both in more detail.
+
+#### Provide the dataset as a folder
+
+If you provide your own folders with images, the script expects the following directory structure:
+
+```bash
+data_dir/xxx.png
+data_dir/xxy.png
+data_dir/[...]/xxz.png
+```
+
+In other words, the script will take care of gathering all images inside the folder. You can then run the script like this:
+
+```bash
+accelerate launch train_unconditional.py \
+    --train_data_dir <path-to-train-directory> \
+    <other-arguments>
+```
+
+Internally, the script will use the [`ImageFolder`](https://huggingface.co/docs/datasets/v2.0.0/en/image_process#imagefolder) feature which will automatically turn the folders into 🤗 Dataset objects.
+
+#### Upload your data to the hub, as a (possibly private) repo
+
+It's very easy (and convenient) to upload your image dataset to the hub using the [`ImageFolder`](https://huggingface.co/docs/datasets/v2.0.0/en/image_process#imagefolder) feature available in 🤗 Datasets. Simply do the following:
+
+```python
+from datasets import load_dataset
+
+# example 1: local folder
+dataset = load_dataset("imagefolder", data_dir="path_to_your_folder")
+
+# example 2: local files (supported formats are tar, gzip, zip, xz, rar, zstd)
+dataset = load_dataset("imagefolder", data_files="path_to_zip_file")
+
+# example 3: remote files (supported formats are tar, gzip, zip, xz, rar, zstd)
+dataset = load_dataset("imagefolder", data_files="https://download.microsoft.com/download/3/E/1/3E1C3F21-ECDB-4869-8368-6DEBA77B919F/kagglecatsanddogs_3367a.zip")
+
+# example 4: providing several splits
+dataset = load_dataset("imagefolder", data_files={"train": ["path/to/file1", "path/to/file2"], "test": ["path/to/file3", "path/to/file4"]})
+```
+
+`ImageFolder` will create an `image` column containing the PIL-encoded images.
+
+Next, push it to the hub!
+
+```python
+# assuming you have run the huggingface-cli login command in a terminal
+dataset.push_to_hub("name_of_your_dataset")
+
+# if you want to push to a private repo, simply pass private=True:
+dataset.push_to_hub("name_of_your_dataset", private=True)
+```
+
+and that's it! You can now train your model by simply setting the `--dataset_name` argument to the name of your dataset on the hub.
+
+More on this can also be found in [this blog post](https://huggingface.co/blog/image-search-datasets).
--- a/examples/unconditional_image_generation/requirements.txt
+++ b/examples/unconditional_image_generation/requirements.txt
@@ -0,0 +1,3 @@
+accelerate
+torchvision
+datasets
--- a/examples/unconditional_image_generation/train_unconditional.py
+++ b/examples/unconditional_image_generation/train_unconditional.py
@@ -1,4 +1,5 @@
 import argparse
+import math
 import os

 import torch
@@ -8,7 +9,7 @@ from accelerate import Accelerator
 from accelerate.logging import get_logger
 from datasets import load_dataset
 from diffusers import DDPMPipeline, DDPMScheduler, UNet2DModel
-from diffusers.hub_utils import init_git_repo, push_to_hub
+from diffusers.hub_utils import init_git_repo
 from diffusers.optimization import get_scheduler
 from diffusers.training_utils import EMAModel
 from torchvision.transforms import (
@@ -29,6 +30,7 @@ logger = get_logger(__name__)
 def main(args):
    logging_dir = os.path.join(args.output_dir, args.logging_dir)
    accelerator = Accelerator(
+        gradient_accumulation_steps=args.gradient_accumulation_steps,
        mixed_precision=args.mixed_precision,
        log_with="tensorboard",
        logging_dir=logging_dir,
@@ -57,7 +59,7 @@ def main(args):
            "UpBlock2D",
        ),
    )
-    noise_scheduler = DDPMScheduler(num_train_timesteps=1000, tensor_format="pt")
+    noise_scheduler = DDPMScheduler(num_train_timesteps=1000)
    optimizer = torch.optim.AdamW(
        model.parameters(),
        lr=args.learning_rate,
@@ -81,7 +83,6 @@ def main(args):
            args.dataset_name,
            args.dataset_config_name,
            cache_dir=args.cache_dir,
-            use_auth_token=True if args.use_auth_token else None,
            split="train",
        )
    else:
@@ -105,6 +106,8 @@ def main(args):
        model, optimizer, train_dataloader, lr_scheduler
    )

+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+
    ema_model = EMAModel(model, inv_gamma=args.ema_inv_gamma, power=args.ema_power, max_value=args.ema_max_decay)

    if args.push_to_hub:
@@ -117,7 +120,7 @@ def main(args):
    global_step = 0
    for epoch in range(args.num_epochs):
        model.train()
-        progress_bar = tqdm(total=len(train_dataloader), disable=not accelerator.is_local_main_process)
+        progress_bar = tqdm(total=num_update_steps_per_epoch, disable=not accelerator.is_local_main_process)
        progress_bar.set_description(f"Epoch {epoch}")
        for step, batch in enumerate(train_dataloader):
            clean_images = batch["input"]
@@ -126,7 +129,7 @@ def main(args):
            bsz = clean_images.shape[0]
            # Sample a random timestep for each image
            timesteps = torch.randint(
-                0, noise_scheduler.num_train_timesteps, (bsz,), device=clean_images.device
+                0, noise_scheduler.config.num_train_timesteps, (bsz,), device=clean_images.device
            ).long()

            # Add noise to the clean images according to the noise magnitude at each timestep
@@ -135,24 +138,28 @@ def main(args):

            with accelerator.accumulate(model):
                # Predict the noise residual
-                noise_pred = model(noisy_images, timesteps)["sample"]
+                noise_pred = model(noisy_images, timesteps).sample
                loss = F.mse_loss(noise_pred, noise)
                accelerator.backward(loss)

-                accelerator.clip_grad_norm_(model.parameters(), 1.0)
+                if accelerator.sync_gradients:
+                    accelerator.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()
                lr_scheduler.step()
                if args.use_ema:
                    ema_model.step(model)
                optimizer.zero_grad()

-            progress_bar.update(1)
+            # Checks if the accelerator has performed an optimization step behind the scenes
+            if accelerator.sync_gradients:
+                progress_bar.update(1)
+                global_step += 1
+
            logs = {"loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0], "step": global_step}
            if args.use_ema:
                logs["ema_decay"] = ema_model.decay
            progress_bar.set_postfix(**logs)
            accelerator.log(logs, step=global_step)
-            global_step += 1
        progress_bar.close()

        accelerator.wait_for_everyone()
@@ -167,7 +174,7 @@ def main(args):

                generator = torch.manual_seed(0)
                # run pipeline in inference (sample random noise and denoise)
-                images = pipeline(generator=generator, batch_size=args.eval_batch_size, output_type="numpy")["sample"]
+                images = pipeline(generator=generator, batch_size=args.eval_batch_size, output_type="numpy").images

                # denormalize the images and save to tensorboard
                images_processed = (images * 255).round().astype("uint8")
@@ -177,10 +184,9 @@ def main(args):

            if epoch % args.save_model_epochs == 0 or epoch == args.num_epochs - 1:
                # save the model
+                pipeline.save_pretrained(args.output_dir)
                if args.push_to_hub:
-                    push_to_hub(args, pipeline, repo, commit_message=f"Epoch {epoch}", blocking=False)
-                else:
-                    pipeline.save_pretrained(args.output_dir)
+                    repo.push_to_hub(commit_message=f"Epoch {epoch}", blocking=False)
        accelerator.wait_for_everyone()

    accelerator.end_training()
@@ -214,7 +220,6 @@ if __name__ == "__main__":
    parser.add_argument("--ema_power", type=float, default=3 / 4)
    parser.add_argument("--ema_max_decay", type=float, default=0.9999)
    parser.add_argument("--push_to_hub", action="store_true")
-    parser.add_argument("--use_auth_token", action="store_true")
    parser.add_argument("--hub_token", type=str, default=None)
    parser.add_argument("--hub_model_id", type=str, default=None)
    parser.add_argument("--hub_private_repo", action="store_true")
--- a/scripts/change_naming_configs_and_checkpoints.py
+++ b/scripts/change_naming_configs_and_checkpoints.py
@@ -15,12 +15,15 @@
 """ Conversion script for the LDM checkpoints. """

 import argparse
-import os
 import json
+import os
+
 import torch
-from diffusers import UNet2DModel, UNet2DConditionModel
+
+from diffusers import UNet2DConditionModel, UNet2DModel
 from transformers.file_utils import has_file

+
 do_only_config = False
 do_only_weights = True
 do_only_renaming = False
@@ -37,9 +40,7 @@ if __name__ == "__main__":
        help="The config json file corresponding to the architecture.",
    )

-    parser.add_argument(
-        "--dump_path", default=None, type=str, required=True, help="Path to the output model."
-    )
+    parser.add_argument("--dump_path", default=None, type=str, required=True, help="Path to the output model.")

    args = parser.parse_args()

--- a/scripts/conversion_ldm_uncond.py
+++ b/scripts/conversion_ldm_uncond.py
@@ -1,9 +1,10 @@
 import argparse

-import OmegaConf
 import torch

-from diffusers import UNetLDMModel, VQModel, LDMPipeline, DDIMScheduler
+import OmegaConf
+from diffusers import DDIMScheduler, LDMPipeline, UNetLDMModel, VQModel
+

 def convert_ldm_original(checkpoint_path, config_path, output_path):
    config = OmegaConf.load(config_path)
@@ -16,14 +17,14 @@ def convert_ldm_original(checkpoint_path, config_path, output_path):
    for key in keys:
        if key.startswith(first_stage_key):
            first_stage_dict[key.replace(first_stage_key, "")] = state_dict[key]
-    
+
    # extract state_dict for UNetLDM
    unet_state_dict = {}
    unet_key = "model.diffusion_model."
    for key in keys:
        if key.startswith(unet_key):
            unet_state_dict[key.replace(unet_key, "")] = state_dict[key]
-    
+
    vqvae_init_args = config.model.params.first_stage_config.params
    unet_init_args = config.model.params.unet_config.params

@@ -53,4 +54,3 @@ if __name__ == "__main__":
    args = parser.parse_args()

    convert_ldm_original(args.checkpoint_path, args.config_path, args.output_path)
-
--- a/scripts/convert_ddpm_original_checkpoint_to_diffusers.py
+++ b/scripts/convert_ddpm_original_checkpoint_to_diffusers.py
@@ -1,31 +1,33 @@
-from diffusers import UNet2DModel, DDPMScheduler, DDPMPipeline, VQModel, AutoencoderKL
 import argparse
 import json
+
 import torch

+from diffusers import AutoencoderKL, DDPMPipeline, DDPMScheduler, UNet2DModel, VQModel
+

 def shave_segments(path, n_shave_prefix_segments=1):
     """
     Removes segments. Positive values shave the first segments, negative shave the last segments.
     """
     if n_shave_prefix_segments >= 0:
-        return '.'.join(path.split('.')[n_shave_prefix_segments:])
+        return ".".join(path.split(".")[n_shave_prefix_segments:])  # n >= 0: drop the first n dot-separated segments
     else:
-        return '.'.join(path.split('.')[:n_shave_prefix_segments])
+        return ".".join(path.split(".")[:n_shave_prefix_segments])  # n < 0: drop the last |n| segments


 def renew_resnet_paths(old_list, n_shave_prefix_segments=0):
     mapping = []
     for old_item in old_list:
         new_item = old_item
-        new_item = new_item.replace('block.', 'resnets.')
-        new_item = new_item.replace('conv_shorcut', 'conv1')
-        new_item = new_item.replace('nin_shortcut', 'conv_shortcut')
-        new_item = new_item.replace('temb_proj', 'time_emb_proj')
+        new_item = new_item.replace("block.", "resnets.")
+        new_item = new_item.replace("conv_shorcut", "conv1")
+        new_item = new_item.replace("nin_shortcut", "conv_shortcut")  # match the full "nin_" key, no stray "n"
+        new_item = new_item.replace("temb_proj", "time_emb_proj")

         new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)

-        mapping.append({'old': old_item, 'new': new_item})
+        mapping.append({"old": old_item, "new": new_item})  # pair each original key with its renamed key

     return mapping

@@ -37,21 +39,23 @@ def renew_attention_paths(old_list, n_shave_prefix_segments=0, in_mid=False):

        # In `model.mid`, the layer is called `attn`.
        if not in_mid:
-            new_item = new_item.replace('attn', 'attentions')
-        new_item = new_item.replace('.k.', '.key.')
-        new_item = new_item.replace('.v.', '.value.')
-        new_item = new_item.replace('.q.', '.query.')
+            new_item = new_item.replace("attn", "attentions")
+        new_item = new_item.replace(".k.", ".key.")
+        new_item = new_item.replace(".v.", ".value.")
+        new_item = new_item.replace(".q.", ".query.")

-        new_item = new_item.replace('proj_out', 'proj_attn')
-        new_item = new_item.replace('norm', 'group_norm')
+        new_item = new_item.replace("proj_out", "proj_attn")
+        new_item = new_item.replace("norm", "group_norm")

        new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)
-        mapping.append({'old': old_item, 'new': new_item})
+        mapping.append({"old": old_item, "new": new_item})

    return mapping


-def assign_to_checkpoint(paths, checkpoint, old_checkpoint, attention_paths_to_split=None, additional_replacements=None, config=None):
+def assign_to_checkpoint(
+    paths, checkpoint, old_checkpoint, attention_paths_to_split=None, additional_replacements=None, config=None
+):
    assert isinstance(paths, list), "Paths should be a list of dicts containing 'old' and 'new' keys."

    if attention_paths_to_split is not None:
@@ -69,27 +73,27 @@ def assign_to_checkpoint(paths, checkpoint, old_checkpoint, attention_paths_to_s
            old_tensor = old_tensor.reshape((num_heads, 3 * channels // num_heads) + old_tensor.shape[1:])
            query, key, value = old_tensor.split(channels // num_heads, dim=1)

-            checkpoint[path_map['query']] = query.reshape(target_shape).squeeze()
-            checkpoint[path_map['key']] = key.reshape(target_shape).squeeze()
-            checkpoint[path_map['value']] = value.reshape(target_shape).squeeze()
+            checkpoint[path_map["query"]] = query.reshape(target_shape).squeeze()
+            checkpoint[path_map["key"]] = key.reshape(target_shape).squeeze()
+            checkpoint[path_map["value"]] = value.reshape(target_shape).squeeze()

    for path in paths:
-        new_path = path['new']
+        new_path = path["new"]

        if attention_paths_to_split is not None and new_path in attention_paths_to_split:
            continue

-        new_path = new_path.replace('down.', 'down_blocks.')
-        new_path = new_path.replace('up.', 'up_blocks.')
+        new_path = new_path.replace("down.", "down_blocks.")
+        new_path = new_path.replace("up.", "up_blocks.")

        if additional_replacements is not None:
            for replacement in additional_replacements:
-                new_path = new_path.replace(replacement['old'], replacement['new'])
+                new_path = new_path.replace(replacement["old"], replacement["new"])

-        if 'attentions' in new_path:
-            checkpoint[new_path] = old_checkpoint[path['old']].squeeze()
+        if "attentions" in new_path:
+            checkpoint[new_path] = old_checkpoint[path["old"]].squeeze()
        else:
-            checkpoint[new_path] = old_checkpoint[path['old']]
+            checkpoint[new_path] = old_checkpoint[path["old"]]


 def convert_ddpm_checkpoint(checkpoint, config):
@@ -98,49 +102,63 @@ def convert_ddpm_checkpoint(checkpoint, config):
    """
    new_checkpoint = {}

-    new_checkpoint['time_embedding.linear_1.weight'] = checkpoint['temb.dense.0.weight']
-    new_checkpoint['time_embedding.linear_1.bias'] = checkpoint['temb.dense.0.bias']
-    new_checkpoint['time_embedding.linear_2.weight'] = checkpoint['temb.dense.1.weight']
-    new_checkpoint['time_embedding.linear_2.bias'] = checkpoint['temb.dense.1.bias']
+    new_checkpoint["time_embedding.linear_1.weight"] = checkpoint["temb.dense.0.weight"]
+    new_checkpoint["time_embedding.linear_1.bias"] = checkpoint["temb.dense.0.bias"]
+    new_checkpoint["time_embedding.linear_2.weight"] = checkpoint["temb.dense.1.weight"]
+    new_checkpoint["time_embedding.linear_2.bias"] = checkpoint["temb.dense.1.bias"]

-    new_checkpoint['conv_norm_out.weight'] = checkpoint['norm_out.weight']
-    new_checkpoint['conv_norm_out.bias'] = checkpoint['norm_out.bias']
+    new_checkpoint["conv_norm_out.weight"] = checkpoint["norm_out.weight"]
+    new_checkpoint["conv_norm_out.bias"] = checkpoint["norm_out.bias"]

-    new_checkpoint['conv_in.weight'] = checkpoint['conv_in.weight']
-    new_checkpoint['conv_in.bias'] = checkpoint['conv_in.bias']
-    new_checkpoint['conv_out.weight'] = checkpoint['conv_out.weight']
-    new_checkpoint['conv_out.bias'] = checkpoint['conv_out.bias']
+    new_checkpoint["conv_in.weight"] = checkpoint["conv_in.weight"]
+    new_checkpoint["conv_in.bias"] = checkpoint["conv_in.bias"]
+    new_checkpoint["conv_out.weight"] = checkpoint["conv_out.weight"]
+    new_checkpoint["conv_out.bias"] = checkpoint["conv_out.bias"]

-    num_down_blocks = len({'.'.join(layer.split('.')[:2]) for layer in checkpoint if 'down' in layer})
-    down_blocks = {layer_id: [key for key in checkpoint if f'down.{layer_id}' in key] for layer_id in range(num_down_blocks)}
+    num_down_blocks = len({".".join(layer.split(".")[:2]) for layer in checkpoint if "down" in layer})
+    down_blocks = {
+        layer_id: [key for key in checkpoint if f"down.{layer_id}" in key] for layer_id in range(num_down_blocks)
+    }

-    num_up_blocks = len({'.'.join(layer.split('.')[:2]) for layer in checkpoint if 'up' in layer})
-    up_blocks = {layer_id: [key for key in checkpoint if f'up.{layer_id}' in key] for layer_id in range(num_up_blocks)}
+    num_up_blocks = len({".".join(layer.split(".")[:2]) for layer in checkpoint if "up" in layer})
+    up_blocks = {layer_id: [key for key in checkpoint if f"up.{layer_id}" in key] for layer_id in range(num_up_blocks)}

    for i in range(num_down_blocks):
-        block_id = (i - 1) // (config['layers_per_block'] + 1)
+        block_id = (i - 1) // (config["layers_per_block"] + 1)

-        if any('downsample' in layer for layer in down_blocks[i]):
-            new_checkpoint[f'down_blocks.{i}.downsamplers.0.conv.weight'] = checkpoint[f'down.{i}.downsample.op.weight']
-            new_checkpoint[f'down_blocks.{i}.downsamplers.0.conv.bias'] = checkpoint[f'down.{i}.downsample.op.bias']
-#            new_checkpoint[f'down_blocks.{i}.downsamplers.0.op.weight'] = checkpoint[f'down.{i}.downsample.conv.weight']
-#            new_checkpoint[f'down_blocks.{i}.downsamplers.0.op.bias'] = checkpoint[f'down.{i}.downsample.conv.bias']
+        if any("downsample" in layer for layer in down_blocks[i]):
+            new_checkpoint[f"down_blocks.{i}.downsamplers.0.conv.weight"] = checkpoint[
+                f"down.{i}.downsample.op.weight"
+            ]
+            new_checkpoint[f"down_blocks.{i}.downsamplers.0.conv.bias"] = checkpoint[f"down.{i}.downsample.op.bias"]
+        #            new_checkpoint[f'down_blocks.{i}.downsamplers.0.op.weight'] = checkpoint[f'down.{i}.downsample.conv.weight']
+        #            new_checkpoint[f'down_blocks.{i}.downsamplers.0.op.bias'] = checkpoint[f'down.{i}.downsample.conv.bias']

-        if any('block' in layer for layer in down_blocks[i]):
-            num_blocks = len({'.'.join(shave_segments(layer, 2).split('.')[:2]) for layer in down_blocks[i] if 'block' in layer})
-            blocks = {layer_id: [key for key in down_blocks[i] if f'block.{layer_id}' in key] for layer_id in range(num_blocks)}
+        if any("block" in layer for layer in down_blocks[i]):
+            num_blocks = len(
+                {".".join(shave_segments(layer, 2).split(".")[:2]) for layer in down_blocks[i] if "block" in layer}
+            )
+            blocks = {
+                layer_id: [key for key in down_blocks[i] if f"block.{layer_id}" in key]
+                for layer_id in range(num_blocks)
+            }

            if num_blocks > 0:
-                for j in range(config['layers_per_block']):
+                for j in range(config["layers_per_block"]):
                    paths = renew_resnet_paths(blocks[j])
                    assign_to_checkpoint(paths, new_checkpoint, checkpoint)

-        if any('attn' in layer for layer in down_blocks[i]):
-            num_attn = len({'.'.join(shave_segments(layer, 2).split('.')[:2]) for layer in down_blocks[i] if 'attn' in layer})
-            attns = {layer_id: [key for key in down_blocks[i] if f'attn.{layer_id}' in key] for layer_id in range(num_blocks)}
+        if any("attn" in layer for layer in down_blocks[i]):
+            num_attn = len(
+                {".".join(shave_segments(layer, 2).split(".")[:2]) for layer in down_blocks[i] if "attn" in layer}
+            )
+            attns = {
+                layer_id: [key for key in down_blocks[i] if f"attn.{layer_id}" in key]
+                for layer_id in range(num_blocks)
+            }

            if num_attn > 0:
-                for j in range(config['layers_per_block']):
+                for j in range(config["layers_per_block"]):
                    paths = renew_attention_paths(attns[j])
                    assign_to_checkpoint(paths, new_checkpoint, checkpoint, config=config)

@@ -150,48 +168,67 @@ def convert_ddpm_checkpoint(checkpoint, config):

    # Mid new 2
    paths = renew_resnet_paths(mid_block_1_layers)
-    assign_to_checkpoint(paths, new_checkpoint, checkpoint, additional_replacements=[
-        {'old': 'mid.', 'new': 'mid_new_2.'}, {'old': 'block_1', 'new': 'resnets.0'}
-    ])
+    assign_to_checkpoint(
+        paths,
+        new_checkpoint,
+        checkpoint,
+        additional_replacements=[{"old": "mid.", "new": "mid_new_2."}, {"old": "block_1", "new": "resnets.0"}],
+    )

    paths = renew_resnet_paths(mid_block_2_layers)
-    assign_to_checkpoint(paths, new_checkpoint, checkpoint, additional_replacements=[
-        {'old': 'mid.', 'new': 'mid_new_2.'}, {'old': 'block_2', 'new': 'resnets.1'}
-    ])
+    assign_to_checkpoint(
+        paths,
+        new_checkpoint,
+        checkpoint,
+        additional_replacements=[{"old": "mid.", "new": "mid_new_2."}, {"old": "block_2", "new": "resnets.1"}],
+    )

    paths = renew_attention_paths(mid_attn_1_layers, in_mid=True)
-    assign_to_checkpoint(paths, new_checkpoint, checkpoint, additional_replacements=[
-        {'old': 'mid.', 'new': 'mid_new_2.'}, {'old': 'attn_1', 'new': 'attentions.0'}
-    ])
+    assign_to_checkpoint(
+        paths,
+        new_checkpoint,
+        checkpoint,
+        additional_replacements=[{"old": "mid.", "new": "mid_new_2."}, {"old": "attn_1", "new": "attentions.0"}],
+    )

    for i in range(num_up_blocks):
        block_id = num_up_blocks - 1 - i

-        if any('upsample' in layer for layer in up_blocks[i]):
-            new_checkpoint[f'up_blocks.{block_id}.upsamplers.0.conv.weight'] = checkpoint[f'up.{i}.upsample.conv.weight']
-            new_checkpoint[f'up_blocks.{block_id}.upsamplers.0.conv.bias'] = checkpoint[f'up.{i}.upsample.conv.bias']
+        if any("upsample" in layer for layer in up_blocks[i]):
+            new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.weight"] = checkpoint[
+                f"up.{i}.upsample.conv.weight"
+            ]
+            new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.bias"] = checkpoint[f"up.{i}.upsample.conv.bias"]

-        if any('block' in layer for layer in up_blocks[i]):
-            num_blocks = len({'.'.join(shave_segments(layer, 2).split('.')[:2]) for layer in up_blocks[i] if 'block' in layer})
-            blocks = {layer_id: [key for key in up_blocks[i] if f'block.{layer_id}' in key] for layer_id in range(num_blocks)}
+        if any("block" in layer for layer in up_blocks[i]):
+            num_blocks = len(
+                {".".join(shave_segments(layer, 2).split(".")[:2]) for layer in up_blocks[i] if "block" in layer}
+            )
+            blocks = {
+                layer_id: [key for key in up_blocks[i] if f"block.{layer_id}" in key] for layer_id in range(num_blocks)
+            }

            if num_blocks > 0:
-                for j in range(config['layers_per_block'] + 1):
-                    replace_indices = {'old': f'up_blocks.{i}', 'new': f'up_blocks.{block_id}'}
+                for j in range(config["layers_per_block"] + 1):
+                    replace_indices = {"old": f"up_blocks.{i}", "new": f"up_blocks.{block_id}"}
                    paths = renew_resnet_paths(blocks[j])
                    assign_to_checkpoint(paths, new_checkpoint, checkpoint, additional_replacements=[replace_indices])

-        if any('attn' in layer for layer in up_blocks[i]):
-            num_attn = len({'.'.join(shave_segments(layer, 2).split('.')[:2]) for layer in up_blocks[i] if 'attn' in layer})
-            attns = {layer_id: [key for key in up_blocks[i] if f'attn.{layer_id}' in key] for layer_id in range(num_blocks)}
+        if any("attn" in layer for layer in up_blocks[i]):
+            num_attn = len(
+                {".".join(shave_segments(layer, 2).split(".")[:2]) for layer in up_blocks[i] if "attn" in layer}
+            )
+            attns = {
+                layer_id: [key for key in up_blocks[i] if f"attn.{layer_id}" in key] for layer_id in range(num_blocks)
+            }

            if num_attn > 0:
-                for j in range(config['layers_per_block'] + 1):
-                    replace_indices = {'old': f'up_blocks.{i}', 'new': f'up_blocks.{block_id}'}
+                for j in range(config["layers_per_block"] + 1):
+                    replace_indices = {"old": f"up_blocks.{i}", "new": f"up_blocks.{block_id}"}
                    paths = renew_attention_paths(attns[j])
                    assign_to_checkpoint(paths, new_checkpoint, checkpoint, additional_replacements=[replace_indices])

-    new_checkpoint = {k.replace('mid_new_2', 'mid_block'): v for k, v in new_checkpoint.items()}
+    new_checkpoint = {k.replace("mid_new_2", "mid_block"): v for k, v in new_checkpoint.items()}
    return new_checkpoint


@@ -201,50 +238,66 @@ def convert_vq_autoenc_checkpoint(checkpoint, config):
    """
    new_checkpoint = {}

-    new_checkpoint['encoder.conv_norm_out.weight'] = checkpoint['encoder.norm_out.weight']
-    new_checkpoint['encoder.conv_norm_out.bias'] = checkpoint['encoder.norm_out.bias']
+    new_checkpoint["encoder.conv_norm_out.weight"] = checkpoint["encoder.norm_out.weight"]
+    new_checkpoint["encoder.conv_norm_out.bias"] = checkpoint["encoder.norm_out.bias"]

-    new_checkpoint['encoder.conv_in.weight'] = checkpoint['encoder.conv_in.weight']
-    new_checkpoint['encoder.conv_in.bias'] = checkpoint['encoder.conv_in.bias']
-    new_checkpoint['encoder.conv_out.weight'] = checkpoint['encoder.conv_out.weight']
-    new_checkpoint['encoder.conv_out.bias'] = checkpoint['encoder.conv_out.bias']
+    new_checkpoint["encoder.conv_in.weight"] = checkpoint["encoder.conv_in.weight"]
+    new_checkpoint["encoder.conv_in.bias"] = checkpoint["encoder.conv_in.bias"]
+    new_checkpoint["encoder.conv_out.weight"] = checkpoint["encoder.conv_out.weight"]
+    new_checkpoint["encoder.conv_out.bias"] = checkpoint["encoder.conv_out.bias"]

-    new_checkpoint['decoder.conv_norm_out.weight'] = checkpoint['decoder.norm_out.weight']
-    new_checkpoint['decoder.conv_norm_out.bias'] = checkpoint['decoder.norm_out.bias']
+    new_checkpoint["decoder.conv_norm_out.weight"] = checkpoint["decoder.norm_out.weight"]
+    new_checkpoint["decoder.conv_norm_out.bias"] = checkpoint["decoder.norm_out.bias"]

-    new_checkpoint['decoder.conv_in.weight'] = checkpoint['decoder.conv_in.weight']
-    new_checkpoint['decoder.conv_in.bias'] = checkpoint['decoder.conv_in.bias']
-    new_checkpoint['decoder.conv_out.weight'] = checkpoint['decoder.conv_out.weight']
-    new_checkpoint['decoder.conv_out.bias'] = checkpoint['decoder.conv_out.bias']
+    new_checkpoint["decoder.conv_in.weight"] = checkpoint["decoder.conv_in.weight"]
+    new_checkpoint["decoder.conv_in.bias"] = checkpoint["decoder.conv_in.bias"]
+    new_checkpoint["decoder.conv_out.weight"] = checkpoint["decoder.conv_out.weight"]
+    new_checkpoint["decoder.conv_out.bias"] = checkpoint["decoder.conv_out.bias"]

-    num_down_blocks = len({'.'.join(layer.split('.')[:3]) for layer in checkpoint if 'down' in layer})
-    down_blocks = {layer_id: [key for key in checkpoint if f'down.{layer_id}' in key] for layer_id in range(num_down_blocks)}
+    num_down_blocks = len({".".join(layer.split(".")[:3]) for layer in checkpoint if "down" in layer})
+    down_blocks = {
+        layer_id: [key for key in checkpoint if f"down.{layer_id}" in key] for layer_id in range(num_down_blocks)
+    }

-    num_up_blocks = len({'.'.join(layer.split('.')[:3]) for layer in checkpoint if 'up' in layer})
-    up_blocks = {layer_id: [key for key in checkpoint if f'up.{layer_id}' in key] for layer_id in range(num_up_blocks)}
+    num_up_blocks = len({".".join(layer.split(".")[:3]) for layer in checkpoint if "up" in layer})
+    up_blocks = {layer_id: [key for key in checkpoint if f"up.{layer_id}" in key] for layer_id in range(num_up_blocks)}

    for i in range(num_down_blocks):
-        block_id = (i - 1) // (config['layers_per_block'] + 1)
+        block_id = (i - 1) // (config["layers_per_block"] + 1)

-        if any('downsample' in layer for layer in down_blocks[i]):
-            new_checkpoint[f'encoder.down_blocks.{i}.downsamplers.0.conv.weight'] = checkpoint[f'encoder.down.{i}.downsample.conv.weight']
-            new_checkpoint[f'encoder.down_blocks.{i}.downsamplers.0.conv.bias'] = checkpoint[f'encoder.down.{i}.downsample.conv.bias']
+        if any("downsample" in layer for layer in down_blocks[i]):
+            new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.weight"] = checkpoint[
+                f"encoder.down.{i}.downsample.conv.weight"
+            ]
+            new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.bias"] = checkpoint[
+                f"encoder.down.{i}.downsample.conv.bias"
+            ]

-        if any('block' in layer for layer in down_blocks[i]):
-            num_blocks = len({'.'.join(shave_segments(layer, 3).split('.')[:3]) for layer in down_blocks[i] if 'block' in layer})
-            blocks = {layer_id: [key for key in down_blocks[i] if f'block.{layer_id}' in key] for layer_id in range(num_blocks)}
+        if any("block" in layer for layer in down_blocks[i]):
+            num_blocks = len(
+                {".".join(shave_segments(layer, 3).split(".")[:3]) for layer in down_blocks[i] if "block" in layer}
+            )
+            blocks = {
+                layer_id: [key for key in down_blocks[i] if f"block.{layer_id}" in key]
+                for layer_id in range(num_blocks)
+            }

            if num_blocks > 0:
-                for j in range(config['layers_per_block']):
+                for j in range(config["layers_per_block"]):
                    paths = renew_resnet_paths(blocks[j])
                    assign_to_checkpoint(paths, new_checkpoint, checkpoint)

-        if any('attn' in layer for layer in down_blocks[i]):
-            num_attn = len({'.'.join(shave_segments(layer, 3).split('.')[:3]) for layer in down_blocks[i] if 'attn' in layer})
-            attns = {layer_id: [key for key in down_blocks[i] if f'attn.{layer_id}' in key] for layer_id in range(num_blocks)}
+        if any("attn" in layer for layer in down_blocks[i]):
+            num_attn = len(
+                {".".join(shave_segments(layer, 3).split(".")[:3]) for layer in down_blocks[i] if "attn" in layer}
+            )
+            attns = {
+                layer_id: [key for key in down_blocks[i] if f"attn.{layer_id}" in key]
+                for layer_id in range(num_blocks)
+            }

            if num_attn > 0:
-                for j in range(config['layers_per_block']):
+                for j in range(config["layers_per_block"]):
                    paths = renew_attention_paths(attns[j])
                    assign_to_checkpoint(paths, new_checkpoint, checkpoint, config=config)

@@ -254,48 +307,69 @@ def convert_vq_autoenc_checkpoint(checkpoint, config):

    # Mid new 2
    paths = renew_resnet_paths(mid_block_1_layers)
-    assign_to_checkpoint(paths, new_checkpoint, checkpoint, additional_replacements=[
-        {'old': 'mid.', 'new': 'mid_new_2.'}, {'old': 'block_1', 'new': 'resnets.0'}
-    ])
+    assign_to_checkpoint(
+        paths,
+        new_checkpoint,
+        checkpoint,
+        additional_replacements=[{"old": "mid.", "new": "mid_new_2."}, {"old": "block_1", "new": "resnets.0"}],
+    )

    paths = renew_resnet_paths(mid_block_2_layers)
-    assign_to_checkpoint(paths, new_checkpoint, checkpoint, additional_replacements=[
-        {'old': 'mid.', 'new': 'mid_new_2.'}, {'old': 'block_2', 'new': 'resnets.1'}
-    ])
+    assign_to_checkpoint(
+        paths,
+        new_checkpoint,
+        checkpoint,
+        additional_replacements=[{"old": "mid.", "new": "mid_new_2."}, {"old": "block_2", "new": "resnets.1"}],
+    )

    paths = renew_attention_paths(mid_attn_1_layers, in_mid=True)
-    assign_to_checkpoint(paths, new_checkpoint, checkpoint, additional_replacements=[
-        {'old': 'mid.', 'new': 'mid_new_2.'}, {'old': 'attn_1', 'new': 'attentions.0'}
-    ])
+    assign_to_checkpoint(
+        paths,
+        new_checkpoint,
+        checkpoint,
+        additional_replacements=[{"old": "mid.", "new": "mid_new_2."}, {"old": "attn_1", "new": "attentions.0"}],
+    )

    for i in range(num_up_blocks):
        block_id = num_up_blocks - 1 - i

-        if any('upsample' in layer for layer in up_blocks[i]):
-            new_checkpoint[f'decoder.up_blocks.{block_id}.upsamplers.0.conv.weight'] = checkpoint[f'decoder.up.{i}.upsample.conv.weight']
-            new_checkpoint[f'decoder.up_blocks.{block_id}.upsamplers.0.conv.bias'] = checkpoint[f'decoder.up.{i}.upsample.conv.bias']
+        if any("upsample" in layer for layer in up_blocks[i]):
+            new_checkpoint[f"decoder.up_blocks.{block_id}.upsamplers.0.conv.weight"] = checkpoint[
+                f"decoder.up.{i}.upsample.conv.weight"
+            ]
+            new_checkpoint[f"decoder.up_blocks.{block_id}.upsamplers.0.conv.bias"] = checkpoint[
+                f"decoder.up.{i}.upsample.conv.bias"
+            ]

-        if any('block' in layer for layer in up_blocks[i]):
-            num_blocks = len({'.'.join(shave_segments(layer, 3).split('.')[:3]) for layer in up_blocks[i] if 'block' in layer})
-            blocks = {layer_id: [key for key in up_blocks[i] if f'block.{layer_id}' in key] for layer_id in range(num_blocks)}
+        if any("block" in layer for layer in up_blocks[i]):
+            num_blocks = len(
+                {".".join(shave_segments(layer, 3).split(".")[:3]) for layer in up_blocks[i] if "block" in layer}
+            )
+            blocks = {
+                layer_id: [key for key in up_blocks[i] if f"block.{layer_id}" in key] for layer_id in range(num_blocks)
+            }

            if num_blocks > 0:
-                for j in range(config['layers_per_block'] + 1):
-                    replace_indices = {'old': f'up_blocks.{i}', 'new': f'up_blocks.{block_id}'}
+                for j in range(config["layers_per_block"] + 1):
+                    replace_indices = {"old": f"up_blocks.{i}", "new": f"up_blocks.{block_id}"}
                    paths = renew_resnet_paths(blocks[j])
                    assign_to_checkpoint(paths, new_checkpoint, checkpoint, additional_replacements=[replace_indices])

-        if any('attn' in layer for layer in up_blocks[i]):
-            num_attn = len({'.'.join(shave_segments(layer, 3).split('.')[:3]) for layer in up_blocks[i] if 'attn' in layer})
-            attns = {layer_id: [key for key in up_blocks[i] if f'attn.{layer_id}' in key] for layer_id in range(num_blocks)}
+        if any("attn" in layer for layer in up_blocks[i]):
+            num_attn = len(
+                {".".join(shave_segments(layer, 3).split(".")[:3]) for layer in up_blocks[i] if "attn" in layer}
+            )
+            attns = {
+                layer_id: [key for key in up_blocks[i] if f"attn.{layer_id}" in key] for layer_id in range(num_blocks)
+            }

            if num_attn > 0:
-                for j in range(config['layers_per_block'] + 1):
-                    replace_indices = {'old': f'up_blocks.{i}', 'new': f'up_blocks.{block_id}'}
+                for j in range(config["layers_per_block"] + 1):
+                    replace_indices = {"old": f"up_blocks.{i}", "new": f"up_blocks.{block_id}"}
                    paths = renew_attention_paths(attns[j])
                    assign_to_checkpoint(paths, new_checkpoint, checkpoint, additional_replacements=[replace_indices])

-    new_checkpoint = {k.replace('mid_new_2', 'mid_block'): v for k, v in new_checkpoint.items()}
+    new_checkpoint = {k.replace("mid_new_2", "mid_block"): v for k, v in new_checkpoint.items()}
    new_checkpoint["quant_conv.weight"] = checkpoint["quant_conv.weight"]
    new_checkpoint["quant_conv.bias"] = checkpoint["quant_conv.bias"]
    if "quantize.embedding.weight" in checkpoint:
@@ -321,9 +395,7 @@ if __name__ == "__main__":
        help="The config json file corresponding to the architecture.",
    )

-    parser.add_argument(
-        "--dump_path", default=None, type=str, required=True, help="Path to the output model."
-    )
+    parser.add_argument("--dump_path", default=None, type=str, required=True, help="Path to the output model.")

    args = parser.parse_args()
    checkpoint = torch.load(args.checkpoint_path)
--- a/scripts/convert_diffusers_to_original_stable_diffusion.py
+++ b/scripts/convert_diffusers_to_original_stable_diffusion.py
@@ -0,0 +1,234 @@
+# Script for converting a HF Diffusers saved pipeline to a Stable Diffusion checkpoint.
+# *Only* converts the UNet, VAE, and Text Encoder.
+# Does not convert optimizer state or any other thing.
+
+import argparse
+import os.path as osp
+
+import torch
+
+
+# =================#
+# UNet Conversion #
+# =================#
+
+# Whole-name renames for the UNet parameters that live outside the down/mid/up
+# blocks (time embedding, input conv, output norm and output conv).
+# convert_unet_state_dict maps each HF Diffusers name (second column) to the
+# Stable Diffusion name (first column).
+unet_conversion_map = [
+    # (stable-diffusion, HF Diffusers)
+    ("time_embed.0.weight", "time_embedding.linear_1.weight"),
+    ("time_embed.0.bias", "time_embedding.linear_1.bias"),
+    ("time_embed.2.weight", "time_embedding.linear_2.weight"),
+    ("time_embed.2.bias", "time_embedding.linear_2.bias"),
+    ("input_blocks.0.0.weight", "conv_in.weight"),
+    ("input_blocks.0.0.bias", "conv_in.bias"),
+    ("out.0.weight", "conv_norm_out.weight"),
+    ("out.0.bias", "conv_norm_out.bias"),
+    ("out.2.weight", "conv_out.weight"),
+    ("out.2.bias", "conv_out.bias"),
+]
+
+# Substring renames for the layers inside a resnet block.
+# convert_unet_state_dict applies these (HF part -> SD part) only to keys whose
+# HF name contains "resnets".
+unet_conversion_map_resnet = [
+    # (stable-diffusion, HF Diffusers)
+    ("in_layers.0", "norm1"),
+    ("in_layers.2", "conv1"),
+    ("out_layers.0", "norm2"),
+    ("out_layers.3", "conv2"),
+    ("emb_layers.1", "time_emb_proj"),
+    ("skip_connection", "conv_shortcut"),
+]
+
+# Prefix pairs (stable-diffusion, HF Diffusers) for every UNet sub-block.
+# convert_unet_state_dict replaces each HF prefix with its SD counterpart,
+# in the order the pairs are appended below.
+unet_conversion_map_layer = []
+# hardcoded number of downblocks and resnets/attentions...
+# would need smarter logic for other networks.
+for i in range(4):
+    # loop over downblocks/upblocks
+
+    for j in range(2):
+        # loop over resnets/attentions for downblocks
+        # SD index is 3*i + j + 1: each down block takes three consecutive
+        # input_blocks slots (two resnet/attention slots, then the downsampler
+        # at 3*(i+1)), shifted by one because input_blocks.0 is conv_in.
+        hf_down_res_prefix = f"down_blocks.{i}.resnets.{j}."
+        sd_down_res_prefix = f"input_blocks.{3*i + j + 1}.0."
+        unet_conversion_map_layer.append((sd_down_res_prefix, hf_down_res_prefix))
+
+        if i < 3:
+            # no attention layers in down_blocks.3
+            # the attention shares the resnet's input_blocks slot, at sub-index 1
+            hf_down_atn_prefix = f"down_blocks.{i}.attentions.{j}."
+            sd_down_atn_prefix = f"input_blocks.{3*i + j + 1}.1."
+            unet_conversion_map_layer.append((sd_down_atn_prefix, hf_down_atn_prefix))
+
+    for j in range(3):
+        # loop over resnets/attentions for upblocks
+        # output_blocks are numbered from 0, three slots per up block
+        hf_up_res_prefix = f"up_blocks.{i}.resnets.{j}."
+        sd_up_res_prefix = f"output_blocks.{3*i + j}.0."
+        unet_conversion_map_layer.append((sd_up_res_prefix, hf_up_res_prefix))
+
+        if i > 0:
+            # no attention layers in up_blocks.0
+            hf_up_atn_prefix = f"up_blocks.{i}.attentions.{j}."
+            sd_up_atn_prefix = f"output_blocks.{3*i + j}.1."
+            unet_conversion_map_layer.append((sd_up_atn_prefix, hf_up_atn_prefix))
+
+    if i < 3:
+        # no downsample in down_blocks.3
+        hf_downsample_prefix = f"down_blocks.{i}.downsamplers.0.conv."
+        sd_downsample_prefix = f"input_blocks.{3*(i+1)}.0.op."
+        unet_conversion_map_layer.append((sd_downsample_prefix, hf_downsample_prefix))
+
+        # no upsample in up_blocks.3
+        # the upsampler sits in the last slot of up block i (3*i + 2); its
+        # sub-index is 1 for up_blocks.0 (no attention occupies sub-index 1
+        # there) and 2 for the other blocks
+        hf_upsample_prefix = f"up_blocks.{i}.upsamplers.0."
+        sd_upsample_prefix = f"output_blocks.{3*i + 2}.{1 if i == 0 else 2}."
+        unet_conversion_map_layer.append((sd_upsample_prefix, hf_upsample_prefix))
+
+# the single mid-block attention maps to middle_block.1
+hf_mid_atn_prefix = "mid_block.attentions.0."
+sd_mid_atn_prefix = "middle_block.1."
+unet_conversion_map_layer.append((sd_mid_atn_prefix, hf_mid_atn_prefix))
+
+# the two mid-block resnets map to middle_block.0 and middle_block.2
+for j in range(2):
+    hf_mid_res_prefix = f"mid_block.resnets.{j}."
+    sd_mid_res_prefix = f"middle_block.{2*j}."
+    unet_conversion_map_layer.append((sd_mid_res_prefix, hf_mid_res_prefix))
+
+
+def convert_unet_state_dict(unet_state_dict):
+    """Rename the keys of an HF Diffusers UNet state dict to Stable Diffusion names.
+
+    Args:
+        unet_state_dict: mapping of HF Diffusers parameter name -> value.
+
+    Returns:
+        A new dict holding the same values under their Stable Diffusion names.
+
+    Raises:
+        KeyError: if a name listed in ``unet_conversion_map`` is missing from
+            ``unet_state_dict``.
+    """
+    # buyer beware: the renaming is *brittle* -- the passes below must run in
+    # exactly this order (whole-name overrides, resnet parts, block prefixes).
+    sd_name_for = {hf_key: hf_key for hf_key in unet_state_dict}
+    for sd_name, hf_name in unet_conversion_map:
+        sd_name_for[hf_name] = sd_name
+
+    def rename(hf_key, sd_key):
+        # Resnet-internal layer names are only rewritten for resnet parameters.
+        if "resnets" in hf_key:
+            for sd_part, hf_part in unet_conversion_map_resnet:
+                sd_key = sd_key.replace(hf_part, sd_part)
+        # Block prefixes are rewritten for every parameter.
+        for sd_part, hf_part in unet_conversion_map_layer:
+            sd_key = sd_key.replace(hf_part, sd_part)
+        return sd_key
+
+    return {rename(hf_key, sd_key): unet_state_dict[hf_key] for hf_key, sd_key in sd_name_for.items()}
+
+
+# ================#
+# VAE Conversion #
+# ================#
+
+# Substring pairs (stable-diffusion, HF Diffusers) for the VAE.
+# convert_vae_state_dict replaces each HF substring with its SD counterpart,
+# in list order, in every key.
+vae_conversion_map = [
+    # (stable-diffusion, HF Diffusers)
+    ("nin_shortcut", "conv_shortcut"),
+    ("norm_out", "conv_norm_out"),
+    ("mid.attn_1.", "mid_block.attentions.0."),
+]
+
+for i in range(4):
+    # down_blocks have two resnets
+    for j in range(2):
+        hf_down_prefix = f"encoder.down_blocks.{i}.resnets.{j}."
+        sd_down_prefix = f"encoder.down.{i}.block.{j}."
+        vae_conversion_map.append((sd_down_prefix, hf_down_prefix))
+
+    if i < 3:
+        # samplers are only mapped for the first three blocks; these pairs carry
+        # no "encoder."/"decoder." prefix and are matched as plain substrings
+        hf_downsample_prefix = f"down_blocks.{i}.downsamplers.0."
+        sd_downsample_prefix = f"down.{i}.downsample."
+        vae_conversion_map.append((sd_downsample_prefix, hf_downsample_prefix))
+
+        # upsampler index is reversed (3 - i), like the up-block resnets below
+        hf_upsample_prefix = f"up_blocks.{i}.upsamplers.0."
+        sd_upsample_prefix = f"up.{3-i}.upsample."
+        vae_conversion_map.append((sd_upsample_prefix, hf_upsample_prefix))
+
+    # up_blocks have three resnets
+    # also, up blocks in hf are numbered in reverse from sd
+    for j in range(3):
+        hf_up_prefix = f"decoder.up_blocks.{i}.resnets.{j}."
+        sd_up_prefix = f"decoder.up.{3-i}.block.{j}."
+        vae_conversion_map.append((sd_up_prefix, hf_up_prefix))
+
+# this part accounts for mid blocks in both the encoder and the decoder
+# (HF resnets are 0-based, SD block_N names are 1-based)
+for i in range(2):
+    hf_mid_res_prefix = f"mid_block.resnets.{i}."
+    sd_mid_res_prefix = f"mid.block_{i+1}."
+    vae_conversion_map.append((sd_mid_res_prefix, hf_mid_res_prefix))
+
+
+# Substring renames for VAE attention parameters. convert_vae_state_dict applies
+# these (HF part -> SD part) only to keys whose HF name contains "attentions".
+vae_conversion_map_attn = [
+    # (stable-diffusion, HF Diffusers)
+    ("norm.", "group_norm."),
+    ("q.", "query."),
+    ("k.", "key."),
+    ("v.", "value."),
+    ("proj_out.", "proj_attn."),
+]
+
+
+def reshape_weight_for_sd(w):
+    """Return ``w`` reshaped with two trailing singleton dimensions.
+
+    Used to turn HF linear weights into the conv2d weight layout used by SD.
+    """
+    sd_shape = (*w.shape, 1, 1)
+    return w.reshape(sd_shape)
+
+
+def convert_vae_state_dict(vae_state_dict):
+    """Rename the keys of an HF Diffusers VAE state dict to Stable Diffusion names.
+
+    The mid-block attention q/k/v/proj_out weights are additionally reshaped
+    with ``reshape_weight_for_sd``; a message is printed for each one.
+
+    Args:
+        vae_state_dict: mapping of HF Diffusers parameter name -> tensor.
+
+    Returns:
+        A new dict keyed by Stable Diffusion parameter names.
+    """
+
+    def rename(hf_key):
+        sd_key = hf_key
+        for sd_part, hf_part in vae_conversion_map:
+            sd_key = sd_key.replace(hf_part, sd_part)
+        # Attention-internal names are only rewritten for attention parameters.
+        if "attentions" in hf_key:
+            for sd_part, hf_part in vae_conversion_map_attn:
+                sd_key = sd_key.replace(hf_part, sd_part)
+        return sd_key
+
+    converted = {rename(hf_key): tensor for hf_key, tensor in vae_state_dict.items()}
+
+    # Only existing keys are reassigned below, so the dict size never changes
+    # while it is being iterated.
+    for sd_key, tensor in converted.items():
+        for weight_name in ("q", "k", "v", "proj_out"):
+            if f"mid.attn_1.{weight_name}.weight" in sd_key:
+                print(f"Reshaping {sd_key} for SD format")
+                converted[sd_key] = reshape_weight_for_sd(tensor)
+    return converted
+
+
+# =========================#
+# Text Encoder Conversion #
+# =========================#
+# pretty much a no-op
+
+
+def convert_text_enc_state_dict(text_enc_dict):
+    """Return the text encoder state dict unchanged (the same object, not a copy)."""
+    converted = text_enc_dict
+    return converted
+
+
+if __name__ == "__main__":
+    # Command-line entry point: load the UNet, VAE and text encoder weights of a
+    # saved HF Diffusers pipeline, rename them, and save them together as one
+    # Stable Diffusion checkpoint.
+    cli = argparse.ArgumentParser()
+    cli.add_argument("--model_path", default=None, type=str, required=True, help="Path to the model to convert.")
+    cli.add_argument("--checkpoint_path", default=None, type=str, required=True, help="Path to the output model.")
+    cli.add_argument("--half", action="store_true", help="Save weights in half precision.")
+    args = cli.parse_args()
+
+    assert args.model_path is not None, "Must provide a model path!"
+    assert args.checkpoint_path is not None, "Must provide a checkpoint path!"
+
+    def load_component(subfolder, filename):
+        # Weights are read from <model_path>/<subfolder>/<filename> onto the CPU.
+        return torch.load(osp.join(args.model_path, subfolder, filename), map_location="cpu")
+
+    # (SD key prefix, pipeline subfolder, weights file, converter), in merge order:
+    # a later component overwrites an earlier one if key names ever collide.
+    components = [
+        ("model.diffusion_model.", "unet", "diffusion_pytorch_model.bin", convert_unet_state_dict),
+        ("first_stage_model.", "vae", "diffusion_pytorch_model.bin", convert_vae_state_dict),
+        ("cond_stage_model.transformer.", "text_encoder", "pytorch_model.bin", convert_text_enc_state_dict),
+    ]
+
+    # Put together new checkpoint
+    state_dict = {}
+    for prefix, subfolder, filename, convert in components:
+        converted = convert(load_component(subfolder, filename))
+        state_dict.update({prefix + name: tensor for name, tensor in converted.items()})
+
+    if args.half:
+        state_dict = {name: tensor.half() for name, tensor in state_dict.items()}
+    torch.save({"state_dict": state_dict}, args.checkpoint_path)
--- a/scripts/convert_ldm_original_checkpoint_to_diffusers.py
+++ b/scripts/convert_ldm_original_checkpoint_to_diffusers.py
@@ -16,8 +16,10 @@

 import argparse
 import json
+
 import torch
-from diffusers import VQModel, DDPMScheduler, UNet2DModel, LDMPipeline
+
+from diffusers import DDPMScheduler, LDMPipeline, UNet2DModel, VQModel


 def shave_segments(path, n_shave_prefix_segments=1):
@@ -25,9 +27,9 @@ def shave_segments(path, n_shave_prefix_segments=1):
    Removes segments. Positive values shave the first segments, negative shave the last segments.
    """
    if n_shave_prefix_segments >= 0:
-        return '.'.join(path.split('.')[n_shave_prefix_segments:])
+        return ".".join(path.split(".")[n_shave_prefix_segments:])
    else:
-        return '.'.join(path.split('.')[:n_shave_prefix_segments])
+        return ".".join(path.split(".")[:n_shave_prefix_segments])


 def renew_resnet_paths(old_list, n_shave_prefix_segments=0):
@@ -36,18 +38,18 @@ def renew_resnet_paths(old_list, n_shave_prefix_segments=0):
    """
    mapping = []
    for old_item in old_list:
-        new_item = old_item.replace('in_layers.0', 'norm1')
-        new_item = new_item.replace('in_layers.2', 'conv1')
+        new_item = old_item.replace("in_layers.0", "norm1")
+        new_item = new_item.replace("in_layers.2", "conv1")

-        new_item = new_item.replace('out_layers.0', 'norm2')
-        new_item = new_item.replace('out_layers.3', 'conv2')
+        new_item = new_item.replace("out_layers.0", "norm2")
+        new_item = new_item.replace("out_layers.3", "conv2")

-        new_item = new_item.replace('emb_layers.1', 'time_emb_proj')
-        new_item = new_item.replace('skip_connection', 'conv_shortcut')
+        new_item = new_item.replace("emb_layers.1", "time_emb_proj")
+        new_item = new_item.replace("skip_connection", "conv_shortcut")

        new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)

-        mapping.append({'old': old_item, 'new': new_item})
+        mapping.append({"old": old_item, "new": new_item})

    return mapping

@@ -60,20 +62,22 @@ def renew_attention_paths(old_list, n_shave_prefix_segments=0):
    for old_item in old_list:
        new_item = old_item

-        new_item = new_item.replace('norm.weight', 'group_norm.weight')
-        new_item = new_item.replace('norm.bias', 'group_norm.bias')
+        new_item = new_item.replace("norm.weight", "group_norm.weight")
+        new_item = new_item.replace("norm.bias", "group_norm.bias")

-        new_item = new_item.replace('proj_out.weight', 'proj_attn.weight')
-        new_item = new_item.replace('proj_out.bias', 'proj_attn.bias')
+        new_item = new_item.replace("proj_out.weight", "proj_attn.weight")
+        new_item = new_item.replace("proj_out.bias", "proj_attn.bias")

        new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)

-        mapping.append({'old': old_item, 'new': new_item})
+        mapping.append({"old": old_item, "new": new_item})

    return mapping


-def assign_to_checkpoint(paths, checkpoint, old_checkpoint, attention_paths_to_split=None, additional_replacements=None, config=None):
+def assign_to_checkpoint(
+    paths, checkpoint, old_checkpoint, attention_paths_to_split=None, additional_replacements=None, config=None
+):
    """
    This does the final conversion step: take locally converted weights and apply a global renaming
    to them. It splits attention layers, and takes into account additional replacements
@@ -96,31 +100,31 @@ def assign_to_checkpoint(paths, checkpoint, old_checkpoint, attention_paths_to_s
            old_tensor = old_tensor.reshape((num_heads, 3 * channels // num_heads) + old_tensor.shape[1:])
            query, key, value = old_tensor.split(channels // num_heads, dim=1)

-            checkpoint[path_map['query']] = query.reshape(target_shape)
-            checkpoint[path_map['key']] = key.reshape(target_shape)
-            checkpoint[path_map['value']] = value.reshape(target_shape)
+            checkpoint[path_map["query"]] = query.reshape(target_shape)
+            checkpoint[path_map["key"]] = key.reshape(target_shape)
+            checkpoint[path_map["value"]] = value.reshape(target_shape)

    for path in paths:
-        new_path = path['new']
+        new_path = path["new"]

        # These have already been assigned
        if attention_paths_to_split is not None and new_path in attention_paths_to_split:
            continue

        # Global renaming happens here
-        new_path = new_path.replace('middle_block.0', 'mid.resnets.0')
-        new_path = new_path.replace('middle_block.1', 'mid.attentions.0')
-        new_path = new_path.replace('middle_block.2', 'mid.resnets.1')
+        new_path = new_path.replace("middle_block.0", "mid.resnets.0")
+        new_path = new_path.replace("middle_block.1", "mid.attentions.0")
+        new_path = new_path.replace("middle_block.2", "mid.resnets.1")

        if additional_replacements is not None:
            for replacement in additional_replacements:
-                new_path = new_path.replace(replacement['old'], replacement['new'])
+                new_path = new_path.replace(replacement["old"], replacement["new"])

        # proj_attn.weight has to be converted from conv 1D to linear
        if "proj_attn.weight" in new_path:
-            checkpoint[new_path] = old_checkpoint[path['old']][:, :, 0]
+            checkpoint[new_path] = old_checkpoint[path["old"]][:, :, 0]
        else:
-            checkpoint[new_path] = old_checkpoint[path['old']]
+            checkpoint[new_path] = old_checkpoint[path["old"]]


 def convert_ldm_checkpoint(checkpoint, config):
@@ -129,60 +133,78 @@ def convert_ldm_checkpoint(checkpoint, config):
    """
    new_checkpoint = {}

-    new_checkpoint['time_embedding.linear_1.weight'] = checkpoint['time_embed.0.weight']
-    new_checkpoint['time_embedding.linear_1.bias'] = checkpoint['time_embed.0.bias']
-    new_checkpoint['time_embedding.linear_2.weight'] = checkpoint['time_embed.2.weight']
-    new_checkpoint['time_embedding.linear_2.bias'] = checkpoint['time_embed.2.bias']
+    new_checkpoint["time_embedding.linear_1.weight"] = checkpoint["time_embed.0.weight"]
+    new_checkpoint["time_embedding.linear_1.bias"] = checkpoint["time_embed.0.bias"]
+    new_checkpoint["time_embedding.linear_2.weight"] = checkpoint["time_embed.2.weight"]
+    new_checkpoint["time_embedding.linear_2.bias"] = checkpoint["time_embed.2.bias"]

-    new_checkpoint['conv_in.weight'] = checkpoint['input_blocks.0.0.weight']
-    new_checkpoint['conv_in.bias'] = checkpoint['input_blocks.0.0.bias']
+    new_checkpoint["conv_in.weight"] = checkpoint["input_blocks.0.0.weight"]
+    new_checkpoint["conv_in.bias"] = checkpoint["input_blocks.0.0.bias"]

-    new_checkpoint['conv_norm_out.weight'] = checkpoint['out.0.weight']
-    new_checkpoint['conv_norm_out.bias'] = checkpoint['out.0.bias']
-    new_checkpoint['conv_out.weight'] = checkpoint['out.2.weight']
-    new_checkpoint['conv_out.bias'] = checkpoint['out.2.bias']
+    new_checkpoint["conv_norm_out.weight"] = checkpoint["out.0.weight"]
+    new_checkpoint["conv_norm_out.bias"] = checkpoint["out.0.bias"]
+    new_checkpoint["conv_out.weight"] = checkpoint["out.2.weight"]
+    new_checkpoint["conv_out.bias"] = checkpoint["out.2.bias"]

    # Retrieves the keys for the input blocks only
-    num_input_blocks = len({'.'.join(layer.split('.')[:2]) for layer in checkpoint if 'input_blocks' in layer})
-    input_blocks = {layer_id: [key for key in checkpoint if f'input_blocks.{layer_id}' in key] for layer_id in range(num_input_blocks)}
+    num_input_blocks = len({".".join(layer.split(".")[:2]) for layer in checkpoint if "input_blocks" in layer})
+    input_blocks = {
+        layer_id: [key for key in checkpoint if f"input_blocks.{layer_id}" in key]
+        for layer_id in range(num_input_blocks)
+    }

    # Retrieves the keys for the middle blocks only
-    num_middle_blocks = len({'.'.join(layer.split('.')[:2]) for layer in checkpoint if 'middle_block' in layer})
-    middle_blocks = {layer_id: [key for key in checkpoint if f'middle_block.{layer_id}' in key] for layer_id in range(num_middle_blocks)}
+    num_middle_blocks = len({".".join(layer.split(".")[:2]) for layer in checkpoint if "middle_block" in layer})
+    middle_blocks = {
+        layer_id: [key for key in checkpoint if f"middle_block.{layer_id}" in key]
+        for layer_id in range(num_middle_blocks)
+    }

    # Retrieves the keys for the output blocks only
-    num_output_blocks = len({'.'.join(layer.split('.')[:2]) for layer in checkpoint if 'output_blocks' in layer})
-    output_blocks = {layer_id: [key for key in checkpoint if f'output_blocks.{layer_id}' in key] for layer_id in range(num_output_blocks)}
+    num_output_blocks = len({".".join(layer.split(".")[:2]) for layer in checkpoint if "output_blocks" in layer})
+    output_blocks = {
+        layer_id: [key for key in checkpoint if f"output_blocks.{layer_id}" in key]
+        for layer_id in range(num_output_blocks)
+    }

    for i in range(1, num_input_blocks):
-        block_id = (i - 1) // (config['num_res_blocks'] + 1)
-        layer_in_block_id = (i - 1) % (config['num_res_blocks'] + 1)
+        block_id = (i - 1) // (config["num_res_blocks"] + 1)
+        layer_in_block_id = (i - 1) % (config["num_res_blocks"] + 1)

-        resnets = [key for key in input_blocks[i] if f'input_blocks.{i}.0' in key]
-        attentions = [key for key in input_blocks[i] if f'input_blocks.{i}.1' in key]
+        resnets = [key for key in input_blocks[i] if f"input_blocks.{i}.0" in key]
+        attentions = [key for key in input_blocks[i] if f"input_blocks.{i}.1" in key]

-        if f'input_blocks.{i}.0.op.weight' in checkpoint:
-            new_checkpoint[f'downsample_blocks.{block_id}.downsamplers.0.conv.weight'] = checkpoint[f'input_blocks.{i}.0.op.weight']
-            new_checkpoint[f'downsample_blocks.{block_id}.downsamplers.0.conv.bias'] = checkpoint[f'input_blocks.{i}.0.op.bias']
+        if f"input_blocks.{i}.0.op.weight" in checkpoint:
+            new_checkpoint[f"downsample_blocks.{block_id}.downsamplers.0.conv.weight"] = checkpoint[
+                f"input_blocks.{i}.0.op.weight"
+            ]
+            new_checkpoint[f"downsample_blocks.{block_id}.downsamplers.0.conv.bias"] = checkpoint[
+                f"input_blocks.{i}.0.op.bias"
+            ]

        paths = renew_resnet_paths(resnets)
-        meta_path = {'old': f'input_blocks.{i}.0', 'new': f'downsample_blocks.{block_id}.resnets.{layer_in_block_id}'}
-        resnet_op = {'old': 'resnets.2.op', 'new': 'downsamplers.0.op'}
-        assign_to_checkpoint(paths, new_checkpoint, checkpoint, additional_replacements=[meta_path, resnet_op], config=config)
+        meta_path = {"old": f"input_blocks.{i}.0", "new": f"downsample_blocks.{block_id}.resnets.{layer_in_block_id}"}
+        resnet_op = {"old": "resnets.2.op", "new": "downsamplers.0.op"}
+        assign_to_checkpoint(
+            paths, new_checkpoint, checkpoint, additional_replacements=[meta_path, resnet_op], config=config
+        )

        if len(attentions):
            paths = renew_attention_paths(attentions)
-            meta_path = {'old': f'input_blocks.{i}.1', 'new': f'downsample_blocks.{block_id}.attentions.{layer_in_block_id}'}
+            meta_path = {
+                "old": f"input_blocks.{i}.1",
+                "new": f"downsample_blocks.{block_id}.attentions.{layer_in_block_id}",
+            }
            to_split = {
-                f'input_blocks.{i}.1.qkv.bias': {
-                    'key': f'downsample_blocks.{block_id}.attentions.{layer_in_block_id}.key.bias',
-                    'query': f'downsample_blocks.{block_id}.attentions.{layer_in_block_id}.query.bias',
-                    'value': f'downsample_blocks.{block_id}.attentions.{layer_in_block_id}.value.bias',
+                f"input_blocks.{i}.1.qkv.bias": {
+                    "key": f"downsample_blocks.{block_id}.attentions.{layer_in_block_id}.key.bias",
+                    "query": f"downsample_blocks.{block_id}.attentions.{layer_in_block_id}.query.bias",
+                    "value": f"downsample_blocks.{block_id}.attentions.{layer_in_block_id}.value.bias",
                },
-                f'input_blocks.{i}.1.qkv.weight': {
-                    'key': f'downsample_blocks.{block_id}.attentions.{layer_in_block_id}.key.weight',
-                    'query': f'downsample_blocks.{block_id}.attentions.{layer_in_block_id}.query.weight',
-                    'value': f'downsample_blocks.{block_id}.attentions.{layer_in_block_id}.value.weight',
+                f"input_blocks.{i}.1.qkv.weight": {
+                    "key": f"downsample_blocks.{block_id}.attentions.{layer_in_block_id}.key.weight",
+                    "query": f"downsample_blocks.{block_id}.attentions.{layer_in_block_id}.query.weight",
+                    "value": f"downsample_blocks.{block_id}.attentions.{layer_in_block_id}.value.weight",
                },
            }
            assign_to_checkpoint(
@@ -191,7 +213,7 @@ def convert_ldm_checkpoint(checkpoint, config):
                checkpoint,
                additional_replacements=[meta_path],
                attention_paths_to_split=to_split,
-                config=config
+                config=config,
            )

    resnet_0 = middle_blocks[0]
@@ -206,46 +228,52 @@ def convert_ldm_checkpoint(checkpoint, config):

    attentions_paths = renew_attention_paths(attentions)
    to_split = {
-        'middle_block.1.qkv.bias': {
-            'key': 'mid_block.attentions.0.key.bias',
-            'query': 'mid_block.attentions.0.query.bias',
-            'value': 'mid_block.attentions.0.value.bias',
+        "middle_block.1.qkv.bias": {
+            "key": "mid_block.attentions.0.key.bias",
+            "query": "mid_block.attentions.0.query.bias",
+            "value": "mid_block.attentions.0.value.bias",
        },
-        'middle_block.1.qkv.weight': {
-            'key': 'mid_block.attentions.0.key.weight',
-            'query': 'mid_block.attentions.0.query.weight',
-            'value': 'mid_block.attentions.0.value.weight',
+        "middle_block.1.qkv.weight": {
+            "key": "mid_block.attentions.0.key.weight",
+            "query": "mid_block.attentions.0.query.weight",
+            "value": "mid_block.attentions.0.value.weight",
        },
    }
-    assign_to_checkpoint(attentions_paths, new_checkpoint, checkpoint, attention_paths_to_split=to_split, config=config)
+    assign_to_checkpoint(
+        attentions_paths, new_checkpoint, checkpoint, attention_paths_to_split=to_split, config=config
+    )

    for i in range(num_output_blocks):
-        block_id = i // (config['num_res_blocks'] + 1)
-        layer_in_block_id = i % (config['num_res_blocks'] + 1)
+        block_id = i // (config["num_res_blocks"] + 1)
+        layer_in_block_id = i % (config["num_res_blocks"] + 1)
        output_block_layers = [shave_segments(name, 2) for name in output_blocks[i]]
        output_block_list = {}

        for layer in output_block_layers:
-            layer_id, layer_name = layer.split('.')[0], shave_segments(layer, 1)
+            layer_id, layer_name = layer.split(".")[0], shave_segments(layer, 1)
            if layer_id in output_block_list:
                output_block_list[layer_id].append(layer_name)
            else:
                output_block_list[layer_id] = [layer_name]

        if len(output_block_list) > 1:
-            resnets = [key for key in output_blocks[i] if f'output_blocks.{i}.0' in key]
-            attentions = [key for key in output_blocks[i] if f'output_blocks.{i}.1' in key]
+            resnets = [key for key in output_blocks[i] if f"output_blocks.{i}.0" in key]
+            attentions = [key for key in output_blocks[i] if f"output_blocks.{i}.1" in key]

            resnet_0_paths = renew_resnet_paths(resnets)
            paths = renew_resnet_paths(resnets)

-            meta_path = {'old': f'output_blocks.{i}.0', 'new': f'up_blocks.{block_id}.resnets.{layer_in_block_id}'}
+            meta_path = {"old": f"output_blocks.{i}.0", "new": f"up_blocks.{block_id}.resnets.{layer_in_block_id}"}
            assign_to_checkpoint(paths, new_checkpoint, checkpoint, additional_replacements=[meta_path], config=config)

-            if ['conv.weight', 'conv.bias'] in output_block_list.values():
-                index = list(output_block_list.values()).index(['conv.weight', 'conv.bias'])
-                new_checkpoint[f'up_blocks.{block_id}.upsamplers.0.conv.weight'] = checkpoint[f'output_blocks.{i}.{index}.conv.weight']
-                new_checkpoint[f'up_blocks.{block_id}.upsamplers.0.conv.bias'] = checkpoint[f'output_blocks.{i}.{index}.conv.bias']
+            if ["conv.weight", "conv.bias"] in output_block_list.values():
+                index = list(output_block_list.values()).index(["conv.weight", "conv.bias"])
+                new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.weight"] = checkpoint[
+                    f"output_blocks.{i}.{index}.conv.weight"
+                ]
+                new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.bias"] = checkpoint[
+                    f"output_blocks.{i}.{index}.conv.bias"
+                ]

                # Clear attentions as they have been attributed above.
                if len(attentions) == 2:
@@ -254,19 +282,19 @@ def convert_ldm_checkpoint(checkpoint, config):
            if len(attentions):
                paths = renew_attention_paths(attentions)
                meta_path = {
-                    'old': f'output_blocks.{i}.1',
-                    'new': f'up_blocks.{block_id}.attentions.{layer_in_block_id}'
+                    "old": f"output_blocks.{i}.1",
+                    "new": f"up_blocks.{block_id}.attentions.{layer_in_block_id}",
                }
                to_split = {
-                    f'output_blocks.{i}.1.qkv.bias': {
-                        'key': f'up_blocks.{block_id}.attentions.{layer_in_block_id}.key.bias',
-                        'query': f'up_blocks.{block_id}.attentions.{layer_in_block_id}.query.bias',
-                        'value': f'up_blocks.{block_id}.attentions.{layer_in_block_id}.value.bias',
+                    f"output_blocks.{i}.1.qkv.bias": {
+                        "key": f"up_blocks.{block_id}.attentions.{layer_in_block_id}.key.bias",
+                        "query": f"up_blocks.{block_id}.attentions.{layer_in_block_id}.query.bias",
+                        "value": f"up_blocks.{block_id}.attentions.{layer_in_block_id}.value.bias",
                    },
-                    f'output_blocks.{i}.1.qkv.weight': {
-                        'key': f'up_blocks.{block_id}.attentions.{layer_in_block_id}.key.weight',
-                        'query': f'up_blocks.{block_id}.attentions.{layer_in_block_id}.query.weight',
-                        'value': f'up_blocks.{block_id}.attentions.{layer_in_block_id}.value.weight',
+                    f"output_blocks.{i}.1.qkv.weight": {
+                        "key": f"up_blocks.{block_id}.attentions.{layer_in_block_id}.key.weight",
+                        "query": f"up_blocks.{block_id}.attentions.{layer_in_block_id}.query.weight",
+                        "value": f"up_blocks.{block_id}.attentions.{layer_in_block_id}.value.weight",
                    },
                }
                assign_to_checkpoint(
@@ -274,14 +302,14 @@ def convert_ldm_checkpoint(checkpoint, config):
                    new_checkpoint,
                    checkpoint,
                    additional_replacements=[meta_path],
-                    attention_paths_to_split=to_split if any('qkv' in key for key in attentions) else None,
+                    attention_paths_to_split=to_split if any("qkv" in key for key in attentions) else None,
                    config=config,
                )
        else:
            resnet_0_paths = renew_resnet_paths(output_block_layers, n_shave_prefix_segments=1)
            for path in resnet_0_paths:
-                old_path = '.'.join(['output_blocks', str(i), path['old']])
-                new_path = '.'.join(['up_blocks', str(block_id), 'resnets', str(layer_in_block_id), path['new']])
+                old_path = ".".join(["output_blocks", str(i), path["old"]])
+                new_path = ".".join(["up_blocks", str(block_id), "resnets", str(layer_in_block_id), path["new"]])

                new_checkpoint[new_path] = checkpoint[old_path]

@@ -303,9 +331,7 @@ if __name__ == "__main__":
        help="The config json file corresponding to the architecture.",
    )

-    parser.add_argument(
-        "--dump_path", default=None, type=str, required=True, help="Path to the output model."
-    )
+    parser.add_argument("--dump_path", default=None, type=str, required=True, help="Path to the output model.")

    args = parser.parse_args()

--- a/scripts/convert_ncsnpp_original_checkpoint_to_diffusers.py
+++ b/scripts/convert_ncsnpp_original_checkpoint_to_diffusers.py
@@ -16,8 +16,10 @@

 import argparse
 import json
+
 import torch
-from diffusers import UNet2DModel
+
+from diffusers import ScoreSdeVePipeline, ScoreSdeVeScheduler, UNet2DModel


 def convert_ncsnpp_checkpoint(checkpoint, config):
--- a/scripts/convert_original_stable_diffusion_to_diffusers.py
+++ b/scripts/convert_original_stable_diffusion_to_diffusers.py
@@ -0,0 +1,706 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Conversion script for the LDM checkpoints. """
+
+import argparse
+import os
+
+import torch
+
+
+try:
+    from omegaconf import OmegaConf
+except ImportError:
+    raise ImportError(
+        "OmegaConf is required to convert the LDM checkpoints. Please install it with `pip install OmegaConf`."
+    )
+
+from diffusers import (
+    AutoencoderKL,
+    DDIMScheduler,
+    LDMTextToImagePipeline,
+    LMSDiscreteScheduler,
+    PNDMScheduler,
+    StableDiffusionPipeline,
+    UNet2DConditionModel,
+)
+from diffusers.pipelines.latent_diffusion.pipeline_latent_diffusion import LDMBertConfig, LDMBertModel
+from diffusers.pipelines.stable_diffusion import StableDiffusionSafetyChecker
+from transformers import AutoFeatureExtractor, BertTokenizerFast, CLIPTextModel, CLIPTokenizer
+
+
+def shave_segments(path, n_shave_prefix_segments=1):
+    """
+    Drops dot-separated segments from `path`: leading ones for a count >= 0, trailing ones for a negative count.
+    """
+    segments = path.split(".")
+    if n_shave_prefix_segments < 0:
+        return ".".join(segments[:n_shave_prefix_segments])
+    return ".".join(segments[n_shave_prefix_segments:])
+
+
+def renew_resnet_paths(old_list, n_shave_prefix_segments=0):
+    """
+    Maps each resnet key of `old_list` to its name in the new scheme (local renaming) as {"old", "new"} dicts.
+    """
+    renames = (
+        ("in_layers.0", "norm1"),
+        ("in_layers.2", "conv1"),
+        ("out_layers.0", "norm2"),
+        ("out_layers.3", "conv2"),
+        ("emb_layers.1", "time_emb_proj"),
+        ("skip_connection", "conv_shortcut"),
+    )
+    mapping = []
+    for old_item in old_list:
+        new_item = old_item
+        for before, after in renames:
+            new_item = new_item.replace(before, after)
+        new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)
+        mapping.append({"old": old_item, "new": new_item})
+    return mapping
+
+
+def renew_vae_resnet_paths(old_list, n_shave_prefix_segments=0):
+    """
+    Maps each VAE resnet key of `old_list` to its name in the new scheme (local renaming).
+    Returns a list of {"old", "new"} dicts in the order of `old_list`.
+    """
+    mapping = []
+    for old_item in old_list:
+        # The shortcut conv is the only layer renamed at this (local) level.
+        renamed = old_item.replace("nin_shortcut", "conv_shortcut")
+        mapping.append(
+            {"old": old_item, "new": shave_segments(renamed, n_shave_prefix_segments=n_shave_prefix_segments)}
+        )
+
+    return mapping
+
+
+def renew_attention_paths(old_list, n_shave_prefix_segments=0):
+    """
+    Builds the {"old", "new"} mapping for attention keys; with the renames below disabled, "new" equals "old".
+    """
+    mapping = []
+    for old_item in old_list:
+        new_item = old_item  # NOTE(review): n_shave_prefix_segments is only referenced in the disabled code
+
+        #         new_item = new_item.replace('norm.weight', 'group_norm.weight')
+        #         new_item = new_item.replace('norm.bias', 'group_norm.bias')
+
+        #         new_item = new_item.replace('proj_out.weight', 'proj_attn.weight')
+        #         new_item = new_item.replace('proj_out.bias', 'proj_attn.bias')
+
+        #         new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)
+
+        mapping.append({"old": old_item, "new": new_item})
+
+    return mapping
+
+
+def renew_vae_attention_paths(old_list, n_shave_prefix_segments=0):
+    """
+    Maps each VAE attention key of `old_list` to its name in the new scheme (local renaming).
+    """
+    # Substring renames, applied to every key in exactly this order.
+    renames = (
+        ("norm.weight", "group_norm.weight"),
+        ("norm.bias", "group_norm.bias"),
+        ("q.weight", "query.weight"),
+        ("q.bias", "query.bias"),
+        ("k.weight", "key.weight"),
+        ("k.bias", "key.bias"),
+        ("v.weight", "value.weight"),
+        ("v.bias", "value.bias"),
+        ("proj_out.weight", "proj_attn.weight"),
+        ("proj_out.bias", "proj_attn.bias"),
+    )
+
+    mapping = []
+    for old_item in old_list:
+        new_item = old_item
+        for before, after in renames:
+            new_item = new_item.replace(before, after)
+
+        new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)
+        mapping.append({"old": old_item, "new": new_item})
+
+    return mapping
+
+
+def assign_to_checkpoint(
+    paths, checkpoint, old_checkpoint, attention_paths_to_split=None, additional_replacements=None, config=None
+):
+    """
+    This does the final conversion step: for every {"old", "new"} entry of `paths`, a global renaming is applied
+    to the "new" name and the tensor stored under "old" in `old_checkpoint` is written to `checkpoint` under it.
+    Fused tensors listed in `attention_paths_to_split` are split into query, key and value entries first.
+
+    `additional_replacements` is a list of {"old", "new"} substring replacements applied after the built-in ones.
+    """
+    assert isinstance(paths, list), "Paths should be a list of dicts containing 'old' and 'new' keys."
+
+    # Splits each fused attention tensor into query, key and value (this reads config["num_head_channels"]).
+    if attention_paths_to_split is not None:
+        for path, path_map in attention_paths_to_split.items():
+            old_tensor = old_checkpoint[path]
+            channels = old_tensor.shape[0] // 3
+
+            target_shape = (-1, channels) if len(old_tensor.shape) == 3 else (-1)
+
+            num_heads = old_tensor.shape[0] // config["num_head_channels"] // 3
+
+            old_tensor = old_tensor.reshape((num_heads, 3 * channels // num_heads) + old_tensor.shape[1:])
+            query, key, value = old_tensor.split(channels // num_heads, dim=1)
+
+            checkpoint[path_map["query"]] = query.reshape(target_shape)
+            checkpoint[path_map["key"]] = key.reshape(target_shape)
+            checkpoint[path_map["value"]] = value.reshape(target_shape)
+
+    for path in paths:
+        new_path = path["new"]
+
+        # Already written by the split above; the check uses the name before any global renaming is applied
+        if attention_paths_to_split is not None and new_path in attention_paths_to_split:
+            continue
+
+        # Global renaming happens here
+        new_path = new_path.replace("middle_block.0", "mid_block.resnets.0")
+        new_path = new_path.replace("middle_block.1", "mid_block.attentions.0")
+        new_path = new_path.replace("middle_block.2", "mid_block.resnets.1")
+
+        if additional_replacements is not None:
+            for replacement in additional_replacements:
+                new_path = new_path.replace(replacement["old"], replacement["new"])
+
+        # proj_attn.weight has to be converted from conv 1D to linear (index 0 of the last dimension is kept)
+        if "proj_attn.weight" in new_path:
+            checkpoint[new_path] = old_checkpoint[path["old"]][:, :, 0]
+        else:
+            checkpoint[new_path] = old_checkpoint[path["old"]]
+
+
+def conv_attn_to_linear(checkpoint):
+    """Turns conv-shaped (ndim > 2) attention weights of `checkpoint` into linear weights, in place."""
+    qkv_suffixes = ("query.weight", "key.weight", "value.weight")
+    for name in list(checkpoint.keys()):
+        is_qkv = ".".join(name.split(".")[-2:]) in qkv_suffixes
+        if not (is_qkv or "proj_attn.weight" in name) or checkpoint[name].ndim <= 2:
+            continue
+        # query/key/value weights lose two trailing dims, proj_attn weights only one
+        weight = checkpoint[name]
+        checkpoint[name] = weight[:, :, 0, 0] if is_qkv else weight[:, :, 0]
+
+
+def create_unet_diffusers_config(original_config):
+    """
+    Derives the diffusers UNet config (a plain dict) from the `unet_config` section of the original LDM config.
+    """
+    unet_params = original_config.model.params.unet_config.params
+    attention_resolutions = unet_params.attention_resolutions
+
+    block_out_channels = tuple(unet_params.model_channels * mult for mult in unet_params.channel_mult)
+
+    # The down path starts at resolution 1 and doubles it after every block but the last one;
+    # the up path walks through the same resolutions in reverse order.
+    down_resolutions = [2**level for level in range(len(block_out_channels))]
+
+    down_block_types = tuple(
+        "CrossAttnDownBlock2D" if resolution in attention_resolutions else "DownBlock2D"
+        for resolution in down_resolutions
+    )
+    up_block_types = tuple(
+        "CrossAttnUpBlock2D" if resolution in attention_resolutions else "UpBlock2D"
+        for resolution in reversed(down_resolutions)
+    )
+
+    config = {
+        "sample_size": unet_params.image_size,
+        "in_channels": unet_params.in_channels,
+        "out_channels": unet_params.out_channels,
+        "down_block_types": down_block_types,
+        "up_block_types": up_block_types,
+        "block_out_channels": block_out_channels,
+        "layers_per_block": unet_params.num_res_blocks,
+        "cross_attention_dim": unet_params.context_dim,
+        "attention_head_dim": unet_params.num_heads,
+    }
+
+    return config
+
+
+def create_vae_diffusers_config(original_config):
+    """
+    Derives the diffusers autoencoder config (a plain dict) from the `first_stage_config` of the LDM config.
+    """
+    first_stage_params = original_config.model.params.first_stage_config.params
+    vae_params = first_stage_params.ddconfig
+    # embed_dim is read here but its value is not used for the config.
+    _ = first_stage_params.embed_dim
+
+    block_out_channels = tuple(vae_params.ch * mult for mult in vae_params.ch_mult)
+    num_blocks = len(block_out_channels)
+
+    return {
+        "sample_size": vae_params.resolution,
+        "in_channels": vae_params.in_channels,
+        "out_channels": vae_params.out_ch,
+        "down_block_types": ("DownEncoderBlock2D",) * num_blocks,
+        "up_block_types": ("UpDecoderBlock2D",) * num_blocks,
+        "block_out_channels": block_out_channels,
+        "latent_channels": vae_params.z_channels,
+        "layers_per_block": vae_params.num_res_blocks,
+    }
+
+
+def create_diffusers_schedular(original_config):
+    """Creates a `DDIMScheduler` from the `timesteps`, `linear_start` and `linear_end` of the LDM config."""
+    return DDIMScheduler(
+        num_train_timesteps=original_config.model.params.timesteps,
+        beta_start=original_config.model.params.linear_start,
+        beta_end=original_config.model.params.linear_end,
+        beta_schedule="scaled_linear",
+    )
+
+
+def create_ldm_bert_config(original_config):
+    """Builds the `LDMBertConfig` of the text encoder from the `cond_stage_config` of the original LDM config."""
+    # `model.parms` was a typo for the `model.params` node that every other lookup in this script goes
+    # through; the embedding size and layer count are read from the conditioning-stage section below it.
+    bert_params = original_config.model.params.cond_stage_config.params
+    return LDMBertConfig(
+        d_model=bert_params.n_embed, encoder_layers=bert_params.n_layer, encoder_ffn_dim=bert_params.n_embed * 4
+    )
+
+
+def convert_ldm_unet_checkpoint(checkpoint, config):
+    """
+    Pops the `model.diffusion_model.` entries off `checkpoint` and returns them under diffusers UNet key names.
+    """
+
+    # Move the UNet weights out of `checkpoint` (they are popped) and drop the "model.diffusion_model." prefix
+    unet_state_dict = {}
+    unet_key = "model.diffusion_model."
+    keys = list(checkpoint.keys())
+    for key in keys:
+        if key.startswith(unet_key):
+            unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(key)
+
+    new_checkpoint = {}
+
+    new_checkpoint["time_embedding.linear_1.weight"] = unet_state_dict["time_embed.0.weight"]
+    new_checkpoint["time_embedding.linear_1.bias"] = unet_state_dict["time_embed.0.bias"]
+    new_checkpoint["time_embedding.linear_2.weight"] = unet_state_dict["time_embed.2.weight"]
+    new_checkpoint["time_embedding.linear_2.bias"] = unet_state_dict["time_embed.2.bias"]
+
+    new_checkpoint["conv_in.weight"] = unet_state_dict["input_blocks.0.0.weight"]
+    new_checkpoint["conv_in.bias"] = unet_state_dict["input_blocks.0.0.bias"]
+
+    new_checkpoint["conv_norm_out.weight"] = unet_state_dict["out.0.weight"]
+    new_checkpoint["conv_norm_out.bias"] = unet_state_dict["out.0.bias"]
+    new_checkpoint["conv_out.weight"] = unet_state_dict["out.2.weight"]
+    new_checkpoint["conv_out.bias"] = unet_state_dict["out.2.bias"]
+
+    # Retrieves the keys for the input blocks only
+    num_input_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "input_blocks" in layer})
+    input_blocks = {
+        layer_id: [key for key in unet_state_dict if f"input_blocks.{layer_id}" in key]
+        for layer_id in range(num_input_blocks)
+    }
+
+    # Retrieves the keys for the middle blocks only
+    num_middle_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "middle_block" in layer})
+    middle_blocks = {
+        layer_id: [key for key in unet_state_dict if f"middle_block.{layer_id}" in key]
+        for layer_id in range(num_middle_blocks)
+    }
+
+    # Retrieves the keys for the output blocks only
+    num_output_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "output_blocks" in layer})
+    output_blocks = {
+        layer_id: [key for key in unet_state_dict if f"output_blocks.{layer_id}" in key]
+        for layer_id in range(num_output_blocks)
+    }
+
+    for i in range(1, num_input_blocks):  # input_blocks.0 is the conv_in layer assigned above
+        block_id = (i - 1) // (config["layers_per_block"] + 1)
+        layer_in_block_id = (i - 1) % (config["layers_per_block"] + 1)
+
+        resnets = [
+            key for key in input_blocks[i] if f"input_blocks.{i}.0" in key and f"input_blocks.{i}.0.op" not in key
+        ]
+        attentions = [key for key in input_blocks[i] if f"input_blocks.{i}.1" in key]
+
+        if f"input_blocks.{i}.0.op.weight" in unet_state_dict:
+            new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.weight"] = unet_state_dict.pop(
+                f"input_blocks.{i}.0.op.weight"
+            )
+            new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.bias"] = unet_state_dict.pop(
+                f"input_blocks.{i}.0.op.bias"
+            )
+
+        paths = renew_resnet_paths(resnets)
+        meta_path = {"old": f"input_blocks.{i}.0", "new": f"down_blocks.{block_id}.resnets.{layer_in_block_id}"}
+        assign_to_checkpoint(
+            paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
+        )
+
+        if len(attentions):
+            paths = renew_attention_paths(attentions)
+            meta_path = {"old": f"input_blocks.{i}.1", "new": f"down_blocks.{block_id}.attentions.{layer_in_block_id}"}
+            assign_to_checkpoint(
+                paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
+            )
+
+    resnet_0 = middle_blocks[0]
+    attentions = middle_blocks[1]
+    resnet_1 = middle_blocks[2]
+
+    resnet_0_paths = renew_resnet_paths(resnet_0)
+    assign_to_checkpoint(resnet_0_paths, new_checkpoint, unet_state_dict, config=config)
+
+    resnet_1_paths = renew_resnet_paths(resnet_1)
+    assign_to_checkpoint(resnet_1_paths, new_checkpoint, unet_state_dict, config=config)
+
+    attentions_paths = renew_attention_paths(attentions)
+    meta_path = {"old": "middle_block.1", "new": "mid_block.attentions.0"}
+    assign_to_checkpoint(
+        attentions_paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
+    )
+
+    for i in range(num_output_blocks):
+        block_id = i // (config["layers_per_block"] + 1)
+        layer_in_block_id = i % (config["layers_per_block"] + 1)
+        output_block_layers = [shave_segments(name, 2) for name in output_blocks[i]]
+        output_block_list = {}
+
+        for layer in output_block_layers:
+            layer_id, layer_name = layer.split(".")[0], shave_segments(layer, 1)
+            if layer_id in output_block_list:
+                output_block_list[layer_id].append(layer_name)
+            else:
+                output_block_list[layer_id] = [layer_name]
+
+        if len(output_block_list) > 1:
+            resnets = [key for key in output_blocks[i] if f"output_blocks.{i}.0" in key]
+            attentions = [key for key in output_blocks[i] if f"output_blocks.{i}.1" in key]
+
+            resnet_0_paths = renew_resnet_paths(resnets)  # NOTE(review): not used in this branch
+            paths = renew_resnet_paths(resnets)
+
+            meta_path = {"old": f"output_blocks.{i}.0", "new": f"up_blocks.{block_id}.resnets.{layer_in_block_id}"}
+            assign_to_checkpoint(
+                paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
+            )
+
+            if ["conv.weight", "conv.bias"] in output_block_list.values():
+                index = list(output_block_list.values()).index(["conv.weight", "conv.bias"])
+                new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.weight"] = unet_state_dict[
+                    f"output_blocks.{i}.{index}.conv.weight"
+                ]
+                new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.bias"] = unet_state_dict[
+                    f"output_blocks.{i}.{index}.conv.bias"
+                ]
+
+                # Clear attentions as they have been attributed above.
+                if len(attentions) == 2:
+                    attentions = []
+
+            if len(attentions):
+                paths = renew_attention_paths(attentions)
+                meta_path = {
+                    "old": f"output_blocks.{i}.1",
+                    "new": f"up_blocks.{block_id}.attentions.{layer_in_block_id}",
+                }
+                assign_to_checkpoint(
+                    paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
+                )
+        else:
+            resnet_0_paths = renew_resnet_paths(output_block_layers, n_shave_prefix_segments=1)
+            for path in resnet_0_paths:
+                old_path = ".".join(["output_blocks", str(i), path["old"]])
+                new_path = ".".join(["up_blocks", str(block_id), "resnets", str(layer_in_block_id), path["new"]])
+
+                new_checkpoint[new_path] = unet_state_dict[old_path]
+
+    return new_checkpoint
+
+
+def convert_ldm_vae_checkpoint(checkpoint, config):
+    # Copy the "first_stage_model." entries (the VAE weights) without removing them from `checkpoint`
+    vae_state_dict = {}
+    vae_key = "first_stage_model."
+    keys = list(checkpoint.keys())
+    for key in keys:
+        if key.startswith(vae_key):
+            vae_state_dict[key.replace(vae_key, "")] = checkpoint.get(key)
+
+    new_checkpoint = {}
+
+    new_checkpoint["encoder.conv_in.weight"] = vae_state_dict["encoder.conv_in.weight"]
+    new_checkpoint["encoder.conv_in.bias"] = vae_state_dict["encoder.conv_in.bias"]
+    new_checkpoint["encoder.conv_out.weight"] = vae_state_dict["encoder.conv_out.weight"]
+    new_checkpoint["encoder.conv_out.bias"] = vae_state_dict["encoder.conv_out.bias"]
+    new_checkpoint["encoder.conv_norm_out.weight"] = vae_state_dict["encoder.norm_out.weight"]
+    new_checkpoint["encoder.conv_norm_out.bias"] = vae_state_dict["encoder.norm_out.bias"]
+
+    new_checkpoint["decoder.conv_in.weight"] = vae_state_dict["decoder.conv_in.weight"]
+    new_checkpoint["decoder.conv_in.bias"] = vae_state_dict["decoder.conv_in.bias"]
+    new_checkpoint["decoder.conv_out.weight"] = vae_state_dict["decoder.conv_out.weight"]
+    new_checkpoint["decoder.conv_out.bias"] = vae_state_dict["decoder.conv_out.bias"]
+    new_checkpoint["decoder.conv_norm_out.weight"] = vae_state_dict["decoder.norm_out.weight"]
+    new_checkpoint["decoder.conv_norm_out.bias"] = vae_state_dict["decoder.norm_out.bias"]
+
+    new_checkpoint["quant_conv.weight"] = vae_state_dict["quant_conv.weight"]
+    new_checkpoint["quant_conv.bias"] = vae_state_dict["quant_conv.bias"]
+    new_checkpoint["post_quant_conv.weight"] = vae_state_dict["post_quant_conv.weight"]
+    new_checkpoint["post_quant_conv.bias"] = vae_state_dict["post_quant_conv.bias"]
+
+    # Retrieves the keys for the encoder down blocks only
+    num_down_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "encoder.down" in layer})
+    down_blocks = {
+        layer_id: [key for key in vae_state_dict if f"down.{layer_id}" in key] for layer_id in range(num_down_blocks)
+    }
+
+    # Retrieves the keys for the decoder up blocks only
+    num_up_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "decoder.up" in layer})
+    up_blocks = {
+        layer_id: [key for key in vae_state_dict if f"up.{layer_id}" in key] for layer_id in range(num_up_blocks)
+    }
+
+    for i in range(num_down_blocks):
+        resnets = [key for key in down_blocks[i] if f"down.{i}" in key and f"down.{i}.downsample" not in key]
+
+        if f"encoder.down.{i}.downsample.conv.weight" in vae_state_dict:
+            new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.weight"] = vae_state_dict.pop(
+                f"encoder.down.{i}.downsample.conv.weight"
+            )
+            new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.bias"] = vae_state_dict.pop(
+                f"encoder.down.{i}.downsample.conv.bias"
+            )
+
+        paths = renew_vae_resnet_paths(resnets)
+        meta_path = {"old": f"down.{i}.block", "new": f"down_blocks.{i}.resnets"}
+        assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
+
+    mid_resnets = [key for key in vae_state_dict if "encoder.mid.block" in key]
+    num_mid_res_blocks = 2
+    for i in range(1, num_mid_res_blocks + 1):
+        resnets = [key for key in mid_resnets if f"encoder.mid.block_{i}" in key]
+
+        paths = renew_vae_resnet_paths(resnets)
+        meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"}
+        assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
+
+    mid_attentions = [key for key in vae_state_dict if "encoder.mid.attn" in key]
+    paths = renew_vae_attention_paths(mid_attentions)
+    meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"}
+    assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
+    conv_attn_to_linear(new_checkpoint)  # reduces the attention weights assigned so far from ndim > 2 to 2-D
+
+    for i in range(num_up_blocks):
+        block_id = num_up_blocks - 1 - i  # original `up.{block_id}` becomes `up_blocks.{i}`: the order is reversed
+        resnets = [
+            key for key in up_blocks[block_id] if f"up.{block_id}" in key and f"up.{block_id}.upsample" not in key
+        ]
+
+        if f"decoder.up.{block_id}.upsample.conv.weight" in vae_state_dict:
+            new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.weight"] = vae_state_dict[
+                f"decoder.up.{block_id}.upsample.conv.weight"
+            ]
+            new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.bias"] = vae_state_dict[
+                f"decoder.up.{block_id}.upsample.conv.bias"
+            ]
+
+        paths = renew_vae_resnet_paths(resnets)
+        meta_path = {"old": f"up.{block_id}.block", "new": f"up_blocks.{i}.resnets"}
+        assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
+
+    mid_resnets = [key for key in vae_state_dict if "decoder.mid.block" in key]
+    num_mid_res_blocks = 2
+    for i in range(1, num_mid_res_blocks + 1):
+        resnets = [key for key in mid_resnets if f"decoder.mid.block_{i}" in key]
+
+        paths = renew_vae_resnet_paths(resnets)
+        meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"}
+        assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
+
+    mid_attentions = [key for key in vae_state_dict if "decoder.mid.attn" in key]
+    paths = renew_vae_attention_paths(mid_attentions)
+    meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"}
+    assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
+    conv_attn_to_linear(new_checkpoint)  # second pass: picks up the decoder attention weights added since
+    return new_checkpoint
+
+
+def convert_ldm_bert_checkpoint(checkpoint, config):
+    def _copy_attn_layer(hf_attn_layer, pt_attn_layer):
+        hf_attn_layer.q_proj.weight.data = pt_attn_layer.to_q.weight
+        hf_attn_layer.k_proj.weight.data = pt_attn_layer.to_k.weight
+        hf_attn_layer.v_proj.weight.data = pt_attn_layer.to_v.weight
+
+        hf_attn_layer.out_proj.weight = pt_attn_layer.to_out.weight
+        hf_attn_layer.out_proj.bias = pt_attn_layer.to_out.bias
+
+    def _copy_linear(hf_linear, pt_linear):
+        hf_linear.weight = pt_linear.weight
+        hf_linear.bias = pt_linear.bias
+
+    def _copy_layer(hf_layer, pt_layer):
+        # copy layer norms (`_copy_linear` only assigns `.weight` and `.bias`, so it is reused for them)
+        _copy_linear(hf_layer.self_attn_layer_norm, pt_layer[0][0])
+        _copy_linear(hf_layer.final_layer_norm, pt_layer[1][0])
+
+        # copy attn
+        _copy_attn_layer(hf_layer.self_attn, pt_layer[0][1])
+
+        # copy MLP
+        pt_mlp = pt_layer[1][1]
+        _copy_linear(hf_layer.fc1, pt_mlp.net[0][0])
+        _copy_linear(hf_layer.fc2, pt_mlp.net[2])
+
+    def _copy_layers(hf_layers, pt_layers):
+        for i, hf_layer in enumerate(hf_layers):
+            if i != 0:
+                i += i  # doubles i, so HF layer i reads the pair pt_layers[2 * i : 2 * i + 2]
+            pt_layer = pt_layers[i : i + 2]
+            _copy_layer(hf_layer, pt_layer)
+
+    hf_model = LDMBertModel(config).eval()
+
+    # copy embeds; NOTE(review): `checkpoint` is read by attribute here, so it must be a model object, not a dict
+    hf_model.model.embed_tokens.weight = checkpoint.transformer.token_emb.weight
+    hf_model.model.embed_positions.weight.data = checkpoint.transformer.pos_emb.emb.weight
+
+    # copy layer norm
+    _copy_linear(hf_model.model.layer_norm, checkpoint.transformer.norm)
+
+    # copy hidden layers
+    _copy_layers(hf_model.model.layers, checkpoint.transformer.attn_layers.layers)
+
+    _copy_linear(hf_model.to_logits, checkpoint.transformer.to_logits)
+
+    return hf_model
+
+
+def convert_ldm_clip_checkpoint(checkpoint):
+    """
+    Loads the `cond_stage_model.transformer` weights of `checkpoint` into a pretrained `CLIPTextModel`.
+    """
+    text_model = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14")
+
+    prefix = "cond_stage_model.transformer"
+    strip_len = len("cond_stage_model.transformer.")
+    text_model_dict = {
+        key[strip_len:]: checkpoint[key] for key in list(checkpoint.keys()) if key.startswith(prefix)
+    }
+
+    text_model.load_state_dict(text_model_dict)
+    return text_model
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument(
+        "--checkpoint_path", default=None, type=str, required=True, help="Path to the checkpoint to convert."
+    )
+    # When omitted, the v1-inference.yaml of CompVis/stable-diffusion is downloaded further below and used instead.
+    parser.add_argument(
+        "--original_config_file",
+        default=None,
+        type=str,
+        help="The YAML config file corresponding to the original architecture.",
+    )
+    parser.add_argument(
+        "--scheduler_type",
+        default="pndm",
+        type=str,
+        help="Type of scheduler to use. Should be one of ['pndm', 'lms', 'ddim']",
+    )
+    parser.add_argument("--dump_path", default=None, type=str, required=True, help="Path to the output model.")
+
+    args = parser.parse_args()
+
+    if args.original_config_file is None:  # NOTE(review): the exit status of the wget call is not checked
+        os.system(
+            "wget https://raw.githubusercontent.com/CompVis/stable-diffusion/main/configs/stable-diffusion/v1-inference.yaml"
+        )
+        args.original_config_file = "./v1-inference.yaml"
+
+    original_config = OmegaConf.load(args.original_config_file)
+    checkpoint = torch.load(args.checkpoint_path)["state_dict"]
+
+    num_train_timesteps = original_config.model.params.timesteps
+    beta_start = original_config.model.params.linear_start
+    beta_end = original_config.model.params.linear_end
+    if args.scheduler_type == "pndm":
+        scheduler = PNDMScheduler(
+            beta_end=beta_end,
+            beta_schedule="scaled_linear",
+            beta_start=beta_start,
+            num_train_timesteps=num_train_timesteps,
+            skip_prk_steps=True,
+        )
+    elif args.scheduler_type == "lms":  # NOTE(review): "lms" and "ddim" do not forward num_train_timesteps
+        scheduler = LMSDiscreteScheduler(beta_start=beta_start, beta_end=beta_end, beta_schedule="scaled_linear")
+    elif args.scheduler_type == "ddim":
+        scheduler = DDIMScheduler(
+            beta_start=beta_start,
+            beta_end=beta_end,
+            beta_schedule="scaled_linear",
+            clip_sample=False,
+            set_alpha_to_one=False,
+        )
+    else:
+        raise ValueError(f"Scheduler of type {args.scheduler_type} doesn't exist!")
+
+    # Convert the UNet2DConditionModel model (this pops the UNet entries off `checkpoint`).
+    unet_config = create_unet_diffusers_config(original_config)
+    converted_unet_checkpoint = convert_ldm_unet_checkpoint(checkpoint, unet_config)
+
+    unet = UNet2DConditionModel(**unet_config)
+    unet.load_state_dict(converted_unet_checkpoint)
+
+    # Convert the VAE model.
+    vae_config = create_vae_diffusers_config(original_config)
+    converted_vae_checkpoint = convert_ldm_vae_checkpoint(checkpoint, vae_config)
+
+    vae = AutoencoderKL(**vae_config)
+    vae.load_state_dict(converted_vae_checkpoint)
+
+    # Convert the text model; the class name at the end of `cond_stage_config.target` selects the pipeline.
+    text_model_type = original_config.model.params.cond_stage_config.target.split(".")[-1]
+    if text_model_type == "FrozenCLIPEmbedder":
+        text_model = convert_ldm_clip_checkpoint(checkpoint)
+        tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
+        safety_checker = StableDiffusionSafetyChecker.from_pretrained("CompVis/stable-diffusion-safety-checker")
+        feature_extractor = AutoFeatureExtractor.from_pretrained("CompVis/stable-diffusion-safety-checker")
+        pipe = StableDiffusionPipeline(
+            vae=vae,
+            text_encoder=text_model,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+        )
+    else:
+        text_config = create_ldm_bert_config(original_config)
+        text_model = convert_ldm_bert_checkpoint(checkpoint, text_config)  # NOTE(review): gets the state dict
+        tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
+        pipe = LDMTextToImagePipeline(vqvae=vae, bert=text_model, tokenizer=tokenizer, unet=unet, scheduler=scheduler)
+
+    pipe.save_pretrained(args.dump_path)
--- a/scripts/convert_stable_diffusion_checkpoint_to_onnx.py
+++ b/scripts/convert_stable_diffusion_checkpoint_to_onnx.py
@@ -0,0 +1,227 @@
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import os
+import shutil
+from pathlib import Path
+
+import torch
+from torch.onnx import export
+
+import onnx
+from diffusers import OnnxStableDiffusionPipeline, StableDiffusionPipeline
+from diffusers.onnx_utils import OnnxRuntimeModel
+from packaging import version
+
+
+is_torch_less_than_1_11 = version.parse(version.parse(torch.__version__).base_version) < version.parse("1.11")
+
+
+def onnx_export(
+    model,
+    model_args: tuple,
+    output_path: Path,
+    ordered_input_names,
+    output_names,
+    dynamic_axes,
+    opset,
+    use_external_data_format=False,
+):
+    """
+    Export `model` to an ONNX file at `output_path` through `torch.onnx.export`.
+
+    Args:
+        model: object handed to the exporter as the model.
+        model_args: example inputs forwarded to the exporter.
+        output_path: destination file; missing parent directories are created first.
+        ordered_input_names / output_names: forwarded as the exporter's `input_names` / `output_names`.
+        dynamic_axes: forwarded unchanged as the exporter's `dynamic_axes`.
+        opset: forwarded as `opset_version`.
+        use_external_data_format: only forwarded on torch < 1.11; unused on newer versions.
+    """
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+    export_kwargs = dict(
+        f=output_path.as_posix(),
+        input_names=ordered_input_names,
+        output_names=output_names,
+        dynamic_axes=dynamic_axes,
+        do_constant_folding=True,
+        opset_version=opset,
+    )
+    # PyTorch deprecated the `enable_onnx_checker` and `use_external_data_format` arguments in v1.11,
+    # so they are only added to the call on older versions for backwards compatibility
+    if is_torch_less_than_1_11:
+        export_kwargs["use_external_data_format"] = use_external_data_format
+        export_kwargs["enable_onnx_checker"] = True
+    export(model, model_args, **export_kwargs)
+
+
+@torch.no_grad()
+def convert_models(model_path: str, output_path: str, opset: int):
+    """
+    Export the text encoder, UNet, VAE encoder/decoder and safety checker of the pipeline at `model_path` to
+    ONNX files under `output_path` (with operator set `opset`), then save and reload them as an ONNX pipeline.
+    """
+    pipeline = StableDiffusionPipeline.from_pretrained(model_path)
+    output_path = Path(output_path)
+
+    # TEXT ENCODER
+    text_input = pipeline.tokenizer(
+        "A sample prompt",
+        padding="max_length",
+        max_length=pipeline.tokenizer.model_max_length,
+        truncation=True,
+        return_tensors="pt",
+    )
+    onnx_export(
+        pipeline.text_encoder,
+        # casting to torch.int32 until the CLIP fix is released: https://github.com/huggingface/transformers/pull/18515/files
+        model_args=(text_input.input_ids.to(torch.int32),),
+        output_path=output_path / "text_encoder" / "model.onnx",
+        ordered_input_names=["input_ids"],
+        output_names=["last_hidden_state", "pooler_output"],
+        dynamic_axes={"input_ids": {0: "batch", 1: "sequence"}},
+        opset=opset,
+    )
+    del pipeline.text_encoder
+
+    # UNET
+    unet_path = output_path / "unet" / "model.onnx"
+    onnx_export(
+        pipeline.unet,
+        model_args=(
+            torch.randn(2, pipeline.unet.in_channels, 64, 64),
+            torch.LongTensor([0, 1]),
+            torch.randn(2, 77, 768),
+            False,
+        ),
+        output_path=unet_path,
+        ordered_input_names=["sample", "timestep", "encoder_hidden_states", "return_dict"],
+        output_names=["out_sample"],  # has to be different from "sample" for correct tracing
+        dynamic_axes={
+            "sample": {0: "batch", 1: "channels", 2: "height", 3: "width"},
+            "timestep": {0: "batch"},
+            "encoder_hidden_states": {0: "batch", 1: "sequence"},
+        },
+        opset=opset,
+        use_external_data_format=True,  # UNet is > 2GB, so the weights need to be split
+    )
+    unet_model_path = str(unet_path.absolute().as_posix())
+    unet_dir = os.path.dirname(unet_model_path)
+    unet = onnx.load(unet_model_path)
+    # clean up existing tensor files
+    shutil.rmtree(unet_dir)
+    os.mkdir(unet_dir)
+    # collate external tensor files into one
+    onnx.save_model(
+        unet,
+        unet_model_path,
+        save_as_external_data=True,
+        all_tensors_to_one_file=True,
+        location="weights.pb",
+        convert_attribute=False,
+    )
+    del pipeline.unet
+
+    # VAE ENCODER
+    vae_encoder = pipeline.vae
+    # need to get the raw tensor output (sample) from the encoder
+    vae_encoder.forward = lambda sample, return_dict: vae_encoder.encode(sample, return_dict)[0].sample()
+    onnx_export(
+        vae_encoder,
+        model_args=(torch.randn(1, 3, 512, 512), False),
+        output_path=output_path / "vae_encoder" / "model.onnx",
+        ordered_input_names=["sample", "return_dict"],
+        output_names=["latent_sample"],
+        dynamic_axes={"sample": {0: "batch", 1: "channels", 2: "height", 3: "width"}},
+        opset=opset,
+    )
+
+    # VAE DECODER
+    # same `pipeline.vae` object as the encoder export; only its `forward` is re-pointed before exporting again
+    vae_decoder = pipeline.vae
+    # forward only through the decoder part
+    vae_decoder.forward = vae_encoder.decode
+    onnx_export(
+        vae_decoder,
+        model_args=(torch.randn(1, 4, 64, 64), False),
+        output_path=output_path / "vae_decoder" / "model.onnx",
+        ordered_input_names=["latent_sample", "return_dict"],
+        output_names=["sample"],
+        dynamic_axes={"latent_sample": {0: "batch", 1: "channels", 2: "height", 3: "width"}},
+        opset=opset,
+    )
+    del pipeline.vae
+
+    # SAFETY CHECKER
+    safety_checker = pipeline.safety_checker
+    safety_checker.forward = safety_checker.forward_onnx
+    onnx_export(
+        pipeline.safety_checker,
+        # the `images` example is shaped (1, 512, 512, 3): batch, height, width, channels (channels last)
+        model_args=(torch.randn(1, 3, 224, 224), torch.randn(1, 512, 512, 3)),
+        output_path=output_path / "safety_checker" / "model.onnx",
+        ordered_input_names=["clip_input", "images"],
+        output_names=["out_images", "has_nsfw_concepts"],
+        dynamic_axes={
+            "clip_input": {0: "batch", 1: "channels", 2: "height", 3: "width"},
+            "images": {0: "batch", 1: "height", 2: "width", 3: "channels"},
+        },
+        opset=opset,
+    )
+    del pipeline.safety_checker
+
+    onnx_pipeline = OnnxStableDiffusionPipeline(
+        vae_encoder=OnnxRuntimeModel.from_pretrained(output_path / "vae_encoder"),
+        vae_decoder=OnnxRuntimeModel.from_pretrained(output_path / "vae_decoder"),
+        text_encoder=OnnxRuntimeModel.from_pretrained(output_path / "text_encoder"),
+        tokenizer=pipeline.tokenizer,
+        unet=OnnxRuntimeModel.from_pretrained(output_path / "unet"),
+        scheduler=pipeline.scheduler,
+        safety_checker=OnnxRuntimeModel.from_pretrained(output_path / "safety_checker"),
+        feature_extractor=pipeline.feature_extractor,
+    )
+
+    onnx_pipeline.save_pretrained(output_path)
+    print("ONNX pipeline saved to", output_path)
+
+    del pipeline
+    del onnx_pipeline
+    _ = OnnxStableDiffusionPipeline.from_pretrained(output_path, provider="CPUExecutionProvider")
+    print("ONNX pipeline is loadable")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    # Command-line interface: two required paths plus the ONNX opset version (defaults to 14).
+    parser.add_argument(
+        "--model_path",
+        type=str,
+        required=True,
+        help="Path to the `diffusers` checkpoint to convert (either a local directory or on the Hub).",
+    )
+
+    parser.add_argument("--output_path", type=str, required=True, help="Path to the output model.")
+
+    parser.add_argument(
+        "--opset",
+        default=14,
+        type=int,
+        help="The version of the ONNX operator set to use.",
+    )
+
+    args = parser.parse_args()
+    # Run the whole export with the parsed options.
+    convert_models(args.model_path, args.output_path, args.opset)
--- a/scripts/generate_logits.py
+++ b/scripts/generate_logits.py
@@ -1,91 +1,127 @@
-from huggingface_hub import HfApi
-from transformers.file_utils import has_file
-from diffusers import UNet2DModel
 import random
+
 import torch
+
+from diffusers import UNet2DModel
+from huggingface_hub import HfApi
+
+
 api = HfApi()

 results = {}
-results["google_ddpm_cifar10_32"] = torch.tensor([-0.7515, -1.6883,  0.2420,  0.0300,  0.6347,  1.3433, -1.1743, -3.7467,
-         1.2342, -2.2485,  0.4636,  0.8076, -0.7991,  0.3969,  0.8498,  0.9189,
-        -1.8887, -3.3522,  0.7639,  0.2040,  0.6271, -2.7148, -1.6316,  3.0839,
-         0.3186,  0.2721, -0.9759, -1.2461,  2.6257,  1.3557])
-results["google_ddpm_ema_bedroom_256"] = torch.tensor([-2.3639, -2.5344,  0.0054, -0.6674,  1.5990,  1.0158,  0.3124, -2.1436,
-         1.8795, -2.5429, -0.1566, -0.3973,  1.2490,  2.6447,  1.2283, -0.5208,
-        -2.8154, -3.5119,  2.3838,  1.2033,  1.7201, -2.1256, -1.4576,  2.7948,
-         2.4204, -0.9752, -1.2546,  0.8027,  3.2758,  3.1365])
-results["CompVis_ldm_celebahq_256"] = torch.tensor([-0.6531, -0.6891, -0.3172, -0.5375, -0.9140, -0.5367, -0.1175, -0.7869,
-        -0.3808, -0.4513, -0.2098, -0.0083,  0.3183,  0.5140,  0.2247, -0.1304,
-        -0.1302, -0.2802, -0.2084, -0.2025, -0.4967, -0.4873, -0.0861,  0.6925,
-         0.0250,  0.1290, -0.1543,  0.6316,  1.0460,  1.4943])
-results["google_ncsnpp_ffhq_1024"] = torch.tensor([ 0.0911,  0.1107,  0.0182,  0.0435, -0.0805, -0.0608,  0.0381,  0.2172,
-        -0.0280,  0.1327, -0.0299, -0.0255, -0.0050, -0.1170, -0.1046,  0.0309,
-         0.1367,  0.1728, -0.0533, -0.0748, -0.0534,  0.1624,  0.0384, -0.1805,
-        -0.0707,  0.0642,  0.0220, -0.0134, -0.1333, -0.1505])
-results["google_ncsnpp_bedroom_256"] = torch.tensor([ 0.1321,  0.1337,  0.0440,  0.0622, -0.0591, -0.0370,  0.0503,  0.2133,
-        -0.0177,  0.1415, -0.0116, -0.0112,  0.0044, -0.0980, -0.0789,  0.0395,
-         0.1502,  0.1785, -0.0488, -0.0514, -0.0404,  0.1539,  0.0454, -0.1559,
-        -0.0665,  0.0659,  0.0383, -0.0005, -0.1266, -0.1386])
-results["google_ncsnpp_celebahq_256"] = torch.tensor([ 0.1154,  0.1218,  0.0307,  0.0526, -0.0711, -0.0541,  0.0366,  0.2078,
-        -0.0267,  0.1317, -0.0226, -0.0193, -0.0014, -0.1055, -0.0902,  0.0330,
-         0.1391,  0.1709, -0.0562, -0.0693, -0.0560,  0.1482,  0.0381, -0.1683,
-        -0.0681,  0.0661,  0.0331, -0.0046, -0.1268, -0.1431])
-results["google_ncsnpp_church_256"] = torch.tensor([ 0.1192,  0.1240,  0.0414,  0.0606, -0.0557, -0.0412,  0.0430,  0.2042,
-        -0.0200,  0.1385, -0.0115, -0.0132,  0.0017, -0.0965, -0.0802,  0.0398,
-         0.1433,  0.1747, -0.0458, -0.0533, -0.0407,  0.1545,  0.0419, -0.1574,
-        -0.0645,  0.0626,  0.0341, -0.0010, -0.1199, -0.1390])
-results["google_ncsnpp_ffhq_256"] = torch.tensor([ 0.1075,  0.1074,  0.0205,  0.0431, -0.0774, -0.0607,  0.0298,  0.2042,
-        -0.0320,  0.1267, -0.0281, -0.0250, -0.0064, -0.1091, -0.0946,  0.0290,
-         0.1328,  0.1650, -0.0580, -0.0738, -0.0586,  0.1440,  0.0337, -0.1746,
-        -0.0712,  0.0605,  0.0250, -0.0099, -0.1316, -0.1473])
-results["google_ddpm_cat_256"] = torch.tensor([-1.4572, -2.0481, -0.0414, -0.6005,  1.4136,  0.5848,  0.4028, -2.7330,
-         1.2212, -2.1228,  0.2155,  0.4039,  0.7662,  2.0535,  0.7477, -0.3243,
-        -2.1758, -2.7648,  1.6947,  0.7026,  1.2338, -1.6078, -0.8682,  2.2810,
-         1.8574, -0.5718, -0.5586, -0.0186,  2.3415,  2.1251])
-results["google_ddpm_celebahq_256"] = torch.tensor([-1.3690, -1.9720, -0.4090, -0.6966,  1.4660,  0.9938, -0.1385, -2.7324,
-         0.7736, -1.8917,  0.2923,  0.4293,  0.1693,  1.4112,  1.1887, -0.3181,
-        -2.2160, -2.6381,  1.3170,  0.8163,  0.9240, -1.6544, -0.6099,  2.5259,
-         1.6430, -0.9090, -0.9392, -0.0126,  2.4268,  2.3266])
-results["google_ddpm_ema_celebahq_256"] = torch.tensor([-1.3525, -1.9628, -0.3956, -0.6860,  1.4664,  1.0014, -0.1259, -2.7212,
-         0.7772, -1.8811,  0.2996,  0.4388,  0.1704,  1.4029,  1.1701, -0.3027,
-        -2.2053, -2.6287,  1.3350,  0.8131,  0.9274, -1.6292, -0.6098,  2.5131,
-         1.6505, -0.8958, -0.9298, -0.0151,  2.4257,  2.3355])
-results["google_ddpm_church_256"] = torch.tensor([-2.0585, -2.7897, -0.2850, -0.8940,  1.9052,  0.5702,  0.6345, -3.8959,
-         1.5932, -3.2319,  0.1974,  0.0287,  1.7566,  2.6543,  0.8387, -0.5351,
-        -3.2736, -4.3375,  2.9029,  1.6390,  1.4640, -2.1701, -1.9013,  2.9341,
-         3.4981, -0.6255, -1.1644, -0.1591,  3.7097,  3.2066])
-results["google_ddpm_bedroom_256"] = torch.tensor([-2.3139, -2.5594, -0.0197, -0.6785,  1.7001,  1.1606,  0.3075, -2.1740,
-         1.8071, -2.5630, -0.0926, -0.3811,  1.2116,  2.6246,  1.2731, -0.5398,
-        -2.8153, -3.6140,  2.3893,  1.3262,  1.6258, -2.1856, -1.3267,  2.8395,
-         2.3779, -1.0623, -1.2468,  0.8959,  3.3367,  3.2243])
-results["google_ddpm_ema_church_256"] = torch.tensor([-2.0628, -2.7667, -0.2089, -0.8263,  2.0539,  0.5992,  0.6495, -3.8336,
-         1.6025, -3.2817,  0.1721, -0.0633,  1.7516,  2.7039,  0.8100, -0.5908,
-        -3.2113, -4.4343,  2.9257,  1.3632,  1.5562, -2.1489, -1.9894,  3.0560,
-         3.3396, -0.7328, -1.0417,  0.0383,  3.7093,  3.2343])
-results["google_ddpm_ema_cat_256"] = torch.tensor([-1.4574, -2.0569, -0.0473, -0.6117,  1.4018,  0.5769,  0.4129, -2.7344,
-         1.2241, -2.1397,  0.2000,  0.3937,  0.7616,  2.0453,  0.7324, -0.3391,
-        -2.1746, -2.7744,  1.6963,  0.6921,  1.2187, -1.6172, -0.8877,  2.2439,
-         1.8471, -0.5839, -0.5605, -0.0464,  2.3250,  2.1219])
+# fmt: off
+results["google_ddpm_cifar10_32"] = torch.tensor([
+    -0.7515, -1.6883, 0.2420, 0.0300, 0.6347, 1.3433, -1.1743, -3.7467,
+    1.2342, -2.2485, 0.4636, 0.8076, -0.7991, 0.3969, 0.8498, 0.9189,
+    -1.8887, -3.3522, 0.7639, 0.2040, 0.6271, -2.7148, -1.6316, 3.0839,
+    0.3186, 0.2721, -0.9759, -1.2461, 2.6257, 1.3557
+])
+results["google_ddpm_ema_bedroom_256"] = torch.tensor([
+    -2.3639, -2.5344, 0.0054, -0.6674, 1.5990, 1.0158, 0.3124, -2.1436,
+    1.8795, -2.5429, -0.1566, -0.3973, 1.2490, 2.6447, 1.2283, -0.5208,
+    -2.8154, -3.5119, 2.3838, 1.2033, 1.7201, -2.1256, -1.4576, 2.7948,
+    2.4204, -0.9752, -1.2546, 0.8027, 3.2758, 3.1365
+])
+results["CompVis_ldm_celebahq_256"] = torch.tensor([
+    -0.6531, -0.6891, -0.3172, -0.5375, -0.9140, -0.5367, -0.1175, -0.7869,
+    -0.3808, -0.4513, -0.2098, -0.0083, 0.3183, 0.5140, 0.2247, -0.1304,
+    -0.1302, -0.2802, -0.2084, -0.2025, -0.4967, -0.4873, -0.0861, 0.6925,
+    0.0250, 0.1290, -0.1543, 0.6316, 1.0460, 1.4943
+])
+results["google_ncsnpp_ffhq_1024"] = torch.tensor([
+    0.0911, 0.1107, 0.0182, 0.0435, -0.0805, -0.0608, 0.0381, 0.2172,
+    -0.0280, 0.1327, -0.0299, -0.0255, -0.0050, -0.1170, -0.1046, 0.0309,
+    0.1367, 0.1728, -0.0533, -0.0748, -0.0534, 0.1624, 0.0384, -0.1805,
+    -0.0707, 0.0642, 0.0220, -0.0134, -0.1333, -0.1505
+])
+results["google_ncsnpp_bedroom_256"] = torch.tensor([
+    0.1321, 0.1337, 0.0440, 0.0622, -0.0591, -0.0370, 0.0503, 0.2133,
+    -0.0177, 0.1415, -0.0116, -0.0112, 0.0044, -0.0980, -0.0789, 0.0395,
+    0.1502, 0.1785, -0.0488, -0.0514, -0.0404, 0.1539, 0.0454, -0.1559,
+    -0.0665, 0.0659, 0.0383, -0.0005, -0.1266, -0.1386
+])
+results["google_ncsnpp_celebahq_256"] = torch.tensor([
+    0.1154, 0.1218, 0.0307, 0.0526, -0.0711, -0.0541, 0.0366, 0.2078,
+    -0.0267, 0.1317, -0.0226, -0.0193, -0.0014, -0.1055, -0.0902, 0.0330,
+    0.1391, 0.1709, -0.0562, -0.0693, -0.0560, 0.1482, 0.0381, -0.1683,
+    -0.0681, 0.0661, 0.0331, -0.0046, -0.1268, -0.1431
+])
+results["google_ncsnpp_church_256"] = torch.tensor([
+    0.1192, 0.1240, 0.0414, 0.0606, -0.0557, -0.0412, 0.0430, 0.2042,
+    -0.0200, 0.1385, -0.0115, -0.0132, 0.0017, -0.0965, -0.0802, 0.0398,
+    0.1433, 0.1747, -0.0458, -0.0533, -0.0407, 0.1545, 0.0419, -0.1574,
+    -0.0645, 0.0626, 0.0341, -0.0010, -0.1199, -0.1390
+])
+results["google_ncsnpp_ffhq_256"] = torch.tensor([
+    0.1075, 0.1074, 0.0205, 0.0431, -0.0774, -0.0607, 0.0298, 0.2042,
+    -0.0320, 0.1267, -0.0281, -0.0250, -0.0064, -0.1091, -0.0946, 0.0290,
+    0.1328, 0.1650, -0.0580, -0.0738, -0.0586, 0.1440, 0.0337, -0.1746,
+    -0.0712, 0.0605, 0.0250, -0.0099, -0.1316, -0.1473
+])
+results["google_ddpm_cat_256"] = torch.tensor([
+    -1.4572, -2.0481, -0.0414, -0.6005, 1.4136, 0.5848, 0.4028, -2.7330,
+    1.2212, -2.1228, 0.2155, 0.4039, 0.7662, 2.0535, 0.7477, -0.3243,
+    -2.1758, -2.7648, 1.6947, 0.7026, 1.2338, -1.6078, -0.8682, 2.2810,
+    1.8574, -0.5718, -0.5586, -0.0186, 2.3415, 2.1251])
+results["google_ddpm_celebahq_256"] = torch.tensor([
+    -1.3690, -1.9720, -0.4090, -0.6966, 1.4660, 0.9938, -0.1385, -2.7324,
+    0.7736, -1.8917, 0.2923, 0.4293, 0.1693, 1.4112, 1.1887, -0.3181,
+    -2.2160, -2.6381, 1.3170, 0.8163, 0.9240, -1.6544, -0.6099, 2.5259,
+    1.6430, -0.9090, -0.9392, -0.0126, 2.4268, 2.3266
+])
+results["google_ddpm_ema_celebahq_256"] = torch.tensor([
+    -1.3525, -1.9628, -0.3956, -0.6860, 1.4664, 1.0014, -0.1259, -2.7212,
+    0.7772, -1.8811, 0.2996, 0.4388, 0.1704, 1.4029, 1.1701, -0.3027,
+    -2.2053, -2.6287, 1.3350, 0.8131, 0.9274, -1.6292, -0.6098, 2.5131,
+    1.6505, -0.8958, -0.9298, -0.0151, 2.4257, 2.3355
+])
+results["google_ddpm_church_256"] = torch.tensor([
+    -2.0585, -2.7897, -0.2850, -0.8940, 1.9052, 0.5702, 0.6345, -3.8959,
+    1.5932, -3.2319, 0.1974, 0.0287, 1.7566, 2.6543, 0.8387, -0.5351,
+    -3.2736, -4.3375, 2.9029, 1.6390, 1.4640, -2.1701, -1.9013, 2.9341,
+    3.4981, -0.6255, -1.1644, -0.1591, 3.7097, 3.2066
+])
+results["google_ddpm_bedroom_256"] = torch.tensor([
+    -2.3139, -2.5594, -0.0197, -0.6785, 1.7001, 1.1606, 0.3075, -2.1740,
+    1.8071, -2.5630, -0.0926, -0.3811, 1.2116, 2.6246, 1.2731, -0.5398,
+    -2.8153, -3.6140, 2.3893, 1.3262, 1.6258, -2.1856, -1.3267, 2.8395,
+    2.3779, -1.0623, -1.2468, 0.8959, 3.3367, 3.2243
+])
+results["google_ddpm_ema_church_256"] = torch.tensor([
+    -2.0628, -2.7667, -0.2089, -0.8263, 2.0539, 0.5992, 0.6495, -3.8336,
+    1.6025, -3.2817, 0.1721, -0.0633, 1.7516, 2.7039, 0.8100, -0.5908,
+    -3.2113, -4.4343, 2.9257, 1.3632, 1.5562, -2.1489, -1.9894, 3.0560,
+    3.3396, -0.7328, -1.0417, 0.0383, 3.7093, 3.2343
+])
+results["google_ddpm_ema_cat_256"] = torch.tensor([
+    -1.4574, -2.0569, -0.0473, -0.6117, 1.4018, 0.5769, 0.4129, -2.7344,
+    1.2241, -2.1397, 0.2000, 0.3937, 0.7616, 2.0453, 0.7324, -0.3391,
+    -2.1746, -2.7744, 1.6963, 0.6921, 1.2187, -1.6172, -0.8877, 2.2439,
+    1.8471, -0.5839, -0.5605, -0.0464, 2.3250, 2.1219
+])
+# fmt: on

 models = api.list_models(filter="diffusers")
 for mod in models:
-    if "google" in mod.author or mod.modelId == "CompVis/ldm-celebahq-256": 
+    if "google" in mod.author or mod.modelId == "CompVis/ldm-celebahq-256":
        local_checkpoint = "/home/patrick/google_checkpoints/" + mod.modelId.split("/")[-1]

        print(f"Started running {mod.modelId}!!!")

        if mod.modelId.startswith("CompVis"):
-            model = UNet2DModel.from_pretrained(local_checkpoint, subfolder = "unet")
-        else: 
+            model = UNet2DModel.from_pretrained(local_checkpoint, subfolder="unet")
+        else:
            model = UNet2DModel.from_pretrained(local_checkpoint)
-        
+
        torch.manual_seed(0)
        random.seed(0)
-        
+
        noise = torch.randn(1, model.config.in_channels, model.config.sample_size, model.config.sample_size)
        time_step = torch.tensor([10] * noise.shape[0])
        with torch.no_grad():
-            logits = model(noise, time_step)['sample']
+            logits = model(noise, time_step).sample

-        assert torch.allclose(logits[0, 0, 0, :30], results["_".join("_".join(mod.modelId.split("/")).split("-"))], atol=1e-3)
-        print(f"{mod.modelId} has passed succesfully!!!")
+        assert torch.allclose(
+            logits[0, 0, 0, :30], results["_".join("_".join(mod.modelId.split("/")).split("-"))], atol=1e-3
+        )
+        print(f"{mod.modelId} has passed successfully!!!")
--- a/setup.cfg
+++ b/setup.cfg
@@ -17,3 +17,4 @@ use_parentheses = True
 [flake8]
 ignore = E203, E722, E501, E741, W503, W605
 max-line-length = 119
+per-file-ignores = __init__.py:F401
--- a/setup.py
+++ b/setup.py
@@ -67,32 +67,43 @@ To create the package for pypi.
    you need to go back to main before executing this.
 """

+import os
 import re
 from distutils.core import Command

 from setuptools import find_packages, setup

+
 # IMPORTANT:
 # 1. all dependencies should be listed here with their version requirements if any
 # 2. once modified, run: `make deps_table_update` to update src/diffusers/dependency_versions_table.py
 _deps = [
-    "Pillow",
+    "Pillow<10.0",  # keep the PIL.Image.Resampling deprecation away
    "accelerate>=0.11.0",
-    "black~=22.0,>=22.3",
+    "black==22.8",
    "datasets",
    "filelock",
    "flake8>=3.8.3",
+    "flax>=0.4.1",
    "hf-doc-builder>=0.3.0",
-    "huggingface-hub>=0.8.1,<1.0",
+    "huggingface-hub>=0.10.0",
    "importlib_metadata",
    "isort>=5.5.4",
-    "modelcards==0.1.4",
+    "jax>=0.2.8,!=0.3.2,<=0.3.6",
+    "jaxlib>=0.1.65,<=0.3.6",
+    "modelcards>=0.1.4",
    "numpy",
+    "onnxruntime",
    "pytest",
+    "pytest-timeout",
+    "pytest-xdist",
+    "scipy",
    "regex!=2019.12.17",
    "requests",
    "tensorboard",
    "torch>=1.4",
+    "torchvision",
+    "transformers>=4.21.0",
 ]

 # this is a lookup table with items like:
@@ -163,11 +174,30 @@ extras = {}


 extras = {}
-extras["quality"] = ["black ~= 22.0", "isort >= 5.5.4", "flake8 >= 3.8.3"]
-extras["docs"] = ["hf-doc-builder"]
-extras["training"] = ["accelerate", "datasets", "tensorboard", "modelcards"]
-extras["test"] = ["pytest"]
-extras["dev"] = extras["quality"] + extras["test"] + extras["training"] + extras["docs"]
+extras["quality"] = deps_list("black", "isort", "flake8", "hf-doc-builder")
+extras["docs"] = deps_list("hf-doc-builder")
+extras["training"] = deps_list("accelerate", "datasets", "tensorboard", "modelcards")
+extras["test"] = deps_list(
+    "accelerate",
+    "datasets",
+    "onnxruntime",
+    "pytest",
+    "pytest-timeout",
+    "pytest-xdist",
+    "scipy",
+    "torchvision",
+    "transformers"
+)
+extras["torch"] = deps_list("torch")
+
+if os.name == "nt":  # windows
+    extras["flax"] = []  # jax is not supported on windows
+else:
+    extras["flax"] = deps_list("jax", "jaxlib", "flax")
+
+extras["dev"] = (
+    extras["quality"] + extras["test"] + extras["training"] + extras["docs"] + extras["torch"] + extras["flax"]
+)

 install_requires = [
    deps["importlib_metadata"],
@@ -176,13 +206,12 @@ install_requires = [
    deps["numpy"],
    deps["regex"],
    deps["requests"],
-    deps["torch"],
    deps["Pillow"],
 ]

 setup(
    name="diffusers",
-    version="0.2.2",
+    version="0.6.0",  # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots)
    description="Diffusers",
    long_description=open("README.md", "r", encoding="utf-8").read(),
    long_description_content_type="text/markdown",
@@ -194,9 +223,10 @@ setup(
    package_dir={"": "src"},
    packages=find_packages("src"),
    include_package_data=True,
-    python_requires=">=3.6.0",
+    python_requires=">=3.7.0",
    install_requires=install_requires,
    extras_require=extras,
+    entry_points={"console_scripts": ["diffusers-cli=diffusers.commands.diffusers_cli:main"]},
    classifiers=[
        "Development Status :: 5 - Production/Stable",
        "Intended Audience :: Developers",
@@ -205,7 +235,6 @@ setup(
        "License :: OSI Approved :: Apache Software License",
        "Operating System :: OS Independent",
        "Programming Language :: Python :: 3",
-        "Programming Language :: Python :: 3.6",
        "Programming Language :: Python :: 3.7",
        "Programming Language :: Python :: 3.8",
        "Programming Language :: Python :: 3.9",
--- a/src/diffusers/__init__.py
+++ b/src/diffusers/__init__.py
@@ -1,43 +1,91 @@
-# flake8: noqa
-# There's no way to ignore "F401 '...' imported but unused" warnings in this
-# module, but to preserve other warnings. So, don't check this module at all.
-from .utils import is_inflect_available, is_scipy_available, is_transformers_available, is_unidecode_available
-
-
-__version__ = "0.2.2"
-
-from .modeling_utils import ModelMixin
-from .models import AutoencoderKL, UNet2DConditionModel, UNet2DModel, VQModel
-from .optimization import (
-    get_constant_schedule,
-    get_constant_schedule_with_warmup,
-    get_cosine_schedule_with_warmup,
-    get_cosine_with_hard_restarts_schedule_with_warmup,
-    get_linear_schedule_with_warmup,
-    get_polynomial_decay_schedule_with_warmup,
-    get_scheduler,
-)
-from .pipeline_utils import DiffusionPipeline
-from .pipelines import DDIMPipeline, DDPMPipeline, KarrasVePipeline, LDMPipeline, PNDMPipeline, ScoreSdeVePipeline
-from .schedulers import (
-    DDIMScheduler,
-    DDPMScheduler,
-    KarrasVeScheduler,
-    PNDMScheduler,
-    SchedulerMixin,
-    ScoreSdeVeScheduler,
+from .utils import (
+    is_flax_available,
+    is_inflect_available,
+    is_onnx_available,
+    is_scipy_available,
+    is_torch_available,
+    is_transformers_available,
+    is_unidecode_available,
 )


-if is_scipy_available():
+__version__ = "0.6.0"
+
+from .configuration_utils import ConfigMixin
+from .onnx_utils import OnnxRuntimeModel
+from .utils import logging
+
+
+if is_torch_available():
+    from .modeling_utils import ModelMixin
+    from .models import AutoencoderKL, UNet2DConditionModel, UNet2DModel, VQModel
+    from .optimization import (
+        get_constant_schedule,
+        get_constant_schedule_with_warmup,
+        get_cosine_schedule_with_warmup,
+        get_cosine_with_hard_restarts_schedule_with_warmup,
+        get_linear_schedule_with_warmup,
+        get_polynomial_decay_schedule_with_warmup,
+        get_scheduler,
+    )
+    from .pipeline_utils import DiffusionPipeline
+    from .pipelines import DDIMPipeline, DDPMPipeline, KarrasVePipeline, LDMPipeline, PNDMPipeline, ScoreSdeVePipeline
+    from .schedulers import (
+        DDIMScheduler,
+        DDPMScheduler,
+        KarrasVeScheduler,
+        PNDMScheduler,
+        SchedulerMixin,
+        ScoreSdeVeScheduler,
+    )
+    from .training_utils import EMAModel
+else:
+    from .utils.dummy_pt_objects import *  # noqa F403
+
+if is_torch_available() and is_scipy_available():
    from .schedulers import LMSDiscreteScheduler
 else:
-    from .utils.dummy_scipy_objects import *
+    from .utils.dummy_torch_and_scipy_objects import *  # noqa F403

-from .training_utils import EMAModel
-
-
-if is_transformers_available():
-    from .pipelines import LDMTextToImagePipeline, StableDiffusionPipeline
+if is_torch_available() and is_transformers_available():
+    from .pipelines import (
+        LDMTextToImagePipeline,
+        StableDiffusionImg2ImgPipeline,
+        StableDiffusionInpaintPipeline,
+        StableDiffusionInpaintPipelineLegacy,
+        StableDiffusionPipeline,
+    )
 else:
-    from .utils.dummy_transformers_objects import *
+    from .utils.dummy_torch_and_transformers_objects import *  # noqa F403
+
+if is_torch_available() and is_transformers_available() and is_onnx_available():
+    from .pipelines import (
+        OnnxStableDiffusionImg2ImgPipeline,
+        OnnxStableDiffusionInpaintPipeline,
+        OnnxStableDiffusionPipeline,
+        StableDiffusionOnnxPipeline,
+    )
+else:
+    from .utils.dummy_torch_and_transformers_and_onnx_objects import *  # noqa F403
+
+if is_flax_available():
+    from .modeling_flax_utils import FlaxModelMixin
+    from .models.unet_2d_condition_flax import FlaxUNet2DConditionModel
+    from .models.vae_flax import FlaxAutoencoderKL
+    from .pipeline_flax_utils import FlaxDiffusionPipeline
+    from .schedulers import (
+        FlaxDDIMScheduler,
+        FlaxDDPMScheduler,
+        FlaxKarrasVeScheduler,
+        FlaxLMSDiscreteScheduler,
+        FlaxPNDMScheduler,
+        FlaxSchedulerMixin,
+        FlaxScoreSdeVeScheduler,
+    )
+else:
+    from .utils.dummy_flax_objects import *  # noqa F403
+
+if is_flax_available() and is_transformers_available():
+    from .pipelines import FlaxStableDiffusionPipeline
+else:
+    from .utils.dummy_flax_and_transformers_objects import *  # noqa F403
--- a/src/diffusers/commands/__init__.py
+++ b/src/diffusers/commands/__init__.py
@@ -0,0 +1,27 @@
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from abc import ABC, abstractmethod
+from argparse import ArgumentParser
+
+
+class BaseDiffusersCLICommand(ABC):  # abstract interface implemented by `diffusers-cli` subcommands
+    @staticmethod
+    @abstractmethod
+    def register_subcommand(parser: ArgumentParser):  # hook where a command adds its own sub-parser
+        raise NotImplementedError()
+
+    @abstractmethod
+    def run(self):  # performs the command's work; called by the CLI entry point after parsing
+        raise NotImplementedError()
--- a/src/diffusers/commands/diffusers_cli.py
+++ b/src/diffusers/commands/diffusers_cli.py
@@ -0,0 +1,41 @@
+#!/usr/bin/env python
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from argparse import ArgumentParser
+
+from .env import EnvironmentCommand
+
+
+def main():
+    """Entry point of `diffusers-cli`: parse the command line and run the selected subcommand."""
+    parser = ArgumentParser("Diffusers CLI tool", usage="diffusers-cli <command> [<args>]")
+    commands_parser = parser.add_subparsers(help="diffusers-cli command helpers")
+
+    # Register commands
+    EnvironmentCommand.register_subcommand(commands_parser)
+
+    # Without a subcommand no `func` default is set: print the help and exit with status 1.
+    args = parser.parse_args()
+    if not hasattr(args, "func"):
+        parser.print_help()
+        raise SystemExit(1)  # unlike the `exit` helper, does not depend on the `site` module
+
+    # Run: `func` is the factory the chosen subcommand registered through `set_defaults`.
+    service = args.func(args)
+    service.run()
+
+
+if __name__ == "__main__":
+    main()
--- a/src/diffusers/commands/env.py
+++ b/src/diffusers/commands/env.py
@@ -0,0 +1,83 @@
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import platform
+from argparse import ArgumentParser
+
+import huggingface_hub
+
+from .. import __version__ as version
+from ..utils import is_torch_available, is_transformers_available
+from . import BaseDiffusersCLICommand
+
+
+def info_command_factory(_):
+    """Factory stored as the `env` sub-parser's `func` default; the parsed arguments are ignored."""
+    return EnvironmentCommand()
+
+
+class EnvironmentCommand(BaseDiffusersCLICommand):
+    """`diffusers-cli env` command: prints library and platform versions to paste into a GitHub issue."""
+
+    @staticmethod
+    def register_subcommand(parser: ArgumentParser):
+        """Register the `env` sub-command on `parser` and bind it to `info_command_factory`."""
+        env_parser = parser.add_parser("env")
+        env_parser.set_defaults(func=info_command_factory)
+
+    def run(self):
+        """
+        Collect environment information, print it as a `- key: value` list and return it.
+
+        Returns:
+            `dict`: Maps each reported property to its value.
+        """
+        hub_version = huggingface_hub.__version__
+
+        pt_version = "not installed"
+        pt_cuda_available = "NA"
+        if is_torch_available():
+            import torch
+
+            pt_version = torch.__version__
+            pt_cuda_available = torch.cuda.is_available()
+
+        transformers_version = "not installed"
+        # The helper must be *called*: a bare function object is always truthy, which made the import below
+        # unconditional and crashed the command when `transformers` is not installed.
+        if is_transformers_available():
+            import transformers
+
+            transformers_version = transformers.__version__
+
+        info = {
+            "`diffusers` version": version,
+            "Platform": platform.platform(),
+            "Python version": platform.python_version(),
+            "PyTorch version (GPU?)": f"{pt_version} ({pt_cuda_available})",
+            "Huggingface_hub version": hub_version,
+            "Transformers version": transformers_version,
+            "Using GPU in script?": "<fill in>",
+            "Using distributed or parallel set-up in script?": "<fill in>",
+        }
+
+        print("\nCopy-and-paste the text below in your GitHub issue and FILL OUT the two last points.\n")
+        print(self.format_dict(info))
+
+        return info
+
+    @staticmethod
+    def format_dict(d):
+        """Format `d` as one `- key: value` line per item, with a trailing newline."""
+        return "\n".join([f"- {prop}: {val}" for prop, val in d.items()]) + "\n"
--- a/src/diffusers/configuration_utils.py
+++ b/src/diffusers/configuration_utils.py
@@ -14,6 +14,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """ ConfigMixinuration base class and utilities."""
+import dataclasses
 import functools
 import inspect
 import json
@@ -37,9 +38,16 @@ _re_configuration_file = re.compile(r"config\.(.*)\.json")

 class ConfigMixin:
    r"""
-    Base class for all configuration classes. Handles a few parameters common to all models' configurations as well as
-    methods for loading/downloading/saving configurations.
+    Base class for all configuration classes. Stores all configuration parameters under `self.config`. Also handles
+    all methods for loading/downloading/saving classes inheriting from [`ConfigMixin`] with
+        - [`~ConfigMixin.from_config`]
+        - [`~ConfigMixin.save_config`]

+    Class attributes:
+        - **config_name** (`str`) -- A filename under which the config should be stored when calling
+          [`~ConfigMixin.save_config`] (should be overridden by subclass).
+        - **ignore_for_config** (`List[str]`) -- A list of attributes that should not be saved in the config (should be
+          overridden by subclass).
    """
    config_name = None
    ignore_for_config = []
@@ -50,6 +58,10 @@ class ConfigMixin:
        kwargs["_class_name"] = self.__class__.__name__
        kwargs["_diffusers_version"] = __version__

+        # Special case for `kwargs` used in deprecation warning added to schedulers
+        # TODO: remove this when we remove the deprecation warning, and the `kwargs` argument,
+        # or solve in a more general way.
+        kwargs.pop("kwargs", None)
        for key, value in kwargs.items():
            try:
                setattr(self, key, value)
@@ -74,8 +86,6 @@ class ConfigMixin:
        Args:
            save_directory (`str` or `os.PathLike`):
                Directory where the configuration JSON file will be saved (will be created if it does not exist).
-            kwargs:
-                Additional key word arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method.
        """
        if os.path.isfile(save_directory):
            raise AssertionError(f"Provided path ({save_directory}) should be a directory, not a file")
@@ -90,16 +100,84 @@ class ConfigMixin:

    @classmethod
    def from_config(cls, pretrained_model_name_or_path: Union[str, os.PathLike], return_unused_kwargs=False, **kwargs):
-        config_dict = cls.get_config_dict(pretrained_model_name_or_path=pretrained_model_name_or_path, **kwargs)
+        r"""
+        Instantiate a Python class from a pre-defined JSON-file.

+        Parameters:
+            pretrained_model_name_or_path (`str` or `os.PathLike`, *optional*):
+                Can be either:
+
+                    - A string, the *model id* of a model repo on huggingface.co. Valid model ids should have an
+                      organization name, like `google/ddpm-celebahq-256`.
+                    - A path to a *directory* containing a config file saved using [`~ConfigMixin.save_config`], e.g.,
+                      `./my_model_directory/`.
+
+            cache_dir (`Union[str, os.PathLike]`, *optional*):
+                Path to a directory in which a downloaded pretrained model configuration should be cached if the
+                standard cache should not be used.
+            ignore_mismatched_sizes (`bool`, *optional*, defaults to `False`):
+                Whether or not to raise an error if some of the weights from the checkpoint do not have the same size
+                as the weights of the model (if for instance, you are instantiating a model with 10 labels from a
+                checkpoint with 3 labels).
+            force_download (`bool`, *optional*, defaults to `False`):
+                Whether or not to force the (re-)download of the model weights and configuration files, overriding the
+                cached versions if they exist.
+            resume_download (`bool`, *optional*, defaults to `False`):
+                Whether or not to delete incompletely received files. Will attempt to resume the download if such a
+                file exists.
+            proxies (`Dict[str, str]`, *optional*):
+                A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
+                'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
+            output_loading_info(`bool`, *optional*, defaults to `False`):
+                Whether or not to also return a dictionary containing missing keys, unexpected keys and error messages.
+            local_files_only(`bool`, *optional*, defaults to `False`):
+                Whether or not to only look at local files (i.e., do not try to download the model).
+            use_auth_token (`str` or *bool*, *optional*):
+                The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated
+                when running `huggingface-cli login` (stored in `~/.huggingface`).
+            revision (`str`, *optional*, defaults to `"main"`):
+                The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
+                git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
+                identifier allowed by git.
+            subfolder (`str`, *optional*, defaults to `""`):
+                In case the relevant files are located inside a subfolder of the model repo (either remote in
+                huggingface.co or downloaded locally), you can specify the folder name here.
+
+        <Tip>
+
+         It is required to be logged in (`huggingface-cli login`) when you want to use private or [gated
+         models](https://huggingface.co/docs/hub/models-gated#gated-models).
+
+        </Tip>
+
+        <Tip>
+
+        Activate the special ["offline-mode"](https://huggingface.co/transformers/installation.html#offline-mode) to
+        use this method in a firewalled environment.
+
+        </Tip>
+
+        """
+        config_dict = cls.get_config_dict(pretrained_model_name_or_path=pretrained_model_name_or_path, **kwargs)
        init_dict, unused_kwargs = cls.extract_init_dict(config_dict, **kwargs)

+        # Allow dtype to be specified on initialization
+        if "dtype" in unused_kwargs:
+            init_dict["dtype"] = unused_kwargs.pop("dtype")
+
+        # Return model and optionally state and/or unused_kwargs
        model = cls(**init_dict)
+        return_tuple = (model,)
+
+        # Flax schedulers have a state, so return it.
+        if cls.__name__.startswith("Flax") and hasattr(model, "create_state") and getattr(model, "has_state", False):
+            state = model.create_state()
+            return_tuple += (state,)

        if return_unused_kwargs:
-            return model, unused_kwargs
+            return return_tuple + (unused_kwargs,)
        else:
-            return model
+            return return_tuple if len(return_tuple) > 1 else model

    @classmethod
    def get_config_dict(
@@ -112,6 +190,7 @@ class ConfigMixin:
        use_auth_token = kwargs.pop("use_auth_token", None)
        local_files_only = kwargs.pop("local_files_only", False)
        revision = kwargs.pop("revision", None)
+        _ = kwargs.pop("mirror", None)
        subfolder = kwargs.pop("subfolder", None)

        user_agent = {"file_type": "config"}
@@ -152,6 +231,7 @@ class ConfigMixin:
                    use_auth_token=use_auth_token,
                    user_agent=user_agent,
                    subfolder=subfolder,
+                    revision=revision,
                )

            except RepositoryNotFoundError:
@@ -159,7 +239,7 @@ class ConfigMixin:
                    f"{pretrained_model_name_or_path} is not a local folder and is not a valid model identifier"
                    " listed on 'https://huggingface.co/models'\nIf this is a private repository, make sure to pass a"
                    " token having permission to this repo with `use_auth_token` or log in with `huggingface-cli"
-                    " login` and pass `use_auth_token=True`."
+                    " login`."
                )
            except RevisionNotFoundError:
                raise EnvironmentError(
@@ -207,6 +287,11 @@ class ConfigMixin:
        # remove general kwargs if present in dict
        if "kwargs" in expected_keys:
            expected_keys.remove("kwargs")
+        # remove flax internal keys
+        if hasattr(cls, "_flax_internal_args"):
+            for arg in cls._flax_internal_args:
+                expected_keys.remove(arg)
+
        # remove keys to be ignored
        if len(cls.ignore_for_config) > 0:
            expected_keys = expected_keys - set(cls.ignore_for_config)
@@ -219,11 +304,20 @@ class ConfigMixin:
                # use value from config dict
                init_dict[key] = config_dict.pop(key)

-        unused_kwargs = config_dict.update(kwargs)
+        config_dict = {k: v for k, v in config_dict.items() if not k.startswith("_")}
+
+        if len(config_dict) > 0:
+            logger.warning(
+                f"The config attributes {config_dict} were passed to {cls.__name__}, "
+                "but are not expected and will be ignored. Please verify your "
+                f"{cls.config_name} configuration file."
+            )
+
+        unused_kwargs = {**config_dict, **kwargs}

        passed_keys = set(init_dict.keys())
        if len(expected_keys - passed_keys) > 0:
-            logger.warning(
+            logger.info(
                f"{expected_keys - passed_keys} was not found in config. Values will be initialized to default values."
            )

@@ -297,10 +391,10 @@ class FrozenDict(OrderedDict):


 def register_to_config(init):
-    """
-    Decorator to apply on the init of classes inheriting from `ConfigMixin` so that all the arguments are automatically
-    sent to `self.register_for_config`. To ignore a specific argument accepted by the init but that shouldn't be
-    registered in the config, use the `ignore_for_config` class variable
+    r"""
+    Decorator to apply on the init of classes inheriting from [`ConfigMixin`] so that all the arguments are
+    automatically sent to `self.register_to_config`. To ignore a specific argument accepted by the init but that
+    shouldn't be registered in the config, use the `ignore_for_config` class variable

    Warning: Once decorated, all private arguments (beginning with an underscore) are trashed and not sent to the init!
    """
@@ -337,3 +431,57 @@ def register_to_config(init):
         getattr(self, "register_to_config")(**new_kwargs)
 
     return inner_init
+
+
+def flax_register_to_config(cls):
+    r"""
+    Class decorator for dataclasses inheriting from [`ConfigMixin`]: wraps `cls.__init__` so that the field values
+    are sent to `self.register_to_config` before the original init runs.
+
+    Private keyword arguments (beginning with an underscore) and fields listed in `_flax_internal_args` are not
+    registered; `dtype` is dropped from the keyword/default values before registration.
+    """
+    original_init = cls.__init__
+
+    @functools.wraps(original_init)
+    def init(self, *args, **kwargs):
+        if not isinstance(self, ConfigMixin):
+            raise RuntimeError(
+                f"`@flax_register_to_config` was applied to {self.__class__.__name__}, but this class does "
+                "not inherit from `ConfigMixin`."
+            )
+
+        # Ignore private kwargs in the init. Retrieve all passed attributes
+        init_kwargs = {k: v for k, v in kwargs.items() if not k.startswith("_")}
+
+        # Tolerate classes that do not declare `_flax_internal_args`, as `ConfigMixin.extract_init_dict` does
+        internal_args = getattr(self, "_flax_internal_args", ())
+
+        # Retrieve default values
+        fields = dataclasses.fields(self)
+        default_kwargs = {}
+        for field in fields:
+            # ignore flax specific attributes
+            if field.name in internal_args:
+                continue
+            # `dataclasses.MISSING` is the public sentinel for "no default"
+            if field.default is dataclasses.MISSING:
+                default_kwargs[field.name] = None
+            else:
+                default_kwargs[field.name] = getattr(self, field.name)
+
+        # Make sure init_kwargs override default kwargs
+        new_kwargs = {**default_kwargs, **init_kwargs}
+        # dtype should be part of `init_kwargs`, but not `new_kwargs`
+        new_kwargs.pop("dtype", None)
+
+        # Get positional arguments aligned with kwargs
+        for i, arg in enumerate(args):
+            name = fields[i].name
+            new_kwargs[name] = arg
+
+        getattr(self, "register_to_config")(**new_kwargs)
+        original_init(self, *args, **kwargs)
+
+    cls.__init__ = init
+    return cls
--- a/Show More
+++ b/Show More