Release: v0.13.0

add xformers 0.0.16 warning message (#2345 )
* add xformers 0.0.16 warning message * fix version check to check whole version string
2025-12-08 05:24:20 +08:00 · 2023-02-17 23:38:05 +02:00 · 2023-02-17 13:25:46 -08:00 · 2023-02-17 22:24:16 +01:00 · 2023-02-17 22:21:18 +02:00 · 2023-02-17 22:20:53 +02:00
455 changed files with 44656 additions and 8883 deletions
--- a/.github/ISSUE_TEMPLATE/config.yml
+++ b/.github/ISSUE_TEMPLATE/config.yml
@@ -1,4 +1,7 @@
 contact_links:
  - name: Blank issue
    url: https://github.com/huggingface/diffusers/issues/new
-    about: General usage questions and community discussions
+    about: Other
+  - name: Forum
+    url: https://discuss.huggingface.co/
+    about: General usage questions and community discussions
--- a/.github/workflows/build_documentation.yml
+++ b/.github/workflows/build_documentation.yml
@@ -13,5 +13,6 @@ jobs:
    with:
      commit_sha: ${{ github.sha }}
      package: diffusers
+      languages: en ko
    secrets:
      token: ${{ secrets.HUGGINGFACE_PUSH }}
--- a/.github/workflows/build_pr_documentation.yml
+++ b/.github/workflows/build_pr_documentation.yml
@@ -14,3 +14,4 @@ jobs:
      commit_sha: ${{ github.event.pull_request.head.sha }}
      pr_number: ${{ github.event.number }}
      package: diffusers
+      languages: en ko
--- a/.github/workflows/nightly_tests.yml
+++ b/.github/workflows/nightly_tests.yml
@@ -61,8 +61,8 @@ jobs:
      - name: Install dependencies
        run: |
          python -m pip install -e .[quality,test]
-          python -m pip install git+https://github.com/huggingface/accelerate
          python -m pip install -U git+https://github.com/huggingface/transformers
+          python -m pip install git+https://github.com/huggingface/accelerate

      - name: Environment
        run: |
@@ -159,4 +159,4 @@ jobs:
        uses: actions/upload-artifact@v2
        with:
          name: torch_mps_test_reports
-          path: reports
+          path: reports
--- a/.github/workflows/pr_quality.yml
+++ b/.github/workflows/pr_quality.yml
@@ -27,9 +27,8 @@ jobs:
          pip install .[quality]
      - name: Check quality
        run: |
-          black  --check --preview examples tests src utils scripts
-          isort --check-only examples tests src utils scripts
-          flake8 examples tests src utils scripts
+          black --check examples tests src utils scripts
+          ruff examples tests src utils scripts
          doc-builder style src/diffusers docs/source --max_len 119 --check_only --path_to_docs docs/source

  check_repository_consistency:
--- a/.github/workflows/pr_tests.yml
+++ b/.github/workflows/pr_tests.yml
@@ -36,6 +36,11 @@ jobs:
            runner: docker-cpu
            image: diffusers/diffusers-onnxruntime-cpu
            report: onnx_cpu
+          - name: PyTorch Example CPU tests on Ubuntu
+            framework: pytorch_examples
+            runner: docker-cpu
+            image: diffusers/diffusers-pytorch-cpu
+            report: torch_cpu

    name: ${{ matrix.config.name }}

@@ -59,8 +64,8 @@ jobs:
      run: |
        apt-get update && apt-get install libsndfile1-dev -y
        python -m pip install -e .[quality,test]
-        python -m pip install git+https://github.com/huggingface/accelerate
        python -m pip install -U git+https://github.com/huggingface/transformers
+        python -m pip install git+https://github.com/huggingface/accelerate

    - name: Environment
      run: |
@@ -90,6 +95,13 @@ jobs:
          --make-reports=tests_${{ matrix.config.report }} \
          tests/

+    - name: Run example PyTorch CPU tests
+      if: ${{ matrix.config.framework == 'pytorch_examples' }}
+      run: |
+        python -m pytest -n 2 --max-worker-restart=0 --dist=loadfile \
+          --make-reports=tests_${{ matrix.config.report }} \
+          examples/test_examples.py 
+
    - name: Failure short reports
      if: ${{ failure() }}
      run: cat reports/tests_${{ matrix.config.report }}_failures_short.txt
--- a/.github/workflows/push_tests.yml
+++ b/.github/workflows/push_tests.yml
@@ -61,8 +61,8 @@ jobs:
    - name: Install dependencies
      run: |
        python -m pip install -e .[quality,test]
-        python -m pip install git+https://github.com/huggingface/accelerate
        python -m pip install -U git+https://github.com/huggingface/transformers
+        python -m pip install git+https://github.com/huggingface/accelerate

    - name: Environment
      run: |
@@ -153,4 +153,4 @@ jobs:
      uses: actions/upload-artifact@v2
      with:
        name: examples_test_reports
-        path: reports
+        path: reports
--- a/.github/workflows/push_tests_fast.yml
+++ b/.github/workflows/push_tests_fast.yml
@@ -0,0 +1,165 @@
+name: Slow tests on main
+
+on:
+  push:
+    branches:
+      - main
+
+env:
+  DIFFUSERS_IS_CI: yes
+  HF_HOME: /mnt/cache
+  OMP_NUM_THREADS: 8
+  MKL_NUM_THREADS: 8
+  PYTEST_TIMEOUT: 600
+  RUN_SLOW: no
+
+jobs:
+  run_fast_tests:
+    strategy:
+      fail-fast: false
+      matrix:
+        config:
+          - name: Fast PyTorch CPU tests on Ubuntu
+            framework: pytorch
+            runner: docker-cpu
+            image: diffusers/diffusers-pytorch-cpu
+            report: torch_cpu
+          - name: Fast Flax CPU tests on Ubuntu
+            framework: flax
+            runner: docker-cpu
+            image: diffusers/diffusers-flax-cpu
+            report: flax_cpu
+          - name: Fast ONNXRuntime CPU tests on Ubuntu
+            framework: onnxruntime
+            runner: docker-cpu
+            image: diffusers/diffusers-onnxruntime-cpu
+            report: onnx_cpu
+          - name: PyTorch Example CPU tests on Ubuntu
+            framework: pytorch_examples
+            runner: docker-cpu
+            image: diffusers/diffusers-pytorch-cpu
+            report: torch_cpu
+
+    name: ${{ matrix.config.name }}
+
+    runs-on: ${{ matrix.config.runner }}
+
+    container:
+      image: ${{ matrix.config.image }}
+      options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/
+
+    defaults:
+      run:
+        shell: bash
+
+    steps:
+    - name: Checkout diffusers
+      uses: actions/checkout@v3
+      with:
+        fetch-depth: 2
+
+    - name: Install dependencies
+      run: |
+        apt-get update && apt-get install libsndfile1-dev -y
+        python -m pip install -e .[quality,test]
+        python -m pip install -U git+https://github.com/huggingface/transformers
+        python -m pip install git+https://github.com/huggingface/accelerate
+
+    - name: Environment
+      run: |
+        python utils/print_env.py
+
+    - name: Run fast PyTorch CPU tests
+      if: ${{ matrix.config.framework == 'pytorch' }}
+      run: |
+        python -m pytest -n 2 --max-worker-restart=0 --dist=loadfile \
+          -s -v -k "not Flax and not Onnx" \
+          --make-reports=tests_${{ matrix.config.report }} \
+          tests/
+
+    - name: Run fast Flax TPU tests
+      if: ${{ matrix.config.framework == 'flax' }}
+      run: |
+        python -m pytest -n 2 --max-worker-restart=0 --dist=loadfile \
+          -s -v -k "Flax" \
+          --make-reports=tests_${{ matrix.config.report }} \
+          tests/
+
+    - name: Run fast ONNXRuntime CPU tests
+      if: ${{ matrix.config.framework == 'onnxruntime' }}
+      run: |
+        python -m pytest -n 2 --max-worker-restart=0 --dist=loadfile \
+          -s -v -k "Onnx" \
+          --make-reports=tests_${{ matrix.config.report }} \
+          tests/
+
+    - name: Run example PyTorch CPU tests
+      if: ${{ matrix.config.framework == 'pytorch_examples' }}
+      run: |
+        python -m pytest -n 2 --max-worker-restart=0 --dist=loadfile \
+          --make-reports=tests_${{ matrix.config.report }} \
+          examples/test_examples.py 
+
+    - name: Failure short reports
+      if: ${{ failure() }}
+      run: cat reports/tests_${{ matrix.config.report }}_failures_short.txt
+
+    - name: Test suite reports artifacts
+      if: ${{ always() }}
+      uses: actions/upload-artifact@v2
+      with:
+        name: pr_${{ matrix.config.report }}_test_reports
+        path: reports
+
+  run_fast_tests_apple_m1:
+    name: Fast PyTorch MPS tests on MacOS
+    runs-on: [ self-hosted, apple-m1 ]
+
+    steps:
+    - name: Checkout diffusers
+      uses: actions/checkout@v3
+      with:
+        fetch-depth: 2
+
+    - name: Clean checkout
+      shell: arch -arch arm64 bash {0}
+      run: |
+        git clean -fxd
+
+    - name: Setup miniconda
+      uses: ./.github/actions/setup-miniconda
+      with:
+        python-version: 3.9
+
+    - name: Install dependencies
+      shell: arch -arch arm64 bash {0}
+      run: |
+        ${CONDA_RUN} python -m pip install --upgrade pip
+        ${CONDA_RUN} python -m pip install -e .[quality,test]
+        ${CONDA_RUN} python -m pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cpu
+        ${CONDA_RUN} python -m pip install git+https://github.com/huggingface/accelerate
+        ${CONDA_RUN} python -m pip install -U git+https://github.com/huggingface/transformers
+
+    - name: Environment
+      shell: arch -arch arm64 bash {0}
+      run: |
+        ${CONDA_RUN} python utils/print_env.py
+
+    - name: Run fast PyTorch tests on M1 (MPS)
+      shell: arch -arch arm64 bash {0}
+      env:
+        HF_HOME: /System/Volumes/Data/mnt/cache
+        HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
+      run: |
+        ${CONDA_RUN} python -m pytest -n 0 -s -v --make-reports=tests_torch_mps tests/
+
+    - name: Failure short reports
+      if: ${{ failure() }}
+      run: cat reports/tests_torch_mps_failures_short.txt
+
+    - name: Test suite reports artifacts
+      if: ${{ always() }}
+      uses: actions/upload-artifact@v2
+      with:
+        name: pr_torch_mps_test_reports
+        path: reports
--- a/.gitignore
+++ b/.gitignore
@@ -169,3 +169,6 @@ tags

 # dependencies
 /transformers
+
+# ruff
+.ruff_cache
--- a/CITATION.cff
+++ b/CITATION.cff
@@ -0,0 +1,40 @@
+cff-version: 1.2.0
+title: 'Diffusers: State-of-the-art diffusion models'
+message: >-
+  If you use this software, please cite it using the
+  metadata from this file.
+type: software
+authors:
+  - given-names: Patrick
+    family-names: von Platen
+  - given-names: Suraj
+    family-names: Patil
+  - given-names: Anton
+    family-names: Lozhkov
+  - given-names: Pedro
+    family-names: Cuenca
+  - given-names: Nathan
+    family-names: Lambert
+  - given-names: Kashif
+    family-names: Rasul
+  - given-names: Mishig
+    family-names: Davaadorj
+  - given-names: Thomas
+    family-names: Wolf
+repository-code: 'https://github.com/huggingface/diffusers'
+abstract: >-
+  Diffusers provides pretrained diffusion models across
+  multiple modalities, such as vision and audio, and serves
+  as a modular toolbox for inference and training of
+  diffusion models.
+keywords:
+  - deep-learning
+  - pytorch
+  - image-generation
+  - diffusion
+  - text2image
+  - image2image
+  - score-based-generative-modeling
+  - stable-diffusion
+license: Apache-2.0
+version: 0.12.1
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -177,7 +177,7 @@ Follow these steps to start contributing ([supported Python versions](https://gi
   $ make style
   ```

-   🧨 Diffusers also uses `flake8` and a few custom scripts to check for coding mistakes. Quality
+   🧨 Diffusers also uses `ruff` and a few custom scripts to check for coding mistakes. Quality
   control runs in CI, however you can also run the same checks with:

   ```bash
--- a/16
+++ b/16
@@ -9,9 +9,8 @@ modified_only_fixup:
 	$(eval modified_py_files := $(shell python utils/get_modified_files.py $(check_dirs)))
 	@if test -n "$(modified_py_files)"; then \
 		echo "Checking/fixing $(modified_py_files)"; \
-		black --preview $(modified_py_files); \
-		isort $(modified_py_files); \
-		flake8 $(modified_py_files); \
+		black $(modified_py_files); \
+		ruff $(modified_py_files); \
 	else \
 		echo "No library .py files were modified"; \
 	fi
@@ -41,22 +40,23 @@ repo-consistency:
 # this target runs checks on all files

 quality:
-	black --check --preview $(check_dirs)
-	isort --check-only $(check_dirs)
-	flake8 $(check_dirs)
+	black --check $(check_dirs)
+	ruff $(check_dirs)
 	doc-builder style src/diffusers docs/source --max_len 119 --check_only --path_to_docs docs/source
+	python utils/check_doc_toc.py

 # Format source code automatically and check is there are any problems left that need manual fixing

 extra_style_checks:
 	python utils/custom_init_isort.py
 	doc-builder style src/diffusers docs/source --max_len 119 --path_to_docs docs/source
+	python utils/check_doc_toc.py --fix_and_overwrite

 # this target runs checks on all files and potentially modifies some of them

 style:
-	black --preview $(check_dirs)
-	isort $(check_dirs)
+	black $(check_dirs)
+	ruff $(check_dirs) --fix
 	${MAKE} autogenerate_code
 	${MAKE} extra_style_checks

--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
 <p align="center">
    <br>
-    <img src="https://github.com/huggingface/diffusers/raw/main/docs/source/imgs/diffusers_library.jpg" width="400"/>
+    <img src="./docs/source/en/imgs/diffusers_library.jpg" width="400"/>
    <br>
 <p>
 <p align="center">
@@ -235,6 +235,102 @@ images = pipeline(prompt_ids, params, prng_seed, num_inference_steps, jit=True).
 images = pipeline.numpy_to_pil(np.asarray(images.reshape((num_samples,) + images.shape[-3:])))
 ```

+Diffusers also has a Image-to-Image generation pipeline with Flax/Jax
+```python
+import jax
+import numpy as np
+import jax.numpy as jnp
+from flax.jax_utils import replicate
+from flax.training.common_utils import shard
+import requests
+from io import BytesIO
+from PIL import Image
+from diffusers import FlaxStableDiffusionImg2ImgPipeline
+
+def create_key(seed=0):
+    return jax.random.PRNGKey(seed)
+rng = create_key(0)
+
+url = "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/assets/stable-samples/img2img/sketch-mountains-input.jpg"
+response = requests.get(url)
+init_img = Image.open(BytesIO(response.content)).convert("RGB")
+init_img = init_img.resize((768, 512))
+
+prompts = "A fantasy landscape, trending on artstation"
+
+pipeline, params = FlaxStableDiffusionImg2ImgPipeline.from_pretrained(
+    "CompVis/stable-diffusion-v1-4", revision="flax",
+    dtype=jnp.bfloat16,
+)
+
+num_samples = jax.device_count()
+rng = jax.random.split(rng, jax.device_count())
+prompt_ids, processed_image = pipeline.prepare_inputs(prompt=[prompts]*num_samples, image = [init_img]*num_samples)
+p_params = replicate(params)
+prompt_ids = shard(prompt_ids)
+processed_image = shard(processed_image)
+
+output = pipeline(
+    prompt_ids=prompt_ids, 
+    image=processed_image, 
+    params=p_params, 
+    prng_seed=rng, 
+    strength=0.75, 
+    num_inference_steps=50, 
+    jit=True, 
+    height=512,
+    width=768).images
+
+output_images = pipeline.numpy_to_pil(np.asarray(output.reshape((num_samples,) + output.shape[-3:])))
+```
+
+Diffusers also has a Text-guided inpainting pipeline with Flax/Jax
+
+```python
+import jax
+import numpy as np
+from flax.jax_utils import replicate
+from flax.training.common_utils import shard
+import PIL
+import requests
+from io import BytesIO
+
+
+from diffusers import FlaxStableDiffusionInpaintPipeline
+
+def download_image(url):
+    response = requests.get(url)
+    return PIL.Image.open(BytesIO(response.content)).convert("RGB")
+img_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo.png"
+mask_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo_mask.png"
+
+init_image = download_image(img_url).resize((512, 512))
+mask_image = download_image(mask_url).resize((512, 512))
+
+pipeline, params = FlaxStableDiffusionInpaintPipeline.from_pretrained("xvjiarui/stable-diffusion-2-inpainting")
+
+prompt = "Face of a yellow cat, high resolution, sitting on a park bench"
+prng_seed = jax.random.PRNGKey(0)
+num_inference_steps = 50
+
+num_samples = jax.device_count()
+prompt = num_samples * [prompt]
+init_image = num_samples * [init_image]
+mask_image = num_samples * [mask_image]
+prompt_ids, processed_masked_images, processed_masks = pipeline.prepare_inputs(prompt, init_image, mask_image)
+
+
+# shard inputs and rng
+params = replicate(params)
+prng_seed = jax.random.split(prng_seed, jax.device_count())
+prompt_ids = shard(prompt_ids)
+processed_masked_images = shard(processed_masked_images)
+processed_masks = shard(processed_masks)
+
+images = pipeline(prompt_ids, processed_masks, processed_masked_images, params, prng_seed, num_inference_steps, jit=True).images
+images = pipeline.numpy_to_pil(np.asarray(images.reshape((num_samples,) + images.shape[-3:])))
+```
+
 ### Image-to-Image text-guided generation with Stable Diffusion

 The `StableDiffusionImg2ImgPipeline` lets you pass a text prompt and an initial image to condition the generation of new images.
--- a/docker/diffusers-flax-cpu/Dockerfile
+++ b/docker/diffusers-flax-cpu/Dockerfile
@@ -34,8 +34,8 @@ RUN python3 -m pip install --no-cache-dir --upgrade pip && \
        datasets \
        hf-doc-builder \
        huggingface-hub \
+        Jinja2 \
        librosa \
-        modelcards \
        numpy \
        scipy \
        tensorboard \
--- a/docker/diffusers-flax-tpu/Dockerfile
+++ b/docker/diffusers-flax-tpu/Dockerfile
@@ -36,8 +36,8 @@ RUN python3 -m pip install --no-cache-dir --upgrade pip && \
        datasets \
        hf-doc-builder \
        huggingface-hub \
+        Jinja2 \
        librosa \        
-        modelcards \
        numpy \
        scipy \
        tensorboard \
--- a/docker/diffusers-onnxruntime-cpu/Dockerfile
+++ b/docker/diffusers-onnxruntime-cpu/Dockerfile
@@ -34,8 +34,8 @@ RUN python3 -m pip install --no-cache-dir --upgrade pip && \
        datasets \
        hf-doc-builder \
        huggingface-hub \
+        Jinja2 \
        librosa \
-        modelcards \
        numpy \
        scipy \
        tensorboard \
--- a/docker/diffusers-onnxruntime-cuda/Dockerfile
+++ b/docker/diffusers-onnxruntime-cuda/Dockerfile
@@ -34,8 +34,8 @@ RUN python3 -m pip install --no-cache-dir --upgrade pip && \
        datasets \
        hf-doc-builder \
        huggingface-hub \
+        Jinja2 \
        librosa \
-        modelcards \
        numpy \
        scipy \
        tensorboard \
--- a/docker/diffusers-pytorch-cpu/Dockerfile
+++ b/docker/diffusers-pytorch-cpu/Dockerfile
@@ -33,8 +33,8 @@ RUN python3 -m pip install --no-cache-dir --upgrade pip && \
        datasets \
        hf-doc-builder \
        huggingface-hub \
+        Jinja2 \
        librosa \
-        modelcards \
        numpy \
        scipy \
        tensorboard \
--- a/docker/diffusers-pytorch-cuda/Dockerfile
+++ b/docker/diffusers-pytorch-cuda/Dockerfile
@@ -33,8 +33,8 @@ RUN python3 -m pip install --no-cache-dir --upgrade pip && \
        datasets \
        hf-doc-builder \
        huggingface-hub \
+        Jinja2 \
        librosa \
-        modelcards \
        numpy \
        scipy \
        tensorboard \
--- a/docs/README.md
+++ b/docs/README.md
@@ -54,7 +54,7 @@ doc-builder preview {package_name} {path_to_docs}
 For example:

 ```bash
-doc-builder preview diffusers docs/source/
+doc-builder preview diffusers docs/source/en
 ```

 The docs will be viewable at [http://localhost:3000](http://localhost:3000). You can also preview the docs once you have opened a PR. You will see a bot add a comment to a link where the documentation with your changes lives.
@@ -126,23 +126,28 @@ When adding a new pipeline:
    - Paper abstract
    - Tips and tricks and how to use it best
    - Possible an end-to-end example of how to use it
- Add all the pipeline classes that should be linked in the diffusion model. These classes should be added using our Markdown syntax. Usually as follows:
-
-```
-## XXXPipeline
-
-[[autodoc]] XXXPipeline
-```
-
-This will include every public method of the pipeline that is documented. You can specify which methods should be in the docs:
+- Add all the pipeline classes that should be linked in the diffusion model. These classes should be added using our Markdown syntax. By default as follows:

 ```
 ## XXXPipeline

 [[autodoc]] XXXPipeline
+    - all
 	- __call__
 ```

+This will include every public method of the pipeline that is documented, as well as the  `__call__` method that is not documented by default. If you just want to add additional methods that are not documented, you can put the list of all methods to add in a list that contains `all`.
+
+```
+[[autodoc]] XXXPipeline
+    - all
+	- __call__
+	- enable_attention_slicing
+	- disable_attention_slicing
+    - enable_xformers_memory_efficient_attention 
+    - disable_xformers_memory_efficient_attention
+```
+
 You can follow the same process to create a new scheduler under the `docs/source/api/schedulers` folder

 ### Writing source documentation
@@ -155,9 +160,9 @@ adds a link to its documentation with this syntax: \[\`XXXClass\`\] or \[\`funct
 function to be in the main package.

 If you want to create a link to some internal class or function, you need to
-provide its path. For instance: \[\`pipeline_utils.ImagePipelineOutput\`\]. This will be converted into a link with
-`pipeline_utils.ImagePipelineOutput` in the description. To get rid of the path and only keep the name of the object you are
-linking to in the description, add a ~: \[\`~pipeline_utils.ImagePipelineOutput\`\] will generate a link with `ImagePipelineOutput` in the description.
+provide its path. For instance: \[\`pipelines.ImagePipelineOutput\`\]. This will be converted into a link with
+`pipelines.ImagePipelineOutput` in the description. To get rid of the path and only keep the name of the object you are
+linking to in the description, add a ~: \[\`~pipelines.ImagePipelineOutput\`\] will generate a link with `ImagePipelineOutput` in the description.

 The same works for methods so you can either use \[\`XXXClass.method\`\] or \[~\`XXXClass.method\`\].

--- a/docs/TRANSLATING.md
+++ b/docs/TRANSLATING.md
@@ -0,0 +1,57 @@
+### Translating the Diffusers documentation into your language
+
+As part of our mission to democratize machine learning, we'd love to make the Diffusers library available in many more languages! Follow the steps below if you want to help translate the documentation into your language 🙏.
+
+**🗞️ Open an issue**
+
+To get started, navigate to the [Issues](https://github.com/huggingface/diffusers/issues) page of this repo and check if anyone else has opened an issue for your language. If not, open a new issue by selecting the "Translation template" from the "New issue" button.
+
+Once an issue exists, post a comment to indicate which chapters you'd like to work on, and we'll add your name to the list.
+
+
+**🍴 Fork the repository**
+
+First, you'll need to [fork the Diffusers repo](https://docs.github.com/en/get-started/quickstart/fork-a-repo). You can do this by clicking on the **Fork** button on the top-right corner of this repo's page.
+
+Once you've forked the repo, you'll want to get the files on your local machine for editing. You can do that by cloning the fork with Git as follows:
+
+```bash
+git clone https://github.com/YOUR-USERNAME/diffusers.git
+```
+
+**📋 Copy-paste the English version with a new language code**
+
+The documentation files are in one leading directory:
+
+- [`docs/source`](https://github.com/huggingface/diffusers/tree/main/docs/source): All the documentation materials are organized here by language.
+
+You'll only need to copy the files in the [`docs/source/en`](https://github.com/huggingface/diffusers/tree/main/docs/source/en) directory, so first navigate to your fork of the repo and run the following:
+
+```bash
+cd ~/path/to/diffusers/docs
+cp -r source/en source/LANG-ID
+```
+
+Here, `LANG-ID` should be one of the ISO 639-1 or ISO 639-2 language codes -- see [here](https://www.loc.gov/standards/iso639-2/php/code_list.php) for a handy table.
+
+**✍️ Start translating**
+
+The fun part comes - translating the text!
+
+The first thing we recommend is translating the part of the `_toctree.yml` file that corresponds to your doc chapter. This file is used to render the table of contents on the website. 
+
+> 🙋 If the `_toctree.yml` file doesn't yet exist for your language, you can create one by copy-pasting from the English version and deleting the sections unrelated to your chapter. Just make sure it exists in the `docs/source/LANG-ID/` directory!
+
+The fields you should add are `local` (with the name of the file containing the translation; e.g. `autoclass_tutorial`), and `title` (with the title of the doc in your language; e.g. `Load pretrained instances with an AutoClass`) -- as a reference, here is the `_toctree.yml` for [English](https://github.com/huggingface/diffusers/blob/main/docs/source/en/_toctree.yml):
+
+```yaml
+- sections:
+  - local: pipeline_tutorial # Do not change this! Use the same name for your .md file
+    title: Pipelines for inference # Translate this!
+    ...
+  title: Tutorials # Translate this!
+```
+
+Once you have translated the `_toctree.yml` file, you can start translating the [MDX](https://mdxjs.com/) files associated with your docs chapter.
+
+> 🙋 If you'd like others to help you with the translation, you should [open an issue](https://github.com/huggingface/diffusers/issues) and tag @patrickvonplaten.
--- a/docs/source/_toctree.yml
+++ b/docs/source/_toctree.yml
@@ -1,178 +0,0 @@
- sections:
-  - local: index
-    title: "🧨 Diffusers"
-  - local: quicktour
-    title: "Quicktour"
-  - local: installation
-    title: "Installation"
-  title: "Get started"
- sections:
-  - sections:
-    - local: using-diffusers/loading
-      title: "Loading Pipelines, Models, and Schedulers"
-    - local: using-diffusers/schedulers
-      title: "Using different Schedulers"
-    - local: using-diffusers/configuration
-      title: "Configuring Pipelines, Models, and Schedulers"
-    - local: using-diffusers/custom_pipeline_overview
-      title: "Loading and Adding Custom Pipelines"
-    title: "Loading & Hub"
-  - sections:
-    - local: using-diffusers/unconditional_image_generation
-      title: "Unconditional Image Generation"
-    - local: using-diffusers/conditional_image_generation
-      title: "Text-to-Image Generation"
-    - local: using-diffusers/img2img
-      title: "Text-Guided Image-to-Image"
-    - local: using-diffusers/inpaint
-      title: "Text-Guided Image-Inpainting"
-    - local: using-diffusers/depth2img
-      title: "Text-Guided Depth-to-Image"
-    - local: using-diffusers/reusing_seeds
-      title: "Reusing seeds for deterministic generation"
-    - local: using-diffusers/custom_pipeline_examples
-      title: "Community Pipelines"
-    - local: using-diffusers/contribute_pipeline
-      title: "How to contribute a Pipeline"
-    title: "Pipelines for Inference"
-  - sections:
-    - local: using-diffusers/rl
-      title: "Reinforcement Learning"
-    - local: using-diffusers/audio
-      title: "Audio"
-    - local: using-diffusers/other-modalities
-      title: "Other Modalities"
-    title: "Taking Diffusers Beyond Images"
-  title: "Using Diffusers"
- sections:
-  - local: optimization/fp16
-    title: "Memory and Speed"
-  - local: optimization/xformers
-    title: "xFormers"
-  - local: optimization/onnx
-    title: "ONNX"
-  - local: optimization/open_vino
-    title: "OpenVINO"
-  - local: optimization/mps
-    title: "MPS"
-  - local: optimization/habana
-    title: "Habana Gaudi"
-  title: "Optimization/Special Hardware"
- sections:
-  - local: training/overview
-    title: "Overview"
-  - local: training/unconditional_training
-    title: "Unconditional Image Generation"
-  - local: training/text_inversion
-    title: "Textual Inversion"
-  - local: training/dreambooth
-    title: "Dreambooth"
-  - local: training/text2image
-    title: "Text-to-image fine-tuning"
-  title: "Training"
- sections:
-  - local: conceptual/stable_diffusion
-    title: "Stable Diffusion"
-  - local: conceptual/philosophy
-    title: "Philosophy"
-  - local: conceptual/contribution
-    title: "How to contribute?"
-  title: "Conceptual Guides"
- sections:
-  - sections:
-    - local: api/models
-      title: "Models"
-    - local: api/diffusion_pipeline
-      title: "Diffusion Pipeline"
-    - local: api/logging
-      title: "Logging"
-    - local: api/configuration
-      title: "Configuration"
-    - local: api/outputs
-      title: "Outputs"
-    title: "Main Classes"
-  - sections:
-    - local: api/pipelines/overview
-      title: "Overview"
-    - local: api/pipelines/alt_diffusion
-      title: "AltDiffusion"
-    - local: api/pipelines/cycle_diffusion
-      title: "Cycle Diffusion"
-    - local: api/pipelines/ddim
-      title: "DDIM"
-    - local: api/pipelines/ddpm
-      title: "DDPM"
-    - local: api/pipelines/latent_diffusion
-      title: "Latent Diffusion"
-    - local: api/pipelines/latent_diffusion_uncond
-      title: "Unconditional Latent Diffusion"
-    - local: api/pipelines/paint_by_example
-      title: "PaintByExample"
-    - local: api/pipelines/pndm
-      title: "PNDM"
-    - local: api/pipelines/score_sde_ve
-      title: "Score SDE VE"
-    - local: api/pipelines/stable_diffusion
-      title: "Stable Diffusion"
-    - local: api/pipelines/stable_diffusion_2
-      title: "Stable Diffusion 2"
-    - local: api/pipelines/stable_diffusion_safe
-      title: "Safe Stable Diffusion"
-    - local: api/pipelines/stochastic_karras_ve
-      title: "Stochastic Karras VE"
-    - local: api/pipelines/dance_diffusion
-      title: "Dance Diffusion"
-    - local: api/pipelines/unclip
-      title: "UnCLIP"
-    - local: api/pipelines/versatile_diffusion
-      title: "Versatile Diffusion"
-    - local: api/pipelines/vq_diffusion
-      title: "VQ Diffusion"
-    - local: api/pipelines/repaint
-      title: "RePaint"
-    - local: api/pipelines/audio_diffusion
-      title: "Audio Diffusion"
-    title: "Pipelines"
-  - sections:
-    - local: api/schedulers/overview
-      title: "Overview"
-    - local: api/schedulers/ddim
-      title: "DDIM"
-    - local: api/schedulers/ddpm
-      title: "DDPM"
-    - local: api/schedulers/singlestep_dpm_solver
-      title: "Singlestep DPM-Solver"
-    - local: api/schedulers/multistep_dpm_solver
-      title: "Multistep DPM-Solver"
-    - local: api/schedulers/heun
-      title: "Heun Scheduler"
-    - local: api/schedulers/dpm_discrete
-      title: "DPM Discrete Scheduler"
-    - local: api/schedulers/dpm_discrete_ancestral
-      title: "DPM Discrete Scheduler with ancestral sampling"
-    - local: api/schedulers/stochastic_karras_ve
-      title: "Stochastic Kerras VE"
-    - local: api/schedulers/lms_discrete
-      title: "Linear Multistep"
-    - local: api/schedulers/pndm
-      title: "PNDM"
-    - local: api/schedulers/score_sde_ve
-      title: "VE-SDE"
-    - local: api/schedulers/ipndm
-      title: "IPNDM"
-    - local: api/schedulers/score_sde_vp
-      title: "VP-SDE"
-    - local: api/schedulers/euler
-      title: "Euler scheduler"
-    - local: api/schedulers/euler_ancestral
-      title: "Euler Ancestral Scheduler"
-    - local: api/schedulers/vq_diffusion
-      title: "VQDiffusionScheduler"
-    - local: api/schedulers/repaint
-      title: "RePaint Scheduler"      
-    title: "Schedulers"
-  - sections:
-    - local: api/experimental/rl
-      title: "RL Planning"
-    title: "Experimental Features"
-  title: "API"
--- a/docs/source/conceptual/philosophy.mdx
+++ b/docs/source/conceptual/philosophy.mdx
@@ -1,17 +0,0 @@
-<!--Copyright 2022 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-->
-
-# Philosophy
-
- Readability and clarity are preferred over highly optimized code. A strong importance is put on providing readable, intuitive and elementary code design. *E.g.*, the provided [schedulers](https://github.com/huggingface/diffusers/tree/main/src/diffusers/schedulers) are separated from the provided [models](https://github.com/huggingface/diffusers/tree/main/src/diffusers/models) and use well-commented code that can be read alongside the original paper.
- Diffusers is **modality independent** and focuses on providing pretrained models and tools to build systems that generate **continuous outputs**, *e.g.* vision and audio. This is one of the guiding goals even if the initial pipelines are devoted to vision tasks.
- Diffusion models and schedulers are provided as concise, elementary building blocks. In contrast, diffusion pipelines are a collection of end-to-end diffusion systems that can be used out-of-the-box, should stay as close as possible to their original implementations and can include components of other libraries, such as text encoders. Examples of diffusion pipelines are [Glide](https://github.com/openai/glide-text2im), [Latent Diffusion](https://github.com/CompVis/latent-diffusion) and [Stable Diffusion](https://github.com/compvis/stable-diffusion).
--- a/docs/source/en/_toctree.yml
+++ b/docs/source/en/_toctree.yml
@@ -0,0 +1,232 @@
+- sections:
+  - local: index
+    title: 🧨 Diffusers
+  - local: quicktour
+    title: Quicktour
+  - local: stable_diffusion
+    title: Stable Diffusion
+  - local: installation
+    title: Installation
+  title: Get started
+- sections:
+  - sections:
+    - local: using-diffusers/loading
+      title: Loading Pipelines, Models, and Schedulers
+    - local: using-diffusers/schedulers
+      title: Using different Schedulers
+    - local: using-diffusers/configuration
+      title: Configuring Pipelines, Models, and Schedulers
+    - local: using-diffusers/custom_pipeline_overview
+      title: Loading and Adding Custom Pipelines
+    - local: using-diffusers/kerascv
+      title: Using KerasCV Stable Diffusion Checkpoints in Diffusers
+    title: Loading & Hub
+  - sections:
+    - local: using-diffusers/unconditional_image_generation
+      title: Unconditional Image Generation
+    - local: using-diffusers/conditional_image_generation
+      title: Text-to-Image Generation
+    - local: using-diffusers/img2img
+      title: Text-Guided Image-to-Image
+    - local: using-diffusers/inpaint
+      title: Text-Guided Image-Inpainting
+    - local: using-diffusers/depth2img
+      title: Text-Guided Depth-to-Image
+    - local: using-diffusers/controlling_generation
+      title: Controlling generation
+    - local: using-diffusers/reusing_seeds
+      title: Reusing seeds for deterministic generation
+    - local: using-diffusers/reproducibility
+      title: Reproducibility
+    - local: using-diffusers/custom_pipeline_examples
+      title: Community Pipelines
+    - local: using-diffusers/contribute_pipeline
+      title: How to contribute a Pipeline
+    - local: using-diffusers/using_safetensors
+      title: Using safetensors
+    title: Pipelines for Inference
+  - sections:
+    - local: using-diffusers/rl
+      title: Reinforcement Learning
+    - local: using-diffusers/audio
+      title: Audio
+    - local: using-diffusers/other-modalities
+      title: Other Modalities
+    title: Taking Diffusers Beyond Images
+  title: Using Diffusers
+- sections:
+  - local: optimization/fp16
+    title: Memory and Speed
+  - local: optimization/torch2.0
+    title: Torch2.0 support
+  - local: optimization/xformers
+    title: xFormers
+  - local: optimization/onnx
+    title: ONNX
+  - local: optimization/open_vino
+    title: OpenVINO
+  - local: optimization/mps
+    title: MPS
+  - local: optimization/habana
+    title: Habana Gaudi
+  title: Optimization/Special Hardware
+- sections:
+  - local: training/overview
+    title: Overview
+  - local: training/unconditional_training
+    title: Unconditional Image Generation
+  - local: training/text_inversion
+    title: Textual Inversion
+  - local: training/dreambooth
+    title: Dreambooth
+  - local: training/text2image
+    title: Text-to-image fine-tuning
+  - local: training/lora
+    title: LoRA Support in Diffusers
+  title: Training
+- sections:
+  - local: conceptual/philosophy
+    title: Philosophy
+  - local: conceptual/contribution
+    title: How to contribute?
+  - local: conceptual/ethical_guidelines
+    title: Diffusers' Ethical Guidelines
+  title: Conceptual Guides
+- sections:
+  - sections:
+    - local: api/models
+      title: Models
+    - local: api/diffusion_pipeline
+      title: Diffusion Pipeline
+    - local: api/logging
+      title: Logging
+    - local: api/configuration
+      title: Configuration
+    - local: api/outputs
+      title: Outputs
+    - local: api/loaders
+      title: Loaders
+    title: Main Classes
+  - sections:
+    - local: api/pipelines/overview
+      title: Overview
+    - local: api/pipelines/alt_diffusion
+      title: AltDiffusion
+    - local: api/pipelines/audio_diffusion
+      title: Audio Diffusion
+    - local: api/pipelines/cycle_diffusion
+      title: Cycle Diffusion
+    - local: api/pipelines/dance_diffusion
+      title: Dance Diffusion
+    - local: api/pipelines/ddim
+      title: DDIM
+    - local: api/pipelines/ddpm
+      title: DDPM
+    - local: api/pipelines/dit
+      title: DiT
+    - local: api/pipelines/latent_diffusion
+      title: Latent Diffusion
+    - local: api/pipelines/paint_by_example
+      title: PaintByExample
+    - local: api/pipelines/pndm
+      title: PNDM
+    - local: api/pipelines/repaint
+      title: RePaint
+    - local: api/pipelines/stable_diffusion_safe
+      title: Safe Stable Diffusion
+    - local: api/pipelines/score_sde_ve
+      title: Score SDE VE
+    - local: api/pipelines/semantic_stable_diffusion
+      title: Semantic Guidance
+    - sections:
+      - local: api/pipelines/stable_diffusion/overview
+        title: Overview
+      - local: api/pipelines/stable_diffusion/text2img
+        title: Text-to-Image
+      - local: api/pipelines/stable_diffusion/img2img
+        title: Image-to-Image
+      - local: api/pipelines/stable_diffusion/inpaint
+        title: Inpaint
+      - local: api/pipelines/stable_diffusion/depth2img
+        title: Depth-to-Image
+      - local: api/pipelines/stable_diffusion/image_variation
+        title: Image-Variation
+      - local: api/pipelines/stable_diffusion/upscale
+        title: Super-Resolution
+      - local: api/pipelines/stable_diffusion/latent_upscale
+        title: Stable-Diffusion-Latent-Upscaler
+      - local: api/pipelines/stable_diffusion/pix2pix
+        title: InstructPix2Pix
+      - local: api/pipelines/stable_diffusion/attend_and_excite
+        title: Attend and Excite
+      - local: api/pipelines/stable_diffusion/pix2pix_zero
+        title: Pix2Pix Zero
+      - local: api/pipelines/stable_diffusion/self_attention_guidance
+        title: Self-Attention Guidance
+      - local: api/pipelines/stable_diffusion/panorama
+        title: MultiDiffusion Panorama
+      title: Stable Diffusion
+    - local: api/pipelines/stable_diffusion_2
+      title: Stable Diffusion 2
+    - local: api/pipelines/stable_unclip
+      title: Stable unCLIP
+    - local: api/pipelines/stochastic_karras_ve
+      title: Stochastic Karras VE
+    - local: api/pipelines/unclip
+      title: UnCLIP
+    - local: api/pipelines/latent_diffusion_uncond
+      title: Unconditional Latent Diffusion
+    - local: api/pipelines/versatile_diffusion
+      title: Versatile Diffusion
+    - local: api/pipelines/vq_diffusion
+      title: VQ Diffusion
+    title: Pipelines
+  - sections:
+    - local: api/schedulers/overview
+      title: Overview
+    - local: api/schedulers/ddim
+      title: DDIM
+    - local: api/schedulers/ddim_inverse
+      title: DDIMInverse
+    - local: api/schedulers/ddpm
+      title: DDPM
+    - local: api/schedulers/deis
+      title: DEIS
+    - local: api/schedulers/dpm_discrete
+      title: DPM Discrete Scheduler
+    - local: api/schedulers/dpm_discrete_ancestral
+      title: DPM Discrete Scheduler with ancestral sampling
+    - local: api/schedulers/euler_ancestral
+      title: Euler Ancestral Scheduler
+    - local: api/schedulers/euler
+      title: Euler scheduler
+    - local: api/schedulers/heun
+      title: Heun Scheduler
+    - local: api/schedulers/ipndm
+      title: IPNDM
+    - local: api/schedulers/lms_discrete
+      title: Linear Multistep
+    - local: api/schedulers/multistep_dpm_solver
+      title: Multistep DPM-Solver
+    - local: api/schedulers/pndm
+      title: PNDM
+    - local: api/schedulers/repaint
+      title: RePaint Scheduler
+    - local: api/schedulers/singlestep_dpm_solver
+      title: Singlestep DPM-Solver
+    - local: api/schedulers/stochastic_karras_ve
+      title: Stochastic Kerras VE
+    - local: api/schedulers/unipc
+      title: UniPCMultistepScheduler
+    - local: api/schedulers/score_sde_ve
+      title: VE-SDE
+    - local: api/schedulers/score_sde_vp
+      title: VP-SDE
+    - local: api/schedulers/vq_diffusion
+      title: VQDiffusionScheduler
+    title: Schedulers
+  - sections:
+    - local: api/experimental/rl
+      title: RL Planning
+    title: Experimental Features
+  title: API
--- a/docs/source/en/api/configuration.mdx
+++ b/docs/source/en/api/configuration.mdx
--- a/docs/source/en/api/diffusion_pipeline.mdx
+++ b/docs/source/en/api/diffusion_pipeline.mdx
@@ -30,13 +30,18 @@ Any pipeline object can be saved locally with [`~DiffusionPipeline.save_pretrain

 ## DiffusionPipeline
 [[autodoc]] DiffusionPipeline
-	- from_pretrained
-	- save_pretrained
-	- to
+	- all
+	- __call__
 	- device
+	- to
 	- components

 ## ImagePipelineOutput
 By default diffusion pipelines return an object of class

-[[autodoc]] pipeline_utils.ImagePipelineOutput
+[[autodoc]] pipelines.ImagePipelineOutput
+
+## AudioPipelineOutput
+By default diffusion pipelines return an object of class
+
+[[autodoc]] pipelines.AudioPipelineOutput
--- a/docs/source/en/api/experimental/rl.mdx
+++ b/docs/source/en/api/experimental/rl.mdx
--- a/docs/source/en/api/loaders.mdx
+++ b/docs/source/en/api/loaders.mdx
@@ -0,0 +1,30 @@
+<!--Copyright 2022 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Loaders
+
+There are many ways to train adapter neural networks for diffusion models, such as 
+- [Textual Inversion](./training/text_inversion.mdx)
+- [LoRA](https://github.com/cloneofsimo/lora)
+- [Hypernetworks](https://arxiv.org/abs/1609.09106)
+
+Such adapter neural networks often only consist of a fraction of the number of weights compared 
+to the pretrained model and as such are very portable. The Diffusers library offers an easy-to-use
+API to load such adapter neural networks via the [`loaders.py` module](https://github.com/huggingface/diffusers/blob/main/src/diffusers/loaders.py). 
+
+**Note**: This module is still highly experimental and prone to future changes.
+
+## LoaderMixins
+
+### UNet2DConditionLoadersMixin
+
+[[autodoc]] loaders.UNet2DConditionLoadersMixin
--- a/docs/source/en/api/logging.mdx
+++ b/docs/source/en/api/logging.mdx
@@ -1,4 +1,4 @@
-<!--Copyright 2020 The HuggingFace Team. All rights reserved.
+<!--Copyright 2022 The HuggingFace Team. All rights reserved.

 Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 the License. You may obtain a copy of the License at
--- a/docs/source/en/api/models.mdx
+++ b/docs/source/en/api/models.mdx
@@ -41,13 +41,13 @@ The models are built on the base class ['ModelMixin'] that is a `torch.nn.module
 [[autodoc]] models.vae.DecoderOutput

 ## VQEncoderOutput
-[[autodoc]] models.vae.VQEncoderOutput
+[[autodoc]] models.vq_model.VQEncoderOutput

 ## VQModel
 [[autodoc]] VQModel

 ## AutoencoderKLOutput
-[[autodoc]] models.vae.AutoencoderKLOutput
+[[autodoc]] models.autoencoder_kl.AutoencoderKLOutput

 ## AutoencoderKL
 [[autodoc]] AutoencoderKL
@@ -56,7 +56,7 @@ The models are built on the base class ['ModelMixin'] that is a `torch.nn.module
 [[autodoc]] Transformer2DModel

 ## Transformer2DModelOutput
-[[autodoc]] models.attention.Transformer2DModelOutput
+[[autodoc]] models.transformer_2d.Transformer2DModelOutput

 ## PriorTransformer
 [[autodoc]] models.prior_transformer.PriorTransformer
--- a/docs/source/en/api/outputs.mdx
+++ b/docs/source/en/api/outputs.mdx
@@ -25,7 +25,7 @@ pipeline = DDIMPipeline.from_pretrained("google/ddpm-cifar10-32")
 outputs = pipeline()
 ```

-The `outputs` object is a [`~pipeline_utils.ImagePipelineOutput`], as we can see in the
+The `outputs` object is a [`~pipelines.ImagePipelineOutput`], as we can see in the
 documentation of that class below, it means it has an image attribute.

 You can access each attribute as you would usually do, and if that attribute has not been returned by the model, you will get `None`:
--- a/docs/source/en/api/pipelines/alt_diffusion.mdx
+++ b/docs/source/en/api/pipelines/alt_diffusion.mdx
@@ -28,7 +28,7 @@ The abstract of the paper is the following:

 ## Tips

- AltDiffusion is conceptually exaclty the same as [Stable Diffusion](./api/pipelines/stable_diffusion).
+- AltDiffusion is conceptually exaclty the same as [Stable Diffusion](./api/pipelines/stable_diffusion/overview).

 - *Run AltDiffusion*

@@ -69,15 +69,15 @@ If you want to use all possible use cases in a single `DiffusionPipeline` we rec

 ## AltDiffusionPipelineOutput
 [[autodoc]] pipelines.alt_diffusion.AltDiffusionPipelineOutput
+	- all
+	- __call__

 ## AltDiffusionPipeline
 [[autodoc]] AltDiffusionPipeline
+	- all
 	- __call__
-	- enable_attention_slicing
-	- disable_attention_slicing

 ## AltDiffusionImg2ImgPipeline
 [[autodoc]] AltDiffusionImg2ImgPipeline
+	- all
 	- __call__
-	- enable_attention_slicing
-	- disable_attention_slicing
--- a/docs/source/en/api/pipelines/audio_diffusion.mdx
+++ b/docs/source/en/api/pipelines/audio_diffusion.mdx
@@ -91,12 +91,8 @@ display(Audio(output.audios[0], rate=pipe.mel.get_sample_rate()))

 ## AudioDiffusionPipeline
 [[autodoc]] AudioDiffusionPipeline
-    - __call__
-    - encode
-    - slerp
-
+	- all
+	- __call__

 ## Mel
 [[autodoc]] Mel
-    - audio_slice_to_image
-    - image_to_audio
--- a/docs/source/en/api/pipelines/cycle_diffusion.mdx
+++ b/docs/source/en/api/pipelines/cycle_diffusion.mdx
@@ -96,4 +96,5 @@ image.save("black_to_blue.png")

 ## CycleDiffusionPipeline
 [[autodoc]] CycleDiffusionPipeline
+	- all
 	- __call__
--- a/docs/source/en/api/pipelines/dance_diffusion.mdx
+++ b/docs/source/en/api/pipelines/dance_diffusion.mdx
@@ -30,4 +30,5 @@ The original codebase of this implementation can be found [here](https://github.

 ## DanceDiffusionPipeline
 [[autodoc]] DanceDiffusionPipeline
-    - __call__
+	- all
+	- __call__
--- a/docs/source/en/api/pipelines/ddim.mdx
+++ b/docs/source/en/api/pipelines/ddim.mdx
@@ -32,4 +32,5 @@ For questions, feel free to contact the author on [tsong.me](https://tsong.me/).

 ## DDIMPipeline
 [[autodoc]] DDIMPipeline
-    - __call__
+	- all
+	- __call__
--- a/docs/source/en/api/pipelines/ddpm.mdx
+++ b/docs/source/en/api/pipelines/ddpm.mdx
@@ -33,4 +33,5 @@ The original codebase of this paper can be found [here](https://github.com/hojon

 # DDPMPipeline
 [[autodoc]] DDPMPipeline
-    - __call__
+	- all
+	- __call__
--- a/docs/source/en/api/pipelines/dit.mdx
+++ b/docs/source/en/api/pipelines/dit.mdx
@@ -0,0 +1,59 @@
+<!--Copyright 2022 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Scalable Diffusion Models with Transformers (DiT)
+
+## Overview
+
+[Scalable Diffusion Models with Transformers](https://arxiv.org/abs/2212.09748) (DiT) by William Peebles and Saining Xie.
+
+The abstract of the paper is the following:
+
+*We explore a new class of diffusion models based on the transformer architecture. We train latent diffusion models of images, replacing the commonly-used U-Net backbone with a transformer that operates on latent patches. We analyze the scalability of our Diffusion Transformers (DiTs) through the lens of forward pass complexity as measured by Gflops. We find that DiTs with higher Gflops -- through increased transformer depth/width or increased number of input tokens -- consistently have lower FID. In addition to possessing good scalability properties, our largest DiT-XL/2 models outperform all prior diffusion models on the class-conditional ImageNet 512x512 and 256x256 benchmarks, achieving a state-of-the-art FID of 2.27 on the latter.*
+
+The original codebase of this paper can be found here: [facebookresearch/dit](https://github.com/facebookresearch/dit).
+
+## Available Pipelines:
+
+| Pipeline | Tasks | Colab
+|---|---|:---:|
+| [pipeline_dit.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/dit/pipeline_dit.py) | *Conditional Image Generation* | - |
+
+
+## Usage example
+
+```python
+from diffusers import DiTPipeline, DPMSolverMultistepScheduler
+import torch
+
+pipe = DiTPipeline.from_pretrained("facebook/DiT-XL-2-256", torch_dtype=torch.float16)
+pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
+pipe = pipe.to("cuda")
+
+# pick words from Imagenet class labels
+pipe.labels  # to print all available words
+
+# pick words that exist in ImageNet
+words = ["white shark", "umbrella"]
+
+class_ids = pipe.get_label_ids(words)
+
+generator = torch.manual_seed(33)
+output = pipe(class_labels=class_ids, num_inference_steps=25, generator=generator)
+
+image = output.images[0]  # label 'white shark'
+```
+
+## DiTPipeline
+[[autodoc]] DiTPipeline
+	- all
+	- __call__
--- a/docs/source/en/api/pipelines/latent_diffusion.mdx
+++ b/docs/source/en/api/pipelines/latent_diffusion.mdx
@@ -40,8 +40,10 @@ The original codebase can be found [here](https://github.com/CompVis/latent-diff

 ## LDMTextToImagePipeline
 [[autodoc]] LDMTextToImagePipeline
-    - __call__
+	- all
+	- __call__

 ## LDMSuperResolutionPipeline
 [[autodoc]] LDMSuperResolutionPipeline
-    - __call__
+	- all
+	- __call__
--- a/docs/source/en/api/pipelines/latent_diffusion_uncond.mdx
+++ b/docs/source/en/api/pipelines/latent_diffusion_uncond.mdx
@@ -38,4 +38,5 @@ The original codebase can be found [here](https://github.com/CompVis/latent-diff

 ## LDMPipeline
 [[autodoc]] LDMPipeline
-    - __call__
+	- all
+	- __call__
--- a/docs/source/en/api/pipelines/overview.mdx
+++ b/docs/source/en/api/pipelines/overview.mdx
@@ -57,13 +57,24 @@ available a colab notebook to directly try them out.
 | [pndm](./pndm) | [**Pseudo Numerical Methods for Diffusion Models on Manifolds**](https://arxiv.org/abs/2202.09778) | Unconditional Image Generation | 
 | [score_sde_ve](./score_sde_ve) | [**Score-Based Generative Modeling through Stochastic Differential Equations**](https://openreview.net/forum?id=PxTIG12RRHS) | Unconditional Image Generation | 
 | [score_sde_vp](./score_sde_vp) | [**Score-Based Generative Modeling through Stochastic Differential Equations**](https://openreview.net/forum?id=PxTIG12RRHS) | Unconditional Image Generation | 
-| [stable_diffusion](./stable_diffusion) | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release) | Text-to-Image Generation | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/training_example.ipynb)
-| [stable_diffusion](./stable_diffusion) | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release) | Image-to-Image Text-Guided Generation | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/image_2_image_using_diffusers.ipynb)
-| [stable_diffusion](./stable_diffusion) | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release) | Text-Guided Image Inpainting | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/in_painting_with_stable_diffusion_using_diffusers.ipynb)
-| [stable_diffusion_2](./stable_diffusion_2) | [**Stable Diffusion 2**](https://stability.ai/blog/stable-diffusion-v2-release) | Text-to-Image Generation | 
+| [semantic_stable_diffusion](./semantic_stable_diffusion) | [**SEGA: Instructing Diffusion using Semantic Dimensions**](https://arxiv.org/abs/2301.12247) | Text-to-Image Generation |
+| [stable_diffusion_text2img](./stable_diffusion/text2img) | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release) | Text-to-Image Generation | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/training_example.ipynb)
+| [stable_diffusion_img2img](./stable_diffusion/img2img) | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release) | Image-to-Image Text-Guided Generation | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/image_2_image_using_diffusers.ipynb)
+| [stable_diffusion_inpaint](./stable_diffusion/inpaint) | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release) | Text-Guided Image Inpainting | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/in_painting_with_stable_diffusion_using_diffusers.ipynb)
+| [stable_diffusion_panorama](./stable_diffusion/panorama) | [**MultiDiffusion: Fusing Diffusion Paths for Controlled Image Generation**](https://arxiv.org/abs/2302.08113) | Text-Guided Panorama View Generation |
+| [stable_diffusion_pix2pix](./stable_diffusion/pix2pix) | [**InstructPix2Pix: Learning to Follow Image Editing Instructions**](https://arxiv.org/abs/2211.09800) | Text-Based Image Editing |
+| [stable_diffusion_pix2pix_zero](./stable_diffusion/pix2pix_zero) | [**Zero-shot Image-to-Image Translation**](https://arxiv.org/abs/2302.03027) | Text-Based Image Editing |
+| [stable_diffusion_attend_and_excite](./stable_diffusion/attend_and_excite) | [**Attend and Excite: Attention-Based Semantic Guidance for Text-to-Image Diffusion Models**](https://arxiv.org/abs/2301.13826) | Text-to-Image Generation |
+| [stable_diffusion_self_attention_guidance](./stable_diffusion/self_attention_guidance) | [**Self-Attention Guidance**](https://arxiv.org/abs/2210.00939) | Text-to-Image Generation |
+| [stable_diffusion_image_variation](./stable_diffusion/image_variation) | [**Stable Diffusion Image Variations**](https://github.com/LambdaLabsML/lambda-diffusers#stable-diffusion-image-variations) | Image-to-Image Generation |
+| [stable_diffusion_latent_upscale](./stable_diffusion/latent_upscale) | [**Stable Diffusion Latent Upscaler**](https://twitter.com/StabilityAI/status/1590531958815064065) | Text-Guided Super Resolution Image-to-Image |
+| [stable_diffusion_2](./stable_diffusion_2/) | [**Stable Diffusion 2**](https://stability.ai/blog/stable-diffusion-v2-release) | Text-to-Image Generation | 
 | [stable_diffusion_2](./stable_diffusion_2) | [**Stable Diffusion 2**](https://stability.ai/blog/stable-diffusion-v2-release) | Text-Guided Image Inpainting | 
+| [stable_diffusion_2](./stable_diffusion_2) | [**Stable Diffusion 2**](https://stability.ai/blog/stable-diffusion-v2-release) | Depth-to-Image Text-Guided Generation |
 | [stable_diffusion_2](./stable_diffusion_2) | [**Stable Diffusion 2**](https://stability.ai/blog/stable-diffusion-v2-release) | Text-Guided Super Resolution Image-to-Image |
 | [stable_diffusion_safe](./stable_diffusion_safe) | [**Safe Stable Diffusion**](https://arxiv.org/abs/2211.05105) | Text-Guided Generation | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ml-research/safe-latent-diffusion/blob/main/examples/Safe%20Latent%20Diffusion.ipynb)
+| [stable_unclip](./stable_unclip) | **Stable unCLIP** | Text-to-Image Generation |
+| [stable_unclip](./stable_unclip) | **Stable unCLIP** | Image-to-Image Text-Guided Generation |
 | [stochastic_karras_ve](./stochastic_karras_ve) | [**Elucidating the Design Space of Diffusion-Based Generative Models**](https://arxiv.org/abs/2206.00364) | Unconditional Image Generation |
 | [unclip](./unclip) | [Hierarchical Text-Conditional Image Generation with CLIP Latents](https://arxiv.org/abs/2204.06125) | Text-to-Image Generation |
 | [versatile_diffusion](./versatile_diffusion) | [Versatile Diffusion: Text, Images and Variations All in One Diffusion Model](https://arxiv.org/abs/2211.08332) | Text-to-Image Generation | 
--- a/docs/source/en/api/pipelines/paint_by_example.mdx
+++ b/docs/source/en/api/pipelines/paint_by_example.mdx
@@ -69,5 +69,6 @@ image
 ```

 ## PaintByExamplePipeline
-[[autodoc]] pipelines.paint_by_example.pipeline_paint_by_example.PaintByExamplePipeline
-    - __call__
+[[autodoc]] PaintByExamplePipeline
+	- all
+	- __call__
--- a/docs/source/en/api/pipelines/pndm.mdx
+++ b/docs/source/en/api/pipelines/pndm.mdx
@@ -30,6 +30,6 @@ The original codebase can be found [here](https://github.com/luping-liu/PNDM).


 ## PNDMPipeline
-[[autodoc]] pipelines.pndm.pipeline_pndm.PNDMPipeline
-    - __call__
-
+[[autodoc]] PNDMPipeline
+	- all
+	- __call__
--- a/docs/source/en/api/pipelines/repaint.mdx
+++ b/docs/source/en/api/pipelines/repaint.mdx
@@ -72,6 +72,6 @@ inpainted_image = output.images[0]
 ```

 ## RePaintPipeline
-[[autodoc]] pipelines.repaint.pipeline_repaint.RePaintPipeline
-    - __call__
-
+[[autodoc]] RePaintPipeline
+	- all
+	- __call__
--- a/docs/source/en/api/pipelines/score_sde_ve.mdx
+++ b/docs/source/en/api/pipelines/score_sde_ve.mdx
@@ -32,5 +32,5 @@ This pipeline implements the Variance Expanding (VE) variant of the method.

 ## ScoreSdeVePipeline
 [[autodoc]] ScoreSdeVePipeline
-    - __call__
-
+	- all
+	- __call__
--- a/docs/source/en/api/pipelines/semantic_stable_diffusion.mdx
+++ b/docs/source/en/api/pipelines/semantic_stable_diffusion.mdx
@@ -0,0 +1,79 @@
+<!--Copyright 2022 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Semantic Guidance
+
+Semantic Guidance for Diffusion Models was proposed in [SEGA: Instructing Diffusion using Semantic Dimensions](https://arxiv.org/abs/2301.12247) and provides strong semantic control over the image generation.
+Small changes to the text prompt usually result in entirely different output images. However, with SEGA a variety of changes to the image are enabled that can be controlled easily and intuitively, and stay true to the original image composition.
+
+The abstract of the paper is the following:
+
+*Text-to-image diffusion models have recently received a lot of interest for their astonishing ability to produce high-fidelity images from text only. However, achieving one-shot generation that aligns with the user's intent is nearly impossible, yet small changes to the input prompt often result in very different images. This leaves the user with little semantic control. To put the user in control, we show how to interact with the diffusion process to flexibly steer it along semantic directions. This semantic guidance (SEGA) allows for subtle and extensive edits, changes in composition and style, as well as optimizing the overall artistic conception. We demonstrate SEGA's effectiveness on a variety of tasks and provide evidence for its versatility and flexibility.*
+
+
+*Overview*:
+
+| Pipeline | Tasks | Colab | Demo
+|---|---|:---:|:---:|
+| [pipeline_semantic_stable_diffusion.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion) | *Text-to-Image Generation* |  [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ml-research/semantic-image-editing/blob/main/examples/SemanticGuidance.ipynb) | [Coming Soon](https://huggingface.co/AIML-TUDA)
+
+## Tips
+
+- The Semantic Guidance pipeline can be used with any [Stable Diffusion](./api/pipelines/stable_diffusion/text2img) checkpoint.
+
+### Run Semantic Guidance
+
+The interface of [`SemanticStableDiffusionPipeline`] provides several additional parameters to influence the image generation.
+Exemplary usage may look like this:
+
+```python
+import torch
+from diffusers import SemanticStableDiffusionPipeline
+
+pipe = SemanticStableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16)
+pipe = pipe.to("cuda")
+
+out = pipe(
+    prompt="a photo of the face of a woman",
+    num_images_per_prompt=1,
+    guidance_scale=7,
+    editing_prompt=[
+        "smiling, smile",  # Concepts to apply
+        "glasses, wearing glasses",
+        "curls, wavy hair, curly hair",
+        "beard, full beard, mustache",
+    ],
+    reverse_editing_direction=[False, False, False, False],  # Direction of guidance i.e. increase all concepts
+    edit_warmup_steps=[10, 10, 10, 10],  # Warmup period for each concept
+    edit_guidance_scale=[4, 5, 5, 5.4],  # Guidance scale for each concept
+    edit_threshold=[
+        0.99,
+        0.975,
+        0.925,
+        0.96,
+    ],  # Threshold for each concept. Threshold equals the percentile of the latent space that will be discarded. I.e. threshold=0.99 uses 1% of the latent dimensions
+    edit_momentum_scale=0.3,  # Momentum scale that will be added to the latent guidance
+    edit_mom_beta=0.6,  # Momentum beta
+    edit_weights=[1, 1, 1, 1, 1],  # Weights of the individual concepts against each other
+)
+```
+
+For more examples check the colab notebook.
+
+## StableDiffusionSafePipelineOutput
+[[autodoc]] pipelines.semantic_stable_diffusion.SemanticStableDiffusionPipelineOutput
+	- all
+
+## SemanticStableDiffusionPipeline
+[[autodoc]] SemanticStableDiffusionPipeline
+	- all
+	- __call__
--- a/docs/source/en/api/pipelines/stable_diffusion/attend_and_excite.mdx
+++ b/docs/source/en/api/pipelines/stable_diffusion/attend_and_excite.mdx
@@ -0,0 +1,75 @@
+<!--Copyright 2023 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Attend and Excite: Attention-Based Semantic Guidance for Text-to-Image Diffusion Models
+
+## Overview
+
+Attend and Excite for Stable Diffusion was proposed in [Attend-and-Excite: Attention-Based Semantic Guidance for Text-to-Image Diffusion Models](https://attendandexcite.github.io/Attend-and-Excite/) and provides textual attention control over the image generation.
+
+The abstract of the paper is the following:
+
+*Text-to-image diffusion models have recently received a lot of interest for their astonishing ability to produce high-fidelity images from text only. However, achieving one-shot generation that aligns with the user's intent is nearly impossible, yet small changes to the input prompt often result in very different images. This leaves the user with little semantic control. To put the user in control, we show how to interact with the diffusion process to flexibly steer it along semantic directions. This semantic guidance (SEGA) allows for subtle and extensive edits, changes in composition and style, as well as optimizing the overall artistic conception. We demonstrate SEGA's effectiveness on a variety of tasks and provide evidence for its versatility and flexibility.*
+
+Resources
+
+* [Project Page](https://attendandexcite.github.io/Attend-and-Excite/)
+* [Paper](https://arxiv.org/abs/2301.13826)
+* [Original Code](https://github.com/AttendAndExcite/Attend-and-Excite)
+* [Demo](https://huggingface.co/spaces/AttendAndExcite/Attend-and-Excite)
+
+
+## Available Pipelines:
+
+| Pipeline | Tasks | Colab | Demo
+|---|---|:---:|:---:|
+| [pipeline_semantic_stable_diffusion_attend_and_excite.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_semantic_stable_diffusion_attend_and_excite) | *Text-to-Image Generation* | - | -
+
+
+### Usage example
+
+
+```python
+import torch
+from diffusers import StableDiffusionAttendAndExcitePipeline
+
+model_id = "CompVis/stable-diffusion-v1-4"
+pipe = StableDiffusionAttendAndExcitePipeline.from_pretrained(model_id, torch_dtype=torch.float16).to("cuda")
+pipe = pipe.to("cuda")
+
+prompt = "a cat and a frog"
+
+# use get_indices function to find out indices of the tokens you want to alter
+pipe.get_indices(prompt)
+
+token_indices = [2, 5]
+seed = 6141
+generator = torch.Generator("cuda").manual_seed(seed)
+
+images = pipe(
+    prompt=prompt,
+    token_indices=token_indices,
+    guidance_scale=7.5,
+    generator=generator,
+    num_inference_steps=50,
+    max_iter_to_alter=25,
+).images
+
+image = images[0]
+image.save(f"../images/{prompt}_{seed}.png")
+```
+
+
+## StableDiffusionAttendAndExcitePipeline
+[[autodoc]] StableDiffusionAttendAndExcitePipeline
+	- all
+	- __call__
--- a/docs/source/en/api/pipelines/stable_diffusion/depth2img.mdx
+++ b/docs/source/en/api/pipelines/stable_diffusion/depth2img.mdx
@@ -0,0 +1,33 @@
+<!--Copyright 2022 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Depth-to-Image Generation
+
+## StableDiffusionDepth2ImgPipeline
+
+The depth-guided stable diffusion model was created by the researchers and engineers from [CompVis](https://github.com/CompVis), [Stability AI](https://stability.ai/), and [LAION](https://laion.ai/), as part of Stable Diffusion 2.0. It uses [MiDas](https://github.com/isl-org/MiDaS) to infer depth based on an image.
+
+[`StableDiffusionDepth2ImgPipeline`] lets you pass a text prompt and an initial image to condition the generation of new images as well as a `depth_map` to preserve the images’ structure. 
+
+The original codebase can be found here: 
+- *Stable Diffusion v2*: [Stability-AI/stablediffusion](https://github.com/Stability-AI/stablediffusion#depth-conditional-stable-diffusion)
+
+Available Checkpoints are:
+- *stable-diffusion-2-depth*: [stabilityai/stable-diffusion-2-depth](https://huggingface.co/stabilityai/stable-diffusion-2-depth)
+
+[[autodoc]] StableDiffusionDepth2ImgPipeline
+	- all
+	- __call__
+	- enable_attention_slicing
+	- disable_attention_slicing
+	- enable_xformers_memory_efficient_attention
+	- disable_xformers_memory_efficient_attention
--- a/docs/source/en/api/pipelines/stable_diffusion/image_variation.mdx
+++ b/docs/source/en/api/pipelines/stable_diffusion/image_variation.mdx
@@ -0,0 +1,31 @@
+<!--Copyright 2022 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Image Variation
+
+## StableDiffusionImageVariationPipeline
+
+[`StableDiffusionImageVariationPipeline`] lets you generate variations from an input image using Stable Diffusion. It uses a fine-tuned version of Stable Diffusion model, trained by  [Justin Pinkney](https://www.justinpinkney.com/) (@Buntworthy) at [Lambda](https://lambdalabs.com/)
+
+The original codebase can be found here:
+[Stable Diffusion Image Variations](https://github.com/LambdaLabsML/lambda-diffusers#stable-diffusion-image-variations)
+
+Available Checkpoints are:
+- *sd-image-variations-diffusers*: [lambdalabs/sd-image-variations-diffusers](https://huggingface.co/lambdalabs/sd-image-variations-diffusers)
+
+[[autodoc]] StableDiffusionImageVariationPipeline
+	- all
+	- __call__
+	- enable_attention_slicing
+	- disable_attention_slicing
+	- enable_xformers_memory_efficient_attention
+	- disable_xformers_memory_efficient_attention
--- a/docs/source/en/api/pipelines/stable_diffusion/img2img.mdx
+++ b/docs/source/en/api/pipelines/stable_diffusion/img2img.mdx
@@ -0,0 +1,29 @@
+<!--Copyright 2022 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Image-to-Image Generation
+
+## StableDiffusionImg2ImgPipeline
+
+The Stable Diffusion model was created by the researchers and engineers from [CompVis](https://github.com/CompVis), [Stability AI](https://stability.ai/), [runway](https://github.com/runwayml), and [LAION](https://laion.ai/). The [`StableDiffusionImg2ImgPipeline`] lets you pass a text prompt and an initial image to condition the generation of new images using Stable Diffusion.
+
+The original codebase can be found here: [CampVis/stable-diffusion](https://github.com/CompVis/stable-diffusion/blob/main/scripts/img2img.py) 
+
+[`StableDiffusionImg2ImgPipeline`] is compatible with all Stable Diffusion checkpoints for [Text-to-Image](./text2img) 
+
+[[autodoc]] StableDiffusionImg2ImgPipeline
+	- all
+	- __call__
+	- enable_attention_slicing
+	- disable_attention_slicing
+	- enable_xformers_memory_efficient_attention
+	- disable_xformers_memory_efficient_attention
--- a/docs/source/en/api/pipelines/stable_diffusion/inpaint.mdx
+++ b/docs/source/en/api/pipelines/stable_diffusion/inpaint.mdx
@@ -0,0 +1,33 @@
+<!--Copyright 2022 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Text-Guided Image Inpainting
+
+## StableDiffusionInpaintPipeline
+
+The Stable Diffusion model was created by the researchers and engineers from [CompVis](https://github.com/CompVis), [Stability AI](https://stability.ai/), [runway](https://github.com/runwayml), and [LAION](https://laion.ai/). The [`StableDiffusionInpaintPipeline`] lets you edit specific parts of an image by providing a mask and a text prompt using Stable Diffusion.
+
+The original codebase can be found here: 
+- *Stable Diffusion V1*: [CampVis/stable-diffusion](https://github.com/runwayml/stable-diffusion#inpainting-with-stable-diffusion)
+- *Stable Diffusion V2*: [Stability-AI/stablediffusion](https://github.com/Stability-AI/stablediffusion#image-inpainting-with-stable-diffusion)
+
+Available checkpoints are:
+- *stable-diffusion-inpainting (512x512 resolution)*: [runwayml/stable-diffusion-inpainting](https://huggingface.co/runwayml/stable-diffusion-inpainting)
+- *stable-diffusion-2-inpainting (512x512 resolution)*: [stabilityai/stable-diffusion-2-inpainting](https://huggingface.co/stabilityai/stable-diffusion-2-inpainting)
+
+[[autodoc]] StableDiffusionInpaintPipeline
+	- all
+	- __call__
+	- enable_attention_slicing
+	- disable_attention_slicing
+	- enable_xformers_memory_efficient_attention
+	- disable_xformers_memory_efficient_attention
--- a/docs/source/en/api/pipelines/stable_diffusion/latent_upscale.mdx
+++ b/docs/source/en/api/pipelines/stable_diffusion/latent_upscale.mdx
@@ -0,0 +1,33 @@
+<!--Copyright 2023 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Stable Diffusion Latent Upscaler
+
+## StableDiffusionLatentUpscalePipeline
+
+The Stable Diffusion Latent Upscaler model was created by [Katherine Crowson](https://github.com/crowsonkb/k-diffusion) in collaboration with [Stability AI](https://stability.ai/). It can be used on top of any [`StableDiffusionUpscalePipeline`] checkpoint to enhance its output image resolution by a factor of 2.
+
+A notebook that demonstrates the original implementation can be found here: 
+- [Stable Diffusion Upscaler Demo](https://colab.research.google.com/drive/1o1qYJcFeywzCIdkfKJy7cTpgZTCM2EI4)
+
+Available Checkpoints are:
+- *stabilityai/latent-upscaler*: [stabilityai/sd-x2-latent-upscaler](https://huggingface.co/stabilityai/sd-x2-latent-upscaler)
+
+
+[[autodoc]] StableDiffusionLatentUpscalePipeline
+	- all
+	- __call__
+	- enable_sequential_cpu_offload
+	- enable_attention_slicing
+	- disable_attention_slicing
+	- enable_xformers_memory_efficient_attention
+	- disable_xformers_memory_efficient_attention
--- a/docs/source/en/api/pipelines/stable_diffusion/overview.mdx
+++ b/docs/source/en/api/pipelines/stable_diffusion/overview.mdx
@@ -25,9 +25,18 @@ For more details about how Stable Diffusion works and how it differs from the ba

 | Pipeline | Tasks | Colab | Demo
 |---|---|:---:|:---:|
-| [pipeline_stable_diffusion.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py) | *Text-to-Image Generation* | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/stable_diffusion.ipynb) | [🤗 Stable Diffusion](https://huggingface.co/spaces/stabilityai/stable-diffusion)
-| [pipeline_stable_diffusion_img2img.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py) | *Image-to-Image Text-Guided Generation* | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/image_2_image_using_diffusers.ipynb) | [🤗 Diffuse the Rest](https://huggingface.co/spaces/huggingface/diffuse-the-rest)
-| [pipeline_stable_diffusion_inpaint.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py) | **Experimental** – *Text-Guided Image Inpainting* | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/in_painting_with_stable_diffusion_using_diffusers.ipynb) | Coming soon
+| [StableDiffusionPipeline](./text2img) | *Text-to-Image Generation* | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/stable_diffusion.ipynb) | [🤗 Stable Diffusion](https://huggingface.co/spaces/stabilityai/stable-diffusion)
+| [StableDiffusionImg2ImgPipeline](./img2img) | *Image-to-Image Text-Guided Generation* | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/image_2_image_using_diffusers.ipynb) | [🤗 Diffuse the Rest](https://huggingface.co/spaces/huggingface/diffuse-the-rest)
+| [StableDiffusionInpaintPipeline](./inpaint) | **Experimental** – *Text-Guided Image Inpainting* | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/in_painting_with_stable_diffusion_using_diffusers.ipynb) | Coming soon
+| [StableDiffusionDepth2ImgPipeline](./depth2img) | **Experimental** – *Depth-to-Image Text-Guided Generation * | | Coming soon
+| [StableDiffusionImageVariationPipeline](./image_variation) | **Experimental** – *Image Variation Generation * | | [🤗 Stable Diffusion Image Variations](https://huggingface.co/spaces/lambdalabs/stable-diffusion-image-variations)
+| [StableDiffusionUpscalePipeline](./upscale) | **Experimental** – *Text-Guided Image Super-Resolution * | | Coming soon
+| [StableDiffusionLatentUpscalePipeline](./latent_upscale) | **Experimental** – *Text-Guided Image Super-Resolution * | | Coming soon
+| [StableDiffusionInstructPix2PixPipeline](./pix2pix) | **Experimental** – *Text-Based Image Editing * | | [InstructPix2Pix: Learning to Follow Image Editing Instructions](https://huggingface.co/spaces/timbrooks/instruct-pix2pix)
+| [StableDiffusionAttendAndExcitePipeline](./attend_and_excite) | **Experimental** – *Text-to-Image Generation * | | [Attend-and-Excite: Attention-Based Semantic Guidance for Text-to-Image Diffusion Models](https://huggingface.co/spaces/AttendAndExcite/Attend-and-Excite)
+| [StableDiffusionPix2PixZeroPipeline](./pix2pix_zero) | **Experimental** – *Text-Based Image Editing * | | [Zero-shot Image-to-Image Translation](https://arxiv.org/abs/2302.03027)
+
+

 ## Tips

@@ -70,54 +79,3 @@ If you want to use all possible use cases in a single `DiffusionPipeline` you ca

 ## StableDiffusionPipelineOutput
 [[autodoc]] pipelines.stable_diffusion.StableDiffusionPipelineOutput
-
-## StableDiffusionPipeline
-[[autodoc]] StableDiffusionPipeline
-	- __call__
-	- enable_attention_slicing
-	- disable_attention_slicing
-	- enable_vae_slicing
-	- disable_vae_slicing
-	- enable_xformers_memory_efficient_attention
-	- disable_xformers_memory_efficient_attention
-
-## StableDiffusionImg2ImgPipeline
-[[autodoc]] StableDiffusionImg2ImgPipeline
-	- __call__
-	- enable_attention_slicing
-	- disable_attention_slicing
-	- enable_xformers_memory_efficient_attention
-	- disable_xformers_memory_efficient_attention
-
-## StableDiffusionInpaintPipeline
-[[autodoc]] StableDiffusionInpaintPipeline
-	- __call__
-	- enable_attention_slicing
-	- disable_attention_slicing
-	- enable_xformers_memory_efficient_attention
-	- disable_xformers_memory_efficient_attention
-
-## StableDiffusionDepth2ImgPipeline
-[[autodoc]] StableDiffusionDepth2ImgPipeline
-	- __call__
-	- enable_attention_slicing
-	- disable_attention_slicing
-	- enable_xformers_memory_efficient_attention
-	- disable_xformers_memory_efficient_attention
-
-## StableDiffusionImageVariationPipeline
-[[autodoc]] StableDiffusionImageVariationPipeline
-	- __call__
-	- enable_attention_slicing
-	- disable_attention_slicing
-	- enable_xformers_memory_efficient_attention
-	- disable_xformers_memory_efficient_attention
-
-
-## StableDiffusionUpscalePipeline
-[[autodoc]] StableDiffusionUpscalePipeline
-	- __call__
-	- enable_attention_slicing
-	- disable_attention_slicing
-	- enable_xformers_memory_efficient_attention
-	- disable_xformers_memory_efficient_attention
--- a/docs/source/en/api/pipelines/stable_diffusion/panorama.mdx
+++ b/docs/source/en/api/pipelines/stable_diffusion/panorama.mdx
@@ -0,0 +1,57 @@
+<!--Copyright 2023 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# MultiDiffusion: Fusing Diffusion Paths for Controlled Image Generation
+
+## Overview
+
+[MultiDiffusion: Fusing Diffusion Paths for Controlled Image Generation](https://arxiv.org/abs/2302.08113) by Omer Bar-Tal, Lior Yariv, Yaron Lipman, and Tali Dekel.
+
+The abstract of the paper is the following:
+
+*Recent advances in text-to-image generation with diffusion models present transformative capabilities in image quality. However, user controllability of the generated image, and fast adaptation to new tasks still remains an open challenge, currently mostly addressed by costly and long re-training and fine-tuning or ad-hoc adaptations to specific image generation tasks. In this work, we present MultiDiffusion, a unified framework that enables versatile and controllable image generation, using a pre-trained text-to-image diffusion model, without any further training or finetuning. At the center of our approach is a new generation process, based on an optimization task that binds together multiple diffusion generation processes with a shared set of parameters or constraints. We show that MultiDiffusion can be readily applied to generate high quality and diverse images that adhere to user-provided controls, such as desired aspect ratio (e.g., panorama), and spatial guiding signals, ranging from tight segmentation masks to bounding boxes.
+
+Resources:
+
+* [Project Page](https://multidiffusion.github.io/).
+* [Paper](https://arxiv.org/abs/2302.08113).
+* [Original Code](https://github.com/omerbt/MultiDiffusion).
+
+## Available Pipelines:
+
+| Pipeline | Tasks
+|---|---|
+| [StableDiffusionPanoramaPipeline](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_panorama.py) | *Text-Guided Panorama View Generation* |
+
+<!-- TODO: add Colab -->
+
+## Usage example
+
+```python
+import torch
+from diffusers import StableDiffusionPanoramaPipeline, DDIMScheduler
+
+model_ckpt = "stabilityai/stable-diffusion-2-base"
+scheduler = DDIMScheduler.from_pretrained(model_ckpt, subfolder="scheduler")
+pipe = StableDiffusionPanoramaPipeline.from_pretrained(model_ckpt, scheduler=scheduler, torch_dtype=torch.float16)
+
+pipe = pipe.to("cuda")
+
+prompt = "a photo of the dolomites"
+image = pipe(prompt).images[0]
+image.save("dolomites.png")
+```
+
+## StableDiffusionPanoramaPipeline
+[[autodoc]] StableDiffusionPanoramaPipeline
+	- __call__
+	- all
--- a/docs/source/en/api/pipelines/stable_diffusion/pix2pix.mdx
+++ b/docs/source/en/api/pipelines/stable_diffusion/pix2pix.mdx
@@ -0,0 +1,70 @@
+<!--Copyright 2023 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# InstructPix2Pix: Learning to Follow Image Editing Instructions
+
+## Overview
+
+[InstructPix2Pix: Learning to Follow Image Editing Instructions](https://arxiv.org/abs/2211.09800) by Tim Brooks, Aleksander Holynski and Alexei A. Efros.
+
+The abstract of the paper is the following:
+
+*We propose a method for editing images from human instructions: given an input image and a written instruction that tells the model what to do, our model follows these instructions to edit the image. To obtain training data for this problem, we combine the knowledge of two large pretrained models -- a language model (GPT-3) and a text-to-image model (Stable Diffusion) -- to generate a large dataset of image editing examples. Our conditional diffusion model, InstructPix2Pix, is trained on our generated data, and generalizes to real images and user-written instructions at inference time. Since it performs edits in the forward pass and does not require per example fine-tuning or inversion, our model edits images quickly, in a matter of seconds. We show compelling editing results for a diverse collection of input images and written instructions.*
+
+Resources:
+
+* [Project Page](https://www.timothybrooks.com/instruct-pix2pix).
+* [Paper](https://arxiv.org/abs/2211.09800).
+* [Original Code](https://github.com/timothybrooks/instruct-pix2pix).
+* [Demo](https://huggingface.co/spaces/timbrooks/instruct-pix2pix).
+
+
+## Available Pipelines:
+
+| Pipeline | Tasks | Demo
+|---|---|:---:|
+| [StableDiffusionInstructPix2PixPipeline](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py) | *Text-Based Image Editing* | [🤗 Space](https://huggingface.co/spaces/timbrooks/instruct-pix2pix) |
+
+<!-- TODO: add Colab -->
+
+## Usage example
+
+```python
+import PIL
+import requests
+import torch
+from diffusers import StableDiffusionInstructPix2PixPipeline
+
+model_id = "timbrooks/instruct-pix2pix"
+pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained(model_id, torch_dtype=torch.float16).to("cuda")
+
+url = "https://huggingface.co/datasets/diffusers/diffusers-images-docs/resolve/main/mountain.png"
+
+
+def download_image(url):
+    image = PIL.Image.open(requests.get(url, stream=True).raw)
+    image = PIL.ImageOps.exif_transpose(image)
+    image = image.convert("RGB")
+    return image
+
+
+image = download_image(url)
+
+prompt = "make the mountains snowy"
+edit = pipe(prompt, image=image, num_inference_steps=20, image_guidance_scale=1.5, guidance_scale=7).images[0]
+images[0].save("snowy_mountains.png")
+```
+
+## StableDiffusionInstructPix2PixPipeline
+[[autodoc]] StableDiffusionInstructPix2PixPipeline
+	- __call__
+	- all
--- a/docs/source/en/api/pipelines/stable_diffusion/pix2pix_zero.mdx
+++ b/docs/source/en/api/pipelines/stable_diffusion/pix2pix_zero.mdx
@@ -0,0 +1,289 @@
+<!--Copyright 2023 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Zero-shot Image-to-Image Translation
+
+## Overview
+
+[Zero-shot Image-to-Image Translation](https://arxiv.org/abs/2302.03027) by Gaurav Parmar, Krishna Kumar Singh, Richard Zhang, Yijun Li, Jingwan Lu, and Jun-Yan Zhu.
+
+The abstract of the paper is the following:
+
+*Large-scale text-to-image generative models have shown their remarkable ability to synthesize diverse and high-quality images. However, it is still challenging to directly apply these models for editing real images for two reasons. First, it is hard for users to come up with a perfect text prompt that accurately describes every visual detail in the input image. Second, while existing models can introduce desirable changes in certain regions, they often dramatically alter the input content and introduce unexpected changes in unwanted regions. In this work, we propose pix2pix-zero, an image-to-image translation method that can preserve the content of the original image without manual prompting. We first automatically discover editing directions that reflect desired edits in the text embedding space. To preserve the general content structure after editing, we further propose cross-attention guidance, which aims to retain the cross-attention maps of the input image throughout the diffusion process. In addition, our method does not need additional training for these edits and can directly use the existing pre-trained text-to-image diffusion model. We conduct extensive experiments and show that our method outperforms existing and concurrent works for both real and synthetic image editing.*
+
+Resources:
+
+* [Project Page](https://pix2pixzero.github.io/).
+* [Paper](https://arxiv.org/abs/2302.03027).
+* [Original Code](https://github.com/pix2pixzero/pix2pix-zero).
+
+## Tips 
+
+* The pipeline can be conditioned on real input images. Check out the code examples below to know more.
+* The pipeline exposes two arguments namely `source_embeds` and `target_embeds`
+that let you control the direction of the semantic edits in the final image to be generated. Let's say,
+you wanted to translate from "cat" to "dog". In this case, the edit direction will be "cat -> dog". To reflect
+this in the pipeline, you simply have to set the embeddings related to the phrases including "cat" to
+`source_embeds` and "dog" to `target_embeds`. Refer to the code example below for more details.
+* When you're using this pipeline from a prompt, specify the _source_ concept in the prompt. Taking
+the above example, a valid input prompt would be: "a high resolution painting of a **cat** in the style of van gough".
+* If you wanted to reverse the direction in the example above, i.e., "dog -> cat", then it's recommended to:
+    * Swap the `source_embeds` and `target_embeds`.
+    * Change the input prompt to include "dog".  
+* To learn more about how the source and target embeddings are generated, refer to the [original 
+paper](https://arxiv.org/abs/2302.03027). Below, we also provide some directions on how to generate the embeddings.
+
+## Available Pipelines:
+
+| Pipeline | Tasks | Demo
+|---|---|:---:|
+| [StableDiffusionPix2PixZeroPipeline](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py) | *Text-Based Image Editing* | [🤗 Space] (soon) |
+
+<!-- TODO: add Colab -->
+
+## Usage example
+
+### Based on an image generated with the input prompt
+
+```python
+import requests
+import torch
+
+from diffusers import DDIMScheduler, StableDiffusionPix2PixZeroPipeline
+
+
+def download(embedding_url, local_filepath):
+    r = requests.get(embedding_url)
+    with open(local_filepath, "wb") as f:
+        f.write(r.content)
+
+
+model_ckpt = "CompVis/stable-diffusion-v1-4"
+pipeline = StableDiffusionPix2PixZeroPipeline.from_pretrained(
+    model_ckpt, conditions_input_image=False, torch_dtype=torch.float16
+)
+pipeline.scheduler = DDIMScheduler.from_config(pipeline.scheduler.config)
+pipeline.to("cuda")
+
+prompt = "a high resolution painting of a cat in the style of van gough"
+src_embs_url = "https://github.com/pix2pixzero/pix2pix-zero/raw/main/assets/embeddings_sd_1.4/cat.pt"
+target_embs_url = "https://github.com/pix2pixzero/pix2pix-zero/raw/main/assets/embeddings_sd_1.4/dog.pt"
+
+for url in [src_embs_url, target_embs_url]:
+    download(url, url.split("/")[-1])
+
+src_embeds = torch.load(src_embs_url.split("/")[-1])
+target_embeds = torch.load(target_embs_url.split("/")[-1])
+
+images = pipeline(
+    prompt,
+    source_embeds=src_embeds,
+    target_embeds=target_embeds,
+    num_inference_steps=50,
+    cross_attention_guidance_amount=0.15,
+).images
+images[0].save("edited_image_dog.png")
+```
+
+### Based on an input image
+
+When the pipeline is conditioned on an input image, we first obtain an inverted
+noise from it using a `DDIMInverseScheduler` with the help of a generated caption. Then 
+the inverted noise is used to start the generation process. 
+
+First, let's load our pipeline: 
+
+```py
+import torch
+from transformers import BlipForConditionalGeneration, BlipProcessor
+from diffusers import DDIMScheduler, DDIMInverseScheduler, StableDiffusionPix2PixZeroPipeline
+
+captioner_id = "Salesforce/blip-image-captioning-base"
+processor = BlipProcessor.from_pretrained(captioner_id)
+model = BlipForConditionalGeneration.from_pretrained(captioner_id, torch_dtype=torch.float16, low_cpu_mem_usage=True)
+
+sd_model_ckpt = "CompVis/stable-diffusion-v1-4"
+pipeline = StableDiffusionPix2PixZeroPipeline.from_pretrained(
+    sd_model_ckpt,
+    caption_generator=model,
+    caption_processor=processor,
+    torch_dtype=torch.float16,
+    safety_checker=None,
+)
+pipeline.scheduler = DDIMScheduler.from_config(pipeline.scheduler.config)
+pipeline.inverse_scheduler = DDIMInverseScheduler.from_config(pipeline.scheduler.config)
+pipeline.enable_model_cpu_offload()
+```
+
+Then, we load an input image for conditioning and obtain a suitable caption for it: 
+
+```py
+import requests
+from PIL import Image
+
+img_url = "https://github.com/pix2pixzero/pix2pix-zero/raw/main/assets/test_images/cats/cat_6.png"
+raw_image = Image.open(requests.get(img_url, stream=True).raw).convert("RGB").resize((512, 512))
+caption = pipeline.generate_caption(raw_image)
+```
+
+Then we employ the generated caption and the input image to get the inverted noise: 
+
+```py 
+generator = torch.manual_seed(0)
+inv_latents = pipeline.invert(caption, image=raw_image, generator=generator).latents
+```
+
+Now, generate the image with edit directions: 
+
+```py
+# See the "Generating source and target embeddings" section below to
+# automate the generation of these captions with a pre-trained model like Flan-T5 as explained below.
+source_prompts = ["a cat sitting on the street", "a cat playing in the field", "a face of a cat"]
+target_prompts = ["a dog sitting on the street", "a dog playing in the field", "a face of a dog"]
+
+source_embeds = pipeline.get_embeds(source_prompts, batch_size=2)
+target_embeds = pipeline.get_embeds(target_prompts, batch_size=2)
+
+
+image = pipeline(
+    caption,
+    source_embeds=source_embeds,
+    target_embeds=target_embeds,
+    num_inference_steps=50,
+    cross_attention_guidance_amount=0.15,
+    generator=generator,
+    latents=inv_latents,
+    negative_prompt=caption,
+).images[0]
+image.save("edited_image.png")
+```
+
+## Generating source and target embeddings 
+
+The authors originally used the [GPT-3 API](https://openai.com/api/) to generate the source and target captions for discovering
+edit directions. However, we can also leverage open source and public models for the same purpose.
+Below, we provide an end-to-end example with the [Flan-T5](https://huggingface.co/docs/transformers/model_doc/flan-t5) model
+for generating captions and [CLIP](https://huggingface.co/docs/transformers/model_doc/clip) for
+computing embeddings on the generated captions.  
+
+**1. Load the generation model**:
+
+```py
+import torch
+from transformers import AutoTokenizer, T5ForConditionalGeneration
+
+tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-xl")
+model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-xl", device_map="auto", torch_dtype=torch.float16)
+```
+
+**2. Construct a starting prompt**: 
+
+```py
+source_concept = "cat"
+target_concept = "dog"
+
+source_text = f"Provide a caption for images containing a {source_concept}. "
+"The captions should be in English and should be no longer than 150 characters."
+
+target_text = f"Provide a caption for images containing a {target_concept}. "
+"The captions should be in English and should be no longer than 150 characters."
+```
+
+Here, we're interested in the "cat -> dog" direction. 
+
+**3. Generate captions**:
+
+We can use a utility like so for this purpose. 
+
+```py
+def generate_captions(input_prompt):
+    input_ids = tokenizer(input_prompt, return_tensors="pt").input_ids.to("cuda")
+
+    outputs = model.generate(
+        input_ids, temperature=0.8, num_return_sequences=16, do_sample=True, max_new_tokens=128, top_k=10
+    )
+    return tokenizer.batch_decode(outputs, skip_special_tokens=True)
+```
+
+And then we just call it to generate our captions:
+
+```py
+source_captions = generate_captions(source_text)
+target_captions = generate_captions(target_concept)
+```
+
+We encourage you to play around with the different parameters supported by the
+`generate()` method ([documentation](https://huggingface.co/docs/transformers/main/en/main_classes/text_generation#transformers.generation_tf_utils.TFGenerationMixin.generate)) for the generation quality you are looking for.
+
+**4. Load the embedding model**: 
+
+Here, we need to use the same text encoder model used by the subsequent Stable Diffusion model.
+
+```py 
+from diffusers import StableDiffusionPix2PixZeroPipeline 
+
+pipeline = StableDiffusionPix2PixZeroPipeline.from_pretrained(
+    "CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16
+)
+pipeline = pipeline.to("cuda")
+tokenizer = pipeline.tokenizer
+text_encoder = pipeline.text_encoder
+```
+
+**5. Compute embeddings**:
+
+```py 
+import torch 
+
+def embed_captions(sentences, tokenizer, text_encoder, device="cuda"):
+    with torch.no_grad():
+        embeddings = []
+        for sent in sentences:
+            text_inputs = tokenizer(
+                sent,
+                padding="max_length",
+                max_length=tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            text_input_ids = text_inputs.input_ids
+            prompt_embeds = text_encoder(text_input_ids.to(device), attention_mask=None)[0]
+            embeddings.append(prompt_embeds)
+    return torch.concatenate(embeddings, dim=0).mean(dim=0).unsqueeze(0)
+
+source_embeddings = embed_captions(source_captions, tokenizer, text_encoder)
+target_embeddings = embed_captions(target_captions, tokenizer, text_encoder)
+```
+
+And you're done! [Here](https://colab.research.google.com/drive/1tz2C1EdfZYAPlzXXbTnf-5PRBiR8_R1F?usp=sharing) is a Colab Notebook that you can use to interact with the entire process.
+
+Now, you can use these embeddings directly while calling the pipeline: 
+
+```py
+from diffusers import DDIMScheduler
+
+pipeline.scheduler = DDIMScheduler.from_config(pipeline.scheduler.config)
+
+images = pipeline(
+    prompt,
+    source_embeds=source_embeddings,
+    target_embeds=target_embeddings,
+    num_inference_steps=50,
+    cross_attention_guidance_amount=0.15,
+).images
+images[0].save("edited_image_dog.png")
+```
+
+## StableDiffusionPix2PixZeroPipeline
+[[autodoc]] StableDiffusionPix2PixZeroPipeline
+	- __call__
+	- all
--- a/docs/source/en/api/pipelines/stable_diffusion/self_attention_guidance.mdx
+++ b/docs/source/en/api/pipelines/stable_diffusion/self_attention_guidance.mdx
@@ -0,0 +1,64 @@
+<!--Copyright 2023 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Self-Attention Guidance (SAG)
+
+## Overview
+
+[Self-Attention Guidance](https://arxiv.org/abs/2210.00939) by Susung Hong et al.
+
+The abstract of the paper is the following:
+
+*Denoising diffusion models (DDMs) have been drawing much attention for their appreciable sample quality and diversity. Despite their remarkable performance, DDMs remain black boxes on which further study is necessary to take a profound step. Motivated by this, we delve into the design of conventional U-shaped diffusion models. More specifically, we investigate the self-attention modules within these models through carefully designed experiments and explore their characteristics. In addition, inspired by the studies that substantiate the effectiveness of the guidance schemes, we present plug-and-play diffusion guidance, namely Self-Attention Guidance (SAG), that can drastically boost the performance of existing diffusion models. Our method, SAG, extracts the intermediate attention map from a diffusion model at every iteration and selects tokens above a certain attention score for masking and blurring to obtain a partially blurred input. Subsequently, we measure the dissimilarity between the predicted noises obtained from feeding the blurred and original input to the diffusion model and leverage it as guidance. With this guidance, we observe apparent improvements in a wide range of diffusion models, e.g., ADM, IDDPM, and Stable Diffusion, and show that the results further improve by combining our method with the conventional guidance scheme. We provide extensive ablation studies to verify our choices.*
+
+Resources:
+
+* [Project Page](https://ku-cvlab.github.io/Self-Attention-Guidance).
+* [Paper](https://arxiv.org/abs/2210.00939).
+* [Original Code](https://github.com/KU-CVLAB/Self-Attention-Guidance).
+* [Demo](https://colab.research.google.com/github/SusungHong/Self-Attention-Guidance/blob/main/SAG_Stable.ipynb).
+
+
+## Available Pipelines:
+
+| Pipeline | Tasks | Demo
+|---|---|:---:|
+| [StableDiffusionSAGPipeline](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_sag.py) | *Text-to-Image Generation* | [Colab](https://colab.research.google.com/github/SusungHong/Self-Attention-Guidance/blob/main/SAG_Stable.ipynb) |
+
+## Usage example
+
+```python
+import torch
+from diffusers import StableDiffusionSAGPipeline
+from accelerate.utils import set_seed
+
+pipe = StableDiffusionSAGPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16)
+pipe = pipe.to("cuda")
+
+seed = 8978
+prompt = "."
+guidance_scale = 7.5
+num_images_per_prompt = 1
+
+sag_scale = 1.0
+
+set_seed(seed)
+images = pipe(
+    prompt, num_images_per_prompt=num_images_per_prompt, guidance_scale=guidance_scale, sag_scale=sag_scale
+).images
+images[0].save("example.png")
+```
+
+## StableDiffusionSAGPipeline
+[[autodoc]] StableDiffusionSAGPipeline
+	- __call__
+	- all
--- a/docs/source/en/api/pipelines/stable_diffusion/text2img.mdx
+++ b/docs/source/en/api/pipelines/stable_diffusion/text2img.mdx
@@ -0,0 +1,39 @@
+<!--Copyright 2022 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Text-to-Image Generation
+
+## StableDiffusionPipeline
+
+The Stable Diffusion model was created by the researchers and engineers from [CompVis](https://github.com/CompVis), [Stability AI](https://stability.ai/), [runway](https://github.com/runwayml), and [LAION](https://laion.ai/). The [`StableDiffusionPipeline`] is capable of generating photo-realistic images given any text input using Stable Diffusion.
+
+The original codebase can be found here: 
+- *Stable Diffusion V1*: [CompVis/stable-diffusion](https://github.com/CompVis/stable-diffusion)
+- *Stable Diffusion v2*: [Stability-AI/stablediffusion](https://github.com/Stability-AI/stablediffusion)
+
+Available Checkpoints are:
+- *stable-diffusion-v1-4 (512x512 resolution)* [CompVis/stable-diffusion-v1-4](https://huggingface.co/CompVis/stable-diffusion-v1-4)
+- *stable-diffusion-v1-5 (512x512 resolution)* [runwayml/stable-diffusion-v1-5](https://huggingface.co/runwayml/stable-diffusion-v1-5)
+- *stable-diffusion-2-base (512x512 resolution)*: [stabilityai/stable-diffusion-2-base](https://huggingface.co/stabilityai/stable-diffusion-2-base)
+- *stable-diffusion-2 (768x768 resolution)*: [stabilityai/stable-diffusion-2](https://huggingface.co/stabilityai/stable-diffusion-2) 
+- *stable-diffusion-2-1-base (512x512 resolution)* [stabilityai/stable-diffusion-2-1-base](https://huggingface.co/stabilityai/stable-diffusion-2-1-base)
+- *stable-diffusion-2-1 (768x768 resolution)*: [stabilityai/stable-diffusion-2-1](https://huggingface.co/stabilityai/stable-diffusion-2-1)
+
+[[autodoc]] StableDiffusionPipeline
+	- all
+	- __call__
+	- enable_attention_slicing
+	- disable_attention_slicing
+	- enable_vae_slicing
+	- disable_vae_slicing
+	- enable_xformers_memory_efficient_attention
+	- disable_xformers_memory_efficient_attention
--- a/docs/source/en/api/pipelines/stable_diffusion/upscale.mdx
+++ b/docs/source/en/api/pipelines/stable_diffusion/upscale.mdx
@@ -0,0 +1,32 @@
+<!--Copyright 2022 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Super-Resolution
+
+## StableDiffusionUpscalePipeline
+
+The upscaler diffusion model was created by the researchers and engineers from [CompVis](https://github.com/CompVis), [Stability AI](https://stability.ai/), and [LAION](https://laion.ai/), as part of Stable Diffusion 2.0. [`StableDiffusionUpscalePipeline`] can be used to enhance the resolution of input images by a factor of 4.
+
+The original codebase can be found here: 
+- *Stable Diffusion v2*: [Stability-AI/stablediffusion](https://github.com/Stability-AI/stablediffusion#image-upscaling-with-stable-diffusion)
+
+Available Checkpoints are:
+- *stabilityai/stable-diffusion-x4-upscaler (x4 resolution resolution)*: [stable-diffusion-x4-upscaler](https://huggingface.co/stabilityai/stable-diffusion-x4-upscaler)
+
+
+[[autodoc]] StableDiffusionUpscalePipeline
+	- all
+	- __call__
+	- enable_attention_slicing
+	- disable_attention_slicing
+	- enable_xformers_memory_efficient_attention
+	- disable_xformers_memory_efficient_attention
--- a/docs/source/en/api/pipelines/stable_diffusion_2.mdx
+++ b/docs/source/en/api/pipelines/stable_diffusion_2.mdx
@@ -24,17 +24,20 @@ For more details about how Stable Diffusion 2 works and how it differs from Stab

 ### Available checkpoints:

-Note that the architecture is more or less identical to [Stable Diffusion 1](./api/pipelines/stable_diffusion) so please refer to [this page](./api/pipelines/stable_diffusion) for API documentation.
+Note that the architecture is more or less identical to [Stable Diffusion 1](./stable_diffusion/overview) so please refer to [this page](./stable_diffusion/overview) for API documentation.

 - *Text-to-Image (512x512 resolution)*: [stabilityai/stable-diffusion-2-base](https://huggingface.co/stabilityai/stable-diffusion-2-base) with [`StableDiffusionPipeline`]
 - *Text-to-Image (768x768 resolution)*: [stabilityai/stable-diffusion-2](https://huggingface.co/stabilityai/stable-diffusion-2) with [`StableDiffusionPipeline`]
 - *Image Inpainting (512x512 resolution)*: [stabilityai/stable-diffusion-2-inpainting](https://huggingface.co/stabilityai/stable-diffusion-2-inpainting) with [`StableDiffusionInpaintPipeline`]
- *Image Upscaling (x4 resolution resolution)*: [stable-diffusion-x4-upscaler](https://huggingface.co/stabilityai/stable-diffusion-x4-upscaler) [`StableDiffusionUpscalePipeline`]
+- *Super-Resolution (x4 resolution resolution)*: [stable-diffusion-x4-upscaler](https://huggingface.co/stabilityai/stable-diffusion-x4-upscaler) [`StableDiffusionUpscalePipeline`]
 - *Depth-to-Image (512x512 resolution)*: [stabilityai/stable-diffusion-2-depth](https://huggingface.co/stabilityai/stable-diffusion-2-depth) with [`StableDiffusionDepth2ImagePipeline`]

 We recommend using the [`DPMSolverMultistepScheduler`] as it's currently the fastest scheduler there is.

- *Text-to-Image (512x512 resolution)*:
+
+### Text-to-Image
+
+- *Text-to-Image (512x512 resolution)*: [stabilityai/stable-diffusion-2-base](https://huggingface.co/stabilityai/stable-diffusion-2-base) with [`StableDiffusionPipeline`]

 ```python
 from diffusers import DiffusionPipeline, DPMSolverMultistepScheduler
@@ -51,7 +54,7 @@ image = pipe(prompt, num_inference_steps=25).images[0]
 image.save("astronaut.png")
 ```

- *Text-to-Image (768x768 resolution)*:
+- *Text-to-Image (768x768 resolution)*: [stabilityai/stable-diffusion-2](https://huggingface.co/stabilityai/stable-diffusion-2) with [`StableDiffusionPipeline`]

 ```python
 from diffusers import DiffusionPipeline, DPMSolverMultistepScheduler
@@ -68,7 +71,9 @@ image = pipe(prompt, guidance_scale=9, num_inference_steps=25).images[0]
 image.save("astronaut.png")
 ```

- *Image Inpainting (512x512 resolution)*:
+### Image Inpainting
+
+- *Image Inpainting (512x512 resolution)*: [stabilityai/stable-diffusion-2-inpainting](https://huggingface.co/stabilityai/stable-diffusion-2-inpainting) with [`StableDiffusionInpaintPipeline`]

 ```python
 import PIL
@@ -102,7 +107,10 @@ image = pipe(prompt=prompt, image=init_image, mask_image=mask_image, num_inferen
 image.save("yellow_cat.png")
 ```

- *Image Upscaling (x4 resolution resolution)*: [stable-diffusion-x4-upscaler](https://huggingface.co/stabilityai/stable-diffusion-x4-upscaler) [`StableDiffusionUpscalePipeline`]
+### Super-Resolution
+
+- *Image Upscaling (x4 resolution resolution)*: [stable-diffusion-x4-upscaler](https://huggingface.co/stabilityai/stable-diffusion-x4-upscaler) with [`StableDiffusionUpscalePipeline`]
+

 ```python
 import requests
@@ -126,16 +134,10 @@ upscaled_image = pipeline(prompt=prompt, image=low_res_img).images[0]
 upscaled_image.save("upsampled_cat.png")
 ```

+### Depth-to-Image
+
 - *Depth-Guided Text-to-Image*: [stabilityai/stable-diffusion-2-depth](https://huggingface.co/stabilityai/stable-diffusion-2-depth) [`StableDiffusionDepth2ImagePipeline`]

-**Installation**
-
-```bash
-!pip install -U git+https://github.com/huggingface/transformers.git
-!pip install diffusers[torch]
-```
-
-**Example**

 ```python
 import torch
--- a/docs/source/en/api/pipelines/stable_diffusion_safe.mdx
+++ b/docs/source/en/api/pipelines/stable_diffusion_safe.mdx
@@ -24,11 +24,11 @@ The abstract of the paper is the following:

 | Pipeline | Tasks | Colab | Demo
 |---|---|:---:|:---:|
-| [pipeline_stable_diffusion_safe.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py) | *Text-to-Image Generation* |  [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ml-research/safe-latent-diffusion/blob/main/examples/Safe%20Latent%20Diffusion.ipynb) | -
+| [pipeline_stable_diffusion_safe.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py) | *Text-to-Image Generation* |  [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ml-research/safe-latent-diffusion/blob/main/examples/Safe%20Latent%20Diffusion.ipynb) | [![Huggingface Spaces](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue)](https://huggingface.co/spaces/AIML-TUDA/unsafe-vs-safe-stable-diffusion)

 ## Tips

- Safe Stable Diffusion may also be used with weights of [Stable Diffusion](./api/pipelines/stable_diffusion).
+- Safe Stable Diffusion may also be used with weights of [Stable Diffusion](./api/pipelines/stable_diffusion/text2img).

 ### Run Safe Stable Diffusion

@@ -58,7 +58,7 @@ You may use the 4 configurations defined in the [Safe Latent Diffusion paper](ht
 >>> out = pipeline(prompt=prompt, **SafetyConfig.MAX)
 ```

-The following configurations are available: `SafetyConfig.WEAK`, `SafetyConfig.MEDIUM`, `SafetyConfig.STRONg`, and `SafetyConfig.MAX`.
+The following configurations are available: `SafetyConfig.WEAK`, `SafetyConfig.MEDIUM`, `SafetyConfig.STRONG`, and `SafetyConfig.MAX`.

 ### How to load and use different schedulers.

@@ -81,10 +81,10 @@ To use a different scheduler, you can either change it via the [`ConfigMixin.fro

 ## StableDiffusionSafePipelineOutput
 [[autodoc]] pipelines.stable_diffusion_safe.StableDiffusionSafePipelineOutput
+	- all
+	- __call__

 ## StableDiffusionPipelineSafe
 [[autodoc]] StableDiffusionPipelineSafe
+	- all
 	- __call__
-	- enable_attention_slicing
-	- disable_attention_slicing
-
--- a/docs/source/en/api/pipelines/stable_unclip.mdx
+++ b/docs/source/en/api/pipelines/stable_unclip.mdx
@@ -0,0 +1,97 @@
+<!--Copyright 2022 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Stable unCLIP
+
+Stable unCLIP checkpoints are finetuned from [stable diffusion 2.1](./stable_diffusion_2) checkpoints to condition on CLIP image embeddings.
+Stable unCLIP also still conditions on text embeddings. Given the two separate conditionings, stable unCLIP can be used
+for text guided image variation. When combined with an unCLIP prior, it can also be used for full text to image generation.
+
+## Tips
+
+Stable unCLIP takes a `noise_level` as input during inference. `noise_level` determines how much noise is added 
+to the image embeddings. A higher `noise_level` increases variation in the final un-noised images. By default, 
+we do not add any additional noise to the image embeddings i.e. `noise_level = 0`.
+
+### Available checkpoints:
+
+TODO
+
+### Text-to-Image Generation
+
+```python
+import torch
+from diffusers import StableUnCLIPPipeline
+
+pipe = StableUnCLIPPipeline.from_pretrained(
+    "fusing/stable-unclip-2-1-l", torch_dtype=torch.float16
+)  # TODO update model path
+pipe = pipe.to("cuda")
+
+prompt = "a photo of an astronaut riding a horse on mars"
+images = pipe(prompt).images
+images[0].save("astronaut_horse.png")
+```
+
+
+### Text guided Image-to-Image Variation
+
+```python
+import requests
+import torch
+from PIL import Image
+from io import BytesIO
+
+from diffusers import StableUnCLIPImg2ImgPipeline
+
+pipe = StableUnCLIPImg2ImgPipeline.from_pretrained(
+    "fusing/stable-unclip-2-1-l-img2img", torch_dtype=torch.float16
+)  # TODO update model path
+pipe = pipe.to("cuda")
+
+url = "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/assets/stable-samples/img2img/sketch-mountains-input.jpg"
+
+response = requests.get(url)
+init_image = Image.open(BytesIO(response.content)).convert("RGB")
+init_image = init_image.resize((768, 512))
+
+prompt = "A fantasy landscape, trending on artstation"
+
+images = pipe(prompt, init_image).images
+images[0].save("fantasy_landscape.png")
+```
+
+### StableUnCLIPPipeline
+
+[[autodoc]] StableUnCLIPPipeline
+	- all
+	- __call__
+	- enable_attention_slicing
+	- disable_attention_slicing
+	- enable_vae_slicing
+	- disable_vae_slicing
+	- enable_xformers_memory_efficient_attention
+	- disable_xformers_memory_efficient_attention
+
+
+### StableUnCLIPImg2ImgPipeline
+
+[[autodoc]] StableUnCLIPImg2ImgPipeline
+	- all
+	- __call__
+	- enable_attention_slicing
+	- disable_attention_slicing
+	- enable_vae_slicing
+	- disable_vae_slicing
+	- enable_xformers_memory_efficient_attention
+	- disable_xformers_memory_efficient_attention
+    
--- a/docs/source/en/api/pipelines/stochastic_karras_ve.mdx
+++ b/docs/source/en/api/pipelines/stochastic_karras_ve.mdx
@@ -32,4 +32,5 @@ This pipeline implements the Stochastic sampling tailored to the Variance-Expand

 ## KarrasVePipeline
 [[autodoc]] KarrasVePipeline
-    - __call__
+	- all
+	- __call__
--- a/docs/source/en/api/pipelines/unclip.mdx
+++ b/docs/source/en/api/pipelines/unclip.mdx
@@ -24,8 +24,14 @@ The unCLIP model in diffusers comes from kakaobrain's karlo and the original cod
 | Pipeline | Tasks | Colab
 |---|---|:---:|
 | [pipeline_unclip.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/unclip/pipeline_unclip.py) | *Text-to-Image Generation* | - |
+| [pipeline_unclip_image_variation.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/unclip/pipeline_unclip_image_variation.py) | *Image-Guided Image Generation* | - |


 ## UnCLIPPipeline
-[[autodoc]] pipelines.unclip.pipeline_unclip.UnCLIPPipeline
-    - __call__
+[[autodoc]] UnCLIPPipeline
+	- all
+	- __call__
+
+[[autodoc]] UnCLIPImageVariationPipeline
+	- all
+	- __call__
--- a/docs/source/en/api/pipelines/versatile_diffusion.mdx
+++ b/docs/source/en/api/pipelines/versatile_diffusion.mdx
@@ -20,7 +20,7 @@ The abstract of the paper is the following:

 ## Tips

- VersatileDiffusion is conceptually very similar as [Stable Diffusion](./api/pipelines/stable_diffusion),  but instead of providing just a image data stream conditioned on text, VersatileDiffusion provides both a image and text data stream and can be conditioned on both text and image.
+- VersatileDiffusion is conceptually very similar as [Stable Diffusion](./api/pipelines/stable_diffusion/overview),  but instead of providing just a image data stream conditioned on text, VersatileDiffusion provides both a image and text data stream and can be conditioned on both text and image.

 ### *Run VersatileDiffusion*

@@ -56,18 +56,15 @@ To use a different scheduler, you can either change it via the [`ConfigMixin.fro

 ## VersatileDiffusionTextToImagePipeline
 [[autodoc]] VersatileDiffusionTextToImagePipeline
+	- all
 	- __call__
-	- enable_attention_slicing
-	- disable_attention_slicing

 ## VersatileDiffusionImageVariationPipeline
 [[autodoc]] VersatileDiffusionImageVariationPipeline
+	- all
 	- __call__
-	- enable_attention_slicing
-	- disable_attention_slicing

 ## VersatileDiffusionDualGuidedPipeline
 [[autodoc]] VersatileDiffusionDualGuidedPipeline
+	- all
 	- __call__
-	- enable_attention_slicing
-	- disable_attention_slicing
--- a/docs/source/en/api/pipelines/vq_diffusion.mdx
+++ b/docs/source/en/api/pipelines/vq_diffusion.mdx
@@ -30,5 +30,6 @@ The original codebase can be found [here](https://github.com/microsoft/VQ-Diffus


 ## VQDiffusionPipeline
-[[autodoc]] pipelines.vq_diffusion.pipeline_vq_diffusion.VQDiffusionPipeline
-    - __call__
+[[autodoc]] VQDiffusionPipeline
+	- all
+	- __call__
--- a/docs/source/en/api/schedulers/ddim.mdx
+++ b/docs/source/en/api/schedulers/ddim.mdx
--- a/docs/source/en/api/schedulers/ddim_inverse.mdx
+++ b/docs/source/en/api/schedulers/ddim_inverse.mdx
@@ -0,0 +1,21 @@
+<!--Copyright 2023 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Inverse Denoising Diffusion Implicit Models (DDIMInverse)
+
+## Overview
+
+This scheduler is the inverted scheduler of [Denoising Diffusion Implicit Models](https://arxiv.org/abs/2010.02502) (DDIM) by Jiaming Song, Chenlin Meng and Stefano Ermon.
+The implementation is mostly based on the DDIM inversion definition of [Null-text Inversion for Editing Real Images using Guided Diffusion Models](https://arxiv.org/pdf/2211.09794.pdf)
+
+## DDIMInverseScheduler
+[[autodoc]] DDIMInverseScheduler
--- a/docs/source/en/api/schedulers/ddpm.mdx
+++ b/docs/source/en/api/schedulers/ddpm.mdx
--- a/docs/source/en/api/schedulers/deis.mdx
+++ b/docs/source/en/api/schedulers/deis.mdx
@@ -0,0 +1,22 @@
+<!--Copyright 2022 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# DEIS
+
+Fast Sampling of Diffusion Models with Exponential Integrator.
+
+## Overview
+
+Original paper can be found [here](https://arxiv.org/abs/2204.13902). The original implementation can be found [here](https://github.com/qsh-zh/deis).
+
+## DEISMultistepScheduler
+[[autodoc]] DEISMultistepScheduler
--- a/docs/source/en/api/schedulers/dpm_discrete.mdx
+++ b/docs/source/en/api/schedulers/dpm_discrete.mdx
--- a/docs/source/en/api/schedulers/dpm_discrete_ancestral.mdx
+++ b/docs/source/en/api/schedulers/dpm_discrete_ancestral.mdx
--- a/docs/source/en/api/schedulers/euler.mdx
+++ b/docs/source/en/api/schedulers/euler.mdx
--- a/docs/source/en/api/schedulers/euler_ancestral.mdx
+++ b/docs/source/en/api/schedulers/euler_ancestral.mdx
--- a/docs/source/en/api/schedulers/heun.mdx
+++ b/docs/source/en/api/schedulers/heun.mdx
--- a/docs/source/en/api/schedulers/ipndm.mdx
+++ b/docs/source/en/api/schedulers/ipndm.mdx
--- a/docs/source/en/api/schedulers/lms_discrete.mdx
+++ b/docs/source/en/api/schedulers/lms_discrete.mdx
--- a/docs/source/en/api/schedulers/multistep_dpm_solver.mdx
+++ b/docs/source/en/api/schedulers/multistep_dpm_solver.mdx
--- a/docs/source/en/api/schedulers/overview.mdx
+++ b/docs/source/en/api/schedulers/overview.mdx
@@ -37,16 +37,18 @@ To this end, the design of schedulers is such that:

 - Schedulers can be used interchangeably between diffusion models in inference to find the preferred trade-off between speed and generation quality.
 - Schedulers are currently by default in PyTorch, but are designed to be framework independent (partial Jax support currently exists).
+- Many diffusion pipelines, such as [`StableDiffusionPipeline`] and [`DiTPipeline`] can use any of [`KarrasDiffusionSchedulers`]

 ## Schedulers Summary

 The following table summarizes all officially supported schedulers, their corresponding paper

-
 | Scheduler | Paper |
 |---|---|
 | [ddim](./ddim) | [**Denoising Diffusion Implicit Models**](https://arxiv.org/abs/2010.02502) |
+| [ddim_inverse](./ddim_inverse) | [**Denoising Diffusion Implicit Models**](https://arxiv.org/abs/2010.02502) |
 | [ddpm](./ddpm) | [**Denoising Diffusion Probabilistic Models**](https://arxiv.org/abs/2006.11239) |
+| [deis](./deis) | [**DEISMultistepScheduler**](https://arxiv.org/abs/2204.13902) |
 | [singlestep_dpm_solver](./singlestep_dpm_solver) | [**Singlestep DPM-Solver**](https://arxiv.org/abs/2206.00927) |
 | [multistep_dpm_solver](./multistep_dpm_solver) | [**Multistep DPM-Solver**](https://arxiv.org/abs/2206.00927) |
 | [heun](./heun) | [**Heun scheduler inspired by Karras et. al paper**](https://arxiv.org/abs/2206.00364) |
@@ -61,6 +63,7 @@ The following table summarizes all officially supported schedulers, their corres
 | [euler](./euler) | [**Euler scheduler**](https://arxiv.org/abs/2206.00364) |
 | [euler_ancestral](./euler_ancestral) | [**Euler Ancestral scheduler**](https://github.com/crowsonkb/k-diffusion/blob/481677d114f6ea445aa009cf5bd7a9cdee909e47/k_diffusion/sampling.py#L72) |
 | [vq_diffusion](./vq_diffusion) | [**VQDiffusionScheduler**](https://arxiv.org/abs/2111.14822) |
+| [unipc](./unipc) | [**UniPCMultistepScheduler**](https://arxiv.org/abs/2302.04867) |
 | [repaint](./repaint) | [**RePaint scheduler**](https://arxiv.org/abs/2201.09865) |

 ## API
@@ -80,4 +83,10 @@ The class [`SchedulerOutput`] contains the outputs from any schedulers `step(...

 [[autodoc]] schedulers.scheduling_utils.SchedulerOutput

+### KarrasDiffusionSchedulers

+`KarrasDiffusionSchedulers` encompasses the main generalization of schedulers in Diffusers. The schedulers in this class are distinguished, at a high level, by their noise sampling strategy; the type of network and scaling; and finally the training strategy or how the loss is weighed.
+
+The different schedulers, depending on the type of ODE solver, fall into the above taxonomy and provide a good abstraction for the design of the main schedulers implemented in Diffusers. The schedulers in this class are given below:
+
+[[autodoc]] schedulers.scheduling_utils.KarrasDiffusionSchedulers
--- a/docs/source/en/api/schedulers/pndm.mdx
+++ b/docs/source/en/api/schedulers/pndm.mdx
--- a/docs/source/en/api/schedulers/repaint.mdx
+++ b/docs/source/en/api/schedulers/repaint.mdx
--- a/docs/source/en/api/schedulers/score_sde_ve.mdx
+++ b/docs/source/en/api/schedulers/score_sde_ve.mdx
--- a/docs/source/en/api/schedulers/score_sde_vp.mdx
+++ b/docs/source/en/api/schedulers/score_sde_vp.mdx
--- a/docs/source/en/api/schedulers/singlestep_dpm_solver.mdx
+++ b/docs/source/en/api/schedulers/singlestep_dpm_solver.mdx
--- a/docs/source/en/api/schedulers/stochastic_karras_ve.mdx
+++ b/docs/source/en/api/schedulers/stochastic_karras_ve.mdx
--- a/docs/source/en/api/schedulers/unipc.mdx
+++ b/docs/source/en/api/schedulers/unipc.mdx
@@ -0,0 +1,24 @@
+<!--Copyright 2022 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# UniPC
+
+## Overview
+
+UniPC is a training-free framework designed for the fast sampling of diffusion models, which consists of a corrector (UniC) and a predictor (UniP) that share a unified analytical form and support arbitrary orders.
+
+For more details about the method, please refer to the [[paper]](https://arxiv.org/abs/2302.04867) and the [[code]](https://github.com/wl-zhao/UniPC).
+
+Fast Sampling of Diffusion Models with Exponential Integrator.
+
+## UniPCMultistepScheduler
+[[autodoc]] UniPCMultistepScheduler
--- a/docs/source/en/api/schedulers/vq_diffusion.mdx
+++ b/docs/source/en/api/schedulers/vq_diffusion.mdx
--- a/docs/source/en/conceptual/contribution.mdx
+++ b/docs/source/en/conceptual/contribution.mdx
@@ -177,7 +177,7 @@ Follow these steps to start contributing ([supported Python versions](https://gi
   $ make style
   ```

-   🧨 Diffusers also uses `flake8` and a few custom scripts to check for coding mistakes. Quality
+   🧨 Diffusers also uses `ruff` and a few custom scripts to check for coding mistakes. Quality
   control runs in CI, however you can also run the same checks with:

   ```bash
--- a/docs/source/en/conceptual/ethical_guidelines.mdx
+++ b/docs/source/en/conceptual/ethical_guidelines.mdx
@@ -0,0 +1,49 @@
+# 🧨 Diffusers’ Ethical Guidelines
+
+## Preamble
+
+[Diffusers](https://huggingface.co/docs/diffusers/index) provides pre-trained diffusion models and serves as a modular toolbox for inference and training. 
+
+Given its real case applications in the world and potential negative impacts on society, we think it is important to provide the project with ethical guidelines to guide the development, users’ contributions, and usage of the Diffusers library.
+
+The risks associated with using this technology are still being examined, but to name a few: copyrights issues for artists; deep-fake exploitation; sexual content generation in inappropriate contexts; non-consensual impersonation; harmful social biases perpetuating the oppression of marginalized groups.
+We will keep tracking risks and adapt the following guidelines based on the community's responsiveness and valuable feedback.
+
+
+## Scope
+
+The Diffusers community will apply the following ethical guidelines to the project’s development and help coordinate how the community will integrate the contributions, especially concerning sensitive topics related to ethical concerns.
+
+
+## Ethical guidelines
+
+The following ethical guidelines apply generally, but we will primarily implement them when dealing with ethically sensitive issues while making a technical choice. Furthermore, we commit to adapting those ethical principles over time following emerging harms related to the state of the art of the technology in question.
+
+- **Transparency**: we are committed to being transparent in managing PRs, explaining our choices to users, and making technical decisions.
+
+- **Consistency**: we are committed to guaranteeing our users the same level of attention in project management, keeping it technically stable and consistent.
+
+- **Simplicity**: with a desire to make it easy to use and exploit the Diffusers library, we are committed to keeping the project’s goals lean and coherent.
+
+- **Accessibility**: the Diffusers project helps lower the entry bar for contributors who can help run it even without technical expertise. Doing so makes research artifacts more accessible to the community.
+
+- **Reproducibility**: we aim to be transparent about the reproducibility of upstream code, models, and datasets when made available through the Diffusers library.
+
+- **Responsibility**: as a community and through teamwork, we hold a collective responsibility to our users by anticipating and mitigating this technology's potential risks and dangers.
+
+
+## Examples of implementations: Safety features and Mechanisms
+
+The team works daily to make the technical and non-technical tools available to deal with the potential ethical and social risks associated with diffusion technology. Moreover, the community's input is invaluable in ensuring these features' implementation and raising awareness with us. 
+
+- [**Community tab**](https://huggingface.co/docs/hub/repositories-pull-requests-discussions): it enables the community to discuss and better collaborate on a project.
+
+- **Bias exploration and evaluation**: the Hugging Face team provides a [space](https://huggingface.co/spaces/society-ethics/DiffusionBiasExplorer) to demonstrate the biases in Stable Diffusion interactively. In this sense, we support and encourage bias explorers and evaluations.
+
+- **Encouraging safety in deployment**
+
+  - [**Safe Stable Diffusion**](https://huggingface.co/docs/diffusers/main/en/api/pipelines/stable_diffusion_safe): It mitigates the well-known issue that models, like Stable Diffusion, that are trained on unfiltered, web-crawled datasets tend to suffer from inappropriate degeneration. Related paper: [Safe Latent Diffusion: Mitigating Inappropriate Degeneration in Diffusion Models](https://arxiv.org/abs/2211.05105).
+
+- **Staged released on the Hub**: in particularly sensitive situations, access to some repositories should be restricted. This staged release is an intermediary step that allows the repository’s authors to have more control over its use.
+
+- **Licensing**: [OpenRAILs](https://huggingface.co/blog/open_rail), a new type of licensing, allow us to ensure free access while having a set of restrictions that ensure more responsible use. 
--- a/docs/source/en/conceptual/philosophy.mdx
+++ b/docs/source/en/conceptual/philosophy.mdx
@@ -0,0 +1,110 @@
+<!--Copyright 2022 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Philosophy
+
+🧨 Diffusers provides **state-of-the-art** pretrained diffusion models across multiple modalities.
+Its purpose is to serve as a **modular toolbox** for both inference and training.
+
+We aim at building a library that stands the test of time and therefore take API design very seriously.
+
+In a nutshell, Diffusers is built to be a natural extension of PyTorch. Therefore, most of our design choices are based on [PyTorch's Design Principles](https://pytorch.org/docs/stable/community/design.html#pytorch-design-philosophy). Let's go over the most important ones:
+
+## Usability over Performance
+
+- While Diffusers has many built-in performance-enhancing features (see [Memory and Speed](https://huggingface.co/docs/diffusers/optimization/fp16)), models are always loaded with the highest precision and lowest optimization. Therefore, by default diffusion pipelines are always instantiated on CPU with float32 precision if not otherwise defined by the user. This ensures usability across different platforms and accelerators and means that no complex installations are required to run the library.
+- Diffusers aim at being a **light-weight** package and therefore has very few required dependencies, but many soft dependencies that can improve performance (such as `accelerate`, `safetensors`, `onnx`, etc...). We strive to keep the library as lightweight as possible so that it can be added without much concern as a dependency on other packages.
+- Diffusers prefers simple, self-explainable code over condensed, magic code. This means that short-hand code syntaxes such as lambda functions, and advanced PyTorch operators are often not desired.
+
+## Simple over easy
+
+As PyTorch states, **explicit is better than implicit** and **simple is better than complex**. This design philosophy is reflected in multiple parts of the library: 
+- We follow PyTorch's API with methods like [`DiffusionPipeline.to`](https://huggingface.co/docs/diffusers/main/en/api/diffusion_pipeline#diffusers.DiffusionPipeline.to) to let the user handle device management.
+- Raising concise error messages is preferred to silently correct erroneous input. Diffusers aims at teaching the user, rather than making the library as easy to use as possible.
+- Complex model vs. scheduler logic is exposed instead of magically handled inside. Schedulers/Samplers are separated from diffusion models with minimal dependencies on each other. This forces the user to write the unrolled denoising loop. However, the separation allows for easier debugging and gives the user more control over adapting the denoising process or switching out diffusion models or schedulers.
+- Separately trained components of the diffusion pipeline, *e.g.* the text encoder, the unet, and the variational autoencoder, each have their own model class. This forces the user to handle the interaction between the different model components, and the serialization format separates the model components into different files. However, this allows for easier debugging and customization. Dreambooth or textual inversion training 
+is very simple thanks to diffusers' ability to separate single components of the diffusion pipeline.
+
+## Tweakable, contributor-friendly over abstraction
+
+For large parts of the library, Diffusers adopts an important design principle of the [Transformers library](https://github.com/huggingface/transformers), which is to prefer copy-pasted code over hasty abstractions. This design principle is very opinionated and stands in stark contrast to popular design principles such as [Don't repeat yourself (DRY)](https://en.wikipedia.org/wiki/Don%27t_repeat_yourself). 
+In short, just like Transformers does for modeling files, diffusers prefers to keep an extremely low level of abstraction and very self-contained code for pipelines and schedulers.
+Functions, long code blocks, and even classes can be copied across multiple files which at first can look like a bad, sloppy design choice that makes the library unmaintainable. 
+**However**, this design has proven to be extremely successful for Transformers and makes a lot of sense for community-driven, open-source machine learning libraries because:
+- Machine Learning is an extremely fast-moving field in which paradigms, model architectures, and algorithms are changing rapidly, which therefore makes it very difficult to define long-lasting code abstractions.
+- Machine Learning practitioners like to be able to quickly tweak existing code for ideation and research and therefore prefer self-contained code over one that contains many abstractions.
+- Open-source libraries rely on community contributions and therefore must build a library that is easy to contribute to. The more abstract the code, the more dependencies, the harder to read, and the harder to contribute to. Contributors simply stop contributing to very abstract libraries out of fear of breaking vital functionality. If contributing to a library cannot break other fundamental code, not only is it more inviting for potential new contributors, but it is also easier to review and contribute to multiple parts in parallel.
+
+At Hugging Face, we call this design the **single-file policy** which means that almost all of the code of a certain class should be written in a single, self-contained file. To read more about the philosophy, you can have a look
+at [this blog post](https://huggingface.co/blog/transformers-design-philosophy).
+
+In diffusers, we follow this philosophy for both pipelines and schedulers, but only partly for diffusion models. The reason we don't follow this design fully for diffusion models is because almost all diffusion pipelines, such 
+as [DDPM](https://huggingface.co/docs/diffusers/v0.12.0/en/api/pipelines/ddpm), [Stable Diffusion](https://huggingface.co/docs/diffusers/v0.12.0/en/api/pipelines/stable_diffusion/overview#stable-diffusion-pipelines), [UnCLIP (Dalle-2)](https://huggingface.co/docs/diffusers/v0.12.0/en/api/pipelines/unclip#overview) and [Imagen](https://imagen.research.google/) all rely on the same diffusion model, the [UNet](https://huggingface.co/docs/diffusers/api/models#diffusers.UNet2DConditionModel).
+
+Great, now you should have generally understood why 🧨 Diffusers is designed the way it is 🤗. 
+We try to apply these design principles consistently across the library. Nevertheless, there are some minor exceptions to the philosophy or some unlucky design choices. If you have feedback regarding the design, we would ❤️  to hear it [directly on GitHub](https://github.com/huggingface/diffusers/issues/new?assignees=&labels=&template=feedback.md&title=).
+
+## Design Philosophy in Details
+
+Now, let's look a bit into the nitty-gritty details of the design philosophy. Diffusers essentially consist of three major classes, [pipelines](https://github.com/huggingface/diffusers/tree/main/src/diffusers/pipelines), [models](https://github.com/huggingface/diffusers/tree/main/src/diffusers/models), and [schedulers](https://github.com/huggingface/diffusers/tree/main/src/diffusers/schedulers).
+Let's walk through more in-detail design decisions for each class.
+
+### Pipelines
+
+Pipelines are designed to be easy to use (therefore do not follow [*Simple over easy*](#simple-over-easy) 100%)), are not feature complete, and should loosely be seen as examples of how to use [models](#models) and [schedulers](#schedulers) for inference.
+
+The following design principles are followed:
+- Pipelines follow the single-file policy. All pipelines can be found in individual directories under src/diffusers/pipelines. One pipeline folder corresponds to one diffusion paper/project/release. Multiple pipeline files can be gathered in one pipeline folder, as it’s done for [`src/diffusers/pipelines/stable-diffusion`](https://github.com/huggingface/diffusers/tree/main/src/diffusers/pipelines/stable_diffusion). If pipelines share similar functionality, one can make use of the [#Copied from mechanism](https://github.com/huggingface/diffusers/blob/125d783076e5bd9785beb05367a2d2566843a271/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py#L251).
+- Pipelines all inherit from [`DiffusionPipeline`]
+- Every pipeline consists of different model and scheduler components, that are documented in the [`model_index.json` file](https://huggingface.co/runwayml/stable-diffusion-v1-5/blob/main/model_index.json), are accessible under the same name as attributes of the pipeline and can be shared between pipelines with [`DiffusionPipeline.components`](https://huggingface.co/docs/diffusers/main/en/api/diffusion_pipeline#diffusers.DiffusionPipeline.components) function.
+- Every pipeline should be loadable via the [`DiffusionPipeline.from_pretrained`](https://huggingface.co/docs/diffusers/main/en/api/diffusion_pipeline#diffusers.DiffusionPipeline.from_pretrained) function.
+- Pipelines should be used **only** for inference.
+- Pipelines should be very readable, self-explanatory, and easy to tweak.
+- Pipelines should be designed to build on top of each other and be easy to integrate into higher-level APIs.
+- Pipelines are **not** intended to be feature-complete user interfaces. For future complete user interfaces one should rather have a look at [InvokeAI](https://github.com/invoke-ai/InvokeAI), [Diffuzers](https://github.com/abhishekkrthakur/diffuzers), and [lama-cleaner](https://github.com/Sanster/lama-cleaner)
+- Every pipeline should have one and only one way to run it via a `__call__` method. The naming of the `__call__` arguments should be shared across all pipelines.
+- Pipelines should be named after the task they are intended to solve.
+- In almost all cases, novel diffusion pipelines shall be implemented in a new pipeline folder/file.
+
+### Models
+
+Models are designed as configurable toolboxes that are natural extensions of [PyTorch's Module class](https://pytorch.org/docs/stable/generated/torch.nn.Module.html). They only partly follow the **single-file policy**.
+
+The following design principles are followed:
+- Models correspond to **a type of model architecture**. *E.g.* the [`UNet2DConditionModel`] class is used for all UNet variations that expect 2D image inputs and are conditioned on some context.
+- All models can be found in [`src/diffusers/models`](https://github.com/huggingface/diffusers/tree/main/src/diffusers/models) and every model architecture shall be defined in its file, e.g. [`unet_2d_condition.py`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/unet_2d_condition.py), [`transformer_2d.py`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/transformer_2d.py), etc...
+- Models **do not** follow the single-file policy and should make use of smaller model building blocks, such as [`attention.py`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention.py), [`resnet.py`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/resnet.py), [`embeddings.py`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/embeddings.py), etc... **Note**: This is in stark contrast to Transformers' modeling files and shows that models do not really follow the single-file policy.
+- Models intend to expose complexity, just like PyTorch's module does, and give clear error messages.
+- Models all inherit from `ModelMixin` and `ConfigMixin`.
+- Models can be optimized for performance when it doesn’t demand major code changes, keeps backward compatibility, and gives significant memory or compute gain.
+- Models should by default have the highest precision and lowest performance setting.
+- To integrate new model checkpoints whose general architecture can be classified as an architecture that already exists in Diffusers, the existing model architecture shall be adapted to make it work with the new checkpoint. One should only create a new file if the model architecture is fundamentally different.
+- Models should be designed to be easily extendable to future changes. This can be achieved by limiting public function arguments, configuration arguments, and "foreseeing" future changes, *e.g.* it is usually better to add `string` "...type" arguments that can easily be extended to new future types instead of boolean `is_..._type` arguments. Only the minimum amount of changes shall be made to existing architectures to make a new model checkpoint work.
+- The model design is a difficult trade-off between keeping code readable and concise and supporting many model checkpoints. For most parts of the modeling code, classes shall be adapted for new model checkpoints, while there are some exceptions where it is preferred to add new classes to make sure the code is kept concise and 
+readable longterm, such as [UNet blocks](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/unet_2d_blocks.py) and [Attention processors](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py).
+
+### Schedulers
+
+Schedulers are responsible to guide the denoising process for inference as well as to define a noise schedule for training. They are designed as individual classes with loadable configuration files and strongly follow the **single-file policy**.
+
+The following design principles are followed:
+- All schedulers are found in [`src/diffusers/schedulers`](https://github.com/huggingface/diffusers/tree/main/src/diffusers/schedulers). 
+- Schedulers are **not** allowed to import from large utils files and shall be kept very self-contained. 
+- One scheduler python file corresponds to one scheduler algorithm (as might be defined in a paper). 
+- If schedulers share similar functionalities, we can make use of the `#Copied from` mechanism.
+- Schedulers all inherit from `SchedulerMixin` and `ConfigMixin`.
+- Schedulers can be easily swapped out with the [`ConfigMixin.from_config`](https://huggingface.co/docs/diffusers/main/en/api/configuration#diffusers.ConfigMixin.from_config) method as explained in detail [here](./using-diffusers/schedulers.mdx).
+- Every scheduler has to have a `set_num_inference_steps`, and a `step` function. `set_num_inference_steps(...)` has to be called before every denoising process, *i.e.* before `step(...)` is called.
+- Every scheduler exposes the timesteps to be "looped over" via a `timesteps` attribute, which is an array of timesteps the model will be called upon
+- The `step(...)` function takes a predicted model output and the "current" sample (x_t) and returns the "previous", slightly more denoised sample (x_t-1).
+- Given the complexity of diffusion schedulers, the `step` function does not expose all the complexity and can be a bit of a "black box".
+- In almost all cases, novel schedulers shall be implemented in a new scheduling file.
--- a/docs/source/en/imgs/access_request.png
+++ b/docs/source/en/imgs/access_request.png
--- a/docs/source/en/imgs/diffusers_library.jpg
+++ b/docs/source/en/imgs/diffusers_library.jpg
--- a/docs/source/en/index.mdx
+++ b/docs/source/en/index.mdx
@@ -0,0 +1,75 @@
+<!--Copyright 2022 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+<p align="center">
+    <br>
+    <img src="https://raw.githubusercontent.com/huggingface/diffusers/77aadfee6a891ab9fcfb780f87c693f7a5beeb8e/docs/source/imgs/diffusers_library.jpg" width="400"/>
+    <br>
+</p>
+
+# 🧨 Diffusers
+
+🤗 Diffusers provides pretrained vision and audio diffusion models, and serves as a modular toolbox for inference and training.
+
+More precisely, 🤗 Diffusers offers:
+
+- State-of-the-art diffusion pipelines that can be run in inference with just a couple of lines of code (see [**Using Diffusers**](./using-diffusers/conditional_image_generation)) or have a look at [**Pipelines**](#pipelines) to get an overview of all supported pipelines and their corresponding papers.
+- Various noise schedulers that can be used interchangeably for the preferred speed vs. quality trade-off in inference. For more information see [**Schedulers**](./api/schedulers/overview).
+- Multiple types of models, such as UNet, can be used as building blocks in an end-to-end diffusion system. See [**Models**](./api/models) for more details 
+- Training examples to show how to train the most popular diffusion model tasks. For more information see [**Training**](./training/overview).
+
+## 🧨 Diffusers Pipelines
+
+The following table summarizes all officially supported pipelines, their corresponding paper, and if 
+available a colab notebook to directly try them out.
+
+| Pipeline | Paper | Tasks | Colab
+|---|---|:---:|:---:|
+| [alt_diffusion](./api/pipelines/alt_diffusion) | [**AltDiffusion**](https://arxiv.org/abs/2211.06679) | Image-to-Image Text-Guided Generation |
+| [audio_diffusion](./api/pipelines/audio_diffusion) | [**Audio Diffusion**](https://github.com/teticio/audio-diffusion.git) | Unconditional Audio Generation | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/teticio/audio-diffusion/blob/master/notebooks/audio_diffusion_pipeline.ipynb)
+| [cycle_diffusion](./api/pipelines/cycle_diffusion) | [**Cycle Diffusion**](https://arxiv.org/abs/2210.05559) | Image-to-Image Text-Guided Generation |
+| [dance_diffusion](./api/pipelines/dance_diffusion) | [**Dance Diffusion**](https://github.com/williamberman/diffusers.git) | Unconditional Audio Generation |
+| [ddpm](./api/pipelines/ddpm) | [**Denoising Diffusion Probabilistic Models**](https://arxiv.org/abs/2006.11239) | Unconditional Image Generation |
+| [ddim](./api/pipelines/ddim) | [**Denoising Diffusion Implicit Models**](https://arxiv.org/abs/2010.02502) | Unconditional Image Generation |
+| [latent_diffusion](./api/pipelines/latent_diffusion) | [**High-Resolution Image Synthesis with Latent Diffusion Models**](https://arxiv.org/abs/2112.10752)| Text-to-Image Generation | 
+| [latent_diffusion](./api/pipelines/latent_diffusion) | [**High-Resolution Image Synthesis with Latent Diffusion Models**](https://arxiv.org/abs/2112.10752)| Super Resolution Image-to-Image | 
+| [latent_diffusion_uncond](./api/pipelines/latent_diffusion_uncond) | [**High-Resolution Image Synthesis with Latent Diffusion Models**](https://arxiv.org/abs/2112.10752) | Unconditional Image Generation | 
+| [paint_by_example](./api/pipelines/paint_by_example) | [**Paint by Example: Exemplar-based Image Editing with Diffusion Models**](https://arxiv.org/abs/2211.13227) | Image-Guided Image Inpainting | 
+| [pndm](./api/pipelines/pndm) | [**Pseudo Numerical Methods for Diffusion Models on Manifolds**](https://arxiv.org/abs/2202.09778) | Unconditional Image Generation | 
+| [score_sde_ve](./api/pipelines/score_sde_ve) | [**Score-Based Generative Modeling through Stochastic Differential Equations**](https://openreview.net/forum?id=PxTIG12RRHS) | Unconditional Image Generation | 
+| [score_sde_vp](./api/pipelines/score_sde_vp) | [**Score-Based Generative Modeling through Stochastic Differential Equations**](https://openreview.net/forum?id=PxTIG12RRHS) | Unconditional Image Generation | 
+| [semantic_stable_diffusion](./api/pipelines/semantic_stable_diffusion) | [**Semantic Guidance**](https://arxiv.org/abs/2301.12247) | Text-Guided Generation | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ml-research/semantic-image-editing/blob/main/examples/SemanticGuidance.ipynb)
+| [stable_diffusion_text2img](./api/pipelines/stable_diffusion/text2img) | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release) | Text-to-Image Generation | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/training_example.ipynb)
+| [stable_diffusion_img2img](./api/pipelines/stable_diffusion/img2img) | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release) | Image-to-Image Text-Guided Generation | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/image_2_image_using_diffusers.ipynb)
+| [stable_diffusion_inpaint](./api/pipelines/stable_diffusion/inpaint) | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release) | Text-Guided Image Inpainting | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/in_painting_with_stable_diffusion_using_diffusers.ipynb)
+| [stable_diffusion_panorama](./api/pipelines/stable_diffusion/panorama) | [**MultiDiffusion**](https://multidiffusion.github.io/) | Text-to-Panorama Generation |
+| [stable_diffusion_pix2pix](./api/pipelines/stable_diffusion/pix2pix) | [**InstructPix2Pix**](https://github.com/timothybrooks/instruct-pix2pix) | Text-Guided Image Editing| 
+| [stable_diffusion_pix2pix_zero](./api/pipelines/stable_diffusion/pix2pix_zero) | [**Zero-shot Image-to-Image Translation**](https://pix2pixzero.github.io/) | Text-Guided Image Editing | 
+| [stable_diffusion_attend_and_excite](./api/pipelines/stable_diffusion/attend_and_excite) | [**Attend and Excite for Stable Diffusion**](https://attendandexcite.github.io/Attend-and-Excite/) | Text-to-Image Generation | 
+| [stable_diffusion_self_attention_guidance](./api/pipelines/stable_diffusion/self_attention_guidance) | [**Self-Attention Guidance**](https://ku-cvlab.github.io/Self-Attention-Guidance) | Text-to-Image Generation | 
+| [stable_diffusion_image_variation](./stable_diffusion/image_variation) | [**Stable Diffusion Image Variations**](https://github.com/LambdaLabsML/lambda-diffusers#stable-diffusion-image-variations) | Image-to-Image Generation |
+| [stable_diffusion_latent_upscale](./stable_diffusion/latent_upscale) | [**Stable Diffusion Latent Upscaler**](https://twitter.com/StabilityAI/status/1590531958815064065) | Text-Guided Super Resolution Image-to-Image |
+| [stable_diffusion_2](./api/pipelines/stable_diffusion_2) | [**Stable Diffusion 2**](https://stability.ai/blog/stable-diffusion-v2-release) | Text-to-Image Generation |
+| [stable_diffusion_2](./api/pipelines/stable_diffusion_2) | [**Stable Diffusion 2**](https://stability.ai/blog/stable-diffusion-v2-release) | Text-Guided Image Inpainting | 
+| [stable_diffusion_2](./api/pipelines/stable_diffusion_2) | [**Depth-Conditional Stable Diffusion**](https://github.com/Stability-AI/stablediffusion#depth-conditional-stable-diffusion) | Depth-to-Image Generation | 
+| [stable_diffusion_2](./api/pipelines/stable_diffusion_2) | [**Stable Diffusion 2**](https://stability.ai/blog/stable-diffusion-v2-release) | Text-Guided Super Resolution Image-to-Image |
+| [stable_diffusion_safe](./api/pipelines/stable_diffusion_safe) | [**Safe Stable Diffusion**](https://arxiv.org/abs/2211.05105) | Text-Guided Generation | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ml-research/safe-latent-diffusion/blob/main/examples/Safe%20Latent%20Diffusion.ipynb)
+| [stable_unclip](./stable_unclip) | **Stable unCLIP** | Text-to-Image Generation |
+| [stable_unclip](./stable_unclip) | **Stable unCLIP** | Image-to-Image Text-Guided Generation |
+| [stochastic_karras_ve](./api/pipelines/stochastic_karras_ve) | [**Elucidating the Design Space of Diffusion-Based Generative Models**](https://arxiv.org/abs/2206.00364) | Unconditional Image Generation | 
+| [unclip](./api/pipelines/unclip) | [Hierarchical Text-Conditional Image Generation with CLIP Latents](https://arxiv.org/abs/2204.06125) | Text-to-Image Generation |
+| [versatile_diffusion](./api/pipelines/versatile_diffusion) | [Versatile Diffusion: Text, Images and Variations All in One Diffusion Model](https://arxiv.org/abs/2211.08332) | Text-to-Image Generation | 
+| [versatile_diffusion](./api/pipelines/versatile_diffusion) | [Versatile Diffusion: Text, Images and Variations All in One Diffusion Model](https://arxiv.org/abs/2211.08332) | Image Variations Generation | 
+| [versatile_diffusion](./api/pipelines/versatile_diffusion) | [Versatile Diffusion: Text, Images and Variations All in One Diffusion Model](https://arxiv.org/abs/2211.08332) | Dual Image and Text Guided Generation | 
+| [vq_diffusion](./api/pipelines/vq_diffusion) | [Vector Quantized Diffusion Model for Text-to-Image Synthesis](https://arxiv.org/abs/2111.14822) | Text-to-Image Generation | 
+
+**Note**: Pipelines are simple examples of how to play around with the diffusion systems as described in the corresponding papers. 
--- a/docs/source/en/installation.mdx
+++ b/docs/source/en/installation.mdx
--- a/docs/source/en/optimization/fp16.mdx
+++ b/docs/source/en/optimization/fp16.mdx
@@ -20,7 +20,6 @@ We'll discuss how the following settings impact performance and memory.
 | ---------------- | ------- | ------- |
 | original         | 9.50s   | x1      |
 | cuDNN auto-tuner | 9.37s   | x1.01   |
-| autocast (fp16)  | 5.47s   | x1.74   |
 | fp16             | 3.61s   | x2.63   |
 | channels last    | 3.30s   | x2.88   |
 | traced UNet      | 3.21s   | x2.96   |
@@ -54,27 +53,9 @@ import torch
 torch.backends.cuda.matmul.allow_tf32 = True
 ```

-## Automatic mixed precision (AMP)
-
-If you use a CUDA GPU, you can take advantage of `torch.autocast` to perform inference roughly twice as fast at the cost of slightly lower precision. All you need to do is put your inference call inside an `autocast` context manager. The following example shows how to do it using Stable Diffusion text-to-image generation as an example:
-
-```Python
-from torch import autocast
-from diffusers import StableDiffusionPipeline
-
-pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
-pipe = pipe.to("cuda")
-
-prompt = "a photo of an astronaut riding a horse on mars"
-with autocast("cuda"):
-    image = pipe(prompt).images[0]
-```
-
-Despite the precision loss, in our experience the final image results look the same as the `float32` versions. Feel free to experiment and report back!
-
 ## Half precision weights

-To save more GPU memory and get even more speed, you can load and run the model weights directly in half precision. This involves loading the float16 version of the weights, which was saved to a branch named `fp16`, and telling PyTorch to use the `float16` type when loading them:
+To save more GPU memory and get more speed, you can load and run the model weights directly in half precision. This involves loading the float16 version of the weights, which was saved to a branch named `fp16`, and telling PyTorch to use the `float16` type when loading them:

 ```Python
 pipe = StableDiffusionPipeline.from_pretrained(
@@ -88,6 +69,11 @@ prompt = "a photo of an astronaut riding a horse on mars"
 image = pipe(prompt).images[0]
 ```

+<Tip warning={true}>
+  It is strongly discouraged to make use of [`torch.autocast`](https://pytorch.org/docs/stable/amp.html#torch.autocast) in any of the pipelines as it can lead to black images and is always slower than using pure 
+  float16 precision.
+</Tip>
+
 ## Sliced attention for additional memory savings

 For even additional memory savings, you can use a sliced version of attention that performs the computation in steps instead of all at once.
@@ -147,9 +133,10 @@ images = pipe([prompt] * 32).images
 You may see a small performance boost in VAE decode on multi-image batches. There should be no performance impact on single-image batches.


+<a name="sequential_offloading"></a>
 ## Offloading to CPU with accelerate for memory savings

-For additional memory savings, you can offload the weights to CPU and load them to GPU when performing the forward pass.
+For additional memory savings, you can offload the weights to CPU and only load them to GPU when performing the forward pass.

 To perform CPU offloading, all you have to do is invoke [`~StableDiffusionPipeline.enable_sequential_cpu_offload`]:

@@ -162,16 +149,21 @@ pipe = StableDiffusionPipeline.from_pretrained(
    
    torch_dtype=torch.float16,
 )
-pipe = pipe.to("cuda")

 prompt = "a photo of an astronaut riding a horse on mars"
 pipe.enable_sequential_cpu_offload()
 image = pipe(prompt).images[0]
 ```

-And you can get the memory consumption to < 2GB.
+And you can get the memory consumption to < 3GB.

-If is also possible to chain it with attention slicing for minimal memory consumption, running it in as little as < 800mb of GPU vRAM:
+Note that this method works at the submodule level, not on whole models. This is the best way to minimize memory consumption, but inference is much slower due to the iterative nature of the process. The UNet component of the pipeline runs several times (as many as `num_inference_steps`); each time, the different submodules of the UNet are sequentially onloaded and then offloaded as they are needed, so the number of memory transfers is large.
+
+<Tip>
+Consider using <a href="#model_offloading">model offloading</a> as another point in the optimization space: it will be much faster, but memory savings won't be as large.
+</Tip>
+
+It is also possible to chain offloading with attention slicing for minimal memory consumption (< 2GB).

 ```Python
 import torch
@@ -182,7 +174,6 @@ pipe = StableDiffusionPipeline.from_pretrained(
    
    torch_dtype=torch.float16,
 )
-pipe = pipe.to("cuda")

 prompt = "a photo of an astronaut riding a horse on mars"
 pipe.enable_sequential_cpu_offload()
@@ -191,6 +182,57 @@ pipe.enable_attention_slicing(1)
 image = pipe(prompt).images[0]
 ```

+**Note**: When using `enable_sequential_cpu_offload()`, it is important to **not** move the pipeline to CUDA beforehand or else the gain in memory consumption will only be minimal. See [this issue](https://github.com/huggingface/diffusers/issues/1934) for more information.
+
+
+<a name="model_offloading"></a>
+## Model offloading for fast inference and memory savings
+
+[Sequential CPU offloading](#sequential_offloading), as discussed in the previous section, preserves a lot of memory but makes inference slower, because submodules are moved to GPU as needed, and immediately returned to CPU when a new module runs.
+
+Full-model offloading is an alternative that moves whole models to the GPU, instead of handling each model's constituent _modules_. This results in a negligible impact on inference time (compared with moving the pipeline to `cuda`), while still providing some memory savings.
+
+In this scenario, only one of the main components of the pipeline (typically: text encoder, unet and vae)
+will be in the GPU while the others wait in the CPU. Compoments like the UNet that run for multiple iterations will stay on GPU until they are no longer needed.
+
+This feature can be enabled by invoking `enable_model_cpu_offload()` on the pipeline, as shown below.
+
+```Python
+import torch
+from diffusers import StableDiffusionPipeline
+
+pipe = StableDiffusionPipeline.from_pretrained(
+    "runwayml/stable-diffusion-v1-5",  
+    torch_dtype=torch.float16,
+)
+
+prompt = "a photo of an astronaut riding a horse on mars"
+pipe.enable_model_cpu_offload()
+image = pipe(prompt).images[0]
+```
+
+This is also compatible with attention slicing for additional memory savings.
+
+```Python
+import torch
+from diffusers import StableDiffusionPipeline
+
+pipe = StableDiffusionPipeline.from_pretrained(
+    "runwayml/stable-diffusion-v1-5",
+    torch_dtype=torch.float16,
+)
+
+prompt = "a photo of an astronaut riding a horse on mars"
+pipe.enable_model_cpu_offload()
+pipe.enable_attention_slicing(1)
+
+image = pipe(prompt).images[0]
+```
+
+<Tip>
+This feature requires `accelerate` version 0.17.0 or larger.
+</Tip>
+
 ## Using Channels Last memory format

 Channels last memory format is an alternative way of ordering NCHW tensors in memory preserving dimensions ordering. Channels last tensors ordered in such a way that channels become the densest dimension (aka storing images pixel-per-pixel). Since not all operators currently support channels last format it may result in a worst performance, so it's better to try it and see if it works for your model.
@@ -224,6 +266,7 @@ torch.set_grad_enabled(False)
 n_experiments = 2
 unet_runs_per_experiment = 50

+
 # load inputs
 def generate_inputs():
    sample = torch.randn(2, 4, 64, 64).half().cuda()
@@ -302,6 +345,8 @@ pipe = StableDiffusionPipeline.from_pretrained(

 # use jitted unet
 unet_traced = torch.jit.load("unet_traced.pt")
+
+
 # del pipe.unet
 class TracedUNet(torch.nn.Module):
    def __init__(self):
@@ -357,4 +402,4 @@ with torch.inference_mode():

 # optional: You can disable it via
 # pipe.disable_xformers_memory_efficient_attention()
-```
+```
--- a/docs/source/en/optimization/habana.mdx
+++ b/docs/source/en/optimization/habana.mdx
--- a/docs/source/en/optimization/mps.mdx
+++ b/docs/source/en/optimization/mps.mdx
--- a/docs/source/en/optimization/onnx.mdx
+++ b/docs/source/en/optimization/onnx.mdx
--- a/docs/source/en/optimization/open_vino.mdx
+++ b/docs/source/en/optimization/open_vino.mdx
--- a/docs/source/en/optimization/torch2.0.mdx
+++ b/docs/source/en/optimization/torch2.0.mdx
@@ -0,0 +1,200 @@
+<!--Copyright 2023 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Torch2.0 support in Diffusers
+
+Starting from version `0.13.0`, Diffusers supports the latest optimization from the upcoming [PyTorch 2.0](https://pytorch.org/get-started/pytorch-2.0/) release. These include:
+1. Support for native flash and memory-efficient attention without any extra dependencies.
+2. [torch.compile](https://pytorch.org/tutorials/intermediate/torch_compile_tutorial.html) support for compiling individual models for extra performance boost.
+
+
+## Installation
+To benefit from the native efficient attention and `torch.compile`, we will need to install the nightly version of PyTorch as the stable version is yet to be released. The first step is to install CUDA11.7 or CUDA11.8, 
+as torch2.0 does not support the previous versions. Once CUDA is installed, torch nightly can be installed using:
+
+```bash
+pip install --pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/cu117
+```
+
+## Using efficient attention and torch.compile.
+
+
+1. **Efficient Attention**
+
+   Efficient attention is implemented via the [`torch.nn.functional.scaled_dot_product_attention`](https://pytorch.org/docs/master/generated/torch.nn.functional.scaled_dot_product_attention) function, which automatically enables flash/memory efficient attention, depending on the input and the GPU type. This is the same as the `memory_efficient_attention` from [xFormers](https://github.com/facebookresearch/xformers) but built natively into PyTorch. 
+
+   Efficient attention will be enabled by default in Diffusers if torch2.0 is installed and if `torch.nn.functional.scaled_dot_product_attention` is available. To use it, you can install torch2.0 as suggested above and use the pipeline. For example:
+
+    ```Python
+    import torch
+    from diffusers import StableDiffusionPipeline
+
+    pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16)
+    pipe = pipe.to("cuda")
+
+    prompt = "a photo of an astronaut riding a horse on mars"
+    image = pipe(prompt).images[0]
+    ```
+
+    If you want to enable it explicitly (which is not required), you can do so as shown below.
+
+    ```Python
+    import torch
+    from diffusers import StableDiffusionPipeline
+    from diffusers.models.cross_attention import AttnProcessor2_0
+
+    pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16).to("cuda")
+    pipe.unet.set_attn_processor(AttnProcessor2_0())
+
+    prompt = "a photo of an astronaut riding a horse on mars"
+    image = pipe(prompt).images[0]
+    ```
+
+    This should be as fast and memory efficient as `xFormers`.
+
+
+2. **torch.compile**
+
+    To get an additional speedup, we can use the new `torch.compile` feature. To do so, we wrap our `unet` with `torch.compile`. For more information and different options, refer to the 
+    [torch compile docs](https://pytorch.org/tutorials/intermediate/torch_compile_tutorial.html).
+
+    ```python
+    import torch
+    from diffusers import StableDiffusionPipeline
+
+    pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16).to(
+        "cuda"
+    )
+    pipe.unet = torch.compile(pipe.unet)
+
+    batch_size = 10
+    prompt = "A photo of an astronaut riding a horse on marse."
+    images = pipe(prompt, num_inference_steps=steps, num_images_per_prompt=batch_size).images
+    ```
+
+    Depending on the type of GPU it can give between 2-9% speed-up over efficient attention. But note that as of now the speed-up is mostly noticeable on the more recent GPU architectures, such as in the A100.
+    
+    Note that compilation will also take some time to complete, so it is best suited for situations where you need to prepare your pipeline once and then perform the same type of inference operations multiple times.
+
+
+## Benchmark
+
+We conducted a simple benchmark on different GPUs to compare vanilla attention, xFormers, `torch.nn.functional.scaled_dot_product_attention` and `torch.compile+torch.nn.functional.scaled_dot_product_attention`.
+For the benchmark we used the the [stable-diffusion-v1-4](https://huggingface.co/CompVis/stable-diffusion-v1-4) model with 50 steps. `xFormers` benchmark is done using the `torch==1.13.1` version. The table below summarizes the result that we got.
+The `Speed over xformers` columns denotes the speed-up gained over `xFormers` using the `torch.compile+torch.nn.functional.scaled_dot_product_attention`.
+
+
+### FP16 benchmark
+
+The table below shows the benchmark results for inference using `fp16`. As we can see, `torch.nn.functional.scaled_dot_product_attention` is as fast as `xFormers` (sometimes slightly faster/slower) on all the GPUs we tested.
+And using `torch.compile` gives further speed-up up to 10% over `xFormers`, but it's mostly noticeable on the A100 GPU.
+
+___The time reported is in seconds.___
+
+| GPU | Batch Size | Vanilla Attention | xFormers | PyTorch2.0 SDPA | SDPA + torch.compile | Speed over xformers (%) |
+| --- | --- | --- | --- | --- | --- | --- |
+| A100 | 10 | 12.02 | 8.7 | 8.79 | 7.89 | 9.31 |
+| A100 | 16 | 18.95 | 13.57 | 13.67 | 12.25 | 9.73 |
+| A100 | 32 (1) | OOM | 26.56 | 26.68 | 24.08 | 9.34 |
+| A100 | 64(2) | | 52.51 | 53.03 | 47.81 | 8.95 |
+| | | | | | | |
+| A10 | 4 | 13.94 | 9.81 | 10.01 | 9.35 | 4.69 |
+| A10 | 8 | 27.09 | 19 | 19.53 | 18.33 | 3.53 |
+| A10 | 10 | 33.69 | 23.53 | 24.19 | 22.52 | 4.29 |
+| A10 | 16 | OOM | 37.55 | 38.31 | 36.81 | 1.97 |
+| A10 | 32 (1) | | 77.19 | 78.43 | 76.64 | 0.71 |
+| A10 | 64 (1) | | 173.59 | 158.99 | 155.14 | 10.63 |
+| | | | | | | |
+| T4 | 4 | 38.81 | 30.09 | 29.74 | 27.55 | 8.44 |
+| T4 | 8 | OOM | 55.71 | 55.99 | 53.85 | 3.34 |
+| T4 | 10 | OOM | 68.96 | 69.86 | 65.35 | 5.23 |
+| T4 | 16 | OOM | 111.47 | 113.26  | 106.93  | 4.07 |
+| | | | | | | |
+| V100 | 4 | 9.84 | 8.16 | 8.09 | 7.65 | 6.25 |
+| V100 | 8 | OOM | 15.62 | 15.44 | 14.59 | 6.59 |
+| V100 | 10 | OOM | 19.52 | 19.28 | 18.18 | 6.86 |
+| V100 | 16 | OOM | 30.29 | 29.84 | 28.22 | 6.83 |
+| | | | | | | |
+| 3090 | 4 | 10.04 | 7.82 | 7.89 | 7.47 | 4.48 |
+| 3090 | 8 | 19.27 | 14.97 | 15.04 | 14.22 | 5.01 |
+| 3090 | 10| 24.08 | 18.7 | 18.7 | 17.69 | 5.40 |
+| 3090 | 16 | OOM | 29.06 | 29.06 | 28.2 | 2.96 |
+| 3090 | 32 (1) | | 58.05 | 58 | 54.88 | 5.46 |
+| 3090 | 64 (1) | | 126.54 | 126.03 | 117.33 | 7.28 |
+| | | | | | | |
+| 3090 Ti | 4 | 9.07 | 7.14 | 7.15 | 6.81 | 4.62 |
+| 3090 Ti | 8 | 17.51 | 13.65 | 13.72 | 12.99 | 4.84 |
+| 3090 Ti | 10 (2) | 21.79 | 16.85 | 16.93 | 16.02 | 4.93 |
+| 3090 Ti | 16 | OOM | 26.1 | 26.28 | 25.46 | 2.45 |
+| 3090 Ti | 32 (1) | | 51.78 | 52.04 | 49.15 | 5.08 |
+| 3090 Ti | 64 (1) | | 112.02 | 112.33 | 103.91 | 7.24 |
+
+
+				
+### FP32 benchmark
+
+The table below shows the benchmark results for inference using `fp32`. As we can see, `torch.nn.functional.scaled_dot_product_attention` is as fast as `xFormers` (sometimes slightly faster/slower) on all the GPUs we tested.
+Using `torch.compile` with efficient attention gives up to 18% performance improvement over `xFormers` in Ampere cards, and up to 20% over vanilla attention.
+
+| GPU | Batch Size | Vanilla Attention | xFormers | PyTorch2.0 SDPA | SDPA + torch.compile | Speed over xformers (%) | Speed over vanilla (%) |
+| --- | --- | --- | --- | --- | --- | --- | --- |
+| A100 | 4 | 16.56 | 12.42 | 12.2 | 11.84 | 4.67 | 28.50 |
+| A100 | 10 | OOM | 29.93 | 29.44 | 28.5 | 4.78 | |
+| A100 | 16 | | 47.08 | 46.27 | 44.8 | 4.84 | |
+| A100 | 32 | | 92.89 | 91.34 | 88.35 | 4.89 | |
+| A100 | 64 | | 185.3 | 182.71 | 176.48 | 4.76 | |
+| | | | | | | |
+| A10 | 1 | 10.59 | 8.81 | 7.51 | 7.35 | 16.57 | 30.59 |
+| A10 | 4 | 34.77 | 27.63 | 22.77 | 22.07 | 20.12 | 36.53 |
+| A10 | 8 | | 56.19 | 43.53 | 43.86 | 21.94 | |
+| A10 | 16 | | 116.49 | 88.56 | 86.64 | 25.62 | |
+| A10 | 32 | | 221.95 | 175.74 | 168.18 | 24.23 | |
+| A10 | 48 | | 333.23 | 264.84 | | 20.52 | |
+| | | | | | | |
+| T4 | 1 | 28.2 | 24.49 | 23.93 | 23.56 | 3.80 | 16.45 |
+| T4 | 2 | 52.77 | 45.7 | 45.88 | 45.06 | 1.40 | 14.61 |
+| T4 | 4 | OOM | 85.72 | 85.78 | 84.48 | 1.45 | |
+| T4 | 8 | | 149.64 | 150.75 | 148.4 | 0.83 | |
+| | | | | | | |
+| V100 | 1 | 7.4 | 6.84 | 6.8 | 6.66 | 2.63 | 10.00 |
+| V100 | 2 | 13.85 | 12.81 | 12.66 | 12.35 | 3.59 | 10.83 |
+| V100 | 4 | OOM | 25.73 | 25.31 | 24.78 | 3.69 | |
+| V100 | 8 | | 43.95 | 43.37 | 42.25 | 3.87 | |
+| V100 | 16 | | 84.99 | 84.73 | 82.55 | 2.87 | |
+| | | | | | | |
+| 3090 | 1 | 7.09 | 6.78 | 6.11 | 6.03 | 11.06 | 14.95 |
+| 3090 | 4 | 22.69 | 21.45 | 18.67 | 18.09 | 15.66 | 20.27 |
+| 3090 | 8 (2) | | 42.59 | 36.75 | 35.59 | 16.44 | |
+| 3090 | 16 | | 85.35 | 72.37 | 70.25 | 17.69 | |
+| 3090 | 32 (1) | | 162.05 | 138.99 | 134.53 | 16.98 | |
+| 3090 | 48 | | 241.91 | 207.75 | | 14.12 | |
+| | | | | | | |
+| 3090 Ti | 1 | 6.45 | 6.19 | 5.64 | 5.49 | 11.31 | 14.88 |
+| 3090 Ti | 4 | 20.32 | 19.31 | 16.9 | 16.37 | 15.23 | 19.44 |
+| 3090 Ti | 8 (2) | | 37.93 | 33.05 | 31.99 | 15.66 | |
+| 3090 Ti | 16 | | 75.37 | 65.25 | 64.32 | 14.66 | |
+| 3090 Ti | 32 (1) | | 142.55 | 124.44 | 120.74 | 15.30 | |
+| 3090 Ti | 48 | | 213.19 | 186.55 | | 12.50 | |
+| | | | | | | |
+| 4090 | 1 | 5.54 | 4.99 | 4.51 | | | |
+| 4090 | 4 | 13.67 | 11.4 | 10.3 | | | |
+| 4090 | 8 (2) | | 19.79 | 17.13 | | | |
+| 4090 | 16 | | 38.62 | 33.14 | | | |
+| 4090 | 32 (1) | | 76.57 | 65.96 | | | |
+| 4090 | 48 | | 114.44 | 98.78 | | | |
+
+
+
+(1) Batch Size >= 32 requires enable_vae_slicing() because of https://github.com/pytorch/pytorch/issues/81665																										
+This is required for PyTorch 1.13.1, and also for PyTorch 2.0 and batch size of 64
+
+For more details about how this benchmark was run, please refer to [this PR](https://github.com/huggingface/diffusers/pull/2303).	
--- a/Show More
+++ b/Show More