Merge branch 'main' into fix-lora-device-test

empty
2026-02-16 07:50:05 +08:00 · 2024-04-25 17:13:28 +05:30 · 2024-04-24 10:58:50 +05:30 · 2024-04-23 20:40:11 +05:30 · 2024-04-23 15:30:26 +05:30 · 2024-04-22 17:23:42 +05:30
523 changed files with 8809 additions and 24793 deletions
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@@ -39,7 +39,7 @@ jobs:
          python utils/print_env.py
      - name: Diffusers Benchmarking
        env:
-            HF_TOKEN: ${{ secrets.DIFFUSERS_BOT_TOKEN }}
+            HUGGING_FACE_HUB_TOKEN: ${{ secrets.DIFFUSERS_BOT_TOKEN }}
            BASE_PATH: benchmark_outputs
        run: |
          export TOTAL_GPU_MEMORY=$(python -c "import torch; print(torch.cuda.get_device_properties(0).total_memory / (1024**3))")
--- a/.github/workflows/build_docker_images.yml
+++ b/.github/workflows/build_docker_images.yml
@@ -25,17 +25,17 @@ jobs:
    steps:
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v1
-
+      
      - name: Check out code
        uses: actions/checkout@v3
-
+      
      - name: Find Changed Dockerfiles
        id: file_changes
        uses: jitterbit/get-changed-files@v1
        with:
          format: 'space-delimited'
          token: ${{ secrets.GITHUB_TOKEN }}
-
+      
      - name: Build Changed Docker Images
        run: |
          CHANGED_FILES="${{ steps.file_changes.outputs.all }}"
@@ -52,7 +52,7 @@ jobs:
  build-and-push-docker-images:
    runs-on: [ self-hosted, intel-cpu, 8-cpu, ci ]
    if: github.event_name != 'pull_request'
-
+    
    permissions:
      contents: read
      packages: write
@@ -69,7 +69,6 @@ jobs:
          - diffusers-flax-tpu
          - diffusers-onnxruntime-cpu
          - diffusers-onnxruntime-cuda
-          - diffusers-doc-builder

    steps:
      - name: Checkout repository
@@ -91,11 +90,24 @@ jobs:

      - name: Post to a Slack channel
        id: slack
-        uses: huggingface/hf-workflows/.github/actions/post-slack@main
+        uses: slackapi/slack-github-action@6c661ce58804a1a20f6dc5fbee7f0381b469e001
        with:
          # Slack channel id, channel name, or user id to post message.
          # See also: https://api.slack.com/methods/chat.postMessage#channels
-          slack_channel: ${{ env.CI_SLACK_CHANNEL }}
-          title: "🤗 Results of the ${{ matrix.image-name }} Docker Image build"
-          status: ${{ job.status }}
-          slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
+          channel-id: ${{ env.CI_SLACK_CHANNEL }}
+          # For posting a rich message using Block Kit
+          payload: |
+            {
+              "text": "${{ matrix.image-name }} Docker Image build result: ${{ job.status }}\n${{ github.event.head_commit.url }}",
+              "blocks": [
+                {
+                  "type": "section",
+                  "text": {
+                    "type": "mrkdwn",
+                    "text": "${{ matrix.image-name }} Docker Image build result: ${{ job.status }}\n${{ github.event.head_commit.url }}"
+                  }
+                }
+              ]
+            }
+        env:
+          SLACK_BOT_TOKEN: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
--- a/.github/workflows/build_documentation.yml
+++ b/.github/workflows/build_documentation.yml
@@ -21,7 +21,7 @@ jobs:
      package: diffusers
      notebook_folder: diffusers_doc
      languages: en ko zh ja pt
-      custom_container: diffusers/diffusers-doc-builder
+
    secrets:
      token: ${{ secrets.HUGGINGFACE_PUSH }}
      hf_token: ${{ secrets.HF_DOC_BUILD_PUSH }}
--- a/.github/workflows/build_pr_documentation.yml
+++ b/.github/workflows/build_pr_documentation.yml
@@ -20,4 +20,3 @@ jobs:
      install_libgl1: true
      package: diffusers
      languages: en ko zh ja pt
-      custom_container: diffusers/diffusers-doc-builder
--- a/.github/workflows/nightly_tests.yml
+++ b/.github/workflows/nightly_tests.yml
@@ -19,7 +19,7 @@ env:
 jobs:
  setup_torch_cuda_pipeline_matrix:
    name: Setup Torch Pipelines Matrix
-    runs-on: diffusers/diffusers-pytorch-cpu
+    runs-on: ubuntu-latest
    outputs:
      pipeline_test_matrix: ${{ steps.fetch_pipeline_matrix.outputs.pipeline_test_matrix }}
    steps:
@@ -67,30 +67,30 @@ jobs:
          fetch-depth: 2
      - name: NVIDIA-SMI
        run: nvidia-smi
-
+      
      - name: Install dependencies
        run: |
          python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
          python -m uv pip install -e [quality,test]
          python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate.git
          python -m uv pip install pytest-reportlog
-
+      
      - name: Environment
        run: |
          python utils/print_env.py
-
-      - name: Nightly PyTorch CUDA checkpoint (pipelines) tests
+      
+      - name: Nightly PyTorch CUDA checkpoint (pipelines) tests 
        env:
-          HF_TOKEN: ${{ secrets.HF_TOKEN }}
+          HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
          # https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
          CUBLAS_WORKSPACE_CONFIG: :16:8
        run: |
          python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
            -s -v -k "not Flax and not Onnx" \
            --make-reports=tests_pipeline_${{ matrix.module }}_cuda \
            --report-log=tests_pipeline_${{ matrix.module }}_cuda.log \
            tests/pipelines/${{ matrix.module }}
-
+      
      - name: Failure short reports
        if: ${{ failure() }}
        run: |
@@ -103,7 +103,7 @@ jobs:
        with:
          name: pipeline_${{ matrix.module }}_test_reports
          path: reports
-
+      
      - name: Generate Report and Notify Channel
        if: always()
        run: |
@@ -112,7 +112,7 @@ jobs:

  run_nightly_tests_for_other_torch_modules:
    name: Torch Non-Pipelines CUDA Nightly Tests
-    runs-on: [single-gpu, nvidia-gpu, t4, ci]
+    runs-on: docker-gpu
    container:
      image: diffusers/diffusers-pytorch-cuda
      options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ --gpus 0
@@ -139,35 +139,35 @@ jobs:
      run: python utils/print_env.py

    - name: Run nightly PyTorch CUDA tests for non-pipeline modules
-      if: ${{ matrix.module != 'examples'}}
+      if: ${{ matrix.module != 'examples'}} 
      env:
-        HF_TOKEN: ${{ secrets.HF_TOKEN }}
+        HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
        # https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
        CUBLAS_WORKSPACE_CONFIG: :16:8
      run: |
        python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
          -s -v -k "not Flax and not Onnx" \
          --make-reports=tests_torch_${{ matrix.module }}_cuda \
          --report-log=tests_torch_${{ matrix.module }}_cuda.log \
          tests/${{ matrix.module }}

    - name: Run nightly example tests with Torch
      if: ${{ matrix.module == 'examples' }}
      env:
-        HF_TOKEN: ${{ secrets.HF_TOKEN }}
+        HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
        # https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
        CUBLAS_WORKSPACE_CONFIG: :16:8
      run: |
        python -m uv pip install peft@git+https://github.com/huggingface/peft.git
        python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
          -s -v --make-reports=examples_torch_cuda \
          --report-log=examples_torch_cuda.log \
          examples/

    - name: Failure short reports
      if: ${{ failure() }}
      run: |
-        cat reports/tests_torch_${{ matrix.module }}_cuda_stats.txt
+        cat reports/tests_torch_${{ matrix.module }}_cuda_stats.txt 
        cat reports/tests_torch_${{ matrix.module }}_cuda_failures_short.txt

    - name: Test suite reports artifacts
@@ -185,7 +185,7 @@ jobs:

  run_lora_nightly_tests:
    name: Nightly LoRA Tests with PEFT and TORCH
-    runs-on: [single-gpu, nvidia-gpu, t4, ci]
+    runs-on: docker-gpu
    container:
      image: diffusers/diffusers-pytorch-cuda
      options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ --gpus 0
@@ -211,20 +211,20 @@ jobs:

    - name: Run nightly LoRA tests with PEFT and Torch
      env:
-        HF_TOKEN: ${{ secrets.HF_TOKEN }}
+        HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
        # https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
        CUBLAS_WORKSPACE_CONFIG: :16:8
      run: |
        python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
          -s -v -k "not Flax and not Onnx" \
          --make-reports=tests_torch_lora_cuda \
          --report-log=tests_torch_lora_cuda.log \
          tests/lora
-
+    
    - name: Failure short reports
      if: ${{ failure() }}
      run: |
-        cat reports/tests_torch_lora_cuda_stats.txt
+        cat reports/tests_torch_lora_cuda_stats.txt 
        cat reports/tests_torch_lora_cuda_failures_short.txt

    - name: Test suite reports artifacts
@@ -239,12 +239,12 @@ jobs:
      run: |
        pip install slack_sdk tabulate
        python scripts/log_reports.py >> $GITHUB_STEP_SUMMARY
-
+  
  run_flax_tpu_tests:
    name: Nightly Flax TPU Tests
    runs-on: docker-tpu
    if: github.event_name == 'schedule'
-
+    
    container:
      image: diffusers/diffusers-flax-tpu
      options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ --privileged
@@ -269,12 +269,12 @@ jobs:

    - name: Run nightly Flax TPU tests
      env:
-        HF_TOKEN: ${{ secrets.HF_TOKEN }}
+        HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
      run: |
        python -m pytest -n 0 \
          -s -v -k "Flax" \
          --make-reports=tests_flax_tpu \
          --report-log=tests_flax_tpu.log \
          tests/

    - name: Failure short reports
@@ -298,11 +298,11 @@ jobs:

  run_nightly_onnx_tests:
    name: Nightly ONNXRuntime CUDA tests on Ubuntu
-    runs-on: [single-gpu, nvidia-gpu, t4, ci]
+    runs-on: docker-gpu
    container:
      image: diffusers/diffusers-onnxruntime-cuda
      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/
-
+    
    steps:
    - name: Checkout diffusers
      uses: actions/checkout@v3
@@ -321,15 +321,15 @@ jobs:

    - name: Environment
      run: python utils/print_env.py
-
+    
    - name: Run nightly ONNXRuntime CUDA tests
      env:
-        HF_TOKEN: ${{ secrets.HF_TOKEN }}
+        HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
      run: |
        python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
          -s -v -k "Onnx" \
          --make-reports=tests_onnx_cuda \
          --report-log=tests_onnx_cuda.log \
          tests/

    - name: Failure short reports
@@ -344,7 +344,7 @@ jobs:
      with:
        name: ${{ matrix.config.report }}_test_reports
        path: reports
-
+    
    - name: Generate Report and Notify Channel
      if: always()
      run: |
@@ -390,7 +390,7 @@ jobs:
        shell: arch -arch arm64 bash {0}
        env:
          HF_HOME: /System/Volumes/Data/mnt/cache
-          HF_TOKEN: ${{ secrets.HF_TOKEN }}
+          HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
        run: |
          ${CONDA_RUN} python -m pytest -n 1 -s -v --make-reports=tests_torch_mps \
            --report-log=tests_torch_mps.log \
--- a/.github/workflows/pr_test_fetcher.yml
+++ b/.github/workflows/pr_test_fetcher.yml
@@ -15,7 +15,7 @@ concurrency:
 jobs:
  setup_pr_tests:
    name: Setup PR Tests
-    runs-on: [ self-hosted, intel-cpu, 8-cpu, ci ]
+    runs-on: docker-cpu
    container:
      image: diffusers/diffusers-pytorch-cpu
      options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/
@@ -73,7 +73,7 @@ jobs:
      max-parallel: 2
      matrix:
        modules: ${{ fromJson(needs.setup_pr_tests.outputs.matrix) }}
-    runs-on: [ self-hosted, intel-cpu, 8-cpu, ci ]
+    runs-on: docker-cpu
    container:
      image: diffusers/diffusers-pytorch-cpu
      options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/
@@ -123,7 +123,7 @@ jobs:
        config:
          - name: Hub tests for models, schedulers, and pipelines
            framework: hub_tests_pytorch
-            runner: [ self-hosted, intel-cpu, 8-cpu, ci ]
+            runner: docker-cpu
            image: diffusers/diffusers-pytorch-cpu
            report: torch_hub

--- a/.github/workflows/pr_tests.yml
+++ b/.github/workflows/pr_tests.yml
@@ -156,7 +156,7 @@ jobs:
      if: ${{ matrix.config.framework == 'pytorch_examples' }}
      run: |
        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
-        python -m uv pip install peft timm
+        python -m uv pip install peft
        python -m pytest -n 4 --max-worker-restart=0 --dist=loadfile \
          --make-reports=tests_${{ matrix.config.report }} \
          examples
--- a/.github/workflows/push_tests.yml
+++ b/.github/workflows/push_tests.yml
@@ -21,9 +21,7 @@ env:
 jobs:
  setup_torch_cuda_pipeline_matrix:
    name: Setup Torch Pipelines CUDA Slow Tests Matrix
-    runs-on: [ self-hosted, intel-cpu, 8-cpu, ci ]
-    container:
-      image: diffusers/diffusers-pytorch-cpu
+    runs-on: ubuntu-latest
    outputs:
      pipeline_test_matrix: ${{ steps.fetch_pipeline_matrix.outputs.pipeline_test_matrix }}
    steps:
@@ -31,13 +29,14 @@ jobs:
        uses: actions/checkout@v3
        with:
          fetch-depth: 2
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: "3.8"
      - name: Install dependencies
        run: |
-          python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
-          python -m uv pip install -e [quality,test]
-      - name: Environment
-        run: |
-          python utils/print_env.py
+          pip install -e .
+          pip install huggingface_hub
      - name: Fetch Pipeline Matrix
        id: fetch_pipeline_matrix
        run: |
@@ -56,13 +55,12 @@ jobs:
    needs: setup_torch_cuda_pipeline_matrix
    strategy:
      fail-fast: false
-      max-parallel: 8
      matrix:
        module: ${{ fromJson(needs.setup_torch_cuda_pipeline_matrix.outputs.pipeline_test_matrix) }}
    runs-on: [single-gpu, nvidia-gpu, t4, ci]
    container:
      image: diffusers/diffusers-pytorch-cuda
-      options: --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface/diffusers:/mnt/cache/ --gpus 0 --privileged
+      options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ --gpus 0 --privileged
    steps:
      - name: Checkout diffusers
        uses: actions/checkout@v3
@@ -87,7 +85,7 @@ jobs:
          python utils/print_env.py
      - name: Slow PyTorch CUDA checkpoint tests on Ubuntu
        env:
-          HF_TOKEN: ${{ secrets.HF_TOKEN }}
+          HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
          # https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
          CUBLAS_WORKSPACE_CONFIG: :16:8
        run: |
@@ -116,16 +114,16 @@ jobs:

  torch_cuda_tests:
    name: Torch CUDA Tests
-    runs-on: [single-gpu, nvidia-gpu, t4, ci]
+    runs-on: docker-gpu
    container:
      image: diffusers/diffusers-pytorch-cuda
-      options: --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface/diffusers:/mnt/cache/ --gpus 0
+      options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ --gpus 0
    defaults:
      run:
        shell: bash
    strategy:
      matrix:
-        module: [models, schedulers, lora, others, single_file]
+        module: [models, schedulers, lora, others]
    steps:
    - name: Checkout diffusers
      uses: actions/checkout@v3
@@ -144,7 +142,7 @@ jobs:

    - name: Run slow PyTorch CUDA tests
      env:
-        HF_TOKEN: ${{ secrets.HF_TOKEN }}
+        HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
        # https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
        CUBLAS_WORKSPACE_CONFIG: :16:8
      run: |
@@ -168,10 +166,10 @@ jobs:

  peft_cuda_tests:
    name: PEFT CUDA Tests
-    runs-on: [single-gpu, nvidia-gpu, t4, ci]
+    runs-on: docker-gpu
    container:
      image: diffusers/diffusers-pytorch-cuda
-      options: --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface/diffusers:/mnt/cache/ --gpus 0
+      options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ --gpus 0
    defaults:
      run:
        shell: bash
@@ -194,7 +192,7 @@ jobs:

    - name: Run slow PEFT CUDA tests
      env:
-        HF_TOKEN: ${{ secrets.HF_TOKEN }}
+        HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
        # https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
        CUBLAS_WORKSPACE_CONFIG: :16:8
      run: |
@@ -221,7 +219,7 @@ jobs:
    runs-on: docker-tpu
    container:
      image: diffusers/diffusers-flax-tpu
-      options: --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ --privileged
+      options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ --privileged
    defaults:
      run:
        shell: bash
@@ -243,7 +241,7 @@ jobs:

    - name: Run slow Flax TPU tests
      env:
-        HF_TOKEN: ${{ secrets.HF_TOKEN }}
+        HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
      run: |
        python -m pytest -n 0 \
          -s -v -k "Flax" \
@@ -265,10 +263,10 @@ jobs:

  onnx_cuda_tests:
    name: ONNX CUDA Tests
-    runs-on: [single-gpu, nvidia-gpu, t4, ci]
+    runs-on: docker-gpu
    container:
      image: diffusers/diffusers-onnxruntime-cuda
-      options: --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ --gpus 0
+      options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ --gpus 0
    defaults:
      run:
        shell: bash
@@ -290,7 +288,7 @@ jobs:

    - name: Run slow ONNXRuntime CUDA tests
      env:
-        HF_TOKEN: ${{ secrets.HF_TOKEN }}
+        HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
      run: |
        python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
          -s -v -k "Onnx" \
@@ -313,11 +311,11 @@ jobs:
  run_torch_compile_tests:
    name: PyTorch Compile CUDA tests

-    runs-on: [single-gpu, nvidia-gpu, t4, ci]
+    runs-on: docker-gpu

    container:
      image: diffusers/diffusers-pytorch-compile-cuda
-      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/

    steps:
    - name: Checkout diffusers
@@ -337,7 +335,7 @@ jobs:
        python utils/print_env.py
    - name: Run example tests on GPU
      env:
-        HF_TOKEN: ${{ secrets.HF_TOKEN }}
+        HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
      run: |
        python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile -s -v -k "compile" --make-reports=tests_torch_compile_cuda tests/
    - name: Failure short reports
@@ -354,11 +352,11 @@ jobs:
  run_xformers_tests:
    name: PyTorch xformers CUDA tests

-    runs-on: [single-gpu, nvidia-gpu, t4, ci]
+    runs-on: docker-gpu

    container:
      image: diffusers/diffusers-pytorch-xformers-cuda
-      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/

    steps:
    - name: Checkout diffusers
@@ -378,7 +376,7 @@ jobs:
        python utils/print_env.py
    - name: Run example tests on GPU
      env:
-        HF_TOKEN: ${{ secrets.HF_TOKEN }}
+        HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
      run: |
        python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile -s -v -k "xformers" --make-reports=tests_torch_xformers_cuda tests/
    - name: Failure short reports
@@ -395,11 +393,11 @@ jobs:
  run_examples_tests:
    name: Examples PyTorch CUDA tests on Ubuntu

-    runs-on: [single-gpu, nvidia-gpu, t4, ci]
+    runs-on: docker-gpu

    container:
      image: diffusers/diffusers-pytorch-cuda
-      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/

    steps:
    - name: Checkout diffusers
@@ -423,10 +421,9 @@ jobs:

    - name: Run example tests on GPU
      env:
-        HF_TOKEN: ${{ secrets.HF_TOKEN }}
+        HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
      run: |
        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
-        python -m uv pip install timm
        python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile -s -v --make-reports=examples_torch_cuda examples/

    - name: Failure short reports
--- a/.github/workflows/push_tests_fast.yml
+++ b/.github/workflows/push_tests_fast.yml
@@ -107,7 +107,7 @@ jobs:
      if: ${{ matrix.config.framework == 'pytorch_examples' }}
      run: |
        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
-        python -m uv pip install peft timm
+        python -m uv pip install peft
        python -m pytest -n 4 --max-worker-restart=0 --dist=loadfile \
          --make-reports=tests_${{ matrix.config.report }} \
          examples
--- a/.github/workflows/push_tests_mps.yml
+++ b/.github/workflows/push_tests_mps.yml
@@ -23,7 +23,7 @@ concurrency:
 jobs:
  run_fast_tests_apple_m1:
    name: Fast PyTorch MPS tests on MacOS
-    runs-on: macos-13-xlarge
+    runs-on: [ self-hosted, apple-m1 ]

    steps:
    - name: Checkout diffusers
@@ -59,7 +59,7 @@ jobs:
      shell: arch -arch arm64 bash {0}
      env:
        HF_HOME: /System/Volumes/Data/mnt/cache
-        HF_TOKEN: ${{ secrets.HF_TOKEN }}
+        HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
      run: |
        ${CONDA_RUN} python -m pytest -n 0 -s -v --make-reports=tests_torch_mps tests/

--- a/.github/workflows/run_tests_from_a_pr.yml
+++ b/.github/workflows/run_tests_from_a_pr.yml
@@ -1,73 +0,0 @@
-name: Check running SLOW tests from a PR (only GPU)
-
-on:
-  workflow_dispatch:
-    inputs:
-      docker_image:
-        default: 'diffusers/diffusers-pytorch-cuda'
-        description: 'Name of the Docker image'
-        required: true
-      branch: 
-        description: 'PR Branch to test on'
-        required: true
-      test:
-        description: 'Tests to run (e.g.: `tests/models`).'
-        required: true
-
-env:
-  DIFFUSERS_IS_CI: yes
-  IS_GITHUB_CI: "1"
-  HF_HOME: /mnt/cache
-  OMP_NUM_THREADS: 8
-  MKL_NUM_THREADS: 8
-  PYTEST_TIMEOUT: 600
-  RUN_SLOW: yes
-
-jobs:
-  run_tests:
-    name: "Run a test on our runner from a PR"
-    runs-on: [single-gpu, nvidia-gpu, t4, ci]
-    container:
-      image: ${{ github.event.inputs.docker_image }}
-      options: --gpus 0 --privileged --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
-
-    steps:
-      - name: Validate test files input
-        id: validate_test_files
-        env: 
-          PY_TEST: ${{ github.event.inputs.test }}
-        run: |
-          if [[ ! "$PY_TEST" =~ ^tests/ ]]; then
-            echo "Error: The input string must start with 'tests/'."
-            exit 1
-          fi
-          
-          if [[ ! "$PY_TEST" =~ ^tests/(models|pipelines) ]]; then
-            echo "Error: The input string must contain either 'models' or 'pipelines' after 'tests/'."
-            exit 1
-          fi
-          
-          if [[ "$PY_TEST" == *";"* ]]; then
-            echo "Error: The input string must not contain ';'."
-            exit 1
-          fi
-          echo "$PY_TEST"
-
-      - name: Checkout PR branch
-        uses: actions/checkout@v4
-        with:
-          ref: ${{ github.event.inputs.branch }}
-          repository: ${{ github.event.pull_request.head.repo.full_name }}
-
-
-      - name: Install pytest 
-        run: | 
-          python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
-          python -m uv pip install -e [quality,test]
-          python -m uv pip install peft
-      
-      - name: Run tests
-        env: 
-            PY_TEST: ${{ github.event.inputs.test }}
-        run: |
-          pytest "$PY_TEST"
--- a/.github/workflows/ssh-runner.yml
+++ b/.github/workflows/ssh-runner.yml
@@ -1,46 +0,0 @@
-name: SSH into runners
-
-on:
-  workflow_dispatch:
-    inputs:
-      runner_type:
-        description: 'Type of runner to test (a10 or t4)'
-        required: true
-      docker_image:
-        description: 'Name of the Docker image'
-        required: true
-
-env:
-  IS_GITHUB_CI: "1"
-  HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
-  HF_HOME: /mnt/cache
-  DIFFUSERS_IS_CI: yes
-  OMP_NUM_THREADS: 8
-  MKL_NUM_THREADS: 8
-  RUN_SLOW: yes
-
-jobs:
-  ssh_runner:
-    name: "SSH"
-    runs-on: [single-gpu, nvidia-gpu, "${{ github.event.inputs.runner_type }}", ci]
-    container:
-      image: ${{ github.event.inputs.docker_image }}
-      options: --gpus all --privileged --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
-
-    steps:
-      - name: Checkout diffusers
-        uses: actions/checkout@v3
-        with:
-          fetch-depth: 2
-
-      - name: NVIDIA-SMI
-        run: |
-          nvidia-smi
-
-      - name: Tailscale # In order to be able to SSH when a test fails
-        uses: huggingface/tailscale-action@v1
-        with:
-          authkey: ${{ secrets.TAILSCALE_SSH_AUTHKEY }}
-          slackChannel: ${{ secrets.SLACK_CIFEEDBACK_CHANNEL }}
-          slackToken: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
-          waitForSSH: true
--- a/.github/workflows/update_metadata.yml
+++ b/.github/workflows/update_metadata.yml
@@ -25,6 +25,6 @@ jobs:

      - name: Update metadata
        env:
-          HF_TOKEN: ${{ secrets.SAYAK_HF_TOKEN }}
+          HUGGING_FACE_HUB_TOKEN: ${{ secrets.SAYAK_HF_TOKEN }}
        run: |
          python utils/update_metadata.py --commit_sha ${{ github.sha }}
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -355,7 +355,7 @@ You will need basic `git` proficiency to be able to contribute to
 manual. Type `git --help` in a shell and enjoy. If you prefer books, [Pro
 Git](https://git-scm.com/book/en/v2) is a very good reference.

-Follow these steps to start contributing ([supported Python versions](https://github.com/huggingface/diffusers/blob/42f25d601a910dceadaee6c44345896b4cfa9928/setup.py#L270)):
+Follow these steps to start contributing ([supported Python versions](https://github.com/huggingface/diffusers/blob/main/setup.py#L265)):

 1. Fork the [repository](https://github.com/huggingface/diffusers) by
 clicking on the 'Fork' button on the repository's page. This creates a copy of the code
--- a/README.md
+++ b/README.md
@@ -77,7 +77,7 @@ Please refer to the [How to use Stable Diffusion in Apple Silicon](https://huggi

 ## Quickstart

-Generating outputs is super easy with 🤗 Diffusers. To generate an image from text, use the `from_pretrained` method to load any pretrained diffusion model (browse the [Hub](https://huggingface.co/models?library=diffusers&sort=downloads) for 25.000+ checkpoints):
+Generating outputs is super easy with 🤗 Diffusers. To generate an image from text, use the `from_pretrained` method to load any pretrained diffusion model (browse the [Hub](https://huggingface.co/models?library=diffusers&sort=downloads) for 22000+ checkpoints):

 ```python
 from diffusers import DiffusionPipeline
@@ -219,7 +219,7 @@ Also, say 👋 in our public Discord channel <a href="https://discord.gg/G7tWnz9
 - https://github.com/deep-floyd/IF
 - https://github.com/bentoml/BentoML
 - https://github.com/bmaltais/kohya_ss
- +11.000 other amazing GitHub repositories 💪
+- +9000 other amazing GitHub repositories 💪

 Thank you for using us ❤️.

--- a/docker/diffusers-doc-builder/Dockerfile
+++ b/docker/diffusers-doc-builder/Dockerfile
@@ -1,51 +0,0 @@
-FROM ubuntu:20.04
-LABEL maintainer="Hugging Face"
-LABEL repository="diffusers"
-
-ENV DEBIAN_FRONTEND=noninteractive
-
-RUN apt-get -y update \
-    && apt-get install -y software-properties-common \
-    && add-apt-repository ppa:deadsnakes/ppa
-
-RUN apt install -y bash \
-                   build-essential \
-                   git \
-                   git-lfs \
-                   curl \
-                   ca-certificates \
-                   libsndfile1-dev \
-                   python3.10 \
-                   python3-pip \
-                   libgl1 \
-                   zip \
-                   python3.10-venv && \
-    rm -rf /var/lib/apt/lists
-
-# make sure to use venv
-RUN python3.10 -m venv /opt/venv
-ENV PATH="/opt/venv/bin:$PATH"
-
-# pre-install the heavy dependencies (these can later be overridden by the deps from setup.py)
-RUN python3.10 -m pip install --no-cache-dir --upgrade pip uv==0.1.11 && \
-    python3.10 -m uv pip install --no-cache-dir \
-        torch \
-        torchvision \
-        torchaudio \
-        invisible_watermark \
-        --extra-index-url https://download.pytorch.org/whl/cpu && \
-    python3.10 -m uv pip install --no-cache-dir \
-        accelerate \
-        datasets \
-        hf-doc-builder \
-        huggingface-hub \
-        Jinja2 \
-        librosa \
-        numpy \
-        scipy \
-        tensorboard \
-        transformers \
-        matplotlib \
-        setuptools==69.5.1
-
-CMD ["/bin/bash"]
--- a/docker/diffusers-flax-cpu/Dockerfile
+++ b/docker/diffusers-flax-cpu/Dockerfile
@@ -4,25 +4,22 @@ LABEL repository="diffusers"

 ENV DEBIAN_FRONTEND=noninteractive

-RUN apt-get -y update \
-    && apt-get install -y software-properties-common \
-    && add-apt-repository ppa:deadsnakes/ppa
-
-RUN apt install -y bash \
-        build-essential \
-        git \
-        git-lfs \
-        curl \
-        ca-certificates \
-        libsndfile1-dev \
-        libgl1 \
-        python3.10 \
-        python3-pip \
-        python3.10-venv && \
+RUN apt update && \
+    apt install -y bash \
+                   build-essential \
+                   git \
+                   git-lfs \
+                   curl \
+                   ca-certificates \
+                   libsndfile1-dev \
+                   libgl1 \
+                   python3.8 \
+                   python3-pip \
+                   python3.8-venv && \
    rm -rf /var/lib/apt/lists

 # make sure to use venv
-RUN python3.10 -m venv /opt/venv
+RUN python3 -m venv /opt/venv
 ENV PATH="/opt/venv/bin:$PATH"

 # pre-install the heavy dependencies (these can later be overridden by the deps from setup.py)
--- a/docker/diffusers-flax-tpu/Dockerfile
+++ b/docker/diffusers-flax-tpu/Dockerfile
@@ -4,11 +4,8 @@ LABEL repository="diffusers"

 ENV DEBIAN_FRONTEND=noninteractive

-RUN apt-get -y update \
-    && apt-get install -y software-properties-common \
-    && add-apt-repository ppa:deadsnakes/ppa
-
-RUN apt install -y bash \
+RUN apt update && \
+    apt install -y bash \
                   build-essential \
                   git \
                   git-lfs \
@@ -16,13 +13,13 @@ RUN apt install -y bash \
                   ca-certificates \
                   libsndfile1-dev \
                   libgl1 \
-                   python3.10 \
+                   python3.8 \
                   python3-pip \
-                   python3.10-venv && \
+                   python3.8-venv && \
    rm -rf /var/lib/apt/lists

 # make sure to use venv
-RUN python3.10 -m venv /opt/venv
+RUN python3 -m venv /opt/venv
 ENV PATH="/opt/venv/bin:$PATH"

 # pre-install the heavy dependencies (these can later be overridden by the deps from setup.py)
--- a/docker/diffusers-onnxruntime-cpu/Dockerfile
+++ b/docker/diffusers-onnxruntime-cpu/Dockerfile
@@ -4,11 +4,8 @@ LABEL repository="diffusers"

 ENV DEBIAN_FRONTEND=noninteractive

-RUN apt-get -y update \
-    && apt-get install -y software-properties-common \
-    && add-apt-repository ppa:deadsnakes/ppa
-
-RUN apt install -y bash \
+RUN apt update && \
+    apt install -y bash \
                   build-essential \
                   git \
                   git-lfs \
@@ -16,13 +13,13 @@ RUN apt install -y bash \
                   ca-certificates \
                   libsndfile1-dev \
                   libgl1 \
-                   python3.10 \
+                   python3.8 \
                   python3-pip \
-                   python3.10-venv && \
+                   python3.8-venv && \
    rm -rf /var/lib/apt/lists

 # make sure to use venv
-RUN python3.10 -m venv /opt/venv
+RUN python3 -m venv /opt/venv
 ENV PATH="/opt/venv/bin:$PATH"

 # pre-install the heavy dependencies (these can later be overridden by the deps from setup.py)
--- a/docker/diffusers-onnxruntime-cuda/Dockerfile
+++ b/docker/diffusers-onnxruntime-cuda/Dockerfile
@@ -4,11 +4,8 @@ LABEL repository="diffusers"

 ENV DEBIAN_FRONTEND=noninteractive

-RUN apt-get -y update \
-    && apt-get install -y software-properties-common \
-    && add-apt-repository ppa:deadsnakes/ppa
-
-RUN apt install -y bash \
+RUN apt update && \
+    apt install -y bash \
                   build-essential \
                   git \
                   git-lfs \
@@ -16,24 +13,24 @@ RUN apt install -y bash \
                   ca-certificates \
                   libsndfile1-dev \
                   libgl1 \
-                   python3.10 \
+                   python3.8 \
                   python3-pip \
-                   python3.10-venv && \
+                   python3.8-venv && \
    rm -rf /var/lib/apt/lists

 # make sure to use venv
-RUN python3.10 -m venv /opt/venv
+RUN python3 -m venv /opt/venv
 ENV PATH="/opt/venv/bin:$PATH"

 # pre-install the heavy dependencies (these can later be overridden by the deps from setup.py)
-RUN python3.10 -m pip install --no-cache-dir --upgrade pip uv==0.1.11 && \
-    python3.10 -m uv pip install --no-cache-dir \
+RUN python3 -m pip install --no-cache-dir --upgrade pip uv==0.1.11 && \
+    python3 -m uv pip install --no-cache-dir \
        torch \
        torchvision \
        torchaudio \
        "onnxruntime-gpu>=1.13.1" \
        --extra-index-url https://download.pytorch.org/whl/cu117 && \
-    python3.10 -m uv pip install --no-cache-dir \
+    python3 -m uv pip install --no-cache-dir \
        accelerate \
        datasets \
        hf-doc-builder \
--- a/docker/diffusers-pytorch-compile-cuda/Dockerfile
+++ b/docker/diffusers-pytorch-compile-cuda/Dockerfile
@@ -4,11 +4,8 @@ LABEL repository="diffusers"

 ENV DEBIAN_FRONTEND=noninteractive

-RUN apt-get -y update \
-    && apt-get install -y software-properties-common \
-    && add-apt-repository ppa:deadsnakes/ppa
-
-RUN apt install -y bash \
+RUN apt update && \
+    apt install -y bash \
    build-essential \
    git \
    git-lfs \
@@ -16,23 +13,24 @@ RUN apt install -y bash \
    ca-certificates \
    libsndfile1-dev \
    libgl1 \
-    python3.10 \
+    python3.9 \
+    python3.9-dev \
    python3-pip \
-    python3.10-venv && \
+    python3.9-venv && \
    rm -rf /var/lib/apt/lists

 # make sure to use venv
-RUN python3.10 -m venv /opt/venv
+RUN python3.9 -m venv /opt/venv
 ENV PATH="/opt/venv/bin:$PATH"

 # pre-install the heavy dependencies (these can later be overridden by the deps from setup.py)
-RUN python3.10 -m pip install --no-cache-dir --upgrade pip uv==0.1.11 && \
-    python3.10 -m uv pip install --no-cache-dir \
+RUN python3.9 -m pip install --no-cache-dir --upgrade pip uv==0.1.11 && \
+    python3.9 -m uv pip install --no-cache-dir \
    torch \
    torchvision \
    torchaudio \
    invisible_watermark && \
-    python3.10 -m pip install --no-cache-dir \
+    python3.9 -m pip install --no-cache-dir \
    accelerate \
    datasets \
    hf-doc-builder \
--- a/docker/diffusers-pytorch-cpu/Dockerfile
+++ b/docker/diffusers-pytorch-cpu/Dockerfile
@@ -4,36 +4,33 @@ LABEL repository="diffusers"

 ENV DEBIAN_FRONTEND=noninteractive

-RUN apt-get -y update \
-    && apt-get install -y software-properties-common \
-    && add-apt-repository ppa:deadsnakes/ppa
-
-RUN apt install -y bash \
+RUN apt update && \
+    apt install -y bash \
                   build-essential \
                   git \
                   git-lfs \
                   curl \
                   ca-certificates \
                   libsndfile1-dev \
-                   python3.10 \
+                   python3.8 \
                   python3-pip \
                   libgl1 \
-                   python3.10-venv && \
+                   python3.8-venv && \
    rm -rf /var/lib/apt/lists

 # make sure to use venv
-RUN python3.10 -m venv /opt/venv
+RUN python3 -m venv /opt/venv
 ENV PATH="/opt/venv/bin:$PATH"

 # pre-install the heavy dependencies (these can later be overridden by the deps from setup.py)
-RUN python3.10 -m pip install --no-cache-dir --upgrade pip uv==0.1.11 && \
-    python3.10 -m uv pip install --no-cache-dir \
+RUN python3 -m pip install --no-cache-dir --upgrade pip uv==0.1.11 && \
+    python3 -m uv pip install --no-cache-dir \
        torch \
        torchvision \
        torchaudio \
        invisible_watermark \
        --extra-index-url https://download.pytorch.org/whl/cpu && \
-    python3.10 -m uv pip install --no-cache-dir \
+    python3 -m uv pip install --no-cache-dir \
        accelerate \
        datasets \
        hf-doc-builder \
--- a/docker/diffusers-pytorch-cuda/Dockerfile
+++ b/docker/diffusers-pytorch-cuda/Dockerfile
@@ -4,11 +4,8 @@ LABEL repository="diffusers"

 ENV DEBIAN_FRONTEND=noninteractive

-RUN apt-get -y update \
-    && apt-get install -y software-properties-common \
-    && add-apt-repository ppa:deadsnakes/ppa
-
-RUN apt install -y bash \
+RUN apt update && \
+    apt install -y bash \
    build-essential \
    git \
    git-lfs \
@@ -16,23 +13,23 @@ RUN apt install -y bash \
    ca-certificates \
    libsndfile1-dev \
    libgl1 \
-    python3.10 \
+    python3.8 \
    python3-pip \
-    python3.10-venv && \
+    python3.8-venv && \
    rm -rf /var/lib/apt/lists

 # make sure to use venv
-RUN python3.10 -m venv /opt/venv
+RUN python3 -m venv /opt/venv
 ENV PATH="/opt/venv/bin:$PATH"

 # pre-install the heavy dependencies (these can later be overridden by the deps from setup.py)
-RUN python3.10 -m pip install --no-cache-dir --upgrade pip uv==0.1.11 && \
-    python3.10 -m uv pip install --no-cache-dir \
+RUN python3 -m pip install --no-cache-dir --upgrade pip uv==0.1.11 && \
+    python3 -m uv pip install --no-cache-dir \
    torch \
    torchvision \
    torchaudio \
    invisible_watermark && \
-    python3.10 -m pip install --no-cache-dir \
+    python3 -m pip install --no-cache-dir \
    accelerate \
    datasets \
    hf-doc-builder \
--- a/docker/diffusers-pytorch-xformers-cuda/Dockerfile
+++ b/docker/diffusers-pytorch-xformers-cuda/Dockerfile
@@ -4,11 +4,8 @@ LABEL repository="diffusers"

 ENV DEBIAN_FRONTEND=noninteractive

-RUN apt-get -y update \
-    && apt-get install -y software-properties-common \
-    && add-apt-repository ppa:deadsnakes/ppa
-
-RUN apt install -y bash \
+RUN apt update && \
+    apt install -y bash \
                   build-essential \
                   git \
                   git-lfs \
@@ -16,23 +13,23 @@ RUN apt install -y bash \
                   ca-certificates \
                   libsndfile1-dev \
                   libgl1 \
-                   python3.10 \
+                   python3.8 \
                   python3-pip \
-                   python3.10-venv && \
+                   python3.8-venv && \
    rm -rf /var/lib/apt/lists

 # make sure to use venv
-RUN python3.10 -m venv /opt/venv
+RUN python3 -m venv /opt/venv
 ENV PATH="/opt/venv/bin:$PATH"

 # pre-install the heavy dependencies (these can later be overridden by the deps from setup.py)
-RUN python3.10 -m pip install --no-cache-dir --upgrade pip uv==0.1.11 && \
-    python3.10 -m pip install --no-cache-dir \
+RUN python3 -m pip install --no-cache-dir --upgrade pip uv==0.1.11 && \
+    python3 -m pip install --no-cache-dir \
        torch \
        torchvision \
        torchaudio \
        invisible_watermark && \
-    python3.10 -m uv pip install --no-cache-dir \
+    python3 -m uv pip install --no-cache-dir \
        accelerate \
        datasets \
        hf-doc-builder \
--- a/docs/README.md
+++ b/docs/README.md
@@ -242,10 +242,10 @@ Here's an example of a tuple return, comprising several objects:

 ```
    Returns:
-        `tuple(torch.Tensor)` comprising various elements depending on the configuration ([`BertConfig`]) and inputs:
-        - ** loss** (*optional*, returned when `masked_lm_labels` is provided) `torch.Tensor` of shape `(1,)` --
+        `tuple(torch.FloatTensor)` comprising various elements depending on the configuration ([`BertConfig`]) and inputs:
+        - ** loss** (*optional*, returned when `masked_lm_labels` is provided) `torch.FloatTensor` of shape `(1,)` --
          Total loss is the sum of the masked language modeling loss and the next sequence prediction (classification) loss.
-        - **prediction_scores** (`torch.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`) --
+        - **prediction_scores** (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`) --
          Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
 ```

--- a/docs/source/en/_toctree.yml
+++ b/docs/source/en/_toctree.yml
@@ -62,11 +62,13 @@
  - local: using-diffusers/callback
    title: Pipeline callbacks
  - local: using-diffusers/reusing_seeds
-    title: Reproducible pipelines
-  - local: using-diffusers/image_quality
-    title: Controlling image quality
+    title: Improve image quality with deterministic generation
+  - local: using-diffusers/control_brightness
+    title: Control image brightness
  - local: using-diffusers/weighted_prompts
    title: Prompt techniques
+  - local: using-diffusers/freeu
+    title: Improve generation quality with FreeU
  title: Inference techniques
 - sections:
  - local: using-diffusers/sdxl
@@ -81,20 +83,26 @@
    title: ControlNet
  - local: using-diffusers/t2i_adapter
    title: T2I-Adapter
-  - local: using-diffusers/inference_with_lcm
-    title: Latent Consistency Model
  - local: using-diffusers/textual_inversion_inference
    title: Textual inversion
  - local: using-diffusers/shap-e
    title: Shap-E
  - local: using-diffusers/diffedit
    title: DiffEdit
+  - local: using-diffusers/reproducibility
+    title: Create reproducible pipelines
+  - local: using-diffusers/custom_pipeline_examples
+    title: Community pipelines
+  - local: using-diffusers/contribute_pipeline
+    title: Contribute a community pipeline
+  - local: using-diffusers/inference_with_lcm_lora
+    title: Latent Consistency Model-LoRA
+  - local: using-diffusers/inference_with_lcm
+    title: Latent Consistency Model
  - local: using-diffusers/inference_with_tcd_lora
    title: Trajectory Consistency Distillation-LoRA
  - local: using-diffusers/svd
    title: Stable Video Diffusion
-  - local: using-diffusers/marigold_usage
-    title: Marigold Computer Vision
  title: Specific pipeline examples
 - sections:
  - local: training/overview
@@ -141,6 +149,8 @@
 - sections:
  - local: optimization/fp16
    title: Speed up inference
+  - local: using-diffusers/distilled_sd
+    title: Distilled Stable Diffusion inference
  - local: optimization/memory
    title: Reduce memory usage
  - local: optimization/torch2.0
@@ -297,8 +307,6 @@
      title: Latent Diffusion
    - local: api/pipelines/ledits_pp
      title: LEDITS++
-    - local: api/pipelines/marigold
-      title: Marigold
    - local: api/pipelines/panorama
      title: MultiDiffusion
    - local: api/pipelines/musicldm
@@ -309,8 +317,6 @@
      title: Personalized Image Animator (PIA)
    - local: api/pipelines/pixart
      title: PixArt-α
-    - local: api/pipelines/pixart_sigma
-      title: PixArt-Σ
    - local: api/pipelines/self_attention_guidance
      title: Self-Attention Guidance
    - local: api/pipelines/semantic_stable_diffusion
@@ -445,8 +451,6 @@
      title: Utilities
    - local: api/image_processor
      title: VAE Image Processor
-    - local: api/video_processor
-      title: Video Processor
    title: Internal classes
    isExpanded: false
-  title: API
+  title: API
--- a/docs/source/en/api/attnprocessor.md
+++ b/docs/source/en/api/attnprocessor.md
@@ -55,6 +55,3 @@ An attention processor is a class for applying different types of attention mech

 ## XFormersAttnProcessor
 [[autodoc]] models.attention_processor.XFormersAttnProcessor
-
-## AttnProcessorNPU
-[[autodoc]] models.attention_processor.AttnProcessorNPU
--- a/docs/source/en/api/image_processor.md
+++ b/docs/source/en/api/image_processor.md
@@ -25,11 +25,3 @@ All pipelines with [`VaeImageProcessor`] accept PIL Image, PyTorch tensor, or Nu
 The [`VaeImageProcessorLDM3D`] accepts RGB and depth inputs and returns RGB and depth outputs.

 [[autodoc]] image_processor.VaeImageProcessorLDM3D
-
-## PixArtImageProcessor
-
-[[autodoc]] image_processor.PixArtImageProcessor
-
-## IPAdapterMaskProcessor
-
-[[autodoc]] image_processor.IPAdapterMaskProcessor
--- a/docs/source/en/api/loaders/single_file.md
+++ b/docs/source/en/api/loaders/single_file.md
@@ -10,134 +10,13 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
 specific language governing permissions and limitations under the License.
 -->

-# Loading Pipelines and Models via `from_single_file`
+# Single files

-The `from_single_file` method allows you to load supported pipelines using a single checkpoint file as opposed to Diffusers' multiple folders format. This is useful if you are working with Stable Diffusion Web UI's (such as A1111) that rely on a single file format to distribute all the components of a model.
+Diffusers supports loading pretrained pipeline (or model) weights stored in a single file, such as a `ckpt` or `safetensors` file. These single file types are typically produced from community trained models. There are three classes for loading single file weights:

-The `from_single_file` method also supports loading models in their originally distributed format. This means that supported models that have been finetuned with other services can be loaded directly into Diffusers model objects and pipelines.
-
-## Pipelines that currently support `from_single_file` loading
-
- [`StableDiffusionPipeline`]
- [`StableDiffusionImg2ImgPipeline`]
- [`StableDiffusionInpaintPipeline`]
- [`StableDiffusionControlNetPipeline`]
- [`StableDiffusionControlNetImg2ImgPipeline`]
- [`StableDiffusionControlNetInpaintPipeline`]
- [`StableDiffusionUpscalePipeline`]
- [`StableDiffusionXLPipeline`]
- [`StableDiffusionXLImg2ImgPipeline`]
- [`StableDiffusionXLInpaintPipeline`]
- [`StableDiffusionXLInstructPix2PixPipeline`]
- [`StableDiffusionXLControlNetPipeline`]
- [`StableDiffusionXLKDiffusionPipeline`]
- [`LatentConsistencyModelPipeline`]
- [`LatentConsistencyModelImg2ImgPipeline`]
- [`StableDiffusionControlNetXSPipeline`]
- [`StableDiffusionXLControlNetXSPipeline`]
- [`LEditsPPPipelineStableDiffusion`]
- [`LEditsPPPipelineStableDiffusionXL`]
- [`PIAPipeline`]
-
-## Models that currently support `from_single_file` loading
-
- [`UNet2DConditionModel`]
- [`StableCascadeUNet`]
- [`AutoencoderKL`]
- [`ControlNetModel`]
-
-## Usage Examples
-
-## Loading a Pipeline using `from_single_file`
-
-```python
-from diffusers import StableDiffusionXLPipeline
-
-ckpt_path = "https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/blob/main/sd_xl_base_1.0_0.9vae.safetensors"
-pipe = StableDiffusionXLPipeline.from_single_file(ckpt_path)
-```
-
-## Setting components in a Pipeline using `from_single_file`
-
-Set components of a pipeline by passing them directly to the `from_single_file` method. For example, here we are swapping out the pipeline's default scheduler with the `DDIMScheduler`.
-
-```python
-from diffusers import StableDiffusionXLPipeline, DDIMScheduler
-
-ckpt_path = "https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/blob/main/sd_xl_base_1.0_0.9vae.safetensors"
-
-scheduler = DDIMScheduler()
-pipe = StableDiffusionXLPipeline.from_single_file(ckpt_path, scheduler=scheduler)
-
-```
-
-Here we are passing in a ControlNet model to the `StableDiffusionControlNetPipeline`.
-
-```python
-from diffusers import StableDiffusionControlNetPipeline, ControlNetModel
-
-ckpt_path = "https://huggingface.co/runwayml/stable-diffusion-v1-5/blob/main/v1-5-pruned-emaonly.safetensors"
-
-controlnet = ControlNetModel.from_pretrained("lllyasviel/control_v11p_sd15_canny")
-pipe = StableDiffusionControlNetPipeline.from_single_file(ckpt_path, controlnet=controlnet)
-
-```
-
-## Loading a Model using `from_single_file`
-
-```python
-from diffusers import StableCascadeUNet
-
-ckpt_path = "https://huggingface.co/stabilityai/stable-cascade/blob/main/stage_b_lite.safetensors"
-model = StableCascadeUNet.from_single_file(ckpt_path)
-
-```
-
-## Using a Diffusers model repository to configure single file loading
-
-Under the hood, `from_single_file` will try to automatically determine a model repository to use to configure the components of a pipeline. You can also explicitly set the model repository to configure the pipeline with the `config` argument.
-
-```python
-from diffusers import StableDiffusionXLPipeline
-
-ckpt_path = "https://huggingface.co/segmind/SSD-1B/blob/main/SSD-1B.safetensors"
-repo_id = "segmind/SSD-1B"
-
-pipe = StableDiffusionXLPipeline.from_single_file(ckpt_path, config=repo_id)
-
-```
-
-In the example above, since we explicitly passed `repo_id="segmind/SSD-1B"` to the `config` argument, it will use this [configuration file](https://huggingface.co/segmind/SSD-1B/blob/main/unet/config.json) from the `unet` subfolder in `"segmind/SSD-1B"` to configure the `unet` component of the pipeline; Similarly, it will use the `config.json` file from `vae` subfolder to configure the `vae` model, `config.json` file from `text_encoder` folder to configure `text_encoder` and so on.
-
-<Tip>
-
-Most of the time you do not need to explicitly set a `config` argument. `from_single_file` will automatically map the checkpoint to the appropriate model repository. However, this option can be useful in cases where model components in the checkpoint might have been changed from what was originally distributed, or in cases where a checkpoint file might not have the necessary metadata to correctly determine the configuration to use for the pipeline.
-
-</Tip>
-
-## Override configuration options when using single file loading
-
-Override the default model or pipeline configuration options by providing the relevant arguments directly to the `from_single_file` method. Any argument supported by the model or pipeline class can be configured in this way:
-
-### Setting a pipeline configuration option
-
-```python
-from diffusers import StableDiffusionXLInstructPix2PixPipeline
-
-ckpt_path = "https://huggingface.co/stabilityai/cosxl/blob/main/cosxl_edit.safetensors"
-pipe = StableDiffusionXLInstructPix2PixPipeline.from_single_file(ckpt_path, config="diffusers/sdxl-instructpix2pix-768", is_cosxl_edit=True)
-
-```
-
-### Setting a model configuration option
-
-```python
-from diffusers import UNet2DConditionModel
-
-ckpt_path = "https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/blob/main/sd_xl_base_1.0_0.9vae.safetensors"
-model = UNet2DConditionModel.from_single_file(ckpt_path, upcast_attention=True)
-
-```
+- [`FromSingleFileMixin`] supports loading pretrained pipeline weights stored in a single file, which can either be a `ckpt` or `safetensors` file.
+- [`FromOriginalVAEMixin`] supports loading a pretrained [`AutoencoderKL`] from pretrained weights stored in a single file, which can either be a `ckpt` or `safetensors` file.
+- [`FromOriginalControlNetMixin`] supports loading pretrained ControlNet weights stored in a single file, which can either be a `ckpt` or `safetensors` file.

 <Tip>

@@ -145,116 +24,14 @@ To learn more about how to load single file weights, see the [Load different Sta

 </Tip>

-## Working with local files
-
-As of `diffusers>=0.28.0` the `from_single_file` method will attempt to configure a pipeline or model by first inferring the model type from the keys in the checkpoint file. This inferred model type is then used to determine the appropriate model repository on the Hugging Face Hub to configure the model or pipeline.
-
-For example, any single file checkpoint based on the Stable Diffusion XL base model will use the [`stabilityai/stable-diffusion-xl-base-1.0`](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0) model repository to configure the pipeline.
-
-If you are working in an environment with restricted internet access, it is recommended that you download the config files and checkpoints for the model to your preferred directory and pass the local paths to the `pretrained_model_link_or_path` and `config` arguments of the `from_single_file` method.
-
-```python
-from huggingface_hub import hf_hub_download, snapshot_download
-
-my_local_checkpoint_path = hf_hub_download(
-    repo_id="segmind/SSD-1B",
-    filename="SSD-1B.safetensors"
-)
-
-my_local_config_path = snapshot_download(
-    repo_id="segmind/SSD-1B",
-    allowed_patterns=["*.json", "**/*.json", "*.txt", "**/*.txt"]
-)
-
-pipe = StableDiffusionXLPipeline.from_single_file(my_local_checkpoint_path, config=my_local_config_path, local_files_only=True)
-
-```
-
-By default this will download the checkpoints and config files to the [Hugging Face Hub cache directory](https://huggingface.co/docs/huggingface_hub/en/guides/manage-cache). You can also specify a local directory to download the files to by passing the `local_dir` argument to the `hf_hub_download` and `snapshot_download` functions.
-
-```python
-from huggingface_hub import hf_hub_download, snapshot_download
-
-my_local_checkpoint_path = hf_hub_download(
-    repo_id="segmind/SSD-1B",
-    filename="SSD-1B.safetensors"
-    local_dir="my_local_checkpoints"
-)
-
-my_local_config_path = snapshot_download(
-    repo_id="segmind/SSD-1B",
-    allowed_patterns=["*.json", "**/*.json", "*.txt", "**/*.txt"]
-    local_dir="my_local_config"
-)
-
-pipe = StableDiffusionXLPipeline.from_single_file(my_local_checkpoint_path, config=my_local_config_path, local_files_only=True)
-
-```
-
-## Working with local files on file systems that do not support symlinking
-
-By default the `from_single_file` method relies on the `huggingface_hub` caching mechanism to fetch and store checkpoints and config files for models and pipelines. If you are working with a file system that does not support symlinking, it is recommended that you first download the checkpoint file to a local directory and disable symlinking by passing the `local_dir_use_symlink=False` argument to the `hf_hub_download` and `snapshot_download` functions.
-
-```python
-from huggingface_hub import hf_hub_download, snapshot_download
-
-my_local_checkpoint_path = hf_hub_download(
-    repo_id="segmind/SSD-1B",
-    filename="SSD-1B.safetensors"
-    local_dir="my_local_checkpoints",
-    local_dir_use_symlinks=False
-)
-print("My local checkpoint: ", my_local_checkpoint_path)
-
-my_local_config_path = snapshot_download(
-    repo_id="segmind/SSD-1B",
-    allowed_patterns=["*.json", "**/*.json", "*.txt", "**/*.txt"]
-    local_dir_use_symlinks=False,
-)
-print("My local config: ", my_local_config_path)
-
-```
-
-Then pass the local paths to the `pretrained_model_link_or_path` and `config` arguments of the `from_single_file` method.
-
-```python
-pipe = StableDiffusionXLPipeline.from_single_file(my_local_checkpoint_path, config=my_local_config_path, local_files_only=True)
-
-```
-
-<Tip>
-
-As of `huggingface_hub>=0.23.0` the `local_dir_use_symlinks` argument isn't necessary for the `hf_hub_download` and `snapshot_download` functions.
-
-</Tip>
-
-## Using the original configuration file of a model
-
-If you would like to configure the model components in a pipeline using the orignal YAML configuration file, you can pass a local path or url to the original configuration file via the `original_config` argument.
-
-```python
-from diffusers import StableDiffusionXLPipeline
-
-ckpt_path = "https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/blob/main/sd_xl_base_1.0_0.9vae.safetensors"
-repo_id = "stabilityai/stable-diffusion-xl-base-1.0"
-original_config = "https://raw.githubusercontent.com/Stability-AI/generative-models/main/configs/inference/sd_xl_base.yaml"
-
-pipe = StableDiffusionXLPipeline.from_single_file(ckpt_path, original_config=original_config)
-```
-
-<Tip>
-
-When using `original_config` with `local_files_only=True`, Diffusers will attempt to infer the components of the pipeline based on the type signatures of pipeline class, rather than attempting to fetch the configuration files from a model repository on the Hugging Face Hub. This is to prevent backward breaking changes in existing code that might not be able to connect to the internet to fetch the necessary configuration files.
-
-This is not as reliable as providing a path to a local model repository using the `config` argument and might lead to errors when configuring the pipeline. To avoid this, please run the pipeline with `local_files_only=False` once to download the appropriate pipeline configuration files to the local cache.
-
-</Tip>
-
-
 ## FromSingleFileMixin

 [[autodoc]] loaders.single_file.FromSingleFileMixin

-## FromOriginalModelMixin
+## FromOriginalVAEMixin

-[[autodoc]] loaders.single_file_model.FromOriginalModelMixin
+[[autodoc]] loaders.autoencoder.FromOriginalVAEMixin
+
+## FromOriginalControlNetMixin
+
+[[autodoc]] loaders.controlnet.FromOriginalControlNetMixin
--- a/docs/source/en/api/pipelines/animatediff.md
+++ b/docs/source/en/api/pipelines/animatediff.md
@@ -101,53 +101,6 @@ AnimateDiff tends to work better with finetuned Stable Diffusion models. If you

 </Tip>

-### AnimateDiffSDXLPipeline
-
-AnimateDiff can also be used with SDXL models. This is currently an experimental feature as only a beta release of the motion adapter checkpoint is available.
-
-```python
-import torch
-from diffusers.models import MotionAdapter
-from diffusers import AnimateDiffSDXLPipeline, DDIMScheduler
-from diffusers.utils import export_to_gif
-
-adapter = MotionAdapter.from_pretrained("guoyww/animatediff-motion-adapter-sdxl-beta", torch_dtype=torch.float16)
-
-model_id = "stabilityai/stable-diffusion-xl-base-1.0"
-scheduler = DDIMScheduler.from_pretrained(
-    model_id,
-    subfolder="scheduler",
-    clip_sample=False,
-    timestep_spacing="linspace",
-    beta_schedule="linear",
-    steps_offset=1,
-)
-pipe = AnimateDiffSDXLPipeline.from_pretrained(
-    model_id,
-    motion_adapter=adapter,
-    scheduler=scheduler,
-    torch_dtype=torch.float16,
-    variant="fp16",
-).to("cuda")
-
-# enable memory savings
-pipe.enable_vae_slicing()
-pipe.enable_vae_tiling()
-
-output = pipe(
-    prompt="a panda surfing in the ocean, realistic, high quality",
-    negative_prompt="low quality, worst quality",
-    num_inference_steps=20,
-    guidance_scale=8,
-    width=1024,
-    height=1024,
-    num_frames=16,
-)
-
-frames = output.frames[0]
-export_to_gif(frames, "animation.gif")
-```
-
 ### AnimateDiffVideoToVideoPipeline

 AnimateDiff can also be used to generate visually similar videos or enable style/character/background or other edits starting from an initial video, allowing you to seamlessly explore creative possibilities.
@@ -569,12 +522,6 @@ export_to_gif(frames, "animatelcm-motion-lora.gif")
  - all
  - __call__

-## AnimateDiffSDXLPipeline
-
-[[autodoc]] AnimateDiffSDXLPipeline
-  - all
-  - __call__
-
 ## AnimateDiffVideoToVideoPipeline

 [[autodoc]] AnimateDiffVideoToVideoPipeline
--- a/docs/source/en/api/pipelines/i2vgenxl.md
+++ b/docs/source/en/api/pipelines/i2vgenxl.md
@@ -47,7 +47,6 @@ Sample output with I2VGenXL:
 * Unlike SVD, it additionally accepts text prompts as inputs.
 * It can generate higher resolution videos.
 * When using the [`DDIMScheduler`] (which is default for this pipeline), less than 50 steps for inference leads to bad results.
-* This implementation is 1-stage variant of I2VGenXL. The main figure in the [I2VGen-XL](https://arxiv.org/abs/2311.04145) paper shows a 2-stage variant, however, 1-stage variant works well. See [this discussion](https://github.com/huggingface/diffusers/discussions/7952) for more details.

 ## I2VGenXLPipeline
 [[autodoc]] I2VGenXLPipeline
--- a/docs/source/en/api/pipelines/marigold.md
+++ b/docs/source/en/api/pipelines/marigold.md
@@ -1,76 +0,0 @@
-<!--Copyright 2024 Marigold authors and The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-->
-
-# Marigold Pipelines for Computer Vision Tasks
-
-![marigold](https://marigoldmonodepth.github.io/images/teaser_collage_compressed.jpg)
-
-Marigold was proposed in [Repurposing Diffusion-Based Image Generators for Monocular Depth Estimation](https://huggingface.co/papers/2312.02145), a CVPR 2024 Oral paper by [Bingxin Ke](http://www.kebingxin.com/), [Anton Obukhov](https://www.obukhov.ai/), [Shengyu Huang](https://shengyuh.github.io/), [Nando Metzger](https://nandometzger.github.io/), [Rodrigo Caye Daudt](https://rcdaudt.github.io/), and [Konrad Schindler](https://scholar.google.com/citations?user=FZuNgqIAAAAJ&hl=en). 
-The idea is to repurpose the rich generative prior of Text-to-Image Latent Diffusion Models (LDMs) for traditional computer vision tasks. 
-Initially, this idea was explored to fine-tune Stable Diffusion for Monocular Depth Estimation, as shown in the teaser above. 
-Later, 
- [Tianfu Wang](https://tianfwang.github.io/) trained the first Latent Consistency Model (LCM) of Marigold, which unlocked fast single-step inference;
- [Kevin Qu](https://www.linkedin.com/in/kevin-qu-b3417621b/?locale=en_US) extended the approach to Surface Normals Estimation;
- [Anton Obukhov](https://www.obukhov.ai/) contributed the pipelines and documentation into diffusers (enabled and supported by [YiYi Xu](https://yiyixuxu.github.io/) and [Sayak Paul](https://sayak.dev/)).
-
-The abstract from the paper is:
-
-*Monocular depth estimation is a fundamental computer vision task. Recovering 3D depth from a single image is geometrically ill-posed and requires scene understanding, so it is not surprising that the rise of deep learning has led to a breakthrough. The impressive progress of monocular depth estimators has mirrored the growth in model capacity, from relatively modest CNNs to large Transformer architectures. Still, monocular depth estimators tend to struggle when presented with images with unfamiliar content and layout, since their knowledge of the visual world is restricted by the data seen during training, and challenged by zero-shot generalization to new domains. This motivates us to explore whether the extensive priors captured in recent generative diffusion models can enable better, more generalizable depth estimation. We introduce Marigold, a method for affine-invariant monocular depth estimation that is derived from Stable Diffusion and retains its rich prior knowledge. The estimator can be fine-tuned in a couple of days on a single GPU using only synthetic training data. It delivers state-of-the-art performance across a wide range of datasets, including over 20% performance gains in specific cases. Project page: https://marigoldmonodepth.github.io.*
-
-## Available Pipelines
-
-Each pipeline supports one Computer Vision task, which takes an input RGB image as input and produces a *prediction* of the modality of interest, such as a depth map of the input image. 
-Currently, the following tasks are implemented:
-
-| Pipeline                                                                                                                                    | Predicted Modalities                                                                                             |                                                                       Demos                                                                        |
-|---------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------|:--------------------------------------------------------------------------------------------------------------------------------------------------:|
-| [MarigoldDepthPipeline](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/marigold/pipeline_marigold_depth.py)     | [Depth](https://en.wikipedia.org/wiki/Depth_map), [Disparity](https://en.wikipedia.org/wiki/Binocular_disparity) | [Fast Demo (LCM)](https://huggingface.co/spaces/prs-eth/marigold-lcm), [Slow Original Demo (DDIM)](https://huggingface.co/spaces/prs-eth/marigold) |
-| [MarigoldNormalsPipeline](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/marigold/pipeline_marigold_normals.py) | [Surface normals](https://en.wikipedia.org/wiki/Normal_mapping)                                                  |                                   [Fast Demo (LCM)](https://huggingface.co/spaces/prs-eth/marigold-normals-lcm)                                    |
-
-
-## Available Checkpoints
-
-The original checkpoints can be found under the [PRS-ETH](https://huggingface.co/prs-eth/) Hugging Face organization. 
-
-<Tip>
-
-Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-components-across-pipelines) section to learn how to efficiently load the same components into multiple pipelines. Also, to learn more about reducing the memory usage of this pipeline, refer to the ["Reduce memory usage"](../../using-diffusers/svd#reduce-memory-usage) section.
-
-</Tip>
-
-<Tip warning={true}>
-
-Marigold pipelines were designed and tested only with `DDIMScheduler` and `LCMScheduler`. 
-Depending on the scheduler, the number of inference steps required to get reliable predictions varies, and there is no universal value that works best across schedulers.
-Because of that, the default value of `num_inference_steps` in the `__call__` method of the pipeline is set to `None` (see the API reference). 
-Unless set explicitly, its value will be taken from the checkpoint configuration `model_index.json`. 
-This is done to ensure high-quality predictions when calling the pipeline with just the `image` argument. 
-
-</Tip>
-
-See also Marigold [usage examples](marigold_usage).
-
-## MarigoldDepthPipeline
-[[autodoc]] MarigoldDepthPipeline
-	- all
-	- __call__
-
-## MarigoldNormalsPipeline
-[[autodoc]] MarigoldNormalsPipeline
-	- all
-	- __call__
-
-## MarigoldDepthOutput
-[[autodoc]] pipelines.marigold.pipeline_marigold_depth.MarigoldDepthOutput
-
-## MarigoldNormalsOutput
-[[autodoc]] pipelines.marigold.pipeline_marigold_normals.MarigoldNormalsOutput
--- a/docs/source/en/api/pipelines/overview.md
+++ b/docs/source/en/api/pipelines/overview.md
@@ -97,11 +97,6 @@ The table below lists all the pipelines currently available in 🤗 Diffusers an
 	- to
 	- components

-
-[[autodoc]] pipelines.StableDiffusionMixin.enable_freeu
-
-[[autodoc]] pipelines.StableDiffusionMixin.disable_freeu
-
 ## FlaxDiffusionPipeline

 [[autodoc]] pipelines.pipeline_flax_utils.FlaxDiffusionPipeline
--- a/docs/source/en/api/pipelines/pixart.md
+++ b/docs/source/en/api/pipelines/pixart.md
@@ -31,7 +31,7 @@ Some notes about this pipeline:

 <Tip>

-Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers.md) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading.md#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines.
+Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-components-across-pipelines) section to learn how to efficiently load the same components into multiple pipelines.

 </Tip>

--- a/docs/source/en/api/pipelines/pixart_sigma.md
+++ b/docs/source/en/api/pipelines/pixart_sigma.md
@@ -1,151 +0,0 @@
-<!--Copyright 2024 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-->
-
-# PixArt-Σ
-
-![](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/pixart/header_collage_sigma.jpg)
-
-[PixArt-Σ: Weak-to-Strong Training of Diffusion Transformer for 4K Text-to-Image Generation](https://huggingface.co/papers/2403.04692) is by Junsong Chen, Jincheng Yu, Chongjian Ge, Lewei Yao, Enze Xie, Yue Wu, Zhongdao Wang, James Kwok, Ping Luo, Huchuan Lu, and Zhenguo Li.
-
-The abstract from the paper is:
-
-*In this paper, we introduce PixArt-Σ, a Diffusion Transformer model (DiT) capable of directly generating images at 4K resolution. PixArt-Σ represents a significant advancement over its predecessor, PixArt-α, offering images of markedly higher fidelity and improved alignment with text prompts. A key feature of PixArt-Σ is its training efficiency. Leveraging the foundational pre-training of PixArt-α, it evolves from the ‘weaker’ baseline to a ‘stronger’ model via incorporating higher quality data, a process we term “weak-to-strong training”. The advancements in PixArt-Σ are twofold: (1) High-Quality Training Data: PixArt-Σ incorporates superior-quality image data, paired with more precise and detailed image captions. (2) Efficient Token Compression: we propose a novel attention module within the DiT framework that compresses both keys and values, significantly improving efficiency and facilitating ultra-high-resolution image generation. Thanks to these improvements, PixArt-Σ achieves superior image quality and user prompt adherence capabilities with significantly smaller model size (0.6B parameters) than existing text-to-image diffusion models, such as SDXL (2.6B parameters) and SD Cascade (5.1B parameters). Moreover, PixArt-Σ’s capability to generate 4K images supports the creation of high-resolution posters and wallpapers, efficiently bolstering the production of highquality visual content in industries such as film and gaming.*
-
-You can find the original codebase at [PixArt-alpha/PixArt-sigma](https://github.com/PixArt-alpha/PixArt-sigma) and all the available checkpoints at [PixArt-alpha](https://huggingface.co/PixArt-alpha).
-
-Some notes about this pipeline:
-
-* It uses a Transformer backbone (instead of a UNet) for denoising. As such it has a similar architecture as [DiT](https://hf.co/docs/transformers/model_doc/dit).
-* It was trained using text conditions computed from T5. This aspect makes the pipeline better at following complex text prompts with intricate details.
-* It is good at producing high-resolution images at different aspect ratios. To get the best results, the authors recommend some size brackets which can be found [here](https://github.com/PixArt-alpha/PixArt-sigma/blob/master/diffusion/data/datasets/utils.py).
-* It rivals the quality of state-of-the-art text-to-image generation systems (as of this writing) such as PixArt-α, Stable Diffusion XL, Playground V2.0 and DALL-E 3, while being more efficient than them.
-* It shows the ability of generating super high resolution images, such as 2048px or even 4K.
-* It shows that text-to-image models can grow from a weak model to a stronger one through several improvements (VAEs, datasets, and so on.)
-
-<Tip>
-
-Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines.
-
-</Tip>
-
-## Inference with under 8GB GPU VRAM
-
-Run the [`PixArtSigmaPipeline`] with under 8GB GPU VRAM by loading the text encoder in 8-bit precision. Let's walk through a full-fledged example. 
-
-First, install the [bitsandbytes](https://github.com/TimDettmers/bitsandbytes) library:
-
-```bash
-pip install -U bitsandbytes
-```
-
-Then load the text encoder in 8-bit:
-
-```python
-from transformers import T5EncoderModel
-from diffusers import PixArtSigmaPipeline
-import torch
-
-text_encoder = T5EncoderModel.from_pretrained(
-    "PixArt-alpha/PixArt-Sigma-XL-2-1024-MS",
-    subfolder="text_encoder",
-    load_in_8bit=True,
-    device_map="auto",
-
-)
-pipe = PixArtSigmaPipeline.from_pretrained(
-    "PixArt-alpha/PixArt-Sigma-XL-2-1024-MS",
-    text_encoder=text_encoder,
-    transformer=None,
-    device_map="balanced"
-)
-```
-
-Now, use the `pipe` to encode a prompt:
-
-```python
-with torch.no_grad():
-    prompt = "cute cat"
-    prompt_embeds, prompt_attention_mask, negative_embeds, negative_prompt_attention_mask = pipe.encode_prompt(prompt)
-```
-
-Since text embeddings have been computed, remove the `text_encoder` and `pipe` from the memory, and free up some GPU VRAM:
-
-```python
-import gc 
-
-def flush():
-    gc.collect()
-    torch.cuda.empty_cache()
-
-del text_encoder
-del pipe
-flush()
-```
-
-Then compute the latents with the prompt embeddings as inputs:
-
-```python
-pipe = PixArtSigmaPipeline.from_pretrained(
-    "PixArt-alpha/PixArt-Sigma-XL-2-1024-MS",
-    text_encoder=None,
-    torch_dtype=torch.float16,
-).to("cuda")
-
-latents = pipe(
-    negative_prompt=None, 
-    prompt_embeds=prompt_embeds,
-    negative_prompt_embeds=negative_embeds,
-    prompt_attention_mask=prompt_attention_mask,
-    negative_prompt_attention_mask=negative_prompt_attention_mask,
-    num_images_per_prompt=1,
-    output_type="latent",
-).images
-
-del pipe.transformer
-flush()
-```
-
-<Tip>
-
-Notice that while initializing `pipe`, you're setting `text_encoder` to `None` so that it's not loaded.
-
-</Tip>
-
-Once the latents are computed, pass it off to the VAE to decode into a real image:
-
-```python
-with torch.no_grad():
-    image = pipe.vae.decode(latents / pipe.vae.config.scaling_factor, return_dict=False)[0]
-image = pipe.image_processor.postprocess(image, output_type="pil")[0]
-image.save("cat.png")
-```
-
-By deleting components you aren't using and flushing the GPU VRAM, you should be able to run [`PixArtSigmaPipeline`] with under 8GB GPU VRAM.
-
-![](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/pixart/8bits_cat.png)
-
-If you want a report of your memory-usage, run this [script](https://gist.github.com/sayakpaul/3ae0f847001d342af27018a96f467e4e).
-
-<Tip warning={true}>
-
-Text embeddings computed in 8-bit can impact the quality of the generated images because of the information loss in the representation space caused by the reduced precision. It's recommended to compare the outputs with and without 8-bit.
-
-</Tip>
-
-While loading the `text_encoder`, you set `load_in_8bit` to `True`. You could also specify `load_in_4bit` to bring your memory requirements down even further to under 7GB.
-
-## PixArtSigmaPipeline
-
-[[autodoc]] PixArtSigmaPipeline
-	- all
-	- __call__
-	
--- a/docs/source/en/api/utilities.md
+++ b/docs/source/en/api/utilities.md
@@ -37,7 +37,3 @@ Utility and helper functions for working with 🤗 Diffusers.
 ## make_image_grid

 [[autodoc]] utils.make_image_grid
-
-## randn_tensor
-
-[[autodoc]] utils.torch_utils.randn_tensor
--- a/docs/source/en/api/video_processor.md
+++ b/docs/source/en/api/video_processor.md
@@ -1,21 +0,0 @@
-<!--Copyright 2024 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-->
-
-# Video Processor
-
-The [`VideoProcessor`] provides a unified API for video pipelines to prepare inputs for VAE encoding and post-processing outputs once they're decoded. The class inherits [`VaeImageProcessor`] so it includes transformations such as resizing, normalization, and conversion between PIL Image, PyTorch, and NumPy arrays.
-
-## VideoProcessor
-
-[[autodoc]] video_processor.VideoProcessor.preprocess_video
-
-[[autodoc]] video_processor.VideoProcessor.postprocess_video
--- a/docs/source/en/conceptual/contribution.md
+++ b/docs/source/en/conceptual/contribution.md
@@ -198,81 +198,38 @@ Anything displayed on [the official Diffusers doc page](https://huggingface.co/d

 Please have a look at [this page](https://github.com/huggingface/diffusers/tree/main/docs) on how to verify changes made to the documentation locally.

+
 ### 6. Contribute a community pipeline

-> [!TIP]
-> Read the [Community pipelines](../using-diffusers/custom_pipeline_overview#community-pipelines) guide to learn more about the difference between a GitHub and Hugging Face Hub community pipeline. If you're interested in why we have community pipelines, take a look at GitHub Issue [#841](https://github.com/huggingface/diffusers/issues/841) (basically, we can't maintain all the possible ways diffusion models can be used for inference but we also don't want to prevent the community from building them).
+[Pipelines](https://huggingface.co/docs/diffusers/api/pipelines/overview) are usually the first point of contact between the Diffusers library and the user.
+Pipelines are examples of how to use Diffusers [models](https://huggingface.co/docs/diffusers/api/models/overview) and [schedulers](https://huggingface.co/docs/diffusers/api/schedulers/overview).
+We support two types of pipelines:

-Contributing a community pipeline is a great way to share your creativity and work with the community. It lets you build on top of the [`DiffusionPipeline`] so that anyone can load and use it by setting the `custom_pipeline` parameter. This section will walk you through how to create a simple pipeline where the UNet only does a single forward pass and calls the scheduler once (a "one-step" pipeline).
+- Official Pipelines
+- Community Pipelines

-1. Create a one_step_unet.py file for your community pipeline. This file can contain whatever package you want to use as long as it's installed by the user. Make sure you only have one pipeline class that inherits from [`DiffusionPipeline`] to load model weights and the scheduler configuration from the Hub. Add a UNet and scheduler to the `__init__` function.
+Both official and community pipelines follow the same design and consist of the same type of components.

-    You should also add the `register_modules` function to ensure your pipeline and its components can be saved with [`~DiffusionPipeline.save_pretrained`].
+Official pipelines are tested and maintained by the core maintainers of Diffusers. Their code
+resides in [src/diffusers/pipelines](https://github.com/huggingface/diffusers/tree/main/src/diffusers/pipelines).
+In contrast, community pipelines are contributed and maintained purely by the **community** and are **not** tested.
+They reside in [examples/community](https://github.com/huggingface/diffusers/tree/main/examples/community) and while they can be accessed via the [PyPI diffusers package](https://pypi.org/project/diffusers/), their code is not part of the PyPI distribution.

-```py
-from diffusers import DiffusionPipeline
-import torch
+The reason for the distinction is that the core maintainers of the Diffusers library cannot maintain and test all
+possible ways diffusion models can be used for inference, but some of them may be of interest to the community.
+Officially released diffusion pipelines,
+such as Stable Diffusion, are added to the core `src/diffusers/pipelines` package, which ensures
+high quality of maintenance, no backward-breaking code changes, and testing.
+More bleeding edge pipelines should be added as community pipelines. If usage for a community pipeline is high, the pipeline can be moved to the official pipelines upon request from the community. This is one of the ways we strive to be a community-driven library.

-class UnetSchedulerOneForwardPipeline(DiffusionPipeline):
-    def __init__(self, unet, scheduler):
-        super().__init__()
+To add a community pipeline, one should add a `<name-of-the-community-pipeline>.py` file to [examples/community](https://github.com/huggingface/diffusers/tree/main/examples/community) and adapt the [examples/community/README.md](https://github.com/huggingface/diffusers/tree/main/examples/community/README.md) to include an example of the new pipeline.

-        self.register_modules(unet=unet, scheduler=scheduler)
-```
+An example can be seen [here](https://github.com/huggingface/diffusers/pull/2400).

-1. In the forward pass (which we recommend defining as `__call__`), you can add any feature you'd like. For the "one-step" pipeline, create a random image and call the UNet and scheduler once by setting `timestep=1`.
+Community pipeline PRs are only checked at a superficial level and ideally they should be maintained by their original authors.

-```py
-  from diffusers import DiffusionPipeline
-  import torch
-
-  class UnetSchedulerOneForwardPipeline(DiffusionPipeline):
-      def __init__(self, unet, scheduler):
-          super().__init__()
-
-          self.register_modules(unet=unet, scheduler=scheduler)
-
-      def __call__(self):
-          image = torch.randn(
-              (1, self.unet.config.in_channels, self.unet.config.sample_size, self.unet.config.sample_size),
-          )
-          timestep = 1
-
-          model_output = self.unet(image, timestep).sample
-          scheduler_output = self.scheduler.step(model_output, timestep, image).prev_sample
-
-          return scheduler_output
-```
-
-Now you can run the pipeline by passing a UNet and scheduler to it or load pretrained weights if the pipeline structure is identical.
-
-```py
-from diffusers import DDPMScheduler, UNet2DModel
-
-scheduler = DDPMScheduler()
-unet = UNet2DModel()
-
-pipeline = UnetSchedulerOneForwardPipeline(unet=unet, scheduler=scheduler)
-output = pipeline()
-# load pretrained weights
-pipeline = UnetSchedulerOneForwardPipeline.from_pretrained("google/ddpm-cifar10-32", use_safetensors=True)
-output = pipeline()
-```
-
-You can either share your pipeline as a GitHub community pipeline or Hub community pipeline.
-
-<hfoptions id="pipeline type">
-<hfoption id="GitHub pipeline">
-
-Share your GitHub pipeline by opening a pull request on the Diffusers [repository](https://github.com/huggingface/diffusers) and add the one_step_unet.py file to the [examples/community](https://github.com/huggingface/diffusers/tree/main/examples/community) subfolder.
-
-</hfoption>
-<hfoption id="Hub pipeline">
-
-Share your Hub pipeline by creating a model repository on the Hub and uploading the one_step_unet.py file to it.
-
-</hfoption>
-</hfoptions>
+Contributing a community pipeline is a great way to understand how Diffusers models and schedulers work. Having contributed a community pipeline is usually the first stepping stone to contributing an official pipeline to the
+core package.

 ### 7. Contribute to training examples

--- a/docs/source/en/installation.md
+++ b/docs/source/en/installation.md
@@ -112,7 +112,7 @@ pip install -e ".[flax]"

 These commands will link the folder you cloned the repository to and your Python library paths.
 Python will now look inside the folder you cloned to in addition to the normal library paths.
-For example, if your Python packages are typically installed in `~/anaconda3/envs/main/lib/python3.10/site-packages/`, Python will also search the `~/diffusers/` folder you cloned to.
+For example, if your Python packages are typically installed in `~/anaconda3/envs/main/lib/python3.8/site-packages/`, Python will also search the `~/diffusers/` folder you cloned to.

 <Tip warning={true}>

--- a/docs/source/en/optimization/fp16.md
+++ b/docs/source/en/optimization/fp16.md
@@ -12,23 +12,27 @@ specific language governing permissions and limitations under the License.

 # Speed up inference

-There are several ways to optimize Diffusers for inference speed, such as reducing the computational burden by lowering the data precision or using a lightweight distilled model. There are also memory-efficient attention implementations, [xFormers](xformers) and [scaled dot product attention](https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html) in PyTorch 2.0, that reduce memory usage which also indirectly speeds up inference. Different speed optimizations can be stacked together to get the fastest inference times.
+There are several ways to optimize 🤗 Diffusers for inference speed. As a general rule of thumb, we recommend using either [xFormers](xformers) or `torch.nn.functional.scaled_dot_product_attention` in PyTorch 2.0 for their memory-efficient attention.

-> [!TIP]
-> Optimizing for inference speed or reduced memory usage can lead to improved performance in the other category, so you should try to optimize for both whenever you can. This guide focuses on inference speed, but you can learn more about lowering memory usage in the [Reduce memory usage](memory) guide.
+<Tip>

-The inference times below are obtained from generating a single 512x512 image from the prompt "a photo of an astronaut riding a horse on mars" with 50 DDIM steps on a NVIDIA A100.
+In many cases, optimizing for speed or memory leads to improved performance in the other, so you should try to optimize for both whenever you can. This guide focuses on inference speed, but you can learn more about preserving memory in the [Reduce memory usage](memory) guide.

-| setup    | latency | speed-up |
-|----------|---------|----------|
-| baseline | 5.27s   | x1       |
-| tf32     | 4.14s   | x1.27    |
-| fp16     | 3.51s   | x1.50    |
-| combined | 3.41s   | x1.54    |
+</Tip>

-## TensorFloat-32
+The results below are obtained from generating a single 512x512 image from the prompt `a photo of an astronaut riding a horse on mars` with 50 DDIM steps on an NVIDIA Titan RTX, demonstrating the speed-up you can expect.

-On Ampere and later CUDA devices, matrix multiplications and convolutions can use the [TensorFloat-32 (tf32)](https://blogs.nvidia.com/blog/2020/05/14/tensorfloat-32-precision-format/) mode for faster, but slightly less accurate computations. By default, PyTorch enables tf32 mode for convolutions but not matrix multiplications. Unless your network requires full float32 precision, we recommend enabling tf32 for matrix multiplications. It can significantly speed up computations with typically negligible loss in numerical accuracy.
+|                  | latency | speed-up |
+| ---------------- | ------- | ------- |
+| original         | 9.50s   | x1      |
+| fp16             | 3.61s   | x2.63   |
+| channels last    | 3.30s   | x2.88   |
+| traced UNet      | 3.21s   | x2.96   |
+| memory efficient attention  | 2.63s  | x3.61   |
+
+## Use TensorFloat-32
+
+On Ampere and later CUDA devices, matrix multiplications and convolutions can use the [TensorFloat-32 (TF32)](https://blogs.nvidia.com/blog/2020/05/14/tensorfloat-32-precision-format/) mode for faster, but slightly less accurate computations. By default, PyTorch enables TF32 mode for convolutions but not matrix multiplications. Unless your network requires full float32 precision, we recommend enabling TF32 for matrix multiplications. It can significantly speed up computations with typically negligible loss in numerical accuracy.

 ```python
 import torch
@@ -36,11 +40,11 @@ import torch
 torch.backends.cuda.matmul.allow_tf32 = True
 ```

-Learn more about tf32 in the [Mixed precision training](https://huggingface.co/docs/transformers/en/perf_train_gpu_one#tf32) guide.
+You can learn more about TF32 in the [Mixed precision training](https://huggingface.co/docs/transformers/en/perf_train_gpu_one#tf32) guide.

 ## Half-precision weights

-To save GPU memory and get more speed, set `torch_dtype=torch.float16` to load and run the model weights directly with half-precision weights.
+To save GPU memory and get more speed, try loading and running the model weights directly in half-precision or float16:

 ```Python
 import torch
@@ -52,76 +56,19 @@ pipe = DiffusionPipeline.from_pretrained(
    use_safetensors=True,
 )
 pipe = pipe.to("cuda")
+
+prompt = "a photo of an astronaut riding a horse on mars"
+image = pipe(prompt).images[0]
 ```

-> [!WARNING]
-> Don't use [torch.autocast](https://pytorch.org/docs/stable/amp.html#torch.autocast) in any of the pipelines as it can lead to black images and is always slower than pure float16 precision.
+<Tip warning={true}>
+
+Don't use [`torch.autocast`](https://pytorch.org/docs/stable/amp.html#torch.autocast) in any of the pipelines as it can lead to black images and is always slower than pure float16 precision.
+
+</Tip>

 ## Distilled model

-You could also use a distilled Stable Diffusion model and autoencoder to speed up inference. During distillation, many of the UNet's residual and attention blocks are shed to reduce the model size by 51% and improve latency on CPU/GPU by 43%. The distilled model is faster and uses less memory while generating images of comparable quality to the full Stable Diffusion model.
+You could also use a distilled Stable Diffusion model and autoencoder to speed up inference. During distillation, many of the UNet's residual and attention blocks are shed to reduce the model size. The distilled model is faster and uses less memory while generating images of comparable quality to the full Stable Diffusion model.

-> [!TIP]
-> Read the [Open-sourcing Knowledge Distillation Code and Weights of SD-Small and SD-Tiny](https://huggingface.co/blog/sd_distillation) blog post to learn more about how knowledge distillation training works to produce a faster, smaller, and cheaper generative model.
-
-The inference times below are obtained from generating 4 images from the prompt "a photo of an astronaut riding a horse on mars" with 25 PNDM steps on a NVIDIA A100. Each generation is repeated 3 times with the distilled Stable Diffusion v1.4 model by [Nota AI](https://hf.co/nota-ai).
-
-| setup                        | latency | speed-up |
-|------------------------------|---------|----------|
-| baseline                     | 6.37s   | x1       |
-| distilled                    | 4.18s   | x1.52    |
-| distilled + tiny autoencoder | 3.83s   | x1.66    |
-
-Let's load the distilled Stable Diffusion model and compare it against the original Stable Diffusion model.
-
-```py
-from diffusers import StableDiffusionPipeline
-import torch
-
-distilled = StableDiffusionPipeline.from_pretrained(
-    "nota-ai/bk-sdm-small", torch_dtype=torch.float16, use_safetensors=True,
-).to("cuda")
-prompt = "a golden vase with different flowers"
-generator = torch.manual_seed(2023)
-image = distilled("a golden vase with different flowers", num_inference_steps=25, generator=generator).images[0]
-image
-```
-
-<div class="flex gap-4">
-  <div>
-    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/original_sd.png"/>
-    <figcaption class="mt-2 text-center text-sm text-gray-500">original Stable Diffusion</figcaption>
-  </div>
-  <div>
-    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/distilled_sd.png"/>
-    <figcaption class="mt-2 text-center text-sm text-gray-500">distilled Stable Diffusion</figcaption>
-  </div>
-</div>
-
-### Tiny AutoEncoder
-
-To speed inference up even more, replace the autoencoder with a [distilled version](https://huggingface.co/sayakpaul/taesdxl-diffusers) of it.
-
-```py
-import torch
-from diffusers import AutoencoderTiny, StableDiffusionPipeline
-
-distilled = StableDiffusionPipeline.from_pretrained(
-    "nota-ai/bk-sdm-small", torch_dtype=torch.float16, use_safetensors=True,
-).to("cuda")
-distilled.vae = AutoencoderTiny.from_pretrained(
-    "sayakpaul/taesd-diffusers", torch_dtype=torch.float16, use_safetensors=True,
-).to("cuda")
-
-prompt = "a golden vase with different flowers"
-generator = torch.manual_seed(2023)
-image = distilled("a golden vase with different flowers", num_inference_steps=25, generator=generator).images[0]
-image
-```
-
-<div class="flex justify-center">
-  <div>
-    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/distilled_sd_vae.png" />
-    <figcaption class="mt-2 text-center text-sm text-gray-500">distilled Stable Diffusion + Tiny AutoEncoder</figcaption>
-  </div>
-</div>
+Learn more in the [Distilled Stable Diffusion inference](../using-diffusers/distilled_sd) guide!
--- a/docs/source/en/optimization/memory.md
+++ b/docs/source/en/optimization/memory.md
@@ -261,7 +261,7 @@ from dataclasses import dataclass

@dataclass
 class UNet2DConditionOutput:
-    sample: torch.Tensor
+    sample: torch.FloatTensor


 pipe = StableDiffusionPipeline.from_pretrained(
--- a/docs/source/en/optimization/tgate.md
+++ b/docs/source/en/optimization/tgate.md
@@ -6,7 +6,7 @@ Before you begin, make sure you install T-GATE.

 ```bash
 pip install tgate
-pip install -U torch diffusers transformers accelerate DeepCache
+pip install -U pytorch diffusers transformers accelerate DeepCache
 ```


@@ -46,12 +46,12 @@ pipe = TgatePixArtLoader(

 image = pipe.tgate(
       "An alpaca made of colorful building blocks, cyberpunk.",
-       gate_step=gate_step,
+        gate_step=gate_step,
       num_inference_steps=inference_step,
 ).images[0]
 ```
 </hfoption>
-<hfoption id="Stable Diffusion XL">
+<hfoption id="Stable Diffusion XL"> 

 Accelerate `StableDiffusionXLPipeline` with T-GATE:

@@ -78,9 +78,9 @@ pipe = TgateSDXLLoader(
 ).to("cuda")

 image = pipe.tgate(
-       "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k.",
-       gate_step=gate_step,
-       num_inference_steps=inference_step
+        "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k.",
+        gate_step=gate_step,
+        num_inference_steps=inference_step
 ).images[0]
 ```
 </hfoption>
@@ -111,9 +111,9 @@ pipe = TgateSDXLDeepCacheLoader(
 ).to("cuda")

 image = pipe.tgate(
-       "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k.",
-       gate_step=gate_step,
-       num_inference_steps=inference_step
+        "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k.",
+        gate_step=gate_step,
+        num_inference_steps=inference_step
 ).images[0]
 ```
 </hfoption>
@@ -151,9 +151,9 @@ pipe = TgateSDXLLoader(
 ).to("cuda")

 image = pipe.tgate(
-       "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k.",
-       gate_step=gate_step,
-       num_inference_steps=inference_step
+        "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k.",
+        gate_step=gate_step,
+        num_inference_steps=inference_step
 ).images[0]
 ```
 </hfoption>
--- a/docs/source/en/stable_diffusion.md
+++ b/docs/source/en/stable_diffusion.md
@@ -49,7 +49,7 @@ One of the simplest ways to speed up inference is to place the pipeline on a GPU
 pipeline = pipeline.to("cuda")
 ```

-To make sure you can use the same image and improve on it, use a [`Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) and set a seed for [reproducibility](./using-diffusers/reusing_seeds):
+To make sure you can use the same image and improve on it, use a [`Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) and set a seed for [reproducibility](./using-diffusers/reproducibility):

 ```python
 import torch
--- a/docs/source/en/training/kandinsky.md
+++ b/docs/source/en/training/kandinsky.md
@@ -205,7 +205,7 @@ model_pred = unet(noisy_latents, timesteps, None, added_cond_kwargs=added_cond_k

 Once you’ve made all your changes or you’re okay with the default configuration, you’re ready to launch the training script! 🚀

-You'll train on the [Naruto BLIP captions](https://huggingface.co/datasets/lambdalabs/naruto-blip-captions) dataset to generate your own Naruto characters, but you can also create and train on your own dataset by following the [Create a dataset for training](create_dataset) guide. Set the environment variable `DATASET_NAME` to the name of the dataset on the Hub or if you're training on your own files, set the environment variable `TRAIN_DIR` to a path to your dataset.
+You'll train on the [Pokémon BLIP captions](https://huggingface.co/datasets/lambdalabs/pokemon-blip-captions) dataset to generate your own Pokémon, but you can also create and train on your own dataset by following the [Create a dataset for training](create_dataset) guide. Set the environment variable `DATASET_NAME` to the name of the dataset on the Hub or if you're training on your own files, set the environment variable `TRAIN_DIR` to a path to your dataset.

 If you’re training on more than one GPU, add the `--multi_gpu` parameter to the `accelerate launch` command.

@@ -219,7 +219,7 @@ To monitor training progress with Weights & Biases, add the `--report_to=wandb`
 <hfoption id="prior model">

 ```bash
-export DATASET_NAME="lambdalabs/naruto-blip-captions"
+export DATASET_NAME="lambdalabs/pokemon-blip-captions"

 accelerate launch --mixed_precision="fp16"  train_text_to_image_prior.py \
  --dataset_name=$DATASET_NAME \
@@ -232,17 +232,17 @@ accelerate launch --mixed_precision="fp16"  train_text_to_image_prior.py \
  --checkpoints_total_limit=3 \
  --lr_scheduler="constant" \
  --lr_warmup_steps=0 \
-  --validation_prompts="A robot naruto, 4k photo" \
+  --validation_prompts="A robot pokemon, 4k photo" \
  --report_to="wandb" \
  --push_to_hub \
-  --output_dir="kandi2-prior-naruto-model"
+  --output_dir="kandi2-prior-pokemon-model"
 ```

 </hfoption>
 <hfoption id="decoder model">

 ```bash
-export DATASET_NAME="lambdalabs/naruto-blip-captions"
+export DATASET_NAME="lambdalabs/pokemon-blip-captions"

 accelerate launch --mixed_precision="fp16"  train_text_to_image_decoder.py \
  --dataset_name=$DATASET_NAME \
@@ -256,10 +256,10 @@ accelerate launch --mixed_precision="fp16"  train_text_to_image_decoder.py \
  --checkpoints_total_limit=3 \
  --lr_scheduler="constant" \
  --lr_warmup_steps=0 \
-  --validation_prompts="A robot naruto, 4k photo" \
+  --validation_prompts="A robot pokemon, 4k photo" \
  --report_to="wandb" \
  --push_to_hub \
-  --output_dir="kandi2-decoder-naruto-model"
+  --output_dir="kandi2-decoder-pokemon-model"
 ```

 </hfoption>
@@ -279,7 +279,7 @@ prior_components = {"prior_" + k: v for k,v in prior_pipeline.components.items()
 pipeline = AutoPipelineForText2Image.from_pretrained("kandinsky-community/kandinsky-2-2-decoder", **prior_components, torch_dtype=torch.float16)

 pipe.enable_model_cpu_offload()
-prompt="A robot naruto, 4k photo"
+prompt="A robot pokemon, 4k photo"
 image = pipeline(prompt=prompt, negative_prompt=negative_prompt).images[0]
 ```

@@ -299,7 +299,7 @@ import torch
 pipeline = AutoPipelineForText2Image.from_pretrained("path/to/saved/model", torch_dtype=torch.float16)
 pipeline.enable_model_cpu_offload()

-prompt="A robot naruto, 4k photo"
+prompt="A robot pokemon, 4k photo"
 image = pipeline(prompt=prompt).images[0]
 ```

@@ -313,7 +313,7 @@ unet = UNet2DConditionModel.from_pretrained("path/to/saved/model" + "/checkpoint
 pipeline = AutoPipelineForText2Image.from_pretrained("kandinsky-community/kandinsky-2-2-decoder", unet=unet, torch_dtype=torch.float16)
 pipeline.enable_model_cpu_offload()

-image = pipeline(prompt="A robot naruto, 4k photo").images[0]
+image = pipeline(prompt="A robot pokemon, 4k photo").images[0]
 ```

 </hfoption>
--- a/docs/source/en/training/lora.md
+++ b/docs/source/en/training/lora.md
@@ -170,7 +170,7 @@ Aside from setting up the LoRA layers, the training script is more or less the s

 Once you've made all your changes or you're okay with the default configuration, you're ready to launch the training script! 🚀

-Let's train on the [Naruto BLIP captions](https://huggingface.co/datasets/lambdalabs/naruto-blip-captions) dataset to generate your own Naruto characters. Set the environment variables `MODEL_NAME` and `DATASET_NAME` to the model and dataset respectively. You should also specify where to save the model in `OUTPUT_DIR`, and the name of the model to save to on the Hub with `HUB_MODEL_ID`. The script creates and saves the following files to your repository:
+Let's train on the [Pokémon BLIP captions](https://huggingface.co/datasets/lambdalabs/pokemon-blip-captions) dataset to generate our own Pokémon. Set the environment variables `MODEL_NAME` and `DATASET_NAME` to the model and dataset respectively. You should also specify where to save the model in `OUTPUT_DIR`, and the name of the model to save to on the Hub with `HUB_MODEL_ID`. The script creates and saves the following files to your repository:

 - saved model checkpoints
 - `pytorch_lora_weights.safetensors` (the trained LoRA weights)
@@ -185,9 +185,9 @@ A full training run takes ~5 hours on a 2080 Ti GPU with 11GB of VRAM.

 ```bash
 export MODEL_NAME="runwayml/stable-diffusion-v1-5"
-export OUTPUT_DIR="/sddata/finetune/lora/naruto"
-export HUB_MODEL_ID="naruto-lora"
-export DATASET_NAME="lambdalabs/naruto-blip-captions"
+export OUTPUT_DIR="/sddata/finetune/lora/pokemon"
+export HUB_MODEL_ID="pokemon-lora"
+export DATASET_NAME="lambdalabs/pokemon-blip-captions"

 accelerate launch --mixed_precision="fp16"  train_text_to_image_lora.py \
  --pretrained_model_name_or_path=$MODEL_NAME \
@@ -208,7 +208,7 @@ accelerate launch --mixed_precision="fp16"  train_text_to_image_lora.py \
  --hub_model_id=${HUB_MODEL_ID} \
  --report_to=wandb \
  --checkpointing_steps=500 \
-  --validation_prompt="A naruto with blue eyes." \
+  --validation_prompt="A pokemon with blue eyes." \
  --seed=1337
 ```

@@ -220,7 +220,7 @@ import torch

 pipeline = AutoPipelineForText2Image.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16).to("cuda")
 pipeline.load_lora_weights("path/to/lora/model", weight_name="pytorch_lora_weights.safetensors")
-image = pipeline("A naruto with blue eyes").images[0]
+image = pipeline("A pokemon with blue eyes").images[0]
 ```

 ## Next steps
--- a/docs/source/en/training/sdxl.md
+++ b/docs/source/en/training/sdxl.md
@@ -176,7 +176,7 @@ If you want to learn more about how the training loop works, check out the [Unde

 Once you’ve made all your changes or you’re okay with the default configuration, you’re ready to launch the training script! 🚀

-Let’s train on the [Naruto BLIP captions](https://huggingface.co/datasets/lambdalabs/naruto-blip-captions) dataset to generate your own Naruto characters. Set the environment variables `MODEL_NAME` and `DATASET_NAME` to the model and the dataset (either from the Hub or a local path). You should also specify a VAE other than the SDXL VAE (either from the Hub or a local path) with `VAE_NAME` to avoid numerical instabilities.
+Let’s train on the [Pokémon BLIP captions](https://huggingface.co/datasets/lambdalabs/pokemon-blip-captions) dataset to generate your own Pokémon. Set the environment variables `MODEL_NAME` and `DATASET_NAME` to the model and the dataset (either from the Hub or a local path). You should also specify a VAE other than the SDXL VAE (either from the Hub or a local path) with `VAE_NAME` to avoid numerical instabilities.

 <Tip>

@@ -187,7 +187,7 @@ To monitor training progress with Weights & Biases, add the `--report_to=wandb`
 ```bash
 export MODEL_NAME="stabilityai/stable-diffusion-xl-base-1.0"
 export VAE_NAME="madebyollin/sdxl-vae-fp16-fix"
-export DATASET_NAME="lambdalabs/naruto-blip-captions"
+export DATASET_NAME="lambdalabs/pokemon-blip-captions"

 accelerate launch train_text_to_image_sdxl.py \
  --pretrained_model_name_or_path=$MODEL_NAME \
@@ -211,7 +211,7 @@ accelerate launch train_text_to_image_sdxl.py \
  --validation_prompt="a cute Sundar Pichai creature" \
  --validation_epochs 5 \
  --checkpointing_steps=5000 \
-  --output_dir="sdxl-naruto-model" \
+  --output_dir="sdxl-pokemon-model" \
  --push_to_hub
 ```

@@ -226,9 +226,9 @@ import torch

 pipeline = DiffusionPipeline.from_pretrained("path/to/your/model", torch_dtype=torch.float16).to("cuda")

-prompt = "A naruto with green eyes and red legs."
+prompt = "A pokemon with green eyes and red legs."
 image = pipeline(prompt, num_inference_steps=30, guidance_scale=7.5).images[0]
-image.save("naruto.png")
+image.save("pokemon.png")
 ```

 </hfoption>
@@ -244,11 +244,11 @@ import torch_xla.core.xla_model as xm
 device = xm.xla_device()
 pipeline = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0").to(device)

-prompt = "A naruto with green eyes and red legs."
+prompt = "A pokemon with green eyes and red legs."
 start = time()
 image = pipeline(prompt, num_inference_steps=inference_steps).images[0]
 print(f'Compilation time is {time()-start} sec')
-image.save("naruto.png")
+image.save("pokemon.png")

 start = time()
 image = pipeline(prompt, num_inference_steps=inference_steps).images[0]
--- a/docs/source/en/training/text2image.md
+++ b/docs/source/en/training/text2image.md
@@ -158,7 +158,7 @@ Once you've made all your changes or you're okay with the default configuration,
 <hfoptions id="training-inference">
 <hfoption id="PyTorch">

-Let's train on the [Naruto BLIP captions](https://huggingface.co/datasets/lambdalabs/naruto-blip-captions) dataset to generate your own Naruto characters. Set the environment variables `MODEL_NAME` and `dataset_name` to the model and the dataset (either from the Hub or a local path). If you're training on more than one GPU, add the `--multi_gpu` parameter to the `accelerate launch` command.
+Let's train on the [Pokémon BLIP captions](https://huggingface.co/datasets/lambdalabs/pokemon-blip-captions) dataset to generate your own Pokémon. Set the environment variables `MODEL_NAME` and `dataset_name` to the model and the dataset (either from the Hub or a local path). If you're training on more than one GPU, add the `--multi_gpu` parameter to the `accelerate launch` command.

 <Tip>

@@ -168,7 +168,7 @@ To train on a local dataset, set the `TRAIN_DIR` and `OUTPUT_DIR` environment va

 ```bash
 export MODEL_NAME="runwayml/stable-diffusion-v1-5"
-export dataset_name="lambdalabs/naruto-blip-captions"
+export dataset_name="lambdalabs/pokemon-blip-captions"

 accelerate launch --mixed_precision="fp16"  train_text_to_image.py \
  --pretrained_model_name_or_path=$MODEL_NAME \
@@ -183,7 +183,7 @@ accelerate launch --mixed_precision="fp16"  train_text_to_image.py \
  --max_grad_norm=1 \
  --enable_xformers_memory_efficient_attention
  --lr_scheduler="constant" --lr_warmup_steps=0 \
-  --output_dir="sd-naruto-model" \
+  --output_dir="sd-pokemon-model" \
  --push_to_hub
 ```

@@ -202,7 +202,7 @@ To train on a local dataset, set the `TRAIN_DIR` and `OUTPUT_DIR` environment va

 ```bash
 export MODEL_NAME="runwayml/stable-diffusion-v1-5"
-export dataset_name="lambdalabs/naruto-blip-captions"
+export dataset_name="lambdalabs/pokemon-blip-captions"

 python train_text_to_image_flax.py \
  --pretrained_model_name_or_path=$MODEL_NAME \
@@ -212,7 +212,7 @@ python train_text_to_image_flax.py \
  --max_train_steps=15000 \
  --learning_rate=1e-05 \
  --max_grad_norm=1 \
-  --output_dir="sd-naruto-model" \
+  --output_dir="sd-pokemon-model" \
  --push_to_hub
 ```

@@ -231,7 +231,7 @@ import torch
 pipeline = StableDiffusionPipeline.from_pretrained("path/to/saved_model", torch_dtype=torch.float16, use_safetensors=True).to("cuda")

 image = pipeline(prompt="yoda").images[0]
-image.save("yoda-naruto.png")
+image.save("yoda-pokemon.png")
 ```

 </hfoption>
@@ -246,7 +246,7 @@ from diffusers import FlaxStableDiffusionPipeline

 pipeline, params = FlaxStableDiffusionPipeline.from_pretrained("path/to/saved_model", dtype=jax.numpy.bfloat16)

-prompt = "yoda naruto"
+prompt = "yoda pokemon"
 prng_seed = jax.random.PRNGKey(0)
 num_inference_steps = 50

@@ -261,7 +261,7 @@ prompt_ids = shard(prompt_ids)

 images = pipeline(prompt_ids, params, prng_seed, num_inference_steps, jit=True).images
 images = pipeline.numpy_to_pil(np.asarray(images.reshape((num_samples,) + images.shape[-3:])))
-image.save("yoda-naruto.png")
+image.save("yoda-pokemon.png")
 ```

 </hfoption>
--- a/docs/source/en/training/wuerstchen.md
+++ b/docs/source/en/training/wuerstchen.md
@@ -131,7 +131,7 @@ If you want to learn more about how the training loop works, check out the [Unde

 Once you’ve made all your changes or you’re okay with the default configuration, you’re ready to launch the training script! 🚀

-Set the `DATASET_NAME` environment variable to the dataset name from the Hub. This guide uses the [Naruto BLIP captions](https://huggingface.co/datasets/lambdalabs/naruto-blip-captions) dataset, but you can create and train on your own datasets as well (see the [Create a dataset for training](create_dataset) guide).
+Set the `DATASET_NAME` environment variable to the dataset name from the Hub. This guide uses the [Pokémon BLIP captions](https://huggingface.co/datasets/lambdalabs/pokemon-blip-captions) dataset, but you can create and train on your own datasets as well (see the [Create a dataset for training](create_dataset) guide).

 <Tip>

@@ -140,7 +140,7 @@ To monitor training progress with Weights & Biases, add the `--report_to=wandb`
 </Tip>

 ```bash
-export DATASET_NAME="lambdalabs/naruto-blip-captions"
+export DATASET_NAME="lambdalabs/pokemon-blip-captions"

 accelerate launch  train_text_to_image_prior.py \
  --mixed_precision="fp16" \
@@ -156,10 +156,10 @@ accelerate launch  train_text_to_image_prior.py \
  --checkpoints_total_limit=3 \
  --lr_scheduler="constant" \
  --lr_warmup_steps=0 \
-  --validation_prompts="A robot naruto, 4k photo" \
+  --validation_prompts="A robot pokemon, 4k photo" \
  --report_to="wandb" \
  --push_to_hub \
-  --output_dir="wuerstchen-prior-naruto-model"
+  --output_dir="wuerstchen-prior-pokemon-model"
 ```

 Once training is complete, you can use your newly trained model for inference!
@@ -171,7 +171,7 @@ from diffusers.pipelines.wuerstchen import DEFAULT_STAGE_C_TIMESTEPS

 pipeline = AutoPipelineForText2Image.from_pretrained("path/to/saved/model", torch_dtype=torch.float16).to("cuda")

-caption = "A cute bird naruto holding a shield"
+caption = "A cute bird pokemon holding a shield"
 images = pipeline(
    caption,
    width=1024,
--- a/docs/source/en/tutorials/basic_training.md
+++ b/docs/source/en/tutorials/basic_training.md
@@ -260,7 +260,7 @@ Then, you'll need a way to evaluate the model. For evaluation, you can use the [
 ...     # The default pipeline output type is `List[PIL.Image]`
 ...     images = pipeline(
 ...         batch_size=config.eval_batch_size,
-...         generator=torch.Generator(device='cpu').manual_seed(config.seed), # Use a separate torch generator to avoid rewinding the random state of the main training loop
+...         generator=torch.manual_seed(config.seed),
 ...     ).images

 ...     # Make a grid out of the images
--- a/docs/source/en/using-diffusers/callback.md
+++ b/docs/source/en/using-diffusers/callback.md
@@ -19,74 +19,13 @@ The denoising loop of a pipeline can be modified with custom defined functions u

 This guide will demonstrate how callbacks work by a few features you can implement with them.

-## Official callbacks
-
-We provide a list of callbacks you can plug into an existing pipeline and modify the denoising loop. This is the current list of official callbacks:
-
- `SDCFGCutoffCallback`: Disables the CFG after a certain number of steps for all SD 1.5 pipelines, including text-to-image, image-to-image, inpaint, and controlnet.
- `SDXLCFGCutoffCallback`: Disables the CFG after a certain number of steps for all SDXL pipelines, including text-to-image, image-to-image, inpaint, and controlnet.
- `IPAdapterScaleCutoffCallback`: Disables the IP Adapter after a certain number of steps for all pipelines supporting IP-Adapter.
-
-> [!TIP]
-> If you want to add a new official callback, feel free to open a [feature request](https://github.com/huggingface/diffusers/issues/new/choose) or [submit a PR](https://huggingface.co/docs/diffusers/main/en/conceptual/contribution#how-to-open-a-pr).
-
-To set up a callback, you need to specify the number of denoising steps after which the callback comes into effect. You can do so by using either one of these two arguments
-
- `cutoff_step_ratio`: Float number with the ratio of the steps.
- `cutoff_step_index`: Integer number with the exact number of the step.
-
-```python
-import torch
-
-from diffusers import DPMSolverMultistepScheduler, StableDiffusionXLPipeline
-from diffusers.callbacks import SDXLCFGCutoffCallback
-
-
-callback = SDXLCFGCutoffCallback(cutoff_step_ratio=0.4)
-# can also be used with cutoff_step_index
-# callback = SDXLCFGCutoffCallback(cutoff_step_ratio=None, cutoff_step_index=10)
-
-pipeline = StableDiffusionXLPipeline.from_pretrained(
-    "stabilityai/stable-diffusion-xl-base-1.0",
-    torch_dtype=torch.float16,
-    variant="fp16",
-).to("cuda")
-pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config, use_karras_sigmas=True)
-
-prompt = "a sports car at the road, best quality, high quality, high detail, 8k resolution"
-
-generator = torch.Generator(device="cpu").manual_seed(2628670641)
-
-out = pipeline(
-    prompt=prompt,
-    negative_prompt="",
-    guidance_scale=6.5,
-    num_inference_steps=25,
-    generator=generator,
-    callback_on_step_end=callback,
-)
-
-out.images[0].save("official_callback.png")
-```
-
-<div class="flex gap-4">
-  <div>
-    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/without_cfg_callback.png" alt="generated image of a sports car at the road" />
-    <figcaption class="mt-2 text-center text-sm text-gray-500">without SDXLCFGCutoffCallback</figcaption>
-  </div>
-  <div>
-    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/with_cfg_callback.png" alt="generated image of a a sports car at the road with cfg callback" />
-    <figcaption class="mt-2 text-center text-sm text-gray-500">with SDXLCFGCutoffCallback</figcaption>
-  </div>
-</div>
-
 ## Dynamic classifier-free guidance

 Dynamic classifier-free guidance (CFG) is a feature that allows you to disable CFG after a certain number of inference steps which can help you save compute with minimal cost to performance. The callback function for this should have the following arguments:

- `pipeline` (or the pipeline instance) provides access to important properties such as `num_timesteps` and `guidance_scale`. You can modify these properties by updating the underlying attributes. For this example, you'll disable CFG by setting `pipeline._guidance_scale=0.0`.
- `step_index` and `timestep` tell you where you are in the denoising loop. Use `step_index` to turn off CFG after reaching 40% of `num_timesteps`.
- `callback_kwargs` is a dict that contains tensor variables you can modify during the denoising loop. It only includes variables specified in the `callback_on_step_end_tensor_inputs` argument, which is passed to the pipeline's `__call__` method. Different pipelines may use different sets of variables, so please check a pipeline's `_callback_tensor_inputs` attribute for the list of variables you can modify. Some common variables include `latents` and `prompt_embeds`. For this function, change the batch size of `prompt_embeds` after setting `guidance_scale=0.0` in order for it to work properly.
+* `pipeline` (or the pipeline instance) provides access to important properties such as `num_timesteps` and `guidance_scale`. You can modify these properties by updating the underlying attributes. For this example, you'll disable CFG by setting `pipeline._guidance_scale=0.0`.
+* `step_index` and `timestep` tell you where you are in the denoising loop. Use `step_index` to turn off CFG after reaching 40% of `num_timesteps`.
+* `callback_kwargs` is a dict that contains tensor variables you can modify during the denoising loop. It only includes variables specified in the `callback_on_step_end_tensor_inputs` argument, which is passed to the pipeline's `__call__` method. Different pipelines may use different sets of variables, so please check a pipeline's `_callback_tensor_inputs` attribute for the list of variables you can modify. Some common variables include `latents` and `prompt_embeds`. For this function, change the batch size of `prompt_embeds` after setting `guidance_scale=0.0` in order for it to work properly.

 Your callback function should look something like this:

--- a/docs/source/en/using-diffusers/contribute_pipeline.md
+++ b/docs/source/en/using-diffusers/contribute_pipeline.md
@@ -0,0 +1,184 @@
+<!--Copyright 2024 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Contribute a community pipeline
+
+<Tip>
+
+💡 Take a look at GitHub Issue [#841](https://github.com/huggingface/diffusers/issues/841) for more context about why we're adding community pipelines to help everyone easily share their work without being slowed down.
+
+</Tip>
+
+Community pipelines allow you to add any additional features you'd like on top of the [`DiffusionPipeline`]. The main benefit of building on top of the `DiffusionPipeline` is that anyone can load and use your pipeline by only adding one more argument, making it super easy for the community to access.
+
+This guide will show you how to create a community pipeline and explain how community pipelines work. To keep things simple, you'll create a "one-step" pipeline where the `UNet` does a single forward pass and calls the scheduler once.
+
+## Initialize the pipeline
+
+You should start by creating a `one_step_unet.py` file for your community pipeline. In this file, create a pipeline class that inherits from the [`DiffusionPipeline`] to be able to load model weights and the scheduler configuration from the Hub. The one-step pipeline needs a `UNet` and a scheduler, so you'll need to add these as arguments to the `__init__` function:
+
+```python
+from diffusers import DiffusionPipeline
+import torch
+
+class UnetSchedulerOneForwardPipeline(DiffusionPipeline):
+    def __init__(self, unet, scheduler):
+        super().__init__()
+```
+
+To ensure your pipeline and its components (`unet` and `scheduler`) can be saved with [`~DiffusionPipeline.save_pretrained`], add them to the `register_modules` function:
+
+```diff
+  from diffusers import DiffusionPipeline
+  import torch
+
+  class UnetSchedulerOneForwardPipeline(DiffusionPipeline):
+      def __init__(self, unet, scheduler):
+          super().__init__()
+
++         self.register_modules(unet=unet, scheduler=scheduler)
+```
+
+Cool, the `__init__` step is done and you can move to the forward pass now! 🔥
+
+## Define the forward pass
+
+In the forward pass, which we recommend defining as `__call__`, you have complete creative freedom to add whatever feature you'd like. For our amazing one-step pipeline, create a random image and only call the `unet` and `scheduler` once by setting `timestep=1`:
+
+```diff
+  from diffusers import DiffusionPipeline
+  import torch
+
+  class UnetSchedulerOneForwardPipeline(DiffusionPipeline):
+      def __init__(self, unet, scheduler):
+          super().__init__()
+
+          self.register_modules(unet=unet, scheduler=scheduler)
+
++     def __call__(self):
++         image = torch.randn(
++             (1, self.unet.config.in_channels, self.unet.config.sample_size, self.unet.config.sample_size),
++         )
++         timestep = 1
+
++         model_output = self.unet(image, timestep).sample
++         scheduler_output = self.scheduler.step(model_output, timestep, image).prev_sample
+
++         return scheduler_output
+```
+
+That's it! 🚀 You can now run this pipeline by passing a `unet` and `scheduler` to it:
+
+```python
+from diffusers import DDPMScheduler, UNet2DModel
+
+scheduler = DDPMScheduler()
+unet = UNet2DModel()
+
+pipeline = UnetSchedulerOneForwardPipeline(unet=unet, scheduler=scheduler)
+
+output = pipeline()
+```
+
+But what's even better is you can load pre-existing weights into the pipeline if the pipeline structure is identical. For example, you can load the [`google/ddpm-cifar10-32`](https://huggingface.co/google/ddpm-cifar10-32) weights into the one-step pipeline:
+
+```python
+pipeline = UnetSchedulerOneForwardPipeline.from_pretrained("google/ddpm-cifar10-32", use_safetensors=True)
+
+output = pipeline()
+```
+
+## Share your pipeline
+
+Open a Pull Request on the 🧨 Diffusers [repository](https://github.com/huggingface/diffusers) to add your awesome pipeline in `one_step_unet.py` to the [examples/community](https://github.com/huggingface/diffusers/tree/main/examples/community) subfolder.
+
+Once it is merged, anyone with `diffusers >= 0.4.0` installed can use this pipeline magically 🪄 by specifying it in the `custom_pipeline` argument:
+
+```python
+from diffusers import DiffusionPipeline
+
+pipe = DiffusionPipeline.from_pretrained(
+    "google/ddpm-cifar10-32", custom_pipeline="one_step_unet", use_safetensors=True
+)
+pipe()
+```
+
+Another way to share your community pipeline is to upload the `one_step_unet.py` file directly to your preferred [model repository](https://huggingface.co/docs/hub/models-uploading) on the Hub. Instead of specifying the `one_step_unet.py` file, pass the model repository id to the `custom_pipeline` argument:
+
+```python
+from diffusers import DiffusionPipeline
+
+pipeline = DiffusionPipeline.from_pretrained(
+    "google/ddpm-cifar10-32", custom_pipeline="stevhliu/one_step_unet", use_safetensors=True
+)
+```
+
+Take a look at the following table to compare the two sharing workflows to help you decide the best option for you:
+
+|                | GitHub community pipeline                                                                                        | HF Hub community pipeline                                                                 |
+|----------------|------------------------------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------|
+| usage          | same                                                                                                             | same                                                                                      |
+| review process | open a Pull Request on GitHub and undergo a review process from the Diffusers team before merging; may be slower | upload directly to a Hub repository without any review; this is the fastest workflow      |
+| visibility     | included in the official Diffusers repository and documentation                                                  | included on your HF Hub profile and relies on your own usage/promotion to gain visibility |
+
+<Tip>
+
+💡 You can use whatever package you want in your community pipeline file - as long as the user has it installed, everything will work fine. Make sure you have one and only one pipeline class that inherits from `DiffusionPipeline` because this is automatically detected.
+
+</Tip>
+
+## How do community pipelines work?
+
+A community pipeline is a class that inherits from [`DiffusionPipeline`] which means:
+
+- It can be loaded with the [`custom_pipeline`] argument.
+- The model weights and scheduler configuration are loaded from [`pretrained_model_name_or_path`].
+- The code that implements a feature in the community pipeline is defined in a `pipeline.py` file.
+
+Sometimes you can't load all of the pipeline component weights from an official repository. In this case, the other components should be passed directly to the pipeline:
+
+```python
+from diffusers import DiffusionPipeline
+from transformers import CLIPImageProcessor, CLIPModel
+
+model_id = "CompVis/stable-diffusion-v1-4"
+clip_model_id = "laion/CLIP-ViT-B-32-laion2B-s34B-b79K"
+
+feature_extractor = CLIPImageProcessor.from_pretrained(clip_model_id)
+clip_model = CLIPModel.from_pretrained(clip_model_id, torch_dtype=torch.float16)
+
+pipeline = DiffusionPipeline.from_pretrained(
+    model_id,
+    custom_pipeline="clip_guided_stable_diffusion",
+    clip_model=clip_model,
+    feature_extractor=feature_extractor,
+    scheduler=scheduler,
+    torch_dtype=torch.float16,
+    use_safetensors=True,
+)
+```
+
+The magic behind community pipelines is contained in the following code. It allows the community pipeline to be loaded from GitHub or the Hub, and it'll be available to all 🧨 Diffusers packages.
+
+```python
+# 2. Load the pipeline class, if using custom module then load it from the Hub
+# if we load from explicit class, let's use it
+if custom_pipeline is not None:
+    pipeline_class = get_class_from_dynamic_module(
+        custom_pipeline, module_file=CUSTOM_PIPELINE_FILE_NAME, cache_dir=custom_pipeline
+    )
+elif cls != DiffusionPipeline:
+    pipeline_class = cls
+else:
+    diffusers_module = importlib.import_module(cls.__module__.split(".")[0])
+    pipeline_class = getattr(diffusers_module, config_dict["_class_name"])
+```
--- a/docs/source/en/using-diffusers/control_brightness.md
+++ b/docs/source/en/using-diffusers/control_brightness.md
@@ -0,0 +1,58 @@
+<!--Copyright 2024 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Control image brightness
+
+The Stable Diffusion pipeline is mediocre at generating images that are either very bright or dark as explained in the [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) paper. The solutions proposed in the paper are currently implemented in the [`DDIMScheduler`] which you can use to improve the lighting in your images.
+
+<Tip>
+
+💡 Take a look at the paper linked above for more details about the proposed solutions!
+
+</Tip>
+
+One of the solutions is to train a model with *v prediction* and *v loss*. Add the following flag to the [`train_text_to_image.py`](https://github.com/huggingface/diffusers/blob/main/examples/text_to_image/train_text_to_image.py) or [`train_text_to_image_lora.py`](https://github.com/huggingface/diffusers/blob/main/examples/text_to_image/train_text_to_image_lora.py) scripts to enable `v_prediction`:
+
+```bash
+--prediction_type="v_prediction"
+```
+
+For example, let's use the [`ptx0/pseudo-journey-v2`](https://huggingface.co/ptx0/pseudo-journey-v2) checkpoint which has been finetuned with `v_prediction`.
+
+Next, configure the following parameters in the [`DDIMScheduler`]:
+
+1. `rescale_betas_zero_snr=True`, rescales the noise schedule to zero terminal signal-to-noise ratio (SNR)
+2. `timestep_spacing="trailing"`, starts sampling from the last timestep
+
+```py
+from diffusers import DiffusionPipeline, DDIMScheduler
+
+pipeline = DiffusionPipeline.from_pretrained("ptx0/pseudo-journey-v2", use_safetensors=True)
+
+# switch the scheduler in the pipeline to use the DDIMScheduler
+pipeline.scheduler = DDIMScheduler.from_config(
+    pipeline.scheduler.config, rescale_betas_zero_snr=True, timestep_spacing="trailing"
+)
+pipeline.to("cuda")
+```
+
+Finally, in your call to the pipeline, set `guidance_rescale` to prevent overexposure:
+
+```py
+prompt = "A lion in galaxies, spirals, nebulae, stars, smoke, iridescent, intricate detail, octane render, 8k"
+image = pipeline(prompt, guidance_rescale=0.7).images[0]
+image
+```
+
+<div class="flex justify-center">
+    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/zero_snr.png"/>
+</div>
--- a/docs/source/en/using-diffusers/custom_pipeline_examples.md
+++ b/docs/source/en/using-diffusers/custom_pipeline_examples.md
@@ -0,0 +1,119 @@
+<!--Copyright 2024 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Community pipelines
+
+[[open-in-colab]]
+
+<Tip>
+
+For more context about the design choices behind community pipelines, please have a look at [this issue](https://github.com/huggingface/diffusers/issues/841).
+
+</Tip>
+
+Community pipelines allow you to get creative and build your own unique pipelines to share with the community. You can find all community pipelines in the [diffusers/examples/community](https://github.com/huggingface/diffusers/tree/main/examples/community) folder along with inference and training examples for how to use them. This guide showcases some of the community pipelines and hopefully it'll inspire you to create your own (feel free to open a PR with your own pipeline and we will merge it!).
+
+To load a community pipeline, use the `custom_pipeline` argument in [`DiffusionPipeline`] to specify one of the files in [diffusers/examples/community](https://github.com/huggingface/diffusers/tree/main/examples/community):
+
+```py
+from diffusers import DiffusionPipeline
+
+pipe = DiffusionPipeline.from_pretrained(
+    "CompVis/stable-diffusion-v1-4", custom_pipeline="filename_in_the_community_folder", use_safetensors=True
+)
+```
+
+If a community pipeline doesn't work as expected, please open a GitHub issue and mention the author.
+
+You can learn more about community pipelines in the guides on how to [load community pipelines](custom_pipeline_overview) and how to [contribute a community pipeline](contribute_pipeline).
+
+## Multilingual Stable Diffusion
+
+The multilingual Stable Diffusion pipeline uses a pretrained [XLM-RoBERTa](https://huggingface.co/papluca/xlm-roberta-base-language-detection) model to identify the language of a prompt and the [mBART-large-50](https://huggingface.co/facebook/mbart-large-50-many-to-one-mmt) model to handle the translation. This allows you to generate images from text in 20 languages.
+
+```py
+import torch
+from diffusers import DiffusionPipeline
+from diffusers.utils import make_image_grid
+from transformers import (
+    pipeline,
+    MBart50TokenizerFast,
+    MBartForConditionalGeneration,
+)
+
+device = "cuda" if torch.cuda.is_available() else "cpu"
+device_dict = {"cuda": 0, "cpu": -1}
+
+# add language detection pipeline
+language_detection_model_ckpt = "papluca/xlm-roberta-base-language-detection"
+language_detection_pipeline = pipeline("text-classification",
+                                       model=language_detection_model_ckpt,
+                                       device=device_dict[device])
+
+# add model for language translation
+translation_tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50-many-to-one-mmt")
+translation_model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-many-to-one-mmt").to(device)
+
+diffuser_pipeline = DiffusionPipeline.from_pretrained(
+    "CompVis/stable-diffusion-v1-4",
+    custom_pipeline="multilingual_stable_diffusion",
+    detection_pipeline=language_detection_pipeline,
+    translation_model=translation_model,
+    translation_tokenizer=translation_tokenizer,
+    torch_dtype=torch.float16,
+)
+
+diffuser_pipeline.enable_attention_slicing()
+diffuser_pipeline = diffuser_pipeline.to(device)
+
+prompt = ["a photograph of an astronaut riding a horse",
+          "Una casa en la playa",
+          "Ein Hund, der Orange isst",
+          "Un restaurant parisien"]
+
+images = diffuser_pipeline(prompt).images
+make_image_grid(images, rows=2, cols=2)
+```
+
+<div class="flex justify-center">
+    <img src="https://user-images.githubusercontent.com/4313860/198328706-295824a4-9856-4ce5-8e66-278ceb42fd29.png"/>
+</div>
+
+## MagicMix
+
+[MagicMix](https://huggingface.co/papers/2210.16056) is a pipeline that can mix an image and text prompt to generate a new image that preserves the image structure. The `mix_factor` determines how much influence the prompt has on the layout generation, `kmin` controls the number of steps during the content generation process, and `kmax` determines how much information is kept in the layout of the original image.
+
+```py
+from diffusers import DiffusionPipeline, DDIMScheduler
+from diffusers.utils import load_image, make_image_grid
+
+pipeline = DiffusionPipeline.from_pretrained(
+    "CompVis/stable-diffusion-v1-4",
+    custom_pipeline="magic_mix",
+    scheduler=DDIMScheduler.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="scheduler"),
+).to('cuda')
+
+img = load_image("https://user-images.githubusercontent.com/59410571/209578593-141467c7-d831-4792-8b9a-b17dc5e47816.jpg")
+mix_img = pipeline(img, prompt="bed", kmin=0.3, kmax=0.5, mix_factor=0.5)
+make_image_grid([img, mix_img], rows=1, cols=2)
+```
+
+<div class="flex gap-4">
+  <div>
+    <img class="rounded-xl" src="https://user-images.githubusercontent.com/59410571/209578593-141467c7-d831-4792-8b9a-b17dc5e47816.jpg" />
+    <figcaption class="mt-2 text-center text-sm text-gray-500">original image</figcaption>
+  </div>
+  <div>
+    <img class="rounded-xl" src="https://user-images.githubusercontent.com/59410571/209578602-70f323fa-05b7-4dd6-b055-e40683e37914.jpg" />
+    <figcaption class="mt-2 text-center text-sm text-gray-500">image and text prompt mix</figcaption>
+  </div>
+</div>
--- a/docs/source/en/using-diffusers/custom_pipeline_overview.md
+++ b/docs/source/en/using-diffusers/custom_pipeline_overview.md
@@ -16,19 +16,11 @@ specific language governing permissions and limitations under the License.

 ## Community pipelines

-> [!TIP] Take a look at GitHub Issue [#841](https://github.com/huggingface/diffusers/issues/841) for more context about why we're adding community pipelines to help everyone easily share their work without being slowed down.
-
 Community pipelines are any [`DiffusionPipeline`] class that are different from the original paper implementation (for example, the [`StableDiffusionControlNetPipeline`] corresponds to the [Text-to-Image Generation with ControlNet Conditioning](https://arxiv.org/abs/2302.05543) paper). They provide additional functionality or extend the original implementation of a pipeline.

 There are many cool community pipelines like [Marigold Depth Estimation](https://github.com/huggingface/diffusers/tree/main/examples/community#marigold-depth-estimation) or [InstantID](https://github.com/huggingface/diffusers/tree/main/examples/community#instantid-pipeline), and you can find all the official community pipelines [here](https://github.com/huggingface/diffusers/tree/main/examples/community).

-There are two types of community pipelines, those stored on the Hugging Face Hub and those stored on Diffusers GitHub repository. Hub pipelines are completely customizable (scheduler, models, pipeline code, etc.) while Diffusers GitHub pipelines are only limited to custom pipeline code.
-
-|                | GitHub community pipeline                                                                                        | HF Hub community pipeline                                                                 |
-|----------------|------------------------------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------|
-| usage          | same                                                                                                             | same                                                                                      |
-| review process | open a Pull Request on GitHub and undergo a review process from the Diffusers team before merging; may be slower | upload directly to a Hub repository without any review; this is the fastest workflow      |
-| visibility     | included in the official Diffusers repository and documentation                                                  | included on your HF Hub profile and relies on your own usage/promotion to gain visibility |
+There are two types of community pipelines: those stored on the Hugging Face Hub and those stored in the Diffusers GitHub repository. Hub pipelines are completely customizable (scheduler, models, pipeline code, etc.) while Diffusers GitHub pipelines are limited to custom pipeline code. Refer to this [table](./contribute_pipeline#share-your-pipeline) for a more detailed comparison of Hub vs GitHub community pipelines.

 <hfoptions id="community">
 <hfoption id="Hub pipelines">
@@ -169,97 +161,6 @@ out_lpw
  </div>
 </div>

-## Example community pipelines
-
-Community pipelines are a really fun and creative way to extend the capabilities of the original pipeline with new and unique features. You can find all community pipelines in the [diffusers/examples/community](https://github.com/huggingface/diffusers/tree/main/examples/community) folder with inference and training examples for how to use them.
-
-This section showcases a couple of the community pipelines and hopefully it'll inspire you to create your own (feel free to open a PR for your community pipeline and ping us for a review)!
-
-> [!TIP]
-> The [`~DiffusionPipeline.from_pipe`] method is particularly useful for loading community pipelines because many of them don't have pretrained weights and add a feature on top of an existing pipeline like Stable Diffusion or Stable Diffusion XL. You can learn more about the [`~DiffusionPipeline.from_pipe`] method in the [Load with from_pipe](custom_pipeline_overview#load-with-from_pipe) section.
-
-<hfoptions id="community">
-<hfoption id="Marigold">
-
-[Marigold](https://marigoldmonodepth.github.io/) is a depth estimation diffusion pipeline that uses the rich existing and inherent visual knowledge in diffusion models. It takes an input image and denoises and decodes it into a depth map. Marigold performs well even on images it hasn't seen before.
-
-```py
-import torch
-from PIL import Image
-from diffusers import DiffusionPipeline
-from diffusers.utils import load_image
-
-pipeline = DiffusionPipeline.from_pretrained(
-    "prs-eth/marigold-lcm-v1-0",
-    custom_pipeline="marigold_depth_estimation",
-    torch_dtype=torch.float16,
-    variant="fp16",
-)
-
-pipeline.to("cuda")
-image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/community-marigold.png")
-output = pipeline(
-    image,
-    denoising_steps=4,
-    ensemble_size=5,
-    processing_res=768,
-    match_input_res=True,
-    batch_size=0,
-    seed=33,
-    color_map="Spectral",
-    show_progress_bar=True,
-)
-depth_colored: Image.Image = output.depth_colored
-depth_colored.save("./depth_colored.png")
-```
-
-<div class="flex flex-row gap-4">
-  <div class="flex-1">
-    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/community-marigold.png"/>
-    <figcaption class="mt-2 text-center text-sm text-gray-500">original image</figcaption>
-  </div>
-  <div class="flex-1">
-    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/marigold-depth.png"/>
-    <figcaption class="mt-2 text-center text-sm text-gray-500">colorized depth image</figcaption>
-  </div>
-</div>
-
-</hfoption>
-<hfoption id="HD-Painter">
-
-[HD-Painter](https://hf.co/papers/2312.14091) is a high-resolution inpainting pipeline. It introduces a *Prompt-Aware Introverted Attention (PAIntA)* layer to better align a prompt with the area to be inpainted, and *Reweighting Attention Score Guidance (RASG)* to keep the latents more prompt-aligned and within their trained domain to generate realistc images.
-
-```py
-import torch
-from diffusers import DiffusionPipeline, DDIMScheduler
-from diffusers.utils import load_image
-
-pipeline = DiffusionPipeline.from_pretrained(
-    "Lykon/dreamshaper-8-inpainting",
-    custom_pipeline="hd_painter"
-)
-pipeline.scheduler = DDIMScheduler.from_config(pipeline.scheduler.config)
-init_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/hd-painter.jpg")
-mask_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/hd-painter-mask.png")
-prompt = "football"
-image = pipeline(prompt, init_image, mask_image, use_rasg=True, use_painta=True, generator=torch.manual_seed(0)).images[0]
-image
-```
-
-<div class="flex flex-row gap-4">
-  <div class="flex-1">
-    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/hd-painter.jpg"/>
-    <figcaption class="mt-2 text-center text-sm text-gray-500">original image</figcaption>
-  </div>
-  <div class="flex-1">
-    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/hd-painter-output.png"/>
-    <figcaption class="mt-2 text-center text-sm text-gray-500">generated image</figcaption>
-  </div>
-</div>
-
-</hfoption>
-</hfoptions>
-
 ## Community components

 Community components allow users to build pipelines that may have customized components that are not a part of Diffusers. If your pipeline has custom components that Diffusers doesn't already support, you need to provide their implementations as Python modules. These customized components could be a VAE, UNet, and scheduler. In most cases, the text encoder is imported from the Transformers library. The pipeline code itself can also be customized.
--- a/docs/source/en/using-diffusers/distilled_sd.md
+++ b/docs/source/en/using-diffusers/distilled_sd.md
@@ -0,0 +1,133 @@
+<!--Copyright 2024 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Distilled Stable Diffusion inference
+
+[[open-in-colab]]
+
+Stable Diffusion inference can be a computationally intensive process because it must iteratively denoise the latents to generate an image. To reduce the computational burden, you can use a *distilled* version of the Stable Diffusion model from [Nota AI](https://huggingface.co/nota-ai). The distilled version of their Stable Diffusion model eliminates some of the residual and attention blocks from the UNet, reducing the model size by 51% and improving latency on CPU/GPU by 43%.
+
+<Tip>
+
+Read this [blog post](https://huggingface.co/blog/sd_distillation) to learn more about how knowledge distillation training works to produce a faster, smaller, and cheaper generative model.
+
+</Tip>
+
+Let's load the distilled Stable Diffusion model and compare it against the original Stable Diffusion model:
+
+```py
+from diffusers import StableDiffusionPipeline
+import torch
+
+distilled = StableDiffusionPipeline.from_pretrained(
+    "nota-ai/bk-sdm-small", torch_dtype=torch.float16, use_safetensors=True,
+).to("cuda")
+
+original = StableDiffusionPipeline.from_pretrained(
+    "CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16, use_safetensors=True,
+).to("cuda")
+```
+
+Given a prompt, get the inference time for the original model:
+
+```py
+import time
+
+seed = 2023
+generator = torch.manual_seed(seed)
+
+NUM_ITERS_TO_RUN = 3
+NUM_INFERENCE_STEPS = 25
+NUM_IMAGES_PER_PROMPT = 4
+
+prompt = "a golden vase with different flowers"
+
+start = time.time_ns()
+for _ in range(NUM_ITERS_TO_RUN):
+    images = original(
+        prompt,
+        num_inference_steps=NUM_INFERENCE_STEPS,
+        generator=generator,
+        num_images_per_prompt=NUM_IMAGES_PER_PROMPT
+    ).images
+end = time.time_ns()
+original_sd = f"{(end - start) / 1e6:.1f}"
+
+print(f"Execution time -- {original_sd} ms\n")
+"Execution time -- 45781.5 ms"
+```
+
+Time the distilled model inference:
+
+```py
+start = time.time_ns()
+for _ in range(NUM_ITERS_TO_RUN):
+    images = distilled(
+        prompt,
+        num_inference_steps=NUM_INFERENCE_STEPS,
+        generator=generator,
+        num_images_per_prompt=NUM_IMAGES_PER_PROMPT
+    ).images
+end = time.time_ns()
+
+distilled_sd = f"{(end - start) / 1e6:.1f}"
+print(f"Execution time -- {distilled_sd} ms\n")
+"Execution time -- 29884.2 ms"
+```
+
+<div class="flex gap-4">
+  <div>
+    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/original_sd.png"/>
+    <figcaption class="mt-2 text-center text-sm text-gray-500">original Stable Diffusion (45781.5 ms)</figcaption>
+  </div>
+  <div>
+    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/distilled_sd.png"/>
+    <figcaption class="mt-2 text-center text-sm text-gray-500">distilled Stable Diffusion (29884.2 ms)</figcaption>
+  </div>
+</div>
+
+## Tiny AutoEncoder
+
+To speed inference up even more, use a tiny distilled version of the [Stable Diffusion VAE](https://huggingface.co/sayakpaul/taesd-diffusers) to decode the latents into images. Replace the VAE in the distilled Stable Diffusion model with the tiny VAE:
+
+```py
+from diffusers import AutoencoderTiny
+
+distilled.vae = AutoencoderTiny.from_pretrained(
+    "sayakpaul/taesd-diffusers", torch_dtype=torch.float16, use_safetensors=True,
+).to("cuda")
+```
+
+Time the distilled model and distilled VAE inference:
+
+```py
+start = time.time_ns()
+for _ in range(NUM_ITERS_TO_RUN):
+    images = distilled(
+        prompt,
+        num_inference_steps=NUM_INFERENCE_STEPS,
+        generator=generator,
+        num_images_per_prompt=NUM_IMAGES_PER_PROMPT
+    ).images
+end = time.time_ns()
+
+distilled_tiny_sd = f"{(end - start) / 1e6:.1f}"
+print(f"Execution time -- {distilled_tiny_sd} ms\n")
+"Execution time -- 27165.7 ms"
+```
+
+<div class="flex justify-center">
+  <div>
+    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/distilled_sd_vae.png" />
+    <figcaption class="mt-2 text-center text-sm text-gray-500">distilled Stable Diffusion + Tiny AutoEncoder (27165.7 ms)</figcaption>
+  </div>
+</div>
--- a/docs/source/en/using-diffusers/freeu.md
+++ b/docs/source/en/using-diffusers/freeu.md
@@ -0,0 +1,135 @@
+<!--Copyright 2024 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Improve generation quality with FreeU
+
+[[open-in-colab]]
+
+The UNet is responsible for denoising during the reverse diffusion process, and there are two distinct features in its architecture:
+
+1. Backbone features primarily contribute to the denoising process
+2. Skip features mainly introduce high-frequency features into the decoder module and can make the network overlook the semantics in the backbone features
+
+However, the skip connection can sometimes introduce unnatural image details. [FreeU](https://hf.co/papers/2309.11497) is a technique for improving image quality by rebalancing the contributions from the UNet’s skip connections and backbone feature maps.
+
+FreeU is applied during inference and it does not require any additional training. The technique works for different tasks such as text-to-image, image-to-image, and text-to-video.
+
+In this guide, you will apply FreeU to the [`StableDiffusionPipeline`], [`StableDiffusionXLPipeline`], and [`TextToVideoSDPipeline`]. You need to install Diffusers from source to run the examples below.
+
+## StableDiffusionPipeline
+
+Load the pipeline:
+
+```py
+from diffusers import DiffusionPipeline
+import torch
+
+pipeline = DiffusionPipeline.from_pretrained(
+    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16, safety_checker=None
+).to("cuda")
+```
+
+Then enable the FreeU mechanism with the FreeU-specific hyperparameters. These values are scaling factors for the backbone and skip features.
+
+```py
+pipeline.enable_freeu(s1=0.9, s2=0.2, b1=1.2, b2=1.4)
+```
+
+The values above are from the official FreeU [code repository](https://github.com/ChenyangSi/FreeU) where you can also find [reference hyperparameters](https://github.com/ChenyangSi/FreeU#range-for-more-parameters) for different models.
+
+<Tip>
+
+Disable the FreeU mechanism by calling `disable_freeu()` on a pipeline.
+
+</Tip>
+
+And then run inference:
+
+```py
+prompt = "A squirrel eating a burger"
+seed = 2023
+image = pipeline(prompt, generator=torch.manual_seed(seed)).images[0]
+image
+```
+
+The figure below compares the results without and with FreeU, generated with the same `prompt` and `seed` used above:
+
+![](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/freeu/sdv1_5_freeu.jpg)
+
+
+Let's see how Stable Diffusion 2 results are impacted:
+
+```py
+from diffusers import DiffusionPipeline
+import torch
+
+pipeline = DiffusionPipeline.from_pretrained(
+    "stabilityai/stable-diffusion-2-1", torch_dtype=torch.float16, safety_checker=None
+).to("cuda")
+
+prompt = "A squirrel eating a burger"
+seed = 2023
+
+pipeline.enable_freeu(s1=0.9, s2=0.2, b1=1.1, b2=1.2)
+image = pipeline(prompt, generator=torch.manual_seed(seed)).images[0]
+image
+```
+
+![](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/freeu/sdv2_1_freeu.jpg)
+
+## Stable Diffusion XL
+
+Finally, let's take a look at how FreeU affects Stable Diffusion XL results:
+
+```py
+from diffusers import DiffusionPipeline
+import torch
+
+pipeline = DiffusionPipeline.from_pretrained(
+    "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16,
+).to("cuda")
+
+prompt = "A squirrel eating a burger"
+seed = 2023
+
+# Comes from
+# https://wandb.ai/nasirk24/UNET-FreeU-SDXL/reports/FreeU-SDXL-Optimal-Parameters--Vmlldzo1NDg4NTUw
+pipeline.enable_freeu(s1=0.6, s2=0.4, b1=1.1, b2=1.2)
+image = pipeline(prompt, generator=torch.manual_seed(seed)).images[0]
+image
+```
+
+![](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/freeu/sdxl_freeu.jpg)
+
+## Text-to-video generation
+
+FreeU can also be used to improve video quality:
+
+```python
+from diffusers import DiffusionPipeline
+from diffusers.utils import export_to_video
+import torch
+
+model_id = "cerspense/zeroscope_v2_576w"
+pipe = DiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16).to("cuda")
+
+prompt = "an astronaut riding a horse on mars"
+seed = 2023
+
+# The values come from
+# https://github.com/lyn-rgb/FreeU_Diffusers#video-pipelines
+pipe.enable_freeu(b1=1.2, b2=1.4, s1=0.9, s2=0.2)
+video_frames = pipe(prompt, height=320, width=576, num_frames=30, generator=torch.manual_seed(seed)).frames[0]
+export_to_video(video_frames, "astronaut_rides_horse.mp4")
+```
+
+Thanks to [kadirnar](https://github.com/kadirnar/) for helping to integrate the feature, and to [justindujardin](https://github.com/justindujardin) for the helpful discussions.
--- a/docs/source/en/using-diffusers/image_quality.md
+++ b/docs/source/en/using-diffusers/image_quality.md
@@ -1,190 +0,0 @@
-<!--Copyright 2024 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-->
-
-# Controlling image quality
-
-The components of a diffusion model, like the UNet and scheduler, can be optimized to improve the quality of generated images leading to better image lighting and details. These techniques are especially useful if you don't have the resources to simply use a larger model for inference. You can enable these techniques during inference without any additional training.
-
-This guide will show you how to turn these techniques on in your pipeline and how to configure them to improve the quality of your generated images.
-
-## Lighting
-
-The Stable Diffusion models aren't very good at generating images that are very bright or dark because the scheduler doesn't start sampling from the last timestep and it doesn't enforce a zero signal-to-noise ratio (SNR). The [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://hf.co/papers/2305.08891) paper fixes these issues which are now available in some Diffusers schedulers.
-
-> [!TIP]
-> For inference, you need a model that has been trained with *v_prediction*. To train your own model with *v_prediction*, add the following flag to the [train_text_to_image.py](https://github.com/huggingface/diffusers/blob/main/examples/text_to_image/train_text_to_image.py) or [train_text_to_image_lora.py](https://github.com/huggingface/diffusers/blob/main/examples/text_to_image/train_text_to_image_lora.py) scripts.
->
-> ```bash
-> --prediction_type="v_prediction"
-> ```
-
-For example, load the [ptx0/pseudo-journey-v2](https://hf.co/ptx0/pseudo-journey-v2) checkpoint which was trained with `v_prediction` and the [`DDIMScheduler`]. Now you should configure the following parameters in the [`DDIMScheduler`].
-
-* `rescale_betas_zero_snr=True` to rescale the noise schedule to zero SNR
-* `timestep_spacing="trailing"` to start sampling from the last timestep
-
-Set `guidance_rescale` in the pipeline to prevent over-exposure. A lower value increases brightness but some of the details may appear washed out.
-
-```py
-from diffusers import DiffusionPipeline, DDIMScheduler
-
-pipeline = DiffusionPipeline.from_pretrained("ptx0/pseudo-journey-v2", use_safetensors=True)
-
-pipeline.scheduler = DDIMScheduler.from_config(
-    pipeline.scheduler.config, rescale_betas_zero_snr=True, timestep_spacing="trailing"
-)
-pipeline.to("cuda")
-prompt = "cinematic photo of a snowy mountain at night with the northern lights aurora borealis overhead, 35mm photograph, film, professional, 4k, highly detailed"
-generator = torch.Generator(device="cpu").manual_seed(23)
-image = pipeline(prompt, guidance_rescale=0.7, generator=generator).images[0]
-image
-```
-
-<div class="flex gap-4">
-  <div>
-    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/no-zero-snr.png"/>
-    <figcaption class="mt-2 text-center text-sm text-gray-500">default Stable Diffusion v2-1 image</figcaption>
-  </div>
-  <div>
-    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/zero-snr.png"/>
-    <figcaption class="mt-2 text-center text-sm text-gray-500">image with zero SNR and trailing timestep spacing enabled</figcaption>
-  </div>
-</div>
-
-## Details
-
-[FreeU](https://hf.co/papers/2309.11497) improves image details by rebalancing the UNet's backbone and skip connection weights. The skip connections can cause the model to overlook some of the backbone semantics which may lead to unnatural image details in the generated image. This technique does not require any additional training and can be applied on the fly during inference for tasks like image-to-image and text-to-video.
-
-Use the [`~pipelines.StableDiffusionMixin.enable_freeu`] method on your pipeline and configure the scaling factors for the backbone (`b1` and `b2`) and skip connections (`s1` and `s2`). The number after each scaling factor corresponds to the stage in the UNet where the factor is applied. Take a look at the [FreeU](https://github.com/ChenyangSi/FreeU#parameters) repository for reference hyperparameters for different models.
-
-<hfoptions id="freeu">
-<hfoption id="Stable Diffusion v1-5">
-
-```py
-import torch
-from diffusers import DiffusionPipeline
-
-pipeline = DiffusionPipeline.from_pretrained(
-    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16, safety_checker=None
-).to("cuda")
-pipeline.enable_freeu(s1=0.9, s2=0.2, b1=1.5, b2=1.6)
-generator = torch.Generator(device="cpu").manual_seed(33)
-prompt = ""
-image = pipeline(prompt, generator=generator).images[0]
-image
-```
-
-<div class="flex gap-4">
-  <div>
-    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/sdv15-no-freeu.png"/>
-    <figcaption class="mt-2 text-center text-sm text-gray-500">FreeU disabled</figcaption>
-  </div>
-  <div>
-    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/sdv15-freeu.png"/>
-    <figcaption class="mt-2 text-center text-sm text-gray-500">FreeU enabled</figcaption>
-  </div>
-</div>
-
-</hfoption>
-<hfoption id="Stable Diffusion v2-1">
-
-```py
-import torch
-from diffusers import DiffusionPipeline
-
-pipeline = DiffusionPipeline.from_pretrained(
-    "stabilityai/stable-diffusion-2-1", torch_dtype=torch.float16, safety_checker=None
-).to("cuda")
-pipeline.enable_freeu(s1=0.9, s2=0.2, b1=1.4, b2=1.6)
-generator = torch.Generator(device="cpu").manual_seed(80)
-prompt = "A squirrel eating a burger"
-image = pipeline(prompt, generator=generator).images[0]
-image
-```
-
-<div class="flex gap-4">
-  <div>
-    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/sdv21-no-freeu.png"/>
-    <figcaption class="mt-2 text-center text-sm text-gray-500">FreeU disabled</figcaption>
-  </div>
-  <div>
-    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/sdv21-freeu.png"/>
-    <figcaption class="mt-2 text-center text-sm text-gray-500">FreeU enabled</figcaption>
-  </div>
-</div>
-
-</hfoption>
-<hfoption id="Stable Diffusion XL">
-
-```py
-import torch
-from diffusers import DiffusionPipeline
-
-pipeline = DiffusionPipeline.from_pretrained(
-    "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16,
-).to("cuda")
-pipeline.enable_freeu(s1=0.9, s2=0.2, b1=1.3, b2=1.4)
-generator = torch.Generator(device="cpu").manual_seed(13)
-prompt = "A squirrel eating a burger"
-image = pipeline(prompt, generator=generator).images[0]
-image
-```
-
-<div class="flex gap-4">
-  <div>
-    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/sdxl-no-freeu.png"/>
-    <figcaption class="mt-2 text-center text-sm text-gray-500">FreeU disabled</figcaption>
-  </div>
-  <div>
-    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/sdxl-freeu.png"/>
-    <figcaption class="mt-2 text-center text-sm text-gray-500">FreeU enabled</figcaption>
-  </div>
-</div>
-
-</hfoption>
-<hfoption id="Zeroscope">
-
-```py
-import torch
-from diffusers import DiffusionPipeline
-from diffusers.utils import export_to_video
-
-pipeline = DiffusionPipeline.from_pretrained(
-    "damo-vilab/text-to-video-ms-1.7b", torch_dtype=torch.float16
-).to("cuda")
-# values come from https://github.com/lyn-rgb/FreeU_Diffusers#video-pipelines
-pipeline.enable_freeu(b1=1.2, b2=1.4, s1=0.9, s2=0.2)
-prompt = "Confident teddy bear surfer rides the wave in the tropics"
-generator = torch.Generator(device="cpu").manual_seed(47)
-video_frames = pipeline(prompt, generator=generator).frames[0]
-export_to_video(video_frames, "teddy_bear.mp4", fps=10)
-```
-
-<div class="flex gap-4">
-  <div>
-    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/video-no-freeu.gif"/>
-    <figcaption class="mt-2 text-center text-sm text-gray-500">FreeU disabled</figcaption>
-  </div>
-  <div>
-    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/video-freeu.gif"/>
-    <figcaption class="mt-2 text-center text-sm text-gray-500">FreeU enabled</figcaption>
-  </div>
-</div>
-
-</hfoption>
-</hfoptions>
-
-Call the [`pipelines.StableDiffusionMixin.disable_freeu`] method to disable FreeU.
-
-```py
-pipeline.disable_freeu()
-```
--- a/docs/source/en/using-diffusers/inference_with_lcm.md
+++ b/docs/source/en/using-diffusers/inference_with_lcm.md
@@ -10,30 +10,29 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
 specific language governing permissions and limitations under the License.
 -->

-# Latent Consistency Model
-
 [[open-in-colab]]

-[Latent Consistency Models (LCMs)](https://hf.co/papers/2310.04378) enable fast high-quality image generation by directly predicting the reverse diffusion process in the latent rather than pixel space. In other words, LCMs try to predict the noiseless image from the noisy image in contrast to typical diffusion models that iteratively remove noise from the noisy image. By avoiding the iterative sampling process, LCMs are able to generate high-quality images in 2-4 steps instead of 20-30 steps.
+# Latent Consistency Model

-LCMs are distilled from pretrained models which requires ~32 hours of A100 compute. To speed this up, [LCM-LoRAs](https://hf.co/papers/2311.05556) train a [LoRA adapter](https://huggingface.co/docs/peft/conceptual_guides/adapter#low-rank-adaptation-lora) which have much fewer parameters to train compared to the full model. The LCM-LoRA can be plugged into a diffusion model once it has been trained.
+Latent Consistency Models (LCMs) enable quality image generation in typically 2-4 steps, making it possible to use diffusion models in almost real-time settings.

-This guide will show you how to use LCMs and LCM-LoRAs for fast inference on tasks and how to use them with other adapters like ControlNet or T2I-Adapter.
+From the [official website](https://latent-consistency-models.github.io/):

-> [!TIP]
-> LCMs and LCM-LoRAs are available for Stable Diffusion v1.5, Stable Diffusion XL, and the SSD-1B model. You can find their checkpoints on the [Latent Consistency](https://hf.co/collections/latent-consistency/latent-consistency-models-weights-654ce61a95edd6dffccef6a8) Collections.
+> LCMs can be distilled from any pre-trained Stable Diffusion (SD) in only 4,000 training steps (~32 A100 GPU Hours) for generating high quality 768 x 768 resolution images in 2~4 steps or even one step, significantly accelerating text-to-image generation. We employ LCM to distill the Dreamshaper-V7 version of SD in just 4,000 training iterations.
+
+For a more technical overview of LCMs, refer to [the paper](https://huggingface.co/papers/2310.04378).
+
+LCM distilled models are available for [stable-diffusion-v1-5](https://huggingface.co/runwayml/stable-diffusion-v1-5), [stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0), and the [SSD-1B](https://huggingface.co/segmind/SSD-1B) model. All the checkpoints can be found in this [collection](https://huggingface.co/collections/latent-consistency/latent-consistency-models-weights-654ce61a95edd6dffccef6a8).
+
+This guide shows how to perform inference with LCMs for:
+- text-to-image
+- image-to-image
+- combined with style LoRAs
+- ControlNet/T2I-Adapter

 ## Text-to-image

-<hfoptions id="lcm-text2img">
-<hfoption id="LCM">
-
-To use LCMs, you need to load the LCM checkpoint for your supported model into [`UNet2DConditionModel`] and replace the scheduler with the [`LCMScheduler`]. Then you can use the pipeline as usual, and pass a text prompt to generate an image in just 4 steps.
-
-A couple of notes to keep in mind when using LCMs are:
-
-* Typically, batch size is doubled inside the pipeline for classifier-free guidance. But LCM applies guidance with guidance embeddings and doesn't need to double the batch size, which leads to faster inference. The downside is that negative prompts don't work with LCM because they don't have any effect on the denoising process.
-* The ideal range for `guidance_scale` is [3., 13.] because that is what the UNet was trained with. However, disabling `guidance_scale` with a value of 1.0 is also effective in most cases.
+You'll use the [`StableDiffusionXLPipeline`] with the [`LCMScheduler`] and load the LCM checkpoint into a [`UNet2DConditionModel`]. Together, the LCM UNet and the scheduler enable a fast inference workflow, overcoming the slow iterative nature of diffusion models.

 ```python
 from diffusers import StableDiffusionXLPipeline, UNet2DConditionModel, LCMScheduler
@@ -50,69 +49,31 @@ pipe = StableDiffusionXLPipeline.from_pretrained(
 pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config)

 prompt = "Self-portrait oil painting, a beautiful cyborg with golden hair, 8k"
+
 generator = torch.manual_seed(0)
 image = pipe(
    prompt=prompt, num_inference_steps=4, generator=generator, guidance_scale=8.0
 ).images[0]
-image
 ```

-<div class="flex justify-center">
-    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/lcm/lcm_full_sdxl_t2i.png"/>
-</div>
+![](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/lcm/lcm_full_sdxl_t2i.png)

-</hfoption>
-<hfoption id="LCM-LoRA">
+Notice that we use only 4 steps for generation, which is far fewer than what's typically used for standard SDXL.

-To use LCM-LoRAs, you need to replace the scheduler with the [`LCMScheduler`] and load the LCM-LoRA weights with the [`~loaders.LoraLoaderMixin.load_lora_weights`] method. Then you can use the pipeline as usual, and pass a text prompt to generate an image in just 4 steps.
+Some details to keep in mind:

-A couple of notes to keep in mind when using LCM-LoRAs are:
+* To perform classifier-free guidance, batch size is usually doubled inside the pipeline. LCM, however, applies guidance using guidance embeddings, so the batch size does not have to be doubled in this case. This leads to a faster inference time, with the drawback that negative prompts don't have any effect on the denoising process.
+* The UNet was trained using the [3., 13.] guidance scale range. So, that is the ideal range for `guidance_scale`. However, disabling `guidance_scale` using a value of 1.0 is also effective in most cases.

-* Typically, batch size is doubled inside the pipeline for classifier-free guidance. But LCM applies guidance with guidance embeddings and doesn't need to double the batch size, which leads to faster inference. The downside is that negative prompts don't work with LCM because they don't have any effect on the denoising process.
-* You could use guidance with LCM-LoRAs, but it is very sensitive to high `guidance_scale` values and can lead to artifacts in the generated image. The best values we've found are between [1.0, 2.0].
-* Replace [stabilityai/stable-diffusion-xl-base-1.0](https://hf.co/stabilityai/stable-diffusion-xl-base-1.0) with any finetuned model. For example, try using the [animagine-xl](https://huggingface.co/Linaqruf/animagine-xl) checkpoint to generate anime images with SDXL.
-
-```py
-import torch
-from diffusers import DiffusionPipeline, LCMScheduler
-
-pipe = DiffusionPipeline.from_pretrained(
-    "stabilityai/stable-diffusion-xl-base-1.0",
-    variant="fp16",
-    torch_dtype=torch.float16
-).to("cuda")
-pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config)
-pipe.load_lora_weights("latent-consistency/lcm-lora-sdxl")
-
-prompt = "Self-portrait oil painting, a beautiful cyborg with golden hair, 8k"
-generator = torch.manual_seed(42)
-image = pipe(
-    prompt=prompt, num_inference_steps=4, generator=generator, guidance_scale=1.0
-).images[0]
-image
-```
-
-<div class="flex justify-center">
-    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/lcm/lcm_sdxl_t2i.png"/>
-</div>
-
-</hfoption>
-</hfoptions>

 ## Image-to-image

-<hfoptions id="lcm-img2img">
-<hfoption id="LCM">
-
-To use LCMs for image-to-image, you need to load the LCM checkpoint for your supported model into [`UNet2DConditionModel`] and replace the scheduler with the [`LCMScheduler`]. Then you can use the pipeline as usual, and pass a text prompt and initial image to generate an image in just 4 steps.
-
-> [!TIP]
-> Experiment with different values for `num_inference_steps`, `strength`, and `guidance_scale` to get the best results.
+LCMs can be applied to image-to-image tasks too. For this example, we'll use the [LCM_Dreamshaper_v7](https://huggingface.co/SimianLuo/LCM_Dreamshaper_v7) model, but the same steps can be applied to other LCM models as well.

 ```python
 import torch
 from diffusers import AutoPipelineForImage2Image, UNet2DConditionModel, LCMScheduler
-from diffusers.utils import load_image
+from diffusers.utils import make_image_grid, load_image

 unet = UNet2DConditionModel.from_pretrained(
    "SimianLuo/LCM_Dreamshaper_v7",
@@ -128,8 +89,12 @@ pipe = AutoPipelineForImage2Image.from_pretrained(
 ).to("cuda")
 pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config)

-init_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/img2img-init.png")
+# prepare image
+url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/img2img-init.png"
+init_image = load_image(url)
 prompt = "Astronauts in a jungle, cold color palette, muted colors, detailed, 8k"
+
+# pass prompt and image to pipeline
 generator = torch.manual_seed(0)
 image = pipe(
    prompt,
@@ -139,130 +104,22 @@ image = pipe(
    strength=0.5,
    generator=generator
 ).images[0]
-image
+make_image_grid([init_image, image], rows=1, cols=2)
 ```

-<div class="flex gap-4">
-  <div>
-    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/img2img-init.png"/>
-    <figcaption class="mt-2 text-center text-sm text-gray-500">initial image</figcaption>
-  </div>
-  <div>
-    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/lcm-img2img.png"/>
-    <figcaption class="mt-2 text-center text-sm text-gray-500">generated image</figcaption>
-  </div>
-</div>
+![](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/lcm/lcm_full_sdv1-5_i2i.png)

-</hfoption>
-<hfoption id="LCM-LoRA">

-To use LCM-LoRAs for image-to-image, you need to replace the scheduler with the [`LCMScheduler`] and load the LCM-LoRA weights with the [`~loaders.LoraLoaderMixin.load_lora_weights`] method. Then you can use the pipeline as usual, and pass a text prompt and initial image to generate an image in just 4 steps.
+<Tip>

-> [!TIP]
-> Experiment with different values for `num_inference_steps`, `strength`, and `guidance_scale` to get the best results.
+You can get different results based on your prompt and the image you provide. To get the best results, we recommend trying different values for the `num_inference_steps`, `strength`, and `guidance_scale` parameters and choosing the best one.

-```py
-import torch
-from diffusers import AutoPipelineForImage2Image, LCMScheduler
-from diffusers.utils import make_image_grid, load_image
+</Tip>

-pipe = AutoPipelineForImage2Image.from_pretrained(
-    "Lykon/dreamshaper-7",
-    torch_dtype=torch.float16,
-    variant="fp16",
-).to("cuda")

-pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config)
+## Combine with style LoRAs

-pipe.load_lora_weights("latent-consistency/lcm-lora-sdv1-5")
-
-init_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/img2img-init.png")
-prompt = "Astronauts in a jungle, cold color palette, muted colors, detailed, 8k"
-
-generator = torch.manual_seed(0)
-image = pipe(
-    prompt,
-    image=init_image,
-    num_inference_steps=4,
-    guidance_scale=1,
-    strength=0.6,
-    generator=generator
-).images[0]
-image
-```
-
-<div class="flex gap-4">
-  <div>
-    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/img2img-init.png"/>
-    <figcaption class="mt-2 text-center text-sm text-gray-500">initial image</figcaption>
-  </div>
-  <div>
-    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/lcm-lora-img2img.png"/>
-    <figcaption class="mt-2 text-center text-sm text-gray-500">generated image</figcaption>
-  </div>
-</div>
-
-</hfoption>
-</hfoptions>
-
-## Inpainting
-
-To use LCM-LoRAs for inpainting, you need to replace the scheduler with the [`LCMScheduler`] and load the LCM-LoRA weights with the [`~loaders.LoraLoaderMixin.load_lora_weights`] method. Then you can use the pipeline as usual, and pass a text prompt, initial image, and mask image to generate an image in just 4 steps.
-
-```py
-import torch
-from diffusers import AutoPipelineForInpainting, LCMScheduler
-from diffusers.utils import load_image, make_image_grid
-
-pipe = AutoPipelineForInpainting.from_pretrained(
-    "runwayml/stable-diffusion-inpainting",
-    torch_dtype=torch.float16,
-    variant="fp16",
-).to("cuda")
-
-pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config)
-
-pipe.load_lora_weights("latent-consistency/lcm-lora-sdv1-5")
-
-init_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint.png")
-mask_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint_mask.png")
-
-prompt = "concept art digital painting of an elven castle, inspired by lord of the rings, highly detailed, 8k"
-generator = torch.manual_seed(0)
-image = pipe(
-    prompt=prompt,
-    image=init_image,
-    mask_image=mask_image,
-    generator=generator,
-    num_inference_steps=4,
-    guidance_scale=4, 
-).images[0]
-image
-```
-
-<div class="flex gap-4">
-  <div>
-    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint.png"/>
-    <figcaption class="mt-2 text-center text-sm text-gray-500">initial image</figcaption>
-  </div>
-  <div>
-    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/lcm-lora-inpaint.png"/>
-    <figcaption class="mt-2 text-center text-sm text-gray-500">generated image</figcaption>
-  </div>
-</div>
-
-## Adapters
-
-LCMs are compatible with adapters like LoRA, ControlNet, T2I-Adapter, and AnimateDiff. You can bring the speed of LCMs to these adapters to generate images in a certain style or condition the model on another input like a canny image.
-
-### LoRA
-
-[LoRA](../using-diffusers/loading_adapters#lora) adapters can be rapidly finetuned to learn a new style from just a few images and plugged into a pretrained model to generate images in that style.
-
-<hfoptions id="lcm-lora">
-<hfoption id="LCM">
-
-Load the LCM checkpoint for your supported model into [`UNet2DConditionModel`] and replace the scheduler with the [`LCMScheduler`]. Then you can use the [`~loaders.LoraLoaderMixin.load_lora_weights`] method to load the LoRA weights into the LCM and generate a styled image in a few steps.
+LCMs can be used with other styled LoRAs to generate styled images in very few steps (4-8). In the following example, we'll use the [papercut LoRA](https://huggingface.co/TheLastBen/Papercut_SDXL).

 ```python
 from diffusers import StableDiffusionXLPipeline, UNet2DConditionModel, LCMScheduler
@@ -277,9 +134,11 @@ pipe = StableDiffusionXLPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0", unet=unet, torch_dtype=torch.float16, variant="fp16",
 ).to("cuda")
 pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config)
+
 pipe.load_lora_weights("TheLastBen/Papercut_SDXL", weight_name="papercut.safetensors", adapter_name="papercut")

 prompt = "papercut, a cute fox"
+
 generator = torch.manual_seed(0)
 image = pipe(
    prompt=prompt, num_inference_steps=4, generator=generator, guidance_scale=8.0
@@ -287,58 +146,15 @@ image = pipe(
 image
 ```

-<div class="flex justify-center">
-    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/lcm/lcm_full_sdx_lora_mix.png"/>
-</div>
+![](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/lcm/lcm_full_sdx_lora_mix.png)

-</hfoption>
-<hfoption id="LCM-LoRA">

-Replace the scheduler with the [`LCMScheduler`]. Then you can use the [`~loaders.LoraLoaderMixin.load_lora_weights`] method to load the LCM-LoRA weights and the style LoRA you want to use. Combine both LoRA adapters with the [`~loaders.UNet2DConditionLoadersMixin.set_adapters`] method and generate a styled image in a few steps.
+## ControlNet/T2I-Adapter

-```py
-import torch
-from diffusers import DiffusionPipeline, LCMScheduler
-
-pipe = DiffusionPipeline.from_pretrained(
-    "stabilityai/stable-diffusion-xl-base-1.0",
-    variant="fp16",
-    torch_dtype=torch.float16
-).to("cuda")
-
-pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config)
-
-pipe.load_lora_weights("latent-consistency/lcm-lora-sdxl", adapter_name="lcm")
-pipe.load_lora_weights("TheLastBen/Papercut_SDXL", weight_name="papercut.safetensors", adapter_name="papercut")
-
-pipe.set_adapters(["lcm", "papercut"], adapter_weights=[1.0, 0.8])
-
-prompt = "papercut, a cute fox"
-generator = torch.manual_seed(0)
-image = pipe(prompt, num_inference_steps=4, guidance_scale=1, generator=generator).images[0]
-image
-```
-
-<div class="flex justify-center">
-    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/lcm/lcm_sdx_lora_mix.png"/>
-</div>
-
-</hfoption>
-</hfoptions>
+Let's look at how we can perform inference with ControlNet/T2I-Adapter and an LCM.

 ### ControlNet
-
-[ControlNet](./controlnet) are adapters that can be trained on a variety of inputs like canny edge, pose estimation, or depth. The ControlNet can be inserted into the pipeline to provide additional conditioning and control to the model for more accurate generation.
-
-You can find additional ControlNet models trained on other inputs in [lllyasviel's](https://hf.co/lllyasviel) repository.
-
-<hfoptions id="lcm-controlnet">
-<hfoption id="LCM">
-
-Load a ControlNet model trained on canny images and pass it to the [`ControlNetModel`]. Then you can load a LCM model into [`StableDiffusionControlNetPipeline`] and replace the scheduler with the [`LCMScheduler`]. Now pass the canny image to the pipeline and generate an image.
-
-> [!TIP]
-> Experiment with different values for `num_inference_steps`, `controlnet_conditioning_scale`, `cross_attention_kwargs`, and `guidance_scale` to get the best results.
+For this example, we'll use the [LCM_Dreamshaper_v7](https://huggingface.co/SimianLuo/LCM_Dreamshaper_v7) model with canny ControlNet, but the same steps can be applied to other LCM models as well.

 ```python
 import torch
@@ -370,6 +186,8 @@ pipe = StableDiffusionControlNetPipeline.from_pretrained(
    torch_dtype=torch.float16,
    safety_checker=None,
 ).to("cuda")
+
+# set scheduler
 pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config)

 generator = torch.manual_seed(0)
@@ -382,84 +200,16 @@ image = pipe(
 make_image_grid([canny_image, image], rows=1, cols=2)
 ```

-<div class="flex justify-center">
-    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/lcm/lcm_full_sdv1-5_controlnet.png"/>
-</div>
+![](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/lcm/lcm_full_sdv1-5_controlnet.png)

-</hfoption>
-<hfoption id="LCM-LoRA">

-Load a ControlNet model trained on canny images and pass it to the [`ControlNetModel`]. Then you can load a Stable Diffusion v1.5 model into [`StableDiffusionControlNetPipeline`] and replace the scheduler with the [`LCMScheduler`]. Use the [`~loaders.LoraLoaderMixin.load_lora_weights`] method to load the LCM-LoRA weights, and pass the canny image to the pipeline and generate an image.
-
-> [!TIP]
-> Experiment with different values for `num_inference_steps`, `controlnet_conditioning_scale`, `cross_attention_kwargs`, and `guidance_scale` to get the best results.
-
-```py
-import torch
-import cv2
-import numpy as np
-from PIL import Image
-
-from diffusers import StableDiffusionControlNetPipeline, ControlNetModel, LCMScheduler
-from diffusers.utils import load_image
-
-image = load_image(
-    "https://hf.co/datasets/huggingface/documentation-images/resolve/main/diffusers/input_image_vermeer.png"
-).resize((512, 512))
-
-image = np.array(image)
-
-low_threshold = 100
-high_threshold = 200
-
-image = cv2.Canny(image, low_threshold, high_threshold)
-image = image[:, :, None]
-image = np.concatenate([image, image, image], axis=2)
-canny_image = Image.fromarray(image)
-
-controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny", torch_dtype=torch.float16)
-pipe = StableDiffusionControlNetPipeline.from_pretrained(
-    "runwayml/stable-diffusion-v1-5",
-    controlnet=controlnet,
-    torch_dtype=torch.float16,
-    safety_checker=None,
-    variant="fp16"
-).to("cuda")
-
-pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config)
-
-pipe.load_lora_weights("latent-consistency/lcm-lora-sdv1-5")
-
-generator = torch.manual_seed(0)
-image = pipe(
-    "the mona lisa",
-    image=canny_image,
-    num_inference_steps=4,
-    guidance_scale=1.5,
-    controlnet_conditioning_scale=0.8,
-    cross_attention_kwargs={"scale": 1},
-    generator=generator,
-).images[0]
-image
-```
-
-<div class="flex justify-center">
-    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/lcm/lcm_sdv1-5_controlnet.png"/>
-</div>
-
-</hfoption>
-</hfoptions>
+<Tip>
+The inference parameters in this example might not work for all examples, so we recommend trying different values for the `num_inference_steps`, `guidance_scale`, `controlnet_conditioning_scale`, and `cross_attention_kwargs` parameters and choosing the best one. 
+</Tip>

 ### T2I-Adapter

-[T2I-Adapter](./t2i_adapter) is an even more lightweight adapter than ControlNet, that provides an additional input to condition a pretrained model with. It is faster than ControlNet but the results may be slightly worse.
-
-You can find additional T2I-Adapter checkpoints trained on other inputs in [TencentArc's](https://hf.co/TencentARC) repository.
-
-<hfoptions id="lcm-t2i">
-<hfoption id="LCM">
-
-Load a T2IAdapter trained on canny images and pass it to the [`StableDiffusionXLAdapterPipeline`]. Then load a LCM checkpoint into [`UNet2DConditionModel`] and replace the scheduler with the [`LCMScheduler`]. Now pass the canny image to the pipeline and generate an image.
+This example shows how to use the `lcm-sdxl` checkpoint with the [Canny T2I-Adapter](https://huggingface.co/TencentARC/t2i-adapter-canny-sdxl-1.0).

 ```python
 import torch
@@ -470,9 +220,10 @@ from PIL import Image
 from diffusers import StableDiffusionXLAdapterPipeline, UNet2DConditionModel, T2IAdapter, LCMScheduler
 from diffusers.utils import load_image, make_image_grid

-# detect the canny map in low resolution to avoid high-frequency details
+# Prepare image
+# Detect the canny map in low resolution to avoid high-frequency details
 image = load_image(
-    "https://hf.co/datasets/huggingface/documentation-images/resolve/main/diffusers/input_image_vermeer.png"
+    "https://huggingface.co/Adapter/t2iadapter/resolve/main/figs_SDXLV1.0/org_canny.jpg"
 ).resize((384, 384))

 image = np.array(image)
@@ -485,6 +236,7 @@ image = image[:, :, None]
 image = np.concatenate([image, image, image], axis=2)
 canny_image = Image.fromarray(image).resize((1024, 1216))

+# load adapter
 adapter = T2IAdapter.from_pretrained("TencentARC/t2i-adapter-canny-sdxl-1.0", torch_dtype=torch.float16, varient="fp16").to("cuda")

 unet = UNet2DConditionModel.from_pretrained(
@@ -502,7 +254,7 @@ pipe = StableDiffusionXLAdapterPipeline.from_pretrained(

 pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config)

-prompt = "the mona lisa, 4k picture, high quality"
+prompt = "Mystical fairy in real, magic, 4k picture, high quality"
 negative_prompt = "extra digit, fewer digits, cropped, worst quality, low quality, glitch, deformed, mutated, ugly, disfigured"

 generator = torch.manual_seed(0)
@@ -516,116 +268,7 @@ image = pipe(
    adapter_conditioning_factor=1,
    generator=generator,
 ).images[0]
+grid = make_image_grid([canny_image, image], rows=1, cols=2)
 ```

-<div class="flex justify-center">
-    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/lcm-t2i.png"/>
-</div>
-
-</hfoption>
-<hfoption id="LCM-LoRA">
-
-Load a T2IAdapter trained on canny images and pass it to the [`StableDiffusionXLAdapterPipeline`]. Replace the scheduler with the [`LCMScheduler`], and use the [`~loaders.LoraLoaderMixin.load_lora_weights`] method to load the LCM-LoRA weights. Pass the canny image to the pipeline and generate an image.
-
-```py
-import torch
-import cv2
-import numpy as np
-from PIL import Image
-
-from diffusers import StableDiffusionXLAdapterPipeline, UNet2DConditionModel, T2IAdapter, LCMScheduler
-from diffusers.utils import load_image, make_image_grid
-
-# detect the canny map in low resolution to avoid high-frequency details
-image = load_image(
-    "https://hf.co/datasets/huggingface/documentation-images/resolve/main/diffusers/input_image_vermeer.png"
-).resize((384, 384))
-
-image = np.array(image)
-
-low_threshold = 100
-high_threshold = 200
-
-image = cv2.Canny(image, low_threshold, high_threshold)
-image = image[:, :, None]
-image = np.concatenate([image, image, image], axis=2)
-canny_image = Image.fromarray(image).resize((1024, 1024))
-
-adapter = T2IAdapter.from_pretrained("TencentARC/t2i-adapter-canny-sdxl-1.0", torch_dtype=torch.float16, varient="fp16").to("cuda")
-
-pipe = StableDiffusionXLAdapterPipeline.from_pretrained(
-    "stabilityai/stable-diffusion-xl-base-1.0", 
-    adapter=adapter,
-    torch_dtype=torch.float16,
-    variant="fp16", 
-).to("cuda")
-
-pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config)
-
-pipe.load_lora_weights("latent-consistency/lcm-lora-sdxl")
-
-prompt = "the mona lisa, 4k picture, high quality"
-negative_prompt = "extra digit, fewer digits, cropped, worst quality, low quality, glitch, deformed, mutated, ugly, disfigured"
-
-generator = torch.manual_seed(0)
-image = pipe(
-    prompt=prompt,
-    negative_prompt=negative_prompt,
-    image=canny_image,
-    num_inference_steps=4,
-    guidance_scale=1.5, 
-    adapter_conditioning_scale=0.8, 
-    adapter_conditioning_factor=1,
-    generator=generator,
-).images[0]
-```
-
-<div class="flex justify-center">
-    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/lcm-lora-t2i.png"/>
-</div>
-
-</hfoption>
-</hfoptions>
-
-### AnimateDiff
-
-[AnimateDiff](../api/pipelines/animatediff) is an adapter that adds motion to an image. It can be used with most Stable Diffusion models, effectively turning them into "video generation" models. Generating good results with a video model usually requires generating multiple frames (16-24), which can be very slow with a regular Stable Diffusion model. LCM-LoRA can speed up this process by only taking 4-8 steps for each frame.
-
-Load a [`AnimateDiffPipeline`] and pass a [`MotionAdapter`] to it. Then replace the scheduler with the [`LCMScheduler`], and combine both LoRA adapters with the [`~loaders.UNet2DConditionLoadersMixin.set_adapters`] method. Now you can pass a prompt to the pipeline and generate an animated image.
-
-```py
-import torch
-from diffusers import MotionAdapter, AnimateDiffPipeline, DDIMScheduler, LCMScheduler
-from diffusers.utils import export_to_gif
-
-adapter = MotionAdapter.from_pretrained("guoyww/animatediff-motion-adapter-v1-5")
-pipe = AnimateDiffPipeline.from_pretrained(
-    "frankjoshua/toonyou_beta6",
-    motion_adapter=adapter,
-).to("cuda")
-
-# set scheduler
-pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config)
-
-# load LCM-LoRA
-pipe.load_lora_weights("latent-consistency/lcm-lora-sdv1-5", adapter_name="lcm")
-pipe.load_lora_weights("guoyww/animatediff-motion-lora-zoom-in", weight_name="diffusion_pytorch_model.safetensors", adapter_name="motion-lora")
-
-pipe.set_adapters(["lcm", "motion-lora"], adapter_weights=[0.55, 1.2])
-
-prompt = "best quality, masterpiece, 1girl, looking at viewer, blurry background, upper body, contemporary, dress"
-generator = torch.manual_seed(0)
-frames = pipe(
-    prompt=prompt,
-    num_inference_steps=5,
-    guidance_scale=1.25,
-    cross_attention_kwargs={"scale": 1},
-    num_frames=24,
-    generator=generator
-).frames[0]
-export_to_gif(frames, "animation.gif")
-```
-
-<div class="flex justify-center">
-    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/lcm-lora-animatediff.gif"/>
-</div>
+![](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/lcm/lcm_full_sdxl_t2iadapter.png)
--- a/docs/source/en/using-diffusers/inference_with_lcm_lora.md
+++ b/docs/source/en/using-diffusers/inference_with_lcm_lora.md
@@ -0,0 +1,422 @@
+<!--Copyright 2024 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+[[open-in-colab]]
+
+# Performing inference with LCM-LoRA
+
+Latent Consistency Models (LCM) enable quality image generation in typically 2-4 steps making it possible to use diffusion models in almost real-time settings. 
+
+From the [official website](https://latent-consistency-models.github.io/):
+
+> LCMs can be distilled from any pre-trained Stable Diffusion (SD) in only 4,000 training steps (~32 A100 GPU Hours) for generating high quality 768 x 768 resolution images in 2~4 steps or even one step, significantly accelerating text-to-image generation. We employ LCM to distill the Dreamshaper-V7 version of SD in just 4,000 training iterations.
+
+For a more technical overview of LCMs, refer to [the paper](https://huggingface.co/papers/2310.04378).
+
+However, each model needs to be distilled separately for latent consistency distillation. The core idea with LCM-LoRA is to train just a few adapter layers, the adapter being LoRA in this case. 
+This way, we don't have to train the full model and keep the number of trainable parameters manageable. The resulting LoRAs can then be applied to any fine-tuned version of the model without distilling them separately.
+Additionally, the LoRAs can be applied to image-to-image, ControlNet/T2I-Adapter, inpainting, AnimateDiff etc. 
+The LCM-LoRA can also be combined with other LoRAs to generate styled images in very few steps (4-8).
+
+LCM-LoRAs are available for [stable-diffusion-v1-5](https://huggingface.co/runwayml/stable-diffusion-v1-5), [stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0), and the [SSD-1B](https://huggingface.co/segmind/SSD-1B) model. All the checkpoints can be found in this [collection](https://huggingface.co/collections/latent-consistency/latent-consistency-models-loras-654cdd24e111e16f0865fba6).
+
+For more details about LCM-LoRA, refer to [the technical report](https://huggingface.co/papers/2311.05556).
+
+This guide shows how to perform inference with LCM-LoRAs for 
+- text-to-image
+- image-to-image
+- combined with styled LoRAs
+- ControlNet/T2I-Adapter
+- inpainting
+- AnimateDiff
+
+Before going through this guide, we'll take a look at the general workflow for performing inference with LCM-LoRAs.
+LCM-LoRAs are similar to other Stable Diffusion LoRAs so they can be used with any [`DiffusionPipeline`] that supports LoRAs.
+
+- Load the task specific pipeline and model.
+- Set the scheduler to [`LCMScheduler`].
+- Load the LCM-LoRA weights for the model.
+- Reduce the `guidance_scale` to a value in the range `[1.0, 2.0]` and set `num_inference_steps` to a value in the range `[4, 8]`.
+- Perform inference with the pipeline with the usual parameters.
+
+Let's look at how we can perform inference with LCM-LoRAs for different tasks.
+
+First, make sure you have [peft](https://github.com/huggingface/peft) installed, for better LoRA support.
+
+```bash
+pip install -U peft
+```
+
+## Text-to-image
+
+You'll use the [`StableDiffusionXLPipeline`] with the scheduler: [`LCMScheduler`] and then load the LCM-LoRA. Together with the LCM-LoRA and the scheduler, the pipeline enables a fast inference workflow overcoming the slow iterative nature of diffusion models.
+
+```python
+import torch
+from diffusers import DiffusionPipeline, LCMScheduler
+
+pipe = DiffusionPipeline.from_pretrained(
+    "stabilityai/stable-diffusion-xl-base-1.0",
+    variant="fp16",
+    torch_dtype=torch.float16
+).to("cuda")
+
+# set scheduler
+pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config)
+
+# load LCM-LoRA
+pipe.load_lora_weights("latent-consistency/lcm-lora-sdxl")
+
+prompt = "Self-portrait oil painting, a beautiful cyborg with golden hair, 8k"
+
+generator = torch.manual_seed(42)
+image = pipe(
+    prompt=prompt, num_inference_steps=4, generator=generator, guidance_scale=1.0
+).images[0]
+```
+
+![](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/lcm/lcm_sdxl_t2i.png)
+
+Notice that we use only 4 steps for generation which is way less than what's typically used for standard SDXL.
+
+<Tip>
+
+You may have noticed that we set `guidance_scale=1.0`, which disables classifier-free guidance. This is because the LCM-LoRA is trained with guidance, so the batch size does not have to be doubled in this case. This leads to a faster inference time, with the drawback that negative prompts don't have any effect on the denoising process.
+
+You can also use guidance with LCM-LoRA, but due to the nature of training, the model is very sensitive to the `guidance_scale` value; high values can lead to artifacts in the generated images. In our experiments, we found that the best values are in the range of [1.0, 2.0].
+
+</Tip>
+
+### Inference with a fine-tuned model
+
+As mentioned above, the LCM-LoRA can be applied to any fine-tuned version of the model without having to distill them separately. Let's look at how we can perform inference with a fine-tuned model. In this example, we'll use the [animagine-xl](https://huggingface.co/Linaqruf/animagine-xl) model, which is a fine-tuned version of the SDXL model for generating anime.
+
+```python
+from diffusers import DiffusionPipeline, LCMScheduler
+
+pipe = DiffusionPipeline.from_pretrained(
+    "Linaqruf/animagine-xl",
+    variant="fp16",
+    torch_dtype=torch.float16
+).to("cuda")
+
+# set scheduler
+pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config)
+
+# load LCM-LoRA
+pipe.load_lora_weights("latent-consistency/lcm-lora-sdxl")
+
+prompt = "face focus, cute, masterpiece, best quality, 1girl, green hair, sweater, looking at viewer, upper body, beanie, outdoors, night, turtleneck"
+
+generator = torch.manual_seed(0)
+image = pipe(
+    prompt=prompt, num_inference_steps=4, generator=generator, guidance_scale=1.0
+).images[0]
+```
+
+![](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/lcm/lcm_sdxl_t2i_finetuned.png)
+
+
+## Image-to-image
+
+LCM-LoRA can be applied to image-to-image tasks too. Let's look at how we can perform image-to-image generation with LCMs. For this example we'll use the [dreamshaper-7](https://huggingface.co/Lykon/dreamshaper-7) model and the LCM-LoRA for `stable-diffusion-v1-5`.
+
+```python
+import torch
+from diffusers import AutoPipelineForImage2Image, LCMScheduler
+from diffusers.utils import make_image_grid, load_image
+
+pipe = AutoPipelineForImage2Image.from_pretrained(
+    "Lykon/dreamshaper-7",
+    torch_dtype=torch.float16,
+    variant="fp16",
+).to("cuda")
+
+# set scheduler
+pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config)
+
+# load LCM-LoRA
+pipe.load_lora_weights("latent-consistency/lcm-lora-sdv1-5")
+
+# prepare image
+url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/img2img-init.png"
+init_image = load_image(url)
+prompt = "Astronauts in a jungle, cold color palette, muted colors, detailed, 8k"
+
+# pass prompt and image to pipeline
+generator = torch.manual_seed(0)
+image = pipe(
+    prompt,
+    image=init_image,
+    num_inference_steps=4,
+    guidance_scale=1,
+    strength=0.6,
+    generator=generator
+).images[0]
+make_image_grid([init_image, image], rows=1, cols=2)
+```
+
+![](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/lcm/lcm_sdv1-5_i2i.png)
+
+
+<Tip>
+
+You can get different results based on your prompt and the image you provide. To get the best results, we recommend trying different values for the `num_inference_steps`, `strength`, and `guidance_scale` parameters and choosing the best one.
+
+</Tip>
+
+
+## Combine with styled LoRAs
+
+LCM-LoRA can be combined with other LoRAs to generate styled images in very few steps (4-8). In the following example, we'll use the LCM-LoRA with the [papercut LoRA](https://huggingface.co/TheLastBen/Papercut_SDXL).
+To learn more about how to combine LoRAs, refer to [this guide](https://huggingface.co/docs/diffusers/tutorials/using_peft_for_inference#combine-multiple-adapters).
+
+```python
+import torch
+from diffusers import DiffusionPipeline, LCMScheduler
+
+pipe = DiffusionPipeline.from_pretrained(
+    "stabilityai/stable-diffusion-xl-base-1.0",
+    variant="fp16",
+    torch_dtype=torch.float16
+).to("cuda")
+
+# set scheduler
+pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config)
+
+# load LoRAs
+pipe.load_lora_weights("latent-consistency/lcm-lora-sdxl", adapter_name="lcm")
+pipe.load_lora_weights("TheLastBen/Papercut_SDXL", weight_name="papercut.safetensors", adapter_name="papercut")
+
+# Combine LoRAs
+pipe.set_adapters(["lcm", "papercut"], adapter_weights=[1.0, 0.8])
+
+prompt = "papercut, a cute fox"
+generator = torch.manual_seed(0)
+image = pipe(prompt, num_inference_steps=4, guidance_scale=1, generator=generator).images[0]
+image
+```
+
+![](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/lcm/lcm_sdx_lora_mix.png)
+
+
+## ControlNet/T2I-Adapter
+
+Let's look at how we can perform inference with ControlNet/T2I-Adapter and LCM-LoRA. 
+
+### ControlNet
+For this example, we'll use the SD-v1-5 model and the LCM-LoRA for SD-v1-5 with canny ControlNet.
+
+```python
+import torch
+import cv2
+import numpy as np
+from PIL import Image
+
+from diffusers import StableDiffusionControlNetPipeline, ControlNetModel, LCMScheduler
+from diffusers.utils import load_image, make_image_grid
+
+image = load_image(
+    "https://hf.co/datasets/huggingface/documentation-images/resolve/main/diffusers/input_image_vermeer.png"
+).resize((512, 512))
+
+image = np.array(image)
+
+low_threshold = 100
+high_threshold = 200
+
+image = cv2.Canny(image, low_threshold, high_threshold)
+image = image[:, :, None]
+image = np.concatenate([image, image, image], axis=2)
+canny_image = Image.fromarray(image)
+
+controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny", torch_dtype=torch.float16)
+pipe = StableDiffusionControlNetPipeline.from_pretrained(
+    "runwayml/stable-diffusion-v1-5",
+    controlnet=controlnet,
+    torch_dtype=torch.float16,
+    safety_checker=None,
+    variant="fp16"
+).to("cuda")
+
+# set scheduler
+pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config)
+
+# load LCM-LoRA
+pipe.load_lora_weights("latent-consistency/lcm-lora-sdv1-5")
+
+generator = torch.manual_seed(0)
+image = pipe(
+    "the mona lisa",
+    image=canny_image,
+    num_inference_steps=4,
+    guidance_scale=1.5,
+    controlnet_conditioning_scale=0.8,
+    cross_attention_kwargs={"scale": 1},
+    generator=generator,
+).images[0]
+make_image_grid([canny_image, image], rows=1, cols=2)
+```
+
+![](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/lcm/lcm_sdv1-5_controlnet.png)
+
+
+<Tip>
+The inference parameters in this example might not work for all examples, so we recommend trying different values for the `num_inference_steps`, `guidance_scale`, `controlnet_conditioning_scale`, and `cross_attention_kwargs` parameters and choosing the best one.
+</Tip>
+
+### T2I-Adapter
+
+This example shows how to use the LCM-LoRA with the [Canny T2I-Adapter](https://huggingface.co/TencentARC/t2i-adapter-canny-sdxl-1.0) and SDXL.
+
+```python
+import torch
+import cv2
+import numpy as np
+from PIL import Image
+
+from diffusers import StableDiffusionXLAdapterPipeline, T2IAdapter, LCMScheduler
+from diffusers.utils import load_image, make_image_grid
+
+# Prepare image
+# Detect the canny map in low resolution to avoid high-frequency details
+image = load_image(
+    "https://huggingface.co/Adapter/t2iadapter/resolve/main/figs_SDXLV1.0/org_canny.jpg"
+).resize((384, 384))
+
+image = np.array(image)
+
+low_threshold = 100
+high_threshold = 200
+
+image = cv2.Canny(image, low_threshold, high_threshold)
+image = image[:, :, None]
+image = np.concatenate([image, image, image], axis=2)
+canny_image = Image.fromarray(image).resize((1024, 1024))
+
+# load adapter
+adapter = T2IAdapter.from_pretrained("TencentARC/t2i-adapter-canny-sdxl-1.0", torch_dtype=torch.float16, variant="fp16").to("cuda")
+
+pipe = StableDiffusionXLAdapterPipeline.from_pretrained(
+    "stabilityai/stable-diffusion-xl-base-1.0", 
+    adapter=adapter,
+    torch_dtype=torch.float16,
+    variant="fp16", 
+).to("cuda")
+
+# set scheduler
+pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config)
+
+# load LCM-LoRA
+pipe.load_lora_weights("latent-consistency/lcm-lora-sdxl")
+
+prompt = "Mystical fairy in real, magic, 4k picture, high quality"
+negative_prompt = "extra digit, fewer digits, cropped, worst quality, low quality, glitch, deformed, mutated, ugly, disfigured"
+
+generator = torch.manual_seed(0)
+image = pipe(
+    prompt=prompt,
+    negative_prompt=negative_prompt,
+    image=canny_image,
+    num_inference_steps=4,
+    guidance_scale=1.5, 
+    adapter_conditioning_scale=0.8, 
+    adapter_conditioning_factor=1,
+    generator=generator,
+).images[0]
+make_image_grid([canny_image, image], rows=1, cols=2)
+```
+
+![](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/lcm/lcm_sdxl_t2iadapter.png)
+
+
+## Inpainting
+
+LCM-LoRA can be used for inpainting as well. 
+
+```python
+import torch
+from diffusers import AutoPipelineForInpainting, LCMScheduler
+from diffusers.utils import load_image, make_image_grid
+
+pipe = AutoPipelineForInpainting.from_pretrained(
+    "runwayml/stable-diffusion-inpainting",
+    torch_dtype=torch.float16,
+    variant="fp16",
+).to("cuda")
+
+# set scheduler
+pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config)
+
+# load LCM-LoRA
+pipe.load_lora_weights("latent-consistency/lcm-lora-sdv1-5")
+
+# load base and mask image
+init_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint.png")
+mask_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint_mask.png")
+
+# generator = torch.Generator("cuda").manual_seed(92)
+prompt = "concept art digital painting of an elven castle, inspired by lord of the rings, highly detailed, 8k"
+generator = torch.manual_seed(0)
+image = pipe(
+    prompt=prompt,
+    image=init_image,
+    mask_image=mask_image,
+    generator=generator,
+    num_inference_steps=4,
+    guidance_scale=4, 
+).images[0]
+make_image_grid([init_image, mask_image, image], rows=1, cols=3)
+```
+
+![](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/lcm/lcm_sdv1-5_inpainting.png)
+
+
+## AnimateDiff
+
+[AnimateDiff](../api/pipelines/animatediff) allows you to animate images using Stable Diffusion models. To get good results, we need to generate multiple frames (16-24), and doing this with standard SD models can be very slow.
+LCM-LoRA can be used to speed up the process significantly, as you just need to do 4-8 steps for each frame. Let's look at how we can perform animation with LCM-LoRA and AnimateDiff.
+
+```python
+import torch
+from diffusers import MotionAdapter, AnimateDiffPipeline, DDIMScheduler, LCMScheduler
+from diffusers.utils import export_to_gif
+
+adapter = MotionAdapter.from_pretrained("diffusers/animatediff-motion-adapter-v1-5")
+pipe = AnimateDiffPipeline.from_pretrained(
+    "frankjoshua/toonyou_beta6",
+    motion_adapter=adapter,
+).to("cuda")
+
+# set scheduler
+pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config)
+
+# load LCM-LoRA
+pipe.load_lora_weights("latent-consistency/lcm-lora-sdv1-5", adapter_name="lcm")
+pipe.load_lora_weights("guoyww/animatediff-motion-lora-zoom-in", weight_name="diffusion_pytorch_model.safetensors", adapter_name="motion-lora")
+
+pipe.set_adapters(["lcm", "motion-lora"], adapter_weights=[0.55, 1.2])
+
+prompt = "best quality, masterpiece, 1girl, looking at viewer, blurry background, upper body, contemporary, dress"
+generator = torch.manual_seed(0)
+frames = pipe(
+    prompt=prompt,
+    num_inference_steps=5,
+    guidance_scale=1.25,
+    cross_attention_kwargs={"scale": 1},
+    num_frames=24,
+    generator=generator
+).frames[0]
+export_to_gif(frames, "animation.gif")
+```
+
+![](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/lcm/lcm_sdv1-5_animatediff.gif)
--- a/docs/source/en/using-diffusers/inference_with_tcd_lora.md
+++ b/docs/source/en/using-diffusers/inference_with_tcd_lora.md
@@ -78,7 +78,7 @@ image = pipe(
    prompt=prompt,
    num_inference_steps=4,
    guidance_scale=0,
-    eta=0.3,
+    eta=0.3, 
    generator=torch.Generator(device=device).manual_seed(0),
 ).images[0]
 ```
@@ -156,14 +156,14 @@ image = pipe(
    prompt=prompt,
    num_inference_steps=8,
    guidance_scale=0,
-    eta=0.3,
+    eta=0.3, 
    generator=torch.Generator(device=device).manual_seed(0),
 ).images[0]
 ```

 ![](https://github.com/jabir-zheng/TCD/raw/main/assets/animagine_xl.png)

-TCD-LoRA also supports other LoRAs trained on different styles. For example, let's load the [TheLastBen/Papercut_SDXL](https://huggingface.co/TheLastBen/Papercut_SDXL) LoRA and fuse it with the TCD-LoRA with the [`~loaders.UNet2DConditionLoadersMixin.set_adapters`] method.
+TCD-LoRA also supports other LoRAs trained on different styles. For example, let's load the [TheLastBen/Papercut_SDXL](https://huggingface.co/TheLastBen/Papercut_SDXL) LoRA and fuse it with the TCD-LoRA with the [`~loaders.UNet2DConditionLoadersMixin.set_adapters`] method. 

 > [!TIP]
 > Check out the [Merge LoRAs](merge_loras) guide to learn more about efficient merging methods.
@@ -171,7 +171,7 @@ TCD-LoRA also supports other LoRAs trained on different styles. For example, let
 ```python
 import torch
 from diffusers import StableDiffusionXLPipeline
-from scheduling_tcd import TCDScheduler
+from scheduling_tcd import TCDScheduler 

 device = "cuda"
 base_model_id = "stabilityai/stable-diffusion-xl-base-1.0"
@@ -191,7 +191,7 @@ image = pipe(
    prompt=prompt,
    num_inference_steps=4,
    guidance_scale=0,
-    eta=0.3,
+    eta=0.3, 
    generator=torch.Generator(device=device).manual_seed(0),
 ).images[0]
 ```
@@ -215,7 +215,7 @@ from PIL import Image
 from transformers import DPTFeatureExtractor, DPTForDepthEstimation
 from diffusers import ControlNetModel, StableDiffusionXLControlNetPipeline
 from diffusers.utils import load_image, make_image_grid
-from scheduling_tcd import TCDScheduler
+from scheduling_tcd import TCDScheduler 

 device = "cuda"
 depth_estimator = DPTForDepthEstimation.from_pretrained("Intel/dpt-hybrid-midas").to(device)
@@ -249,13 +249,13 @@ controlnet = ControlNetModel.from_pretrained(
    controlnet_id,
    torch_dtype=torch.float16,
    variant="fp16",
-)
+).to(device)
 pipe = StableDiffusionXLControlNetPipeline.from_pretrained(
    base_model_id,
    controlnet=controlnet,
    torch_dtype=torch.float16,
    variant="fp16",
-)
+).to(device)
 pipe.enable_model_cpu_offload()

 pipe.scheduler = TCDScheduler.from_config(pipe.scheduler.config)
@@ -271,9 +271,9 @@ depth_image = get_depth_map(image)
 controlnet_conditioning_scale = 0.5  # recommended for good generalization

 image = pipe(
-    prompt,
-    image=depth_image,
-    num_inference_steps=4,
+    prompt, 
+    image=depth_image, 
+    num_inference_steps=4, 
    guidance_scale=0,
    eta=0.3,
    controlnet_conditioning_scale=controlnet_conditioning_scale,
@@ -290,7 +290,7 @@ grid_image = make_image_grid([depth_image, image], rows=1, cols=2)
 import torch
 from diffusers import ControlNetModel, StableDiffusionXLControlNetPipeline
 from diffusers.utils import load_image, make_image_grid
-from scheduling_tcd import TCDScheduler
+from scheduling_tcd import TCDScheduler 

 device = "cuda"
 base_model_id = "stabilityai/stable-diffusion-xl-base-1.0"
@@ -301,13 +301,13 @@ controlnet = ControlNetModel.from_pretrained(
    controlnet_id,
    torch_dtype=torch.float16,
    variant="fp16",
-)
+).to(device)
 pipe = StableDiffusionXLControlNetPipeline.from_pretrained(
    base_model_id,
    controlnet=controlnet,
    torch_dtype=torch.float16,
    variant="fp16",
-)
+).to(device)
 pipe.enable_model_cpu_offload()

 pipe.scheduler = TCDScheduler.from_config(pipe.scheduler.config)
@@ -322,9 +322,9 @@ canny_image = load_image("https://huggingface.co/datasets/hf-internal-testing/di
 controlnet_conditioning_scale = 0.5  # recommended for good generalization

 image = pipe(
-    prompt,
-    image=canny_image,
-    num_inference_steps=4,
+    prompt, 
+    image=canny_image, 
+    num_inference_steps=4, 
    guidance_scale=0,
    eta=0.3,
    controlnet_conditioning_scale=controlnet_conditioning_scale,
@@ -336,7 +336,7 @@ grid_image = make_image_grid([canny_image, image], rows=1, cols=2)
 ![](https://github.com/jabir-zheng/TCD/raw/main/assets/controlnet_canny_tcd.png)

 <Tip>
-The inference parameters in this example might not work for all examples, so we recommend you to try different values for `num_inference_steps`, `guidance_scale`, `controlnet_conditioning_scale` and `cross_attention_kwargs` parameters and choose the best one.
+The inference parameters in this example might not work for all examples, so we recommend you to try different values for `num_inference_steps`, `guidance_scale`, `controlnet_conditioning_scale` and `cross_attention_kwargs` parameters and choose the best one. 
 </Tip>

 </hfoption>
@@ -350,7 +350,7 @@ from diffusers import StableDiffusionXLPipeline
 from diffusers.utils import load_image, make_image_grid

 from ip_adapter import IPAdapterXL
-from scheduling_tcd import TCDScheduler
+from scheduling_tcd import TCDScheduler 

 device = "cuda"
 base_model_path = "stabilityai/stable-diffusion-xl-base-1.0"
@@ -359,8 +359,8 @@ ip_ckpt = "sdxl_models/ip-adapter_sdxl.bin"
 tcd_lora_id = "h1t/TCD-SDXL-LoRA"

 pipe = StableDiffusionXLPipeline.from_pretrained(
-    base_model_path,
-    torch_dtype=torch.float16,
+    base_model_path, 
+    torch_dtype=torch.float16, 
    variant="fp16"
 )
 pipe.scheduler = TCDScheduler.from_config(pipe.scheduler.config)
@@ -375,13 +375,13 @@ ref_image = load_image("https://raw.githubusercontent.com/tencent-ailab/IP-Adapt
 prompt = "best quality, high quality, wearing sunglasses"

 image = ip_model.generate(
-    pil_image=ref_image,
+    pil_image=ref_image, 
    prompt=prompt,
    scale=0.5,
-    num_samples=1,
-    num_inference_steps=4,
+    num_samples=1, 
+    num_inference_steps=4, 
    guidance_scale=0,
-    eta=0.3,
+    eta=0.3, 
    seed=0,
 )[0]

--- a/docs/source/en/using-diffusers/inpaint.md
+++ b/docs/source/en/using-diffusers/inpaint.md
@@ -230,7 +230,7 @@ from diffusers.utils import load_image, make_image_grid

 pipeline = AutoPipelineForInpainting.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16, variant="fp16"
-)
+).to("cuda")
 pipeline.enable_model_cpu_offload()
 # remove following line if xFormers is not installed or you have PyTorch 2.0 or higher installed
 pipeline.enable_xformers_memory_efficient_attention()
@@ -255,7 +255,7 @@ from diffusers.utils import load_image, make_image_grid

 pipeline = AutoPipelineForInpainting.from_pretrained(
    "runwayml/stable-diffusion-inpainting", torch_dtype=torch.float16, variant="fp16"
-)
+).to("cuda")
 pipeline.enable_model_cpu_offload()
 # remove following line if xFormers is not installed or you have PyTorch 2.0 or higher installed
 pipeline.enable_xformers_memory_efficient_attention()
@@ -296,7 +296,7 @@ from diffusers.utils import load_image, make_image_grid

 pipeline = AutoPipelineForInpainting.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16, variant="fp16"
-)
+).to("cuda")
 pipeline.enable_model_cpu_offload()
 # remove following line if xFormers is not installed or you have PyTorch 2.0 or higher installed
 pipeline.enable_xformers_memory_efficient_attention()
@@ -319,7 +319,7 @@ from diffusers.utils import load_image, make_image_grid

 pipeline = AutoPipelineForInpainting.from_pretrained(
    "runwayml/stable-diffusion-inpainting", torch_dtype=torch.float16, variant="fp16"
-)
+).to("cuda")
 pipeline.enable_model_cpu_offload()
 # remove following line if xFormers is not installed or you have PyTorch 2.0 or higher installed
 pipeline.enable_xformers_memory_efficient_attention()
--- a/docs/source/en/using-diffusers/ip_adapter.md
+++ b/docs/source/en/using-diffusers/ip_adapter.md
@@ -277,7 +277,7 @@ images = pipeline(

 ### IP-Adapter masking

-Binary masks specify which portion of the output image should be assigned to an IP-Adapter. This is useful for composing more than one IP-Adapter image. For each input IP-Adapter image, you must provide a binary mask.
+Binary masks specify which portion of the output image should be assigned to an IP-Adapter. This is useful for composing more than one IP-Adapter image. For each input IP-Adapter image, you must provide a binary mask and an IP-Adapter.

 To start, preprocess the input IP-Adapter images with the [`~image_processor.IPAdapterMaskProcessor.preprocess()`] to generate their masks. For optimal results, provide the output height and width to [`~image_processor.IPAdapterMaskProcessor.preprocess()`]. This ensures masks with different aspect ratios are appropriately stretched. If the input masks already match the aspect ratio of the generated image, you don't have to set the `height` and `width`.

@@ -305,18 +305,13 @@ masks = processor.preprocess([mask1, mask2], height=output_height, width=output_
  </div>
 </div>

-When there is more than one input IP-Adapter image, load them as a list and provide the IP-Adapter scale list. Each of the input IP-Adapter images here corresponds to one of the masks generated above.
+When there is more than one input IP-Adapter image, load them as a list to ensure each image is assigned to a different IP-Adapter. Each of the input IP-Adapter images here corresponds to one of the masks generated above.

 ```py
-pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="sdxl_models", weight_name=["ip-adapter-plus-face_sdxl_vit-h.safetensors"])
-pipeline.set_ip_adapter_scale([[0.7, 0.7]])  # one scale for each image-mask pair
-
 face_image1 = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/ip_mask_girl1.png")
 face_image2 = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/ip_mask_girl2.png")

-ip_images = [[face_image1, face_image2]]
-
-masks = [masks.reshape(1, masks.shape[0], masks.shape[2], masks.shape[3])]
+ip_images = [[face_image1], [face_image2]]
 ```

 <div class="flex flex-row gap-4">
@@ -333,6 +328,8 @@ masks = [masks.reshape(1, masks.shape[0], masks.shape[2], masks.shape[3])]
 Now pass the preprocessed masks to `cross_attention_kwargs` in the pipeline call.

 ```py
+pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="sdxl_models", weight_name=["ip-adapter-plus-face_sdxl_vit-h.safetensors"] * 2)
+pipeline.set_ip_adapter_scale([0.7] * 2)
 generator = torch.Generator(device="cpu").manual_seed(0)
 num_images = 1

@@ -439,7 +436,7 @@ image = torch.from_numpy(faces[0].normed_embedding)
 ref_images_embeds.append(image.unsqueeze(0))
 ref_images_embeds = torch.stack(ref_images_embeds, dim=0).unsqueeze(0)
 neg_ref_images_embeds = torch.zeros_like(ref_images_embeds)
-id_embeds = torch.cat([neg_ref_images_embeds, ref_images_embeds]).to(dtype=torch.float16, device="cuda")
+id_embeds = torch.cat([neg_ref_images_embeds, ref_images_embeds]).to(dtype=torch.float16, device="cuda")

 generator = torch.Generator(device="cpu").manual_seed(42)

@@ -455,28 +452,13 @@ images = pipeline(
 Both IP-Adapter FaceID Plus and Plus v2 models require CLIP image embeddings. You can prepare face embeddings as shown previously, then you can extract and pass CLIP embeddings to the hidden image projection layers.

 ```py
-from insightface.utils import face_align
-
-ref_images_embeds = []
-ip_adapter_images = []
-app = FaceAnalysis(name="buffalo_l", providers=['CUDAExecutionProvider', 'CPUExecutionProvider'])
-app.prepare(ctx_id=0, det_size=(640, 640))
-image = cv2.cvtColor(np.asarray(image), cv2.COLOR_BGR2RGB)
-faces = app.get(image)
-ip_adapter_images.append(face_align.norm_crop(image, landmark=faces[0].kps, image_size=224))
-image = torch.from_numpy(faces[0].normed_embedding)
-ref_images_embeds.append(image.unsqueeze(0))
-ref_images_embeds = torch.stack(ref_images_embeds, dim=0).unsqueeze(0)
-neg_ref_images_embeds = torch.zeros_like(ref_images_embeds)
-id_embeds = torch.cat([neg_ref_images_embeds, ref_images_embeds]).to(dtype=torch.float16, device="cuda")
-
-clip_embeds = pipeline.prepare_ip_adapter_image_embeds(
-  [ip_adapter_images], None, torch.device("cuda"), num_images, True)[0]
+clip_embeds = pipeline.prepare_ip_adapter_image_embeds([ip_adapter_images], None, torch.device("cuda"), num_images, True)[0]

 pipeline.unet.encoder_hid_proj.image_projection_layers[0].clip_embeds = clip_embeds.to(dtype=torch.float16)
 pipeline.unet.encoder_hid_proj.image_projection_layers[0].shortcut = False # True if Plus v2
 ```

+
 ### Multi IP-Adapter

 More than one IP-Adapter can be used at the same time to generate specific images in more diverse styles. For example, you can use IP-Adapter-Face to generate consistent faces and characters, and IP-Adapter Plus to generate those faces in a specific style.
@@ -661,16 +643,16 @@ image

 ### Style & layout control

-[InstantStyle](https://arxiv.org/abs/2404.02733) is a plug-and-play method on top of IP-Adapter, which disentangles style and layout from image prompt to control image generation. This way, you can generate images following only the style or layout from image prompt, with significantly improved diversity. This is achieved by only activating IP-Adapters to specific parts of the model. 
+[InstantStyle](https://arxiv.org/abs/2404.02733) is a plug-and-play method on top of IP-Adapter, which disentangles style and layout from the image prompt to control image generation. This is achieved by only inserting IP-Adapters into specific parts of the model.

 By default IP-Adapters are inserted to all layers of the model. Use the [`~loaders.IPAdapterMixin.set_ip_adapter_scale`] method with a dictionary to assign scales to IP-Adapter at different layers.

 ```py
-from diffusers import AutoPipelineForText2Image
+from diffusers import AutoPipelineForImage2Image
 from diffusers.utils import load_image
 import torch

-pipeline = AutoPipelineForText2Image.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16).to("cuda")
+pipeline = AutoPipelineForImage2Image.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16).to("cuda")
 pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="sdxl_models", weight_name="ip-adapter_sdxl.bin")

 scale = {
@@ -680,15 +662,15 @@ scale = {
 pipeline.set_ip_adapter_scale(scale)
 ```

-This will activate IP-Adapter at the second layer in the model's down-part block 2 and up-part block 0. The former is the layer where IP-Adapter injects layout information and the latter injects style. Inserting IP-Adapter to these two layers you can generate images following both the style and layout from image prompt, but with contents more aligned to text prompt.
+This will activate IP-Adapter at the second layer in the model's down-part block 2 and up-part block 0. The former is the layer where IP-Adapter injects layout information and the latter injects style. By inserting IP-Adapter into these two layers, you can generate images that follow the style and layout of the image prompt, but with content more aligned to the text prompt.

 ```py
 style_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg")

-generator = torch.Generator(device="cpu").manual_seed(26)
+generator = torch.Generator(device="cpu").manual_seed(42)
 image = pipeline(
    prompt="a cat, masterpiece, best quality, high quality",
-    ip_adapter_image=style_image,
+    image=style_image,
    negative_prompt="text, watermark, lowres, low quality, worst quality, deformed, glitch, low contrast, noisy, saturation, blurry",
    guidance_scale=5,
    num_inference_steps=30,
@@ -703,7 +685,7 @@ image
    <figcaption class="mt-2 text-center text-sm text-gray-500">IP-Adapter image</figcaption>
  </div>
  <div class="flex-1">
-    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/cat_style_layout.png"/>
+    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit_style_layout_cat.png"/>
    <figcaption class="mt-2 text-center text-sm text-gray-500">generated image</figcaption>
  </div>
 </div>
@@ -718,10 +700,10 @@ scale = {
 }
 pipeline.set_ip_adapter_scale(scale)

-generator = torch.Generator(device="cpu").manual_seed(26)
+generator = torch.Generator(device="cpu").manual_seed(42)
 image = pipeline(
    prompt="a cat, masterpiece, best quality, high quality",
-    ip_adapter_image=style_image,
+    image=style_image,
    negative_prompt="text, watermark, lowres, low quality, worst quality, deformed, glitch, low contrast, noisy, saturation, blurry",
    guidance_scale=5,
    num_inference_steps=30,
@@ -732,11 +714,11 @@ image

 <div class="flex flex-row gap-4">
  <div class="flex-1">
-    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/cat_style_only.png"/>
+    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit_style_cat.png"/>
    <figcaption class="mt-2 text-center text-sm text-gray-500">IP-Adapter only in style layer</figcaption>
  </div>
  <div class="flex-1">
-    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/cat_ip_adapter.png"/>
+    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/30518dfe089e6bf50008875077b44cb98fb2065c/diffusers/default_out.png"/>
    <figcaption class="mt-2 text-center text-sm text-gray-500">IP-Adapter in all layers</figcaption>
  </div>
 </div>
--- a/docs/source/en/using-diffusers/marigold_usage.md
+++ b/docs/source/en/using-diffusers/marigold_usage.md
@@ -1,399 +0,0 @@
-<!--Copyright 2024 Marigold authors and The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-->
-
-# Marigold Pipelines for Computer Vision Tasks
-
-[Marigold](marigold) is a novel diffusion-based dense prediction approach, and a set of pipelines for various computer vision tasks, such as monocular depth estimation.
-
-This guide will show you how to use Marigold to obtain fast and high-quality predictions for images and videos. 
-
-Each pipeline supports one Computer Vision task, which takes an input RGB image as input and produces a *prediction* of the modality of interest, such as a depth map of the input image. 
-Currently, the following tasks are implemented:
-
-| Pipeline                                                                                                                                    | Predicted Modalities                                                                                             |                                                                       Demos                                                                        |
-|---------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------|:--------------------------------------------------------------------------------------------------------------------------------------------------:|
-| [MarigoldDepthPipeline](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/marigold/pipeline_marigold_depth.py)     | [Depth](https://en.wikipedia.org/wiki/Depth_map), [Disparity](https://en.wikipedia.org/wiki/Binocular_disparity) | [Fast Demo (LCM)](https://huggingface.co/spaces/prs-eth/marigold-lcm), [Slow Original Demo (DDIM)](https://huggingface.co/spaces/prs-eth/marigold) |
-| [MarigoldNormalsPipeline](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/marigold/pipeline_marigold_normals.py) | [Surface normals](https://en.wikipedia.org/wiki/Normal_mapping)                                                  |                                   [Fast Demo (LCM)](https://huggingface.co/spaces/prs-eth/marigold-normals-lcm)                                    |
-
-The original checkpoints can be found under the [PRS-ETH](https://huggingface.co/prs-eth/) Hugging Face organization. 
-These checkpoints are meant to work with diffusers pipelines and the [original codebase](https://github.com/prs-eth/marigold). 
-The original code can also be used to train new checkpoints.
-
-| Checkpoint                                                                                    | Modality | Comment                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                |
-|-----------------------------------------------------------------------------------------------|----------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
-| [prs-eth/marigold-v1-0](https://huggingface.co/prs-eth/marigold-v1-0)                         | Depth    | The first Marigold Depth checkpoint, which predicts *affine-invariant depth* maps. The performance of this checkpoint in benchmarks was studied in the original [paper](https://huggingface.co/papers/2312.02145). Designed to be used with the `DDIMScheduler` at inference, it requires at least 10 steps to get reliable predictions. Affine-invariant depth prediction has a range of values in each pixel between 0 (near plane) and 1 (far plane); both planes are chosen by the model as part of the inference process. See the `MarigoldImageProcessor` reference for visualization utilities. |
-| [prs-eth/marigold-lcm-v1-0](https://huggingface.co/prs-eth/marigold-lcm-v1-0)                 | Depth    | The fast Marigold Depth checkpoint, fine-tuned from `prs-eth/marigold-v1-0`. Designed to be used with the `LCMScheduler` at inference, it requires as little as 1 step to get reliable predictions. The prediction reliability saturates at 4 steps and declines after that.                                                                                                                                                                                                                                                                                                                           |
-| [prs-eth/marigold-normals-v0-1](https://huggingface.co/prs-eth/marigold-normals-v0-1)         | Normals  | A preview checkpoint for the Marigold Normals pipeline. Designed to be used with the `DDIMScheduler` at inference, it requires at least 10 steps to get reliable predictions. The surface normals predictions are unit-length 3D vectors with values in the range from -1 to 1. *This checkpoint will be phased out after the release of `v1-0` version.*                                                                                                                                                                                                                                              |
-| [prs-eth/marigold-normals-lcm-v0-1](https://huggingface.co/prs-eth/marigold-normals-lcm-v0-1) | Normals  | The fast Marigold Normals checkpoint, fine-tuned from `prs-eth/marigold-normals-v0-1`. Designed to be used with the `LCMScheduler` at inference, it requires as little as 1 step to get reliable predictions. The prediction reliability saturates at 4 steps and declines after that. *This checkpoint will be phased out after the release of `v1-0` version.*                                                                                                                                                                                                                                       |
-The examples below are mostly given for depth prediction, but they can be universally applied with other supported modalities. 
-We showcase the predictions using the same input image of Albert Einstein generated by Midjourney. 
-This makes it easier to compare visualizations of the predictions across various modalities and checkpoints.
-
-<div class="flex gap-4" style="justify-content: center; width: 100%;">
-  <div style="flex: 1 1 50%; max-width: 50%;">
-    <img class="rounded-xl" src="https://marigoldmonodepth.github.io/images/einstein.jpg"/>
-    <figcaption class="mt-1 text-center text-sm text-gray-500">
-		Example input image for all Marigold pipelines
-	</figcaption>
-  </div>
-</div>
-
-### Depth Prediction Quick Start
-
-To get the first depth prediction, load `prs-eth/marigold-depth-lcm-v1-0` checkpoint into `MarigoldDepthPipeline` pipeline, put the image through the pipeline, and save the predictions: 
-
-```python
-import diffusers
-import torch
-
-pipe = diffusers.MarigoldDepthPipeline.from_pretrained(
-    "prs-eth/marigold-depth-lcm-v1-0", variant="fp16", torch_dtype=torch.float16
-).to("cuda")
-
-image = diffusers.utils.load_image("https://marigoldmonodepth.github.io/images/einstein.jpg")
-depth = pipe(image)
-
-vis = pipe.image_processor.visualize_depth(depth.prediction)
-vis[0].save("einstein_depth.png")
-
-depth_16bit = pipe.image_processor.export_depth_to_16bit_png(depth.prediction)
-depth_16bit[0].save("einstein_depth_16bit.png")
-```
-
-The visualization function for depth [`~pipelines.marigold.marigold_image_processing.MarigoldImageProcessor.visualize_depth`] applies one of [matplotlib's colormaps](https://matplotlib.org/stable/users/explain/colors/colormaps.html) (`Spectral` by default) to map the predicted pixel values from a single-channel `[0, 1]` depth range into an RGB image. 
-With the `Spectral` colormap, pixels with near depth are painted red, and far pixels are assigned blue color.
-The 16-bit PNG file stores the single channel values mapped linearly from the `[0, 1]` range into `[0, 65535]`.
-Below are the raw and the visualized predictions; as can be seen, dark areas (mustache) are easier to distinguish in the visualization:
-
-<div class="flex gap-4">
-  <div style="flex: 1 1 50%; max-width: 50%;">
-    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/6838ae9b9148cfe22ce9bb4c0ab0907c757c4010/marigold/marigold_einstein_lcm_depth_16bit.png"/>
-    <figcaption class="mt-1 text-center text-sm text-gray-500">
-		Predicted depth (16-bit PNG)
-	</figcaption>
-  </div>
-  <div style="flex: 1 1 50%; max-width: 50%;">
-    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/6838ae9b9148cfe22ce9bb4c0ab0907c757c4010/marigold/marigold_einstein_lcm_depth.png"/>
-    <figcaption class="mt-1 text-center text-sm text-gray-500">
-		Predicted depth visualization (Spectral)
-	</figcaption>
-  </div>
-</div>
-
-### Surface Normals Prediction Quick Start
-
-Load `prs-eth/marigold-normals-lcm-v0-1` checkpoint into `MarigoldNormalsPipeline` pipeline, put the image through the pipeline, and save the predictions: 
-
-```python
-import diffusers
-import torch
-
-pipe = diffusers.MarigoldNormalsPipeline.from_pretrained(
-    "prs-eth/marigold-normals-lcm-v0-1", variant="fp16", torch_dtype=torch.float16
-).to("cuda")
-
-image = diffusers.utils.load_image("https://marigoldmonodepth.github.io/images/einstein.jpg")
-normals = pipe(image)
-
-vis = pipe.image_processor.visualize_normals(normals.prediction)
-vis[0].save("einstein_normals.png")
-```
-
-The visualization function for normals [`~pipelines.marigold.marigold_image_processing.MarigoldImageProcessor.visualize_normals`] maps the three-dimensional prediction with pixel values in the range `[-1, 1]` into an RGB image. 
-The visualization function supports flipping surface normals axes to make the visualization compatible with other choices of the frame of reference. 
-Conceptually, each pixel is painted according to the surface normal vector in the frame of reference, where `X` axis points right, `Y` axis points up, and `Z` axis points at the viewer.
-Below is the visualized prediction:
-
-<div class="flex gap-4" style="justify-content: center; width: 100%;">
-  <div style="flex: 1 1 50%; max-width: 50%;">
-    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/6838ae9b9148cfe22ce9bb4c0ab0907c757c4010/marigold/marigold_einstein_lcm_normals.png"/>
-    <figcaption class="mt-1 text-center text-sm text-gray-500">
-		Predicted surface normals visualization
-	</figcaption>
-  </div>
-</div>
-
-In this example, the nose tip almost certainly has a point on the surface, in which the surface normal vector points straight at the viewer, meaning that its coordinates are `[0, 0, 1]`.
-This vector maps to the RGB `[128, 128, 255]`, which corresponds to the violet-blue color.
-Similarly, a surface normal on the cheek in the right part of the image has a large `X` component, which increases the red hue. 
-Points on the shoulders pointing up with a large `Y` promote green color. 
-
-### Speeding up inference
-
-The above quick start snippets are already optimized for speed: they load the LCM checkpoint, use the `fp16` variant of weights and computation, and perform just one denoising diffusion step. 
-The `pipe(image)` call completes in 280ms on RTX 3090 GPU.
-Internally, the input image is encoded with the Stable Diffusion VAE encoder, then the U-Net performs one denoising step, and finally, the prediction latent is decoded with the VAE decoder into pixel space.
-In this case, two out of three module calls are dedicated to converting between pixel and latent space of LDM.
-Because Marigold's latent space is compatible with the base Stable Diffusion, it is possible to speed up the pipeline call by more than 3x (85ms on RTX 3090) by using a [lightweight replacement of the SD VAE](autoencoder_tiny):
-
-```diff
-  import diffusers
-  import torch
-  
-  pipe = diffusers.MarigoldDepthPipeline.from_pretrained(
-      "prs-eth/marigold-depth-lcm-v1-0", variant="fp16", torch_dtype=torch.float16
-  ).to("cuda")
-  
-+ pipe.vae = diffusers.AutoencoderTiny.from_pretrained(
-+ 	"madebyollin/taesd", torch_dtype=torch.float16
-+ ).cuda()
-  
-  image = diffusers.utils.load_image("https://marigoldmonodepth.github.io/images/einstein.jpg")
-  depth = pipe(image)
-```
-
-As suggested in [Optimizations](torch2.0), adding `torch.compile` may squeeze extra performance depending on the target hardware:
-
-```diff
-  import diffusers
-  import torch
-  
-  pipe = diffusers.MarigoldDepthPipeline.from_pretrained(
-      "prs-eth/marigold-depth-lcm-v1-0", variant="fp16", torch_dtype=torch.float16
-  ).to("cuda")
-  
-+ pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)
-  
-  image = diffusers.utils.load_image("https://marigoldmonodepth.github.io/images/einstein.jpg")
-  depth = pipe(image)
-```
-
-## Qualitative Comparison with Depth Anything
-
-With the above speed optimizations, Marigold delivers predictions with more details and faster than [Depth Anything](https://huggingface.co/docs/transformers/main/en/model_doc/depth_anything) with the largest checkpoint [LiheYoung/depth-anything-large-hf](https://huggingface.co/LiheYoung/depth-anything-large-hf):
-
-<div class="flex gap-4">
-  <div style="flex: 1 1 50%; max-width: 50%;">
-    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/6838ae9b9148cfe22ce9bb4c0ab0907c757c4010/marigold/marigold_einstein_lcm_depth.png"/>
-    <figcaption class="mt-1 text-center text-sm text-gray-500">
-		Marigold LCM fp16 with Tiny AutoEncoder
-	</figcaption>
-  </div>
-  <div style="flex: 1 1 50%; max-width: 50%;">
-    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/bfe7cb56ca1cc0811b328212472350879dfa7f8b/marigold/einstein_depthanything_large.png"/>
-    <figcaption class="mt-1 text-center text-sm text-gray-500">
-		Depth Anything Large
-	</figcaption>
-  </div>
-</div>
-
-## Maximizing Precision and Ensembling
-
-Marigold pipelines have a built-in ensembling mechanism combining multiple predictions from different random latents. 
-This is a brute-force way of improving the precision of predictions, capitalizing on the generative nature of diffusion.
-The ensembling path is activated automatically when the `ensemble_size` argument is set greater than `1`. 
-When aiming for maximum precision, it makes sense to adjust `num_inference_steps` simultaneously with `ensemble_size`. 
-The recommended values vary across checkpoints but primarily depend on the scheduler type.
-The effect of ensembling is particularly well-seen with surface normals:
-
-```python
-import diffusers
-
-model_path = "prs-eth/marigold-normals-v1-0"
-
-model_paper_kwargs = {
-	diffusers.schedulers.DDIMScheduler: {
-		"num_inference_steps": 10,
-		"ensemble_size": 10,
-	},
-	diffusers.schedulers.LCMScheduler: {
-		"num_inference_steps": 4,
-		"ensemble_size": 5,
-	},	
-}
-
-image = diffusers.utils.load_image("https://marigoldmonodepth.github.io/images/einstein.jpg")
-
-pipe = diffusers.MarigoldNormalsPipeline.from_pretrained(model_path).to("cuda")
-pipe_kwargs = model_paper_kwargs[type(pipe.scheduler)]
-
-depth = pipe(image, **pipe_kwargs)
-
-vis = pipe.image_processor.visualize_normals(depth.prediction)
-vis[0].save("einstein_normals.png")
-```
-
-<div class="flex gap-4">
-  <div style="flex: 1 1 50%; max-width: 50%;">
-    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/6838ae9b9148cfe22ce9bb4c0ab0907c757c4010/marigold/marigold_einstein_lcm_normals.png"/>
-    <figcaption class="mt-1 text-center text-sm text-gray-500">
-		Surface normals, no ensembling
-	</figcaption>
-  </div>
-  <div style="flex: 1 1 50%; max-width: 50%;">
-    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/6838ae9b9148cfe22ce9bb4c0ab0907c757c4010/marigold/marigold_einstein_normals.png"/>
-    <figcaption class="mt-1 text-center text-sm text-gray-500">
-		Surface normals, with ensembling
-	</figcaption>
-  </div>
-</div>
-
-As can be seen, all areas with fine-grained structurers, such as hair, got more conservative and on average more correct predictions.
-Such a result is more suitable for precision-sensitive downstream tasks, such as 3D reconstruction.
-
-## Quantitative Evaluation
-
-To evaluate Marigold quantitatively in standard leaderboards and benchmarks (such as NYU, KITTI, and other datasets), follow the evaluation protocol outlined in the paper: load the full precision fp32 model and use appropriate values for `num_inference_steps` and `ensemble_size`. 
-Optionally seed randomness to ensure reproducibility. Maximizing `batch_size` will deliver maximum device utilization.
-
-```python
-import diffusers
-import torch
-
-device = "cuda"
-seed = 2024
-model_path = "prs-eth/marigold-v1-0"
-
-model_paper_kwargs = {
-	diffusers.schedulers.DDIMScheduler: {
-		"num_inference_steps": 50,
-		"ensemble_size": 10,
-	},
-	diffusers.schedulers.LCMScheduler: {
-		"num_inference_steps": 4,
-		"ensemble_size": 10,
-	},	
-}
-
-image = diffusers.utils.load_image("https://marigoldmonodepth.github.io/images/einstein.jpg")
-
-generator = torch.Generator(device=device).manual_seed(seed)
-pipe = diffusers.MarigoldDepthPipeline.from_pretrained(model_path).to(device)
-pipe_kwargs = model_paper_kwargs[type(pipe.scheduler)]
-
-depth = pipe(image, generator=generator, **pipe_kwargs)
-
-# evaluate metrics
-```
-
-## Using Predictive Uncertainty
-
-The ensembling mechanism built into Marigold pipelines combines multiple predictions obtained from different random latents. 
-As a side effect, it can be used to quantify epistemic (model) uncertainty; simply specify `ensemble_size` greater than 1 and set `output_uncertainty=True`.
-The resulting uncertainty will be available in the `uncertainty` field of the output.
-It can be visualized as follows:
-
-```python
-import diffusers
-import torch
-
-pipe = diffusers.MarigoldDepthPipeline.from_pretrained(
-    "prs-eth/marigold-depth-lcm-v1-0", variant="fp16", torch_dtype=torch.float16
-).to("cuda")
-
-image = diffusers.utils.load_image("https://marigoldmonodepth.github.io/images/einstein.jpg")
-depth = pipe(
-	image,
-	ensemble_size=10,  # any number greater than 1; higher values yield higher precision
-	output_uncertainty=True,
-)
-
-uncertainty = pipe.image_processor.visualize_uncertainty(depth.uncertainty)
-uncertainty[0].save("einstein_depth_uncertainty.png")
-```
-
-<div class="flex gap-4">
-  <div style="flex: 1 1 50%; max-width: 50%;">
-    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/6838ae9b9148cfe22ce9bb4c0ab0907c757c4010/marigold/marigold_einstein_depth_uncertainty.png"/>
-    <figcaption class="mt-1 text-center text-sm text-gray-500">
-		Depth uncertainty
-	</figcaption>
-  </div>
-  <div style="flex: 1 1 50%; max-width: 50%;">
-    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/6838ae9b9148cfe22ce9bb4c0ab0907c757c4010/marigold/marigold_einstein_normals_uncertainty.png"/>
-    <figcaption class="mt-1 text-center text-sm text-gray-500">
-		Surface normals uncertainty
-	</figcaption>
-  </div>
-</div>
-
-The interpretation of uncertainty is easy: higher values (white) correspond to pixels, where the model struggles to make consistent predictions.
-Evidently, the depth model is the least confident around edges with discontinuity, where the object depth changes drastically.
-The surface normals model is the least confident in fine-grained structures, such as hair, and dark areas, such as the collar.
-
-## Frame-by-frame Video Processing with Temporal Consistency
-
-Due to Marigold's generative nature, each prediction is unique and defined by the random noise sampled for the latent initialization. 
-This becomes an obvious drawback compared to traditional end-to-end dense regression networks, as exemplified in the following videos:
-
-<div class="flex gap-4">
-  <div style="flex: 1 1 50%; max-width: 50%;">
-    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/25024b5443a6c1357492751fd09355bd3f967845/marigold/marigold_obama.gif"/>
-    <figcaption class="mt-1 text-center text-sm text-gray-500">Input video</figcaption>
-  </div>
-  <div style="flex: 1 1 50%; max-width: 50%;">
-    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/25024b5443a6c1357492751fd09355bd3f967845/marigold/marigold_obama_depth_independent.gif"/>
-    <figcaption class="mt-1 text-center text-sm text-gray-500">Marigold Depth applied to input video frames independently</figcaption>
-  </div>
-</div>
-
-To address this issue, it is possible to pass `latents` argument to the pipelines, which defines the starting point of diffusion.
-Empirically, we found that a convex combination of the very same starting point noise latent and the latent corresponding to the previous frame prediction give sufficiently smooth results, as implemented in the snippet below:
-
-```python
-import imageio
-from PIL import Image
-from tqdm import tqdm
-import diffusers
-import torch
-
-device = "cuda"
-path_in = "obama.mp4"
-path_out = "obama_depth.gif"
-
-pipe = diffusers.MarigoldDepthPipeline.from_pretrained(
-    "prs-eth/marigold-lcm-v1-0", variant="fp16", torch_dtype=torch.float16
-).to(device)
-pipe.vae = diffusers.AutoencoderTiny.from_pretrained(
-    "madebyollin/taesd", torch_dtype=torch.float16
-).to(device)
-pipe.set_progress_bar_config(disable=True)
-
-with imageio.get_reader(path_in) as reader:
-    size = reader.get_meta_data()['size']
-    last_frame_latent = None
-    latent_common = torch.randn(
-        (1, 4, 768 * size[1] // (8 * max(size)), 768 * size[0] // (8 * max(size)))
-    ).to(device=device, dtype=torch.float16)
-
-    out = []
-    for frame_id, frame in tqdm(enumerate(reader), desc="Processing Video"):
-        frame = Image.fromarray(frame)
-        latents = latent_common
-        if last_frame_latent is not None:
-            latents = 0.9 * latents + 0.1 * last_frame_latent
-
-        depth = pipe(
-			frame, match_input_resolution=False, latents=latents, output_latent=True,
-        )
-        last_frame_latent = depth.latent
-        out.append(pipe.image_processor.visualize_depth(depth.prediction)[0])
-
-    diffusers.utils.export_to_gif(out, path_out, fps=reader.get_meta_data()['fps'])
-```
-
-Here, the diffusion process starts from the given computed latent. 
-The pipeline sets `output_latent=True` to access `out.latent` and computes its contribution to the next frame's latent initialization.
-The result is much more stable now:
-
-<div class="flex gap-4">
-  <div style="flex: 1 1 50%; max-width: 50%;">
-    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/25024b5443a6c1357492751fd09355bd3f967845/marigold/marigold_obama_depth_independent.gif"/>
-    <figcaption class="mt-1 text-center text-sm text-gray-500">Marigold Depth applied to input video frames independently</figcaption>
-  </div>
-  <div style="flex: 1 1 50%; max-width: 50%;">
-    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/25024b5443a6c1357492751fd09355bd3f967845/marigold/marigold_obama_depth_consistent.gif"/>
-    <figcaption class="mt-1 text-center text-sm text-gray-500">Marigold Depth with forced latents initialization</figcaption>
-  </div>
-</div>
-
-Hopefully, you will find Marigold useful for solving your downstream tasks, be it a part of a more broad generative workflow, or a broader perception task, such as 3D reconstruction. 
--- a/docs/source/en/using-diffusers/reproducibility.md
+++ b/docs/source/en/using-diffusers/reproducibility.md
@@ -0,0 +1,191 @@
+<!--Copyright 2024 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Create reproducible pipelines
+
+[[open-in-colab]]
+
+Reproducibility is important for testing and replicating results, and it can even be used to [improve image quality](reusing_seeds). However, the randomness in diffusion models is a desired property because it allows the pipeline to generate different images every time it is run. While you can't expect to get the exact same results across platforms, you can expect results to be reproducible across releases and platforms within a certain tolerance range. Even then, tolerance varies depending on the diffusion pipeline and checkpoint.
+
+This is why it's important to understand how to control sources of randomness in diffusion models or use deterministic algorithms.
+
+<Tip>
+
+💡 We strongly recommend reading PyTorch's [statement about reproducibility](https://pytorch.org/docs/stable/notes/randomness.html):
+
+> Completely reproducible results are not guaranteed across PyTorch releases, individual commits, or different platforms. Furthermore, results may not be reproducible between CPU and GPU executions, even when using identical seeds.
+
+</Tip>
+
+## Control randomness
+
+During inference, pipelines rely heavily on random sampling operations which include creating the
+Gaussian noise tensors to denoise and adding noise to the scheduling step.
+
+Take a look at the tensor values in the [`DDIMPipeline`] after two inference steps:
+
+```python
+from diffusers import DDIMPipeline
+import numpy as np
+
+model_id = "google/ddpm-cifar10-32"
+
+# load model and scheduler
+ddim = DDIMPipeline.from_pretrained(model_id, use_safetensors=True)
+
+# run pipeline for just two steps and return numpy tensor
+image = ddim(num_inference_steps=2, output_type="np").images
+print(np.abs(image).sum())
+```
+
+Running the code above prints one value, but if you run it again you get a different value. What is going on here?
+
+Every time the pipeline is run, [`torch.randn`](https://pytorch.org/docs/stable/generated/torch.randn.html) uses a different random seed to create Gaussian noise which is denoised stepwise. This leads to a different result each time it is run, which is great for diffusion pipelines since it generates a different random image each time.
+
+But if you need to reliably generate the same image, that'll depend on whether you're running the pipeline on a CPU or GPU.
+
+### CPU
+
+To generate reproducible results on a CPU, you'll need to use a PyTorch [`Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) and set a seed:
+
+```python
+import torch
+from diffusers import DDIMPipeline
+import numpy as np
+
+model_id = "google/ddpm-cifar10-32"
+
+# load model and scheduler
+ddim = DDIMPipeline.from_pretrained(model_id, use_safetensors=True)
+
+# create a generator for reproducibility
+generator = torch.Generator(device="cpu").manual_seed(0)
+
+# run pipeline for just two steps and return numpy tensor
+image = ddim(num_inference_steps=2, output_type="np", generator=generator).images
+print(np.abs(image).sum())
+```
+
+Now when you run the code above, it always prints a value of `1491.1711` no matter what because the `Generator` object with the seed is passed to all the random functions of the pipeline.
+
+If you run this code example on your specific hardware and PyTorch version, you should get a similar, if not the same, result.
+
+<Tip>
+
+💡 It might be a bit unintuitive at first to pass `Generator` objects to the pipeline instead of
+just integer values representing the seed, but this is the recommended design when dealing with
+probabilistic models in PyTorch, as `Generator`s are *random states* that can be
+passed to multiple pipelines in a sequence.
+
+</Tip>
+
+### GPU
+
+Writing a reproducible pipeline on a GPU is a bit trickier, and full reproducibility across different hardware is not guaranteed because matrix multiplication - which diffusion pipelines require a lot of - is less deterministic on a GPU than a CPU. For example, if you run the same code example above on a GPU:
+
+```python
+import torch
+from diffusers import DDIMPipeline
+import numpy as np
+
+model_id = "google/ddpm-cifar10-32"
+
+# load model and scheduler
+ddim = DDIMPipeline.from_pretrained(model_id, use_safetensors=True)
+ddim.to("cuda")
+
+# create a generator for reproducibility
+generator = torch.Generator(device="cuda").manual_seed(0)
+
+# run pipeline for just two steps and return numpy tensor
+image = ddim(num_inference_steps=2, output_type="np", generator=generator).images
+print(np.abs(image).sum())
+```
+
+The result is not the same even though you're using an identical seed because the GPU uses a different random number generator than the CPU.
+
+To circumvent this problem, 🧨 Diffusers has a [`~diffusers.utils.torch_utils.randn_tensor`] function for creating random noise on the CPU, and then moving the tensor to a GPU if necessary. The `randn_tensor` function is used everywhere inside the pipeline, allowing the user to **always** pass a CPU `Generator` even if the pipeline is run on a GPU.
+
+You'll see the results are much closer now!
+
+```python
+import torch
+from diffusers import DDIMPipeline
+import numpy as np
+
+model_id = "google/ddpm-cifar10-32"
+
+# load model and scheduler
+ddim = DDIMPipeline.from_pretrained(model_id, use_safetensors=True)
+ddim.to("cuda")
+
+# create a generator for reproducibility; notice you don't place it on the GPU!
+generator = torch.manual_seed(0)
+
+# run pipeline for just two steps and return numpy tensor
+image = ddim(num_inference_steps=2, output_type="np", generator=generator).images
+print(np.abs(image).sum())
+```
+
+<Tip>
+
+💡 If reproducibility is important, we recommend always passing a CPU generator.
+The performance loss is often negligible, and you'll generate much more similar
+values than if the pipeline had been run on a GPU.
+
+</Tip>
+
+Finally, more complex pipelines such as [`UnCLIPPipeline`] are often extremely
+susceptible to precision error propagation. Don't expect similar results across
+different GPU hardware or PyTorch versions. In this case, you'll need to use
+exactly the same hardware and PyTorch version for full reproducibility.
+
+## Deterministic algorithms
+
+You can also configure PyTorch to use deterministic algorithms to create a reproducible pipeline. However, you should be aware that deterministic algorithms may be slower than nondeterministic ones and you may observe a decrease in performance. But if reproducibility is important to you, then this is the way to go!
+
+Nondeterministic behavior occurs when operations are launched in more than one CUDA stream. To avoid this, set the environment variable [`CUBLAS_WORKSPACE_CONFIG`](https://docs.nvidia.com/cuda/cublas/index.html#results-reproducibility) to `:16:8` to only use one buffer size during runtime.
+
+PyTorch typically benchmarks multiple algorithms to select the fastest one, but if you want reproducibility, you should disable this feature because the benchmark may select different algorithms each time. Lastly, pass `True` to [`torch.use_deterministic_algorithms`](https://pytorch.org/docs/stable/generated/torch.use_deterministic_algorithms.html) to enable deterministic algorithms.
+
+```py
+import os
+import torch
+
+os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":16:8"
+
+torch.backends.cudnn.benchmark = False
+torch.use_deterministic_algorithms(True)
+```
+
+Now when you run the same pipeline twice, you'll get identical results.
+
+```py
+import torch
+from diffusers import DDIMScheduler, StableDiffusionPipeline
+
+model_id = "runwayml/stable-diffusion-v1-5"
+pipe = StableDiffusionPipeline.from_pretrained(model_id, use_safetensors=True).to("cuda")
+pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
+g = torch.Generator(device="cuda")
+
+prompt = "A bear is playing a guitar on Times Square"
+
+g.manual_seed(0)
+result1 = pipe(prompt=prompt, num_inference_steps=50, generator=g, output_type="latent").images
+
+g.manual_seed(0)
+result2 = pipe(prompt=prompt, num_inference_steps=50, generator=g, output_type="latent").images
+
+print("L_inf dist =", abs(result1 - result2).max())
+"L_inf dist = tensor(0., device='cuda:0')"
+```
--- a/docs/source/en/using-diffusers/reusing_seeds.md
+++ b/docs/source/en/using-diffusers/reusing_seeds.md
@@ -10,179 +10,72 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
 specific language governing permissions and limitations under the License.
 -->

-# Reproducible pipelines
+# Improve image quality with deterministic generation

-Diffusion models are inherently random which is what allows it to generate different outputs every time it is run. But there are certain times when you want to generate the same output every time, like when you're testing, replicating results, and even [improving image quality](#deterministic-batch-generation). While you can't expect to get identical results across platforms, you can expect reproducible results across releases and platforms within a certain tolerance range (though even this may vary).
+[[open-in-colab]]

-This guide will show you how to control randomness for deterministic generation on a CPU and GPU.
+A common way to improve the quality of generated images is with *deterministic batch generation*: generate a batch of images and select one image to improve with a more detailed prompt in a second round of inference. The key is to pass a list of [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html#generator)s to the pipeline for batched image generation, and tie each `Generator` to a seed so you can reuse it for an image.

-> [!TIP]
-> We strongly recommend reading PyTorch's [statement about reproducibility](https://pytorch.org/docs/stable/notes/randomness.html):
->
-> "Completely reproducible results are not guaranteed across PyTorch releases, individual commits, or different platforms. Furthermore, results may not be reproducible between CPU and GPU executions, even when using identical seeds."
-
-## Control randomness
-
-During inference, pipelines rely heavily on random sampling operations which include creating the
-Gaussian noise tensors to denoise and adding noise to the scheduling step.
-
-Take a look at the tensor values in the [`DDIMPipeline`] after two inference steps.
-
-```python
-from diffusers import DDIMPipeline
-import numpy as np
-
-ddim = DDIMPipeline.from_pretrained( "google/ddpm-cifar10-32", use_safetensors=True)
-image = ddim(num_inference_steps=2, output_type="np").images
-print(np.abs(image).sum())
-```
-
-Running the code above prints one value, but if you run it again you get a different value.
-
-Each time the pipeline is run, [torch.randn](https://pytorch.org/docs/stable/generated/torch.randn.html) uses a different random seed to create the Gaussian noise tensors. This leads to a different result each time it is run and enables the diffusion pipeline to generate a different random image each time.
-
-But if you need to reliably generate the same image, that depends on whether you're running the pipeline on a CPU or GPU.
-
-> [!TIP]
-> It might seem unintuitive to pass `Generator` objects to a pipeline instead of the integer value representing the seed. However, this is the recommended design when working with probabilistic models in PyTorch because a `Generator` is a *random state* that can be passed to multiple pipelines in a sequence. As soon as the `Generator` is consumed, the *state* is changed in place which means even if you passed the same `Generator` to a different pipeline, it won't produce the same result because the state is already changed.
-
-<hfoptions id="hardware">
-<hfoption id="CPU">
-
-To generate reproducible results on a CPU, you'll need to use a PyTorch [Generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) and set a seed. Now when you run the code, it always prints a value of `1491.1711` because the `Generator` object with the seed is passed to all the random functions in the pipeline. You should get a similar, if not the same, result on whatever hardware and PyTorch version you're using.
-
-```python
-import torch
-import numpy as np
-from diffusers import DDIMPipeline
-
-ddim = DDIMPipeline.from_pretrained("google/ddpm-cifar10-32", use_safetensors=True)
-generator = torch.Generator(device="cpu").manual_seed(0)
-image = ddim(num_inference_steps=2, output_type="np", generator=generator).images
-print(np.abs(image).sum())
-```
-
-</hfoption>
-<hfoption id="GPU">
-
-Writing a reproducible pipeline on a GPU is a bit trickier, and full reproducibility across different hardware is not guaranteed because matrix multiplication - which diffusion pipelines require a lot of - is less deterministic on a GPU than a CPU. For example, if you run the same code example from the CPU example, you'll get a different result even though the seed is identical. This is because the GPU uses a different random number generator than the CPU.
-
-```python
-import torch
-import numpy as np
-from diffusers import DDIMPipeline
-
-ddim = DDIMPipeline.from_pretrained("google/ddpm-cifar10-32", use_safetensors=True)
-ddim.to("cuda")
-generator = torch.Generator(device="cuda").manual_seed(0)
-image = ddim(num_inference_steps=2, output_type="np", generator=generator).images
-print(np.abs(image).sum())
-```
-
-To avoid this issue, Diffusers has a [`~utils.torch_utils.randn_tensor`] function for creating random noise on the CPU, and then moving the tensor to a GPU if necessary. The [`~utils.torch_utils.randn_tensor`] function is used everywhere inside the pipeline. Now you can call [torch.manual_seed](https://pytorch.org/docs/stable/generated/torch.manual_seed.html) which automatically creates a CPU `Generator` that can be passed to the pipeline even if it is being run on a GPU.
-
-```python
-import torch
-import numpy as np
-from diffusers import DDIMPipeline
-
-ddim = DDIMPipeline.from_pretrained("google/ddpm-cifar10-32", use_safetensors=True)
-ddim.to("cuda")
-generator = torch.manual_seed(0)
-image = ddim(num_inference_steps=2, output_type="np", generator=generator).images
-print(np.abs(image).sum())
-```
-
-> [!TIP]
-> If reproducibility is important to your use case, we recommend always passing a CPU `Generator`. The performance loss is often negligible and you'll generate more similar values than if the pipeline had been run on a GPU.
-
-Finally, more complex pipelines such as [`UnCLIPPipeline`], are often extremely
-susceptible to precision error propagation. You'll need to use
-exactly the same hardware and PyTorch version for full reproducibility.
-
-</hfoption>
-</hfoptions>
-
-## Deterministic algorithms
-
-You can also configure PyTorch to use deterministic algorithms to create a reproducible pipeline. The downside is that deterministic algorithms may be slower than non-deterministic ones and you may observe a decrease in performance.
-
-Non-deterministic behavior occurs when operations are launched in more than one CUDA stream. To avoid this, set the environment variable [CUBLAS_WORKSPACE_CONFIG](https://docs.nvidia.com/cuda/cublas/index.html#results-reproducibility) to `:16:8` to only use one buffer size during runtime.
-
-PyTorch typically benchmarks multiple algorithms to select the fastest one, but if you want reproducibility, you should disable this feature because the benchmark may select different algorithms each time. Set Diffusers [enable_full_determinism](https://github.com/huggingface/diffusers/blob/142f353e1c638ff1d20bd798402b68f72c1ebbdd/src/diffusers/utils/testing_utils.py#L861) to enable deterministic algorithms.
+Let's use [`runwayml/stable-diffusion-v1-5`](https://huggingface.co/runwayml/stable-diffusion-v1-5) as an example, and generate several versions of the following prompt:

 ```py
-enable_full_determinism()
+prompt = "Labrador in the style of Vermeer"
 ```

-Now when you run the same pipeline twice, you'll get identical results.
+Instantiate a pipeline with [`DiffusionPipeline.from_pretrained`] and place it on a GPU (if available):

-```py
-import torch
-from diffusers import DDIMScheduler, StableDiffusionPipeline
-
-pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", use_safetensors=True).to("cuda")
-pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
-g = torch.Generator(device="cuda")
-
-prompt = "A bear is playing a guitar on Times Square"
-
-g.manual_seed(0)
-result1 = pipe(prompt=prompt, num_inference_steps=50, generator=g, output_type="latent").images
-
-g.manual_seed(0)
-result2 = pipe(prompt=prompt, num_inference_steps=50, generator=g, output_type="latent").images
-
-print("L_inf dist =", abs(result1 - result2).max())
-"L_inf dist = tensor(0., device='cuda:0')"
-```
-
-## Deterministic batch generation
-
-A practical application of creating reproducible pipelines is *deterministic batch generation*. You generate a batch of images and select one image to improve with a more detailed prompt. The main idea is to pass a list of [Generator's](https://pytorch.org/docs/stable/generated/torch.Generator.html) to the pipeline and tie each `Generator` to a seed so you can reuse it.
-
-Let's use the [runwayml/stable-diffusion-v1-5](https://huggingface.co/runwayml/stable-diffusion-v1-5) checkpoint and generate a batch of images.
-
-```py
+```python
 import torch
 from diffusers import DiffusionPipeline
 from diffusers.utils import make_image_grid

-pipeline = DiffusionPipeline.from_pretrained(
+pipe = DiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16, use_safetensors=True
 )
-pipeline = pipeline.to("cuda")
+pipe = pipe.to("cuda")
 ```

-Define four different `Generator`s and assign each `Generator` a seed (`0` to `3`). Then generate a batch of images and pick one to iterate on.
-
-> [!WARNING]
-> Use a list comprehension that iterates over the batch size specified in `range()` to create a unique `Generator` object for each image in the batch. If you multiply the `Generator` by the batch size integer, it only creates *one* `Generator` object that is used sequentially for each image in the batch.
->
-> ```py
-> [torch.Generator().manual_seed(seed)] * 4
-> ```
+Now, define four different `Generator`s and assign each `Generator` a seed (`0` to `3`) so you can reuse a `Generator` later for a specific image:

 ```python
 generator = [torch.Generator(device="cuda").manual_seed(i) for i in range(4)]
-prompt = "Labrador in the style of Vermeer"
-images = pipeline(prompt, generator=generator, num_images_per_prompt=4).images[0]
+```
+
+<Tip warning={true}>
+
+To create a batched seed, you should use a list comprehension that iterates over the length specified in `range()`. This creates a unique `Generator` object for each image in the batch. If you only multiply the `Generator` by the batch size, this only creates one `Generator` object that is used sequentially for each image in the batch.
+
+For example, if you want to use the same seed to create 4 identical images:
+
+```py
+❌ [torch.Generator().manual_seed(seed)] * 4
+
+✅ [torch.Generator().manual_seed(seed) for _ in range(4)]
+```
+
+</Tip>
+
+Generate the images and have a look:
+
+```python
+images = pipe(prompt, generator=generator, num_images_per_prompt=4).images
 make_image_grid(images, rows=2, cols=2)
 ```

-<div class="flex justify-center">
-    <img src="https://huggingface.co/datasets/diffusers/diffusers-images-docs/resolve/main/reusabe_seeds.jpg"/>
-</div>
+![img](https://huggingface.co/datasets/diffusers/diffusers-images-docs/resolve/main/reusabe_seeds.jpg)

-Let's improve the first image (you can choose any image you want) which corresponds to the `Generator` with seed `0`. Add some additional text to your prompt and then make sure you reuse the same `Generator` with seed `0`. All the generated images should resemble the first image.
+In this example, you'll improve upon the first image - but in reality, you can use any image you want (even the image with double sets of eyes!). The first image used the `Generator` with seed `0`, so you'll reuse that `Generator` for the second round of inference. To improve the quality of the image, add some additional text to the prompt:

 ```python
 prompt = [prompt + t for t in [", highly realistic", ", artsy", ", trending", ", colorful"]]
 generator = [torch.Generator(device="cuda").manual_seed(0) for i in range(4)]
-images = pipeline(prompt, generator=generator).images
+```
+
+Create four generators with seed `0`, and generate another batch of images, all of which should look like the first image from the previous round!
+
+```python
+images = pipe(prompt, generator=generator).images
 make_image_grid(images, rows=2, cols=2)
 ```

-<div class="flex justify-center">
-    <img src="https://huggingface.co/datasets/diffusers/diffusers-images-docs/resolve/main/reusabe_seeds_2.jpg"/>
-</div>
+![img](https://huggingface.co/datasets/diffusers/diffusers-images-docs/resolve/main/reusabe_seeds_2.jpg)
--- a/docs/source/en/using-diffusers/schedulers.md
+++ b/docs/source/en/using-diffusers/schedulers.md
@@ -212,62 +212,6 @@ images = pipeline(prompt_ids, params, prng_seed, num_inference_steps, jit=True).
 images = pipeline.numpy_to_pil(np.asarray(images.reshape((num_samples,) + images.shape[-3:])))
 ```

-## Custom Timestep Schedules
-
-With all our schedulers, you can choose one of the popular timestep schedules using configurations such as `timestep_spacing`, `interpolation_type`, and `use_karras_sigmas`. Some schedulers also provide the flexibility to use a custom timestep schedule. You can use any list of arbitrary timesteps, we will use the AYS timestep schedule here as example. It is a set of 10-step optimized timestep schedules released by researchers from Nvidia that can achieve significantly better quality compared to the preset timestep schedules. You can read more about their research [here](https://research.nvidia.com/labs/toronto-ai/AlignYourSteps/). 
-
-```python
-from diffusers.schedulers import AysSchedules
-sampling_schedule = AysSchedules["StableDiffusionXLTimesteps"]
-print(sampling_schedule)
-```
-```
-[999, 845, 730, 587, 443, 310, 193, 116, 53, 13]
-```
-
-You can then create a pipeline and pass this custom timestep schedule to it as `timesteps`.
-
-```python
-pipe = StableDiffusionXLPipeline.from_pretrained(
-    "SG161222/RealVisXL_V4.0",
-    torch_dtype=torch.float16,
-    variant="fp16",
-).to("cuda")
-
-pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config, algorithm_type="sde-dpmsolver++")
-
-prompt = "A cinematic shot of a cute little rabbit wearing a jacket and doing a thumbs up"
-
-generator = torch.Generator(device="cpu").manual_seed(2487854446)
-
-image = pipe(
-    prompt=prompt,
-    negative_prompt="",
-    generator=generator,
-    timesteps=sampling_schedule,
-).images[0]
-```
-The generated image has better quality than the default linear timestep schedule for the same number of steps, and it is similar to the default timestep scheduler when running for 25 steps.
-
-<div class="flex gap-4">
-  <div>
-    <img class="rounded-xl" src="https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/ays.png"/>
-    <figcaption class="mt-2 text-center text-sm text-gray-500">AYS timestep schedule 10 steps</figcaption>
-  </div>
-  <div>
-    <img class="rounded-xl" src="https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/10.png"/>
-    <figcaption class="mt-2 text-center text-sm text-gray-500">Linearly-spaced timestep schedule 10 steps</figcaption>
-  </div>
-  <div>
-    <img class="rounded-xl" src="https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/25.png"/>
-    <figcaption class="mt-2 text-center text-sm text-gray-500">Linearly-spaced timestep schedule 25 steps</figcaption>
-  </div>
-</div>
-
-> [!TIP]
-> 🤗 Diffusers currently only supports `timesteps` and `sigmas` for a selected list of schedulers and pipelines, but feel free to open a [feature request](https://github.com/huggingface/diffusers/issues/new/choose) if you want to extend feature to a scheduler and pipeline that does not currently support it!
-
-
 ## Models

 Models are loaded from the [`ModelMixin.from_pretrained`] method, which downloads and caches the latest version of the model weights and configurations. If the latest files are available in the local cache, [`~ModelMixin.from_pretrained`] reuses files in the cache instead of re-downloading them.
--- a/docs/source/ja/installation.md
+++ b/docs/source/ja/installation.md
@@ -106,7 +106,7 @@ pip install -e ".[flax]"

 これらのコマンドは、リポジトリをクローンしたフォルダと Python のライブラリパスをリンクします。
 Python は通常のライブラリパスに加えて、クローンしたフォルダの中を探すようになります。
-例えば、Python パッケージが通常 `~/anaconda3/envs/main/lib/python3.10/site-packages/` にインストールされている場合、Python はクローンした `~/diffusers/` フォルダも同様に参照します。
+例えば、Python パッケージが通常 `~/anaconda3/envs/main/lib/python3.8/site-packages/` にインストールされている場合、Python はクローンした `~/diffusers/` フォルダも同様に参照します。

 <Tip warning={true}>

--- a/docs/source/ja/stable_diffusion.md
+++ b/docs/source/ja/stable_diffusion.md
@@ -49,7 +49,7 @@ prompt = "portrait photo of a old warrior chief"
 pipeline = pipeline.to("cuda")
 ```

-同じイメージを使って改良できるようにするには、[`Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html)を使い、[reproducibility](./using-diffusers/reusing_seeds)の種を設定します：
+同じイメージを使って改良できるようにするには、[`Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html)を使い、[reproducibility](./using-diffusers/reproducibility)の種を設定します：

 ```python
 import torch
--- a/docs/source/ko/installation.md
+++ b/docs/source/ko/installation.md
@@ -105,7 +105,7 @@ pip install -e ".[flax]"

 이러한 명령어들은 저장소를 복제한 폴더와 Python 라이브러리 경로를 연결합니다.
 Python은 이제 일반 라이브러리 경로에 더하여 복제한 폴더 내부를 살펴봅니다.
-예를들어 Python 패키지가 `~/anaconda3/envs/main/lib/python3.10/site-packages/`에 설치되어 있는 경우 Python은 복제한 폴더인 `~/diffusers/`도 검색합니다.
+예를들어 Python 패키지가 `~/anaconda3/envs/main/lib/python3.8/site-packages/`에 설치되어 있는 경우 Python은 복제한 폴더인 `~/diffusers/`도 검색합니다.

 <Tip warning={true}>

--- a/docs/source/ko/optimization/fp16.md
+++ b/docs/source/ko/optimization/fp16.md
@@ -339,7 +339,7 @@ from dataclasses import dataclass

@dataclass
 class UNet2DConditionOutput:
-    sample: torch.Tensor
+    sample: torch.FloatTensor


 pipe = StableDiffusionPipeline.from_pretrained(
--- a/docs/source/ko/stable_diffusion.md
+++ b/docs/source/ko/stable_diffusion.md
@@ -49,7 +49,7 @@ prompt = "portrait photo of a old warrior chief"
 pipeline = pipeline.to("cuda")
 ```

-동일한 이미지를 사용하고 개선할 수 있는지 확인하려면 [`Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html)를 사용하고 [재현성](./using-diffusers/reusing_seeds)에 대한 시드를 설정하세요:
+동일한 이미지를 사용하고 개선할 수 있는지 확인하려면 [`Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html)를 사용하고 [재현성](./using-diffusers/reproducibility)에 대한 시드를 설정하세요:

 ```python
 import torch
--- a/docs/source/ko/training/lora.md
+++ b/docs/source/ko/training/lora.md
@@ -49,15 +49,15 @@ huggingface-cli login

 ### 학습[[dreambooth-training]]

-[Naruto BLIP 캡션](https://huggingface.co/datasets/lambdalabs/naruto-blip-captions) 데이터셋으로 [`stable-diffusion-v1-5`](https://huggingface.co/runwayml/stable-diffusion-v1-5)를 파인튜닝해 나만의 포켓몬을 생성해 보겠습니다.
+[Pokémon BLIP 캡션](https://huggingface.co/datasets/lambdalabs/pokemon-blip-captions) 데이터셋으로 [`stable-diffusion-v1-5`](https://huggingface.co/runwayml/stable-diffusion-v1-5)를 파인튜닝해 나만의 포켓몬을 생성해 보겠습니다.

 시작하려면 `MODEL_NAME` 및 `DATASET_NAME` 환경 변수가 설정되어 있는지 확인하십시오. `OUTPUT_DIR` 및 `HUB_MODEL_ID` 변수는 선택 사항이며 허브에서 모델을 저장할 위치를 지정합니다.

 ```bash
 export MODEL_NAME="runwayml/stable-diffusion-v1-5"
-export OUTPUT_DIR="/sddata/finetune/lora/naruto"
-export HUB_MODEL_ID="naruto-lora"
-export DATASET_NAME="lambdalabs/naruto-blip-captions"
+export OUTPUT_DIR="/sddata/finetune/lora/pokemon"
+export HUB_MODEL_ID="pokemon-lora"
+export DATASET_NAME="lambdalabs/pokemon-blip-captions"
 ```

 학습을 시작하기 전에 알아야 할 몇 가지 플래그가 있습니다.
--- a/docs/source/ko/training/text2image.md
+++ b/docs/source/ko/training/text2image.md
@@ -73,12 +73,12 @@ xFormers는 Flax에 사용할 수 없습니다.

 <frameworkcontent>
 <pt>
-다음과 같이 [Naruto BLIP 캡션](https://huggingface.co/datasets/lambdalabs/naruto-blip-captions) 데이터셋에서 파인튜닝 실행을 위해 [PyTorch 학습 스크립트](https://github.com/huggingface/diffusers/blob/main/examples/text_to_image/train_text_to_image.py)를 실행합니다:
+다음과 같이 [Pokémon BLIP 캡션](https://huggingface.co/datasets/lambdalabs/pokemon-blip-captions) 데이터셋에서 파인튜닝 실행을 위해 [PyTorch 학습 스크립트](https://github.com/huggingface/diffusers/blob/main/examples/text_to_image/train_text_to_image.py)를 실행합니다:


 ```bash
 export MODEL_NAME="CompVis/stable-diffusion-v1-4"
-export dataset_name="lambdalabs/naruto-blip-captions"
+export dataset_name="lambdalabs/pokemon-blip-captions"

 accelerate launch train_text_to_image.py \
  --pretrained_model_name_or_path=$MODEL_NAME \
@@ -93,7 +93,7 @@ accelerate launch train_text_to_image.py \
  --learning_rate=1e-05 \
  --max_grad_norm=1 \
  --lr_scheduler="constant" --lr_warmup_steps=0 \
-  --output_dir="sd-naruto-model" 
+  --output_dir="sd-pokemon-model" 
 ```

 자체 데이터셋으로 파인튜닝하려면 🤗 [Datasets](https://huggingface.co/docs/datasets/index)에서 요구하는 형식에 따라 데이터셋을 준비하세요. [데이터셋을 허브에 업로드](https://huggingface.co/docs/datasets/image_dataset#upload-dataset-to-the-hub)하거나 [파일들이 있는 로컬 폴더를 준비](https ://huggingface.co/docs/datasets/image_dataset#imagefolder)할 수 있습니다.
@@ -136,7 +136,7 @@ pip install -U -r requirements_flax.txt

 ```bash
 export MODEL_NAME="runwayml/stable-diffusion-v1-5"
-export dataset_name="lambdalabs/naruto-blip-captions"
+export dataset_name="lambdalabs/pokemon-blip-captions"

 python train_text_to_image_flax.py \
  --pretrained_model_name_or_path=$MODEL_NAME \
@@ -146,7 +146,7 @@ python train_text_to_image_flax.py \
  --max_train_steps=15000 \
  --learning_rate=1e-05 \
  --max_grad_norm=1 \
-  --output_dir="sd-naruto-model" 
+  --output_dir="sd-pokemon-model" 
 ```

 자체 데이터셋으로 파인튜닝하려면 🤗 [Datasets](https://huggingface.co/docs/datasets/index)에서 요구하는 형식에 따라 데이터셋을 준비하세요. [데이터셋을 허브에 업로드](https://huggingface.co/docs/datasets/image_dataset#upload-dataset-to-the-hub)하거나 [파일들이 있는 로컬 폴더를 준비](https ://huggingface.co/docs/datasets/image_dataset#imagefolder)할 수 있습니다.
@@ -166,7 +166,7 @@ python train_text_to_image_flax.py \
  --max_train_steps=15000 \
  --learning_rate=1e-05 \
  --max_grad_norm=1 \
-  --output_dir="sd-naruto-model"
+  --output_dir="sd-pokemon-model"
 ```
 </jax>
 </frameworkcontent>
@@ -189,7 +189,7 @@ pipe = StableDiffusionPipeline.from_pretrained(model_path, torch_dtype=torch.flo
 pipe.to("cuda")

 image = pipe(prompt="yoda").images[0]
-image.save("yoda-naruto.png")
+image.save("yoda-pokemon.png")
 ```
 </pt>
 <jax>
@@ -203,7 +203,7 @@ from diffusers import FlaxStableDiffusionPipeline
 model_path = "path_to_saved_model"
 pipe, params = FlaxStableDiffusionPipeline.from_pretrained(model_path, dtype=jax.numpy.bfloat16)

-prompt = "yoda naruto"
+prompt = "yoda pokemon"
 prng_seed = jax.random.PRNGKey(0)
 num_inference_steps = 50

@@ -218,7 +218,7 @@ prompt_ids = shard(prompt_ids)

 images = pipeline(prompt_ids, params, prng_seed, num_inference_steps, jit=True).images
 images = pipeline.numpy_to_pil(np.asarray(images.reshape((num_samples,) + images.shape[-3:])))
-image.save("yoda-naruto.png")
+image.save("yoda-pokemon.png")
 ```
 </jax>
 </frameworkcontent>
--- a/docs/source/ko/training/unconditional_training.md
+++ b/docs/source/ko/training/unconditional_training.md
@@ -103,13 +103,13 @@ accelerate launch train_unconditional.py \
 <div class="flex justify-center">
    <img src="https://user-images.githubusercontent.com/26864830/180248660-a0b143d0-b89a-42c5-8656-2ebf6ece7e52.png"/>
 </div>
-[Naruto](https://huggingface.co/datasets/lambdalabs/naruto-blip-captions) 데이터셋을 사용할 경우:
+[Pokemon](https://huggingface.co/datasets/huggan/pokemon) 데이터셋을 사용할 경우:

 ```bash
 accelerate launch train_unconditional.py \
-  --dataset_name="lambdalabs/naruto-blip-captions" \
+  --dataset_name="huggan/pokemon" \
  --resolution=64 \
-  --output_dir="ddpm-ema-naruto-64" \
+  --output_dir="ddpm-ema-pokemon-64" \
  --train_batch_size=16 \
  --num_epochs=100 \
  --gradient_accumulation_steps=1 \
@@ -129,9 +129,9 @@ accelerate launch train_unconditional.py \

 ```bash
 accelerate launch --mixed_precision="fp16" --multi_gpu train_unconditional.py \
-  --dataset_name="lambdalabs/naruto-blip-captions" \
+  --dataset_name="huggan/pokemon" \
  --resolution=64 --center_crop --random_flip \
-  --output_dir="ddpm-ema-naruto-64" \
+  --output_dir="ddpm-ema-pokemon-64" \
  --train_batch_size=16 \
  --num_epochs=100 \
  --gradient_accumulation_steps=1 \
--- a/docs/source/pt/installation.md
+++ b/docs/source/pt/installation.md
@@ -102,7 +102,7 @@ pip install -e ".[flax]"

 Esses comandos irá linkar a pasta que você clonou o repositório e os caminhos das suas bibliotecas Python.
 Python então irá procurar dentro da pasta que você clonou além dos caminhos normais das bibliotecas.
-Por exemplo, se o pacote python for tipicamente instalado no `~/anaconda3/envs/main/lib/python3.10/site-packages/`, o Python também irá procurar na pasta `~/diffusers/` que você clonou.
+Por exemplo, se o pacote python for tipicamente instalado no `~/anaconda3/envs/main/lib/python3.8/site-packages/`, o Python também irá procurar na pasta `~/diffusers/` que você clonou.

 <Tip warning={true}>

--- a/docs/source/zh/installation.md
+++ b/docs/source/zh/installation.md
@@ -107,7 +107,7 @@ pip install -e ".[flax]"

 这些命令将连接到你克隆的版本库和你的 Python 库路径。
 现在，不只是在通常的库路径，Python 还会在你克隆的文件夹内寻找包。
-例如，如果你的 Python 包通常安装在 `~/anaconda3/envs/main/lib/python3.10/Site-packages/`，Python 也会搜索你克隆到的文件夹。`~/diffusers/`。
+例如，如果你的 Python 包通常安装在 `~/anaconda3/envs/main/lib/python3.8/site-packages/`，Python 也会搜索你克隆到的文件夹 `~/diffusers/`。

 <Tip warning={true}>

--- a/docs/source/zh/stable_diffusion.md
+++ b/docs/source/zh/stable_diffusion.md
@@ -51,7 +51,7 @@ prompt = "portrait photo of a old warrior chief"
 pipeline = pipeline.to("cuda")
 ```

-为了确保您可以使用相同的图像并对其进行改进，使用 [`Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) 方法，然后设置一个随机数种子 以确保其 [复现性](./using-diffusers/reusing_seeds):
+为了确保您可以使用相同的图像并对其进行改进，使用 [`Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) 方法，然后设置一个随机数种子 以确保其 [复现性](./using-diffusers/reproducibility):

 ```python
 import torch
--- a/examples/advanced_diffusion_training/README.md
+++ b/examples/advanced_diffusion_training/README.md
@@ -234,7 +234,7 @@ In ComfyUI we will load a LoRA and a textual embedding at the same time.
 SDXL's VAE is known to suffer from numerical instability issues. This is why we also expose a CLI argument namely `--pretrained_vae_model_name_or_path` that lets you specify the location of a better VAE (such as [this one](https://huggingface.co/madebyollin/sdxl-vae-fp16-fix)).

 ### DoRA training 
-The advanced script supports DoRA training too!
+The advanced script now supports DoRA training too!
 > Proposed in [DoRA: Weight-Decomposed Low-Rank Adaptation](https://arxiv.org/abs/2402.09353), 
 **DoRA** is very similar to LoRA, except it decomposes the pre-trained weight into two components, **magnitude** and **direction** and employs LoRA for _directional_ updates to efficiently minimize the number of trainable parameters. 
 The authors found that by using DoRA, both the learning capacity and training stability of LoRA are enhanced without any additional overhead during inference. 
@@ -304,147 +304,6 @@ accelerate launch train_dreambooth_lora_sdxl_advanced.py \
 > [!CAUTION]
 > Min-SNR gamma is not supported with the EDM-style training yet. When training with the PlaygroundAI model, it's recommended to not pass any "variant".

-### B-LoRA training 
-The advanced script now supports B-LoRA training too!
-> Proposed in [Implicit Style-Content Separation using B-LoRA](https://arxiv.org/abs/2403.14572), 
-B-LoRA is a method that leverages LoRA to implicitly separate the style and content components of a **single** image.
-It was shown that learning the LoRA weights of two specific blocks (referred to as B-LoRAs) 
-achieves style-content separation that cannot be achieved by training each B-LoRA independently. 
-Once trained, the two B-LoRAs can be used as independent components to allow various image stylization tasks
-
-**Usage**
-Enable B-LoRA training by adding this flag
-```bash
--use_blora
-```
-You can train a B-LoRA with as little as 1 image, and 1000 steps. Try this default configuration as a start:
-```bash
-!accelerate launch train_dreambooth_b-lora_sdxl.py \
- --pretrained_model_name_or_path="stabilityai/stable-diffusion-xl-base-1.0" \
- --instance_data_dir="linoyts/B-LoRA_teddy_bear" \
- --output_dir="B-LoRA_teddy_bear" \
- --instance_prompt="a [v18]" \
- --resolution=1024 \
- --rank=64 \
- --train_batch_size=1 \
- --learning_rate=5e-5 \
- --lr_scheduler="constant" \
- --lr_warmup_steps=0 \
- --max_train_steps=1000 \
- --checkpointing_steps=2000 \
- --seed="0" \
- --gradient_checkpointing \
- --mixed_precision="fp16"
-```
-**Inference** 
-The inference is a bit different:
-1. we need load *specific* unet layers (as opposed to a regular LoRA/DoRA)
-2. the trained layers we load, changes based on our objective (e.g. style/content)
-
-```python
-import torch
-from diffusers import StableDiffusionXLPipeline, AutoencoderKL
-
-# taken & modified from B-LoRA repo - https://github.com/yardenfren1996/B-LoRA/blob/main/blora_utils.py
-def is_belong_to_blocks(key, blocks):
-    try:
-        for g in blocks:
-            if g in key:
-                return True
-        return False
-    except Exception as e:
-        raise type(e)(f'failed to is_belong_to_block, due to: {e}')
-    
-def lora_lora_unet_blocks(lora_path, alpha, target_blocks):  
-  state_dict, _ = pipeline.lora_state_dict(lora_path)
-  filtered_state_dict = {k: v * alpha for k, v in state_dict.items() if is_belong_to_blocks(k, target_blocks)}
-  return filtered_state_dict
-
-vae = AutoencoderKL.from_pretrained("madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16)
-pipeline = StableDiffusionXLPipeline.from_pretrained(
-    "stabilityai/stable-diffusion-xl-base-1.0",
-    vae=vae,
-    torch_dtype=torch.float16,
-).to("cuda")
-
-# pick a blora for content/style (you can also set one to None) 
-content_B_lora_path  = "lora-library/B-LoRA-teddybear"
-style_B_lora_path= "lora-library/B-LoRA-pen_sketch"
-
-
-content_B_LoRA = lora_lora_unet_blocks(content_B_lora_path,alpha=1,target_blocks=["unet.up_blocks.0.attentions.0"])
-style_B_LoRA = lora_lora_unet_blocks(style_B_lora_path,alpha=1.1,target_blocks=["unet.up_blocks.0.attentions.1"])
-combined_lora = {**content_B_LoRA, **style_B_LoRA}
-
-# Load both loras
-pipeline.load_lora_into_unet(combined_lora, None, pipeline.unet)
-
-#generate
-prompt = "a [v18] in [v30] style"
-pipeline(prompt, num_images_per_prompt=4).images
-```
-### LoRA training of Targeted U-net Blocks
-The advanced script now supports custom choice of U-net blocks to train during Dreambooth LoRA tuning. 
-> [!NOTE]
-> This feature is still experimental
-
-> Recently, works like B-LoRA showed the potential advantages of learning the LoRA weights of specific U-net blocks, not only in speed & memory, 
-> but also in reducing the amount of needed data, improving style manipulation and overcoming overfitting issues. 
-> In light of this, we're introducing a new feature to the advanced script to allow for configurable U-net learned blocks. 
-
-**Usage**
-Configure LoRA learned U-net blocks adding a `lora_unet_blocks` flag, with a comma seperated string specifying the targeted blocks. 
-e.g:
-```bash
--lora_unet_blocks="unet.up_blocks.0.attentions.0,unet.up_blocks.0.attentions.1"
-```
-
-> [!NOTE]
-> if you specify both `--use_blora` and `--lora_unet_blocks`, values given in --lora_unet_blocks will be ignored. 
-> When enabling --use_blora, targeted U-net blocks are automatically set to be "unet.up_blocks.0.attentions.0,unet.up_blocks.0.attentions.1" as discussed in the paper. 
-> If you wish to experiment with different blocks, specify `--lora_unet_blocks` only.
-
-**Inference** 
-Inference is the same as for B-LoRAs, except the input targeted blocks should be modified based on your training configuration. 
-```python
-import torch
-from diffusers import StableDiffusionXLPipeline, AutoencoderKL
-
-# taken & modified from B-LoRA repo - https://github.com/yardenfren1996/B-LoRA/blob/main/blora_utils.py
-def is_belong_to_blocks(key, blocks):
-    try:
-        for g in blocks:
-            if g in key:
-                return True
-        return False
-    except Exception as e:
-        raise type(e)(f'failed to is_belong_to_block, due to: {e}')
-    
-def lora_lora_unet_blocks(lora_path, alpha, target_blocks):  
-  state_dict, _ = pipeline.lora_state_dict(lora_path)
-  filtered_state_dict = {k: v * alpha for k, v in state_dict.items() if is_belong_to_blocks(k, target_blocks)}
-  return filtered_state_dict
-
-vae = AutoencoderKL.from_pretrained("madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16)
-pipeline = StableDiffusionXLPipeline.from_pretrained(
-    "stabilityai/stable-diffusion-xl-base-1.0",
-    vae=vae,
-    torch_dtype=torch.float16,
-).to("cuda")
-
-lora_path  = "lora-library/B-LoRA-pen_sketch"
-
-state_dict = lora_lora_unet_blocks(content_B_lora_path,alpha=1,target_blocks=["unet.up_blocks.0.attentions.0"])
-
-# Load traine dlora layers into the unet
-pipeline.load_lora_into_unet(state_dict, None, pipeline.unet)
-
-#generate
-prompt = "a dog in [v30] style"
-pipeline(prompt, num_images_per_prompt=4).images
-```
-
-
 ### Tips and Tricks
 Check out [these recommended practices](https://huggingface.co/blog/sdxl_lora_advanced_script#additional-good-practices)

--- a/examples/advanced_diffusion_training/train_dreambooth_lora_sd15_advanced.py
+++ b/examples/advanced_diffusion_training/train_dreambooth_lora_sd15_advanced.py
@@ -71,7 +71,7 @@ from diffusers.utils.import_utils import is_xformers_available


 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.28.0")
+check_min_version("0.28.0.dev0")

 logger = get_logger(__name__)

@@ -981,7 +981,7 @@ def collate_fn(examples, with_prior_preservation=False):


 class PromptDataset(Dataset):
-    """A simple dataset to prepare the prompts to generate class images on multiple GPUs."""
+    "A simple dataset to prepare the prompts to generate class images on multiple GPUs."

    def __init__(self, prompt, num_samples):
        self.prompt = prompt
--- a/examples/advanced_diffusion_training/train_dreambooth_lora_sdxl_advanced.py
+++ b/examples/advanced_diffusion_training/train_dreambooth_lora_sdxl_advanced.py
@@ -15,6 +15,7 @@

 import argparse
 import gc
+import hashlib
 import itertools
 import json
 import logging
@@ -39,7 +40,6 @@ from accelerate import Accelerator
 from accelerate.logging import get_logger
 from accelerate.utils import DistributedDataParallelKwargs, ProjectConfiguration, set_seed
 from huggingface_hub import create_repo, hf_hub_download, upload_folder
-from huggingface_hub.utils import insecure_hashlib
 from packaging import version
 from peft import LoraConfig, set_peft_model_state_dict
 from peft.utils import get_peft_model_state_dict
@@ -78,7 +78,7 @@ from diffusers.utils.torch_utils import is_compiled_module


 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.28.0")
+check_min_version("0.28.0.dev0")

 logger = get_logger(__name__)

@@ -696,23 +696,6 @@ def parse_args(input_args=None):
            "Note: to use DoRA you need to install peft from main, `pip install git+https://github.com/huggingface/peft.git`"
        ),
    )
-    parser.add_argument(
-        "--lora_unet_blocks",
-        type=str,
-        default=None,
-        help=(
-            "the U-net blocks to tune during training. please specify them in a comma separated string, e.g. `unet.up_blocks.0.attentions.0,unet.up_blocks.0.attentions.1` etc."
-            "NOTE: By default (if not specified) - regular LoRA training is performed. "
-            "if --use_blora is enabled, this arg will be ignored, since in B-LoRA training, targeted U-net blocks are `unet.up_blocks.0.attentions.0` and `unet.up_blocks.0.attentions.1`"
-        ),
-    )
-    parser.add_argument(
-        "--use_blora",
-        action="store_true",
-        help=(
-            "Whether to train a B-LoRA as proposed in- Implicit Style-Content Separation using B-LoRA https://arxiv.org/abs/2403.14572. "
-        ),
-    )
    parser.add_argument(
        "--cache_latents",
        action="store_true",
@@ -737,11 +720,6 @@ def parse_args(input_args=None):
            "For full LoRA text encoder training check --train_text_encoder, for textual "
            "inversion training check `--train_text_encoder_ti`"
        )
-    if args.use_blora and args.lora_unet_blocks:
-        warnings.warn(
-            "You specified both `--use_blora` and `--lora_unet_blocks`, for B-LoRA training, target unet blocks are: `unet.up_blocks.0.attentions.0` and `unet.up_blocks.0.attentions.1`. "
-            "If you wish to target different U-net blocks, don't enable `--use_blora`"
-        )

    env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
    if env_local_rank != -1 and env_local_rank != args.local_rank:
@@ -762,40 +740,6 @@ def parse_args(input_args=None):
    return args


-# Taken (and slightly modified) from B-LoRA repo https://github.com/yardenfren1996/B-LoRA/blob/main/blora_utils.py
-def is_belong_to_blocks(key, blocks):
-    try:
-        for g in blocks:
-            if g in key:
-                return True
-        return False
-    except Exception as e:
-        raise type(e)(f"failed to is_belong_to_block, due to: {e}")
-
-
-def get_unet_lora_target_modules(unet, use_blora, target_blocks=None):
-    if use_blora:
-        content_b_lora_blocks = "unet.up_blocks.0.attentions.0"
-        style_b_lora_blocks = "unet.up_blocks.0.attentions.1"
-        target_blocks = [content_b_lora_blocks, style_b_lora_blocks]
-    try:
-        blocks = [(".").join(blk.split(".")[1:]) for blk in target_blocks]
-
-        attns = [
-            attn_processor_name.rsplit(".", 1)[0]
-            for attn_processor_name, _ in unet.attn_processors.items()
-            if is_belong_to_blocks(attn_processor_name, blocks)
-        ]
-
-        target_modules = [f"{attn}.{mat}" for mat in ["to_k", "to_q", "to_v", "to_out.0"] for attn in attns]
-        return target_modules
-    except Exception as e:
-        raise type(e)(
-            f"failed to get_target_modules, due to: {e}. "
-            f"Please check the modules specified in --lora_unet_blocks are correct"
-        )
-
-
 # Taken from https://github.com/replicate/cog-sdxl/blob/main/dataset_and_utils.py
 class TokenEmbeddingsHandler:
    def __init__(self, text_encoders, tokenizers):
@@ -1002,20 +946,16 @@ class DreamBoothDataset(Dataset):
                transforms.Normalize([0.5], [0.5]),
            ]
        )
-        # if using B-LoRA for single image. do not use transformations
-        single_image = len(self.instance_images) < 2
        for image in self.instance_images:
-            if not single_image:
-                image = exif_transpose(image)
+            image = exif_transpose(image)
            if not image.mode == "RGB":
                image = image.convert("RGB")
            self.original_sizes.append((image.height, image.width))
            image = train_resize(image)
-
-            if not single_image and args.random_flip and random.random() < 0.5:
+            if args.random_flip and random.random() < 0.5:
                # flip
                image = train_flip(image)
-            if args.center_crop or single_image:
+            if args.center_crop:
                y1 = max(0, int(round((image.height - args.resolution) / 2.0)))
                x1 = max(0, int(round((image.width - args.resolution) / 2.0)))
                image = train_crop(image)
@@ -1136,7 +1076,7 @@ def collate_fn(examples, with_prior_preservation=False):


 class PromptDataset(Dataset):
-    """A simple dataset to prepare the prompts to generate class images on multiple GPUs."""
+    "A simple dataset to prepare the prompts to generate class images on multiple GPUs."

    def __init__(self, prompt, num_samples):
        self.prompt = prompt
@@ -1276,7 +1216,7 @@ def main(args):
                images = pipeline(example["prompt"]).images

                for i, image in enumerate(images):
-                    hash_image = insecure_hashlib.sha1(image.tobytes()).hexdigest()
+                    hash_image = hashlib.sha1(image.tobytes()).hexdigest()
                    image_filename = class_images_dir / f"{example['index'][i] + cur_class_images}-{hash_image}.jpg"
                    image.save(image_filename)

@@ -1434,24 +1374,12 @@ def main(args):
            text_encoder_two.gradient_checkpointing_enable()

    # now we will add new LoRA weights to the attention layers
-
-    if args.use_blora:
-        # if using B-LoRA, the targeted blocks to train are automatically set
-        target_modules = get_unet_lora_target_modules(unet, use_blora=True)
-    elif args.lora_unet_blocks:
-        # if training specific unet blocks not in the B-LoRA scheme
-        target_blocks_list = "".join(args.lora_unet_blocks.split()).split(",")
-        logger.info(f"list of unet blocks to train: {target_blocks_list}")
-        target_modules = get_unet_lora_target_modules(unet, use_blora=False, target_blocks=target_blocks_list)
-    else:
-        target_modules = ["to_k", "to_q", "to_v", "to_out.0"]
-
    unet_lora_config = LoraConfig(
        r=args.rank,
-        use_dora=args.use_dora,
        lora_alpha=args.rank,
+        use_dora=args.use_dora,
        init_lora_weights="gaussian",
-        target_modules=target_modules,
+        target_modules=["to_k", "to_q", "to_v", "to_out.0"],
    )
    unet.add_adapter(unet_lora_config)

@@ -1460,8 +1388,8 @@ def main(args):
    if args.train_text_encoder:
        text_lora_config = LoraConfig(
            r=args.rank,
-            use_dora=args.use_dora,
            lora_alpha=args.rank,
+            use_dora=args.use_dora,
            init_lora_weights="gaussian",
            target_modules=["q_proj", "k_proj", "v_proj", "out_proj"],
        )
@@ -1577,7 +1505,6 @@ def main(args):
            models = [unet_]
            if args.train_text_encoder:
                models.extend([text_encoder_one_, text_encoder_two_])
-                # only upcast trainable parameters (LoRA) into fp32
            cast_training_params(models)

    accelerator.register_save_state_pre_hook(save_model_hook)
@@ -1598,8 +1525,6 @@ def main(args):
        models = [unet]
        if args.train_text_encoder:
            models.extend([text_encoder_one, text_encoder_two])
-
-        # only upcast trainable parameters (LoRA) into fp32
        cast_training_params(models, dtype=torch.float32)

    unet_lora_parameters = list(filter(lambda p: p.requires_grad, unet.parameters()))
@@ -1855,12 +1780,7 @@ def main(args):
    # We need to initialize the trackers we use, and also store our configuration.
    # The trackers initializes automatically on the main process.
    if accelerator.is_main_process:
-        tracker_name = (
-            "dreambooth-lora-sd-xl"
-            if "playground" not in args.pretrained_model_name_or_path
-            else "dreambooth-lora-playground"
-        )
-        accelerator.init_trackers(tracker_name, config=vars(args))
+        accelerator.init_trackers("dreambooth-lora-sd-xl", config=vars(args))

    # Train!
    total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
@@ -1913,6 +1833,7 @@ def main(args):
    )

    def get_sigmas(timesteps, n_dim=4, dtype=torch.float32):
+        # TODO: revisit other sampling algorithms
        sigmas = noise_scheduler.sigmas.to(device=accelerator.device, dtype=dtype)
        schedule_timesteps = noise_scheduler.timesteps.to(accelerator.device)
        timesteps = timesteps.to(accelerator.device)
@@ -1931,7 +1852,6 @@ def main(args):
    # flag used for textual inversion
    pivoted = False
    for epoch in range(first_epoch, args.num_train_epochs):
-        unet.train()
        # if performing any kind of optimization of text_encoder params
        if args.train_text_encoder or args.train_text_encoder_ti:
            if epoch == num_train_epochs_text_encoder:
@@ -1949,6 +1869,7 @@ def main(args):
                    text_encoder_one.text_model.embeddings.requires_grad_(True)
                    text_encoder_two.text_model.embeddings.requires_grad_(True)

+        unet.train()
        for step, batch in enumerate(train_dataloader):
            if pivoted:
                # stopping optimization of text_encoder params
@@ -2049,8 +1970,7 @@ def main(args):
                        timesteps,
                        prompt_embeds_input,
                        added_cond_kwargs=unet_added_conditions,
-                        return_dict=False,
-                    )[0]
+                    ).sample
                else:
                    unet_added_conditions = {"time_ids": add_time_ids}
                    prompt_embeds, pooled_prompt_embeds = encode_prompt(
@@ -2068,8 +1988,7 @@ def main(args):
                        timesteps,
                        prompt_embeds_input,
                        added_cond_kwargs=unet_added_conditions,
-                        return_dict=False,
-                    )[0]
+                    ).sample

                weighting = None
                if args.do_edm_style_training:
--- a/examples/community/README.md
+++ b/examples/community/README.md
@@ -68,8 +68,6 @@ Please also check out our [Community Scripts](https://github.com/huggingface/dif
 |   InstantID Pipeline                                                                                               | Stable Diffusion XL Pipeline that supports InstantID                                                                                                                                                                                                                                                                                                                                                 |  [InstantID Pipeline](#instantid-pipeline) | [![Hugging Face Space](https://img.shields.io/badge/🤗%20Hugging%20Face-Space-yellow)](https://huggingface.co/spaces/InstantX/InstantID) | [Haofan Wang](https://github.com/haofanwang) |
 |   UFOGen Scheduler                                                                                               | Scheduler for UFOGen Model (compatible with Stable Diffusion pipelines)                                                                                                                                                                                                                                                                                                                                                 |  [UFOGen Scheduler](#ufogen-scheduler) | - | [dg845](https://github.com/dg845) |
 | Stable Diffusion XL IPEX Pipeline | Accelerate Stable Diffusion XL inference pipeline with BF16/FP32 precision on Intel Xeon CPUs with [IPEX](https://github.com/intel/intel-extension-for-pytorch) | [Stable Diffusion XL on IPEX](#stable-diffusion-xl-on-ipex) | - | [Dan Li](https://github.com/ustcuna/) |
-| Stable Diffusion BoxDiff Pipeline | Training-free controlled generation with bounding boxes using [BoxDiff](https://github.com/showlab/BoxDiff) | [Stable Diffusion BoxDiff Pipeline](#stable-diffusion-boxdiff) | - | [Jingyang Zhang](https://github.com/zjysteven/) |
-|   FRESCO V2V Pipeline                                                                                                    | Implementation of [[CVPR 2024] FRESCO: Spatial-Temporal Correspondence for Zero-Shot Video Translation](https://arxiv.org/abs/2403.12962)                                                                                                                                                                                                                                                                                                                                                                                                                                      | [FRESCO V2V Pipeline](#fresco)      | - |              [Yifan Zhou](https://github.com/SingleZombie) |

 To load a custom pipeline you just need to pass the `custom_pipeline` argument to `DiffusionPipeline`, as one of the files in `diffusers/examples/community`. Feel free to send a PR with your own pipelines, we will merge them quickly.

@@ -240,12 +238,12 @@ pipeline_output = pipe(
    # denoising_steps=10,     # (optional) Number of denoising steps of each inference pass. Default: 10.
    # ensemble_size=10,       # (optional) Number of inference passes in the ensemble. Default: 10.
    # ------------------------------------------------
-
+    
    # ----- recommended setting for LCM version ------
    # denoising_steps=4,
    # ensemble_size=5,
    # -------------------------------------------------
-
+    
    # processing_res=768,     # (optional) Maximum resolution of processing. If set to 0: will not resize at all. Defaults to 768.
    # match_input_res=True,   # (optional) Resize depth prediction to match input resolution.
    # batch_size=0,           # (optional) Inference batch size, no bigger than `num_ensemble`. If set to 0, the script will automatically decide the proper batch size. Defaults to 0.
@@ -1032,7 +1030,7 @@ image = pipe().images[0]

 Make sure you have @crowsonkb's <https://github.com/crowsonkb/k-diffusion> installed:

-```sh
+```sh
 pip install k-diffusion
 ```

@@ -1678,68 +1676,6 @@ image = pipe(prompt, image=input_image, strength=0.75,).images[0]
 image.save('tensorrt_img2img_new_zealand_hills.png')
 ```

-### Stable Diffusion BoxDiff
-BoxDiff is a training-free method for controlled generation with bounding box coordinates. It shoud work with any Stable Diffusion model. Below shows an example with `stable-diffusion-2-1-base`.
-```py
-import torch
-from PIL import Image, ImageDraw
-from copy import deepcopy
-
-from examples.community.pipeline_stable_diffusion_boxdiff import StableDiffusionBoxDiffPipeline
-
-def draw_box_with_text(img, boxes, names):
-    colors = ["red", "olive", "blue", "green", "orange", "brown", "cyan", "purple"]
-    img_new = deepcopy(img)
-    draw = ImageDraw.Draw(img_new)
-
-    W, H = img.size
-    for bid, box in enumerate(boxes):
-        draw.rectangle([box[0] * W, box[1] * H, box[2] * W, box[3] * H], outline=colors[bid % len(colors)], width=4)
-        draw.text((box[0] * W, box[1] * H), names[bid], fill=colors[bid % len(colors)])
-    return img_new
-
-pipe = StableDiffusionBoxDiffPipeline.from_pretrained(
-    "stabilityai/stable-diffusion-2-1-base",
-    torch_dtype=torch.float16,
-)
-pipe.to("cuda")
-
-# example 1
-prompt = "as the aurora lights up the sky, a herd of reindeer leisurely wanders on the grassy meadow, admiring the breathtaking view, a serene lake quietly reflects the magnificent display, and in the distance, a snow-capped mountain stands majestically, fantasy, 8k, highly detailed"
-phrases = [
-    "aurora",
-    "reindeer",
-    "meadow",
-    "lake",
-    "mountain"
-]
-boxes = [[1,3,512,202], [75,344,421,495], [1,327,508,507], [2,217,507,341], [1,135,509,242]]
-
-# example 2
-# prompt = "A rabbit wearing sunglasses looks very proud"
-# phrases = ["rabbit", "sunglasses"]
-# boxes = [[67,87,366,512], [66,130,364,262]]
-
-boxes = [[x / 512 for x in box] for box in boxes]
-
-images = pipe(
-    prompt,
-    boxdiff_phrases=phrases,
-    boxdiff_boxes=boxes,
-    boxdiff_kwargs={
-        "attention_res": 16,
-        "normalize_eot": True
-    },
-    num_inference_steps=50,
-    guidance_scale=7.5,
-    generator=torch.manual_seed(42),
-    safety_checker=None
-).images
-
-draw_box_with_text(images[0], boxes, phrases).save("output.png")
-```
-
-
 ### Stable Diffusion Reference

 This pipeline uses the Reference Control. Refer to the [sd-webui-controlnet discussion: Reference-only Control](https://github.com/Mikubill/sd-webui-controlnet/discussions/1236)[sd-webui-controlnet discussion: Reference-adain Control](https://github.com/Mikubill/sd-webui-controlnet/discussions/1280).
@@ -1854,13 +1790,13 @@ To use this pipeline, you need to:

 You can simply use pip to install IPEX with the latest version.

-```sh
+```sh
 python -m pip install intel_extension_for_pytorch
 ```

 **Note:** To install a specific version, run with the following command:

-```sh
+```sh
 python -m pip install intel_extension_for_pytorch==<version_name> -f https://developer.intel.com/ipex-whl-stable-cpu
 ```

@@ -1958,13 +1894,13 @@ To use this pipeline, you need to:

 You can simply use pip to install IPEX with the latest version.

-```sh
+```sh
 python -m pip install intel_extension_for_pytorch
 ```

 **Note:** To install a specific version, run with the following command:

-```sh
+```sh
 python -m pip install intel_extension_for_pytorch==<version_name> -f https://developer.intel.com/ipex-whl-stable-cpu
 ```

@@ -3010,8 +2946,8 @@ This code implements a pipeline for the Stable Diffusion model, enabling the div

 ### Sample Code

-```py
-from examples.community.regional_prompting_stable_diffusion import RegionalPromptingStableDiffusionPipeline
+```py
+from examples.community.regional_prompting_stable_diffusion import RegionalPromptingStableDiffusionPipeline
 pipe = RegionalPromptingStableDiffusionPipeline.from_single_file(model_path, vae=vae)

 rp_args = {
@@ -4036,93 +3972,6 @@ onestep_image = pipe(prompt, num_inference_steps=1).images[0]
 multistep_image = pipe(prompt, num_inference_steps=4).images[0]
 ```

-### FRESCO
-
-This is the Diffusers implementation of zero-shot video-to-video translation pipeline [FRESCO](https://github.com/williamyang1991/FRESCO) (without Ebsynth postprocessing and background smooth). To run the code, please install gmflow. Then modify the path in `gmflow_dir`. After that, you can run the pipeline with:
-
-```py
-from PIL import Image
-import cv2
-import torch
-import numpy as np
-
-from diffusers import ControlNetModel,DDIMScheduler, DiffusionPipeline
-import sys
-gmflow_dir = "/path/to/gmflow"
-sys.path.insert(0, gmflow_dir)
-
-def video_to_frame(video_path: str, interval: int):
-    vidcap = cv2.VideoCapture(video_path)
-    success = True
-
-    count = 0
-    res = []
-    while success:
-        count += 1
-        success, image = vidcap.read()
-        if count % interval != 1:
-            continue
-        if image is not None:
-            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
-            res.append(image)
-            if len(res) >= 8:
-                break
-
-    vidcap.release()
-    return res
-
-
-input_video_path = 'https://github.com/williamyang1991/FRESCO/raw/main/data/car-turn.mp4'
-output_video_path = 'car.gif'
-
-# You can use any fintuned SD here
-model_path = 'SG161222/Realistic_Vision_V2.0'
-
-prompt = 'a red car turns in the winter'
-a_prompt = ', RAW photo, subject, (high detailed skin:1.2), 8k uhd, dslr, soft lighting, high quality, film grain, Fujifilm XT3, '
-n_prompt = '(deformed iris, deformed pupils, semi-realistic, cgi, 3d, render, sketch, cartoon, drawing, anime, mutated hands and fingers:1.4), (deformed, distorted, disfigured:1.3), poorly drawn, bad anatomy, wrong anatomy, extra limb, missing limb, floating limbs, disconnected limbs, mutation, mutated, ugly, disgusting, amputation'
-
-input_interval = 5
-frames = video_to_frame(
-    input_video_path, input_interval)
-
-control_frames = []
-# get canny image
-for frame in frames:
-    image = cv2.Canny(frame, 50, 100)
-    np_image = np.array(image)
-    np_image = np_image[:, :, None]
-    np_image = np.concatenate([np_image, np_image, np_image], axis=2)
-    canny_image = Image.fromarray(np_image)
-    control_frames.append(canny_image)
-
-# You can use any ControlNet here
-controlnet = ControlNetModel.from_pretrained(
-    "lllyasviel/sd-controlnet-canny").to('cuda')
-
-pipe = DiffusionPipeline.from_pretrained(
-    model_path, controlnet=controlnet, custom_pipeline='fresco_v2v').to('cuda')
-pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
-
-generator = torch.manual_seed(0)
-frames = [Image.fromarray(frame) for frame in frames]
-
-output_frames = pipe(
-    prompt + a_prompt,
-    frames,
-    control_frames,
-    num_inference_steps=20,
-    strength=0.75,
-    controlnet_conditioning_scale=0.7,
-    generator=generator,
-    negative_prompt=n_prompt
-).images
-
-output_frames[0].save(output_video_path, save_all=True,
-                 append_images=output_frames[1:], duration=100, loop=0)
-
-```
-
 # Perturbed-Attention Guidance

 [Project](https://ku-cvlab.github.io/Perturbed-Attention-Guidance/) / [arXiv](https://arxiv.org/abs/2403.17377) / [GitHub](https://github.com/KU-CVLAB/Perturbed-Attention-Guidance)
@@ -4131,7 +3980,7 @@ This implementation is based on [Diffusers](https://huggingface.co/docs/diffuser

 ## Example Usage

-```py
+```
 import os
 import torch

--- a/examples/community/bit_diffusion.py
+++ b/examples/community/bit_diffusion.py
@@ -44,9 +44,9 @@ def bits_to_decimal(x, bits=BITS):
 # modified scheduler step functions for clamping the predicted x_0 between -bit_scale and +bit_scale
 def ddim_bit_scheduler_step(
    self,
-    model_output: torch.Tensor,
+    model_output: torch.FloatTensor,
    timestep: int,
-    sample: torch.Tensor,
+    sample: torch.FloatTensor,
    eta: float = 0.0,
    use_clipped_model_output: bool = True,
    generator=None,
@@ -56,9 +56,9 @@ def ddim_bit_scheduler_step(
    Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion
    process from the learned model outputs (most often the predicted noise).
    Args:
-        model_output (`torch.Tensor`): direct output from learned diffusion model.
+        model_output (`torch.FloatTensor`): direct output from learned diffusion model.
        timestep (`int`): current discrete timestep in the diffusion chain.
-        sample (`torch.Tensor`):
+        sample (`torch.FloatTensor`):
            current instance of sample being created by diffusion process.
        eta (`float`): weight of noise for added noise in diffusion step.
        use_clipped_model_output (`bool`): TODO
@@ -134,9 +134,9 @@ def ddim_bit_scheduler_step(

 def ddpm_bit_scheduler_step(
    self,
-    model_output: torch.Tensor,
+    model_output: torch.FloatTensor,
    timestep: int,
-    sample: torch.Tensor,
+    sample: torch.FloatTensor,
    prediction_type="epsilon",
    generator=None,
    return_dict: bool = True,
@@ -145,9 +145,9 @@ def ddpm_bit_scheduler_step(
    Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion
    process from the learned model outputs (most often the predicted noise).
    Args:
-        model_output (`torch.Tensor`): direct output from learned diffusion model.
+        model_output (`torch.FloatTensor`): direct output from learned diffusion model.
        timestep (`int`): current discrete timestep in the diffusion chain.
-        sample (`torch.Tensor`):
+        sample (`torch.FloatTensor`):
            current instance of sample being created by diffusion process.
        prediction_type (`str`, default `epsilon`):
            indicates whether the model predicts the noise (epsilon), or the samples (`sample`).
--- a/examples/community/checkpoint_merger.py
+++ b/examples/community/checkpoint_merger.py
@@ -138,6 +138,8 @@ class CheckpointMergerPipeline(DiffusionPipeline):
             comparison_result &= self._compare_model_configs(config_dicts[idx - 1], config_dicts[idx])
             if not force and comparison_result is False:
+                # Dump the first two configs before raising; placed after the raise this line could never run.
+                print(config_dicts[0], config_dicts[1])
                 raise ValueError("Incompatible checkpoints. Please check model_index.json for the models.")
        print("Compatible model_index.json files found")
        # Step 2: Basic Validation has succeeded. Let's download the models and save them into our local files.
        cached_folders = []
--- a/examples/community/clip_guided_images_mixing_stable_diffusion.py
+++ b/examples/community/clip_guided_images_mixing_stable_diffusion.py
@@ -233,8 +233,8 @@ class CLIPGuidedImagesMixingStableDiffusion(DiffusionPipeline, StableDiffusionMi
    @torch.no_grad()
    def __call__(
        self,
-        style_image: Union[torch.Tensor, PIL.Image.Image],
-        content_image: Union[torch.Tensor, PIL.Image.Image],
+        style_image: Union[torch.FloatTensor, PIL.Image.Image],
+        content_image: Union[torch.FloatTensor, PIL.Image.Image],
        style_prompt: Optional[str] = None,
        content_prompt: Optional[str] = None,
        height: Optional[int] = 512,
--- a/examples/community/clip_guided_stable_diffusion.py
+++ b/examples/community/clip_guided_stable_diffusion.py
@@ -180,7 +180,7 @@ class CLIPGuidedStableDiffusion(DiffusionPipeline, StableDiffusionMixin):
        num_cutouts: Optional[int] = 4,
        use_cutouts: Optional[bool] = True,
        generator: Optional[torch.Generator] = None,
-        latents: Optional[torch.Tensor] = None,
+        latents: Optional[torch.FloatTensor] = None,
        output_type: Optional[str] = "pil",
        return_dict: bool = True,
    ):
--- a/examples/community/clip_guided_stable_diffusion_img2img.py
+++ b/examples/community/clip_guided_stable_diffusion_img2img.py
@@ -306,7 +306,7 @@ class CLIPGuidedStableDiffusion(DiffusionPipeline, StableDiffusionMixin):
        prompt: Union[str, List[str]],
        height: Optional[int] = 512,
        width: Optional[int] = 512,
-        image: Union[torch.Tensor, PIL.Image.Image] = None,
+        image: Union[torch.FloatTensor, PIL.Image.Image] = None,
        strength: float = 0.8,
        num_inference_steps: Optional[int] = 50,
        guidance_scale: Optional[float] = 7.5,
@@ -317,7 +317,7 @@ class CLIPGuidedStableDiffusion(DiffusionPipeline, StableDiffusionMixin):
        num_cutouts: Optional[int] = 4,
        use_cutouts: Optional[bool] = True,
        generator: Optional[torch.Generator] = None,
-        latents: Optional[torch.Tensor] = None,
+        latents: Optional[torch.FloatTensor] = None,
        output_type: Optional[str] = "pil",
        return_dict: bool = True,
    ):
@@ -359,16 +359,9 @@ class CLIPGuidedStableDiffusion(DiffusionPipeline, StableDiffusionMixin):

        # Preprocess image
        image = preprocess(image, width, height)
-        if latents is None:
-            latents = self.prepare_latents(
-                image,
-                latent_timestep,
-                batch_size,
-                num_images_per_prompt,
-                text_embeddings.dtype,
-                self.device,
-                generator,
-            )
+        latents = self.prepare_latents(
+            image, latent_timestep, batch_size, num_images_per_prompt, text_embeddings.dtype, self.device, generator
+        )

        if clip_guidance_scale > 0:
            if clip_prompt is not None:
--- a/examples/community/composable_stable_diffusion.py
+++ b/examples/community/composable_stable_diffusion.py
@@ -354,10 +354,10 @@ class ComposableStableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin)
        num_images_per_prompt: Optional[int] = 1,
        eta: float = 0.0,
        generator: Optional[torch.Generator] = None,
-        latents: Optional[torch.Tensor] = None,
+        latents: Optional[torch.FloatTensor] = None,
        output_type: Optional[str] = "pil",
        return_dict: bool = True,
-        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
+        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
        callback_steps: int = 1,
        weights: Optional[str] = "",
    ):
@@ -391,7 +391,7 @@ class ComposableStableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin)
            generator (`torch.Generator`, *optional*):
                A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
                deterministic.
-            latents (`torch.Tensor`, *optional*):
+            latents (`torch.FloatTensor`, *optional*):
                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                tensor will ge generated by sampling using the supplied random `generator`.
@@ -403,7 +403,7 @@ class ComposableStableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin)
                plain tuple.
            callback (`Callable`, *optional*):
                A function that will be called every `callback_steps` steps during inference. The function will be
-                called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
+                called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
            callback_steps (`int`, *optional*, defaults to 1):
                The frequency at which the `callback` function will be called. If not specified, the callback will be
                called at every step.
--- a/examples/community/ddim_noise_comparative_analysis.py
+++ b/examples/community/ddim_noise_comparative_analysis.py
@@ -103,7 +103,7 @@ class DDIMNoiseComparativeAnalysisPipeline(DiffusionPipeline):
    @torch.no_grad()
    def __call__(
        self,
-        image: Union[torch.Tensor, PIL.Image.Image] = None,
+        image: Union[torch.FloatTensor, PIL.Image.Image] = None,
        strength: float = 0.8,
        batch_size: int = 1,
        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
@@ -115,7 +115,7 @@ class DDIMNoiseComparativeAnalysisPipeline(DiffusionPipeline):
    ) -> Union[ImagePipelineOutput, Tuple]:
        r"""
        Args:
-            image (`torch.Tensor` or `PIL.Image.Image`):
+            image (`torch.FloatTensor` or `PIL.Image.Image`):
                `Image`, or tensor representing an image batch, that will be used as the starting point for the
                process.
            strength (`float`, *optional*, defaults to 0.8):
--- a/examples/community/fresco_v2v.py
+++ b/examples/community/fresco_v2v.py
--- a/examples/community/gluegen.py
+++ b/examples/community/gluegen.py
@@ -205,7 +205,7 @@ class GlueGenStableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin, Lo
        safety_checker: StableDiffusionSafetyChecker,
        feature_extractor: CLIPImageProcessor,
        language_adapter: TranslatorNoLN = None,
-        tensor_norm: torch.Tensor = None,
+        tensor_norm: torch.FloatTensor = None,
        requires_safety_checker: bool = True,
    ):
        super().__init__()
@@ -231,7 +231,7 @@ class GlueGenStableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin, Lo
        num_token: int,
        dim: int,
        dim_out: int,
-        tensor_norm: torch.Tensor,
+        tensor_norm: torch.FloatTensor,
        mult: int = 2,
        depth: int = 5,
    ):
@@ -242,7 +242,7 @@ class GlueGenStableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin, Lo
        )
        self.language_adapter.load_state_dict(torch.load(model_path))

-    def _adapt_language(self, prompt_embeds: torch.Tensor):
+    def _adapt_language(self, prompt_embeds: torch.FloatTensor):
        prompt_embeds = prompt_embeds / 3
        prompt_embeds = self.language_adapter(prompt_embeds) * (self.tensor_norm / 2)
        return prompt_embeds
@@ -254,8 +254,8 @@ class GlueGenStableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin, Lo
        num_images_per_prompt,
        do_classifier_free_guidance,
        negative_prompt=None,
-        prompt_embeds: Optional[torch.Tensor] = None,
-        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.FloatTensor] = None,
+        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
        lora_scale: Optional[float] = None,
        clip_skip: Optional[int] = None,
    ):
@@ -275,10 +275,10 @@ class GlueGenStableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin, Lo
                The prompt or prompts not to guide the image generation. If not defined, one has to pass
                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
                less than `1`).
-            prompt_embeds (`torch.Tensor`, *optional*):
+            prompt_embeds (`torch.FloatTensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                provided, text embeddings will be generated from `prompt` input argument.
-            negative_prompt_embeds (`torch.Tensor`, *optional*):
+            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
                argument.
@@ -535,7 +535,7 @@ class GlueGenStableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin, Lo
                data type of the generated embeddings

        Returns:
-            `torch.Tensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)`
+            `torch.FloatTensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)`
        """
        assert len(w.shape) == 1
        w = w * 1000.0
@@ -594,9 +594,9 @@ class GlueGenStableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin, Lo
        num_images_per_prompt: Optional[int] = 1,
        eta: float = 0.0,
        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
-        latents: Optional[torch.Tensor] = None,
-        prompt_embeds: Optional[torch.Tensor] = None,
-        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        latents: Optional[torch.FloatTensor] = None,
+        prompt_embeds: Optional[torch.FloatTensor] = None,
+        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
        output_type: Optional[str] = "pil",
        return_dict: bool = True,
        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
@@ -635,14 +635,14 @@ class GlueGenStableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin, Lo
            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
                generation deterministic.
-            latents (`torch.Tensor`, *optional*):
+            latents (`torch.FloatTensor`, *optional*):
                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                tensor is generated by sampling using the supplied random `generator`.
-            prompt_embeds (`torch.Tensor`, *optional*):
+            prompt_embeds (`torch.FloatTensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
                provided, text embeddings are generated from the `prompt` input argument.
-            negative_prompt_embeds (`torch.Tensor`, *optional*):
+            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
                Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
                not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
            ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
--- a/examples/community/hd_painter.py
+++ b/examples/community/hd_painter.py
@@ -28,10 +28,10 @@ class RASGAttnProcessor:
    def __call__(
        self,
        attn: Attention,
-        hidden_states: torch.Tensor,
-        encoder_hidden_states: Optional[torch.Tensor] = None,
-        attention_mask: Optional[torch.Tensor] = None,
-        temb: Optional[torch.Tensor] = None,
+        hidden_states: torch.FloatTensor,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        temb: Optional[torch.FloatTensor] = None,
        scale: float = 1.0,
    ) -> torch.Tensor:
        # Same as the default AttnProcessor up untill the part where similarity matrix gets saved
@@ -111,10 +111,10 @@ class PAIntAAttnProcessor:
    def __call__(
        self,
        attn: Attention,
-        hidden_states: torch.Tensor,
-        encoder_hidden_states: Optional[torch.Tensor] = None,
-        attention_mask: Optional[torch.Tensor] = None,
-        temb: Optional[torch.Tensor] = None,
+        hidden_states: torch.FloatTensor,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        temb: Optional[torch.FloatTensor] = None,
        scale: float = 1.0,
    ) -> torch.Tensor:
        # Automatically recognize the resolution of the current attention layer and resize the masks accordingly
@@ -454,7 +454,7 @@ class StableDiffusionHDPainterPipeline(StableDiffusionInpaintPipeline):
        prompt: Union[str, List[str]] = None,
        image: PipelineImageInput = None,
        mask_image: PipelineImageInput = None,
-        masked_image_latents: torch.Tensor = None,
+        masked_image_latents: torch.FloatTensor = None,
        height: Optional[int] = None,
        width: Optional[int] = None,
        padding_mask_crop: Optional[int] = None,
@@ -467,9 +467,9 @@ class StableDiffusionHDPainterPipeline(StableDiffusionInpaintPipeline):
        num_images_per_prompt: Optional[int] = 1,
        eta: float = 0.01,
        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
-        latents: Optional[torch.Tensor] = None,
-        prompt_embeds: Optional[torch.Tensor] = None,
-        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        latents: Optional[torch.FloatTensor] = None,
+        prompt_embeds: Optional[torch.FloatTensor] = None,
+        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
        ip_adapter_image: Optional[PipelineImageInput] = None,
        output_type: Optional[str] = "pil",
        return_dict: bool = True,
--- a/examples/community/iadb.py
+++ b/examples/community/iadb.py
@@ -17,21 +17,21 @@ class IADBScheduler(SchedulerMixin, ConfigMixin):

    def step(
        self,
-        model_output: torch.Tensor,
+        model_output: torch.FloatTensor,
        timestep: int,
-        x_alpha: torch.Tensor,
-    ) -> torch.Tensor:
+        x_alpha: torch.FloatTensor,
+    ) -> torch.FloatTensor:
        """
        Predict the sample at the previous timestep by reversing the ODE. Core function to propagate the diffusion
        process from the learned model outputs (most often the predicted noise).

        Args:
-            model_output (`torch.Tensor`): direct output from learned diffusion model. It is the direction from x0 to x1.
+            model_output (`torch.FloatTensor`): direct output from learned diffusion model. It is the direction from x0 to x1.
            timestep (`float`): current timestep in the diffusion chain.
-            x_alpha (`torch.Tensor`): x_alpha sample for the current timestep
+            x_alpha (`torch.FloatTensor`): x_alpha sample for the current timestep

        Returns:
-            `torch.Tensor`: the sample at the previous timestep
+            `torch.FloatTensor`: the sample at the previous timestep

        """
        if self.num_inference_steps is None:
@@ -53,10 +53,10 @@ class IADBScheduler(SchedulerMixin, ConfigMixin):

    def add_noise(
        self,
-        original_samples: torch.Tensor,
-        noise: torch.Tensor,
-        alpha: torch.Tensor,
-    ) -> torch.Tensor:
+        original_samples: torch.FloatTensor,
+        noise: torch.FloatTensor,
+        alpha: torch.FloatTensor,
+    ) -> torch.FloatTensor:
        return original_samples * alpha + noise * (1 - alpha)

    def __len__(self):
--- a/examples/community/imagic_stable_diffusion.py
+++ b/examples/community/imagic_stable_diffusion.py
@@ -110,7 +110,7 @@ class ImagicStableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin):
    def train(
        self,
        prompt: Union[str, List[str]],
-        image: Union[torch.Tensor, PIL.Image.Image],
+        image: Union[torch.FloatTensor, PIL.Image.Image],
        height: Optional[int] = 512,
        width: Optional[int] = 512,
        generator: Optional[torch.Generator] = None,
@@ -144,7 +144,7 @@ class ImagicStableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin):
            generator (`torch.Generator`, *optional*):
                A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
                deterministic.
-            latents (`torch.Tensor`, *optional*):
+            latents (`torch.FloatTensor`, *optional*):
                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                tensor will ge generated by sampling using the supplied random `generator`.
--- a/examples/community/img2img_inpainting.py
+++ b/examples/community/img2img_inpainting.py
@@ -133,9 +133,9 @@ class ImageToImageInpaintingPipeline(DiffusionPipeline):
    def __call__(
        self,
        prompt: Union[str, List[str]],
-        image: Union[torch.Tensor, PIL.Image.Image],
-        inner_image: Union[torch.Tensor, PIL.Image.Image],
-        mask_image: Union[torch.Tensor, PIL.Image.Image],
+        image: Union[torch.FloatTensor, PIL.Image.Image],
+        inner_image: Union[torch.FloatTensor, PIL.Image.Image],
+        mask_image: Union[torch.FloatTensor, PIL.Image.Image],
        height: int = 512,
        width: int = 512,
        num_inference_steps: int = 50,
@@ -144,10 +144,10 @@ class ImageToImageInpaintingPipeline(DiffusionPipeline):
        num_images_per_prompt: Optional[int] = 1,
        eta: float = 0.0,
        generator: Optional[torch.Generator] = None,
-        latents: Optional[torch.Tensor] = None,
+        latents: Optional[torch.FloatTensor] = None,
        output_type: Optional[str] = "pil",
        return_dict: bool = True,
-        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
+        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
        callback_steps: int = 1,
        **kwargs,
    ):
@@ -194,7 +194,7 @@ class ImageToImageInpaintingPipeline(DiffusionPipeline):
            generator (`torch.Generator`, *optional*):
                A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
                deterministic.
-            latents (`torch.Tensor`, *optional*):
+            latents (`torch.FloatTensor`, *optional*):
                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                tensor will ge generated by sampling using the supplied random `generator`.
@@ -206,7 +206,7 @@ class ImageToImageInpaintingPipeline(DiffusionPipeline):
                plain tuple.
            callback (`Callable`, *optional*):
                A function that will be called every `callback_steps` steps during inference. The function will be
-                called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
+                called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
            callback_steps (`int`, *optional*, defaults to 1):
                The frequency at which the `callback` function will be called. If not specified, the callback will be
                called at every step.
--- a/examples/community/instaflow_one_step.py
+++ b/examples/community/instaflow_one_step.py
@@ -189,8 +189,8 @@ class InstaFlowPipeline(
        num_images_per_prompt,
        do_classifier_free_guidance,
        negative_prompt=None,
-        prompt_embeds: Optional[torch.Tensor] = None,
-        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.FloatTensor] = None,
+        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
        lora_scale: Optional[float] = None,
    ):
        deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple."
@@ -219,8 +219,8 @@ class InstaFlowPipeline(
        num_images_per_prompt,
        do_classifier_free_guidance,
        negative_prompt=None,
-        prompt_embeds: Optional[torch.Tensor] = None,
-        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.FloatTensor] = None,
+        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
        lora_scale: Optional[float] = None,
    ):
        r"""
@@ -239,10 +239,10 @@ class InstaFlowPipeline(
                The prompt or prompts not to guide the image generation. If not defined, one has to pass
                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
                less than `1`).
-            prompt_embeds (`torch.Tensor`, *optional*):
+            prompt_embeds (`torch.FloatTensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                provided, text embeddings will be generated from `prompt` input argument.
-            negative_prompt_embeds (`torch.Tensor`, *optional*):
+            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
                argument.
@@ -501,12 +501,12 @@ class InstaFlowPipeline(
        num_images_per_prompt: Optional[int] = 1,
        eta: float = 0.0,
        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
-        latents: Optional[torch.Tensor] = None,
-        prompt_embeds: Optional[torch.Tensor] = None,
-        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        latents: Optional[torch.FloatTensor] = None,
+        prompt_embeds: Optional[torch.FloatTensor] = None,
+        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
        output_type: Optional[str] = "pil",
        return_dict: bool = True,
-        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
+        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
        callback_steps: int = 1,
        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
        guidance_rescale: float = 0.0,
@@ -538,14 +538,14 @@ class InstaFlowPipeline(
            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
                generation deterministic.
-            latents (`torch.Tensor`, *optional*):
+            latents (`torch.FloatTensor`, *optional*):
                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                tensor is generated by sampling using the supplied random `generator`.
-            prompt_embeds (`torch.Tensor`, *optional*):
+            prompt_embeds (`torch.FloatTensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
                provided, text embeddings are generated from the `prompt` input argument.
-            negative_prompt_embeds (`torch.Tensor`, *optional*):
+            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
                Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
                not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
            output_type (`str`, *optional*, defaults to `"pil"`):
@@ -555,7 +555,7 @@ class InstaFlowPipeline(
                plain tuple.
            callback (`Callable`, *optional*):
                A function that calls every `callback_steps` steps during inference. The function is called with the
-                following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
+                following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
            callback_steps (`int`, *optional*, defaults to 1):
                The frequency at which the `callback` function is called. If not specified, the callback is called at
                every step.
--- a/examples/community/interpolate_stable_diffusion.py
+++ b/examples/community/interpolate_stable_diffusion.py
@@ -132,12 +132,12 @@ class StableDiffusionWalkPipeline(DiffusionPipeline, StableDiffusionMixin):
        num_images_per_prompt: Optional[int] = 1,
        eta: float = 0.0,
        generator: Optional[torch.Generator] = None,
-        latents: Optional[torch.Tensor] = None,
+        latents: Optional[torch.FloatTensor] = None,
        output_type: Optional[str] = "pil",
        return_dict: bool = True,
-        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
+        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
        callback_steps: int = 1,
-        text_embeddings: Optional[torch.Tensor] = None,
+        text_embeddings: Optional[torch.FloatTensor] = None,
        **kwargs,
    ):
        r"""
@@ -170,7 +170,7 @@ class StableDiffusionWalkPipeline(DiffusionPipeline, StableDiffusionMixin):
            generator (`torch.Generator`, *optional*):
                A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
                deterministic.
-            latents (`torch.Tensor`, *optional*):
+            latents (`torch.FloatTensor`, *optional*):
                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                tensor will ge generated by sampling using the supplied random `generator`.
@@ -182,11 +182,11 @@ class StableDiffusionWalkPipeline(DiffusionPipeline, StableDiffusionMixin):
                plain tuple.
            callback (`Callable`, *optional*):
                A function that will be called every `callback_steps` steps during inference. The function will be
-                called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
+                called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
            callback_steps (`int`, *optional*, defaults to 1):
                The frequency at which the `callback` function will be called. If not specified, the callback will be
                called at every step.
-            text_embeddings (`torch.Tensor`, *optional*, defaults to `None`):
+            text_embeddings (`torch.FloatTensor`, *optional*, defaults to `None`):
                Pre-generated text embeddings to be used as inputs for image generation. Can be used in place of
                `prompt` to avoid re-computing the embeddings. If not provided, the embeddings will be generated from
                the supplied `prompt`.
--- a/examples/community/ip_adapter_face_id.py
+++ b/examples/community/ip_adapter_face_id.py
@@ -62,7 +62,7 @@ class IPAdapterFullImageProjection(nn.Module):
        self.ff = FeedForward(image_embed_dim, cross_attention_dim * num_tokens, mult=mult, activation_fn="gelu")
        self.norm = nn.LayerNorm(cross_attention_dim)

-    def forward(self, image_embeds: torch.Tensor):
+    def forward(self, image_embeds: torch.FloatTensor):
        x = self.ff(image_embeds)
        x = x.reshape(-1, self.num_tokens, self.cross_attention_dim)
        return self.norm(x)
@@ -452,8 +452,8 @@ class IPAdapterFaceIDStableDiffusionPipeline(
        num_images_per_prompt,
        do_classifier_free_guidance,
        negative_prompt=None,
-        prompt_embeds: Optional[torch.Tensor] = None,
-        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.FloatTensor] = None,
+        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
        lora_scale: Optional[float] = None,
        **kwargs,
    ):
@@ -484,8 +484,8 @@ class IPAdapterFaceIDStableDiffusionPipeline(
        num_images_per_prompt,
        do_classifier_free_guidance,
        negative_prompt=None,
-        prompt_embeds: Optional[torch.Tensor] = None,
-        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.FloatTensor] = None,
+        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
        lora_scale: Optional[float] = None,
        clip_skip: Optional[int] = None,
    ):
@@ -505,10 +505,10 @@ class IPAdapterFaceIDStableDiffusionPipeline(
                The prompt or prompts not to guide the image generation. If not defined, one has to pass
                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
                less than `1`).
-            prompt_embeds (`torch.Tensor`, *optional*):
+            prompt_embeds (`torch.FloatTensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                provided, text embeddings will be generated from `prompt` input argument.
-            negative_prompt_embeds (`torch.Tensor`, *optional*):
+            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
                argument.
@@ -788,7 +788,7 @@ class IPAdapterFaceIDStableDiffusionPipeline(
                data type of the generated embeddings

        Returns:
-            `torch.Tensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)`
+            `torch.FloatTensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)`
        """
        assert len(w.shape) == 1
        w = w * 1000.0
@@ -847,10 +847,10 @@ class IPAdapterFaceIDStableDiffusionPipeline(
        num_images_per_prompt: Optional[int] = 1,
        eta: float = 0.0,
        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
-        latents: Optional[torch.Tensor] = None,
-        prompt_embeds: Optional[torch.Tensor] = None,
-        negative_prompt_embeds: Optional[torch.Tensor] = None,
-        image_embeds: Optional[torch.Tensor] = None,
+        latents: Optional[torch.FloatTensor] = None,
+        prompt_embeds: Optional[torch.FloatTensor] = None,
+        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+        image_embeds: Optional[torch.FloatTensor] = None,
        output_type: Optional[str] = "pil",
        return_dict: bool = True,
        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
@@ -891,17 +891,17 @@ class IPAdapterFaceIDStableDiffusionPipeline(
            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
                generation deterministic.
-            latents (`torch.Tensor`, *optional*):
+            latents (`torch.FloatTensor`, *optional*):
                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                tensor is generated by sampling using the supplied random `generator`.
-            prompt_embeds (`torch.Tensor`, *optional*):
+            prompt_embeds (`torch.FloatTensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
                provided, text embeddings are generated from the `prompt` input argument.
-            negative_prompt_embeds (`torch.Tensor`, *optional*):
+            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
                Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
                not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
-            image_embeds (`torch.Tensor`, *optional*):
+            image_embeds (`torch.FloatTensor`, *optional*):
                Pre-generated image embeddings.
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
--- a/examples/community/latent_consistency_img2img.py
+++ b/examples/community/latent_consistency_img2img.py
@@ -88,7 +88,7 @@ class LatentConsistencyModelImg2ImgPipeline(DiffusionPipeline):
                torch device
            num_images_per_prompt (`int`):
                number of images that should be generated per prompt
-            prompt_embeds (`torch.Tensor`, *optional*):
+            prompt_embeds (`torch.FloatTensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                provided, text embeddings will be generated from `prompt` input argument.
        """
@@ -240,6 +240,14 @@ class LatentConsistencyModelImg2ImgPipeline(DiffusionPipeline):

        return latents

+        if latents is None:
+            latents = torch.randn(shape, dtype=dtype).to(device)
+        else:
+            latents = latents.to(device)
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+        return latents
+
    def get_w_embedding(self, w, embedding_dim=512, dtype=torch.float32):
        """
        see https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298
@@ -282,10 +290,10 @@ class LatentConsistencyModelImg2ImgPipeline(DiffusionPipeline):
        width: Optional[int] = 768,
        guidance_scale: float = 7.5,
        num_images_per_prompt: Optional[int] = 1,
-        latents: Optional[torch.Tensor] = None,
+        latents: Optional[torch.FloatTensor] = None,
        num_inference_steps: int = 4,
        lcm_origin_steps: int = 50,
-        prompt_embeds: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.FloatTensor] = None,
        output_type: Optional[str] = "pil",
        return_dict: bool = True,
        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
@@ -327,18 +335,17 @@ class LatentConsistencyModelImg2ImgPipeline(DiffusionPipeline):

        # 5. Prepare latent variable
        num_channels_latents = self.unet.config.in_channels
-        if latents is None:
-            latents = self.prepare_latents(
-                image,
-                latent_timestep,
-                batch_size * num_images_per_prompt,
-                num_channels_latents,
-                height,
-                width,
-                prompt_embeds.dtype,
-                device,
-                latents,
-            )
+        latents = self.prepare_latents(
+            image,
+            latent_timestep,
+            batch_size * num_images_per_prompt,
+            num_channels_latents,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            latents,
+        )
        bs = batch_size * num_images_per_prompt

        # 6. Get Guidance Scale Embedding
@@ -395,16 +402,16 @@ class LCMSchedulerOutput(BaseOutput):
    """
    Output class for the scheduler's `step` function output.
    Args:
-        prev_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images):
+        prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images):
            Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the
            denoising loop.
-        pred_original_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images):
+        pred_original_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images):
            The predicted denoised sample `(x_{0})` based on the model output from the current timestep.
            `pred_original_sample` can be used to preview progress or for guidance.
    """

-    prev_sample: torch.Tensor
-    denoised: Optional[torch.Tensor] = None
+    prev_sample: torch.FloatTensor
+    denoised: Optional[torch.FloatTensor] = None


 # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
@@ -452,10 +459,10 @@ def rescale_zero_terminal_snr(betas):
    """
    Rescales betas to have zero terminal SNR Based on https://arxiv.org/pdf/2305.08891.pdf (Algorithm 1)
    Args:
-        betas (`torch.Tensor`):
+        betas (`torch.FloatTensor`):
            the betas that the scheduler is being initialized with.
    Returns:
-        `torch.Tensor`: rescaled betas with zero terminal SNR
+        `torch.FloatTensor`: rescaled betas with zero terminal SNR
    """
    # Convert betas to alphas_bar_sqrt
    alphas = 1.0 - betas
@@ -565,7 +572,7 @@ class LCMSchedulerWithTimestamp(SchedulerMixin, ConfigMixin):
            # Glide cosine schedule
            self.betas = betas_for_alpha_bar(num_train_timesteps)
        else:
-            raise NotImplementedError(f"{beta_schedule} is not implemented for {self.__class__}")
+            raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}")

        # Rescale for zero SNR
        if rescale_betas_zero_snr:
@@ -587,17 +594,17 @@ class LCMSchedulerWithTimestamp(SchedulerMixin, ConfigMixin):
        self.num_inference_steps = None
        self.timesteps = torch.from_numpy(np.arange(0, num_train_timesteps)[::-1].copy().astype(np.int64))

-    def scale_model_input(self, sample: torch.Tensor, timestep: Optional[int] = None) -> torch.Tensor:
+    def scale_model_input(self, sample: torch.FloatTensor, timestep: Optional[int] = None) -> torch.FloatTensor:
        """
        Ensures interchangeability with schedulers that need to scale the denoising model input depending on the
        current timestep.
        Args:
-            sample (`torch.Tensor`):
+            sample (`torch.FloatTensor`):
                The input sample.
            timestep (`int`, *optional*):
                The current timestep in the diffusion chain.
        Returns:
-            `torch.Tensor`:
+            `torch.FloatTensor`:
                A scaled input sample.
        """
        return sample
@@ -613,7 +620,7 @@ class LCMSchedulerWithTimestamp(SchedulerMixin, ConfigMixin):
        return variance

    # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample
-    def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor:
+    def _threshold_sample(self, sample: torch.FloatTensor) -> torch.FloatTensor:
        """
        "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the
        prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by
@@ -685,25 +692,25 @@ class LCMSchedulerWithTimestamp(SchedulerMixin, ConfigMixin):

    def step(
        self,
-        model_output: torch.Tensor,
+        model_output: torch.FloatTensor,
        timeindex: int,
        timestep: int,
-        sample: torch.Tensor,
+        sample: torch.FloatTensor,
        eta: float = 0.0,
        use_clipped_model_output: bool = False,
        generator=None,
-        variance_noise: Optional[torch.Tensor] = None,
+        variance_noise: Optional[torch.FloatTensor] = None,
        return_dict: bool = True,
    ) -> Union[LCMSchedulerOutput, Tuple]:
        """
        Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion
        process from the learned model outputs (most often the predicted noise).
        Args:
-            model_output (`torch.Tensor`):
+            model_output (`torch.FloatTensor`):
                The direct output from learned diffusion model.
            timestep (`float`):
                The current discrete timestep in the diffusion chain.
-            sample (`torch.Tensor`):
+            sample (`torch.FloatTensor`):
                A current instance of a sample created by the diffusion process.
            eta (`float`):
                The weight of noise for added noise in diffusion step.
@@ -714,7 +721,7 @@ class LCMSchedulerWithTimestamp(SchedulerMixin, ConfigMixin):
                `use_clipped_model_output` has no effect.
            generator (`torch.Generator`, *optional*):
                A random number generator.
-            variance_noise (`torch.Tensor`):
+            variance_noise (`torch.FloatTensor`):
                Alternative to generating noise with `generator` by directly providing the noise for the variance
                itself. Useful for methods such as [`CycleDiffusion`].
            return_dict (`bool`, *optional*, defaults to `True`):
@@ -777,10 +784,10 @@ class LCMSchedulerWithTimestamp(SchedulerMixin, ConfigMixin):
    # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.add_noise
    def add_noise(
        self,
-        original_samples: torch.Tensor,
-        noise: torch.Tensor,
+        original_samples: torch.FloatTensor,
+        noise: torch.FloatTensor,
        timesteps: torch.IntTensor,
-    ) -> torch.Tensor:
+    ) -> torch.FloatTensor:
        # Make sure alphas_cumprod and timestep have same device and dtype as original_samples
        alphas_cumprod = self.alphas_cumprod.to(device=original_samples.device, dtype=original_samples.dtype)
        timesteps = timesteps.to(original_samples.device)
@@ -799,7 +806,9 @@ class LCMSchedulerWithTimestamp(SchedulerMixin, ConfigMixin):
        return noisy_samples

    # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.get_velocity
-    def get_velocity(self, sample: torch.Tensor, noise: torch.Tensor, timesteps: torch.IntTensor) -> torch.Tensor:
+    def get_velocity(
+        self, sample: torch.FloatTensor, noise: torch.FloatTensor, timesteps: torch.IntTensor
+    ) -> torch.FloatTensor:
        # Make sure alphas_cumprod and timestep have same device and dtype as sample
        alphas_cumprod = self.alphas_cumprod.to(device=sample.device, dtype=sample.dtype)
        timesteps = timesteps.to(sample.device)
--- a/examples/community/latent_consistency_interpolate.py
+++ b/examples/community/latent_consistency_interpolate.py
@@ -281,8 +281,8 @@ class LatentConsistencyModelWalkPipeline(
        num_images_per_prompt,
        do_classifier_free_guidance,
        negative_prompt=None,
-        prompt_embeds: Optional[torch.Tensor] = None,
-        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.FloatTensor] = None,
+        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
        lora_scale: Optional[float] = None,
        clip_skip: Optional[int] = None,
    ):
@@ -302,10 +302,10 @@ class LatentConsistencyModelWalkPipeline(
                The prompt or prompts not to guide the image generation. If not defined, one has to pass
                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
                less than `1`).
-            prompt_embeds (`torch.Tensor`, *optional*):
+            prompt_embeds (`torch.FloatTensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                provided, text embeddings will be generated from `prompt` input argument.
-            negative_prompt_embeds (`torch.Tensor`, *optional*):
+            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
                argument.
@@ -506,7 +506,7 @@ class LatentConsistencyModelWalkPipeline(
                data type of the generated embeddings

        Returns:
-            `torch.Tensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)`
+            `torch.FloatTensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)`
        """
        assert len(w.shape) == 1
        w = w * 1000.0
@@ -546,7 +546,7 @@ class LatentConsistencyModelWalkPipeline(
        height: int,
        width: int,
        callback_steps: int,
-        prompt_embeds: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.FloatTensor] = None,
        callback_on_step_end_tensor_inputs=None,
    ):
        if height % 8 != 0 or width % 8 != 0:
@@ -580,11 +580,11 @@ class LatentConsistencyModelWalkPipeline(
    @torch.no_grad()
    def interpolate_embedding(
        self,
-        start_embedding: torch.Tensor,
-        end_embedding: torch.Tensor,
+        start_embedding: torch.FloatTensor,
+        end_embedding: torch.FloatTensor,
        num_interpolation_steps: Union[int, List[int]],
        interpolation_type: str,
-    ) -> torch.Tensor:
+    ) -> torch.FloatTensor:
        if interpolation_type == "lerp":
            interpolation_fn = lerp
        elif interpolation_type == "slerp":
@@ -611,11 +611,11 @@ class LatentConsistencyModelWalkPipeline(
    @torch.no_grad()
    def interpolate_latent(
        self,
-        start_latent: torch.Tensor,
-        end_latent: torch.Tensor,
+        start_latent: torch.FloatTensor,
+        end_latent: torch.FloatTensor,
        num_interpolation_steps: Union[int, List[int]],
        interpolation_type: str,
-    ) -> torch.Tensor:
+    ) -> torch.FloatTensor:
        if interpolation_type == "lerp":
            interpolation_fn = lerp
        elif interpolation_type == "slerp":
@@ -663,8 +663,8 @@ class LatentConsistencyModelWalkPipeline(
        guidance_scale: float = 8.5,
        num_images_per_prompt: Optional[int] = 1,
        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
-        latents: Optional[torch.Tensor] = None,
-        prompt_embeds: Optional[torch.Tensor] = None,
+        latents: Optional[torch.FloatTensor] = None,
+        prompt_embeds: Optional[torch.FloatTensor] = None,
        output_type: Optional[str] = "pil",
        return_dict: bool = True,
        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
@@ -705,11 +705,11 @@ class LatentConsistencyModelWalkPipeline(
            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
                generation deterministic.
-            latents (`torch.Tensor`, *optional*):
+            latents (`torch.FloatTensor`, *optional*):
                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                tensor is generated by sampling using the supplied random `generator`.
-            prompt_embeds (`torch.Tensor`, *optional*):
+            prompt_embeds (`torch.FloatTensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
                provided, text embeddings are generated from the `prompt` input argument.
            output_type (`str`, *optional*, defaults to `"pil"`):
--- a/examples/community/latent_consistency_txt2img.py
+++ b/examples/community/latent_consistency_txt2img.py
@@ -86,7 +86,7 @@ class LatentConsistencyModelPipeline(DiffusionPipeline):
                torch device
            num_images_per_prompt (`int`):
                number of images that should be generated per prompt
-            prompt_embeds (`torch.Tensor`, *optional*):
+            prompt_embeds (`torch.FloatTensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                provided, text embeddings will be generated from `prompt` input argument.
        """
@@ -208,10 +208,10 @@ class LatentConsistencyModelPipeline(DiffusionPipeline):
        width: Optional[int] = 768,
        guidance_scale: float = 7.5,
        num_images_per_prompt: Optional[int] = 1,
-        latents: Optional[torch.Tensor] = None,
+        latents: Optional[torch.FloatTensor] = None,
        num_inference_steps: int = 4,
        lcm_origin_steps: int = 50,
-        prompt_embeds: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.FloatTensor] = None,
        output_type: Optional[str] = "pil",
        return_dict: bool = True,
        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
@@ -310,16 +310,16 @@ class LCMSchedulerOutput(BaseOutput):
    """
    Output class for the scheduler's `step` function output.
    Args:
-        prev_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images):
+        prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images):
            Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the
            denoising loop.
-        pred_original_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images):
+        pred_original_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images):
            The predicted denoised sample `(x_{0})` based on the model output from the current timestep.
            `pred_original_sample` can be used to preview progress or for guidance.
    """

-    prev_sample: torch.Tensor
-    denoised: Optional[torch.Tensor] = None
+    prev_sample: torch.FloatTensor
+    denoised: Optional[torch.FloatTensor] = None


 # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
@@ -367,10 +367,10 @@ def rescale_zero_terminal_snr(betas):
    """
    Rescales betas to have zero terminal SNR Based on https://arxiv.org/pdf/2305.08891.pdf (Algorithm 1)
    Args:
-        betas (`torch.Tensor`):
+        betas (`torch.FloatTensor`):
            the betas that the scheduler is being initialized with.
    Returns:
-        `torch.Tensor`: rescaled betas with zero terminal SNR
+        `torch.FloatTensor`: rescaled betas with zero terminal SNR
    """
    # Convert betas to alphas_bar_sqrt
    alphas = 1.0 - betas
@@ -477,7 +477,7 @@ class LCMScheduler(SchedulerMixin, ConfigMixin):
            # Glide cosine schedule
            self.betas = betas_for_alpha_bar(num_train_timesteps)
        else:
-            raise NotImplementedError(f"{beta_schedule} is not implemented for {self.__class__}")
+            raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}")

        # Rescale for zero SNR
        if rescale_betas_zero_snr:
@@ -499,17 +499,17 @@ class LCMScheduler(SchedulerMixin, ConfigMixin):
        self.num_inference_steps = None
        self.timesteps = torch.from_numpy(np.arange(0, num_train_timesteps)[::-1].copy().astype(np.int64))

-    def scale_model_input(self, sample: torch.Tensor, timestep: Optional[int] = None) -> torch.Tensor:
+    def scale_model_input(self, sample: torch.FloatTensor, timestep: Optional[int] = None) -> torch.FloatTensor:
        """
        Ensures interchangeability with schedulers that need to scale the denoising model input depending on the
        current timestep.
        Args:
-            sample (`torch.Tensor`):
+            sample (`torch.FloatTensor`):
                The input sample.
            timestep (`int`, *optional*):
                The current timestep in the diffusion chain.
        Returns:
-            `torch.Tensor`:
+            `torch.FloatTensor`:
                A scaled input sample.
        """
        return sample
@@ -525,7 +525,7 @@ class LCMScheduler(SchedulerMixin, ConfigMixin):
        return variance

    # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample
-    def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor:
+    def _threshold_sample(self, sample: torch.FloatTensor) -> torch.FloatTensor:
        """
        "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the
        prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by
@@ -593,25 +593,25 @@ class LCMScheduler(SchedulerMixin, ConfigMixin):

    def step(
        self,
-        model_output: torch.Tensor,
+        model_output: torch.FloatTensor,
        timeindex: int,
        timestep: int,
-        sample: torch.Tensor,
+        sample: torch.FloatTensor,
        eta: float = 0.0,
        use_clipped_model_output: bool = False,
        generator=None,
-        variance_noise: Optional[torch.Tensor] = None,
+        variance_noise: Optional[torch.FloatTensor] = None,
        return_dict: bool = True,
    ) -> Union[LCMSchedulerOutput, Tuple]:
        """
        Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion
        process from the learned model outputs (most often the predicted noise).
        Args:
-            model_output (`torch.Tensor`):
+            model_output (`torch.FloatTensor`):
                The direct output from learned diffusion model.
            timestep (`float`):
                The current discrete timestep in the diffusion chain.
-            sample (`torch.Tensor`):
+            sample (`torch.FloatTensor`):
                A current instance of a sample created by the diffusion process.
            eta (`float`):
                The weight of noise for added noise in diffusion step.
@@ -622,7 +622,7 @@ class LCMScheduler(SchedulerMixin, ConfigMixin):
                `use_clipped_model_output` has no effect.
            generator (`torch.Generator`, *optional*):
                A random number generator.
-            variance_noise (`torch.Tensor`):
+            variance_noise (`torch.FloatTensor`):
                Alternative to generating noise with `generator` by directly providing the noise for the variance
                itself. Useful for methods such as [`CycleDiffusion`].
            return_dict (`bool`, *optional*, defaults to `True`):
@@ -685,10 +685,10 @@ class LCMScheduler(SchedulerMixin, ConfigMixin):
    # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.add_noise
    def add_noise(
        self,
-        original_samples: torch.Tensor,
-        noise: torch.Tensor,
+        original_samples: torch.FloatTensor,
+        noise: torch.FloatTensor,
        timesteps: torch.IntTensor,
-    ) -> torch.Tensor:
+    ) -> torch.FloatTensor:
        # Make sure alphas_cumprod and timestep have same device and dtype as original_samples
        alphas_cumprod = self.alphas_cumprod.to(device=original_samples.device, dtype=original_samples.dtype)
        timesteps = timesteps.to(original_samples.device)
@@ -707,7 +707,9 @@ class LCMScheduler(SchedulerMixin, ConfigMixin):
        return noisy_samples

    # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.get_velocity
-    def get_velocity(self, sample: torch.Tensor, noise: torch.Tensor, timesteps: torch.IntTensor) -> torch.Tensor:
+    def get_velocity(
+        self, sample: torch.FloatTensor, noise: torch.FloatTensor, timesteps: torch.IntTensor
+    ) -> torch.FloatTensor:
        # Make sure alphas_cumprod and timestep have same device and dtype as sample
        alphas_cumprod = self.alphas_cumprod.to(device=sample.device, dtype=sample.dtype)
        timesteps = timesteps.to(sample.device)
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Sayak Paul	541b89b3e4	Merge branch 'main' into fix-lora-device-test	2024-04-25 17:13:28 +05:30
Dhruv Nair	ff7a10dedc	Merge branch 'main' into fix-lora-device-test	2024-04-24 10:58:50 +05:30
sayakpaul	c8b10a4656	empty	2024-04-23 20:40:11 +05:30
Sayak Paul	8058612d73	Merge branch 'main' into fix-lora-device-test	2024-04-23 15:30:26 +05:30
sayakpaul	c55f925f10	quality	2024-04-22 17:23:42 +05:30
sayakpaul	4faf220b68	fix more/	2024-04-22 17:20:26 +05:30
sayakpaul	3874e8cc6e	fix more.	2024-04-22 17:18:43 +05:30
sayakpaul	edb6cd74f7	fix lora device test	2024-04-22 17:17:25 +05:30