Merge branch 'main' into pipeline-fetcher

update
2025-12-08 13:34:27 +08:00 · 2025-02-21 14:41:37 +05:30 · 2025-02-21 14:00:02 +05:30 · 2025-02-21 13:17:33 +05:30
796 changed files with 12676 additions and 87189 deletions
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@@ -38,7 +38,6 @@ jobs:
          python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
          python -m uv pip install -e [quality,test]
          python -m uv pip install pandas peft
-          python -m uv pip uninstall transformers && python -m uv pip install transformers==4.48.0
      - name: Environment
        run: |
          python utils/print_env.py
--- a/.github/workflows/nightly_tests.yml
+++ b/.github/workflows/nightly_tests.yml
@@ -142,7 +142,6 @@ jobs:
        HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
        # https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
        CUBLAS_WORKSPACE_CONFIG: :16:8
-        RUN_COMPILE: yes
      run: |
        python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
          -s -v -k "not Flax and not Onnx" \
@@ -181,55 +180,6 @@ jobs:
        pip install slack_sdk tabulate
        python utils/log_reports.py >> $GITHUB_STEP_SUMMARY

-  run_torch_compile_tests:
-    name: PyTorch Compile CUDA tests
-
-    runs-on:
-      group: aws-g4dn-2xlarge
-
-    container:
-      image: diffusers/diffusers-pytorch-compile-cuda
-      options: --gpus 0 --shm-size "16gb" --ipc host
-
-    steps:
-    - name: Checkout diffusers
-      uses: actions/checkout@v3
-      with:
-        fetch-depth: 2
-
-    - name: NVIDIA-SMI
-      run: |
-        nvidia-smi
-    - name: Install dependencies
-      run: |
-        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
-        python -m uv pip install -e [quality,test,training]
-    - name: Environment
-      run: |
-        python utils/print_env.py
-    - name: Run torch compile tests on GPU
-      env:
-        HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
-        RUN_COMPILE: yes
-      run: |
-        python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile -s -v -k "compile" --make-reports=tests_torch_compile_cuda tests/
-    - name: Failure short reports
-      if: ${{ failure() }}
-      run: cat reports/tests_torch_compile_cuda_failures_short.txt
-
-    - name: Test suite reports artifacts
-      if: ${{ always() }}
-      uses: actions/upload-artifact@v4
-      with:
-        name: torch_compile_test_reports
-        path: reports
-
-    - name: Generate Report and Notify Channel
-      if: always()
-      run: |
-        pip install slack_sdk tabulate
-        python utils/log_reports.py >> $GITHUB_STEP_SUMMARY
-  
  run_big_gpu_torch_tests:
    name: Torch tests on big GPU
    strategy:
@@ -464,16 +414,10 @@ jobs:
        config:
          - backend: "bitsandbytes"
            test_location: "bnb"
-            additional_deps: ["peft"]
          - backend: "gguf"
            test_location: "gguf"
-            additional_deps: ["peft"]
          - backend: "torchao"
            test_location: "torchao"
-            additional_deps: []
-          - backend: "optimum_quanto"
-            test_location: "quanto"
-            additional_deps: []
    runs-on:
      group: aws-g6e-xlarge-plus
    container:
@@ -491,9 +435,6 @@ jobs:
          python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
          python -m uv pip install -e [quality,test]
          python -m uv pip install -U ${{ matrix.config.backend }}
-          if [ "${{ join(matrix.config.additional_deps, ' ') }}" != "" ]; then
-              python -m uv pip install ${{ join(matrix.config.additional_deps, ' ') }}
-          fi
          python -m uv pip install pytest-reportlog
      - name: Environment
        run: |
@@ -526,60 +467,6 @@ jobs:
          pip install slack_sdk tabulate
          python utils/log_reports.py >> $GITHUB_STEP_SUMMARY

-  run_nightly_pipeline_level_quantization_tests:
-    name: Torch quantization nightly tests
-    strategy:
-      fail-fast: false
-      max-parallel: 2
-    runs-on:
-      group: aws-g6e-xlarge-plus
-    container:
-      image: diffusers/diffusers-pytorch-cuda
-      options: --shm-size "20gb" --ipc host --gpus 0
-    steps:
-      - name: Checkout diffusers
-        uses: actions/checkout@v3
-        with:
-          fetch-depth: 2
-      - name: NVIDIA-SMI
-        run: nvidia-smi
-      - name: Install dependencies
-        run: |
-          python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
-          python -m uv pip install -e [quality,test]
-          python -m uv pip install -U bitsandbytes optimum_quanto
-          python -m uv pip install pytest-reportlog
-      - name: Environment
-        run: |
-          python utils/print_env.py
-      - name: Pipeline-level quantization tests on GPU
-        env:
-          HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
-          # https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
-          CUBLAS_WORKSPACE_CONFIG: :16:8
-          BIG_GPU_MEMORY: 40
-        run: |
-          python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
-            --make-reports=tests_pipeline_level_quant_torch_cuda \
-            --report-log=tests_pipeline_level_quant_torch_cuda.log \
-            tests/quantization/test_pipeline_level_quantization.py
-      - name: Failure short reports
-        if: ${{ failure() }}
-        run: |
-          cat reports/tests_pipeline_level_quant_torch_cuda_stats.txt
-          cat reports/tests_pipeline_level_quant_torch_cuda_failures_short.txt
-      - name: Test suite reports artifacts
-        if: ${{ always() }}
-        uses: actions/upload-artifact@v4
-        with:
-          name: torch_cuda_pipeline_level_quant_reports
-          path: reports
-      - name: Generate Report and Notify Channel
-        if: always()
-        run: |
-          pip install slack_sdk tabulate
-          python utils/log_reports.py >> $GITHUB_STEP_SUMMARY
-  
 # M1 runner currently not well supported
 # TODO: (Dhruv) add these back when we setup better testing for Apple Silicon
 #  run_nightly_tests_apple_m1:
--- a/.github/workflows/pr_style_bot.yml
+++ b/.github/workflows/pr_style_bot.yml
@@ -9,9 +9,119 @@ permissions:
  pull-requests: write

 jobs:
-  style:
-    uses: huggingface/huggingface_hub/.github/workflows/style-bot-action.yml@main
-    with:
-      python_quality_dependencies: "[quality]"
-    secrets:
-      bot_token: ${{ secrets.GITHUB_TOKEN }}
+  run-style-bot:
+    if: >
+      contains(github.event.comment.body, '@bot /style') &&
+      github.event.issue.pull_request != null
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Extract PR details
+        id: pr_info
+        uses: actions/github-script@v6
+        with:
+          script: |
+            const prNumber = context.payload.issue.number;
+            const { data: pr } = await github.rest.pulls.get({
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              pull_number: prNumber
+            });
+            
+            // We capture both the branch ref and the "full_name" of the head repo
+            // so that we can check out the correct repository & branch (including forks).
+            core.setOutput("prNumber", prNumber);
+            core.setOutput("headRef", pr.head.ref);
+            core.setOutput("headRepoFullName", pr.head.repo.full_name);
+
+      - name: Check out PR branch
+        uses: actions/checkout@v3
+        env: 
+          HEADREPOFULLNAME: ${{ steps.pr_info.outputs.headRepoFullName }}
+          HEADREF: ${{ steps.pr_info.outputs.headRef }}
+        with:
+          # Instead of checking out the base repo, use the contributor's repo name
+          repository: ${{ env.HEADREPOFULLNAME }}
+          ref: ${{ env.HEADREF }}
+          # You may need fetch-depth: 0 for being able to push
+          fetch-depth: 0
+          token: ${{ secrets.GITHUB_TOKEN }}
+      
+      - name: Debug
+        env: 
+          HEADREPOFULLNAME: ${{ steps.pr_info.outputs.headRepoFullName }}
+          HEADREF: ${{ steps.pr_info.outputs.headRef }}
+          PRNUMBER: ${{ steps.pr_info.outputs.prNumber }}
+        run: |
+          echo "PR number: ${{ env.PRNUMBER }}"
+          echo "Head Ref: ${{ env.HEADREF }}"
+          echo "Head Repo Full Name: ${{ env.HEADREPOFULLNAME }}"
+
+      - name: Set up Python
+        uses: actions/setup-python@v4
+
+      - name: Install dependencies
+        run: |
+          pip install .[quality]
+
+      - name: Download Makefile from main branch
+        run: |
+          curl -o main_Makefile https://raw.githubusercontent.com/huggingface/diffusers/main/Makefile
+        
+      - name: Compare Makefiles
+        run: |
+          if ! diff -q main_Makefile Makefile; then
+            echo "Error: The Makefile has changed. Please ensure it matches the main branch."
+            exit 1
+          fi
+          echo "No changes in Makefile. Proceeding..."
+          rm -rf main_Makefile
+
+      - name: Run make style and make quality
+        run: |
+          make style && make quality
+
+      - name: Commit and push changes
+        id: commit_and_push
+        env: 
+          HEADREPOFULLNAME: ${{ steps.pr_info.outputs.headRepoFullName }}
+          HEADREF: ${{ steps.pr_info.outputs.headRef }}
+          PRNUMBER: ${{ steps.pr_info.outputs.prNumber }}
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        run: |
+          echo "HEADREPOFULLNAME: ${{ env.HEADREPOFULLNAME }}, HEADREF: ${{ env.HEADREF }}"
+          # Configure git with the Actions bot user
+          git config user.name "github-actions[bot]"
+          git config user.email "github-actions[bot]@users.noreply.github.com"
+
+          # Make sure your 'origin' remote is set to the contributor's fork
+          git remote set-url origin "https://x-access-token:${GITHUB_TOKEN}@github.com/${{ env.HEADREPOFULLNAME }}.git"
+
+          # If there are changes after running style/quality, commit them
+          if [ -n "$(git status --porcelain)" ]; then
+            git add .
+            git commit -m "Apply style fixes"
+            # Push to the original contributor's forked branch
+            git push origin HEAD:${{ env.HEADREF }}
+            echo "changes_pushed=true" >> $GITHUB_OUTPUT
+          else
+            echo "No changes to commit."
+            echo "changes_pushed=false" >> $GITHUB_OUTPUT
+          fi
+
+      - name: Comment on PR with workflow run link
+        if: steps.commit_and_push.outputs.changes_pushed == 'true'
+        uses: actions/github-script@v6
+        with:
+          script: |
+            const prNumber = parseInt(process.env.prNumber, 10);
+            const runUrl = `${process.env.GITHUB_SERVER_URL}/${process.env.GITHUB_REPOSITORY}/actions/runs/${process.env.GITHUB_RUN_ID}`
+
+            await github.rest.issues.createComment({
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              issue_number: prNumber,
+              body: `Style fixes have been applied. [View the workflow run here](${runUrl}).`
+            });
+        env:
+          prNumber: ${{ steps.pr_info.outputs.prNumber }}
--- a/.github/workflows/pr_tests.yml
+++ b/.github/workflows/pr_tests.yml
@@ -3,6 +3,7 @@ name: Fast tests for PRs
 on:
  pull_request:
    branches: [main]
+    types: [synchronize]
    paths:
      - "src/diffusers/**.py"
      - "benchmarks/**.py"
@@ -11,7 +12,6 @@ on:
      - "tests/**.py"
      - ".github/**.yml"
      - "utils/**.py"
-      - "setup.py"
  push:
    branches:
      - ci-*
--- a/.github/workflows/pr_tests_gpu.yml
+++ b/.github/workflows/pr_tests_gpu.yml
@@ -1,296 +0,0 @@
-name: Fast GPU Tests on PR 
-
-on:
-  pull_request:
-    branches: main
-    paths:
-      - "src/diffusers/models/modeling_utils.py"
-      - "src/diffusers/models/model_loading_utils.py"
-      - "src/diffusers/pipelines/pipeline_utils.py"
-      - "src/diffusers/pipeline_loading_utils.py"
-      - "src/diffusers/loaders/lora_base.py"
-      - "src/diffusers/loaders/lora_pipeline.py"
-      - "src/diffusers/loaders/peft.py"
-      - "tests/pipelines/test_pipelines_common.py"
-      - "tests/models/test_modeling_common.py"
-  workflow_dispatch:
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
-  cancel-in-progress: true
-
-env:
-  DIFFUSERS_IS_CI: yes
-  OMP_NUM_THREADS: 8
-  MKL_NUM_THREADS: 8
-  HF_HUB_ENABLE_HF_TRANSFER: 1
-  PYTEST_TIMEOUT: 600
-  PIPELINE_USAGE_CUTOFF: 1000000000 # set high cutoff so that only always-test pipelines run
-
-jobs:
-  check_code_quality:
-    runs-on: ubuntu-22.04
-    steps:
-      - uses: actions/checkout@v3
-      - name: Set up Python
-        uses: actions/setup-python@v4
-        with:
-          python-version: "3.8"
-      - name: Install dependencies
-        run: |
-          python -m pip install --upgrade pip
-          pip install .[quality]
-      - name: Check quality
-        run: make quality
-      - name: Check if failure
-        if: ${{ failure() }}
-        run: |
-          echo "Quality check failed. Please ensure the right dependency versions are installed with 'pip install -e .[quality]' and run 'make style && make quality'" >> $GITHUB_STEP_SUMMARY
-
-  check_repository_consistency:
-    needs: check_code_quality
-    runs-on: ubuntu-22.04
-    steps:
-      - uses: actions/checkout@v3
-      - name: Set up Python
-        uses: actions/setup-python@v4
-        with:
-          python-version: "3.8"
-      - name: Install dependencies
-        run: |
-          python -m pip install --upgrade pip
-          pip install .[quality]
-      - name: Check repo consistency
-        run: |
-          python utils/check_copies.py
-          python utils/check_dummies.py
-          python utils/check_support_list.py
-          make deps_table_check_updated
-      - name: Check if failure
-        if: ${{ failure() }}
-        run: |
-          echo "Repo consistency check failed. Please ensure the right dependency versions are installed with 'pip install -e .[quality]' and run 'make fix-copies'" >> $GITHUB_STEP_SUMMARY
-  
-  setup_torch_cuda_pipeline_matrix:
-    needs: [check_code_quality, check_repository_consistency]
-    name: Setup Torch Pipelines CUDA Slow Tests Matrix
-    runs-on:
-      group: aws-general-8-plus
-    container:
-      image: diffusers/diffusers-pytorch-cpu
-    outputs:
-      pipeline_test_matrix: ${{ steps.fetch_pipeline_matrix.outputs.pipeline_test_matrix }}
-    steps:
-      - name: Checkout diffusers
-        uses: actions/checkout@v3
-        with:
-          fetch-depth: 2
-      - name: Install dependencies
-        run: |
-          python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
-          python -m uv pip install -e [quality,test]
-      - name: Environment
-        run: |
-          python utils/print_env.py
-      - name: Fetch Pipeline Matrix
-        id: fetch_pipeline_matrix
-        run: |
-          matrix=$(python utils/fetch_torch_cuda_pipeline_test_matrix.py)
-          echo $matrix
-          echo "pipeline_test_matrix=$matrix" >> $GITHUB_OUTPUT
-      - name: Pipeline Tests Artifacts
-        if: ${{ always() }}
-        uses: actions/upload-artifact@v4
-        with:
-          name: test-pipelines.json
-          path: reports
-
-  torch_pipelines_cuda_tests:
-    name: Torch Pipelines CUDA Tests
-    needs: setup_torch_cuda_pipeline_matrix
-    strategy:
-      fail-fast: false
-      max-parallel: 8
-      matrix:
-        module: ${{ fromJson(needs.setup_torch_cuda_pipeline_matrix.outputs.pipeline_test_matrix) }}
-    runs-on:
-      group: aws-g4dn-2xlarge
-    container:
-      image: diffusers/diffusers-pytorch-cuda
-      options: --shm-size "16gb" --ipc host --gpus 0
-    steps:
-      - name: Checkout diffusers
-        uses: actions/checkout@v3
-        with:
-          fetch-depth: 2
-
-      - name: NVIDIA-SMI
-        run: |
-          nvidia-smi
-      - name: Install dependencies
-        run: |
-          python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
-          python -m uv pip install -e [quality,test]
-          pip uninstall accelerate -y && python -m uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git
-          pip uninstall transformers -y && python -m uv pip install -U transformers@git+https://github.com/huggingface/transformers.git --no-deps
-
-      - name: Environment
-        run: |
-          python utils/print_env.py
-      - name: Extract tests
-        id: extract_tests
-        run: |
-          pattern=$(python utils/extract_tests_from_mixin.py --type pipeline)
-          echo "$pattern" > /tmp/test_pattern.txt
-          echo "pattern_file=/tmp/test_pattern.txt" >> $GITHUB_OUTPUT
-
-      - name: PyTorch CUDA checkpoint tests on Ubuntu
-        env:
-          HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
-          # https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
-          CUBLAS_WORKSPACE_CONFIG: :16:8
-        run: |
-          if [ "${{ matrix.module }}" = "ip_adapters" ]; then 
-              python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
-              -s -v -k "not Flax and not Onnx" \
-              --make-reports=tests_pipeline_${{ matrix.module }}_cuda \
-              tests/pipelines/${{ matrix.module }}
-          else 
-              pattern=$(cat ${{ steps.extract_tests.outputs.pattern_file }})
-              python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
-              -s -v -k "not Flax and not Onnx and $pattern" \
-              --make-reports=tests_pipeline_${{ matrix.module }}_cuda \
-              tests/pipelines/${{ matrix.module }}
-          fi 
-
-      - name: Failure short reports
-        if: ${{ failure() }}
-        run: |
-          cat reports/tests_pipeline_${{ matrix.module }}_cuda_stats.txt
-          cat reports/tests_pipeline_${{ matrix.module }}_cuda_failures_short.txt
-      - name: Test suite reports artifacts
-        if: ${{ always() }}
-        uses: actions/upload-artifact@v4
-        with:
-          name: pipeline_${{ matrix.module }}_test_reports
-          path: reports
-
-  torch_cuda_tests:
-    name: Torch CUDA Tests
-    needs: [check_code_quality, check_repository_consistency]
-    runs-on:
-      group: aws-g4dn-2xlarge
-    container:
-      image: diffusers/diffusers-pytorch-cuda
-      options: --shm-size "16gb" --ipc host --gpus 0
-    defaults:
-      run:
-        shell: bash
-    strategy:
-      fail-fast: false
-      max-parallel: 2
-      matrix:
-        module: [models, schedulers, lora, others]
-    steps:
-    - name: Checkout diffusers
-      uses: actions/checkout@v3
-      with:
-        fetch-depth: 2
-
-    - name: Install dependencies
-      run: |
-        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
-        python -m uv pip install -e [quality,test]
-        python -m uv pip install peft@git+https://github.com/huggingface/peft.git
-        pip uninstall accelerate -y && python -m uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git
-        pip uninstall transformers -y && python -m uv pip install -U transformers@git+https://github.com/huggingface/transformers.git --no-deps
-
-    - name: Environment
-      run: |
-        python utils/print_env.py
-
-    - name: Extract tests
-      id: extract_tests
-      run: |
-        pattern=$(python utils/extract_tests_from_mixin.py --type ${{ matrix.module }})
-        echo "$pattern" > /tmp/test_pattern.txt
-        echo "pattern_file=/tmp/test_pattern.txt" >> $GITHUB_OUTPUT
-
-    - name: Run PyTorch CUDA tests
-      env:
-        HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
-        # https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
-        CUBLAS_WORKSPACE_CONFIG: :16:8
-      run: |
-        pattern=$(cat ${{ steps.extract_tests.outputs.pattern_file }})
-        if [ -z "$pattern" ]; then
-          python -m pytest -n 1 -sv --max-worker-restart=0 --dist=loadfile -k "not Flax and not Onnx" tests/${{ matrix.module }} \
-          --make-reports=tests_torch_cuda_${{ matrix.module }}  
-        else
-          python -m pytest -n 1 -sv --max-worker-restart=0 --dist=loadfile -k "not Flax and not Onnx and $pattern" tests/${{ matrix.module }} \
-          --make-reports=tests_torch_cuda_${{ matrix.module }}  
-        fi
-
-    - name: Failure short reports
-      if: ${{ failure() }}
-      run: |
-        cat reports/tests_torch_cuda_${{ matrix.module }}_stats.txt
-        cat reports/tests_torch_cuda_${{ matrix.module }}_failures_short.txt
-
-    - name: Test suite reports artifacts
-      if: ${{ always() }}
-      uses: actions/upload-artifact@v4
-      with:
-        name: torch_cuda_test_reports_${{ matrix.module }}
-        path: reports
-
-  run_examples_tests:
-    name: Examples PyTorch CUDA tests on Ubuntu
-    needs: [check_code_quality, check_repository_consistency]
-    runs-on:
-      group: aws-g4dn-2xlarge
-
-    container:
-      image: diffusers/diffusers-pytorch-cuda
-      options: --gpus 0 --shm-size "16gb" --ipc host
-    steps:
-    - name: Checkout diffusers
-      uses: actions/checkout@v3
-      with:
-        fetch-depth: 2
-
-    - name: NVIDIA-SMI
-      run: |
-        nvidia-smi
-    - name: Install dependencies
-      run: |
-        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
-        pip uninstall transformers -y && python -m uv pip install -U transformers@git+https://github.com/huggingface/transformers.git --no-deps
-        python -m uv pip install -e [quality,test,training]
-
-    - name: Environment
-      run: |
-        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
-        python utils/print_env.py
-
-    - name: Run example tests on GPU
-      env:
-        HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
-      run: |
-        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
-        python -m uv pip install timm
-        python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile -s -v --make-reports=examples_torch_cuda examples/
-
-    - name: Failure short reports
-      if: ${{ failure() }}
-      run: |
-        cat reports/examples_torch_cuda_stats.txt
-        cat reports/examples_torch_cuda_failures_short.txt
-
-    - name: Test suite reports artifacts
-      if: ${{ always() }}
-      uses: actions/upload-artifact@v4
-      with:
-        name: examples_test_reports
-        path: reports
-
--- a/.github/workflows/push_tests.yml
+++ b/.github/workflows/push_tests.yml
@@ -1,6 +1,13 @@
 name: Fast GPU Tests on main

 on:
+  pull_request:
+    branches: main
+    paths:
+      - "src/diffusers/models/modeling_utils.py"
+      - "src/diffusers/models/model_loading_utils.py"
+      - "src/diffusers/pipelines/pipeline_utils.py"
+      - "src/diffusers/pipeline_loading_utils.py"
  workflow_dispatch:
  push:
    branches:
@@ -160,6 +167,7 @@ jobs:
        path: reports

  flax_tpu_tests:
+    if: ${{ github.event_name != 'pull_request' }}
    name: Flax TPU Tests
    runs-on:
      group: gcp-ct5lp-hightpu-8t
@@ -208,6 +216,7 @@ jobs:
        path: reports

  onnx_cuda_tests:
+    if: ${{ github.event_name != 'pull_request' }}
    name: ONNX CUDA Tests
    runs-on:
      group: aws-g4dn-2xlarge
@@ -256,6 +265,7 @@ jobs:
        path: reports

  run_torch_compile_tests:
+    if: ${{ github.event_name != 'pull_request' }}
    name: PyTorch Compile CUDA tests

    runs-on:
@@ -299,6 +309,7 @@ jobs:
        path: reports

  run_xformers_tests:
+    if: ${{ github.event_name != 'pull_request' }}
    name: PyTorch xformers CUDA tests

    runs-on:
--- a/.github/workflows/release_tests_fast.yml
+++ b/.github/workflows/release_tests_fast.yml
@@ -335,7 +335,7 @@ jobs:
    - name: Environment
      run: |
        python utils/print_env.py
-    - name: Run torch compile tests on GPU
+    - name: Run example tests on GPU
      env:
        HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
        RUN_COMPILE: yes
--- a/docker/diffusers-onnxruntime-cpu/Dockerfile
+++ b/docker/diffusers-onnxruntime-cpu/Dockerfile
@@ -28,9 +28,9 @@ ENV PATH="/opt/venv/bin:$PATH"
 # pre-install the heavy dependencies (these can later be overridden by the deps from setup.py)
 RUN python3 -m pip install --no-cache-dir --upgrade pip uv==0.1.11 && \
    python3 -m uv pip install --no-cache-dir \
-        torch \
-        torchvision \
-        torchaudio\
+        torch==2.1.2 \
+        torchvision==0.16.2 \
+        torchaudio==2.1.2 \
        onnxruntime \
        --extra-index-url https://download.pytorch.org/whl/cpu && \
    python3 -m uv pip install --no-cache-dir \
--- a/docs/source/en/_toctree.yml
+++ b/docs/source/en/_toctree.yml
@@ -17,8 +17,12 @@
    title: AutoPipeline
  - local: tutorials/basic_training
    title: Train a diffusion model
+  - local: tutorials/using_peft_for_inference
+    title: Load LoRAs for inference
  - local: tutorials/fast_diffusion
    title: Accelerate inference of text-to-image diffusion models
+  - local: tutorials/inference_with_big_models
+    title: Working with big models
  title: Tutorials
 - sections:
  - local: using-diffusers/loading
@@ -29,24 +33,11 @@
    title: Load schedulers and models
  - local: using-diffusers/other-formats
    title: Model files and layouts
+  - local: using-diffusers/loading_adapters
+    title: Load adapters
  - local: using-diffusers/push_to_hub
    title: Push files to the Hub
  title: Load pipelines and adapters
- sections:
-  - local: tutorials/using_peft_for_inference
-    title: LoRA
-  - local: using-diffusers/ip_adapter
-    title: IP-Adapter
-  - local: using-diffusers/controlnet
-    title: ControlNet
-  - local: using-diffusers/t2i_adapter
-    title: T2I-Adapter
-  - local: using-diffusers/dreambooth
-    title: DreamBooth
-  - local: using-diffusers/textual_inversion_inference
-    title: Textual inversion
-  title: Adapters
-  isExpanded: false
 - sections:
  - local: using-diffusers/unconditional_image_generation
    title: Unconditional image generation
@@ -68,6 +59,8 @@
    title: Create a server
  - local: training/distributed_inference
    title: Distributed inference
+  - local: using-diffusers/merge_loras
+    title: Merge LoRAs
  - local: using-diffusers/scheduler_features
    title: Scheduler features
  - local: using-diffusers/callback
@@ -83,16 +76,6 @@
  - local: advanced_inference/outpaint
    title: Outpainting
  title: Advanced inference
- sections:
-  - local: hybrid_inference/overview
-    title: Overview
-  - local: hybrid_inference/vae_decode
-    title: VAE Decode
-  - local: hybrid_inference/vae_encode
-    title: VAE Encode
-  - local: hybrid_inference/api_reference
-    title: API Reference
-  title: Hybrid Inference
 - sections:
  - local: using-diffusers/cogvideox
    title: CogVideoX
@@ -104,12 +87,20 @@
    title: SDXL Turbo
  - local: using-diffusers/kandinsky
    title: Kandinsky
+  - local: using-diffusers/ip_adapter
+    title: IP-Adapter
  - local: using-diffusers/omnigen
    title: OmniGen
  - local: using-diffusers/pag
    title: PAG
+  - local: using-diffusers/controlnet
+    title: ControlNet
+  - local: using-diffusers/t2i_adapter
+    title: T2I-Adapter
  - local: using-diffusers/inference_with_lcm
    title: Latent Consistency Model
+  - local: using-diffusers/textual_inversion_inference
+    title: Textual inversion
  - local: using-diffusers/shap-e
    title: Shap-E
  - local: using-diffusers/diffedit
@@ -174,12 +165,10 @@
    title: gguf
  - local: quantization/torchao
    title: torchao
-  - local: quantization/quanto
-    title: quanto
  title: Quantization Methods
 - sections:
  - local: optimization/fp16
-    title: Accelerate inference
+    title: Speed up inference
  - local: optimization/memory
    title: Reduce memory usage
  - local: optimization/torch2.0
@@ -264,23 +253,19 @@
    sections:
    - local: api/models/overview
      title: Overview
-    - local: api/models/auto_model
-      title: AutoModel
    - sections:
      - local: api/models/controlnet
        title: ControlNetModel
-      - local: api/models/controlnet_union
-        title: ControlNetUnionModel
      - local: api/models/controlnet_flux
        title: FluxControlNetModel
      - local: api/models/controlnet_hunyuandit
        title: HunyuanDiT2DControlNetModel
-      - local: api/models/controlnet_sana
-        title: SanaControlNetModel
      - local: api/models/controlnet_sd3
        title: SD3ControlNetModel
      - local: api/models/controlnet_sparsectrl
        title: SparseControlNetModel
+      - local: api/models/controlnet_union
+        title: ControlNetUnionModel
      title: ControlNets
    - sections:
      - local: api/models/allegro_transformer3d
@@ -289,34 +274,28 @@
        title: AuraFlowTransformer2DModel
      - local: api/models/cogvideox_transformer3d
        title: CogVideoXTransformer3DModel
+      - local: api/models/consisid_transformer3d
+        title: ConsisIDTransformer3DModel
      - local: api/models/cogview3plus_transformer2d
        title: CogView3PlusTransformer2DModel
      - local: api/models/cogview4_transformer2d
        title: CogView4Transformer2DModel
-      - local: api/models/consisid_transformer3d
-        title: ConsisIDTransformer3DModel
-      - local: api/models/cosmos_transformer3d
-        title: CosmosTransformer3DModel
      - local: api/models/dit_transformer2d
        title: DiTTransformer2DModel
-      - local: api/models/easyanimate_transformer3d
-        title: EasyAnimateTransformer3DModel
      - local: api/models/flux_transformer
        title: FluxTransformer2DModel
-      - local: api/models/hidream_image_transformer
-        title: HiDreamImageTransformer2DModel
      - local: api/models/hunyuan_transformer2d
        title: HunyuanDiT2DModel
      - local: api/models/hunyuan_video_transformer_3d
        title: HunyuanVideoTransformer3DModel
      - local: api/models/latte_transformer3d
        title: LatteTransformer3DModel
-      - local: api/models/ltx_video_transformer3d
-        title: LTXVideoTransformer3DModel
-      - local: api/models/lumina2_transformer2d
-        title: Lumina2Transformer2DModel
      - local: api/models/lumina_nextdit2d
        title: LuminaNextDiT2DModel
+      - local: api/models/lumina2_transformer2d
+        title: Lumina2Transformer2DModel
+      - local: api/models/ltx_video_transformer3d
+        title: LTXVideoTransformer3DModel
      - local: api/models/mochi_transformer3d
        title: MochiTransformer3DModel
      - local: api/models/omnigen_transformer
@@ -325,28 +304,26 @@
        title: PixArtTransformer2DModel
      - local: api/models/prior_transformer
        title: PriorTransformer
-      - local: api/models/sana_transformer2d
-        title: SanaTransformer2DModel
      - local: api/models/sd3_transformer2d
        title: SD3Transformer2DModel
+      - local: api/models/sana_transformer2d
+        title: SanaTransformer2DModel
      - local: api/models/stable_audio_transformer
        title: StableAudioDiTModel
      - local: api/models/transformer2d
        title: Transformer2DModel
      - local: api/models/transformer_temporal
        title: TransformerTemporalModel
-      - local: api/models/wan_transformer_3d
-        title: WanTransformer3DModel
      title: Transformers
    - sections:
      - local: api/models/stable_cascade_unet
        title: StableCascadeUNet
      - local: api/models/unet
        title: UNet1DModel
-      - local: api/models/unet2d-cond
-        title: UNet2DConditionModel
      - local: api/models/unet2d
        title: UNet2DModel
+      - local: api/models/unet2d-cond
+        title: UNet2DConditionModel
      - local: api/models/unet3d-cond
        title: UNet3DConditionModel
      - local: api/models/unet-motion
@@ -355,28 +332,22 @@
        title: UViT2DModel
      title: UNets
    - sections:
-      - local: api/models/asymmetricautoencoderkl
-        title: AsymmetricAutoencoderKL
-      - local: api/models/autoencoder_dc
-        title: AutoencoderDC
      - local: api/models/autoencoderkl
        title: AutoencoderKL
      - local: api/models/autoencoderkl_allegro
        title: AutoencoderKLAllegro
      - local: api/models/autoencoderkl_cogvideox
        title: AutoencoderKLCogVideoX
-      - local: api/models/autoencoderkl_cosmos
-        title: AutoencoderKLCosmos
      - local: api/models/autoencoder_kl_hunyuan_video
        title: AutoencoderKLHunyuanVideo
      - local: api/models/autoencoderkl_ltx_video
        title: AutoencoderKLLTXVideo
-      - local: api/models/autoencoderkl_magvit
-        title: AutoencoderKLMagvit
      - local: api/models/autoencoderkl_mochi
        title: AutoencoderKLMochi
-      - local: api/models/autoencoder_kl_wan
-        title: AutoencoderKLWan
+      - local: api/models/asymmetricautoencoderkl
+        title: AsymmetricAutoencoderKL
+      - local: api/models/autoencoder_dc
+        title: AutoencoderDC
      - local: api/models/consistency_decoder_vae
        title: ConsistencyDecoderVAE
      - local: api/models/autoencoder_oobleck
@@ -429,16 +400,12 @@
      title: ControlNet with Stable Diffusion 3
    - local: api/pipelines/controlnet_sdxl
      title: ControlNet with Stable Diffusion XL
-    - local: api/pipelines/controlnet_sana
-      title: ControlNet-Sana
    - local: api/pipelines/controlnetxs
      title: ControlNet-XS
    - local: api/pipelines/controlnetxs_sdxl
      title: ControlNet-XS with Stable Diffusion XL
    - local: api/pipelines/controlnet_union
      title: ControlNetUnion
-    - local: api/pipelines/cosmos
-      title: Cosmos
    - local: api/pipelines/dance_diffusion
      title: Dance Diffusion
    - local: api/pipelines/ddim
@@ -451,16 +418,10 @@
      title: DiffEdit
    - local: api/pipelines/dit
      title: DiT
-    - local: api/pipelines/easyanimate
-      title: EasyAnimate
    - local: api/pipelines/flux
      title: Flux
    - local: api/pipelines/control_flux_inpaint
      title: FluxControlInpaint
-    - local: api/pipelines/framepack
-      title: Framepack
-    - local: api/pipelines/hidream
-      title: HiDream-I1
    - local: api/pipelines/hunyuandit
      title: Hunyuan-DiT
    - local: api/pipelines/hunyuan_video
@@ -513,8 +474,6 @@
      title: PixArt-Σ
    - local: api/pipelines/sana
      title: Sana
-    - local: api/pipelines/sana_sprint
-      title: Sana Sprint
    - local: api/pipelines/self_attention_guidance
      title: Self-Attention Guidance
    - local: api/pipelines/semantic_stable_diffusion
@@ -528,40 +487,40 @@
    - sections:
      - local: api/pipelines/stable_diffusion/overview
        title: Overview
-      - local: api/pipelines/stable_diffusion/depth2img
-        title: Depth-to-image
-      - local: api/pipelines/stable_diffusion/gligen
-        title: GLIGEN (Grounded Language-to-Image Generation)
-      - local: api/pipelines/stable_diffusion/image_variation
-        title: Image variation
+      - local: api/pipelines/stable_diffusion/text2img
+        title: Text-to-image
      - local: api/pipelines/stable_diffusion/img2img
        title: Image-to-image
      - local: api/pipelines/stable_diffusion/svd
        title: Image-to-video
      - local: api/pipelines/stable_diffusion/inpaint
        title: Inpainting
-      - local: api/pipelines/stable_diffusion/k_diffusion
-        title: K-Diffusion
-      - local: api/pipelines/stable_diffusion/latent_upscale
-        title: Latent upscaler
-      - local: api/pipelines/stable_diffusion/ldm3d_diffusion
-        title: LDM3D Text-to-(RGB, Depth), Text-to-(RGB-pano, Depth-pano), LDM3D Upscaler
+      - local: api/pipelines/stable_diffusion/depth2img
+        title: Depth-to-image
+      - local: api/pipelines/stable_diffusion/image_variation
+        title: Image variation
      - local: api/pipelines/stable_diffusion/stable_diffusion_safe
        title: Safe Stable Diffusion
-      - local: api/pipelines/stable_diffusion/sdxl_turbo
-        title: SDXL Turbo
      - local: api/pipelines/stable_diffusion/stable_diffusion_2
        title: Stable Diffusion 2
      - local: api/pipelines/stable_diffusion/stable_diffusion_3
        title: Stable Diffusion 3
      - local: api/pipelines/stable_diffusion/stable_diffusion_xl
        title: Stable Diffusion XL
+      - local: api/pipelines/stable_diffusion/sdxl_turbo
+        title: SDXL Turbo
+      - local: api/pipelines/stable_diffusion/latent_upscale
+        title: Latent upscaler
      - local: api/pipelines/stable_diffusion/upscale
        title: Super-resolution
+      - local: api/pipelines/stable_diffusion/k_diffusion
+        title: K-Diffusion
+      - local: api/pipelines/stable_diffusion/ldm3d_diffusion
+        title: LDM3D Text-to-(RGB, Depth), Text-to-(RGB-pano, Depth-pano), LDM3D Upscaler
      - local: api/pipelines/stable_diffusion/adapter
        title: T2I-Adapter
-      - local: api/pipelines/stable_diffusion/text2img
-        title: Text-to-image
+      - local: api/pipelines/stable_diffusion/gligen
+        title: GLIGEN (Grounded Language-to-Image Generation)
      title: Stable Diffusion
    - local: api/pipelines/stable_unclip
      title: Stable unCLIP
@@ -575,10 +534,6 @@
      title: UniDiffuser
    - local: api/pipelines/value_guided_sampling
      title: Value-guided sampling
-    - local: api/pipelines/visualcloze
-      title: VisualCloze
-    - local: api/pipelines/wan
-      title: Wan
    - local: api/pipelines/wuerstchen
      title: Wuerstchen
    title: Pipelines
@@ -588,10 +543,6 @@
      title: Overview
    - local: api/schedulers/cm_stochastic_iterative
      title: CMStochasticIterativeScheduler
-    - local: api/schedulers/ddim_cogvideox
-      title: CogVideoXDDIMScheduler
-    - local: api/schedulers/multistep_dpm_solver_cogvideox
-      title: CogVideoXDPMScheduler
    - local: api/schedulers/consistency_decoder
      title: ConsistencyDecoderScheduler
    - local: api/schedulers/cosine_dpm
--- a/docs/source/en/api/cache.md
+++ b/docs/source/en/api/cache.md
@@ -38,33 +38,6 @@ config = PyramidAttentionBroadcastConfig(
 pipe.transformer.enable_cache(config)
 ```

-## Faster Cache
-
-[FasterCache](https://huggingface.co/papers/2410.19355) from Zhengyao Lv, Chenyang Si, Junhao Song, Zhenyu Yang, Yu Qiao, Ziwei Liu, Kwan-Yee K. Wong.
-
-FasterCache is a method that speeds up inference in diffusion transformers by:
- Reusing attention states between successive inference steps, due to high similarity between them
- Skipping unconditional branch prediction used in classifier-free guidance by revealing redundancies between unconditional and conditional branch outputs for the same timestep, and therefore approximating the unconditional branch output using the conditional branch output
-
-```python
-import torch
-from diffusers import CogVideoXPipeline, FasterCacheConfig
-
-pipe = CogVideoXPipeline.from_pretrained("THUDM/CogVideoX-5b", torch_dtype=torch.bfloat16)
-pipe.to("cuda")
-
-config = FasterCacheConfig(
-    spatial_attention_block_skip_range=2,
-    spatial_attention_timestep_skip_range=(-1, 681),
-    current_timestep_callback=lambda: pipe.current_timestep,
-    attention_weight_callback=lambda _: 0.3,
-    unconditional_batch_skip_range=5,
-    unconditional_batch_timestep_skip_range=(-1, 781),
-    tensor_format="BFCHW",
-)
-pipe.transformer.enable_cache(config)
-```
-
 ### CacheMixin

 [[autodoc]] CacheMixin
@@ -74,9 +47,3 @@ pipe.transformer.enable_cache(config)
 [[autodoc]] PyramidAttentionBroadcastConfig

 [[autodoc]] apply_pyramid_attention_broadcast
-
-### FasterCacheConfig
-
-[[autodoc]] FasterCacheConfig
-
-[[autodoc]] apply_faster_cache
--- a/docs/source/en/api/loaders/lora.md
+++ b/docs/source/en/api/loaders/lora.md
@@ -20,15 +20,11 @@ LoRA is a fast and lightweight training method that inserts and trains a signifi
 - [`FluxLoraLoaderMixin`] provides similar functions for [Flux](https://huggingface.co/docs/diffusers/main/en/api/pipelines/flux).
 - [`CogVideoXLoraLoaderMixin`] provides similar functions for [CogVideoX](https://huggingface.co/docs/diffusers/main/en/api/pipelines/cogvideox).
 - [`Mochi1LoraLoaderMixin`] provides similar functions for [Mochi](https://huggingface.co/docs/diffusers/main/en/api/pipelines/mochi).
- [`AuraFlowLoraLoaderMixin`] provides similar functions for [AuraFlow](https://huggingface.co/fal/AuraFlow).
 - [`LTXVideoLoraLoaderMixin`] provides similar functions for [LTX-Video](https://huggingface.co/docs/diffusers/main/en/api/pipelines/ltx_video).
 - [`SanaLoraLoaderMixin`] provides similar functions for [Sana](https://huggingface.co/docs/diffusers/main/en/api/pipelines/sana).
 - [`HunyuanVideoLoraLoaderMixin`] provides similar functions for [HunyuanVideo](https://huggingface.co/docs/diffusers/main/en/api/pipelines/hunyuan_video).
 - [`Lumina2LoraLoaderMixin`] provides similar functions for [Lumina2](https://huggingface.co/docs/diffusers/main/en/api/pipelines/lumina2).
- [`WanLoraLoaderMixin`] provides similar functions for [Wan](https://huggingface.co/docs/diffusers/main/en/api/pipelines/wan).
- [`CogView4LoraLoaderMixin`] provides similar functions for [CogView4](https://huggingface.co/docs/diffusers/main/en/api/pipelines/cogview4).
 - [`AmusedLoraLoaderMixin`] is for the [`AmusedPipeline`].
- [`HiDreamImageLoraLoaderMixin`] provides similar functions for [HiDream Image](https://huggingface.co/docs/diffusers/main/en/api/pipelines/hidream)
 - [`LoraBaseMixin`] provides a base class with several utility methods to fuse, unfuse, unload, LoRAs and more.

 <Tip>
@@ -60,9 +56,6 @@ To learn more about how to load LoRA weights, see the [LoRA](../../using-diffuse
 ## Mochi1LoraLoaderMixin

 [[autodoc]] loaders.lora_pipeline.Mochi1LoraLoaderMixin
-## AuraFlowLoraLoaderMixin
-
-[[autodoc]] loaders.lora_pipeline.AuraFlowLoraLoaderMixin

 ## LTXVideoLoraLoaderMixin

@@ -80,22 +73,10 @@ To learn more about how to load LoRA weights, see the [LoRA](../../using-diffuse

 [[autodoc]] loaders.lora_pipeline.Lumina2LoraLoaderMixin

-## CogView4LoraLoaderMixin
-
-[[autodoc]] loaders.lora_pipeline.CogView4LoraLoaderMixin
-
-## WanLoraLoaderMixin
-
-[[autodoc]] loaders.lora_pipeline.WanLoraLoaderMixin
-
 ## AmusedLoraLoaderMixin

 [[autodoc]] loaders.lora_pipeline.AmusedLoraLoaderMixin

-## HiDreamImageLoraLoaderMixin
-
-[[autodoc]] loaders.lora_pipeline.HiDreamImageLoraLoaderMixin
-
 ## LoraBaseMixin

 [[autodoc]] loaders.lora_base.LoraBaseMixin
--- a/docs/source/en/api/models/auto_model.md
+++ b/docs/source/en/api/models/auto_model.md
@@ -1,29 +0,0 @@
-<!--Copyright 2024 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-->
-
-# AutoModel
-
-The `AutoModel` is designed to make it easy to load a checkpoint without needing to know the specific model class. `AutoModel` automatically retrieves the correct model class from the checkpoint `config.json` file.
-
-```python
-from diffusers import AutoModel, AutoPipelineForText2Image
-
-unet = AutoModel.from_pretrained("stable-diffusion-v1-5/stable-diffusion-v1-5", subfolder="unet")
-pipe = AutoPipelineForText2Image.from_pretrained("stable-diffusion-v1-5/stable-diffusion-v1-5", unet=unet)
-```
-
-
-## AutoModel
-
-[[autodoc]] AutoModel
-	- all
-	- from_pretrained
--- a/docs/source/en/api/models/autoencoder_kl_wan.md
+++ b/docs/source/en/api/models/autoencoder_kl_wan.md
@@ -1,32 +0,0 @@
-<!-- Copyright 2024 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License. -->
-
-# AutoencoderKLWan
-
-The 3D variational autoencoder (VAE) model with KL loss used in [Wan 2.1](https://github.com/Wan-Video/Wan2.1) by the Alibaba Wan Team.
-
-The model can be loaded with the following code snippet.
-
-```python
-from diffusers import AutoencoderKLWan
-
-vae = AutoencoderKLWan.from_pretrained("Wan-AI/Wan2.1-T2V-1.3B-Diffusers", subfolder="vae", torch_dtype=torch.float32)
-```
-
-## AutoencoderKLWan
-
-[[autodoc]] AutoencoderKLWan
-  - decode
-  - all
-
-## DecoderOutput
-
-[[autodoc]] models.autoencoders.vae.DecoderOutput
--- a/docs/source/en/api/models/autoencoderkl_allegro.md
+++ b/docs/source/en/api/models/autoencoderkl_allegro.md
@@ -18,7 +18,7 @@ The model can be loaded with the following code snippet.
 ```python
 from diffusers import AutoencoderKLAllegro

-vae = AutoencoderKLAllegro.from_pretrained("rhymes-ai/Allegro", subfolder="vae", torch_dtype=torch.float32).to("cuda")
+vae = AutoencoderKLAllegro.from_pretrained("rhymes-ai/Allegro", subfolder="vae", torch_dtype=torch.float32).to("cuda")
 ```

 ## AutoencoderKLAllegro
--- a/docs/source/en/api/models/autoencoderkl_cosmos.md
+++ b/docs/source/en/api/models/autoencoderkl_cosmos.md
@@ -1,40 +0,0 @@
-<!-- Copyright 2024 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License. -->
-
-# AutoencoderKLCosmos
-
-[Cosmos Tokenizers](https://github.com/NVIDIA/Cosmos-Tokenizer).
-
-Supported models:
- [nvidia/Cosmos-1.0-Tokenizer-CV8x8x8](https://huggingface.co/nvidia/Cosmos-1.0-Tokenizer-CV8x8x8)
-
-The model can be loaded with the following code snippet.
-
-```python
-from diffusers import AutoencoderKLCosmos
-
-vae = AutoencoderKLCosmos.from_pretrained("nvidia/Cosmos-1.0-Tokenizer-CV8x8x8", subfolder="vae")
-```
-
-## AutoencoderKLCosmos
-
-[[autodoc]] AutoencoderKLCosmos
-    - decode
-    - encode
-    - all
-
-## AutoencoderKLOutput
-
-[[autodoc]] models.autoencoders.autoencoder_kl.AutoencoderKLOutput
-
-## DecoderOutput
-
-[[autodoc]] models.autoencoders.vae.DecoderOutput
--- a/docs/source/en/api/models/autoencoderkl_magvit.md
+++ b/docs/source/en/api/models/autoencoderkl_magvit.md
@@ -1,37 +0,0 @@
-<!--Copyright 2025 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License. -->
-
-# AutoencoderKLMagvit
-
-The 3D variational autoencoder (VAE) model with KL loss used in [EasyAnimate](https://github.com/aigc-apps/EasyAnimate) was introduced by Alibaba PAI.
-
-The model can be loaded with the following code snippet.
-
-```python
-from diffusers import AutoencoderKLMagvit
-
-vae = AutoencoderKLMagvit.from_pretrained("alibaba-pai/EasyAnimateV5.1-12b-zh", subfolder="vae", torch_dtype=torch.float16).to("cuda")
-```
-
-## AutoencoderKLMagvit
-
-[[autodoc]] AutoencoderKLMagvit
-    - decode
-    - encode
-    - all
-
-## AutoencoderKLOutput
-
-[[autodoc]] models.autoencoders.autoencoder_kl.AutoencoderKLOutput
-
-## DecoderOutput
-
-[[autodoc]] models.autoencoders.vae.DecoderOutput
--- a/docs/source/en/api/models/controlnet_sana.md
+++ b/docs/source/en/api/models/controlnet_sana.md
@@ -1,29 +0,0 @@
-<!--Copyright 2024 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-->
-
-# SanaControlNetModel
-
-The ControlNet model was introduced in [Adding Conditional Control to Text-to-Image Diffusion Models](https://huggingface.co/papers/2302.05543) by Lvmin Zhang, Anyi Rao, Maneesh Agrawala. It provides a greater degree of control over text-to-image generation by conditioning the model on additional inputs such as edge maps, depth maps, segmentation maps, and keypoints for pose detection.
-
-The abstract from the paper is:
-
-*We present ControlNet, a neural network architecture to add spatial conditioning controls to large, pretrained text-to-image diffusion models. ControlNet locks the production-ready large diffusion models, and reuses their deep and robust encoding layers pretrained with billions of images as a strong backbone to learn a diverse set of conditional controls. The neural architecture is connected with "zero convolutions" (zero-initialized convolution layers) that progressively grow the parameters from zero and ensure that no harmful noise could affect the finetuning. We test various conditioning controls, eg, edges, depth, segmentation, human pose, etc, with Stable Diffusion, using single or multiple conditions, with or without prompts. We show that the training of ControlNets is robust with small (<50k) and large (>1m) datasets. Extensive results show that ControlNet may facilitate wider applications to control image diffusion models.*
-
-This model was contributed by [ishan24](https://huggingface.co/ishan24). ❤️
-The original codebase can be found at [NVlabs/Sana](https://github.com/NVlabs/Sana), and you can find official ControlNet checkpoints on [Efficient-Large-Model's](https://huggingface.co/Efficient-Large-Model) Hub profile.
-
-## SanaControlNetModel
-[[autodoc]] SanaControlNetModel
-
-## SanaControlNetOutput
-[[autodoc]] models.controlnets.controlnet_sana.SanaControlNetOutput
-
--- a/docs/source/en/api/models/cosmos_transformer3d.md
+++ b/docs/source/en/api/models/cosmos_transformer3d.md
@@ -1,30 +0,0 @@
-<!-- Copyright 2024 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License. -->
-
-# CosmosTransformer3DModel
-
-A Diffusion Transformer model for 3D video-like data was introduced in [Cosmos World Foundation Model Platform for Physical AI](https://huggingface.co/papers/2501.03575) by NVIDIA.
-
-The model can be loaded with the following code snippet.
-
-```python
-from diffusers import CosmosTransformer3DModel
-
-transformer = CosmosTransformer3DModel.from_pretrained("nvidia/Cosmos-1.0-Diffusion-7B-Text2World", subfolder="transformer", torch_dtype=torch.bfloat16)
-```
-
-## CosmosTransformer3DModel
-
-[[autodoc]] CosmosTransformer3DModel
-
-## Transformer2DModelOutput
-
-[[autodoc]] models.modeling_outputs.Transformer2DModelOutput
--- a/docs/source/en/api/models/easyanimate_transformer3d.md
+++ b/docs/source/en/api/models/easyanimate_transformer3d.md
@@ -1,30 +0,0 @@
-<!--Copyright 2025 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License. -->
-
-# EasyAnimateTransformer3DModel
-
-A Diffusion Transformer model for 3D data from [EasyAnimate](https://github.com/aigc-apps/EasyAnimate) was introduced by Alibaba PAI.
-
-The model can be loaded with the following code snippet.
-
-```python
-from diffusers import EasyAnimateTransformer3DModel
-
-transformer = EasyAnimateTransformer3DModel.from_pretrained("alibaba-pai/EasyAnimateV5.1-12b-zh", subfolder="transformer", torch_dtype=torch.float16).to("cuda")
-```
-
-## EasyAnimateTransformer3DModel
-
-[[autodoc]] EasyAnimateTransformer3DModel
-
-## Transformer2DModelOutput
-
-[[autodoc]] models.modeling_outputs.Transformer2DModelOutput
--- a/docs/source/en/api/models/hidream_image_transformer.md
+++ b/docs/source/en/api/models/hidream_image_transformer.md
@@ -1,46 +0,0 @@
-<!-- Copyright 2024 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License. -->
-
-# HiDreamImageTransformer2DModel
-
-A Transformer model for image-like data from [HiDream-I1](https://huggingface.co/HiDream-ai).
-
-The model can be loaded with the following code snippet.
-
-```python
-from diffusers import HiDreamImageTransformer2DModel
-
-transformer = HiDreamImageTransformer2DModel.from_pretrained("HiDream-ai/HiDream-I1-Full", subfolder="transformer", torch_dtype=torch.bfloat16)
-```
-
-## Loading GGUF quantized checkpoints for HiDream-I1
-
-GGUF checkpoints for the `HiDreamImageTransformer2DModel` can  be loaded using `~FromOriginalModelMixin.from_single_file`
-
-```python
-import torch
-from diffusers import GGUFQuantizationConfig, HiDreamImageTransformer2DModel
-
-ckpt_path = "https://huggingface.co/city96/HiDream-I1-Dev-gguf/blob/main/hidream-i1-dev-Q2_K.gguf"
-transformer = HiDreamImageTransformer2DModel.from_single_file(
-    ckpt_path,
-    quantization_config=GGUFQuantizationConfig(compute_dtype=torch.bfloat16),
-    torch_dtype=torch.bfloat16
-)
-```
-
-## HiDreamImageTransformer2DModel
-
-[[autodoc]] HiDreamImageTransformer2DModel
-
-## Transformer2DModelOutput
-
-[[autodoc]] models.modeling_outputs.Transformer2DModelOutput
--- a/docs/source/en/api/models/wan_transformer_3d.md
+++ b/docs/source/en/api/models/wan_transformer_3d.md
@@ -1,30 +0,0 @@
-<!-- Copyright 2024 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License. -->
-
-# WanTransformer3DModel
-
-A Diffusion Transformer model for 3D video-like data was introduced in [Wan 2.1](https://github.com/Wan-Video/Wan2.1) by the Alibaba Wan Team.
-
-The model can be loaded with the following code snippet.
-
-```python
-from diffusers import WanTransformer3DModel
-
-transformer = WanTransformer3DModel.from_pretrained("Wan-AI/Wan2.1-T2V-1.3B-Diffusers", subfolder="transformer", torch_dtype=torch.bfloat16)
-```
-
-## WanTransformer3DModel
-
-[[autodoc]] WanTransformer3DModel
-
-## Transformer2DModelOutput
-
-[[autodoc]] models.modeling_outputs.Transformer2DModelOutput
--- a/docs/source/en/api/pipelines/animatediff.md
+++ b/docs/source/en/api/pipelines/animatediff.md
@@ -12,10 +12,6 @@ specific language governing permissions and limitations under the License.

 # Text-to-Video Generation with AnimateDiff

-<div class="flex flex-wrap space-x-1">
-  <img alt="LoRA" src="https://img.shields.io/badge/LoRA-d8b4fe?style=flat"/>
-</div>
-
 ## Overview

 [AnimateDiff: Animate Your Personalized Text-to-Image Diffusion Models without Specific Tuning](https://arxiv.org/abs/2307.04725) by Yuwei Guo, Ceyuan Yang, Anyi Rao, Yaohui Wang, Yu Qiao, Dahua Lin, Bo Dai.
@@ -966,7 +962,7 @@ pipe.to("cuda")
 prompt = {
    0: "A caterpillar on a leaf, high quality, photorealistic",
    40: "A caterpillar transforming into a cocoon, on a leaf, near flowers, photorealistic",
-    80: "A cocoon on a leaf, flowers in the background, photorealistic",
+    80: "A cocoon on a leaf, flowers in the background, photorealistic",
    120: "A cocoon maturing and a butterfly being born, flowers and leaves visible in the background, photorealistic",
    160: "A beautiful butterfly, vibrant colors, sitting on a leaf, flowers in the background, photorealistic",
    200: "A beautiful butterfly, flying away in a forest, photorealistic",
--- a/docs/source/en/api/pipelines/aura_flow.md
+++ b/docs/source/en/api/pipelines/aura_flow.md
@@ -89,23 +89,6 @@ image = pipeline(prompt).images[0]
 image.save("auraflow.png")
 ```

-## Support for `torch.compile()`
-
-AuraFlow can be compiled with `torch.compile()` to speed up inference latency even for different resolutions. First, install PyTorch nightly following the instructions from [here](https://pytorch.org/). The snippet below shows the changes needed to enable this:
-
-```diff
-+ torch.fx.experimental._config.use_duck_shape = False
-+ pipeline.transformer = torch.compile(
-    pipeline.transformer, fullgraph=True, dynamic=True
-)
-```
-
-Specifying `use_duck_shape` to be `False` instructs the compiler if it should use the same symbolic variable to represent input sizes that are the same. For more details, check out [this comment](https://github.com/huggingface/diffusers/pull/11327#discussion_r2047659790).
-
-This enables from 100% (on low resolutions) to a 30% (on 1536x1536 resolution) speed improvements.
-
-Thanks to [AstraliteHeart](https://github.com/huggingface/diffusers/pull/11297/) who helped us rewrite the [`AuraFlowTransformer2DModel`] class so that the above works for different resolutions ([PR](https://github.com/huggingface/diffusers/pull/11297/)).
-
 ## AuraFlowPipeline

 [[autodoc]] AuraFlowPipeline
--- a/docs/source/en/api/pipelines/cogvideox.md
+++ b/docs/source/en/api/pipelines/cogvideox.md
@@ -15,10 +15,6 @@

 # CogVideoX

-<div class="flex flex-wrap space-x-1">
-  <img alt="LoRA" src="https://img.shields.io/badge/LoRA-d8b4fe?style=flat"/>
-</div>
-
 [CogVideoX: Text-to-Video Diffusion Models with An Expert Transformer](https://arxiv.org/abs/2408.06072) from Tsinghua University & ZhipuAI, by Zhuoyi Yang, Jiayan Teng, Wendi Zheng, Ming Ding, Shiyu Huang, Jiazheng Xu, Yuanming Yang, Wenyi Hong, Xiaohan Zhang, Guanyu Feng, Da Yin, Xiaotao Gu, Yuxuan Zhang, Weihan Wang, Yean Cheng, Ting Liu, Bin Xu, Yuxiao Dong, Jie Tang.

 The abstract from the paper is:
--- a/docs/source/en/api/pipelines/consisid.md
+++ b/docs/source/en/api/pipelines/consisid.md
@@ -15,10 +15,6 @@

 # ConsisID

-<div class="flex flex-wrap space-x-1">
-  <img alt="LoRA" src="https://img.shields.io/badge/LoRA-d8b4fe?style=flat"/>
-</div>
-
 [Identity-Preserving Text-to-Video Generation by Frequency Decomposition](https://arxiv.org/abs/2411.17440) from Peking University & University of Rochester & etc, by Shenghai Yuan, Jinfa Huang, Xianyi He, Yunyang Ge, Yujun Shi, Liuhan Chen, Jiebo Luo, Li Yuan.

 The abstract from the paper is:
--- a/docs/source/en/api/pipelines/control_flux_inpaint.md
+++ b/docs/source/en/api/pipelines/control_flux_inpaint.md
@@ -12,10 +12,6 @@ specific language governing permissions and limitations under the License.

 # FluxControlInpaint

-<div class="flex flex-wrap space-x-1">
-  <img alt="LoRA" src="https://img.shields.io/badge/LoRA-d8b4fe?style=flat"/>
-</div>
-
 FluxControlInpaintPipeline is an implementation of Inpainting for Flux.1 Depth/Canny models. It is a pipeline that allows you to inpaint images using the Flux.1 Depth/Canny models. The pipeline takes an image and a mask as input and returns the inpainted image.

 FLUX.1 Depth and Canny [dev] is a 12 billion parameter rectified flow transformer capable of generating an image based on a text description while following the structure of a given input image. **This is not a ControlNet model**.
--- a/docs/source/en/api/pipelines/controlnet.md
+++ b/docs/source/en/api/pipelines/controlnet.md
@@ -12,10 +12,6 @@ specific language governing permissions and limitations under the License.

 # ControlNet

-<div class="flex flex-wrap space-x-1">
-  <img alt="LoRA" src="https://img.shields.io/badge/LoRA-d8b4fe?style=flat"/>
-</div>
-
 ControlNet was introduced in [Adding Conditional Control to Text-to-Image Diffusion Models](https://huggingface.co/papers/2302.05543) by Lvmin Zhang, Anyi Rao, and Maneesh Agrawala.

 With a ControlNet model, you can provide an additional control image to condition and control Stable Diffusion generation. For example, if you provide a depth map, the ControlNet model generates an image that'll preserve the spatial information from the depth map. It is a more flexible and accurate way to control the image generation process.
--- a/docs/source/en/api/pipelines/controlnet_flux.md
+++ b/docs/source/en/api/pipelines/controlnet_flux.md
@@ -12,10 +12,6 @@ specific language governing permissions and limitations under the License.

 # ControlNet with Flux.1

-<div class="flex flex-wrap space-x-1">
-  <img alt="LoRA" src="https://img.shields.io/badge/LoRA-d8b4fe?style=flat"/>
-</div>
-
 FluxControlNetPipeline is an implementation of ControlNet for Flux.1.

 ControlNet was introduced in [Adding Conditional Control to Text-to-Image Diffusion Models](https://huggingface.co/papers/2302.05543) by Lvmin Zhang, Anyi Rao, and Maneesh Agrawala.
--- a/docs/source/en/api/pipelines/controlnet_sana.md
+++ b/docs/source/en/api/pipelines/controlnet_sana.md
@@ -1,36 +0,0 @@
-<!--Copyright 2024 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-->
-
-# ControlNet
-
-<div class="flex flex-wrap space-x-1">
-  <img alt="LoRA" src="https://img.shields.io/badge/LoRA-d8b4fe?style=flat"/>
-</div>
-
-ControlNet was introduced in [Adding Conditional Control to Text-to-Image Diffusion Models](https://huggingface.co/papers/2302.05543) by Lvmin Zhang, Anyi Rao, and Maneesh Agrawala.
-
-With a ControlNet model, you can provide an additional control image to condition and control Stable Diffusion generation. For example, if you provide a depth map, the ControlNet model generates an image that'll preserve the spatial information from the depth map. It is a more flexible and accurate way to control the image generation process.
-
-The abstract from the paper is:
-
-*We present ControlNet, a neural network architecture to add spatial conditioning controls to large, pretrained text-to-image diffusion models. ControlNet locks the production-ready large diffusion models, and reuses their deep and robust encoding layers pretrained with billions of images as a strong backbone to learn a diverse set of conditional controls. The neural architecture is connected with "zero convolutions" (zero-initialized convolution layers) that progressively grow the parameters from zero and ensure that no harmful noise could affect the finetuning. We test various conditioning controls, eg, edges, depth, segmentation, human pose, etc, with Stable Diffusion, using single or multiple conditions, with or without prompts. We show that the training of ControlNets is robust with small (<50k) and large (>1m) datasets. Extensive results show that ControlNet may facilitate wider applications to control image diffusion models.*
-
-This pipeline was contributed by [ishan24](https://huggingface.co/ishan24). ❤️
-The original codebase can be found at [NVlabs/Sana](https://github.com/NVlabs/Sana), and you can find official ControlNet checkpoints on [Efficient-Large-Model's](https://huggingface.co/Efficient-Large-Model) Hub profile.
-
-## SanaControlNetPipeline
-[[autodoc]] SanaControlNetPipeline
-	- all
-	- __call__
-
-## SanaPipelineOutput
-[[autodoc]] pipelines.sana.pipeline_output.SanaPipelineOutput
--- a/docs/source/en/api/pipelines/controlnet_sd3.md
+++ b/docs/source/en/api/pipelines/controlnet_sd3.md
@@ -12,10 +12,6 @@ specific language governing permissions and limitations under the License.

 # ControlNet with Stable Diffusion 3

-<div class="flex flex-wrap space-x-1">
-  <img alt="LoRA" src="https://img.shields.io/badge/LoRA-d8b4fe?style=flat"/>
-</div>
-
 StableDiffusion3ControlNetPipeline is an implementation of ControlNet for Stable Diffusion 3.

 ControlNet was introduced in [Adding Conditional Control to Text-to-Image Diffusion Models](https://huggingface.co/papers/2302.05543) by Lvmin Zhang, Anyi Rao, and Maneesh Agrawala.
--- a/docs/source/en/api/pipelines/controlnet_sdxl.md
+++ b/docs/source/en/api/pipelines/controlnet_sdxl.md
@@ -12,10 +12,6 @@ specific language governing permissions and limitations under the License.

 # ControlNet with Stable Diffusion XL

-<div class="flex flex-wrap space-x-1">
-  <img alt="LoRA" src="https://img.shields.io/badge/LoRA-d8b4fe?style=flat"/>
-</div>
-
 ControlNet was introduced in [Adding Conditional Control to Text-to-Image Diffusion Models](https://huggingface.co/papers/2302.05543) by Lvmin Zhang, Anyi Rao, and Maneesh Agrawala.

 With a ControlNet model, you can provide an additional control image to condition and control Stable Diffusion generation. For example, if you provide a depth map, the ControlNet model generates an image that'll preserve the spatial information from the depth map. It is a more flexible and accurate way to control the image generation process.
--- a/docs/source/en/api/pipelines/controlnet_union.md
+++ b/docs/source/en/api/pipelines/controlnet_union.md
@@ -12,10 +12,6 @@ specific language governing permissions and limitations under the License.

 # ControlNetUnion

-<div class="flex flex-wrap space-x-1">
-  <img alt="LoRA" src="https://img.shields.io/badge/LoRA-d8b4fe?style=flat"/>
-</div>
-
 ControlNetUnionModel is an implementation of ControlNet for Stable Diffusion XL.

 The ControlNet model was introduced in [ControlNetPlus](https://github.com/xinsir6/ControlNetPlus) by xinsir6. It supports multiple conditioning inputs without increasing computation.
--- a/docs/source/en/api/pipelines/controlnetxs.md
+++ b/docs/source/en/api/pipelines/controlnetxs.md
@@ -12,10 +12,6 @@ specific language governing permissions and limitations under the License.

 # ControlNet-XS

-<div class="flex flex-wrap space-x-1">
-  <img alt="LoRA" src="https://img.shields.io/badge/LoRA-d8b4fe?style=flat"/>
-</div>
-
 ControlNet-XS was introduced in [ControlNet-XS](https://vislearn.github.io/ControlNet-XS/) by Denis Zavadski and Carsten Rother. It is based on the observation that the control model in the [original ControlNet](https://huggingface.co/papers/2302.05543) can be made much smaller and still produce good results.

 Like the original ControlNet model, you can provide an additional control image to condition and control Stable Diffusion generation. For example, if you provide a depth map, the ControlNet model generates an image that'll preserve the spatial information from the depth map. It is a more flexible and accurate way to control the image generation process.
--- a/docs/source/en/api/pipelines/cosmos.md
+++ b/docs/source/en/api/pipelines/cosmos.md
@@ -1,41 +0,0 @@
-<!-- Copyright 2024 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License. -->
-
-# Cosmos
-
-[Cosmos World Foundation Model Platform for Physical AI](https://huggingface.co/papers/2501.03575) by NVIDIA.
-
-*Physical AI needs to be trained digitally first. It needs a digital twin of itself, the policy model, and a digital twin of the world, the world model. In this paper, we present the Cosmos World Foundation Model Platform to help developers build customized world models for their Physical AI setups. We position a world foundation model as a general-purpose world model that can be fine-tuned into customized world models for downstream applications. Our platform covers a video curation pipeline, pre-trained world foundation models, examples of post-training of pre-trained world foundation models, and video tokenizers. To help Physical AI builders solve the most critical problems of our society, we make our platform open-source and our models open-weight with permissive licenses available via https://github.com/NVIDIA/Cosmos.*
-
-<Tip>
-
-Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines.
-
-</Tip>
-
-## CosmosTextToWorldPipeline
-
-[[autodoc]] CosmosTextToWorldPipeline
-  - all
-  - __call__
-
-## CosmosVideoToWorldPipeline
-
-[[autodoc]] CosmosVideoToWorldPipeline
-  - all
-  - __call__
-
-## CosmosPipelineOutput
-
-[[autodoc]] pipelines.cosmos.pipeline_output.CosmosPipelineOutput
--- a/docs/source/en/api/pipelines/deepfloyd_if.md
+++ b/docs/source/en/api/pipelines/deepfloyd_if.md
@@ -12,11 +12,6 @@ specific language governing permissions and limitations under the License.

 # DeepFloyd IF

-<div class="flex flex-wrap space-x-1">
-  <img alt="LoRA" src="https://img.shields.io/badge/LoRA-d8b4fe?style=flat"/>
-  <img alt="MPS" src="https://img.shields.io/badge/MPS-000000?style=flat&logo=apple&logoColor=white%22">
-</div>
-
 ## Overview

 DeepFloyd IF is a novel state-of-the-art open-source text-to-image model with a high degree of photorealism and language understanding.
--- a/docs/source/en/api/pipelines/easyanimate.md
+++ b/docs/source/en/api/pipelines/easyanimate.md
@@ -1,88 +0,0 @@
-<!--Copyright 2025 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-->
-
-# EasyAnimate
-[EasyAnimate](https://github.com/aigc-apps/EasyAnimate) by Alibaba PAI.
-
-The description from its GitHub page:
-*EasyAnimate is a pipeline based on the transformer architecture, designed for generating AI images and videos, and for training baseline models and Lora models for Diffusion Transformer. We support direct prediction from pre-trained EasyAnimate models, allowing for the generation of videos with various resolutions, approximately 6 seconds in length, at 8fps (EasyAnimateV5.1, 1 to 49 frames). Additionally, users can train their own baseline and Lora models for specific style transformations.*
-
-This pipeline was contributed by [bubbliiiing](https://github.com/bubbliiiing). The original codebase can be found [here](https://huggingface.co/alibaba-pai). The original weights can be found under [hf.co/alibaba-pai](https://huggingface.co/alibaba-pai).
-
-There are two official EasyAnimate checkpoints for text-to-video and video-to-video.
-
-| checkpoints | recommended inference dtype |
-|:---:|:---:|
-| [`alibaba-pai/EasyAnimateV5.1-12b-zh`](https://huggingface.co/alibaba-pai/EasyAnimateV5.1-12b-zh) | torch.float16 |
-| [`alibaba-pai/EasyAnimateV5.1-12b-zh-InP`](https://huggingface.co/alibaba-pai/EasyAnimateV5.1-12b-zh-InP) | torch.float16 |
-
-There is one official EasyAnimate checkpoint available for image-to-video and video-to-video.
-
-| checkpoints | recommended inference dtype |
-|:---:|:---:|
-| [`alibaba-pai/EasyAnimateV5.1-12b-zh-InP`](https://huggingface.co/alibaba-pai/EasyAnimateV5.1-12b-zh-InP) | torch.float16 |
-
-There are two official EasyAnimate checkpoints available for control-to-video.
-
-| checkpoints | recommended inference dtype |
-|:---:|:---:|
-| [`alibaba-pai/EasyAnimateV5.1-12b-zh-Control`](https://huggingface.co/alibaba-pai/EasyAnimateV5.1-12b-zh-Control) | torch.float16 |
-| [`alibaba-pai/EasyAnimateV5.1-12b-zh-Control-Camera`](https://huggingface.co/alibaba-pai/EasyAnimateV5.1-12b-zh-Control-Camera) | torch.float16 |
-
-For the EasyAnimateV5.1 series:
- Text-to-video (T2V) and Image-to-video (I2V) work for multiple resolutions. The width and height can vary from 256 to 1024.
- Both T2V and I2V models support generation with 1~49 frames and work best at this value. Exporting videos at 8 FPS is recommended.
-
-## Quantization
-
-Quantization helps reduce the memory requirements of very large models by storing model weights in a lower precision data type. However, quantization may have varying impact on video quality depending on the video model.
-
-Refer to the [Quantization](../../quantization/overview) overview to learn more about supported quantization backends and selecting a quantization backend that supports your use case. The example below demonstrates how to load a quantized [`EasyAnimatePipeline`] for inference with bitsandbytes.
-
-```py
-import torch
-from diffusers import BitsAndBytesConfig as DiffusersBitsAndBytesConfig, EasyAnimateTransformer3DModel, EasyAnimatePipeline
-from diffusers.utils import export_to_video
-
-quant_config = DiffusersBitsAndBytesConfig(load_in_8bit=True)
-transformer_8bit = EasyAnimateTransformer3DModel.from_pretrained(
-    "alibaba-pai/EasyAnimateV5.1-12b-zh",
-    subfolder="transformer",
-    quantization_config=quant_config,
-    torch_dtype=torch.float16,
-)
-
-pipeline = EasyAnimatePipeline.from_pretrained(
-    "alibaba-pai/EasyAnimateV5.1-12b-zh",
-    transformer=transformer_8bit,
-    torch_dtype=torch.float16,
-    device_map="balanced",
-)
-
-prompt = "A cat walks on the grass, realistic style."
-negative_prompt = "bad detailed"
-video = pipeline(prompt=prompt, negative_prompt=negative_prompt, num_frames=49, num_inference_steps=30).frames[0]
-export_to_video(video, "cat.mp4", fps=8)
-```
-
-## EasyAnimatePipeline
-
-[[autodoc]] EasyAnimatePipeline
-  - all
-  - __call__
-
-## EasyAnimatePipelineOutput
-
-[[autodoc]] pipelines.easyanimate.pipeline_output.EasyAnimatePipelineOutput
--- a/docs/source/en/api/pipelines/flux.md
+++ b/docs/source/en/api/pipelines/flux.md
@@ -12,11 +12,6 @@ specific language governing permissions and limitations under the License.

 # Flux

-<div class="flex flex-wrap space-x-1">
-  <img alt="LoRA" src="https://img.shields.io/badge/LoRA-d8b4fe?style=flat"/>
-  <img alt="MPS" src="https://img.shields.io/badge/MPS-000000?style=flat&logo=apple&logoColor=white%22">
-</div>
-
 Flux is a series of text-to-image generation models based on diffusion transformers. To know more about Flux, check out the original [blog post](https://blackforestlabs.ai/announcing-black-forest-labs/) by the creators of Flux, Black Forest Labs.

 Original model checkpoints for Flux can be found [here](https://huggingface.co/black-forest-labs). Original inference code can be found [here](https://github.com/black-forest-labs/flux).
@@ -347,7 +342,7 @@ image = pipe(
    height=1024,
    prompt="wearing sunglasses",
    negative_prompt="",
-    true_cfg_scale=4.0,
+    true_cfg=4.0,
    generator=torch.Generator().manual_seed(4444),
    ip_adapter_image=image,
 ).images[0]
@@ -360,74 +355,8 @@ image.save('flux_ip_adapter_output.jpg')
    <figcaption class="mt-2 text-sm text-center text-gray-500">IP-Adapter examples with prompt "wearing sunglasses"</figcaption>
 </div>

-## Optimize

-Flux is a very large model and requires ~50GB of RAM/VRAM to load all the modeling components. Enable some of the optimizations below to lower the memory requirements.
-
-### Group offloading
-
-[Group offloading](../../optimization/memory#group-offloading) lowers VRAM usage by offloading groups of internal layers rather than the whole model or weights. You need to use [`~hooks.apply_group_offloading`] on all the model components of a pipeline. The `offload_type` parameter allows you to toggle between block and leaf-level offloading. Setting it to `leaf_level` offloads the lowest leaf-level parameters to the CPU instead of offloading at the module-level.
-
-On CUDA devices that support asynchronous data streaming, set `use_stream=True` to overlap data transfer and computation to accelerate inference.
-
-> [!TIP]
-> It is possible to mix block and leaf-level offloading for different components in a pipeline.
-
-```py
-import torch
-from diffusers import FluxPipeline
-from diffusers.hooks import apply_group_offloading
-
-model_id = "black-forest-labs/FLUX.1-dev"
-dtype = torch.bfloat16
-pipe = FluxPipeline.from_pretrained(
-	model_id,
-	torch_dtype=dtype,
-)
-
-apply_group_offloading(
-    pipe.transformer,
-    offload_type="leaf_level",
-    offload_device=torch.device("cpu"),
-    onload_device=torch.device("cuda"),
-    use_stream=True,
-)
-apply_group_offloading(
-    pipe.text_encoder, 
-    offload_device=torch.device("cpu"),
-    onload_device=torch.device("cuda"),
-    offload_type="leaf_level",
-    use_stream=True,
-)
-apply_group_offloading(
-    pipe.text_encoder_2, 
-    offload_device=torch.device("cpu"),
-    onload_device=torch.device("cuda"),
-    offload_type="leaf_level",
-    use_stream=True,
-)
-apply_group_offloading(
-    pipe.vae, 
-    offload_device=torch.device("cpu"),
-    onload_device=torch.device("cuda"),
-    offload_type="leaf_level",
-    use_stream=True,
-)
-
-prompt="A cat wearing sunglasses and working as a lifeguard at pool."
-
-generator = torch.Generator().manual_seed(181201)
-image = pipe(
-    prompt,
-    width=576,
-    height=1024,
-    num_inference_steps=30,
-    generator=generator
-).images[0]
-image
-```
-
-### Running FP16 inference
+## Running FP16 inference

 Flux can generate high-quality images with FP16 (i.e. to accelerate inference on Turing/Volta GPUs) but produces different outputs compared to FP32/BF16. The issue is that some activations in the text encoders have to be clipped when running in FP16, which affects the overall image. Forcing text encoders to run with FP32 inference thus removes this output difference. See [here](https://github.com/huggingface/diffusers/pull/9097#issuecomment-2272292516) for details.

@@ -456,7 +385,7 @@ out = pipe(
 out.save("image.png")
 ```

-### Quantization
+## Quantization

 Quantization helps reduce the memory requirements of very large models by storing model weights in a lower precision data type. However, quantization may have varying impact on image quality depending on the model.

--- a/docs/source/en/api/pipelines/framepack.md
+++ b/docs/source/en/api/pipelines/framepack.md
@@ -1,209 +0,0 @@
-<!-- Copyright 2025 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License. -->
-
-# Framepack
-
-<div class="flex flex-wrap space-x-1">
-  <img alt="LoRA" src="https://img.shields.io/badge/LoRA-d8b4fe?style=flat"/>
-</div>
-
-[Packing Input Frame Context in Next-Frame Prediction Models for Video Generation](https://arxiv.org/abs/2504.12626) by Lvmin Zhang and Maneesh Agrawala.
-
-*We present a neural network structure, FramePack, to train next-frame (or next-frame-section) prediction models for video generation. The FramePack compresses input frames to make the transformer context length a fixed number regardless of the video length. As a result, we are able to process a large number of frames using video diffusion with computation bottleneck similar to image diffusion. This also makes the training video batch sizes significantly higher (batch sizes become comparable to image diffusion training). We also propose an anti-drifting sampling method that generates frames in inverted temporal order with early-established endpoints to avoid exposure bias (error accumulation over iterations). Finally, we show that existing video diffusion models can be finetuned with FramePack, and their visual quality may be improved because the next-frame prediction supports more balanced diffusion schedulers with less extreme flow shift timesteps.*
-
-<Tip>
-
-Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines.
-
-</Tip>
-
-## Available models
-
-| Model name | Description |
-|:---|:---|
- [`lllyasviel/FramePackI2V_HY`](https://huggingface.co/lllyasviel/FramePackI2V_HY) | Trained with the "inverted anti-drifting" strategy as described in the paper. Inference requires setting `sampling_type="inverted_anti_drifting"` when running the pipeline. |
- [`lllyasviel/FramePack_F1_I2V_HY_20250503`](https://huggingface.co/lllyasviel/FramePack_F1_I2V_HY_20250503) | Trained with a novel anti-drifting strategy but inference is performed in "vanilla" strategy as described in the paper. Inference requires setting `sampling_type="vanilla"` when running the pipeline. |
-
-## Usage
-
-Refer to the pipeline documentation for basic usage examples. The following section contains examples of offloading, different sampling methods, quantization, and more.
-
-### First and last frame to video
-
-The following example shows how to use Framepack with start and end image controls, using the inverted anti-drifting sampling model.
-
-```python
-import torch
-from diffusers import HunyuanVideoFramepackPipeline, HunyuanVideoFramepackTransformer3DModel
-from diffusers.utils import export_to_video, load_image
-from transformers import SiglipImageProcessor, SiglipVisionModel
-
-transformer = HunyuanVideoFramepackTransformer3DModel.from_pretrained(
-    "lllyasviel/FramePackI2V_HY", torch_dtype=torch.bfloat16
-)
-feature_extractor = SiglipImageProcessor.from_pretrained(
-    "lllyasviel/flux_redux_bfl", subfolder="feature_extractor"
-)
-image_encoder = SiglipVisionModel.from_pretrained(
-    "lllyasviel/flux_redux_bfl", subfolder="image_encoder", torch_dtype=torch.float16
-)
-pipe = HunyuanVideoFramepackPipeline.from_pretrained(
-    "hunyuanvideo-community/HunyuanVideo",
-    transformer=transformer,
-    feature_extractor=feature_extractor,
-    image_encoder=image_encoder,
-    torch_dtype=torch.float16,
-)
-
-# Enable memory optimizations
-pipe.enable_model_cpu_offload()
-pipe.vae.enable_tiling()
-
-prompt = "CG animation style, a small blue bird takes off from the ground, flapping its wings. The bird's feathers are delicate, with a unique pattern on its chest. The background shows a blue sky with white clouds under bright sunshine. The camera follows the bird upward, capturing its flight and the vastness of the sky from a close-up, low-angle perspective."
-first_image = load_image(
-    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/flf2v_input_first_frame.png"
-)
-last_image = load_image(
-    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/flf2v_input_last_frame.png"
-)
-output = pipe(
-    image=first_image,
-    last_image=last_image,
-    prompt=prompt,
-    height=512,
-    width=512,
-    num_frames=91,
-    num_inference_steps=30,
-    guidance_scale=9.0,
-    generator=torch.Generator().manual_seed(0),
-    sampling_type="inverted_anti_drifting",
-).frames[0]
-export_to_video(output, "output.mp4", fps=30)
-```
-
-### Vanilla sampling
-
-The following example shows how to use Framepack with the F1 model trained with vanilla sampling but new regulation approach for anti-drifting.
-
-```python
-import torch
-from diffusers import HunyuanVideoFramepackPipeline, HunyuanVideoFramepackTransformer3DModel
-from diffusers.utils import export_to_video, load_image
-from transformers import SiglipImageProcessor, SiglipVisionModel
-
-transformer = HunyuanVideoFramepackTransformer3DModel.from_pretrained(
-    "lllyasviel/FramePack_F1_I2V_HY_20250503", torch_dtype=torch.bfloat16
-)
-feature_extractor = SiglipImageProcessor.from_pretrained(
-    "lllyasviel/flux_redux_bfl", subfolder="feature_extractor"
-)
-image_encoder = SiglipVisionModel.from_pretrained(
-    "lllyasviel/flux_redux_bfl", subfolder="image_encoder", torch_dtype=torch.float16
-)
-pipe = HunyuanVideoFramepackPipeline.from_pretrained(
-    "hunyuanvideo-community/HunyuanVideo",
-    transformer=transformer,
-    feature_extractor=feature_extractor,
-    image_encoder=image_encoder,
-    torch_dtype=torch.float16,
-)
-
-# Enable memory optimizations
-pipe.enable_model_cpu_offload()
-pipe.vae.enable_tiling()
-
-image = load_image(
-    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/penguin.png"
-)
-output = pipe(
-    image=image,
-    prompt="A penguin dancing in the snow",
-    height=832,
-    width=480,
-    num_frames=91,
-    num_inference_steps=30,
-    guidance_scale=9.0,
-    generator=torch.Generator().manual_seed(0),
-    sampling_type="vanilla",
-).frames[0]
-export_to_video(output, "output.mp4", fps=30)
-```
-
-### Group offloading
-
-Group offloading ([`~hooks.apply_group_offloading`]) provides aggressive memory optimizations for offloading internal parts of any model to the CPU, with possibly no additional overhead to generation time. If you have very low VRAM available, this approach may be suitable for you depending on the amount of CPU RAM available.
-
-```python
-import torch
-from diffusers import HunyuanVideoFramepackPipeline, HunyuanVideoFramepackTransformer3DModel
-from diffusers.hooks import apply_group_offloading
-from diffusers.utils import export_to_video, load_image
-from transformers import SiglipImageProcessor, SiglipVisionModel
-
-transformer = HunyuanVideoFramepackTransformer3DModel.from_pretrained(
-    "lllyasviel/FramePack_F1_I2V_HY_20250503", torch_dtype=torch.bfloat16
-)
-feature_extractor = SiglipImageProcessor.from_pretrained(
-    "lllyasviel/flux_redux_bfl", subfolder="feature_extractor"
-)
-image_encoder = SiglipVisionModel.from_pretrained(
-    "lllyasviel/flux_redux_bfl", subfolder="image_encoder", torch_dtype=torch.float16
-)
-pipe = HunyuanVideoFramepackPipeline.from_pretrained(
-    "hunyuanvideo-community/HunyuanVideo",
-    transformer=transformer,
-    feature_extractor=feature_extractor,
-    image_encoder=image_encoder,
-    torch_dtype=torch.float16,
-)
-
-# Enable group offloading
-onload_device = torch.device("cuda")
-offload_device = torch.device("cpu")
-list(map(
-    lambda x: apply_group_offloading(x, onload_device, offload_device, offload_type="leaf_level", use_stream=True, low_cpu_mem_usage=True),
-    [pipe.text_encoder, pipe.text_encoder_2, pipe.transformer]
-))
-pipe.image_encoder.to(onload_device)
-pipe.vae.to(onload_device)
-pipe.vae.enable_tiling()
-
-image = load_image(
-    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/penguin.png"
-)
-output = pipe(
-    image=image,
-    prompt="A penguin dancing in the snow",
-    height=832,
-    width=480,
-    num_frames=91,
-    num_inference_steps=30,
-    guidance_scale=9.0,
-    generator=torch.Generator().manual_seed(0),
-    sampling_type="vanilla",
-).frames[0]
-print(f"Max memory: {torch.cuda.max_memory_allocated() / 1024**3:.3f} GB")
-export_to_video(output, "output.mp4", fps=30)
-```
-
-## HunyuanVideoFramepackPipeline
-
-[[autodoc]] HunyuanVideoFramepackPipeline
-  - all
-  - __call__
-
-## HunyuanVideoPipelineOutput
-
-[[autodoc]] pipelines.hunyuan_video.pipeline_output.HunyuanVideoPipelineOutput
-
--- a/docs/source/en/api/pipelines/hidream.md
+++ b/docs/source/en/api/pipelines/hidream.md
@@ -1,43 +0,0 @@
-<!-- Copyright 2024 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License. -->
-
-# HiDreamImage
-
-[HiDream-I1](https://huggingface.co/HiDream-ai) by HiDream.ai
-
-<Tip>
-
-Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines.
-
-</Tip>
-
-## Available models
-
-The following models are available for the [`HiDreamImagePipeline`](text-to-image) pipeline:
-
-| Model name | Description |
-|:---|:---|
-| [`HiDream-ai/HiDream-I1-Full`](https://huggingface.co/HiDream-ai/HiDream-I1-Full) | - |
-| [`HiDream-ai/HiDream-I1-Dev`](https://huggingface.co/HiDream-ai/HiDream-I1-Dev) | - |
-| [`HiDream-ai/HiDream-I1-Fast`](https://huggingface.co/HiDream-ai/HiDream-I1-Fast) | - |
-
-## HiDreamImagePipeline
-
-[[autodoc]] HiDreamImagePipeline
-  - all
-  - __call__
-
-## HiDreamImagePipelineOutput
-
-[[autodoc]] pipelines.hidream_image.pipeline_output.HiDreamImagePipelineOutput
--- a/docs/source/en/api/pipelines/hunyuan_video.md
+++ b/docs/source/en/api/pipelines/hunyuan_video.md
@@ -14,10 +14,6 @@

 # HunyuanVideo

-<div class="flex flex-wrap space-x-1">
-  <img alt="LoRA" src="https://img.shields.io/badge/LoRA-d8b4fe?style=flat"/>
-</div>
-
 [HunyuanVideo](https://www.arxiv.org/abs/2412.03603) by Tencent.

 *Recent advancements in video generation have significantly impacted daily life for both individuals and industries. However, the leading video generation models remain closed-source, resulting in a notable performance gap between industry capabilities and those available to the public. In this report, we introduce HunyuanVideo, an innovative open-source video foundation model that demonstrates performance in video generation comparable to, or even surpassing, that of leading closed-source models. HunyuanVideo encompasses a comprehensive framework that integrates several key elements, including data curation, advanced architectural design, progressive model scaling and training, and an efficient infrastructure tailored for large-scale model training and inference. As a result, we successfully trained a video generative model with over 13 billion parameters, making it the largest among all open-source models. We conducted extensive experiments and implemented a series of targeted designs to ensure high visual quality, motion dynamics, text-video alignment, and advanced filming techniques. According to evaluations by professionals, HunyuanVideo outperforms previous state-of-the-art models, including Runway Gen-3, Luma 1.6, and three top-performing Chinese video generative models. By releasing the code for the foundation model and its applications, we aim to bridge the gap between closed-source and open-source communities. This initiative will empower individuals within the community to experiment with their ideas, fostering a more dynamic and vibrant video generation ecosystem. The code is publicly available at [this https URL](https://github.com/tencent/HunyuanVideo).*
@@ -49,9 +45,7 @@ The following models are available for the image-to-video pipeline:

 | Model name | Description |
 |:---|:---|
-| [`Skywork/SkyReels-V1-Hunyuan-I2V`](https://huggingface.co/Skywork/SkyReels-V1-Hunyuan-I2V) | Skywork's custom finetune of HunyuanVideo (de-distilled). Performs best with `97x544x960` resolution. Performs best at `97x544x960` resolution, `guidance_scale=1.0`, `true_cfg_scale=6.0` and a negative prompt. |
-| [`hunyuanvideo-community/HunyuanVideo-I2V-33ch`](https://huggingface.co/hunyuanvideo-community/HunyuanVideo-I2V) | Tecent's official HunyuanVideo 33-channel I2V model. Performs best at resolutions of 480, 720, 960, 1280. A higher `shift` value when initializing the scheduler is recommended (good values are between 7 and 20). |
-| [`hunyuanvideo-community/HunyuanVideo-I2V`](https://huggingface.co/hunyuanvideo-community/HunyuanVideo-I2V) | Tecent's official HunyuanVideo 16-channel I2V model. Performs best at resolutions of 480, 720, 960, 1280. A higher `shift` value when initializing the scheduler is recommended (good values are between 7 and 20) |
+| [`Skywork/SkyReels-V1-Hunyuan-I2V`](https://huggingface.co/Skywork/SkyReels-V1-Hunyuan-I2V) | Skywork's custom finetune of HunyuanVideo (de-distilled). Performs best at `97x544x960` resolution, `guidance_scale=1.0`, `true_cfg_scale=6.0` and a negative prompt. |

 ## Quantization

--- a/docs/source/en/api/pipelines/kandinsky3.md
+++ b/docs/source/en/api/pipelines/kandinsky3.md
@@ -9,10 +9,6 @@ specific language governing permissions and limitations under the License.

 # Kandinsky 3

-<div class="flex flex-wrap space-x-1">
-  <img alt="LoRA" src="https://img.shields.io/badge/LoRA-d8b4fe?style=flat"/>
-</div>
-
 Kandinsky 3 is created by [Vladimir Arkhipkin](https://github.com/oriBetelgeuse),[Anastasia Maltseva](https://github.com/NastyaMittseva),[Igor Pavlov](https://github.com/boomb0om),[Andrei Filatov](https://github.com/anvilarth),[Arseniy Shakhmatov](https://github.com/cene555),[Andrey Kuznetsov](https://github.com/kuznetsoffandrey),[Denis Dimitrov](https://github.com/denndimitrov), [Zein Shaheen](https://github.com/zeinsh)

 The description from it's GitHub page:
--- a/docs/source/en/api/pipelines/kolors.md
+++ b/docs/source/en/api/pipelines/kolors.md
@@ -12,11 +12,6 @@ specific language governing permissions and limitations under the License.

 # Kolors: Effective Training of Diffusion Model for Photorealistic Text-to-Image Synthesis

-<div class="flex flex-wrap space-x-1">
-  <img alt="LoRA" src="https://img.shields.io/badge/LoRA-d8b4fe?style=flat"/>
-  <img alt="MPS" src="https://img.shields.io/badge/MPS-000000?style=flat&logo=apple&logoColor=white%22">
-</div>
-
 ![](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/kolors/kolors_header_collage.png)

 Kolors is a large-scale text-to-image generation model based on latent diffusion, developed by [the Kuaishou Kolors team](https://github.com/Kwai-Kolors/Kolors). Trained on billions of text-image pairs, Kolors exhibits significant advantages over both open-source and closed-source models in visual quality, complex semantic accuracy, and text rendering for both Chinese and English characters. Furthermore, Kolors supports both Chinese and English inputs, demonstrating strong performance in understanding and generating Chinese-specific content. For more details, please refer to this [technical report](https://github.com/Kwai-Kolors/Kolors/blob/master/imgs/Kolors_paper.pdf).
--- a/docs/source/en/api/pipelines/latent_consistency_models.md
+++ b/docs/source/en/api/pipelines/latent_consistency_models.md
@@ -12,10 +12,6 @@ specific language governing permissions and limitations under the License.

 # Latent Consistency Models

-<div class="flex flex-wrap space-x-1">
-  <img alt="LoRA" src="https://img.shields.io/badge/LoRA-d8b4fe?style=flat"/>
-</div>
-
 Latent Consistency Models (LCMs) were proposed in [Latent Consistency Models: Synthesizing High-Resolution Images with Few-Step Inference](https://huggingface.co/papers/2310.04378) by Simian Luo, Yiqin Tan, Longbo Huang, Jian Li, and Hang Zhao.

 The abstract of the paper is as follows:
--- a/docs/source/en/api/pipelines/ledits_pp.md
+++ b/docs/source/en/api/pipelines/ledits_pp.md
@@ -12,10 +12,6 @@ specific language governing permissions and limitations under the License.

 # LEDITS++

-<div class="flex flex-wrap space-x-1">
-  <img alt="LoRA" src="https://img.shields.io/badge/LoRA-d8b4fe?style=flat"/>
-</div>
-
 LEDITS++ was proposed in [LEDITS++: Limitless Image Editing using Text-to-Image Models](https://huggingface.co/papers/2311.16711) by Manuel Brack, Felix Friedrich, Katharina Kornmeier, Linoy Tsaban, Patrick Schramowski, Kristian Kersting, Apolinário Passos.

 The abstract from the paper is:
@@ -29,7 +25,7 @@ You can find additional information about LEDITS++ on the [project page](https:/
 </Tip>

 <Tip warning={true}>
-Due to some backward compatibility issues with the current diffusers implementation of [`~schedulers.DPMSolverMultistepScheduler`] this implementation of LEdits++ can no longer guarantee perfect inversion.
+Due to some backward compatibility issues with the current diffusers implementation of [`~schedulers.DPMSolverMultistepScheduler`] this implementation of LEdits++ can no longer guarantee perfect inversion.
 This issue is unlikely to have any noticeable effects on applied use-cases. However, we provide an alternative implementation that guarantees perfect inversion in a dedicated [GitHub repo](https://github.com/ml-research/ledits_pp).
 </Tip>

--- a/docs/source/en/api/pipelines/ltx_video.md
+++ b/docs/source/en/api/pipelines/ltx_video.md
@@ -14,11 +14,6 @@

 # LTX Video

-<div class="flex flex-wrap space-x-1">
-  <img alt="LoRA" src="https://img.shields.io/badge/LoRA-d8b4fe?style=flat"/>
-  <img alt="MPS" src="https://img.shields.io/badge/MPS-000000?style=flat&logo=apple&logoColor=white%22">
-</div>
-
 [LTX Video](https://huggingface.co/Lightricks/LTX-Video) is the first DiT-based video generation model capable of generating high-quality videos in real-time. It produces 24 FPS videos at a 768x512 resolution faster than they can be watched. Trained on a large-scale dataset of diverse videos, the model generates high-resolution videos with realistic and varied content. We provide a model for both text-to-video as well as image + text-to-video usecases.

 <Tip>
@@ -31,103 +26,11 @@ Available models:

 |  Model name   | Recommended dtype |
 |:-------------:|:-----------------:|
-| [`LTX Video 2B 0.9.0`](https://huggingface.co/Lightricks/LTX-Video/blob/main/ltx-video-2b-v0.9.safetensors) | `torch.bfloat16` |
-| [`LTX Video 2B 0.9.1`](https://huggingface.co/Lightricks/LTX-Video/blob/main/ltx-video-2b-v0.9.1.safetensors) | `torch.bfloat16` |
-| [`LTX Video 2B 0.9.5`](https://huggingface.co/Lightricks/LTX-Video/blob/main/ltx-video-2b-v0.9.5.safetensors) | `torch.bfloat16` |
-| [`LTX Video 13B 0.9.7`](https://huggingface.co/Lightricks/LTX-Video/blob/main/ltxv-13b-0.9.7-dev.safetensors) | `torch.bfloat16` |
-| [`LTX Video Spatial Upscaler 0.9.7`](https://huggingface.co/Lightricks/LTX-Video/blob/main/ltxv-spatial-upscaler-0.9.7.safetensors) | `torch.bfloat16` |
+| [`LTX Video 0.9.0`](https://huggingface.co/Lightricks/LTX-Video/blob/main/ltx-video-2b-v0.9.safetensors) | `torch.bfloat16` |
+| [`LTX Video 0.9.1`](https://huggingface.co/Lightricks/LTX-Video/blob/main/ltx-video-2b-v0.9.1.safetensors) | `torch.bfloat16` |

 Note: The recommended dtype is for the transformer component. The VAE and text encoders can be either `torch.float32`, `torch.bfloat16` or `torch.float16` but the recommended dtype is `torch.bfloat16` as used in the original repository.

-## Recommended settings for generation
-
-For the best results, it is recommended to follow the guidelines mentioned in the official LTX Video [repository](https://github.com/Lightricks/LTX-Video).
-
- Some variants of LTX Video are guidance-distilled. For guidance-distilled models, `guidance_scale` must be set to `1.0`. For any other models, `guidance_scale` should be set higher (e.g., `5.0`) for good generation quality.
- For variants with a timestep-aware VAE (LTXV 0.9.1 and above), it is recommended to set `decode_timestep` to `0.05` and `image_cond_noise_scale` to `0.025`.
- For variants that support interpolation between multiple conditioning images and videos (LTXV 0.9.5 and above), it is recommended to use similar looking images/videos for the best results. High divergence between the conditionings may lead to abrupt transitions in the generated video.
-
-## Using LTX Video 13B 0.9.7
-
-LTX Video 0.9.7 comes with a spatial latent upscaler and a 13B parameter transformer. The inference involves generating a low resolution video first, which is very fast, followed by upscaling and refining the generated video.
-
-<!-- TODO(aryan): modify when official checkpoints are available -->
-
-```python
-import torch
-from diffusers import LTXConditionPipeline, LTXLatentUpsamplePipeline
-from diffusers.pipelines.ltx.pipeline_ltx_condition import LTXVideoCondition
-from diffusers.utils import export_to_video, load_video
-
-pipe = LTXConditionPipeline.from_pretrained("a-r-r-o-w/LTX-Video-0.9.7-diffusers", torch_dtype=torch.bfloat16)
-pipe_upsample = LTXLatentUpsamplePipeline.from_pretrained("a-r-r-o-w/LTX-Video-0.9.7-Latent-Spatial-Upsampler-diffusers", vae=pipe.vae, torch_dtype=torch.bfloat16)
-pipe.to("cuda")
-pipe_upsample.to("cuda")
-pipe.vae.enable_tiling()
-
-def round_to_nearest_resolution_acceptable_by_vae(height, width):
-    height = height - (height % pipe.vae_temporal_compression_ratio)
-    width = width - (width % pipe.vae_temporal_compression_ratio)
-    return height, width
-
-video = load_video(
-    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/cosmos/cosmos-video2world-input-vid.mp4"
-)[:21]  # Use only the first 21 frames as conditioning
-condition1 = LTXVideoCondition(video=video, frame_index=0)
-
-prompt = "The video depicts a winding mountain road covered in snow, with a single vehicle traveling along it. The road is flanked by steep, rocky cliffs and sparse vegetation. The landscape is characterized by rugged terrain and a river visible in the distance. The scene captures the solitude and beauty of a winter drive through a mountainous region."
-negative_prompt = "worst quality, inconsistent motion, blurry, jittery, distorted"
-expected_height, expected_width = 768, 1152
-downscale_factor = 2 / 3
-num_frames = 161
-
-# Part 1. Generate video at smaller resolution
-# Text-only conditioning is also supported without the need to pass `conditions`
-downscaled_height, downscaled_width = int(expected_height * downscale_factor), int(expected_width * downscale_factor)
-downscaled_height, downscaled_width = round_to_nearest_resolution_acceptable_by_vae(downscaled_height, downscaled_width)
-latents = pipe(
-    conditions=[condition1],
-    prompt=prompt,
-    negative_prompt=negative_prompt,
-    width=downscaled_width,
-    height=downscaled_height,
-    num_frames=num_frames,
-    num_inference_steps=30,
-    generator=torch.Generator().manual_seed(0),
-    output_type="latent",
-).frames
-
-# Part 2. Upscale generated video using latent upsampler with fewer inference steps
-# The available latent upsampler upscales the height/width by 2x
-upscaled_height, upscaled_width = downscaled_height * 2, downscaled_width * 2
-upscaled_latents = pipe_upsample(
-    latents=latents,
-    output_type="latent"
-).frames
-
-# Part 3. Denoise the upscaled video with few steps to improve texture (optional, but recommended)
-video = pipe(
-    conditions=[condition1],
-    prompt=prompt,
-    negative_prompt=negative_prompt,
-    width=upscaled_width,
-    height=upscaled_height,
-    num_frames=num_frames,
-    denoise_strength=0.4,  # Effectively, 4 inference steps out of 10
-    num_inference_steps=10,
-    latents=upscaled_latents,
-    decode_timestep=0.05,
-    image_cond_noise_scale=0.025,
-    generator=torch.Generator().manual_seed(0),
-    output_type="pil",
-).frames[0]
-
-# Part 4. Downscale the video to the expected resolution
-video = [frame.resize((expected_width, expected_height)) for frame in video]
-
-export_to_video(video, "output.mp4", fps=24)
-```
-
 ## Loading Single Files

 Loading the original LTX Video checkpoints is also possible with [`~ModelMixin.from_single_file`]. We recommend using `from_single_file` for the Lightricks series of models, as they plan to release multiple models in the future in the single file format.
@@ -289,18 +192,6 @@ export_to_video(video, "ship.mp4", fps=24)
  - all
  - __call__

-## LTXConditionPipeline
-
-[[autodoc]] LTXConditionPipeline
-  - all
-  - __call__
-
-## LTXLatentUpsamplePipeline
-
-[[autodoc]] LTXLatentUpsamplePipeline
-  - all
-  - __call__
-
 ## LTXPipelineOutput

 [[autodoc]] pipelines.ltx.pipeline_output.LTXPipelineOutput
--- a/docs/source/en/api/pipelines/lumina.md
+++ b/docs/source/en/api/pipelines/lumina.md
@@ -58,10 +58,10 @@ Use [`torch.compile`](https://huggingface.co/docs/diffusers/main/en/tutorials/fa
 First, load the pipeline:

 ```python
-from diffusers import LuminaPipeline
+from diffusers import LuminaText2ImgPipeline
 import torch

-pipeline = LuminaPipeline.from_pretrained(
+pipeline = LuminaText2ImgPipeline.from_pretrained(
 	"Alpha-VLLM/Lumina-Next-SFT-diffusers", torch_dtype=torch.bfloat16
 ).to("cuda")
 ```
@@ -86,11 +86,11 @@ image = pipeline(prompt="Upper body of a young woman in a Victorian-era outfit w

 Quantization helps reduce the memory requirements of very large models by storing model weights in a lower precision data type. However, quantization may have varying impact on video quality depending on the video model.

-Refer to the [Quantization](../../quantization/overview) overview to learn more about supported quantization backends and selecting a quantization backend that supports your use case. The example below demonstrates how to load a quantized [`LuminaPipeline`] for inference with bitsandbytes.
+Refer to the [Quantization](../../quantization/overview) overview to learn more about supported quantization backends and selecting a quantization backend that supports your use case. The example below demonstrates how to load a quantized [`LuminaText2ImgPipeline`] for inference with bitsandbytes.

 ```py
 import torch
-from diffusers import BitsAndBytesConfig as DiffusersBitsAndBytesConfig, Transformer2DModel, LuminaPipeline
+from diffusers import BitsAndBytesConfig as DiffusersBitsAndBytesConfig, Transformer2DModel, LuminaText2ImgPipeline
 from transformers import BitsAndBytesConfig as BitsAndBytesConfig, T5EncoderModel

 quant_config = BitsAndBytesConfig(load_in_8bit=True)
@@ -109,7 +109,7 @@ transformer_8bit = Transformer2DModel.from_pretrained(
    torch_dtype=torch.float16,
 )

-pipeline = LuminaPipeline.from_pretrained(
+pipeline = LuminaText2ImgPipeline.from_pretrained(
    "Alpha-VLLM/Lumina-Next-SFT-diffusers",
    text_encoder=text_encoder_8bit,
    transformer=transformer_8bit,
@@ -122,9 +122,9 @@ image = pipeline(prompt).images[0]
 image.save("lumina.png")
 ```

-## LuminaPipeline
+## LuminaText2ImgPipeline

-[[autodoc]] LuminaPipeline
+[[autodoc]] LuminaText2ImgPipeline
 	- all
 	- __call__

--- a/docs/source/en/api/pipelines/lumina2.md
+++ b/docs/source/en/api/pipelines/lumina2.md
@@ -14,10 +14,6 @@

 # Lumina2

-<div class="flex flex-wrap space-x-1">
-  <img alt="LoRA" src="https://img.shields.io/badge/LoRA-d8b4fe?style=flat"/>
-</div>
-
 [Lumina Image 2.0: A Unified and Efficient Image Generative Model](https://huggingface.co/Alpha-VLLM/Lumina-Image-2.0) is a 2 billion parameter flow-based diffusion transformer capable of generating diverse images from text descriptions.

 The abstract from the paper is:
@@ -36,14 +32,14 @@ Single file loading for Lumina Image 2.0 is available for the `Lumina2Transforme

 ```python
 import torch
-from diffusers import Lumina2Transformer2DModel, Lumina2Pipeline
+from diffusers import Lumina2Transformer2DModel, Lumina2Text2ImgPipeline

 ckpt_path = "https://huggingface.co/Alpha-VLLM/Lumina-Image-2.0/blob/main/consolidated.00-of-01.pth"
 transformer = Lumina2Transformer2DModel.from_single_file(
    ckpt_path, torch_dtype=torch.bfloat16
 )

-pipe = Lumina2Pipeline.from_pretrained(
+pipe = Lumina2Text2ImgPipeline.from_pretrained(
    "Alpha-VLLM/Lumina-Image-2.0", transformer=transformer, torch_dtype=torch.bfloat16
 )
 pipe.enable_model_cpu_offload()
@@ -60,7 +56,7 @@ image.save("lumina-single-file.png")
 GGUF Quantized checkpoints for the `Lumina2Transformer2DModel` can be loaded via `from_single_file` with the `GGUFQuantizationConfig` 

 ```python
-from diffusers import Lumina2Transformer2DModel, Lumina2Pipeline, GGUFQuantizationConfig 
+from diffusers import Lumina2Transformer2DModel, Lumina2Text2ImgPipeline, GGUFQuantizationConfig 

 ckpt_path = "https://huggingface.co/calcuis/lumina-gguf/blob/main/lumina2-q4_0.gguf"
 transformer = Lumina2Transformer2DModel.from_single_file(
@@ -69,7 +65,7 @@ transformer = Lumina2Transformer2DModel.from_single_file(
    torch_dtype=torch.bfloat16,
 )

-pipe = Lumina2Pipeline.from_pretrained(
+pipe = Lumina2Text2ImgPipeline.from_pretrained(
    "Alpha-VLLM/Lumina-Image-2.0", transformer=transformer, torch_dtype=torch.bfloat16
 )
 pipe.enable_model_cpu_offload()
@@ -80,8 +76,8 @@ image = pipe(
 image.save("lumina-gguf.png")
 ```

-## Lumina2Pipeline
+## Lumina2Text2ImgPipeline

-[[autodoc]] Lumina2Pipeline
+[[autodoc]] Lumina2Text2ImgPipeline
  - all
  - __call__
--- a/docs/source/en/api/pipelines/marigold.md
+++ b/docs/source/en/api/pipelines/marigold.md
@@ -1,6 +1,4 @@
-<!--
-Copyright 2023-2025 Marigold Team, ETH Zürich. All rights reserved.
-Copyright 2024-2025 The HuggingFace Team. All rights reserved.
+<!--Copyright 2024 Marigold authors and The HuggingFace Team. All rights reserved.

 Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 the License. You may obtain a copy of the License at
@@ -12,120 +10,67 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
 specific language governing permissions and limitations under the License.
 -->

-# Marigold Computer Vision
+# Marigold Pipelines for Computer Vision Tasks

 ![marigold](https://marigoldmonodepth.github.io/images/teaser_collage_compressed.jpg)

-Marigold was proposed in 
-[Repurposing Diffusion-Based Image Generators for Monocular Depth Estimation](https://huggingface.co/papers/2312.02145), 
-a CVPR 2024 Oral paper by 
-[Bingxin Ke](http://www.kebingxin.com/), 
-[Anton Obukhov](https://www.obukhov.ai/), 
-[Shengyu Huang](https://shengyuh.github.io/), 
-[Nando Metzger](https://nandometzger.github.io/), 
-[Rodrigo Caye Daudt](https://rcdaudt.github.io/), and 
-[Konrad Schindler](https://scholar.google.com/citations?user=FZuNgqIAAAAJ&hl=en).
-The core idea is to **repurpose the generative prior of Text-to-Image Latent Diffusion Models (LDMs) for traditional 
-computer vision tasks**.
-This approach was explored by fine-tuning Stable Diffusion for **Monocular Depth Estimation**, as demonstrated in the 
-teaser above.
+Marigold was proposed in [Repurposing Diffusion-Based Image Generators for Monocular Depth Estimation](https://huggingface.co/papers/2312.02145), a CVPR 2024 Oral paper by [Bingxin Ke](http://www.kebingxin.com/), [Anton Obukhov](https://www.obukhov.ai/), [Shengyu Huang](https://shengyuh.github.io/), [Nando Metzger](https://nandometzger.github.io/), [Rodrigo Caye Daudt](https://rcdaudt.github.io/), and [Konrad Schindler](https://scholar.google.com/citations?user=FZuNgqIAAAAJ&hl=en).
+The idea is to repurpose the rich generative prior of Text-to-Image Latent Diffusion Models (LDMs) for traditional computer vision tasks.
+Initially, this idea was explored to fine-tune Stable Diffusion for Monocular Depth Estimation, as shown in the teaser above.
+Later,
+- [Tianfu Wang](https://tianfwang.github.io/) trained the first Latent Consistency Model (LCM) of Marigold, which unlocked fast single-step inference;
+- [Kevin Qu](https://www.linkedin.com/in/kevin-qu-b3417621b/?locale=en_US) extended the approach to Surface Normals Estimation;
+- [Anton Obukhov](https://www.obukhov.ai/) contributed the pipelines and documentation into diffusers (enabled and supported by [YiYi Xu](https://yiyixuxu.github.io/) and [Sayak Paul](https://sayak.dev/)).

-Marigold was later extended in the follow-up paper, 
-[Marigold: Affordable Adaptation of Diffusion-Based Image Generators for Image Analysis](https://huggingface.co/papers/2312.02145), 
-authored by 
-[Bingxin Ke](http://www.kebingxin.com/), 
-[Kevin Qu](https://www.linkedin.com/in/kevin-qu-b3417621b/?locale=en_US), 
-[Tianfu Wang](https://tianfwang.github.io/), 
-[Nando Metzger](https://nandometzger.github.io/), 
-[Shengyu Huang](https://shengyuh.github.io/), 
-[Bo Li](https://www.linkedin.com/in/bobboli0202/), 
-[Anton Obukhov](https://www.obukhov.ai/), and 
-[Konrad Schindler](https://scholar.google.com/citations?user=FZuNgqIAAAAJ&hl=en).
-This work expanded Marigold to support new modalities such as **Surface Normals** and **Intrinsic Image Decomposition** 
-(IID), introduced a training protocol for **Latent Consistency Models** (LCM), and demonstrated **High-Resolution** (HR) 
-processing capability.
+The abstract from the paper is:

-<Tip>
-
-The early Marigold models (`v1-0` and earlier) were optimized for best results with at least 10 inference steps.
-LCM models were later developed to enable high-quality inference in just 1 to 4 steps.
-Marigold models `v1-1` and later use the DDIM scheduler to achieve optimal 
-results in as few as 1 to 4 steps.
-
-</Tip>
+*Monocular depth estimation is a fundamental computer vision task. Recovering 3D depth from a single image is geometrically ill-posed and requires scene understanding, so it is not surprising that the rise of deep learning has led to a breakthrough. The impressive progress of monocular depth estimators has mirrored the growth in model capacity, from relatively modest CNNs to large Transformer architectures. Still, monocular depth estimators tend to struggle when presented with images with unfamiliar content and layout, since their knowledge of the visual world is restricted by the data seen during training, and challenged by zero-shot generalization to new domains. This motivates us to explore whether the extensive priors captured in recent generative diffusion models can enable better, more generalizable depth estimation. We introduce Marigold, a method for affine-invariant monocular depth estimation that is derived from Stable Diffusion and retains its rich prior knowledge. The estimator can be fine-tuned in a couple of days on a single GPU using only synthetic training data. It delivers state-of-the-art performance across a wide range of datasets, including over 20% performance gains in specific cases. Project page: https://marigoldmonodepth.github.io.*

 ## Available Pipelines

-Each pipeline is tailored for a specific computer vision task, processing an input RGB image and generating a 
-corresponding prediction.
-Currently, the following computer vision tasks are implemented:
+Each pipeline supports one Computer Vision task, which takes an RGB image as input and produces a *prediction* of the modality of interest, such as a depth map of the input image.
+Currently, the following tasks are implemented:
+
+| Pipeline                                                                                                                                    | Predicted Modalities                                                                                             |                                                                       Demos                                                                        |
+|---------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------|:--------------------------------------------------------------------------------------------------------------------------------------------------:|
+| [MarigoldDepthPipeline](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/marigold/pipeline_marigold_depth.py)     | [Depth](https://en.wikipedia.org/wiki/Depth_map), [Disparity](https://en.wikipedia.org/wiki/Binocular_disparity) | [Fast Demo (LCM)](https://huggingface.co/spaces/prs-eth/marigold-lcm), [Slow Original Demo (DDIM)](https://huggingface.co/spaces/prs-eth/marigold) |
+| [MarigoldNormalsPipeline](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/marigold/pipeline_marigold_normals.py) | [Surface normals](https://en.wikipedia.org/wiki/Normal_mapping)                                                  |                                   [Fast Demo (LCM)](https://huggingface.co/spaces/prs-eth/marigold-normals-lcm)                                    |

-| Pipeline                                                                                                                                          | Recommended Model Checkpoints                                                                                                                                                                           |                              Spaces (Interactive Apps)                               | Predicted Modalities                                                                                                                                                               |
-|---------------------------------------------------------------------------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:------------------------------------------------------------------------------------:|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
-| [MarigoldDepthPipeline](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/marigold/pipeline_marigold_depth.py)           | [prs-eth/marigold-depth-v1-1](https://huggingface.co/prs-eth/marigold-depth-v1-1)                                                                                                                       |          [Depth Estimation](https://huggingface.co/spaces/prs-eth/marigold)          | [Depth](https://en.wikipedia.org/wiki/Depth_map), [Disparity](https://en.wikipedia.org/wiki/Binocular_disparity)                                                                   |
-| [MarigoldNormalsPipeline](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/marigold/pipeline_marigold_normals.py)       | [prs-eth/marigold-normals-v1-1](https://huggingface.co/prs-eth/marigold-normals-v1-1)                                                                                                                   | [Surface Normals Estimation](https://huggingface.co/spaces/prs-eth/marigold-normals) | [Surface normals](https://en.wikipedia.org/wiki/Normal_mapping)                                                                                                                    |
-| [MarigoldIntrinsicsPipeline](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/marigold/pipeline_marigold_intrinsics.py) | [prs-eth/marigold-iid-appearance-v1-1](https://huggingface.co/prs-eth/marigold-iid-appearance-v1-1),<br>[prs-eth/marigold-iid-lighting-v1-1](https://huggingface.co/prs-eth/marigold-iid-lighting-v1-1) | [Intrinsic Image Decomposition](https://huggingface.co/spaces/prs-eth/marigold-iid)  | [Albedo](https://en.wikipedia.org/wiki/Albedo), [Materials](https://www.n.aiq3d.com/wiki/roughnessmetalnessao-map), [Lighting](https://en.wikipedia.org/wiki/Diffuse_reflection)   |

 ## Available Checkpoints

-All original checkpoints are available under the [PRS-ETH](https://huggingface.co/prs-eth/) organization on Hugging Face.
-They are designed for use with diffusers pipelines and the [original codebase](https://github.com/prs-eth/marigold), which can also be used to train 
-new model checkpoints.
-The following is a summary of the recommended checkpoints, all of which produce reliable results with 1 to 4 steps. 
-
-| Checkpoint                                                                                          | Modality     | Comment                                                                                                                                                                              |
-|-----------------------------------------------------------------------------------------------------|--------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
-| [prs-eth/marigold-depth-v1-1](https://huggingface.co/prs-eth/marigold-depth-v1-1)                   | Depth        | Affine-invariant depth prediction assigns each pixel a value between 0 (near plane) and 1 (far plane), with both planes determined by the model during inference.                    |
-| [prs-eth/marigold-normals-v0-1](https://huggingface.co/prs-eth/marigold-normals-v0-1)               | Normals      | The surface normals predictions are unit-length 3D vectors in the screen space camera, with values in the range from -1 to 1.                                                        |
-| [prs-eth/marigold-iid-appearance-v1-1](https://huggingface.co/prs-eth/marigold-iid-appearance-v1-1) | Intrinsics   | InteriorVerse decomposition is comprised of Albedo and two BRDF material properties: Roughness and Metallicity.                                                                      | 
-| [prs-eth/marigold-iid-lighting-v1-1](https://huggingface.co/prs-eth/marigold-iid-lighting-v1-1)     | Intrinsics   | HyperSim decomposition of an image &nbsp\\(I\\)&nbsp is comprised of Albedo &nbsp\\(A\\), Diffuse shading &nbsp\\(S\\), and Non-diffuse residual &nbsp\\(R\\): &nbsp\\(I = A*S+R\\). |
+The original checkpoints can be found under the [PRS-ETH](https://huggingface.co/prs-eth/) Hugging Face organization.

 <Tip>

-Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff 
-between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-a-pipeline) section to learn how to 
-efficiently load the same components into multiple pipelines. 
-Also, to know more about reducing the memory usage of this pipeline, refer to the ["Reduce memory usage"] section 
-[here](../../using-diffusers/svd#reduce-memory-usage).
+Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines. Also, to know more about reducing the memory usage of this pipeline, refer to the [Reduce memory usage](../../using-diffusers/svd#reduce-memory-usage) section.

 </Tip>

 <Tip warning={true}>

-Marigold pipelines were designed and tested with the scheduler embedded in the model checkpoint.
-The optimal number of inference steps varies by scheduler, with no universal value that works best across all cases.
-To accommodate this, the `num_inference_steps` parameter in the pipeline's `__call__` method defaults to `None` (see the 
-API reference).
-Unless set explicitly, it inherits the value from the `default_denoising_steps` field in the checkpoint configuration 
-file (`model_index.json`).
-This ensures high-quality predictions when invoking the pipeline with only the `image` argument.
+Marigold pipelines were designed and tested only with `DDIMScheduler` and `LCMScheduler`.
+Depending on the scheduler, the number of inference steps required to get reliable predictions varies, and there is no universal value that works best across schedulers.
+Because of that, the default value of `num_inference_steps` in the `__call__` method of the pipeline is set to `None` (see the API reference).
+Unless set explicitly, its value will be taken from the checkpoint configuration `model_index.json`.
+This is done to ensure high-quality predictions when calling the pipeline with just the `image` argument.

 </Tip>

-See also Marigold [usage examples](../../using-diffusers/marigold_usage).
-
-## Marigold Depth Prediction API
+See also Marigold [usage examples](../../using-diffusers/marigold_usage).

+## MarigoldDepthPipeline
 [[autodoc]] MarigoldDepthPipeline
+	- all
 	- __call__

+## MarigoldNormalsPipeline
+[[autodoc]] MarigoldNormalsPipeline
+	- all
+	- __call__
+
+## MarigoldDepthOutput
 [[autodoc]] pipelines.marigold.pipeline_marigold_depth.MarigoldDepthOutput

-[[autodoc]] pipelines.marigold.marigold_image_processing.MarigoldImageProcessor.visualize_depth
-
-## Marigold Normals Estimation API
-[[autodoc]] MarigoldNormalsPipeline
-	- __call__
-
-[[autodoc]] pipelines.marigold.pipeline_marigold_normals.MarigoldNormalsOutput
-
-[[autodoc]] pipelines.marigold.marigold_image_processing.MarigoldImageProcessor.visualize_normals
-
-## Marigold Intrinsic Image Decomposition API
-
-[[autodoc]] MarigoldIntrinsicsPipeline
-	- __call__
-
-[[autodoc]] pipelines.marigold.pipeline_marigold_intrinsics.MarigoldIntrinsicsOutput
-
-[[autodoc]] pipelines.marigold.marigold_image_processing.MarigoldImageProcessor.visualize_intrinsics
+## MarigoldNormalsOutput
+[[autodoc]] pipelines.marigold.pipeline_marigold_normals.MarigoldNormalsOutput
--- a/docs/source/en/api/pipelines/mochi.md
+++ b/docs/source/en/api/pipelines/mochi.md
@@ -15,10 +15,6 @@

 # Mochi 1 Preview

-<div class="flex flex-wrap space-x-1">
-  <img alt="LoRA" src="https://img.shields.io/badge/LoRA-d8b4fe?style=flat"/>
-</div>
-
 > [!TIP]
 > Only a research preview of the model weights is available at the moment.

--- a/docs/source/en/api/pipelines/overview.md
+++ b/docs/source/en/api/pipelines/overview.md
@@ -54,7 +54,7 @@ The table below lists all the pipelines currently available in 🤗 Diffusers an
 | [DiT](dit) | text2image |
 | [Flux](flux) | text2image |
 | [Hunyuan-DiT](hunyuandit) | text2image |
-| [I2VGen-XL](i2vgenxl) | image2video |
+| [I2VGen-XL](i2vgenxl) | image2video |
 | [InstructPix2Pix](pix2pix) | image editing |
 | [Kandinsky 2.1](kandinsky) | text2image, image2image, inpainting, interpolation |
 | [Kandinsky 2.2](kandinsky_v22) | text2image, image2image, inpainting |
@@ -65,7 +65,7 @@ The table below lists all the pipelines currently available in 🤗 Diffusers an
 | [Latte](latte) | text2image |
 | [LEDITS++](ledits_pp) | image editing |
 | [Lumina-T2X](lumina) | text2image |
-| [Marigold](marigold) | depth-estimation, normals-estimation, intrinsic-decomposition |
+| [Marigold](marigold) | depth-estimation, normals-estimation |
 | [MultiDiffusion](panorama) | text2image |
 | [MusicLDM](musicldm) | text2audio |
 | [PAG](pag) | text2image |
@@ -89,7 +89,6 @@ The table below lists all the pipelines currently available in 🤗 Diffusers an
 | [UniDiffuser](unidiffuser) | text2image, image2text, image variation, text variation, unconditional image generation, unconditional audio generation |
 | [Value-guided planning](value_guided_sampling) | value guided sampling |
 | [Wuerstchen](wuerstchen) | text2image |
-| [VisualCloze](visualcloze) | text2image, image2image, subject driven generation, inpainting, style transfer, image restoration, image editing, [depth,normal,edge,pose]2image, [depth,normal,edge,pose]-estimation, virtual try-on, image relighting |

 ## DiffusionPipeline

--- a/docs/source/en/api/pipelines/pag.md
+++ b/docs/source/en/api/pipelines/pag.md
@@ -12,10 +12,6 @@ specific language governing permissions and limitations under the License.

 # Perturbed-Attention Guidance

-<div class="flex flex-wrap space-x-1">
-  <img alt="LoRA" src="https://img.shields.io/badge/LoRA-d8b4fe?style=flat"/>
-</div>
-
 [Perturbed-Attention Guidance (PAG)](https://ku-cvlab.github.io/Perturbed-Attention-Guidance/) is a new diffusion sampling guidance that improves sample quality across both unconditional and conditional settings, achieving this without requiring further training or the integration of external modules.

 PAG was introduced in [Self-Rectifying Diffusion Sampling with Perturbed-Attention Guidance](https://huggingface.co/papers/2403.17377) by Donghoon Ahn, Hyoungwon Cho, Jaewon Min, Wooseok Jang, Jungwoo Kim, SeonHwa Kim, Hyun Hee Park, Kyong Hwan Jin and Seungryong Kim.
--- a/docs/source/en/api/pipelines/panorama.md
+++ b/docs/source/en/api/pipelines/panorama.md
@@ -12,10 +12,6 @@ specific language governing permissions and limitations under the License.

 # MultiDiffusion

-<div class="flex flex-wrap space-x-1">
-  <img alt="LoRA" src="https://img.shields.io/badge/LoRA-d8b4fe?style=flat"/>
-</div>
-
 [MultiDiffusion: Fusing Diffusion Paths for Controlled Image Generation](https://huggingface.co/papers/2302.08113) is by Omer Bar-Tal, Lior Yariv, Yaron Lipman, and Tali Dekel.

 The abstract from the paper is:
--- a/docs/source/en/api/pipelines/pia.md
+++ b/docs/source/en/api/pipelines/pia.md
@@ -12,10 +12,6 @@ specific language governing permissions and limitations under the License.

 # Image-to-Video Generation with PIA (Personalized Image Animator)

-<div class="flex flex-wrap space-x-1">
-  <img alt="LoRA" src="https://img.shields.io/badge/LoRA-d8b4fe?style=flat"/>
-</div>
-
 ## Overview

 [PIA: Your Personalized Image Animator via Plug-and-Play Modules in Text-to-Image Models](https://arxiv.org/abs/2312.13964) by Yiming Zhang, Zhening Xing, Yanhong Zeng, Youqing Fang, Kai Chen
--- a/docs/source/en/api/pipelines/pix2pix.md
+++ b/docs/source/en/api/pipelines/pix2pix.md
@@ -12,10 +12,6 @@ specific language governing permissions and limitations under the License.

 # InstructPix2Pix

-<div class="flex flex-wrap space-x-1">
-  <img alt="LoRA" src="https://img.shields.io/badge/LoRA-d8b4fe?style=flat"/>
-</div>
-
 [InstructPix2Pix: Learning to Follow Image Editing Instructions](https://huggingface.co/papers/2211.09800) is by Tim Brooks, Aleksander Holynski and Alexei A. Efros.

 The abstract from the paper is:
--- a/docs/source/en/api/pipelines/sana.md
+++ b/docs/source/en/api/pipelines/sana.md
@@ -14,11 +14,6 @@

 # SanaPipeline

-<div class="flex flex-wrap space-x-1">
-  <img alt="LoRA" src="https://img.shields.io/badge/LoRA-d8b4fe?style=flat"/>
-  <img alt="MPS" src="https://img.shields.io/badge/MPS-000000?style=flat&logo=apple&logoColor=white%22">
-</div>
-
 [SANA: Efficient High-Resolution Image Synthesis with Linear Diffusion Transformers](https://huggingface.co/papers/2410.10629) from NVIDIA and MIT HAN Lab, by Enze Xie, Junsong Chen, Junyu Chen, Han Cai, Haotian Tang, Yujun Lin, Zhekai Zhang, Muyang Li, Ligeng Zhu, Yao Lu, Song Han.

 The abstract from the paper is:
--- a/docs/source/en/api/pipelines/sana_sprint.md
+++ b/docs/source/en/api/pipelines/sana_sprint.md
@@ -1,100 +0,0 @@
-<!-- Copyright 2024 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License. -->
-
-# SANA-Sprint
-
-<div class="flex flex-wrap space-x-1">
-  <img alt="LoRA" src="https://img.shields.io/badge/LoRA-d8b4fe?style=flat"/>
-</div>
-
-[SANA-Sprint: One-Step Diffusion with Continuous-Time Consistency Distillation](https://huggingface.co/papers/2503.09641) from NVIDIA, MIT HAN Lab, and Hugging Face by Junsong Chen, Shuchen Xue, Yuyang Zhao, Jincheng Yu, Sayak Paul, Junyu Chen, Han Cai, Enze Xie, Song Han
-
-The abstract from the paper is:
-
-*This paper presents SANA-Sprint, an efficient diffusion model for ultra-fast text-to-image (T2I) generation. SANA-Sprint is built on a pre-trained foundation model and augmented with hybrid distillation, dramatically reducing inference steps from 20 to 1-4. We introduce three key innovations: (1) We propose a training-free approach that transforms a pre-trained flow-matching model for continuous-time consistency distillation (sCM), eliminating costly training from scratch and achieving high training efficiency. Our hybrid distillation strategy combines sCM with latent adversarial distillation (LADD): sCM ensures alignment with the teacher model, while LADD enhances single-step generation fidelity. (2) SANA-Sprint is a unified step-adaptive model that achieves high-quality generation in 1-4 steps, eliminating step-specific training and improving efficiency. (3) We integrate ControlNet with SANA-Sprint for real-time interactive image generation, enabling instant visual feedback for user interaction. SANA-Sprint establishes a new Pareto frontier in speed-quality tradeoffs, achieving state-of-the-art performance with 7.59 FID and 0.74 GenEval in only 1 step — outperforming FLUX-schnell (7.94 FID / 0.71 GenEval) while being 10× faster (0.1s vs 1.1s on H100). It also achieves 0.1s (T2I) and 0.25s (ControlNet) latency for 1024×1024 images on H100, and 0.31s (T2I) on an RTX 4090, showcasing its exceptional efficiency and potential for AI-powered consumer applications (AIPC). Code and pre-trained models will be open-sourced.*
-
-<Tip>
-
-Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines.
-
-</Tip>
-
-This pipeline was contributed by [lawrence-cj](https://github.com/lawrence-cj), [shuchen Xue](https://github.com/scxue) and [Enze Xie](https://github.com/xieenze). The original codebase can be found [here](https://github.com/NVlabs/Sana). The original weights can be found under [hf.co/Efficient-Large-Model](https://huggingface.co/Efficient-Large-Model/).
-
-Available models:
-
-|                                                                    Model                                                                    | Recommended dtype |
-|:-------------------------------------------------------------------------------------------------------------------------------------------:|:-----------------:|
-| [`Efficient-Large-Model/Sana_Sprint_1.6B_1024px_diffusers`](https://huggingface.co/Efficient-Large-Model/Sana_Sprint_1.6B_1024px_diffusers) | `torch.bfloat16`  |
-| [`Efficient-Large-Model/Sana_Sprint_0.6B_1024px_diffusers`](https://huggingface.co/Efficient-Large-Model/Sana_Sprint_0.6B_1024px_diffusers) | `torch.bfloat16`  |
-
-Refer to [this](https://huggingface.co/collections/Efficient-Large-Model/sana-sprint-67d6810d65235085b3b17c76) collection for more information.
-
-Note: The recommended dtype mentioned is for the transformer weights. The text encoder must stay in `torch.bfloat16` and VAE weights must stay in `torch.bfloat16` or `torch.float32` for the model to work correctly. Please refer to the inference example below to see how to load the model with the recommended dtype. 
-
-
-## Quantization
-
-Quantization helps reduce the memory requirements of very large models by storing model weights in a lower precision data type. However, quantization may have varying impact on video quality depending on the video model.
-
-Refer to the [Quantization](../../quantization/overview) overview to learn more about supported quantization backends and selecting a quantization backend that supports your use case. The example below demonstrates how to load a quantized [`SanaSprintPipeline`] for inference with bitsandbytes.
-
-```py
-import torch
-from diffusers import BitsAndBytesConfig as DiffusersBitsAndBytesConfig, SanaTransformer2DModel, SanaSprintPipeline
-from transformers import BitsAndBytesConfig as BitsAndBytesConfig, AutoModel
-
-quant_config = BitsAndBytesConfig(load_in_8bit=True)
-text_encoder_8bit = AutoModel.from_pretrained(
-    "Efficient-Large-Model/Sana_Sprint_1.6B_1024px_diffusers",
-    subfolder="text_encoder",
-    quantization_config=quant_config,
-    torch_dtype=torch.bfloat16,
-)
-
-quant_config = DiffusersBitsAndBytesConfig(load_in_8bit=True)
-transformer_8bit = SanaTransformer2DModel.from_pretrained(
-    "Efficient-Large-Model/Sana_Sprint_1.6B_1024px_diffusers",
-    subfolder="transformer",
-    quantization_config=quant_config,
-    torch_dtype=torch.bfloat16,
-)
-
-pipeline = SanaSprintPipeline.from_pretrained(
-    "Efficient-Large-Model/Sana_Sprint_1.6B_1024px_diffusers",
-    text_encoder=text_encoder_8bit,
-    transformer=transformer_8bit,
-    torch_dtype=torch.bfloat16,
-    device_map="balanced",
-)
-
-prompt = "a tiny astronaut hatching from an egg on the moon"
-image = pipeline(prompt).images[0]
-image.save("sana.png")
-```
-
-## Setting `max_timesteps`
-
-Users can tweak the `max_timesteps` value for experimenting with the visual quality of the generated outputs. The default `max_timesteps` value was obtained with an inference-time search process. For more details about it, check out the paper.
-
-## SanaSprintPipeline
-
-[[autodoc]] SanaSprintPipeline
-  - all
-  - __call__
-
-
-## SanaPipelineOutput
-
-[[autodoc]] pipelines.sana.pipeline_output.SanaPipelineOutput
--- a/docs/source/en/api/pipelines/stable_diffusion/depth2img.md
+++ b/docs/source/en/api/pipelines/stable_diffusion/depth2img.md
@@ -12,10 +12,6 @@ specific language governing permissions and limitations under the License.

 # Depth-to-image

-<div class="flex flex-wrap space-x-1">
-  <img alt="LoRA" src="https://img.shields.io/badge/LoRA-d8b4fe?style=flat"/>
-</div>
-
 The Stable Diffusion model can also infer depth based on an image using [MiDaS](https://github.com/isl-org/MiDaS). This allows you to pass a text prompt and an initial image to condition the generation of new images as well as a `depth_map` to preserve the image structure.

 <Tip>
--- a/docs/source/en/api/pipelines/stable_diffusion/img2img.md
+++ b/docs/source/en/api/pipelines/stable_diffusion/img2img.md
@@ -12,10 +12,6 @@ specific language governing permissions and limitations under the License.

 # Image-to-image

-<div class="flex flex-wrap space-x-1">
-  <img alt="LoRA" src="https://img.shields.io/badge/LoRA-d8b4fe?style=flat"/>
-</div>
-
 The Stable Diffusion model can also be applied to image-to-image generation by passing a text prompt and an initial image to condition the generation of new images.

 The [`StableDiffusionImg2ImgPipeline`] uses the diffusion-denoising mechanism proposed in [SDEdit: Guided Image Synthesis and Editing with Stochastic Differential Equations](https://huggingface.co/papers/2108.01073) by Chenlin Meng, Yutong He, Yang Song, Jiaming Song, Jiajun Wu, Jun-Yan Zhu, Stefano Ermon.
--- a/docs/source/en/api/pipelines/stable_diffusion/inpaint.md
+++ b/docs/source/en/api/pipelines/stable_diffusion/inpaint.md
@@ -12,10 +12,6 @@ specific language governing permissions and limitations under the License.

 # Inpainting

-<div class="flex flex-wrap space-x-1">
-  <img alt="LoRA" src="https://img.shields.io/badge/LoRA-d8b4fe?style=flat"/>
-</div>
-
 The Stable Diffusion model can also be applied to inpainting which lets you edit specific parts of an image by providing a mask and a text prompt using Stable Diffusion.

 ## Tips
--- a/docs/source/en/api/pipelines/stable_diffusion/ldm3d_diffusion.md
+++ b/docs/source/en/api/pipelines/stable_diffusion/ldm3d_diffusion.md
@@ -12,10 +12,6 @@ specific language governing permissions and limitations under the License.

 # Text-to-(RGB, depth)

-<div class="flex flex-wrap space-x-1">
-  <img alt="LoRA" src="https://img.shields.io/badge/LoRA-d8b4fe?style=flat"/>
-</div>
-
 LDM3D was proposed in [LDM3D: Latent Diffusion Model for 3D](https://huggingface.co/papers/2305.10853) by Gabriela Ben Melech Stan, Diana Wofk, Scottie Fox, Alex Redden, Will Saxton, Jean Yu, Estelle Aflalo, Shao-Yen Tseng, Fabio Nonato, Matthias Muller, and Vasudev Lal. LDM3D generates an image and a depth map from a given text prompt unlike the existing text-to-image diffusion models such as [Stable Diffusion](./overview) which only generates an image. With almost the same number of parameters, LDM3D achieves to create a latent space that can compress both the RGB images and the depth maps.

 Two checkpoints are available for use:
--- a/docs/source/en/api/pipelines/stable_diffusion/overview.md
+++ b/docs/source/en/api/pipelines/stable_diffusion/overview.md
@@ -12,10 +12,6 @@ specific language governing permissions and limitations under the License.

 # Stable Diffusion pipelines

-<div class="flex flex-wrap space-x-1">
-  <img alt="LoRA" src="https://img.shields.io/badge/LoRA-d8b4fe?style=flat"/>
-</div>
-
 Stable Diffusion is a text-to-image latent diffusion model created by the researchers and engineers from [CompVis](https://github.com/CompVis), [Stability AI](https://stability.ai/) and [LAION](https://laion.ai/). Latent diffusion applies the diffusion process over a lower dimensional latent space to reduce memory and compute complexity. This specific type of diffusion model was proposed in [High-Resolution Image Synthesis with Latent Diffusion Models](https://huggingface.co/papers/2112.10752) by Robin Rombach, Andreas Blattmann, Dominik Lorenz, Patrick Esser, Björn Ommer.

 Stable Diffusion is trained on 512x512 images from a subset of the LAION-5B dataset. This model uses a frozen CLIP ViT-L/14 text encoder to condition the model on text prompts. With its 860M UNet and 123M text encoder, the model is relatively lightweight and can run on consumer GPUs.
--- a/docs/source/en/api/pipelines/stable_diffusion/stable_diffusion_3.md
+++ b/docs/source/en/api/pipelines/stable_diffusion/stable_diffusion_3.md
@@ -12,11 +12,6 @@ specific language governing permissions and limitations under the License.

 # Stable Diffusion 3

-<div class="flex flex-wrap space-x-1">
-  <img alt="LoRA" src="https://img.shields.io/badge/LoRA-d8b4fe?style=flat"/>
-  <img alt="MPS" src="https://img.shields.io/badge/MPS-000000?style=flat&logo=apple&logoColor=white%22">
-</div>
-
 Stable Diffusion 3 (SD3) was proposed in [Scaling Rectified Flow Transformers for High-Resolution Image Synthesis](https://arxiv.org/pdf/2403.03206.pdf) by Patrick Esser, Sumith Kulal, Andreas Blattmann, Rahim Entezari, Jonas Muller, Harry Saini, Yam Levi, Dominik Lorenz, Axel Sauer, Frederic Boesel, Dustin Podell, Tim Dockhorn, Zion English, Kyle Lacey, Alex Goodwin, Yannik Marek, and Robin Rombach.

 The abstract from the paper is:
--- a/docs/source/en/api/pipelines/stable_diffusion/stable_diffusion_xl.md
+++ b/docs/source/en/api/pipelines/stable_diffusion/stable_diffusion_xl.md
@@ -12,11 +12,6 @@ specific language governing permissions and limitations under the License.

 # Stable Diffusion XL

-<div class="flex flex-wrap space-x-1">
-  <img alt="LoRA" src="https://img.shields.io/badge/LoRA-d8b4fe?style=flat"/>
-  <img alt="MPS" src="https://img.shields.io/badge/MPS-000000?style=flat&logo=apple&logoColor=white%22">
-</div>
-
 Stable Diffusion XL (SDXL) was proposed in [SDXL: Improving Latent Diffusion Models for High-Resolution Image Synthesis](https://huggingface.co/papers/2307.01952) by Dustin Podell, Zion English, Kyle Lacey, Andreas Blattmann, Tim Dockhorn, Jonas Müller, Joe Penna, and Robin Rombach.

 The abstract from the paper is:
--- a/docs/source/en/api/pipelines/stable_diffusion/text2img.md
+++ b/docs/source/en/api/pipelines/stable_diffusion/text2img.md
@@ -12,10 +12,6 @@ specific language governing permissions and limitations under the License.

 # Text-to-image

-<div class="flex flex-wrap space-x-1">
-  <img alt="LoRA" src="https://img.shields.io/badge/LoRA-d8b4fe?style=flat"/>
-</div>
-
 The Stable Diffusion model was created by researchers and engineers from [CompVis](https://github.com/CompVis), [Stability AI](https://stability.ai/), [Runway](https://github.com/runwayml), and [LAION](https://laion.ai/). The [`StableDiffusionPipeline`] is capable of generating photorealistic images given any text input. It's trained on 512x512 images from a subset of the LAION-5B dataset. This model uses a frozen CLIP ViT-L/14 text encoder to condition the model on text prompts. With its 860M UNet and 123M text encoder, the model is relatively lightweight and can run on consumer GPUs. Latent diffusion is the research on top of which Stable Diffusion was built. It was proposed in [High-Resolution Image Synthesis with Latent Diffusion Models](https://huggingface.co/papers/2112.10752) by Robin Rombach, Andreas Blattmann, Dominik Lorenz, Patrick Esser, Björn Ommer.

 The abstract from the paper is:
--- a/docs/source/en/api/pipelines/stable_diffusion/upscale.md
+++ b/docs/source/en/api/pipelines/stable_diffusion/upscale.md
@@ -12,10 +12,6 @@ specific language governing permissions and limitations under the License.

 # Super-resolution

-<div class="flex flex-wrap space-x-1">
-  <img alt="LoRA" src="https://img.shields.io/badge/LoRA-d8b4fe?style=flat"/>
-</div>
-
 The Stable Diffusion upscaler diffusion model was created by the researchers and engineers from [CompVis](https://github.com/CompVis), [Stability AI](https://stability.ai/), and [LAION](https://laion.ai/). It is used to enhance the resolution of input images by a factor of 4.

 <Tip>
--- a/docs/source/en/api/pipelines/stable_unclip.md
+++ b/docs/source/en/api/pipelines/stable_unclip.md
@@ -12,10 +12,6 @@ specific language governing permissions and limitations under the License.

 # Stable unCLIP

-<div class="flex flex-wrap space-x-1">
-  <img alt="LoRA" src="https://img.shields.io/badge/LoRA-d8b4fe?style=flat"/>
-</div>
-
 Stable unCLIP checkpoints are finetuned from [Stable Diffusion 2.1](./stable_diffusion/stable_diffusion_2) checkpoints to condition on CLIP image embeddings.
 Stable unCLIP still conditions on text embeddings. Given the two separate conditionings, stable unCLIP can be used
 for text guided image variation. When combined with an unCLIP prior, it can also be used for full text to image generation.
--- a/docs/source/en/api/pipelines/text_to_video.md
+++ b/docs/source/en/api/pipelines/text_to_video.md
@@ -18,10 +18,6 @@ specific language governing permissions and limitations under the License.

 # Text-to-video

-<div class="flex flex-wrap space-x-1">
-  <img alt="LoRA" src="https://img.shields.io/badge/LoRA-d8b4fe?style=flat"/>
-</div>
-
 [ModelScope Text-to-Video Technical Report](https://arxiv.org/abs/2308.06571) is by Jiuniu Wang, Hangjie Yuan, Dayou Chen, Yingya Zhang, Xiang Wang, Shiwei Zhang.

 The abstract from the paper is:
--- a/docs/source/en/api/pipelines/text_to_video_zero.md
+++ b/docs/source/en/api/pipelines/text_to_video_zero.md
@@ -12,10 +12,6 @@ specific language governing permissions and limitations under the License.

 # Text2Video-Zero

-<div class="flex flex-wrap space-x-1">
-  <img alt="LoRA" src="https://img.shields.io/badge/LoRA-d8b4fe?style=flat"/>
-</div>
-
 [Text2Video-Zero: Text-to-Image Diffusion Models are Zero-Shot Video Generators](https://huggingface.co/papers/2303.13439) is by Levon Khachatryan, Andranik Movsisyan, Vahram Tadevosyan, Roberto Henschel, [Zhangyang Wang](https://www.ece.utexas.edu/people/faculty/atlas-wang), Shant Navasardyan, [Humphrey Shi](https://www.humphreyshi.com).

 Text2Video-Zero enables zero-shot video generation using either:
--- a/docs/source/en/api/pipelines/unidiffuser.md
+++ b/docs/source/en/api/pipelines/unidiffuser.md
@@ -12,10 +12,6 @@ specific language governing permissions and limitations under the License.

 # UniDiffuser

-<div class="flex flex-wrap space-x-1">
-  <img alt="LoRA" src="https://img.shields.io/badge/LoRA-d8b4fe?style=flat"/>
-</div>
-
 The UniDiffuser model was proposed in [One Transformer Fits All Distributions in Multi-Modal Diffusion at Scale](https://huggingface.co/papers/2303.06555) by Fan Bao, Shen Nie, Kaiwen Xue, Chongxuan Li, Shi Pu, Yaole Wang, Gang Yue, Yue Cao, Hang Su, Jun Zhu.

 The abstract from the paper is:
--- a/docs/source/en/api/pipelines/visualcloze.md
+++ b/docs/source/en/api/pipelines/visualcloze.md
@@ -1,300 +0,0 @@
-<!--Copyright 2025 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-->
-
-# VisualCloze
-
-[VisualCloze: A Universal Image Generation Framework via Visual In-Context Learning](https://arxiv.org/abs/2504.07960) is an innovative in-context learning based universal image generation framework that offers key capabilities:
-1. Support for various in-domain tasks
-2. Generalization to unseen tasks through in-context learning
-3. Unify multiple tasks into one step and generate both target image and intermediate results
-4. Support reverse-engineering conditions from target images
-
-## Overview
-
-The abstract from the paper is:
-
-*Recent progress in diffusion models significantly advances various image generation tasks. However, the current mainstream approach remains focused on building task-specific models, which have limited efficiency when supporting a wide range of different needs. While universal models attempt to address this limitation, they face critical challenges, including generalizable task instruction, appropriate task distributions, and unified architectural design. To tackle these challenges, we propose VisualCloze, a universal image generation framework, which supports a wide range of in-domain tasks, generalization to unseen ones, unseen unification of multiple tasks, and reverse generation. Unlike existing methods that rely on language-based task instruction, leading to task ambiguity and weak generalization, we integrate visual in-context learning, allowing models to identify tasks from visual demonstrations. Meanwhile, the inherent sparsity of visual task distributions hampers the learning of transferable knowledge across tasks. To this end, we introduce Graph200K, a graph-structured dataset that establishes various interrelated tasks, enhancing task density and transferable knowledge. Furthermore, we uncover that our unified image generation formulation shared a consistent objective with image infilling, enabling us to leverage the strong generative priors of pre-trained infilling models without modifying the architectures. The codes, dataset, and models are available at https://visualcloze.github.io.*
-
-## Inference
-
-### Model loading
-
-VisualCloze is a two-stage cascade pipeline, containing `VisualClozeGenerationPipeline` and `VisualClozeUpsamplingPipeline`.
- In `VisualClozeGenerationPipeline`, each image is downsampled before concatenating images into a grid layout, avoiding excessively high resolutions. VisualCloze releases two models suitable for diffusers, i.e., [VisualClozePipeline-384](https://huggingface.co/VisualCloze/VisualClozePipeline-384) and [VisualClozePipeline-512](https://huggingface.co/VisualCloze/VisualClozePipeline-384), which downsample images to resolutions of 384 and 512, respectively. 
- `VisualClozeUpsamplingPipeline` uses [SDEdit](https://arxiv.org/abs/2108.01073) to enable high-resolution image synthesis.
-
-The `VisualClozePipeline` integrates both stages to support convenient end-to-end sampling, while also allowing users to utilize each pipeline independently as needed.
-
-### Input Specifications
-
-#### Task and Content Prompts
- Task prompt: Required to describe the generation task intention
- Content prompt: Optional description or caption of the target image
- When content prompt is not needed, pass `None`
- For batch inference, pass `List[str|None]`
-
-#### Image Input Format
- Format: `List[List[Image|None]]`
- Structure:
-  - All rows except the last represent in-context examples
-  - Last row represents the current query (target image set to `None`)
- For batch inference, pass `List[List[List[Image|None]]]`
-
-#### Resolution Control
- Default behavior:
-  - Initial generation in the first stage: area of ${pipe.resolution}^2$
-  - Upsampling in the second stage: 3x factor
- Custom resolution: Adjust using `upsampling_height` and `upsampling_width` parameters
-
-### Examples
-
-For comprehensive examples covering a wide range of tasks, please refer to the [Online Demo](https://huggingface.co/spaces/VisualCloze/VisualCloze) and [GitHub Repository](https://github.com/lzyhha/VisualCloze). Below are simple examples for three cases: mask-to-image conversion, edge detection, and subject-driven generation.
-
-#### Example for mask2image
-
-```python
-import torch
-from diffusers import VisualClozePipeline
-from diffusers.utils import load_image
-
-pipe = VisualClozePipeline.from_pretrained("VisualCloze/VisualClozePipeline-384", resolution=384, torch_dtype=torch.bfloat16)
-pipe.to("cuda")
-
-# Load in-context images (make sure the paths are correct and accessible)
-image_paths = [
-    # in-context examples
-    [
-        load_image('https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/visualcloze/visualcloze_mask2image_incontext-example-1_mask.jpg'),
-        load_image('https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/visualcloze/visualcloze_mask2image_incontext-example-1_image.jpg'),
-    ],
-    # query with the target image
-    [
-        load_image('https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/visualcloze/visualcloze_mask2image_query_mask.jpg'),
-        None, # No image needed for the target image
-    ],
-]
-
-# Task and content prompt
-task_prompt = "In each row, a logical task is demonstrated to achieve [IMAGE2] an aesthetically pleasing photograph based on [IMAGE1] sam 2-generated masks with rich color coding."
-content_prompt = """Majestic photo of a golden eagle perched on a rocky outcrop in a mountainous landscape. 
-The eagle is positioned in the right foreground, facing left, with its sharp beak and keen eyes prominently visible. 
-Its plumage is a mix of dark brown and golden hues, with intricate feather details. 
-The background features a soft-focus view of snow-capped mountains under a cloudy sky, creating a serene and grandiose atmosphere. 
-The foreground includes rugged rocks and patches of green moss. Photorealistic, medium depth of field, 
-soft natural lighting, cool color palette, high contrast, sharp focus on the eagle, blurred background, 
-tranquil, majestic, wildlife photography."""
-
-# Run the pipeline
-image_result = pipe(
-    task_prompt=task_prompt,
-    content_prompt=content_prompt,
-    image=image_paths,
-    upsampling_width=1344,
-    upsampling_height=768,
-    upsampling_strength=0.4,
-    guidance_scale=30,
-    num_inference_steps=30,
-    max_sequence_length=512,
-    generator=torch.Generator("cpu").manual_seed(0)
-).images[0][0]
-
-# Save the resulting image
-image_result.save("visualcloze.png")
-```
-
-#### Example for edge-detection
-
-```python
-import torch
-from diffusers import VisualClozePipeline
-from diffusers.utils import load_image
-
-pipe = VisualClozePipeline.from_pretrained("VisualCloze/VisualClozePipeline-384", resolution=384, torch_dtype=torch.bfloat16)
-pipe.to("cuda")
-
-# Load in-context images (make sure the paths are correct and accessible)
-image_paths = [
-    # in-context examples
-    [
-        load_image('https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/visualcloze/visualcloze_edgedetection_incontext-example-1_image.jpg'),
-        load_image('https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/visualcloze/visualcloze_edgedetection_incontext-example-1_edge.jpg'),
-    ],
-    [
-        load_image('https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/visualcloze/visualcloze_edgedetection_incontext-example-2_image.jpg'),
-        load_image('https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/visualcloze/visualcloze_edgedetection_incontext-example-2_edge.jpg'),
-    ],
-    # query with the target image
-    [
-        load_image('https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/visualcloze/visualcloze_edgedetection_query_image.jpg'),
-        None, # No image needed for the target image
-    ],
-]
-
-# Task and content prompt
-task_prompt = "Each row illustrates a pathway from [IMAGE1] a sharp and beautifully composed photograph to [IMAGE2] edge map with natural well-connected outlines using a clear logical task."
-content_prompt = ""
-
-# Run the pipeline
-image_result = pipe(
-    task_prompt=task_prompt,
-    content_prompt=content_prompt,
-    image=image_paths,
-    upsampling_width=864,
-    upsampling_height=1152,
-    upsampling_strength=0.4,
-    guidance_scale=30,
-    num_inference_steps=30,
-    max_sequence_length=512,
-    generator=torch.Generator("cpu").manual_seed(0)
-).images[0][0]
-
-# Save the resulting image
-image_result.save("visualcloze.png")
-```
-
-#### Example for subject-driven generation
-
-```python
-import torch
-from diffusers import VisualClozePipeline
-from diffusers.utils import load_image
-
-pipe = VisualClozePipeline.from_pretrained("VisualCloze/VisualClozePipeline-384", resolution=384, torch_dtype=torch.bfloat16)
-pipe.to("cuda")
-
-# Load in-context images (make sure the paths are correct and accessible)
-image_paths = [
-    # in-context examples
-    [
-        load_image('https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/visualcloze/visualcloze_subjectdriven_incontext-example-1_reference.jpg'),
-        load_image('https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/visualcloze/visualcloze_subjectdriven_incontext-example-1_depth.jpg'),
-        load_image('https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/visualcloze/visualcloze_subjectdriven_incontext-example-1_image.jpg'),
-    ],
-    [
-        load_image('https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/visualcloze/visualcloze_subjectdriven_incontext-example-2_reference.jpg'),
-        load_image('https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/visualcloze/visualcloze_subjectdriven_incontext-example-2_depth.jpg'),
-        load_image('https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/visualcloze/visualcloze_subjectdriven_incontext-example-2_image.jpg'),
-    ],
-    # query with the target image
-    [
-        load_image('https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/visualcloze/visualcloze_subjectdriven_query_reference.jpg'),
-        load_image('https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/visualcloze/visualcloze_subjectdriven_query_depth.jpg'),
-        None, # No image needed for the target image
-    ],
-]
-
-# Task and content prompt
-task_prompt = """Each row describes a process that begins with [IMAGE1] an image containing the key object, 
-[IMAGE2] depth map revealing gray-toned spatial layers and results in 
-[IMAGE3] an image with artistic qualitya high-quality image with exceptional detail."""
-content_prompt = """A vintage porcelain collector's item. Beneath a blossoming cherry tree in early spring, 
-this treasure is photographed up close, with soft pink petals drifting through the air and vibrant blossoms framing the scene."""
-
-# Run the pipeline
-image_result = pipe(
-    task_prompt=task_prompt,
-    content_prompt=content_prompt,
-    image=image_paths,
-    upsampling_width=1024,
-    upsampling_height=1024,
-    upsampling_strength=0.2,
-    guidance_scale=30,
-    num_inference_steps=30,
-    max_sequence_length=512,
-    generator=torch.Generator("cpu").manual_seed(0)
-).images[0][0]
-
-# Save the resulting image
-image_result.save("visualcloze.png")
-```
-
-#### Utilize each pipeline independently 
-
-```python
-import torch
-from diffusers import VisualClozeGenerationPipeline, FluxFillPipeline as VisualClozeUpsamplingPipeline
-from diffusers.utils import load_image
-from PIL import Image
-
-pipe = VisualClozeGenerationPipeline.from_pretrained(
-    "VisualCloze/VisualClozePipeline-384", resolution=384, torch_dtype=torch.bfloat16
-)
-pipe.to("cuda")
-
-image_paths = [
-    # in-context examples
-    [
-        load_image(
-            "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/visualcloze/visualcloze_mask2image_incontext-example-1_mask.jpg"
-        ),
-        load_image(
-            "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/visualcloze/visualcloze_mask2image_incontext-example-1_image.jpg"
-        ),
-    ],
-    # query with the target image
-    [
-        load_image(
-            "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/visualcloze/visualcloze_mask2image_query_mask.jpg"
-        ),
-        None,  # No image needed for the target image
-    ],
-]
-task_prompt = "In each row, a logical task is demonstrated to achieve [IMAGE2] an aesthetically pleasing photograph based on [IMAGE1] sam 2-generated masks with rich color coding."
-content_prompt = "Majestic photo of a golden eagle perched on a rocky outcrop in a mountainous landscape. The eagle is positioned in the right foreground, facing left, with its sharp beak and keen eyes prominently visible. Its plumage is a mix of dark brown and golden hues, with intricate feather details. The background features a soft-focus view of snow-capped mountains under a cloudy sky, creating a serene and grandiose atmosphere. The foreground includes rugged rocks and patches of green moss. Photorealistic, medium depth of field, soft natural lighting, cool color palette, high contrast, sharp focus on the eagle, blurred background, tranquil, majestic, wildlife photography."
-
-# Stage 1: Generate initial image
-image = pipe(
-    task_prompt=task_prompt,
-    content_prompt=content_prompt,
-    image=image_paths,
-    guidance_scale=30,
-    num_inference_steps=30,
-    max_sequence_length=512,
-    generator=torch.Generator("cpu").manual_seed(0),
-).images[0][0]
-
-# Stage 2 (optional): Upsample the generated image
-pipe_upsample = VisualClozeUpsamplingPipeline.from_pipe(pipe)
-pipe_upsample.to("cuda")
-
-mask_image = Image.new("RGB", image.size, (255, 255, 255))
-
-image = pipe_upsample(
-    image=image,
-    mask_image=mask_image,
-    prompt=content_prompt,
-    width=1344,
-    height=768,
-    strength=0.4,
-    guidance_scale=30,
-    num_inference_steps=30,
-    max_sequence_length=512,
-    generator=torch.Generator("cpu").manual_seed(0),
-).images[0]
-
-image.save("visualcloze.png")
-```
-
-## VisualClozePipeline
-
-[[autodoc]] VisualClozePipeline
-  - all
-  - __call__
-
-## VisualClozeGenerationPipeline
-
-[[autodoc]] VisualClozeGenerationPipeline
-  - all
-  - __call__
--- a/docs/source/en/api/pipelines/wan.md
+++ b/docs/source/en/api/pipelines/wan.md
@@ -1,519 +0,0 @@
-<!-- Copyright 2024 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License. -->
-
-# Wan
-
-<div class="flex flex-wrap space-x-1">
-  <img alt="LoRA" src="https://img.shields.io/badge/LoRA-d8b4fe?style=flat"/>
-</div>
-
-[Wan 2.1](https://github.com/Wan-Video/Wan2.1) by the Alibaba Wan Team.
-
-<!-- TODO(aryan): update abstract once paper is out -->
-
-## Generating Videos with Wan 2.1
-
-We will first need to install some additional dependencies.
-
-```shell
-pip install -U ftfy imageio-ffmpeg imageio
-```
-
-### Text to Video Generation
-
-The following example requires 11GB VRAM to run and uses the smaller `Wan-AI/Wan2.1-T2V-1.3B-Diffusers` model. You can switch it out
-for the larger `Wan-AI/Wan2.1-I2V-14B-720P-Diffusers` or `Wan-AI/Wan2.1-I2V-14B-480P-Diffusers` if you have at least 35GB VRAM available.
-
-```python
-from diffusers import WanPipeline
-from diffusers.utils import export_to_video
-
-# Available models: Wan-AI/Wan2.1-I2V-14B-720P-Diffusers or Wan-AI/Wan2.1-I2V-14B-480P-Diffusers
-model_id = "Wan-AI/Wan2.1-T2V-1.3B-Diffusers"
-
-pipe = WanPipeline.from_pretrained(model_id, torch_dtype=torch.bfloat16)
-pipe.enable_model_cpu_offload()
-
-prompt = "A cat and a dog baking a cake together in a kitchen. The cat is carefully measuring flour, while the dog is stirring the batter with a wooden spoon. The kitchen is cozy, with sunlight streaming through the window."
-negative_prompt = "Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards"
-num_frames = 33
-
-frames = pipe(prompt=prompt, negative_prompt=negative_prompt, num_frames=num_frames).frames[0]
-export_to_video(frames, "wan-t2v.mp4", fps=16)
-```
-
-<Tip>
-You can improve the quality of the generated video by running the decoding step in full precision.
-</Tip>
-
-```python
-from diffusers import WanPipeline, AutoencoderKLWan
-from diffusers.utils import export_to_video
-
-model_id = "Wan-AI/Wan2.1-T2V-1.3B-Diffusers"
-
-vae = AutoencoderKLWan.from_pretrained(model_id, subfolder="vae", torch_dtype=torch.float32)
-pipe = WanPipeline.from_pretrained(model_id, vae=vae, torch_dtype=torch.bfloat16)
-
-# replace this with pipe.to("cuda") if you have sufficient VRAM
-pipe.enable_model_cpu_offload()
-
-prompt = "A cat and a dog baking a cake together in a kitchen. The cat is carefully measuring flour, while the dog is stirring the batter with a wooden spoon. The kitchen is cozy, with sunlight streaming through the window."
-negative_prompt = "Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards"
-num_frames = 33
-
-frames = pipe(prompt=prompt, num_frames=num_frames).frames[0]
-export_to_video(frames, "wan-t2v.mp4", fps=16)
-```
-
-### Image to Video Generation
-
-The Image to Video pipeline requires loading the `AutoencoderKLWan` and the `CLIPVisionModel` components in full precision. The following example will need at least
-35GB of VRAM to run.
-
-```python
-import torch
-import numpy as np
-from diffusers import AutoencoderKLWan, WanImageToVideoPipeline
-from diffusers.utils import export_to_video, load_image
-from transformers import CLIPVisionModel
-
-# Available models: Wan-AI/Wan2.1-I2V-14B-480P-Diffusers, Wan-AI/Wan2.1-I2V-14B-720P-Diffusers
-model_id = "Wan-AI/Wan2.1-I2V-14B-480P-Diffusers"
-image_encoder = CLIPVisionModel.from_pretrained(
-    model_id, subfolder="image_encoder", torch_dtype=torch.float32
-)
-vae = AutoencoderKLWan.from_pretrained(model_id, subfolder="vae", torch_dtype=torch.float32)
-pipe = WanImageToVideoPipeline.from_pretrained(
-    model_id, vae=vae, image_encoder=image_encoder, torch_dtype=torch.bfloat16
-)
-
-# replace this with pipe.to("cuda") if you have sufficient VRAM
-pipe.enable_model_cpu_offload()
-
-image = load_image(
-    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/astronaut.jpg"
-)
-
-max_area = 480 * 832
-aspect_ratio = image.height / image.width
-mod_value = pipe.vae_scale_factor_spatial * pipe.transformer.config.patch_size[1]
-height = round(np.sqrt(max_area * aspect_ratio)) // mod_value * mod_value
-width = round(np.sqrt(max_area / aspect_ratio)) // mod_value * mod_value
-image = image.resize((width, height))
-
-prompt = (
-    "An astronaut hatching from an egg, on the surface of the moon, the darkness and depth of space realised in "
-    "the background. High quality, ultrarealistic detail and breath-taking movie-like camera shot."
-)
-negative_prompt = "Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards"
-
-num_frames = 33
-
-output = pipe(
-    image=image,
-    prompt=prompt,
-    negative_prompt=negative_prompt,
-    height=height,
-    width=width,
-    num_frames=num_frames,
-    guidance_scale=5.0,
-).frames[0]
-export_to_video(output, "wan-i2v.mp4", fps=16)
-```
-
-### First and Last Frame Interpolation
-
-```python
-import numpy as np
-import torch
-import torchvision.transforms.functional as TF
-from diffusers import AutoencoderKLWan, WanImageToVideoPipeline
-from diffusers.utils import export_to_video, load_image
-from transformers import CLIPVisionModel
-
-
-model_id = "Wan-AI/Wan2.1-FLF2V-14B-720P-diffusers"
-image_encoder = CLIPVisionModel.from_pretrained(model_id, subfolder="image_encoder", torch_dtype=torch.float32)
-vae = AutoencoderKLWan.from_pretrained(model_id, subfolder="vae", torch_dtype=torch.float32)
-pipe = WanImageToVideoPipeline.from_pretrained(
-    model_id, vae=vae, image_encoder=image_encoder, torch_dtype=torch.bfloat16
-)
-pipe.to("cuda")
-
-first_frame = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/flf2v_input_first_frame.png")
-last_frame = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/flf2v_input_last_frame.png")
-
-def aspect_ratio_resize(image, pipe, max_area=720 * 1280):
-    aspect_ratio = image.height / image.width
-    mod_value = pipe.vae_scale_factor_spatial * pipe.transformer.config.patch_size[1]
-    height = round(np.sqrt(max_area * aspect_ratio)) // mod_value * mod_value
-    width = round(np.sqrt(max_area / aspect_ratio)) // mod_value * mod_value
-    image = image.resize((width, height))
-    return image, height, width
-
-def center_crop_resize(image, height, width):
-    # Calculate resize ratio to match first frame dimensions
-    resize_ratio = max(width / image.width, height / image.height)
-    
-    # Resize the image
-    width = round(image.width * resize_ratio)
-    height = round(image.height * resize_ratio)
-    size = [width, height]
-    image = TF.center_crop(image, size)
-    
-    return image, height, width
-
-first_frame, height, width = aspect_ratio_resize(first_frame, pipe)
-if last_frame.size != first_frame.size:
-    last_frame, _, _ = center_crop_resize(last_frame, height, width)
-
-prompt = "CG animation style, a small blue bird takes off from the ground, flapping its wings. The bird's feathers are delicate, with a unique pattern on its chest. The background shows a blue sky with white clouds under bright sunshine. The camera follows the bird upward, capturing its flight and the vastness of the sky from a close-up, low-angle perspective."
-
-output = pipe(
-    image=first_frame, last_image=last_frame, prompt=prompt, height=height, width=width, guidance_scale=5.5
-).frames[0]
-export_to_video(output, "output.mp4", fps=16)
-```
-
-### Video to Video Generation
-
-```python
-import torch
-from diffusers.utils import load_video, export_to_video
-from diffusers import AutoencoderKLWan, WanVideoToVideoPipeline, UniPCMultistepScheduler
-
-# Available models: Wan-AI/Wan2.1-T2V-14B-Diffusers, Wan-AI/Wan2.1-T2V-1.3B-Diffusers
-model_id = "Wan-AI/Wan2.1-T2V-1.3B-Diffusers"
-vae = AutoencoderKLWan.from_pretrained(
-    model_id, subfolder="vae", torch_dtype=torch.float32
-)
-pipe = WanVideoToVideoPipeline.from_pretrained(
-    model_id, vae=vae, torch_dtype=torch.bfloat16
-)
-flow_shift = 3.0  # 5.0 for 720P, 3.0 for 480P
-pipe.scheduler = UniPCMultistepScheduler.from_config(
-    pipe.scheduler.config, flow_shift=flow_shift
-)
-# change to pipe.to("cuda") if you have sufficient VRAM
-pipe.enable_model_cpu_offload()
-
-prompt = "A robot standing on a mountain top. The sun is setting in the background"
-negative_prompt = "Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards"
-video = load_video(
-    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/hiker.mp4"
-)
-output = pipe(
-    video=video,
-    prompt=prompt,
-    negative_prompt=negative_prompt,
-    height=480,
-    width=512,
-    guidance_scale=7.0,
-    strength=0.7,
-).frames[0]
-
-export_to_video(output, "wan-v2v.mp4", fps=16)
-```
-
-## Memory Optimizations for Wan 2.1
-
-Base inference with the large 14B Wan 2.1 models can take up to 35GB of VRAM when generating videos at 720p resolution. We'll outline a few memory optimizations we can apply to reduce the VRAM required to run the model.
-
-We'll use `Wan-AI/Wan2.1-I2V-14B-720P-Diffusers` model in these examples to demonstrate the memory savings, but the techniques are applicable to all model checkpoints.
-
-### Group Offloading the Transformer and UMT5 Text Encoder
-
-Find more information about group offloading [here](../optimization/memory.md)
-
-#### Block Level Group Offloading
-
-We can reduce our VRAM requirements by applying group offloading to the larger model components of the pipeline: the `WanTransformer3DModel` and `UMT5EncoderModel`. Group offloading will break up the individual modules of a model and offload/onload them onto your GPU as needed during inference. In this example, we'll apply `block_level` offloading, which will group the modules in a model into blocks of size `num_blocks_per_group` and offload/onload them to GPU. Moving modules between CPU and GPU does add latency to the inference process. You can trade off between latency and memory savings by increasing or decreasing the `num_blocks_per_group`.
-
-The following example will now only require 14GB of VRAM to run, but will take approximately 30 minutes to generate a video.
-
-```python
-import torch
-import numpy as np
-from diffusers import AutoencoderKLWan, WanTransformer3DModel, WanImageToVideoPipeline
-from diffusers.hooks.group_offloading import apply_group_offloading
-from diffusers.utils import export_to_video, load_image
-from transformers import UMT5EncoderModel, CLIPVisionModel
-
-# Available models: Wan-AI/Wan2.1-I2V-14B-480P-Diffusers, Wan-AI/Wan2.1-I2V-14B-720P-Diffusers
-model_id = "Wan-AI/Wan2.1-I2V-14B-720P-Diffusers"
-image_encoder = CLIPVisionModel.from_pretrained(
-    model_id, subfolder="image_encoder", torch_dtype=torch.float32
-)
-
-text_encoder = UMT5EncoderModel.from_pretrained(model_id, subfolder="text_encoder", torch_dtype=torch.bfloat16)
-vae = AutoencoderKLWan.from_pretrained(model_id, subfolder="vae", torch_dtype=torch.float32)
-transformer = WanTransformer3DModel.from_pretrained(model_id, subfolder="transformer", torch_dtype=torch.bfloat16)
-
-onload_device = torch.device("cuda")
-offload_device = torch.device("cpu")
-
-apply_group_offloading(text_encoder,
-    onload_device=onload_device,
-    offload_device=offload_device,
-    offload_type="block_level",
-    num_blocks_per_group=4
-)
-
-transformer.enable_group_offload(
-    onload_device=onload_device,
-    offload_device=offload_device,
-    offload_type="block_level",
-    num_blocks_per_group=4,
-)
-pipe = WanImageToVideoPipeline.from_pretrained(
-    model_id,
-    vae=vae,
-    transformer=transformer,
-    text_encoder=text_encoder,
-    image_encoder=image_encoder,
-    torch_dtype=torch.bfloat16
-)
-# Since we've offloaded the larger models already, we can move the rest of the model components to GPU
-pipe.to("cuda")
-
-image = load_image(
-    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/astronaut.jpg"
-)
-
-max_area = 720 * 832
-aspect_ratio = image.height / image.width
-mod_value = pipe.vae_scale_factor_spatial * pipe.transformer.config.patch_size[1]
-height = round(np.sqrt(max_area * aspect_ratio)) // mod_value * mod_value
-width = round(np.sqrt(max_area / aspect_ratio)) // mod_value * mod_value
-image = image.resize((width, height))
-
-prompt = (
-    "An astronaut hatching from an egg, on the surface of the moon, the darkness and depth of space realised in "
-    "the background. High quality, ultrarealistic detail and breath-taking movie-like camera shot."
-)
-negative_prompt = "Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards"
-
-num_frames = 33
-
-output = pipe(
-    image=image,
-    prompt=prompt,
-    negative_prompt=negative_prompt,
-    height=height,
-    width=width,
-    num_frames=num_frames,
-    guidance_scale=5.0,
-).frames[0]
-
-export_to_video(output, "wan-i2v.mp4", fps=16)
-```
-
-#### Block Level Group Offloading with CUDA Streams
-
-We can speed up group offloading inference by enabling the use of [CUDA streams](https://pytorch.org/docs/stable/generated/torch.cuda.Stream.html). However, using CUDA streams requires moving the model parameters into pinned memory. This allocation is handled by PyTorch under the hood, and can result in a significant spike in CPU RAM usage. Please consider this option if your CPU RAM is at least 2X the size of the model you are group offloading.
-
-In the following example we will use CUDA streams when group offloading the `WanTransformer3DModel`. When testing on an A100, this example will require 14GB of VRAM and 52GB of CPU RAM, but will generate a video in approximately 9 minutes.
-
-```python
-import torch
-import numpy as np
-from diffusers import AutoencoderKLWan, WanTransformer3DModel, WanImageToVideoPipeline
-from diffusers.hooks.group_offloading import apply_group_offloading
-from diffusers.utils import export_to_video, load_image
-from transformers import UMT5EncoderModel, CLIPVisionModel
-
-# Available models: Wan-AI/Wan2.1-I2V-14B-480P-Diffusers, Wan-AI/Wan2.1-I2V-14B-720P-Diffusers
-model_id = "Wan-AI/Wan2.1-I2V-14B-720P-Diffusers"
-image_encoder = CLIPVisionModel.from_pretrained(
-    model_id, subfolder="image_encoder", torch_dtype=torch.float32
-)
-
-text_encoder = UMT5EncoderModel.from_pretrained(model_id, subfolder="text_encoder", torch_dtype=torch.bfloat16)
-vae = AutoencoderKLWan.from_pretrained(model_id, subfolder="vae", torch_dtype=torch.float32)
-transformer = WanTransformer3DModel.from_pretrained(model_id, subfolder="transformer", torch_dtype=torch.bfloat16)
-
-onload_device = torch.device("cuda")
-offload_device = torch.device("cpu")
-
-apply_group_offloading(text_encoder,
-    onload_device=onload_device,
-    offload_device=offload_device,
-    offload_type="block_level",
-    num_blocks_per_group=4
-)
-
-transformer.enable_group_offload(
-    onload_device=onload_device,
-    offload_device=offload_device,
-    offload_type="leaf_level",
-    use_stream=True
-)
-pipe = WanImageToVideoPipeline.from_pretrained(
-    model_id,
-    vae=vae,
-    transformer=transformer,
-    text_encoder=text_encoder,
-    image_encoder=image_encoder,
-    torch_dtype=torch.bfloat16
-)
-# Since we've offloaded the larger models already, we can move the rest of the model components to GPU
-pipe.to("cuda")
-
-image = load_image(
-    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/astronaut.jpg"
-)
-
-max_area = 720 * 832
-aspect_ratio = image.height / image.width
-mod_value = pipe.vae_scale_factor_spatial * pipe.transformer.config.patch_size[1]
-height = round(np.sqrt(max_area * aspect_ratio)) // mod_value * mod_value
-width = round(np.sqrt(max_area / aspect_ratio)) // mod_value * mod_value
-image = image.resize((width, height))
-
-prompt = (
-    "An astronaut hatching from an egg, on the surface of the moon, the darkness and depth of space realised in "
-    "the background. High quality, ultrarealistic detail and breath-taking movie-like camera shot."
-)
-negative_prompt = "Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards"
-
-num_frames = 33
-
-output = pipe(
-    image=image,
-    prompt=prompt,
-    negative_prompt=negative_prompt,
-    height=height,
-    width=width,
-    num_frames=num_frames,
-    guidance_scale=5.0,
-).frames[0]
-
-export_to_video(output, "wan-i2v.mp4", fps=16)
-```
-
-### Applying Layerwise Casting to the Transformer
-
-Find more information about layerwise casting [here](../optimization/memory.md)
-
-In this example, we will combine model offloading with layerwise casting. Layerwise casting will downcast each layer's weights to `torch.float8_e4m3fn`, temporarily upcast to `torch.bfloat16` during the forward pass of the layer, then revert to `torch.float8_e4m3fn` afterward. This approach reduces memory requirements by approximately 50% while introducing a minor quality reduction in the generated video due to the precision trade-off.
-
-This example will require 20GB of VRAM.
-
-```python
-import torch
-import numpy as np
-from diffusers import AutoencoderKLWan, WanTransformer3DModel, WanImageToVideoPipeline
-from diffusers.hooks.group_offloading import apply_group_offloading
-from diffusers.utils import export_to_video, load_image
-from transformers import UMT5EncoderModel, CLIPVisionModel
-
-model_id = "Wan-AI/Wan2.1-I2V-14B-720P-Diffusers"
-image_encoder = CLIPVisionModel.from_pretrained(
-    model_id, subfolder="image_encoder", torch_dtype=torch.float32
-)
-text_encoder = UMT5EncoderModel.from_pretrained(model_id, subfolder="text_encoder", torch_dtype=torch.bfloat16)
-vae = AutoencoderKLWan.from_pretrained(model_id, subfolder="vae", torch_dtype=torch.float32)
-
-transformer = WanTransformer3DModel.from_pretrained(model_id, subfolder="transformer", torch_dtype=torch.bfloat16)
-transformer.enable_layerwise_casting(storage_dtype=torch.float8_e4m3fn, compute_dtype=torch.bfloat16)
-
-pipe = WanImageToVideoPipeline.from_pretrained(
-    model_id,
-    vae=vae,
-    transformer=transformer,
-    text_encoder=text_encoder,
-    image_encoder=image_encoder,
-    torch_dtype=torch.bfloat16
-)
-pipe.enable_model_cpu_offload()
-image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/astronaut.jpg")
-
-max_area = 720 * 832
-aspect_ratio = image.height / image.width
-mod_value = pipe.vae_scale_factor_spatial * pipe.transformer.config.patch_size[1]
-height = round(np.sqrt(max_area * aspect_ratio)) // mod_value * mod_value
-width = round(np.sqrt(max_area / aspect_ratio)) // mod_value * mod_value
-image = image.resize((width, height))
-prompt = (
-    "An astronaut hatching from an egg, on the surface of the moon, the darkness and depth of space realised in "
-    "the background. High quality, ultrarealistic detail and breath-taking movie-like camera shot."
-)
-negative_prompt = "Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards"
-num_frames = 33
-
-output = pipe(
-    image=image,
-    prompt=prompt,
-    negative_prompt=negative_prompt,
-    height=height,
-    width=width,
-    num_frames=num_frames,
-    num_inference_steps=50,
-    guidance_scale=5.0,
-).frames[0]
-export_to_video(output, "wan-i2v.mp4", fps=16)
-```
-
-## Using a Custom Scheduler
-
-Wan can be used with many different schedulers, each with their own benefits regarding speed and generation quality. By default, Wan uses the `UniPCMultistepScheduler(prediction_type="flow_prediction", use_flow_sigmas=True, flow_shift=3.0)` scheduler. You can use a different scheduler as follows:
-
-```python
-from diffusers import FlowMatchEulerDiscreteScheduler, UniPCMultistepScheduler, WanPipeline
-
-scheduler_a = FlowMatchEulerDiscreteScheduler(shift=5.0)
-scheduler_b = UniPCMultistepScheduler(prediction_type="flow_prediction", use_flow_sigmas=True, flow_shift=4.0)
-
-pipe = WanPipeline.from_pretrained("Wan-AI/Wan2.1-T2V-1.3B-Diffusers", scheduler=<CUSTOM_SCHEDULER_HERE>)
-
-# or,
-pipe.scheduler = <CUSTOM_SCHEDULER_HERE>
-```
-
-## Using Single File Loading with Wan 2.1
-
-The `WanTransformer3DModel` and `AutoencoderKLWan` models support loading checkpoints in their original format via the `from_single_file` loading
-method.
-
-```python
-import torch
-from diffusers import WanPipeline, WanTransformer3DModel
-
-ckpt_path = "https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/blob/main/split_files/diffusion_models/wan2.1_t2v_1.3B_bf16.safetensors"
-transformer = WanTransformer3DModel.from_single_file(ckpt_path, torch_dtype=torch.bfloat16)
-
-pipe = WanPipeline.from_pretrained("Wan-AI/Wan2.1-T2V-1.3B-Diffusers", transformer=transformer)
-```
-
-## Recommendations for Inference
- Keep `AutoencoderKLWan` in `torch.float32` for better decoding quality.
- `num_frames` should satisfy the following constraint: `(num_frames - 1) % 4 == 0`
- For smaller resolution videos, try lower values of `shift` (between `2.0` and `5.0`) in the [Scheduler](https://huggingface.co/docs/diffusers/main/en/api/schedulers/flow_match_euler_discrete#diffusers.FlowMatchEulerDiscreteScheduler.shift). For larger resolution videos, try higher values (between `7.0` and `12.0`). The default value is `3.0` for Wan.
-
-## WanPipeline
-
-[[autodoc]] WanPipeline
-  - all
-  - __call__
-
-## WanImageToVideoPipeline
-
-[[autodoc]] WanImageToVideoPipeline
-  - all
-  - __call__
-
-## WanPipelineOutput
-
-[[autodoc]] pipelines.wan.pipeline_output.WanPipelineOutput
--- a/docs/source/en/api/pipelines/wuerstchen.md
+++ b/docs/source/en/api/pipelines/wuerstchen.md
@@ -12,10 +12,6 @@ specific language governing permissions and limitations under the License.

 # Würstchen

-<div class="flex flex-wrap space-x-1">
-  <img alt="LoRA" src="https://img.shields.io/badge/LoRA-d8b4fe?style=flat"/>
-</div>
-
 <img src="https://github.com/dome272/Wuerstchen/assets/61938694/0617c863-165a-43ee-9303-2a17299a0cf9">

 [Wuerstchen: An Efficient Architecture for Large-Scale Text-to-Image Diffusion Models](https://huggingface.co/papers/2306.00637) is by Pablo Pernias, Dominic Rampas, Mats L. Richter and Christopher Pal and Marc Aubreville.
--- a/docs/source/en/api/quantization.md
+++ b/docs/source/en/api/quantization.md
@@ -13,7 +13,9 @@ specific language governing permissions and limitations under the License.

 # Quantization

-Quantization techniques reduce memory and computational costs by representing weights and activations with lower-precision data types like 8-bit integers (int8). This enables loading larger models you normally wouldn't be able to fit into memory, and speeding up inference.
+Quantization techniques reduce memory and computational costs by representing weights and activations with lower-precision data types like 8-bit integers (int8). This enables loading larger models you normally wouldn't be able to fit into memory, and speeding up inference. Diffusers supports 8-bit and 4-bit quantization with [bitsandbytes](https://huggingface.co/docs/bitsandbytes/en/index).
+
+Quantization techniques that aren't supported in Transformers can be added with the [`DiffusersQuantizer`] class.

 <Tip>

@@ -21,9 +23,6 @@ Learn how to quantize models in the [Quantization](../quantization/overview) gui

 </Tip>

-## PipelineQuantizationConfig
-
-[[autodoc]] quantizers.PipelineQuantizationConfig

 ## BitsAndBytesConfig

@@ -32,11 +31,6 @@ Learn how to quantize models in the [Quantization](../quantization/overview) gui
 ## GGUFQuantizationConfig

 [[autodoc]] GGUFQuantizationConfig
-
-## QuantoConfig
-
-[[autodoc]] QuantoConfig
-
 ## TorchAoConfig

 [[autodoc]] TorchAoConfig
--- a/docs/source/en/api/schedulers/ddim_cogvideox.md
+++ b/docs/source/en/api/schedulers/ddim_cogvideox.md
@@ -1,19 +0,0 @@
-<!--Copyright 2024 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-->
-
-# CogVideoXDDIMScheduler
-
-`CogVideoXDDIMScheduler` is based on [Denoising Diffusion Implicit Models](https://huggingface.co/papers/2010.02502), specifically for CogVideoX models.
-
-## CogVideoXDDIMScheduler
-
-[[autodoc]] CogVideoXDDIMScheduler
--- a/docs/source/en/api/schedulers/multistep_dpm_solver_cogvideox.md
+++ b/docs/source/en/api/schedulers/multistep_dpm_solver_cogvideox.md
@@ -1,19 +0,0 @@
-<!--Copyright 2024 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-->
-
-# CogVideoXDPMScheduler
-
-`CogVideoXDPMScheduler` is based on [DPM-Solver: A Fast ODE Solver for Diffusion Probabilistic Model Sampling in Around 10 Steps](https://huggingface.co/papers/2206.00927) and [DPM-Solver++: Fast Solver for Guided Sampling of Diffusion Probabilistic Models](https://huggingface.co/papers/2211.01095), specifically for CogVideoX models.
-
-## CogVideoXDPMScheduler
-
-[[autodoc]] CogVideoXDPMScheduler
--- a/docs/source/en/community_projects.md
+++ b/docs/source/en/community_projects.md
@@ -83,8 +83,4 @@ Happy exploring, and thank you for being part of the Diffusers community!
    <td><a href="https://github.com/suzukimain/auto_diffusers"> Model Search </a></td>
    <td>Search models on Civitai and Hugging Face</td>
  </tr>
-  <tr style="border-top: 2px solid black">
-    <td><a href="https://github.com/beinsezii/skrample"> Skrample </a></td>
-    <td>Fully modular scheduler functions with 1st class diffusers integration.</td>
-  </tr>
 </table>
--- a/docs/source/en/conceptual/evaluation.md
+++ b/docs/source/en/conceptual/evaluation.md
@@ -16,11 +16,6 @@ specific language governing permissions and limitations under the License.
    <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
 </a>

-> [!TIP]
-> This document has now grown outdated given the emergence of existing evaluation frameworks for diffusion models for image generation. Please check
-> out works like [HEIM](https://crfm.stanford.edu/helm/heim/latest/), [T2I-Compbench](https://arxiv.org/abs/2307.06350),
-> [GenEval](https://arxiv.org/abs/2310.11513).
-
 Evaluation of generative models like [Stable Diffusion](https://huggingface.co/docs/diffusers/stable_diffusion) is subjective in nature. But as practitioners and researchers, we often have to make careful choices amongst many different possibilities. So, when working with different generative models (like GANs, Diffusion, etc.), how do we choose one over the other?

 Qualitative evaluation of such models can be error-prone and might incorrectly influence a decision.
--- a/docs/source/en/hybrid_inference/api_reference.md
+++ b/docs/source/en/hybrid_inference/api_reference.md
@@ -1,9 +0,0 @@
-# Hybrid Inference API Reference
-
-## Remote Decode
-
-[[autodoc]] utils.remote_utils.remote_decode
-
-## Remote Encode
-
-[[autodoc]] utils.remote_utils.remote_encode
--- a/docs/source/en/hybrid_inference/overview.md
+++ b/docs/source/en/hybrid_inference/overview.md
@@ -1,60 +0,0 @@
-<!--Copyright 2024 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-->
-
-# Hybrid Inference
-
-**Empowering local AI builders with Hybrid Inference**
-
-
-> [!TIP]
-> Hybrid Inference is an [experimental feature](https://huggingface.co/blog/remote_vae).
-> Feedback can be provided [here](https://github.com/huggingface/diffusers/issues/new?template=remote-vae-pilot-feedback.yml).
-
-
-
-## Why use Hybrid Inference?
-
-Hybrid Inference offers a fast and simple way to offload local generation requirements.
-
- 🚀 **Reduced Requirements:** Access powerful models without expensive hardware.
- 💎 **Without Compromise:** Achieve the highest quality without sacrificing performance.
- 💰 **Cost Effective:** It's free! 🤑
- 🎯 **Diverse Use Cases:** Fully compatible with Diffusers 🧨 and the wider community.
- 🔧 **Developer-Friendly:** Simple requests, fast responses.
-
---
-
-## Available Models
-
-* **VAE Decode 🖼️:** Quickly decode latent representations into high-quality images without compromising performance or workflow speed.
-* **VAE Encode 🔢:** Efficiently encode images into latent representations for generation and training.
-* **Text Encoders 📃 (coming soon):** Compute text embeddings for your prompts quickly and accurately, ensuring a smooth and high-quality workflow.
-
---
-
-## Integrations
-
-* **[SD.Next](https://github.com/vladmandic/sdnext):** All-in-one UI with direct support for Hybrid Inference.
-* **[ComfyUI-HFRemoteVae](https://github.com/kijai/ComfyUI-HFRemoteVae):** ComfyUI node for Hybrid Inference.
-
-## Changelog
-
- March 10 2025: Added VAE encode
- March 2 2025: Initial release with VAE decoding
-
-## Contents
-
-The documentation is organized into three sections:
-
-* **VAE Decode** Learn the basics of how to use VAE Decode with Hybrid Inference.
-* **VAE Encode** Learn the basics of how to use VAE Encode with Hybrid Inference.
-* **API Reference** Dive into task-specific settings and parameters.
--- a/docs/source/en/hybrid_inference/vae_decode.md
+++ b/docs/source/en/hybrid_inference/vae_decode.md
@@ -1,345 +0,0 @@
-# Getting Started: VAE Decode with Hybrid Inference
-
-VAE decode is an essential component of diffusion models - turning latent representations into images or videos.
-
-## Memory
-
-These tables demonstrate the VRAM requirements for VAE decode with SD v1 and SD XL on different GPUs.
-
-For the majority of these GPUs, the memory usage % dictates that other models (text encoders, UNet/Transformer) must be offloaded, or that tiled decoding must be used, which increases the time taken and impacts quality.
-
-<details><summary>SD v1.5</summary>
-
-| GPU | Resolution | Time (seconds) | Memory (%) | Tiled Time (secs) | Tiled Memory (%) |
-| --- | --- | --- | --- | --- | --- |
-| NVIDIA GeForce RTX 4090 | 512x512 | 0.031 | 5.60% | 0.031 (0%) | 5.60% |
-| NVIDIA GeForce RTX 4090 | 1024x1024 | 0.148 | 20.00% | 0.301 (+103%) | 5.60% |
-| NVIDIA GeForce RTX 4080 | 512x512 | 0.05 | 8.40% | 0.050 (0%) | 8.40% |
-| NVIDIA GeForce RTX 4080 | 1024x1024 | 0.224 | 30.00% | 0.356 (+59%) | 8.40% |
-| NVIDIA GeForce RTX 4070 Ti | 512x512 | 0.066 | 11.30% | 0.066 (0%) | 11.30% |
-| NVIDIA GeForce RTX 4070 Ti | 1024x1024 | 0.284 | 40.50% | 0.454 (+60%) | 11.40% |
-| NVIDIA GeForce RTX 3090 | 512x512 | 0.062 | 5.20% | 0.062 (0%) | 5.20% |
-| NVIDIA GeForce RTX 3090 | 1024x1024 | 0.253 | 18.50% | 0.464 (+83%) | 5.20% |
-| NVIDIA GeForce RTX 3080 | 512x512 | 0.07 | 12.80% | 0.070 (0%) | 12.80% |
-| NVIDIA GeForce RTX 3080 | 1024x1024 | 0.286 | 45.30% | 0.466 (+63%) | 12.90% |
-| NVIDIA GeForce RTX 3070 | 512x512 | 0.102 | 15.90% | 0.102 (0%) | 15.90% |
-| NVIDIA GeForce RTX 3070 | 1024x1024 | 0.421 | 56.30% | 0.746 (+77%) | 16.00% |
-
-</details>
-
-<details><summary>SDXL</summary>
-
-| GPU | Resolution | Time (seconds) | Memory Consumed (%) | Tiled Time (seconds) | Tiled Memory (%) |
-| --- | --- | --- | --- | --- | --- |
-| NVIDIA GeForce RTX 4090 | 512x512 | 0.057 | 10.00% | 0.057 (0%) | 10.00% |
-| NVIDIA GeForce RTX 4090 | 1024x1024 | 0.256 | 35.50% | 0.257 (+0.4%) | 35.50% |
-| NVIDIA GeForce RTX 4080 | 512x512 | 0.092 | 15.00% | 0.092 (0%) | 15.00% |
-| NVIDIA GeForce RTX 4080 | 1024x1024 | 0.406 | 53.30% | 0.406 (0%) | 53.30% |
-| NVIDIA GeForce RTX 4070 Ti | 512x512 | 0.121 | 20.20% | 0.120 (-0.8%) | 20.20% |
-| NVIDIA GeForce RTX 4070 Ti | 1024x1024 | 0.519 | 72.00% | 0.519 (0%) | 72.00% |
-| NVIDIA GeForce RTX 3090 | 512x512 | 0.107 | 10.50% | 0.107 (0%) | 10.50% |
-| NVIDIA GeForce RTX 3090 | 1024x1024 | 0.459 | 38.00% | 0.460 (+0.2%) | 38.00% |
-| NVIDIA GeForce RTX 3080 | 512x512 | 0.121 | 25.60% | 0.121 (0%) | 25.60% |
-| NVIDIA GeForce RTX 3080 | 1024x1024 | 0.524 | 93.00% | 0.524 (0%) | 93.00% |
-| NVIDIA GeForce RTX 3070 | 512x512 | 0.183 | 31.80% | 0.183 (0%) | 31.80% |
-| NVIDIA GeForce RTX 3070 | 1024x1024 | 0.794 | 96.40% | 0.794 (0%) | 96.40% |
-
-</details>
-
-## Available VAEs
-
-|   | **Endpoint** | **Model** |
-|:-:|:-----------:|:--------:|
-| **Stable Diffusion v1** | [https://q1bj3bpq6kzilnsu.us-east-1.aws.endpoints.huggingface.cloud](https://q1bj3bpq6kzilnsu.us-east-1.aws.endpoints.huggingface.cloud) | [`stabilityai/sd-vae-ft-mse`](https://hf.co/stabilityai/sd-vae-ft-mse) |
-| **Stable Diffusion XL** | [https://x2dmsqunjd6k9prw.us-east-1.aws.endpoints.huggingface.cloud](https://x2dmsqunjd6k9prw.us-east-1.aws.endpoints.huggingface.cloud) | [`madebyollin/sdxl-vae-fp16-fix`](https://hf.co/madebyollin/sdxl-vae-fp16-fix) |
-| **Flux** | [https://whhx50ex1aryqvw6.us-east-1.aws.endpoints.huggingface.cloud](https://whhx50ex1aryqvw6.us-east-1.aws.endpoints.huggingface.cloud) | [`black-forest-labs/FLUX.1-schnell`](https://hf.co/black-forest-labs/FLUX.1-schnell) |
-| **HunyuanVideo** | [https://o7ywnmrahorts457.us-east-1.aws.endpoints.huggingface.cloud](https://o7ywnmrahorts457.us-east-1.aws.endpoints.huggingface.cloud) | [`hunyuanvideo-community/HunyuanVideo`](https://hf.co/hunyuanvideo-community/HunyuanVideo) |
-
-
-> [!TIP]
-> Model support can be requested [here](https://github.com/huggingface/diffusers/issues/new?template=remote-vae-pilot-feedback.yml).
-
-
-## Code
-
-> [!TIP]
-> Install `diffusers` from `main` to run the code: `pip install git+https://github.com/huggingface/diffusers@main`
-
-
-A helper method simplifies interacting with Hybrid Inference.
-
-```python
-from diffusers.utils.remote_utils import remote_decode
-```
-
-### Basic example
-
-Here, we show how to use the remote VAE on random tensors.
-
-<details><summary>Code</summary>
-
-```python
-image = remote_decode(
-    endpoint="https://q1bj3bpq6kzilnsu.us-east-1.aws.endpoints.huggingface.cloud/",
-    tensor=torch.randn([1, 4, 64, 64], dtype=torch.float16),
-    scaling_factor=0.18215,
-)
-```
-
-</details>
-
-<figure class="image flex flex-col items-center justify-center text-center m-0 w-full">
-<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/blog/remote_vae/output.png"/>
-</figure>
-
-Usage for Flux is slightly different. Flux latents are packed so we need to send the `height` and `width`.
-
-<details><summary>Code</summary>
-
-```python
-image = remote_decode(
-    endpoint="https://whhx50ex1aryqvw6.us-east-1.aws.endpoints.huggingface.cloud/",
-    tensor=torch.randn([1, 4096, 64], dtype=torch.float16),
-    height=1024,
-    width=1024,
-    scaling_factor=0.3611,
-    shift_factor=0.1159,
-)
-```
-
-</details>
-
-<figure class="image flex flex-col items-center justify-center text-center m-0 w-full">
-<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/blog/remote_vae/flux_random_latent.png"/>
-</figure>
-
-Finally, an example for HunyuanVideo.
-
-<details><summary>Code</summary>
-
-```python
-video = remote_decode(
-    endpoint="https://o7ywnmrahorts457.us-east-1.aws.endpoints.huggingface.cloud/",
-    tensor=torch.randn([1, 16, 3, 40, 64], dtype=torch.float16),
-    output_type="mp4",
-)
-with open("video.mp4", "wb") as f:
-    f.write(video)
-```
-
-</details>
-
-<figure class="image flex flex-col items-center justify-center text-center m-0 w-full">
-   <video
-      alt="queue.mp4"
-      autoplay loop autobuffer muted playsinline
-    >
-    <source src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/blog/remote_vae/video_1.mp4" type="video/mp4">
-  </video>
-</figure>
-
-
-### Generation
-
-But we want to use the VAE on an actual pipeline to get an actual image, not random noise. The example below shows how to do it with SD v1.5. 
-
-<details><summary>Code</summary>
-
-```python
-from diffusers import StableDiffusionPipeline
-
-pipe = StableDiffusionPipeline.from_pretrained(
-    "stable-diffusion-v1-5/stable-diffusion-v1-5",
-    torch_dtype=torch.float16,
-    variant="fp16",
-    vae=None,
-).to("cuda")
-
-prompt = "Strawberry ice cream, in a stylish modern glass, coconut, splashing milk cream and honey, in a gradient purple background, fluid motion, dynamic movement, cinematic lighting, Mysterious"
-
-latent = pipe(
-    prompt=prompt,
-    output_type="latent",
-).images
-image = remote_decode(
-    endpoint="https://q1bj3bpq6kzilnsu.us-east-1.aws.endpoints.huggingface.cloud/",
-    tensor=latent,
-    scaling_factor=0.18215,
-)
-image.save("test.jpg")
-```
-
-</details>
-
-<figure class="image flex flex-col items-center justify-center text-center m-0 w-full">
-<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/blog/remote_vae/test.jpg"/>
-</figure>
-
-Here’s another example with Flux.
-
-<details><summary>Code</summary>
-
-```python
-from diffusers import FluxPipeline
-
-pipe = FluxPipeline.from_pretrained(
-    "black-forest-labs/FLUX.1-schnell",
-    torch_dtype=torch.bfloat16,
-    vae=None,
-).to("cuda")
-
-prompt = "Strawberry ice cream, in a stylish modern glass, coconut, splashing milk cream and honey, in a gradient purple background, fluid motion, dynamic movement, cinematic lighting, Mysterious"
-
-latent = pipe(
-    prompt=prompt,
-    guidance_scale=0.0,
-    num_inference_steps=4,
-    output_type="latent",
-).images
-image = remote_decode(
-    endpoint="https://whhx50ex1aryqvw6.us-east-1.aws.endpoints.huggingface.cloud/",
-    tensor=latent,
-    height=1024,
-    width=1024,
-    scaling_factor=0.3611,
-    shift_factor=0.1159,
-)
-image.save("test.jpg")
-```
-
-</details>
-
-<figure class="image flex flex-col items-center justify-center text-center m-0 w-full">
-<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/blog/remote_vae/test_1.jpg"/>
-</figure>
-
-Here’s an example with HunyuanVideo.
-
-<details><summary>Code</summary>
-
-```python
-from diffusers import HunyuanVideoPipeline, HunyuanVideoTransformer3DModel
-
-model_id = "hunyuanvideo-community/HunyuanVideo"
-transformer = HunyuanVideoTransformer3DModel.from_pretrained(
-    model_id, subfolder="transformer", torch_dtype=torch.bfloat16
-)
-pipe = HunyuanVideoPipeline.from_pretrained(
-    model_id, transformer=transformer, vae=None, torch_dtype=torch.float16
-).to("cuda")
-
-latent = pipe(
-    prompt="A cat walks on the grass, realistic",
-    height=320,
-    width=512,
-    num_frames=61,
-    num_inference_steps=30,
-    output_type="latent",
-).frames
-
-video = remote_decode(
-    endpoint="https://o7ywnmrahorts457.us-east-1.aws.endpoints.huggingface.cloud/",
-    tensor=latent,
-    output_type="mp4",
-)
-
-if isinstance(video, bytes):
-    with open("video.mp4", "wb") as f:
-        f.write(video)
-```
-
-</details>
-
-<figure class="image flex flex-col items-center justify-center text-center m-0 w-full">
-   <video
-      alt="queue.mp4"
-      autoplay loop autobuffer muted playsinline
-    >
-    <source src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/blog/remote_vae/video.mp4" type="video/mp4">
-  </video>
-</figure>
-
-
-### Queueing
-
-One of the great benefits of using a remote VAE is that we can queue multiple generation requests. While the current latent is being processed for decoding, we can already queue another one. This helps improve concurrency. 
-
-
-<details><summary>Code</summary>
-
-```python
-import queue
-import threading
-from IPython.display import display
-from diffusers import StableDiffusionPipeline
-
-def decode_worker(q: queue.Queue):
-    while True:
-        item = q.get()
-        if item is None:
-            break
-        image = remote_decode(
-            endpoint="https://q1bj3bpq6kzilnsu.us-east-1.aws.endpoints.huggingface.cloud/",
-            tensor=item,
-            scaling_factor=0.18215,
-        )
-        display(image)
-        q.task_done()
-
-q = queue.Queue()
-thread = threading.Thread(target=decode_worker, args=(q,), daemon=True)
-thread.start()
-
-def decode(latent: torch.Tensor):
-    q.put(latent)
-
-prompts = [
-    "Blueberry ice cream, in a stylish modern glass , ice cubes, nuts, mint leaves, splashing milk cream, in a gradient purple background, fluid motion, dynamic movement, cinematic lighting, Mysterious",
-    "Lemonade in a glass, mint leaves, in an aqua and white background, flowers, ice cubes, halo, fluid motion, dynamic movement, soft lighting, digital painting, rule of thirds composition, Art by Greg rutkowski, Coby whitmore",
-    "Comic book art, beautiful, vintage, pastel neon colors, extremely detailed pupils, delicate features, light on face, slight smile, Artgerm, Mary Blair, Edmund Dulac, long dark locks, bangs, glowing, fashionable style, fairytale ambience, hot pink.",
-    "Masterpiece, vanilla cone ice cream garnished with chocolate syrup, crushed nuts, choco flakes, in a brown background, gold, cinematic lighting, Art by WLOP",
-    "A bowl of milk, falling cornflakes, berries, blueberries, in a white background, soft lighting, intricate details, rule of thirds, octane render, volumetric lighting",
-    "Cold Coffee with cream, crushed almonds, in a glass, choco flakes, ice cubes, wet, in a wooden background, cinematic lighting, hyper realistic painting, art by Carne Griffiths, octane render, volumetric lighting, fluid motion, dynamic movement, muted colors,",
-]
-
-pipe = StableDiffusionPipeline.from_pretrained(
-    "Lykon/dreamshaper-8",
-    torch_dtype=torch.float16,
-    vae=None,
-).to("cuda")
-
-pipe.unet = pipe.unet.to(memory_format=torch.channels_last)
-pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)
-
-_ = pipe(
-    prompt=prompts[0],
-    output_type="latent",
-)
-
-for prompt in prompts:
-    latent = pipe(
-        prompt=prompt,
-        output_type="latent",
-    ).images
-    decode(latent)
-
-q.put(None)
-thread.join()
-```
-
-</details>
-
-
-<figure class="image flex flex-col items-center justify-center text-center m-0 w-full">
-   <video
-      alt="queue.mp4"
-      autoplay loop autobuffer muted playsinline
-    >
-    <source src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/blog/remote_vae/queue.mp4" type="video/mp4">
-  </video>
-</figure>
-
-## Integrations
-
-* **[SD.Next](https://github.com/vladmandic/sdnext):** All-in-one UI with direct support for Hybrid Inference.
-* **[ComfyUI-HFRemoteVae](https://github.com/kijai/ComfyUI-HFRemoteVae):** ComfyUI node for Hybrid Inference.
--- a/docs/source/en/hybrid_inference/vae_encode.md
+++ b/docs/source/en/hybrid_inference/vae_encode.md
@@ -1,183 +0,0 @@
-# Getting Started: VAE Encode with Hybrid Inference
-
-VAE encode is used for training, image-to-image and image-to-video - turning images or videos into latent representations.
-
-## Memory
-
-These tables demonstrate the VRAM requirements for VAE encode with SD v1 and SD XL on different GPUs.
-
-For the majority of these GPUs, the memory usage % dictates that other models (text encoders, UNet/Transformer) must be offloaded, or that tiled encoding must be used, which increases the time taken and impacts quality.
-
-<details><summary>SD v1.5</summary>
-
-| GPU                           | Resolution   |   Time (seconds) |   Memory (%) |   Tiled Time (secs) |   Tiled Memory (%) |
-|:------------------------------|:-------------|-----------------:|-------------:|--------------------:|-------------------:|
-| NVIDIA GeForce RTX 4090       | 512x512      |            0.015 |      3.51901 |               0.015 |            3.51901 |
-| NVIDIA GeForce RTX 4090       | 256x256      |            0.004 |      1.3154  |               0.005 |            1.3154  |
-| NVIDIA GeForce RTX 4090       | 2048x2048    |            0.402 |     47.1852  |               0.496 |            3.51901 |
-| NVIDIA GeForce RTX 4090       | 1024x1024    |            0.078 |     12.2658  |               0.094 |            3.51901 |
-| NVIDIA GeForce RTX 4080 SUPER | 512x512      |            0.023 |      5.30105 |               0.023 |            5.30105 |
-| NVIDIA GeForce RTX 4080 SUPER | 256x256      |            0.006 |      1.98152 |               0.006 |            1.98152 |
-| NVIDIA GeForce RTX 4080 SUPER | 2048x2048    |            0.574 |     71.08    |               0.656 |            5.30105 |
-| NVIDIA GeForce RTX 4080 SUPER | 1024x1024    |            0.111 |     18.4772  |               0.14  |            5.30105 |
-| NVIDIA GeForce RTX 3090       | 512x512      |            0.032 |      3.52782 |               0.032 |            3.52782 |
-| NVIDIA GeForce RTX 3090       | 256x256      |            0.01  |      1.31869 |               0.009 |            1.31869 |
-| NVIDIA GeForce RTX 3090       | 2048x2048    |            0.742 |     47.3033  |               0.954 |            3.52782 |
-| NVIDIA GeForce RTX 3090       | 1024x1024    |            0.136 |     12.2965  |               0.207 |            3.52782 |
-| NVIDIA GeForce RTX 3080       | 512x512      |            0.036 |      8.51761 |               0.036 |            8.51761 |
-| NVIDIA GeForce RTX 3080       | 256x256      |            0.01  |      3.18387 |               0.01  |            3.18387 |
-| NVIDIA GeForce RTX 3080       | 2048x2048    |            0.863 |     86.7424  |               1.191 |            8.51761 |
-| NVIDIA GeForce RTX 3080       | 1024x1024    |            0.157 |     29.6888  |               0.227 |            8.51761 |
-| NVIDIA GeForce RTX 3070       | 512x512      |            0.051 |     10.6941  |               0.051 |           10.6941  |
-| NVIDIA GeForce RTX 3070       | 256x256      |            0.015 |      3.99743 |               0.015 |            3.99743 |
-| NVIDIA GeForce RTX 3070       | 2048x2048    |            1.217 |     96.054   |               1.482 |           10.6941  |
-| NVIDIA GeForce RTX 3070       | 1024x1024    |            0.223 |     37.2751  |               0.327 |           10.6941  |
-
-
-</details>
-
-<details><summary>SDXL</summary>
-
-| GPU                           | Resolution   |   Time (seconds) |   Memory Consumed (%) |   Tiled Time (seconds) |   Tiled Memory (%) |
-|:------------------------------|:-------------|-----------------:|----------------------:|-----------------------:|-------------------:|
-| NVIDIA GeForce RTX 4090       | 512x512      |            0.029 |               4.95707 |                  0.029 |            4.95707 |
-| NVIDIA GeForce RTX 4090       | 256x256      |            0.007 |               2.29666 |                  0.007 |            2.29666 |
-| NVIDIA GeForce RTX 4090       | 2048x2048    |            0.873 |              66.3452  |                  0.863 |           15.5649  |
-| NVIDIA GeForce RTX 4090       | 1024x1024    |            0.142 |              15.5479  |                  0.143 |           15.5479  |
-| NVIDIA GeForce RTX 4080 SUPER | 512x512      |            0.044 |               7.46735 |                  0.044 |            7.46735 |
-| NVIDIA GeForce RTX 4080 SUPER | 256x256      |            0.01  |               3.4597  |                  0.01  |            3.4597  |
-| NVIDIA GeForce RTX 4080 SUPER | 2048x2048    |            1.317 |              87.1615  |                  1.291 |           23.447   |
-| NVIDIA GeForce RTX 4080 SUPER | 1024x1024    |            0.213 |              23.4215  |                  0.214 |           23.4215  |
-| NVIDIA GeForce RTX 3090       | 512x512      |            0.058 |               5.65638 |                  0.058 |            5.65638 |
-| NVIDIA GeForce RTX 3090       | 256x256      |            0.016 |               2.45081 |                  0.016 |            2.45081 |
-| NVIDIA GeForce RTX 3090       | 2048x2048    |            1.755 |              77.8239  |                  1.614 |           18.4193  |
-| NVIDIA GeForce RTX 3090       | 1024x1024    |            0.265 |              18.4023  |                  0.265 |           18.4023  |
-| NVIDIA GeForce RTX 3080       | 512x512      |            0.064 |              13.6568  |                  0.064 |           13.6568  |
-| NVIDIA GeForce RTX 3080       | 256x256      |            0.018 |               5.91728 |                  0.018 |            5.91728 |
-| NVIDIA GeForce RTX 3080       | 2048x2048    |          OOM     |             OOM       |                  1.866 |           44.4717  |
-| NVIDIA GeForce RTX 3080       | 1024x1024    |            0.302 |              44.4308  |                  0.302 |           44.4308  |
-| NVIDIA GeForce RTX 3070       | 512x512      |            0.093 |              17.1465  |                  0.093 |           17.1465  |
-| NVIDIA GeForce RTX 3070       | 256x256      |            0.025 |               7.42931 |                  0.026 |            7.42931 |
-| NVIDIA GeForce RTX 3070       | 2048x2048    |          OOM     |             OOM       |                  2.674 |           55.8355  |
-| NVIDIA GeForce RTX 3070       | 1024x1024    |            0.443 |              55.7841  |                  0.443 |           55.7841  |
-
-</details>
-
-## Available VAEs
-
-|   | **Endpoint** | **Model** |
-|:-:|:-----------:|:--------:|
-| **Stable Diffusion v1** | [https://qc6479g0aac6qwy9.us-east-1.aws.endpoints.huggingface.cloud](https://qc6479g0aac6qwy9.us-east-1.aws.endpoints.huggingface.cloud) | [`stabilityai/sd-vae-ft-mse`](https://hf.co/stabilityai/sd-vae-ft-mse) |
-| **Stable Diffusion XL** | [https://xjqqhmyn62rog84g.us-east-1.aws.endpoints.huggingface.cloud](https://xjqqhmyn62rog84g.us-east-1.aws.endpoints.huggingface.cloud) | [`madebyollin/sdxl-vae-fp16-fix`](https://hf.co/madebyollin/sdxl-vae-fp16-fix) |
-| **Flux** | [https://ptccx55jz97f9zgo.us-east-1.aws.endpoints.huggingface.cloud](https://ptccx55jz97f9zgo.us-east-1.aws.endpoints.huggingface.cloud) | [`black-forest-labs/FLUX.1-schnell`](https://hf.co/black-forest-labs/FLUX.1-schnell) |
-
-
-> [!TIP]
-> Model support can be requested [here](https://github.com/huggingface/diffusers/issues/new?template=remote-vae-pilot-feedback.yml).
-
-
-## Code
-
-> [!TIP]
-> Install `diffusers` from `main` to run the code: `pip install git+https://github.com/huggingface/diffusers@main`
-
-
-A helper method simplifies interacting with Hybrid Inference.
-
-```python
-from diffusers.utils.remote_utils import remote_encode
-```
-
-### Basic example
-
-Let's encode an image, then decode it to demonstrate.
-
-<figure class="image flex flex-col items-center justify-center text-center m-0 w-full">
-<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/astronaut.jpg"/>
-</figure>
-
-<details><summary>Code</summary>
-
-```python
-from diffusers.utils import load_image
-from diffusers.utils.remote_utils import remote_decode
-
-image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/astronaut.jpg?download=true")
-
-latent = remote_encode(
-    endpoint="https://ptccx55jz97f9zgo.us-east-1.aws.endpoints.huggingface.cloud/",
-    image=image,
-    scaling_factor=0.3611,
-    shift_factor=0.1159,
-)
-
-decoded = remote_decode(
-    endpoint="https://whhx50ex1aryqvw6.us-east-1.aws.endpoints.huggingface.cloud/",
-    tensor=latent,
-    scaling_factor=0.3611,
-    shift_factor=0.1159,
-)
-```
-
-</details>
-
-<figure class="image flex flex-col items-center justify-center text-center m-0 w-full">
-<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/blog/remote_vae/decoded.png"/>
-</figure>
-
-
-### Generation
-
-Now let's look at a generation example: we'll encode the image remotely, generate, and then remotely decode too!
-
-<details><summary>Code</summary>
-
-```python
-import torch
-from diffusers import StableDiffusionImg2ImgPipeline
-from diffusers.utils import load_image
-from diffusers.utils.remote_utils import remote_decode, remote_encode
-
-pipe = StableDiffusionImg2ImgPipeline.from_pretrained(
-    "stable-diffusion-v1-5/stable-diffusion-v1-5",
-    torch_dtype=torch.float16,
-    variant="fp16",
-    vae=None,
-).to("cuda")
-
-init_image = load_image(
-    "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/assets/stable-samples/img2img/sketch-mountains-input.jpg"
-)
-init_image = init_image.resize((768, 512))
-
-init_latent = remote_encode(
-    endpoint="https://qc6479g0aac6qwy9.us-east-1.aws.endpoints.huggingface.cloud/",
-    image=init_image,
-    scaling_factor=0.18215,
-)
-
-prompt = "A fantasy landscape, trending on artstation"
-latent = pipe(
-    prompt=prompt,
-    image=init_latent,
-    strength=0.75,
-    output_type="latent",
-).images
-
-image = remote_decode(
-    endpoint="https://q1bj3bpq6kzilnsu.us-east-1.aws.endpoints.huggingface.cloud/",
-    tensor=latent,
-    scaling_factor=0.18215,
-)
-image.save("fantasy_landscape.jpg")
-```
-
-</details>
-
-<figure class="image flex flex-col items-center justify-center text-center m-0 w-full">
-<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/blog/remote_vae/fantasy_landscape.png"/>
-</figure>
-
-## Integrations
-
-* **[SD.Next](https://github.com/vladmandic/sdnext):** All-in-one UI with direct support for Hybrid Inference.
-* **[ComfyUI-HFRemoteVae](https://github.com/kijai/ComfyUI-HFRemoteVae):** ComfyUI node for Hybrid Inference.
--- a/docs/source/en/installation.md
+++ b/docs/source/en/installation.md
@@ -161,10 +161,10 @@ Your Python environment will find the `main` version of 🤗 Diffusers on the ne

 Model weights and files are downloaded from the Hub to a cache which is usually your home directory. You can change the cache location by specifying the `HF_HOME` or `HUGGINFACE_HUB_CACHE` environment variables or configuring the `cache_dir` parameter in methods like [`~DiffusionPipeline.from_pretrained`].

-Cached files allow you to run 🤗 Diffusers offline. To prevent 🤗 Diffusers from connecting to the internet, set the `HF_HUB_OFFLINE` environment variable to `1` and 🤗 Diffusers will only load previously downloaded files in the cache.
+Cached files allow you to run 🤗 Diffusers offline. To prevent 🤗 Diffusers from connecting to the internet, set the `HF_HUB_OFFLINE` environment variable to `True` and 🤗 Diffusers will only load previously downloaded files in the cache.

 ```shell
-export HF_HUB_OFFLINE=1
+export HF_HUB_OFFLINE=True
 ```

 For more details about managing and cleaning the cache, take a look at the [caching](https://huggingface.co/docs/huggingface_hub/guides/manage-cache) guide.
@@ -179,16 +179,14 @@ Telemetry is only sent when loading models and pipelines from the Hub,
 and it is not collected if you're loading local files.

 We understand that not everyone wants to share additional information,and we respect your privacy.
-You can disable telemetry collection by setting the `HF_HUB_DISABLE_TELEMETRY` environment variable from your terminal:
+You can disable telemetry collection by setting the `DISABLE_TELEMETRY` environment variable from your terminal:

 On Linux/MacOS:
-
 ```bash
-export HF_HUB_DISABLE_TELEMETRY=1
+export DISABLE_TELEMETRY=YES
 ```

 On Windows:
-
 ```bash
-set HF_HUB_DISABLE_TELEMETRY=1
+set DISABLE_TELEMETRY=YES
 ```
--- a/docs/source/en/optimization/fp16.md
+++ b/docs/source/en/optimization/fp16.md
@@ -10,211 +10,120 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
 specific language governing permissions and limitations under the License.
 -->

-# Accelerate inference
+# Speed up inference

-Diffusion models are slow at inference because generation is an iterative process where noise is gradually refined into an image or video over a certain number of "steps". To speedup this process, you can try experimenting with different [schedulers](../api/schedulers/overview), reduce the precision of the model weights for faster computations, use more memory-efficient attention mechanisms, and more.
+There are several ways to optimize Diffusers for inference speed, such as reducing the computational burden by lowering the data precision or using a lightweight distilled model. There are also memory-efficient attention implementations, [xFormers](xformers) and [scaled dot product attention](https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html) in PyTorch 2.0, that reduce memory usage which also indirectly speeds up inference. Different speed optimizations can be stacked together to get the fastest inference times.

-Combine and use these techniques together to make inference faster than using any single technique on its own.
+> [!TIP]
+> Optimizing for inference speed or reduced memory usage can lead to improved performance in the other category, so you should try to optimize for both whenever you can. This guide focuses on inference speed, but you can learn more about lowering memory usage in the [Reduce memory usage](memory) guide.

-This guide will go over how to accelerate inference.
+The inference times below are obtained from generating a single 512x512 image from the prompt "a photo of an astronaut riding a horse on mars" with 50 DDIM steps on an NVIDIA A100.

-## Model data type
+| setup    | latency | speed-up |
+|----------|---------|----------|
+| baseline | 5.27s   | x1       |
+| tf32     | 4.14s   | x1.27    |
+| fp16     | 3.51s   | x1.50    |
+| combined | 3.41s   | x1.54    |

-The precision and data type of the model weights affect inference speed because a higher precision requires more memory to load and more time to perform the computations. PyTorch loads model weights in float32 or full precision by default, so changing the data type is a simple way to quickly get faster inference.
+## TensorFloat-32

-<hfoptions id="dtypes">
-<hfoption id="bfloat16">
+On Ampere and later CUDA devices, matrix multiplications and convolutions can use the [TensorFloat-32 (tf32)](https://blogs.nvidia.com/blog/2020/05/14/tensorfloat-32-precision-format/) mode for faster, but slightly less accurate computations. By default, PyTorch enables tf32 mode for convolutions but not matrix multiplications. Unless your network requires full float32 precision, we recommend enabling tf32 for matrix multiplications. It can significantly speed up computations with typically negligible loss in numerical accuracy.

-bfloat16 is similar to float16 but it is more robust to numerical errors. Hardware support for bfloat16 varies, but most modern GPUs are capable of supporting bfloat16.
-
-```py
+```python
 import torch
-from diffusers import StableDiffusionXLPipeline
-
-pipeline = StableDiffusionXLPipeline.from_pretrained(
-    "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.bfloat16
-).to("cuda")
-
-prompt = "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k"
-pipeline(prompt, num_inference_steps=30).images[0]
-```
-
-</hfoption>
-<hfoption id="float16">
-
-float16 is similar to bfloat16 but may be more prone to numerical errors.
-
-```py
-import torch
-from diffusers import StableDiffusionXLPipeline
-
-pipeline = StableDiffusionXLPipeline.from_pretrained(
-    "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
-).to("cuda")
-
-prompt = "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k"
-pipeline(prompt, num_inference_steps=30).images[0]
-```
-
-</hfoption>
-<hfoption id="TensorFloat-32">
-
-[TensorFloat-32 (tf32)](https://blogs.nvidia.com/blog/2020/05/14/tensorfloat-32-precision-format/) mode is supported on NVIDIA Ampere GPUs and it computes the convolution and matrix multiplication operations in tf32. Storage and other operations are kept in float32. This enables significantly faster computations when combined with bfloat16 or float16.
-
-PyTorch only enables tf32 mode for convolutions by default and you'll need to explicitly enable it for matrix multiplications.
-
-```py
-import torch
-from diffusers import StableDiffusionXLPipeline

 torch.backends.cuda.matmul.allow_tf32 = True
-
-pipeline = StableDiffusionXLPipeline.from_pretrained(
-    "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.bfloat16
-).to("cuda")
-
-prompt = "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k"
-pipeline(prompt, num_inference_steps=30).images[0]
 ```

-Refer to the [mixed precision training](https://huggingface.co/docs/transformers/en/perf_train_gpu_one#mixed-precision) docs for more details.
+Learn more about tf32 in the [Mixed precision training](https://huggingface.co/docs/transformers/en/perf_train_gpu_one#tf32) guide.

-</hfoption>
-</hfoptions>
+## Half-precision weights

-## Scaled dot product attention
+To save GPU memory and get more speed, set `torch_dtype=torch.float16` to load and run the model directly in half-precision.

-> [!TIP]
-> Memory-efficient attention optimizes for inference speed *and* [memory usage](./memory#memory-efficient-attention)!
-
-[Scaled dot product attention (SDPA)](https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html) implements several attention backends, [FlashAttention](https://github.com/Dao-AILab/flash-attention), [xFormers](https://github.com/facebookresearch/xformers), and a native C++ implementation. It automatically selects the most optimal backend for your hardware.
-
-SDPA is enabled by default if you're using PyTorch >= 2.0 and no additional changes are required to your code. You could try experimenting with other attention backends though if you'd like to choose your own. The example below uses the [torch.nn.attention.sdpa_kernel](https://pytorch.org/docs/stable/generated/torch.nn.attention.sdpa_kernel.html) context manager to enable efficient attention.
-
-```py
-from torch.nn.attention import SDPBackend, sdpa_kernel
+```python
 import torch
-from diffusers import StableDiffusionXLPipeline
+from diffusers import DiffusionPipeline

-pipeline = StableDiffusionXLPipeline.from_pretrained(
-    "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.bfloat16
-).to("cuda")
-
-prompt = "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k"
-
-with sdpa_kernel(SDPBackend.EFFICIENT_ATTENTION):
-  image = pipeline(prompt, num_inference_steps=30).images[0]
-```
-
-## torch.compile
-
-[torch.compile](https://pytorch.org/tutorials/intermediate/torch_compile_tutorial.html) accelerates inference by compiling PyTorch code and operations into optimized kernels. Diffusers typically compiles the more compute-intensive models like the UNet, transformer, or VAE.
-
-Enable the following compiler settings for maximum speed (refer to the [full list](https://github.com/pytorch/pytorch/blob/main/torch/_inductor/config.py) for more options).
-
-```py
-import torch
-from diffusers import StableDiffusionXLPipeline
-
-torch._inductor.config.conv_1x1_as_mm = True
-torch._inductor.config.coordinate_descent_tuning = True
-torch._inductor.config.epilogue_fusion = False
-torch._inductor.config.coordinate_descent_check_all_directions = True
-```
-
-Load and compile the UNet and VAE. There are several different modes you can choose from, but `"max-autotune"` optimizes for the fastest speed by compiling to a CUDA graph. CUDA graphs effectively reduces the overhead by launching multiple GPU operations through a single CPU operation.
-
-> [!TIP]
-> With PyTorch 2.3.1, you can control the caching behavior of torch.compile. This is particularly beneficial for compilation modes like `"max-autotune"` which performs a grid-search over several compilation flags to find the optimal configuration. Learn more in the [Compile Time Caching in torch.compile](https://pytorch.org/tutorials/recipes/torch_compile_caching_tutorial.html) tutorial.
-
-Changing the memory layout to [channels_last](./memory#torchchannels_last) also optimizes memory and inference speed.
-
-```py
-pipeline = StableDiffusionXLPipeline.from_pretrained(
-    "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
-).to("cuda")
-pipeline.unet.to(memory_format=torch.channels_last)
-pipeline.vae.to(memory_format=torch.channels_last)
-pipeline.unet = torch.compile(
-    pipeline.unet, mode="max-autotune", fullgraph=True
+pipe = DiffusionPipeline.from_pretrained(
+    "stable-diffusion-v1-5/stable-diffusion-v1-5",
+    torch_dtype=torch.float16,
+    use_safetensors=True,
 )
-pipeline.vae.decode = torch.compile(
-    pipeline.vae.decode,
-    mode="max-autotune",
-    fullgraph=True
-)
-
-prompt = "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k"
-pipeline(prompt, num_inference_steps=30).images[0]
+pipe = pipe.to("cuda")
 ```

-Compilation is slow the first time, but once compiled, it is significantly faster. Try to only use the compiled pipeline on the same type of inference operations. Calling the compiled pipeline on a different image size retriggers compilation which is slow and inefficient.
-
-### Graph breaks
-
-It is important to specify `fullgraph=True` in torch.compile to ensure there are no graph breaks in the underlying model. This allows you to take advantage of torch.compile without any performance degradation. For the UNet and VAE, this changes how you access the return variables.
-
-```diff
- latents = unet(
-   latents, timestep=timestep, encoder_hidden_states=prompt_embeds
-).sample
-
-+ latents = unet(
-+   latents, timestep=timestep, encoder_hidden_states=prompt_embeds, return_dict=False
-+)[0]
-```
-
-### GPU sync
-
-The `step()` function is [called](https://github.com/huggingface/diffusers/blob/1d686bac8146037e97f3fd8c56e4063230f71751/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py#L1228) on the scheduler each time after the denoiser makes a prediction, and the `sigmas` variable is [indexed](https://github.com/huggingface/diffusers/blob/1d686bac8146037e97f3fd8c56e4063230f71751/src/diffusers/schedulers/scheduling_euler_discrete.py#L476). When placed on the GPU, it introduces latency because of the communication sync between the CPU and GPU. It becomes more evident when the denoiser has already been compiled.
-
-In general, the `sigmas` should [stay on the CPU](https://github.com/huggingface/diffusers/blob/35a969d297cba69110d175ee79c59312b9f49e1e/src/diffusers/schedulers/scheduling_euler_discrete.py#L240) to avoid the communication sync and latency.
-
-## Dynamic quantization
-
-[Dynamic quantization](https://pytorch.org/tutorials/recipes/recipes/dynamic_quantization.html) improves inference speed by reducing precision to enable faster math operations. This particular type of quantization determines how to scale the activations based on the data at runtime rather than using a fixed scaling factor. As a result, the scaling factor is more accurately aligned with the data.
-
-The example below applies [dynamic int8 quantization](https://pytorch.org/tutorials/recipes/recipes/dynamic_quantization.html) to the UNet and VAE with the [torchao](../quantization/torchao) library.
-
-> [!TIP]
-> Refer to our [torchao](../quantization/torchao) docs to learn more about how to use the Diffusers torchao integration.
-
-Configure the compiler tags for maximum speed.
-
-```py
-import torch
-from torchao import apply_dynamic_quant
-from diffusers import StableDiffusionXLPipeline
-
-torch._inductor.config.conv_1x1_as_mm = True
-torch._inductor.config.coordinate_descent_tuning = True
-torch._inductor.config.epilogue_fusion = False
-torch._inductor.config.coordinate_descent_check_all_directions = True
-torch._inductor.config.force_fuse_int_mm_with_mul = True
-torch._inductor.config.use_mixed_mm = True
-```
-
-Filter out some linear layers in the UNet and VAE which don't benefit from dynamic quantization with the [dynamic_quant_filter_fn](https://github.com/huggingface/diffusion-fast/blob/0f169640b1db106fe6a479f78c1ed3bfaeba3386/utils/pipeline_utils.py#L16).
-
-```py
-pipeline = StableDiffusionXLPipeline.from_pretrained(
-    "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.bfloat16
-).to("cuda")
-
-apply_dynamic_quant(pipeline.unet, dynamic_quant_filter_fn)
-apply_dynamic_quant(pipeline.vae, dynamic_quant_filter_fn)
-
-prompt = "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k"
-pipeline(prompt, num_inference_steps=30).images[0]
-```
-
-## Fused projection matrices
-
 > [!WARNING]
-> The [fuse_qkv_projections](https://github.com/huggingface/diffusers/blob/58431f102cf39c3c8a569f32d71b2ea8caa461e1/src/diffusers/pipelines/pipeline_utils.py#L2034) method is experimental and support is limited to mostly Stable Diffusion pipelines. Take a look at this [PR](https://github.com/huggingface/diffusers/pull/6179) to learn more about how to enable it for other pipelines
+> Don't use [torch.autocast](https://pytorch.org/docs/stable/amp.html#torch.autocast) in any of the pipelines as it can lead to black images and is always slower than pure float16 precision.

-An input is projected into three subspaces, represented by the projection matrices Q, K, and V, in an attention block. These projections are typically calculated separately, but you can horizontally combine these into a single matrix and perform the projection in a single step. It increases the size of the matrix multiplications of the input projections and also improves the impact of quantization.
+## Distilled model
+
+You could also use a distilled Stable Diffusion model and autoencoder to speed up inference. During distillation, many of the UNet's residual and attention blocks are shed to reduce the model size by 51% and improve latency on CPU/GPU by 43%. The distilled model is faster and uses less memory while generating images of comparable quality to the full Stable Diffusion model.
+
+> [!TIP]
+> Read the [Open-sourcing Knowledge Distillation Code and Weights of SD-Small and SD-Tiny](https://huggingface.co/blog/sd_distillation) blog post to learn more about how knowledge distillation training works to produce a faster, smaller, and cheaper generative model.
+
+The inference times below are obtained from generating 4 images from the prompt "a photo of an astronaut riding a horse on mars" with 25 PNDM steps on an NVIDIA A100. Each generation is repeated 3 times with the distilled Stable Diffusion v1.4 model by [Nota AI](https://hf.co/nota-ai).
+
+| setup                        | latency | speed-up |
+|------------------------------|---------|----------|
+| baseline                     | 6.37s   | x1       |
+| distilled                    | 4.18s   | x1.52    |
+| distilled + tiny autoencoder | 3.83s   | x1.66    |
+
+Let's load the distilled Stable Diffusion model and compare it against the original Stable Diffusion model.

 ```py
-pipeline.fuse_qkv_projections()
-```
+from diffusers import StableDiffusionPipeline
+import torch
+
+distilled = StableDiffusionPipeline.from_pretrained(
+    "nota-ai/bk-sdm-small", torch_dtype=torch.float16, use_safetensors=True,
+).to("cuda")
+prompt = "a golden vase with different flowers"
+generator = torch.manual_seed(2023)
+image = distilled("a golden vase with different flowers", num_inference_steps=25, generator=generator).images[0]
+image
+```
+
+<div class="flex gap-4">
+  <div>
+    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/original_sd.png"/>
+    <figcaption class="mt-2 text-center text-sm text-gray-500">original Stable Diffusion</figcaption>
+  </div>
+  <div>
+    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/distilled_sd.png"/>
+    <figcaption class="mt-2 text-center text-sm text-gray-500">distilled Stable Diffusion</figcaption>
+  </div>
+</div>
+
+### Tiny AutoEncoder
+
+To speed inference up even more, replace the autoencoder with a [distilled version](https://huggingface.co/sayakpaul/taesd-diffusers) of it.
+
+```py
+import torch
+from diffusers import AutoencoderTiny, StableDiffusionPipeline
+
+distilled = StableDiffusionPipeline.from_pretrained(
+    "nota-ai/bk-sdm-small", torch_dtype=torch.float16, use_safetensors=True,
+).to("cuda")
+distilled.vae = AutoencoderTiny.from_pretrained(
+    "sayakpaul/taesd-diffusers", torch_dtype=torch.float16, use_safetensors=True,
+).to("cuda")
+
+prompt = "a golden vase with different flowers"
+generator = torch.manual_seed(2023)
+image = distilled("a golden vase with different flowers", num_inference_steps=25, generator=generator).images[0]
+image
+```
+
+<div class="flex justify-center">
+  <div>
+    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/distilled_sd_vae.png" />
+    <figcaption class="mt-2 text-center text-sm text-gray-500">distilled Stable Diffusion + Tiny AutoEncoder</figcaption>
+  </div>
+</div>
+
+More tiny autoencoder models for other Stable Diffusion models, like Stable Diffusion 3, are available from [madebyollin](https://huggingface.co/madebyollin).
--- a/docs/source/en/optimization/memory.md
+++ b/docs/source/en/optimization/memory.md
@@ -12,258 +12,175 @@ specific language governing permissions and limitations under the License.

 # Reduce memory usage

-Modern diffusion models like [Flux](../api/pipelines/flux) and [Wan](../api/pipelines/wan) have billions of parameters that take up a lot of memory on your hardware for inference. This is challenging because common GPUs often don't have sufficient memory. To overcome the memory limitations, you can use more than one GPU (if available), offload some of the pipeline components to the CPU, and more.
+A barrier to using diffusion models is the large amount of memory required. To overcome this challenge, there are several memory-reducing techniques you can use to run even some of the largest models on free-tier or consumer GPUs. Some of these techniques can even be combined to further reduce memory usage.

-This guide will show you how to reduce your memory usage. 
+<Tip>

-> [!TIP]
-> Keep in mind these techniques may need to be adjusted depending on the model! For example, a transformer-based diffusion model may not benefit equally from these inference speed optimizations as a UNet-based model.
+In many cases, optimizing for memory or speed leads to improved performance in the other, so you should try to optimize for both whenever you can. This guide focuses on minimizing memory usage, but you can also learn more about how to [Speed up inference](fp16).

-## Multiple GPUs
+</Tip>

-If you have access to more than one GPU, there a few options for efficiently loading and distributing a large model across your hardware. These features are supported by the [Accelerate](https://huggingface.co/docs/accelerate/index) library, so make sure it is installed first.
+The results below are obtained from generating a single 512x512 image from the prompt "a photo of an astronaut riding a horse on mars" with 50 DDIM steps on an NVIDIA Titan RTX, demonstrating the speed-up you can expect as a result of reduced memory consumption.

-```bash
-pip install -U accelerate
-```
+|                  | latency | speed-up |
+| ---------------- | ------- | ------- |
+| original         | 9.50s   | x1      |
+| fp16             | 3.61s   | x2.63   |
+| channels last    | 3.30s   | x2.88   |
+| traced UNet      | 3.21s   | x2.96   |
+| memory-efficient attention  | 2.63s  | x3.61   |

-### Sharded checkpoints
+## Sliced VAE

-Loading large checkpoints in several shards in useful because the shards are loaded one at a time. This keeps memory usage low, only requiring enough memory for the model size and the largest shard size. We recommend sharding when the fp32 checkpoint is greater than 5GB. The default shard size is 5GB.
+Sliced VAE enables decoding large batches of images with limited VRAM or batches with 32 images or more by decoding the batches of latents one image at a time. You'll likely want to couple this with [`~ModelMixin.enable_xformers_memory_efficient_attention`] to reduce memory use further if you have xFormers installed.

-Shard a checkpoint in [`~DiffusionPipeline.save_pretrained`] with the `max_shard_size` parameter.
+To use sliced VAE, call [`~StableDiffusionPipeline.enable_vae_slicing`] on your pipeline before inference:

-```py
-from diffusers import AutoModel
-
-unet = AutoModel.from_pretrained(
-    "stabilityai/stable-diffusion-xl-base-1.0", subfolder="unet"
-)
-unet.save_pretrained("sdxl-unet-sharded", max_shard_size="5GB")
-```
-
-Now you can use the sharded checkpoint, instead of the regular checkpoint, to save memory.
-
-```py
+```python
 import torch
-from diffusers import AutoModel, StableDiffusionXLPipeline
+from diffusers import StableDiffusionPipeline

-unet = AutoModel.from_pretrained(
-    "username/sdxl-unet-sharded", torch_dtype=torch.float16
-)
-pipeline = StableDiffusionXLPipeline.from_pretrained(
-    "stabilityai/stable-diffusion-xl-base-1.0",
-    unet=unet,
-    torch_dtype=torch.float16
-).to("cuda")
-```
-
-### Device placement
-
-> [!WARNING]
-> Device placement is an experimental feature and the API may change. Only the `balanced` strategy is supported at the moment. We plan to support additional mapping strategies in the future.
-
-The `device_map` parameter controls how the model components in a pipeline are distributed across devices. The `balanced` device placement strategy evenly splits the pipeline across all available devices.
-
-```py
-import torch
-from diffusers import AutoModel, StableDiffusionXLPipeline
-
-pipeline = StableDiffusionXLPipeline.from_pretrained(
-    "stabilityai/stable-diffusion-xl-base-1.0",
+pipe = StableDiffusionPipeline.from_pretrained(
+    "stable-diffusion-v1-5/stable-diffusion-v1-5",
    torch_dtype=torch.float16,
-    device_map="balanced"
+    use_safetensors=True,
 )
+pipe = pipe.to("cuda")
+
+prompt = "a photo of an astronaut riding a horse on mars"
+pipe.enable_vae_slicing()
+#pipe.enable_xformers_memory_efficient_attention()
+images = pipe([prompt] * 32).images
 ```

-You can inspect a pipeline's device map with `hf_device_map`.
+You may see a small performance boost in VAE decoding on multi-image batches, and there should be no performance impact on single-image batches.

-```py
-print(pipeline.hf_device_map)
-{'unet': 1, 'vae': 1, 'safety_checker': 0, 'text_encoder': 0}
-```
+## Tiled VAE

-The `device_map` parameter also works on the model-level. This is useful for loading large models, such as the Flux diffusion transformer which has 12.5B parameters. Instead of `balanced`, set it to `"auto"` to automatically distribute a model across the fastest device first before moving to slower devices. Refer to the [Model sharding](../training/distributed_inference#model-sharding) docs for more details.
+Tiled VAE processing also enables working with large images on limited VRAM (for example, generating 4k images on 8GB of VRAM) by splitting the image into overlapping tiles, decoding the tiles, and then blending the outputs together to compose the final image. You should also use tiled VAE with [`~ModelMixin.enable_xformers_memory_efficient_attention`] to reduce memory use further if you have xFormers installed.

-```py
+To use tiled VAE processing, call [`~StableDiffusionPipeline.enable_vae_tiling`] on your pipeline before inference:
+
+```python
 import torch
-from diffusers import AutoModel
+from diffusers import StableDiffusionPipeline, UniPCMultistepScheduler

-transformer = AutoModel.from_pretrained(
-    "black-forest-labs/FLUX.1-dev", 
-    subfolder="transformer",
-    device_map="auto",
-    torch_dtype=torch.bfloat16
-)
-```
-
-For more fine-grained control, pass a dictionary to enforce the maximum GPU memory to use on each device. If a device is not in `max_memory`, it is ignored and pipeline components won't be distributed to it.
-
-```py
-import torch
-from diffusers import AutoModel, StableDiffusionXLPipeline
-
-max_memory = {0:"1GB", 1:"1GB"}
-pipeline = StableDiffusionXLPipeline.from_pretrained(
-    "stabilityai/stable-diffusion-xl-base-1.0",
+pipe = StableDiffusionPipeline.from_pretrained(
+    "stable-diffusion-v1-5/stable-diffusion-v1-5",
    torch_dtype=torch.float16,
-    device_map="balanced",
-    max_memory=max_memory
+    use_safetensors=True,
 )
+pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
+pipe = pipe.to("cuda")
+prompt = "a beautiful landscape photograph"
+pipe.enable_vae_tiling()
+#pipe.enable_xformers_memory_efficient_attention()
+
+image = pipe([prompt], width=3840, height=2224, num_inference_steps=20).images[0]
 ```

-Diffusers uses the maxmium memory of all devices by default, but if they don't fit on the GPUs, then you'll need to use a single GPU and offload to the CPU with the methods below.
-
- [`~DiffusionPipeline.enable_model_cpu_offload`] only works on a single GPU but a very large model may not fit on it
- [`~DiffusionPipeline.enable_sequential_cpu_offload`] may work but it is extremely slow and also limited to a single GPU
-
-Use the [`~DiffusionPipeline.reset_device_map`] method to reset the `device_map`. This is necessary if you want to use methods like `.to()`, [`~DiffusionPipeline.enable_sequential_cpu_offload`], and [`~DiffusionPipeline.enable_model_cpu_offload`] on a pipeline that was device-mapped.
-
-```py
-pipeline.reset_device_map()
-```
-
-## VAE slicing
-
-VAE slicing saves memory by splitting large batches of inputs into a single batch of data and separately processing them. This method works best when generating more than one image at a time.
-
-For example, if you're generating 4 images at once, decoding would increase peak activation memory by 4x. VAE slicing reduces this by only decoding 1 image at a time instead of all 4 images at once.
-
-Call [`~StableDiffusionPipeline.enable_vae_slicing`] to enable sliced VAE. You can expect a small increase in performance when decoding multi-image batches and no performance impact for single-image batches.
-
-```py
-import torch
-from diffusers import AutoModel, StableDiffusionXLPipeline
-
-pipeline = StableDiffusionXLPipeline.from_pretrained(
-    "stabilityai/stable-diffusion-xl-base-1.0",
-    torch_dtype=torch.float16,
-).to("cuda")
-pipeline.enable_vae_slicing()
-pipeline(["An astronaut riding a horse on Mars"]*32).images[0]
-print(f"Max memory reserved: {torch.cuda.max_memory_allocated() / 1024**3:.2f} GB")
-```
-
-> [!WARNING]
-> [`AutoencoderKLWan`] and [`AsymmetricAutoencoderKL`] don't support slicing.
-
-## VAE tiling
-
-VAE tiling saves memory by dividing an image into smaller overlapping tiles instead of processing the entire image at once. This also reduces peak memory usage because the GPU is only processing a tile at a time.
-
-Call [`~StableDiffusionPipeline.enable_vae_tiling`] to enable VAE tiling. The generated image may have some tone variation from tile-to-tile because they're decoded separately, but there shouldn't be any obvious seams between the tiles. Tiling is disabled for resolutions lower than a pre-specified (but configurable) limit. For example, this limit is 512x512 for the VAE in [`StableDiffusionPipeline`].
-
-```py
-import torch
-from diffusers import AutoPipelineForImage2Image
-from diffusers.utils import load_image
-
-pipeline = AutoPipelineForImage2Image.from_pretrained(
-    "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
-).to("cuda")
-pipeline.enable_vae_tiling()
-
-init_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/img2img-sdxl-init.png")
-prompt = "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k"
-pipeline(prompt, image=init_image, strength=0.5).images[0]
-print(f"Max memory reserved: {torch.cuda.max_memory_allocated() / 1024**3:.2f} GB")
-```
-
-> [!WARNING]
-> [`AutoencoderKLWan`] and [`AsymmetricAutoencoderKL`] don't support tiling.
+The output image has some tile-to-tile tone variation because the tiles are decoded separately, but you shouldn't see any sharp and obvious seams between the tiles. Tiling is turned off for images that are 512x512 or smaller.

 ## CPU offloading

-CPU offloading selectively moves weights from the GPU to the CPU. When a component is required, it is transferred to the GPU and when it isn't required, it is moved to the CPU. This method works on submodules rather than whole models. It saves memory by avoiding storing the entire model on the GPU.
+Offloading the weights to the CPU and only loading them on the GPU when performing the forward pass can also save memory. Often, this technique can reduce memory consumption to less than 3GB.

-CPU offloading dramatically reduces memory usage, but it is also **extremely slow** because submodules are passed back and forth multiple times between devices. It can often be impractical due to how slow it is.
+To perform CPU offloading, call [`~StableDiffusionPipeline.enable_sequential_cpu_offload`]:

-> [!WARNING]
-> Don't move the pipeline to CUDA before calling [`~DiffusionPipeline.enable_sequential_cpu_offload`], otherwise the amount of memory saved is only minimal (refer to this [issue](https://github.com/huggingface/diffusers/issues/1934) for more details). This is a stateful operation that installs hooks on the model.
-
-Call [`~DiffusionPipeline.enable_sequential_cpu_offload`] to enable it on a pipeline.
-
-```py
+```python
 import torch
-from diffusers import DiffusionPipeline
+from diffusers import StableDiffusionPipeline

-pipeline = DiffusionPipeline.from_pretrained(
-    "black-forest-labs/FLUX.1-schnell", torch_dtype=torch.bfloat16
+pipe = StableDiffusionPipeline.from_pretrained(
+    "stable-diffusion-v1-5/stable-diffusion-v1-5",
+    torch_dtype=torch.float16,
+    use_safetensors=True,
 )
-pipeline.enable_sequential_cpu_offload()

-pipeline(
-    prompt="An astronaut riding a horse on Mars",
-    guidance_scale=0.,
-    height=768,
-    width=1360,
-    num_inference_steps=4,
-    max_sequence_length=256,
-).images[0]
-print(f"Max memory reserved: {torch.cuda.max_memory_allocated() / 1024**3:.2f} GB")
+prompt = "a photo of an astronaut riding a horse on mars"
+pipe.enable_sequential_cpu_offload()
+image = pipe(prompt).images[0]
 ```

+CPU offloading works on submodules rather than whole models. This is the best way to minimize memory consumption, but inference is much slower due to the iterative nature of the diffusion process. The UNet component of the pipeline runs several times (as many as `num_inference_steps`); each time, the different UNet submodules are sequentially onloaded and offloaded as needed, resulting in a large number of memory transfers.
+
+<Tip>
+
+Consider using [model offloading](#model-offloading) if you want to optimize for speed because it is much faster. The tradeoff is your memory savings won't be as large.
+
+</Tip>
+
+<Tip warning={true}>
+
+When using [`~StableDiffusionPipeline.enable_sequential_cpu_offload`], don't move the pipeline to CUDA beforehand or else the memory savings will only be minimal (see this [issue](https://github.com/huggingface/diffusers/issues/1934) for more information).
+
+[`~StableDiffusionPipeline.enable_sequential_cpu_offload`] is a stateful operation that installs hooks on the models.
+
+</Tip>
+
 ## Model offloading

-Model offloading moves entire models to the GPU instead of selectively moving *some* layers or model components. One of the main pipeline models, usually the text encoder, UNet, and VAE, is placed on the GPU while the other components are held on the CPU. Components like the UNet that run multiple times stays on the GPU until its completely finished and no longer needed. This eliminates the communication overhead of [CPU offloading](#cpu-offloading) and makes model offloading a faster alternative. The tradeoff is memory savings won't be as large.
+<Tip>

-> [!WARNING]
-> Keep in mind that if models are reused outside the pipeline after hookes have been installed (see [Removing Hooks](https://huggingface.co/docs/accelerate/en/package_reference/big_modeling#accelerate.hooks.remove_hook_from_module) for more details), you need to run the entire pipeline and models in the expected order to properly offload them. This is a stateful operation that installs hooks on the model.
+Model offloading requires 🤗 Accelerate version 0.17.0 or higher.

-Call [`~DiffusionPipeline.enable_model_cpu_offload`] to enable it on a pipeline.
+</Tip>

-```py
+[Sequential CPU offloading](#cpu-offloading) saves a lot of memory, but it makes inference slower because submodules are moved to the GPU as needed, and they're immediately returned to the CPU when a new module runs.
+
+Full-model offloading is an alternative that moves whole models to the GPU, instead of handling each model's constituent *submodules*. There is a negligible impact on inference time (compared with moving the pipeline to `cuda`), and it still provides some memory savings.
+
+During model offloading, only one of the main components of the pipeline (typically the text encoder, UNet and VAE)
+is placed on the GPU while the others wait on the CPU. Components like the UNet that run for multiple iterations stay on the GPU until they're no longer needed.
+
+Enable model offloading by calling [`~StableDiffusionPipeline.enable_model_cpu_offload`] on the pipeline:
+
+```Python
 import torch
-from diffusers import DiffusionPipeline
+from diffusers import StableDiffusionPipeline

-pipeline = DiffusionPipeline.from_pretrained(
-    "black-forest-labs/FLUX.1-schnell", torch_dtype=torch.bfloat16
+pipe = StableDiffusionPipeline.from_pretrained(
+    "stable-diffusion-v1-5/stable-diffusion-v1-5",
+    torch_dtype=torch.float16,
+    use_safetensors=True,
 )
-pipline.enable_model_cpu_offload()

-pipeline(
-    prompt="An astronaut riding a horse on Mars",
-    guidance_scale=0.,
-    height=768,
-    width=1360,
-    num_inference_steps=4,
-    max_sequence_length=256,
-).images[0]
-print(f"Max memory reserved: {torch.cuda.max_memory_allocated() / 1024**3:.2f} GB")
+prompt = "a photo of an astronaut riding a horse on mars"
+pipe.enable_model_cpu_offload()
+image = pipe(prompt).images[0]
 ```

-[`~DiffusionPipeline.enable_model_cpu_offload`] also helps when you're using the [`~StableDiffusionXLPipeline.encode_prompt`] method on its own to generate the text encoders hidden state.
+<Tip warning={true}>
+
+To properly offload models after they're called, you must run the entire pipeline, and the models must be called in the pipeline's expected order. Exercise caution if models are reused outside the context of the pipeline after hooks have been installed. See [Removing Hooks](https://huggingface.co/docs/accelerate/en/package_reference/big_modeling#accelerate.hooks.remove_hook_from_module) for more information.
+
+[`~StableDiffusionPipeline.enable_model_cpu_offload`] is a stateful operation that installs hooks on the models and state on the pipeline.
+
+</Tip>

 ## Group offloading

-Group offloading moves groups of internal layers ([torch.nn.ModuleList](https://pytorch.org/docs/stable/generated/torch.nn.ModuleList.html) or [torch.nn.Sequential](https://pytorch.org/docs/stable/generated/torch.nn.Sequential.html)) to the CPU. It uses less memory than [model offloading](#model-offloading) and it is faster than [CPU offloading](#cpu-offloading) because it reduces communication overhead.
+Group offloading is the middle ground between sequential and model offloading. It works by offloading groups of internal layers (either `torch.nn.ModuleList` or `torch.nn.Sequential`), which uses less memory than model-level offloading. It is also faster than sequential-level offloading because the number of device synchronizations is reduced.

-> [!WARNING]
-> Group offloading may not work with all models if the forward implementation contains weight-dependent device casting of inputs because it may clash with group offloading's device casting mechanism.
+To enable group offloading, call the [`~ModelMixin.enable_group_offload`] method on the model if it is a Diffusers model implementation. For any other model implementation, use [`~hooks.group_offloading.apply_group_offloading`]:

-Call [`~ModelMixin.enable_group_offload`] to enable it for standard Diffusers model components that inherit from [`ModelMixin`]. For other model components that don't inherit from [`ModelMixin`], such as a generic [torch.nn.Module](https://pytorch.org/docs/stable/generated/torch.nn.Module.html), use [`~hooks.apply_group_offloading`] instead.
-
-The `offload_type` parameter can be set to `block_level` or `leaf_level`.
-
- `block_level` offloads groups of layers based on the `num_blocks_per_group` parameter. For example, if `num_blocks_per_group=2` on a model with 40 layers, 2 layers are onloaded and offloaded at a time (20 total onloads/offloads). This drastically reduces memory requirements.
- `leaf_level` offloads individual layers at the lowest level and is equivalent to [CPU offloading](#cpu-offloading). But it can be made faster if you use streams without giving up inference speed.
-
-```py
+```python
 import torch
 from diffusers import CogVideoXPipeline
 from diffusers.hooks import apply_group_offloading
 from diffusers.utils import export_to_video

+# Load the pipeline
 onload_device = torch.device("cuda")
 offload_device = torch.device("cpu")
-pipeline = CogVideoXPipeline.from_pretrained("THUDM/CogVideoX-5b", torch_dtype=torch.bfloat16)
+pipe = CogVideoXPipeline.from_pretrained("THUDM/CogVideoX-5b", torch_dtype=torch.bfloat16)

-# Use the enable_group_offload method for Diffusers model implementations
-pipeline.transformer.enable_group_offload(onload_device=onload_device, offload_device=offload_device, offload_type="leaf_level")
-pipeline.vae.enable_group_offload(onload_device=onload_device, offload_type="leaf_level")
+# We can utilize the enable_group_offload method for Diffusers model implementations
+pipe.transformer.enable_group_offload(onload_device=onload_device, offload_device=offload_device, offload_type="leaf_level", use_stream=True)

-# Use the apply_group_offloading method for other model components
-apply_group_offloading(pipeline.text_encoder, onload_device=onload_device, offload_type="block_level", num_blocks_per_group=2)
+# For any other model implementations, the apply_group_offloading function can be used
+apply_group_offloading(pipe.text_encoder, onload_device=onload_device, offload_type="block_level", num_blocks_per_group=2)
+apply_group_offloading(pipe.vae, onload_device=onload_device, offload_type="leaf_level")

 prompt = (
    "A panda, dressed in a small, red jacket and a tiny hat, sits on a wooden stool in a serene bamboo forest. "
@@ -273,55 +190,35 @@ prompt = (
    "The background includes a small, flowing stream and vibrant green foliage, enhancing the peaceful and magical "
    "atmosphere of this unique musical performance."
 )
-video = pipeline(prompt=prompt, guidance_scale=6, num_inference_steps=50).frames[0]
+video = pipe(prompt=prompt, guidance_scale=6, num_inference_steps=50).frames[0]
+# This utilized about 14.79 GB. It can be further reduced by using tiling and using leaf_level offloading throughout the pipeline.
 print(f"Max memory reserved: {torch.cuda.max_memory_allocated() / 1024**3:.2f} GB")
 export_to_video(video, "output.mp4", fps=8)
 ```

-### CUDA stream
+Group offloading (for CUDA devices with support for asynchronous data transfer streams) overlaps data transfer and computation to reduce the overall execution time compared to sequential offloading. This is enabled using layer prefetching with CUDA streams. The next layer to be executed is loaded onto the accelerator device while the current layer is being executed - this increases the memory requirements slightly. Group offloading also supports leaf-level offloading (equivalent to sequential CPU offloading) but can be made much faster when using streams.

-The `use_stream` parameter can be activated for CUDA devices that support asynchronous data transfer streams to reduce overall execution time compared to [CPU offloading](#cpu-offloading). It overlaps data transfer and computation by using layer prefetching. The next layer to be executed is loaded onto the GPU while the current layer is still being executed. It can increase CPU memory significantly so ensure you have 2x the amount of memory as the model size.
+## FP8 layerwise weight-casting

-Set `record_stream=True` for more of a speedup at the cost of slightly increased memory usage. Refer to the [torch.Tensor.record_stream](https://pytorch.org/docs/stable/generated/torch.Tensor.record_stream.html) docs to learn more.
+PyTorch supports `torch.float8_e4m3fn` and `torch.float8_e5m2` as weight storage dtypes, but they can't be used for computation in many different tensor operations due to unimplemented kernel support. However, you can use these dtypes to store model weights in fp8 precision and upcast them on-the-fly when the layers are used in the forward pass. This is known as layerwise weight-casting.

-> [!TIP]
-> When `use_stream=True` on VAEs with tiling enabled, make sure to do a dummy forward pass (possible with dummy inputs as well) before inference to avoid device mismatch errors. This may not work on all implementations, so feel free to open an issue if you encounter any problems.
+Typically, inference on most models is done with `torch.float16` or `torch.bfloat16` weight/computation precision. Layerwise weight-casting cuts down the memory footprint of the model weights by approximately half.

-If you're using `block_level` group offloading with `use_stream` enabled, the `num_blocks_per_group` parameter should be set to `1`, otherwise a warning will be raised.
-
-```py
-pipeline.transformer.enable_group_offload(onload_device=onload_device, offload_device=offload_device, offload_type="leaf_level", use_stream=True, record_stream=True)
-```
-
-The `low_cpu_mem_usage` parameter can be set to `True` to reduce CPU memory usage when using streams during group offloading. It is best for `leaf_level` offloading and when CPU memory is bottlenecked. Memory is saved by creating pinned tensors on the fly instead of pre-pinning them. However, this may increase overall execution time.
-
-## Layerwise casting
-
-Layerwise casting stores weights in a smaller data format (for example, `torch.float8_e4m3fn` and `torch.float8_e5m2`) to use less memory and upcasts those weights to a higher precision like `torch.float16` or `torch.bfloat16` for computation. Certain layers (normalization and modulation related weights) are skipped because storing them in fp8 can degrade generation quality.
-
-> [!WARNING]
-> Layerwise casting may not work with all models if the forward implementation contains internal typecasting of weights. The current implementation of layerwise casting assumes the forward pass is independent of the weight precision and the input datatypes are always specified in `compute_dtype` (see [here](https://github.com/huggingface/transformers/blob/7f5077e53682ca855afc826162b204ebf809f1f9/src/transformers/models/t5/modeling_t5.py#L294-L299) for an incompatible implementation).
->
-> Layerwise casting may also fail on custom modeling implementations with [PEFT](https://huggingface.co/docs/peft/index) layers. There are some checks available but they are not extensively tested or guaranteed to work in all cases.
-
-Call [`~ModelMixin.enable_layerwise_casting`] to set the storage and computation datatypes.
-
-```py
+```python
 import torch
 from diffusers import CogVideoXPipeline, CogVideoXTransformer3DModel
 from diffusers.utils import export_to_video

-transformer = CogVideoXTransformer3DModel.from_pretrained(
-    "THUDM/CogVideoX-5b",
-    subfolder="transformer",
-    torch_dtype=torch.bfloat16
-)
+model_id = "THUDM/CogVideoX-5b"
+
+# Load the model in bfloat16 and enable layerwise casting
+transformer = CogVideoXTransformer3DModel.from_pretrained(model_id, subfolder="transformer", torch_dtype=torch.bfloat16)
 transformer.enable_layerwise_casting(storage_dtype=torch.float8_e4m3fn, compute_dtype=torch.bfloat16)

-pipeline = CogVideoXPipeline.from_pretrained("THUDM/CogVideoX-5b",
-    transformer=transformer,
-    torch_dtype=torch.bfloat16
-).to("cuda")
+# Load the pipeline
+pipe = CogVideoXPipeline.from_pretrained(model_id, transformer=transformer, torch_dtype=torch.bfloat16)
+pipe.to("cuda")
+
 prompt = (
    "A panda, dressed in a small, red jacket and a tiny hat, sits on a wooden stool in a serene bamboo forest. "
    "The panda's fluffy paws strum a miniature acoustic guitar, producing soft, melodic tunes. Nearby, a few other "
@@ -330,53 +227,35 @@ prompt = (
    "The background includes a small, flowing stream and vibrant green foliage, enhancing the peaceful and magical "
    "atmosphere of this unique musical performance."
 )
-video = pipeline(prompt=prompt, guidance_scale=6, num_inference_steps=50).frames[0]
-print(f"Max memory reserved: {torch.cuda.max_memory_allocated() / 1024**3:.2f} GB")
+video = pipe(prompt=prompt, guidance_scale=6, num_inference_steps=50).frames[0]
 export_to_video(video, "output.mp4", fps=8)
 ```

-The [`~hooks.apply_layerwise_casting`] method can also be used if you need more control and flexibility. It can be partially applied to model layers by calling it on specific internal modules. Use the `skip_modules_pattern` or `skip_modules_classes` parameters to specify modules to avoid, such as the normalization and modulation layers.
+In the above example, layerwise casting is enabled on the transformer component of the pipeline. By default, certain layers are excluded from FP8 weight casting because casting them can significantly degrade generation quality. The normalization- and modulation-related weight parameters are also skipped by default.
+
+However, you gain more control and flexibility by directly utilizing the [`~hooks.layerwise_casting.apply_layerwise_casting`] function instead of [`~ModelMixin.enable_layerwise_casting`].
+
+## Channels-last memory format
+
+The channels-last memory format is an alternative way of ordering NCHW tensors in memory to preserve dimension ordering. Channels-last tensors are ordered in such a way that the channels become the densest dimension (storing images pixel-per-pixel). Since not all operators currently support the channels-last format, it may result in worse performance, but you should still try it and see if it works for your model.
+
+For example, to set the pipeline's UNet to use the channels-last format:

 ```python
-import torch
-from diffusers import CogVideoXTransformer3DModel
-from diffusers.hooks import apply_layerwise_casting
-
-transformer = CogVideoXTransformer3DModel.from_pretrained(
-    "THUDM/CogVideoX-5b",
-    subfolder="transformer",
-    torch_dtype=torch.bfloat16
-)
-
-# skip the normalization layer
-apply_layerwise_casting(
-    transformer,
-    storage_dtype=torch.float8_e4m3fn,
-    compute_dtype=torch.bfloat16,
-    skip_modules_classes=["norm"],
-    non_blocking=True,
-)
-```
-
-## torch.channels_last
-
-[torch.channels_last](https://pytorch.org/tutorials/intermediate/memory_format_tutorial.html) flips how tensors are stored from `(batch size, channels, height, width)` to `(batch size, heigh, width, channels)`. This aligns the tensors with how the hardware sequentially accesses the tensors stored in memory and avoids skipping around in memory to access the pixel values.
-
-Not all operators currently support the channels-last format and may result in worst performance, but it is still worth trying.
-
-```py
-print(pipeline.unet.conv_out.state_dict()["weight"].stride())  # (2880, 9, 3, 1)
-pipeline.unet.to(memory_format=torch.channels_last)  # in-place operation
+print(pipe.unet.conv_out.state_dict()["weight"].stride())  # (2880, 9, 3, 1)
+pipe.unet.to(memory_format=torch.channels_last)  # in-place operation
 print(
-    pipeline.unet.conv_out.state_dict()["weight"].stride()
+    pipe.unet.conv_out.state_dict()["weight"].stride()
 )  # (2880, 1, 960, 320) having a stride of 1 for the 2nd dimension proves that it works
 ```

-## torch.jit.trace
+## Tracing

-[torch.jit.trace](https://pytorch.org/docs/stable/generated/torch.jit.trace.html) records the operations a model performs on a sample input and creates a new, optimized representation of the model based on the recorded execution path. During tracing, the model is optimized to reduce overhead from Python and dynamic control flows and operations are fused together for more efficiency. The returned executable or [ScriptFunction](https://pytorch.org/docs/stable/generated/torch.jit.ScriptFunction.html) can be compiled.
+Tracing runs an example input tensor through the model and captures the operations that are performed on it as that input makes its way through the model's layers. The executable or `ScriptFunction` that is returned is optimized with just-in-time compilation.

-```py
+To trace a UNet:
+
+```python
 import time
 import torch
 from diffusers import StableDiffusionPipeline
@@ -389,7 +268,8 @@ torch.set_grad_enabled(False)
 n_experiments = 2
 unet_runs_per_experiment = 50

-# load sample inputs
+
+# load inputs
 def generate_inputs():
    sample = torch.randn((2, 4, 64, 64), device="cuda", dtype=torch.float16)
    timestep = torch.rand(1, device="cuda", dtype=torch.float16) * 999
@@ -397,12 +277,12 @@ def generate_inputs():
    return sample, timestep, encoder_hidden_states


-pipeline = StableDiffusionPipeline.from_pretrained(
+pipe = StableDiffusionPipeline.from_pretrained(
    "stable-diffusion-v1-5/stable-diffusion-v1-5",
    torch_dtype=torch.float16,
    use_safetensors=True,
 ).to("cuda")
-unet = pipeline.unet
+unet = pipe.unet
 unet.eval()
 unet.to(memory_format=torch.channels_last)  # use channels_last memory format
 unet.forward = functools.partial(unet.forward, return_dict=False)  # set return_dict=False as default
@@ -419,12 +299,14 @@ unet_traced = torch.jit.trace(unet, inputs)
 unet_traced.eval()
 print("done tracing")

+
 # warmup and optimize graph
 for _ in range(5):
    with torch.inference_mode():
        inputs = generate_inputs()
        orig_output = unet_traced(*inputs)

+
 # benchmarking
 with torch.inference_mode():
    for _ in range(n_experiments):
@@ -446,18 +328,20 @@ with torch.inference_mode():
 unet_traced.save("unet_traced.pt")
 ```

-Replace the pipeline's UNet with the traced version.
+Replace the `unet` attribute of the pipeline with the traced model:

-```py
-import torch
+```python
 from diffusers import StableDiffusionPipeline
+import torch
 from dataclasses import dataclass

+
 @dataclass
 class UNet2DConditionOutput:
    sample: torch.Tensor

-pipeline = StableDiffusionPipeline.from_pretrained(
+
+pipe = StableDiffusionPipeline.from_pretrained(
    "stable-diffusion-v1-5/stable-diffusion-v1-5",
    torch_dtype=torch.float16,
    use_safetensors=True,
@@ -466,7 +350,8 @@ pipeline = StableDiffusionPipeline.from_pretrained(
 # use jitted unet
 unet_traced = torch.jit.load("unet_traced.pt")

-# del pipeline.unet
+
+# del pipe.unet
 class TracedUNet(torch.nn.Module):
    def __init__(self):
        super().__init__()
@@ -477,7 +362,8 @@ class TracedUNet(torch.nn.Module):
        sample = unet_traced(latent_model_input, t, encoder_hidden_states)[0]
        return UNet2DConditionOutput(sample=sample)

-pipeline.unet = TracedUNet()
+
+pipe.unet = TracedUNet()

 with torch.inference_mode():
    image = pipe([prompt] * 1, num_inference_steps=50).images[0]
@@ -485,31 +371,39 @@ with torch.inference_mode():

 ## Memory-efficient attention

-> [!TIP]
-> Memory-efficient attention optimizes for memory usage *and* [inference speed](./fp16#scaled-dot-product-attention!
+Recent work on optimizing bandwidth in the attention block has generated huge speed-ups and reductions in GPU memory usage. The most recent type of memory-efficient attention is [Flash Attention](https://arxiv.org/abs/2205.14135) (you can check out the original code at [HazyResearch/flash-attention](https://github.com/HazyResearch/flash-attention)).

-The Transformers attention mechanism is memory-intensive, especially for long sequences, so you can try using different and more memory-efficient attention types.
+<Tip>

-By default, if PyTorch >= 2.0 is installed, [scaled dot-product attention (SDPA)](https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html) is used. You don't need to make any additional changes to your code.
+If you have PyTorch >= 2.0 installed, you should not expect a speed-up for inference when enabling `xformers`.

-SDPA supports [FlashAttention](https://github.com/Dao-AILab/flash-attention) and [xFormers](https://github.com/facebookresearch/xformers) as well as a native C++ PyTorch implementation. It automatically selects the most optimal implementation based on your input.
+</Tip>

-You can explicitly use xFormers with the [`~ModelMixin.enable_xformers_memory_efficient_attention`] method.
+To use Flash Attention, make sure you have the following:

-```py
-# pip install xformers
+- PyTorch > 1.12
+- CUDA available
+- [xFormers](xformers)
+
+Then call [`~ModelMixin.enable_xformers_memory_efficient_attention`] on the pipeline:
+
+```python
+from diffusers import DiffusionPipeline
 import torch
-from diffusers import StableDiffusionXLPipeline

-pipeline = StableDiffusionXLPipeline.from_pretrained(
-    "stabilityai/stable-diffusion-xl-base-1.0",
+pipe = DiffusionPipeline.from_pretrained(
+    "stable-diffusion-v1-5/stable-diffusion-v1-5",
    torch_dtype=torch.float16,
+    use_safetensors=True,
 ).to("cuda")
-pipeline.enable_xformers_memory_efficient_attention()
+
+pipe.enable_xformers_memory_efficient_attention()
+
+with torch.inference_mode():
+    sample = pipe("a small cat")
+
+# optional: You can disable it via
+# pipe.disable_xformers_memory_efficient_attention()
 ```

-Call [`~ModelMixin.disable_xformers_memory_efficient_attention`] to disable it.
-
-```py
-pipeline.disable_xformers_memory_efficient_attention()
-```
+The iteration speed when using `xformers` should match the iteration speed of PyTorch 2.0 as described [here](torch2.0).
--- a/docs/source/en/optimization/mps.md
+++ b/docs/source/en/optimization/mps.md
@@ -12,9 +12,6 @@ specific language governing permissions and limitations under the License.

 # Metal Performance Shaders (MPS)

-> [!TIP]
-> Pipelines with a <img alt="MPS" src="https://img.shields.io/badge/MPS-000000?style=flat&logo=apple&logoColor=white%22"> badge indicate a model can take advantage of the MPS backend on Apple silicon devices for faster inference. Feel free to open a [Pull Request](https://github.com/huggingface/diffusers/compare) to add this badge to pipelines that are missing it.
-
 🤗 Diffusers is compatible with Apple silicon (M1/M2 chips) using the PyTorch [`mps`](https://pytorch.org/docs/stable/notes/mps.html) device, which uses the Metal framework to leverage the GPU on MacOS devices. You'll need to have:

 - macOS computer with Apple silicon (M1/M2) hardware
@@ -40,7 +37,7 @@ image

 <Tip warning={true}>

-The PyTorch [mps](https://pytorch.org/docs/stable/notes/mps.html) backend does not support NDArray sizes greater than `2**32`. Please open an [Issue](https://github.com/huggingface/diffusers/issues/new/choose) if you encounter this problem so we can investigate.
+Generating multiple prompts in a batch can [crash](https://github.com/huggingface/diffusers/issues/363) or fail to work reliably. We believe this is related to the [`mps`](https://github.com/pytorch/pytorch/issues/84039) backend in PyTorch. While this is being investigated, you should iterate instead of batching.

 </Tip>

@@ -62,10 +59,6 @@ If you're using **PyTorch 1.13**, you need to "prime" the pipeline with an addit

 ## Troubleshoot

-This section lists some common issues with using the `mps` backend and how to solve them.
-
-### Attention slicing
-
 M1/M2 performance is very sensitive to memory pressure. When this occurs, the system automatically swaps if it needs to which significantly degrades performance.

 To prevent this from happening, we recommend *attention slicing* to reduce memory pressure during inference and prevent swapping. This is especially relevant if your computer has less than 64GB of system RAM, or if you generate images at non-standard resolutions larger than 512×512 pixels. Call the [`~DiffusionPipeline.enable_attention_slicing`] function on your pipeline:
@@ -79,7 +72,3 @@ pipeline.enable_attention_slicing()
 ```

 Attention slicing performs the costly attention operation in multiple steps instead of all at once. It usually improves performance by ~20% in computers without universal memory, but we've observed *better performance* in most Apple silicon computers unless you have 64GB of RAM or more.
-
-### Batch inference
-
-Generating multiple prompts in a batch can crash or fail to work reliably. If this is the case, try iterating instead of batching.
--- a/docs/source/en/optimization/torch2.0.md
+++ b/docs/source/en/optimization/torch2.0.md
@@ -78,23 +78,6 @@ For more information and different options about `torch.compile`, refer to the [
 > [!TIP]
 > Learn more about other ways PyTorch 2.0 can help optimize your model in the [Accelerate inference of text-to-image diffusion models](../tutorials/fast_diffusion) tutorial.

-### Regional compilation
-
-Compiling the whole model usually has a big problem space for optimization. Models are often composed of multiple repeated blocks. [Regional compilation](https://pytorch.org/tutorials/recipes/regional_compilation.html) compiles the repeated block first (a transformer encoder block, for example), so that the Torch compiler would re-use its cached/optimized generated code for the other blocks, reducing (often massively) the cold start compilation time observed on the first inference call.
-
-Enabling regional compilation might require simple yet intrusive changes to the
-modeling code. However, 🤗 Accelerate provides a utility [`compile_regions()`](https://huggingface.co/docs/accelerate/main/en/usage_guides/compilation#how-to-use-regional-compilation) which automatically compiles
-the repeated blocks of the provided `nn.Module` sequentially, and the rest of the model separately. This helps with reducing cold start time while keeping most (if not all) of the speedup you would get from full compilation.
-
-```py
-# Make sure you're on the latest `accelerate`: `pip install -U accelerate`.
-from accelerate.utils import compile_regions
-
-pipe.unet = compile_regions(pipe.unet, mode="reduce-overhead", fullgraph=True)
-```
-
-As you may have noticed `compile_regions()` takes the same arguments as `torch.compile()`, allowing flexibility.
-
 ## Benchmark

 We conducted a comprehensive benchmark with PyTorch 2.0's efficient attention implementation and `torch.compile` across different GPUs and batch sizes for five of our most used pipelines. The code is benchmarked on 🤗 Diffusers v0.17.0.dev0 to optimize `torch.compile` usage (see [here](https://github.com/huggingface/diffusers/pull/3313) for more details).
--- a/docs/source/en/quantization/bitsandbytes.md
+++ b/docs/source/en/quantization/bitsandbytes.md
@@ -48,8 +48,8 @@ For Ada and higher-series GPUs. we recommend changing `torch_dtype` to `torch.bf
 ```py
 from diffusers import BitsAndBytesConfig as DiffusersBitsAndBytesConfig
 from transformers import BitsAndBytesConfig as TransformersBitsAndBytesConfig
-import torch
-from diffusers import AutoModel
+
+from diffusers import FluxTransformer2DModel
 from transformers import T5EncoderModel

 quant_config = TransformersBitsAndBytesConfig(load_in_8bit=True,)
@@ -63,7 +63,7 @@ text_encoder_2_8bit = T5EncoderModel.from_pretrained(

 quant_config = DiffusersBitsAndBytesConfig(load_in_8bit=True,)

-transformer_8bit = AutoModel.from_pretrained(
+transformer_8bit = FluxTransformer2DModel.from_pretrained(
    "black-forest-labs/FLUX.1-dev",
    subfolder="transformer",
    quantization_config=quant_config,
@@ -74,7 +74,7 @@ transformer_8bit = AutoModel.from_pretrained(
 By default, all the other modules such as `torch.nn.LayerNorm` are converted to `torch.float16`. You can change the data type of these modules with the `torch_dtype` parameter.

 ```diff
-transformer_8bit = AutoModel.from_pretrained(
+transformer_8bit = FluxTransformer2DModel.from_pretrained(
    "black-forest-labs/FLUX.1-dev",
    subfolder="transformer",
    quantization_config=quant_config,
@@ -88,8 +88,6 @@ Setting `device_map="auto"` automatically fills all available space on the GPU(s
 CPU, and finally, the hard drive (the absolute slowest option) if there is still not enough memory.

 ```py
-from diffusers import FluxPipeline
-
 pipe = FluxPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-dev",
    transformer=transformer_8bit,
@@ -134,8 +132,8 @@ For Ada and higher-series GPUs. we recommend changing `torch_dtype` to `torch.bf
 ```py
 from diffusers import BitsAndBytesConfig as DiffusersBitsAndBytesConfig
 from transformers import BitsAndBytesConfig as TransformersBitsAndBytesConfig
-import torch
-from diffusers import AutoModel
+
+from diffusers import FluxTransformer2DModel
 from transformers import T5EncoderModel

 quant_config = TransformersBitsAndBytesConfig(load_in_4bit=True,)
@@ -149,7 +147,7 @@ text_encoder_2_4bit = T5EncoderModel.from_pretrained(

 quant_config = DiffusersBitsAndBytesConfig(load_in_4bit=True,)

-transformer_4bit = AutoModel.from_pretrained(
+transformer_4bit = FluxTransformer2DModel.from_pretrained(
    "black-forest-labs/FLUX.1-dev",
    subfolder="transformer",
    quantization_config=quant_config,
@@ -160,7 +158,7 @@ transformer_4bit = AutoModel.from_pretrained(
 By default, all the other modules such as `torch.nn.LayerNorm` are converted to `torch.float16`. You can change the data type of these modules with the `torch_dtype` parameter.

 ```diff
-transformer_4bit = AutoModel.from_pretrained(
+transformer_4bit = FluxTransformer2DModel.from_pretrained(
    "black-forest-labs/FLUX.1-dev",
    subfolder="transformer",
    quantization_config=quant_config,
@@ -173,8 +171,6 @@ Let's generate an image using our quantized models.
 Setting `device_map="auto"` automatically fills all available space on the GPU(s) first, then the CPU, and finally, the hard drive (the absolute slowest option) if there is still not enough memory.

 ```py
-from diffusers import FluxPipeline
-
 pipe = FluxPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-dev",
    transformer=transformer_4bit,
@@ -218,16 +214,14 @@ Check your memory footprint with the `get_memory_footprint` method:
 print(model.get_memory_footprint())
 ```

-Note that this only tells you the memory footprint of the model params and does _not_ estimate the inference memory requirements.
-
 Quantized models can be loaded from the [`~ModelMixin.from_pretrained`] method without needing to specify the `quantization_config` parameters:

 ```py
-from diffusers import AutoModel, BitsAndBytesConfig
+from diffusers import FluxTransformer2DModel, BitsAndBytesConfig

 quantization_config = BitsAndBytesConfig(load_in_4bit=True)

-model_4bit = AutoModel.from_pretrained(
+model_4bit = FluxTransformer2DModel.from_pretrained(
    "hf-internal-testing/flux.1-dev-nf4-pkg", subfolder="transformer"
 )
 ```
@@ -249,13 +243,13 @@ An "outlier" is a hidden state value greater than a certain threshold, and these
 To find the best threshold for your model, we recommend experimenting with the `llm_int8_threshold` parameter in [`BitsAndBytesConfig`]:

 ```py
-from diffusers import AutoModel, BitsAndBytesConfig
+from diffusers import FluxTransformer2DModel, BitsAndBytesConfig

 quantization_config = BitsAndBytesConfig(
    load_in_8bit=True, llm_int8_threshold=10,
 )

-model_8bit = AutoModel.from_pretrained(
+model_8bit = FluxTransformer2DModel.from_pretrained(
    "black-forest-labs/FLUX.1-dev",
    subfolder="transformer",
    quantization_config=quantization_config,
@@ -311,7 +305,7 @@ NF4 is a 4-bit data type from the [QLoRA](https://hf.co/papers/2305.14314) paper
 from diffusers import BitsAndBytesConfig as DiffusersBitsAndBytesConfig
 from transformers import BitsAndBytesConfig as TransformersBitsAndBytesConfig

-from diffusers import AutoModel
+from diffusers import FluxTransformer2DModel
 from transformers import T5EncoderModel

 quant_config = TransformersBitsAndBytesConfig(
@@ -331,7 +325,7 @@ quant_config = DiffusersBitsAndBytesConfig(
    bnb_4bit_quant_type="nf4",
 )

-transformer_4bit = AutoModel.from_pretrained(
+transformer_4bit = FluxTransformer2DModel.from_pretrained(
    "black-forest-labs/FLUX.1-dev",
    subfolder="transformer",
    quantization_config=quant_config,
@@ -349,7 +343,7 @@ Nested quantization is a technique that can save additional memory at no additio
 from diffusers import BitsAndBytesConfig as DiffusersBitsAndBytesConfig
 from transformers import BitsAndBytesConfig as TransformersBitsAndBytesConfig

-from diffusers import AutoModel
+from diffusers import FluxTransformer2DModel
 from transformers import T5EncoderModel

 quant_config = TransformersBitsAndBytesConfig(
@@ -369,7 +363,7 @@ quant_config = DiffusersBitsAndBytesConfig(
    bnb_4bit_use_double_quant=True,
 )

-transformer_4bit = AutoModel.from_pretrained(
+transformer_4bit = FluxTransformer2DModel.from_pretrained(
    "black-forest-labs/FLUX.1-dev",
    subfolder="transformer",
    quantization_config=quant_config,
@@ -385,7 +379,7 @@ Once quantized, you can dequantize a model to its original precision, but this m
 from diffusers import BitsAndBytesConfig as DiffusersBitsAndBytesConfig
 from transformers import BitsAndBytesConfig as TransformersBitsAndBytesConfig

-from diffusers import AutoModel
+from diffusers import FluxTransformer2DModel
 from transformers import T5EncoderModel

 quant_config = TransformersBitsAndBytesConfig(
@@ -405,7 +399,7 @@ quant_config = DiffusersBitsAndBytesConfig(
    bnb_4bit_use_double_quant=True,
 )

-transformer_4bit = AutoModel.from_pretrained(
+transformer_4bit = FluxTransformer2DModel.from_pretrained(
    "black-forest-labs/FLUX.1-dev",
    subfolder="transformer",
    quantization_config=quant_config,
@@ -419,4 +413,4 @@ transformer_4bit.dequantize()
 ## Resources

 * [End-to-end notebook showing Flux.1 Dev inference in a free-tier Colab](https://gist.github.com/sayakpaul/c76bd845b48759e11687ac550b99d8b4)
-* [Training](https://github.com/huggingface/diffusers/blob/8c661ea586bf11cb2440da740dd3c4cf84679b85/examples/dreambooth/README_hidream.md#using-quantization)
+* [Training](https://gist.github.com/sayakpaul/05afd428bc089b47af7c016e42004527)
--- a/docs/source/en/quantization/overview.md
+++ b/docs/source/en/quantization/overview.md
@@ -36,93 +36,5 @@ Diffusers currently supports the following quantization methods.
 - [BitsandBytes](./bitsandbytes)
 - [TorchAO](./torchao)
 - [GGUF](./gguf)
- [Quanto](./quanto.md)

 [This resource](https://huggingface.co/docs/transformers/main/en/quantization/overview#when-to-use-what) provides a good overview of the pros and cons of different quantization techniques.
-
-## Pipeline-level quantization
-
-Diffusers allows users to directly initialize pipelines from checkpoints that may contain quantized models ([example](https://huggingface.co/hf-internal-testing/flux.1-dev-nf4-pkg)). However, users may want to apply
-quantization on-the-fly when initializing a pipeline from a pre-trained and non-quantized checkpoint. You can
-do this with [`~quantizers.PipelineQuantizationConfig`].
-
-Start by defining a `PipelineQuantizationConfig`:
-
-```py
-import torch
-from diffusers import DiffusionPipeline
-from diffusers.quantizers.quantization_config import QuantoConfig
-from diffusers.quantizers import PipelineQuantizationConfig
-from transformers import BitsAndBytesConfig
-
-pipeline_quant_config = PipelineQuantizationConfig(
-    quant_mapping={
-        "transformer": QuantoConfig(weights_dtype="int8"),
-        "text_encoder_2": BitsAndBytesConfig(
-            load_in_4bit=True, compute_dtype=torch.bfloat16
-        ),
-    }
-)
-```
-
-Then pass it to [`~DiffusionPipeline.from_pretrained`] and run inference:
-
-```py
-pipe = DiffusionPipeline.from_pretrained(
-    "black-forest-labs/FLUX.1-dev",
-    quantization_config=pipeline_quant_config,
-    torch_dtype=torch.bfloat16,
-).to("cuda")
-
-image = pipe("photo of a cute dog").images[0]
-```
-
-This method allows for more granular control over the quantization specifications of individual 
-model-level components of a pipeline. It also allows for different quantization backends for
-different components. In the above example, you used a combination of Quanto and BitsandBytes. However,
-one caveat of this method is that users need to know which components come from `transformers` to be able
-to import the right quantization config class.
-
-The other method is simpler in terms of experience but is
-less-flexible. Start by defining a `PipelineQuantizationConfig` but in a different way:
-
-```py
-pipeline_quant_config = PipelineQuantizationConfig(
-    quant_backend="bitsandbytes_4bit",
-    quant_kwargs={"load_in_4bit": True, "bnb_4bit_quant_type": "nf4", "bnb_4bit_compute_dtype": torch.bfloat16},
-    components_to_quantize=["transformer", "text_encoder_2"],
-)
-```
-
-This `pipeline_quant_config` can now be passed to [`~DiffusionPipeline.from_pretrained`] similar to the above example.
-
-In this case, `quant_kwargs` will be used to initialize the quantization specifications
-of the respective quantization configuration class of `quant_backend`. `components_to_quantize`
-is used to denote the components that will be quantized. For most pipelines, you would want to
-keep `transformer` in the list as that is often the most compute and memory intensive.
-
-The config below will work for most diffusion pipelines that have a `transformer` component present.
-In most case, you will want to quantize the `transformer` component as that is often the most compute-
-intensive part of a diffusion pipeline.
-
-```py
-pipeline_quant_config = PipelineQuantizationConfig(
-    quant_backend="bitsandbytes_4bit",
-    quant_kwargs={"load_in_4bit": True, "bnb_4bit_quant_type": "nf4", "bnb_4bit_compute_dtype": torch.bfloat16},
-    components_to_quantize=["transformer"],
-)
-```
-
-Below is a list of the supported quantization backends available in both `diffusers` and `transformers`:
-
-* `bitsandbytes_4bit` 
-* `bitsandbytes_8bit`
-* `gguf`
-* `quanto`
-* `torchao`
-
-
-Diffusion pipelines can have multiple text encoders. [`FluxPipeline`] has two, for example. It's
-recommended to quantize the text encoders that are memory-intensive. Some examples include T5,
-Llama, Gemma, etc. In the above example, you quantized the T5 model of [`FluxPipeline`] through
-`text_encoder_2` while keeping the CLIP model intact (accessible through `text_encoder`). 
--- a/docs/source/en/quantization/quanto.md
+++ b/docs/source/en/quantization/quanto.md
@@ -1,148 +0,0 @@
-<!--Copyright 2025 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-->
-
-# Quanto
-
-[Quanto](https://github.com/huggingface/optimum-quanto) is a PyTorch quantization backend for [Optimum](https://huggingface.co/docs/optimum/en/index). It has been designed with versatility and simplicity in mind:
-
- All features are available in eager mode (works with non-traceable models)
- Supports quantization aware training
- Quantized models are compatible with `torch.compile`
- Quantized models are Device agnostic (e.g CUDA,XPU,MPS,CPU)
-
-In order to use the Quanto backend, you will first need to install `optimum-quanto>=0.2.6` and `accelerate`
-
-```shell
-pip install optimum-quanto accelerate
-```
-
-Now you can quantize a model by passing the `QuantoConfig` object to the `from_pretrained()` method. Although the Quanto library does allow quantizing `nn.Conv2d` and `nn.LayerNorm` modules, currently, Diffusers only supports quantizing the weights in the `nn.Linear` layers of a model. The following snippet demonstrates how to apply `float8` quantization with Quanto.   
-
-```python
-import torch
-from diffusers import FluxTransformer2DModel, QuantoConfig
-
-model_id = "black-forest-labs/FLUX.1-dev"
-quantization_config = QuantoConfig(weights_dtype="float8")
-transformer = FluxTransformer2DModel.from_pretrained(
-      model_id,
-      subfolder="transformer",
-      quantization_config=quantization_config,
-      torch_dtype=torch.bfloat16,
-)
-
-pipe = FluxPipeline.from_pretrained(model_id, transformer=transformer, torch_dtype=torch_dtype)
-pipe.to("cuda")
-
-prompt = "A cat holding a sign that says hello world"
-image = pipe(
-    prompt, num_inference_steps=50, guidance_scale=4.5, max_sequence_length=512
-).images[0]
-image.save("output.png")
-```
-
-## Skipping Quantization on specific modules
-
-It is possible to skip applying quantization on certain modules using the `modules_to_not_convert` argument in the `QuantoConfig`. Please ensure that the modules passed in to this argument match the keys of the modules in the `state_dict`  
-
-```python
-import torch
-from diffusers import FluxTransformer2DModel, QuantoConfig
-
-model_id = "black-forest-labs/FLUX.1-dev"
-quantization_config = QuantoConfig(weights_dtype="float8", modules_to_not_convert=["proj_out"])
-transformer = FluxTransformer2DModel.from_pretrained(
-      model_id,
-      subfolder="transformer",
-      quantization_config=quantization_config,
-      torch_dtype=torch.bfloat16,
-)
-```
-
-## Using `from_single_file` with the Quanto Backend
-
-`QuantoConfig` is compatible with `~FromOriginalModelMixin.from_single_file`. 
-
-```python
-import torch
-from diffusers import FluxTransformer2DModel, QuantoConfig
-
-ckpt_path = "https://huggingface.co/black-forest-labs/FLUX.1-dev/blob/main/flux1-dev.safetensors"
-quantization_config = QuantoConfig(weights_dtype="float8")
-transformer = FluxTransformer2DModel.from_single_file(ckpt_path, quantization_config=quantization_config, torch_dtype=torch.bfloat16)
-```
-
-## Saving Quantized models
-
-Diffusers supports serializing Quanto models using the `~ModelMixin.save_pretrained` method.
-
-The serialization and loading requirements are different for models quantized directly with the Quanto library and models quantized
-with Diffusers using Quanto as the backend. It is currently not possible to load models quantized directly with Quanto into Diffusers using `~ModelMixin.from_pretrained`
-
-```python
-import torch
-from diffusers import FluxTransformer2DModel, QuantoConfig
-
-model_id = "black-forest-labs/FLUX.1-dev"
-quantization_config = QuantoConfig(weights_dtype="float8")
-transformer = FluxTransformer2DModel.from_pretrained(
-      model_id,
-      subfolder="transformer",
-      quantization_config=quantization_config,
-      torch_dtype=torch.bfloat16,
-)
-# save quantized model to reuse
-transformer.save_pretrained("<your quantized model save path>")
-
-# you can reload your quantized model with
-model = FluxTransformer2DModel.from_pretrained("<your quantized model save path>")
-```
-
-## Using `torch.compile` with Quanto
-
-Currently the Quanto backend supports `torch.compile` for the following quantization types:
-
- `int8` weights 
-
-```python
-import torch
-from diffusers import FluxPipeline, FluxTransformer2DModel, QuantoConfig
-
-model_id = "black-forest-labs/FLUX.1-dev"
-quantization_config = QuantoConfig(weights_dtype="int8")
-transformer = FluxTransformer2DModel.from_pretrained(
-    model_id,
-    subfolder="transformer",
-    quantization_config=quantization_config,
-    torch_dtype=torch.bfloat16,
-)
-transformer = torch.compile(transformer, mode="max-autotune", fullgraph=True)
-
-pipe = FluxPipeline.from_pretrained(
-    model_id, transformer=transformer, torch_dtype=torch_dtype
-)
-pipe.to("cuda")
-images = pipe("A cat holding a sign that says hello").images[0]
-images.save("flux-quanto-compile.png")
-```
-
-## Supported Quantization Types
-
-### Weights
-
- float8
- int8
- int4
- int2
-
-
--- a/docs/source/en/quantization/torchao.md
+++ b/docs/source/en/quantization/torchao.md
@@ -26,13 +26,13 @@ The example below only quantizes the weights to int8.

 ```python
 import torch
-from diffusers import FluxPipeline, AutoModel, TorchAoConfig
+from diffusers import FluxPipeline, FluxTransformer2DModel, TorchAoConfig

 model_id = "black-forest-labs/FLUX.1-dev"
 dtype = torch.bfloat16

 quantization_config = TorchAoConfig("int8wo")
-transformer = AutoModel.from_pretrained(
+transformer = FluxTransformer2DModel.from_pretrained(
    model_id,
    subfolder="transformer",
    quantization_config=quantization_config,
@@ -85,7 +85,7 @@ The quantization methods supported are as follows:
 | **Category** | **Full Function Names** | **Shorthands** |
 |--------------|-------------------------|----------------|
 | **Integer quantization** | `int4_weight_only`, `int8_dynamic_activation_int4_weight`, `int8_weight_only`, `int8_dynamic_activation_int8_weight` | `int4wo`, `int4dq`, `int8wo`, `int8dq` |
-| **Floating point 8-bit quantization** | `float8_weight_only`, `float8_dynamic_activation_float8_weight`, `float8_static_activation_float8_weight` | `float8wo`, `float8wo_e5m2`, `float8wo_e4m3`, `float8dq`, `float8dq_e4m3`, `float8dq_e4m3_tensor`, `float8dq_e4m3_row` |
+| **Floating point 8-bit quantization** | `float8_weight_only`, `float8_dynamic_activation_float8_weight`, `float8_static_activation_float8_weight` | `float8wo`, `float8wo_e5m2`, `float8wo_e4m3`, `float8dq`, `float8dq_e4m3`, `float8dq_e4m3_tensor`, `float8dq_e4m3_row` |
 | **Floating point X-bit quantization** | `fpx_weight_only` | `fpX_eAwB` where `X` is the number of bits (1-7), `A` is exponent bits, and `B` is mantissa bits. Constraint: `X == A + B + 1` |
 | **Unsigned Integer quantization** | `uintx_weight_only` | `uint1wo`, `uint2wo`, `uint3wo`, `uint4wo`, `uint5wo`, `uint6wo`, `uint7wo` |

@@ -99,10 +99,10 @@ To serialize a quantized model in a given dtype, first load the model with the d

 ```python
 import torch
-from diffusers import AutoModel, TorchAoConfig
+from diffusers import FluxTransformer2DModel, TorchAoConfig

 quantization_config = TorchAoConfig("int8wo")
-transformer = AutoModel.from_pretrained(
+transformer = FluxTransformer2DModel.from_pretrained(
    "black-forest-labs/Flux.1-Dev",
    subfolder="transformer",
    quantization_config=quantization_config,
@@ -115,9 +115,9 @@ To load a serialized quantized model, use the [`~ModelMixin.from_pretrained`] me

 ```python
 import torch
-from diffusers import FluxPipeline, AutoModel
+from diffusers import FluxPipeline, FluxTransformer2DModel

-transformer = AutoModel.from_pretrained("/path/to/flux_int8wo", torch_dtype=torch.bfloat16, use_safetensors=False)
+transformer = FluxTransformer2DModel.from_pretrained("/path/to/flux_int8wo", torch_dtype=torch.bfloat16, use_safetensors=False)
 pipe = FluxPipeline.from_pretrained("black-forest-labs/Flux.1-Dev", transformer=transformer, torch_dtype=torch.bfloat16)
 pipe.to("cuda")

@@ -126,15 +126,15 @@ image = pipe(prompt, num_inference_steps=30, guidance_scale=7.0).images[0]
 image.save("output.png")
 ```

-If you are using `torch<=2.6.0`, some quantization methods, such as `uint4wo`, cannot be loaded directly and may result in an `UnpicklingError` when trying to load the models, but work as expected when saving them. In order to work around this, one can load the state dict manually into the model. Note, however, that this requires using `weights_only=False` in `torch.load`, so it should be run only if the weights were obtained from a trustable source.
+Some quantization methods, such as `uint4wo`, cannot be loaded directly and may result in an `UnpicklingError` when trying to load the models, but work as expected when saving them. In order to work around this, one can load the state dict manually into the model. Note, however, that this requires using `weights_only=False` in `torch.load`, so it should be run only if the weights were obtained from a trustable source.

 ```python
 import torch
 from accelerate import init_empty_weights
-from diffusers import FluxPipeline, AutoModel, TorchAoConfig
+from diffusers import FluxPipeline, FluxTransformer2DModel, TorchAoConfig

 # Serialize the model
-transformer = AutoModel.from_pretrained(
+transformer = FluxTransformer2DModel.from_pretrained(
    "black-forest-labs/Flux.1-Dev",
    subfolder="transformer",
    quantization_config=TorchAoConfig("uint4wo"),
@@ -146,13 +146,10 @@ transformer.save_pretrained("/path/to/flux_uint4wo", safe_serialization=False, m
 # Load the model
 state_dict = torch.load("/path/to/flux_uint4wo/diffusion_pytorch_model.bin", weights_only=False, map_location="cpu")
 with init_empty_weights():
-    transformer = AutoModel.from_config("/path/to/flux_uint4wo/config.json")
+    transformer = FluxTransformer2DModel.from_config("/path/to/flux_uint4wo/config.json")
 transformer.load_state_dict(state_dict, strict=True, assign=True)
 ```

-> [!TIP]
-> The [`AutoModel`] API is supported for PyTorch >= 2.6 as shown in the examples below.
-
 ## Resources

 - [TorchAO Quantization API](https://github.com/pytorch/ao/blob/main/torchao/quantization/README.md)
--- a/docs/source/en/quicktour.md
+++ b/docs/source/en/quicktour.md
@@ -163,9 +163,6 @@ Models are initiated with the [`~ModelMixin.from_pretrained`] method which also
 >>> model = UNet2DModel.from_pretrained(repo_id, use_safetensors=True)
 ```

-> [!TIP]
-> Use the [`AutoModel`] API to automatically select a model class if you're unsure of which one to use.
-
 To access the model parameters, call `model.config`:

 ```py
--- a/docs/source/en/training/adapt_a_model.md
+++ b/docs/source/en/training/adapt_a_model.md
@@ -31,10 +31,10 @@ To adapt your text-to-image model for inpainting, you'll need to change the numb
 Initialize a [`UNet2DConditionModel`] with the pretrained text-to-image model weights, and change `in_channels` to 9. Changing the number of `in_channels` means you need to set `ignore_mismatched_sizes=True` and `low_cpu_mem_usage=False` to avoid a size mismatch error because the shape is different now.

 ```py
-from diffusers import AutoModel
+from diffusers import UNet2DConditionModel

 model_id = "stable-diffusion-v1-5/stable-diffusion-v1-5"
-unet = AutoModel.from_pretrained(
+unet = UNet2DConditionModel.from_pretrained(
    model_id,
    subfolder="unet",
    in_channels=9,
--- a/docs/source/en/training/cogvideox.md
+++ b/docs/source/en/training/cogvideox.md
@@ -216,7 +216,7 @@ Setting the `<ID_TOKEN>` is not necessary. From some limited experimentation, we
 > - The original repository uses a `lora_alpha` of `1`. We found this not suitable in many runs, possibly due to difference in modeling backends and training settings. Our recommendation is to set to the `lora_alpha` to either `rank` or `rank // 2`.
 > - If you're training on data whose captions generate bad results with the original model, a `rank` of 64 and above is good and also the recommendation by the team behind CogVideoX. If the generations are already moderately good on your training captions, a `rank` of 16/32 should work. We found that setting the rank too low, say `4`, is not ideal and doesn't produce promising results.
 > - The authors of CogVideoX recommend 4000 training steps and 100 training videos overall to achieve the best result. While that might yield the best results, we found from our limited experimentation that 2000 steps and 25 videos could also be sufficient.
-> - When using the Prodigy optimizer for training, one can follow the recommendations from [this](https://huggingface.co/blog/sdxl_lora_advanced_script) blog. Prodigy tends to overfit quickly. From my very limited testing, I found a learning rate of `0.5` to be suitable in addition to `--prodigy_use_bias_correction`, `prodigy_safeguard_warmup` and `--prodigy_decouple`.
+> - When using the Prodigy optimizer for training, one can follow the recommendations from [this](https://huggingface.co/blog/sdxl_lora_advanced_script) blog. Prodigy tends to overfit quickly. From my very limited testing, I found a learning rate of `0.5` to be suitable in addition to `--prodigy_use_bias_correction`, `prodigy_safeguard_warmup` and `--prodigy_decouple`.
 > - The recommended learning rate by the CogVideoX authors and from our experimentation with Adam/AdamW is between `1e-3` and `1e-4` for a dataset of 25+ videos.
 >
 > Note that our testing is not exhaustive due to limited time for exploration. Our recommendation would be to play around with the different knobs and dials to find the best settings for your data.
--- a/docs/source/en/training/distributed_inference.md
+++ b/docs/source/en/training/distributed_inference.md
@@ -165,10 +165,10 @@ flush()
 Load the diffusion transformer next which has 12.5B parameters. This time, set `device_map="auto"` to automatically distribute the model across two 16GB GPUs. The `auto` strategy is backed by [Accelerate](https://hf.co/docs/accelerate/index) and available as a part of the [Big Model Inference](https://hf.co/docs/accelerate/concept_guides/big_model_inference) feature. It starts by distributing a model across the fastest device first (GPU) before moving to slower devices like the CPU and hard drive if needed. The trade-off of storing model parameters on slower devices is slower inference latency.

 ```py
-from diffusers import AutoModel
+from diffusers import FluxTransformer2DModel
 import torch 

-transformer = AutoModel.from_pretrained(
+transformer = FluxTransformer2DModel.from_pretrained(
    "black-forest-labs/FLUX.1-dev", 
    subfolder="transformer",
    device_map="auto",
--- a/docs/source/en/training/dreambooth.md
+++ b/docs/source/en/training/dreambooth.md
@@ -589,7 +589,7 @@ For stage 2 of DeepFloyd IF with DreamBooth, pay attention to these parameters:

 * `--learning_rate=5e-6`, use a lower learning rate with a smaller effective batch size
 * `--resolution=256`, the expected resolution for the upscaler
-* `--train_batch_size=2` and `--gradient_accumulation_steps=6`, to effectively train on images with faces requires larger batch sizes
+* `--train_batch_size=2` and `--gradient_accumulation_steps=6`, to effectively train on images with faces requires larger batch sizes

 ```bash
 export MODEL_NAME="DeepFloyd/IF-II-L-v1.0"
--- a/docs/source/en/training/t2i_adapters.md
+++ b/docs/source/en/training/t2i_adapters.md
@@ -89,7 +89,7 @@ Many of the basic and important parameters are described in the [Text-to-image](

 As with the script parameters, a walkthrough of the training script is provided in the [Text-to-image](text2image#training-script) training guide. Instead, this guide takes a look at the T2I-Adapter relevant parts of the script.

-The training script begins by preparing the dataset. This includes [tokenizing](https://github.com/huggingface/diffusers/blob/aab6de22c33cc01fb7bc81c0807d6109e2c998c9/examples/t2i_adapter/train_t2i_adapter_sdxl.py#L674) the prompt and [applying transforms](https://github.com/huggingface/diffusers/blob/aab6de22c33cc01fb7bc81c0807d6109e2c998c9/examples/t2i_adapter/train_t2i_adapter_sdxl.py#L714) to the images and conditioning images.
+The training script begins by preparing the dataset. This includes [tokenizing](https://github.com/huggingface/diffusers/blob/aab6de22c33cc01fb7bc81c0807d6109e2c998c9/examples/t2i_adapter/train_t2i_adapter_sdxl.py#L674) the prompt and [applying transforms](https://github.com/huggingface/diffusers/blob/aab6de22c33cc01fb7bc81c0807d6109e2c998c9/examples/t2i_adapter/train_t2i_adapter_sdxl.py#L714) to the images and conditioning images.

 ```py
 conditioning_image_transforms = transforms.Compose(
--- a/docs/source/en/tutorials/inference_with_big_models.md
+++ b/docs/source/en/tutorials/inference_with_big_models.md
@@ -0,0 +1,139 @@
+<!--Copyright 2024 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Working with big models
+
+A modern diffusion model, like [Stable Diffusion XL (SDXL)](../using-diffusers/sdxl), is not just a single model, but a collection of multiple models. SDXL has four different model-level components:
+
+* A variational autoencoder (VAE)
+* Two text encoders
+* A UNet for denoising
+
+Usually, the text encoders and the denoiser are much larger compared to the VAE.
+
+As models get bigger and better, it’s possible your model is so big that even a single copy won’t fit in memory. But that doesn’t mean it can’t be loaded. If you have more than one GPU, there is more memory available to store your model. In this case, it’s better to split your model checkpoint into several smaller *checkpoint shards*.
+
+When a text encoder checkpoint has multiple shards, like [T5-xxl for SD3](https://huggingface.co/stabilityai/stable-diffusion-3-medium-diffusers/tree/main/text_encoder_3), it is automatically handled by the [Transformers](https://huggingface.co/docs/transformers/index) library as it is a required dependency of Diffusers when using the [`StableDiffusion3Pipeline`]. More specifically, Transformers will automatically handle the loading of multiple shards within the requested model class and get it ready so that inference can be performed.
+
+The denoiser checkpoint can also have multiple shards and supports inference thanks to the [Accelerate](https://huggingface.co/docs/accelerate/index) library.
+
+> [!TIP]
+> Refer to the [Handling big models for inference](https://huggingface.co/docs/accelerate/main/en/concept_guides/big_model_inference) guide for general guidance when working with big models that are hard to fit into memory.
+
+For example, let's save a sharded checkpoint for the [SDXL UNet](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/tree/main/unet):
+
+```python
+from diffusers import UNet2DConditionModel
+
+unet = UNet2DConditionModel.from_pretrained(
+    "stabilityai/stable-diffusion-xl-base-1.0", subfolder="unet"
+)
+unet.save_pretrained("sdxl-unet-sharded", max_shard_size="5GB")
+```
+
+The size of the fp32 variant of the SDXL UNet checkpoint is ~10.4GB. Set the `max_shard_size` parameter to 5GB to create 3 shards. After saving, you can load them in [`StableDiffusionXLPipeline`]:
+
+```python
+from diffusers import UNet2DConditionModel, StableDiffusionXLPipeline
+import torch
+
+unet = UNet2DConditionModel.from_pretrained(
+    "sayakpaul/sdxl-unet-sharded", torch_dtype=torch.float16
+)
+pipeline = StableDiffusionXLPipeline.from_pretrained(
+    "stabilityai/stable-diffusion-xl-base-1.0", unet=unet, torch_dtype=torch.float16
+).to("cuda")
+
+image = pipeline("a cute dog running on the grass", num_inference_steps=30).images[0]
+image.save("dog.png")
+```
+
+If placing all the model-level components on the GPU at once is not feasible, use [`~DiffusionPipeline.enable_model_cpu_offload`] to help you:
+
+```diff
+- pipeline.to("cuda")
++ pipeline.enable_model_cpu_offload()
+```
+
+In general, we recommend sharding when a checkpoint is more than 5GB (in fp32).
+
+## Device placement
+
+On distributed setups, you can run inference across multiple GPUs with Accelerate.
+
+> [!WARNING]
+> This feature is experimental and its APIs might change in the future.
+
+With Accelerate, you can use the `device_map` to determine how to distribute the models of a pipeline across multiple devices. This is useful in situations where you have more than one GPU.
+
+For example, if you have two 8GB GPUs, then using [`~DiffusionPipeline.enable_model_cpu_offload`] may not work so well because:
+
+* it only works on a single GPU
+* a single model might not fit on a single GPU ([`~DiffusionPipeline.enable_sequential_cpu_offload`] might work but it will be extremely slow and it is also limited to a single GPU)
+
+To make use of both GPUs, you can use the "balanced" device placement strategy which splits the models across all available GPUs.
+
+> [!WARNING]
+> Only the "balanced" strategy is supported at the moment, and we plan to support additional mapping strategies in the future.
+
+```diff
+from diffusers import DiffusionPipeline
+import torch
+
+pipeline = DiffusionPipeline.from_pretrained(
+-    "stable-diffusion-v1-5/stable-diffusion-v1-5", torch_dtype=torch.float16, use_safetensors=True,
++    "stable-diffusion-v1-5/stable-diffusion-v1-5", torch_dtype=torch.float16, use_safetensors=True, device_map="balanced"
+)
+image = pipeline("a dog").images[0]
+image
+```
+
+You can also pass a dictionary to enforce the maximum GPU memory that can be used on each device:
+
+```diff
+from diffusers import DiffusionPipeline
+import torch
+
+max_memory = {0:"1GB", 1:"1GB"}
+pipeline = DiffusionPipeline.from_pretrained(
+    "stable-diffusion-v1-5/stable-diffusion-v1-5",
+    torch_dtype=torch.float16,
+    use_safetensors=True,
+    device_map="balanced",
++   max_memory=max_memory
+)
+image = pipeline("a dog").images[0]
+image
+```
+
+If a device is not present in `max_memory`, then it will be completely ignored and will not participate in the device placement.
+
+By default, Diffusers uses the maximum memory of all devices. If the models don't fit on the GPUs, they are offloaded to the CPU. If the CPU doesn't have enough memory, then you might see an error. In that case, you could fall back to using [`~DiffusionPipeline.enable_sequential_cpu_offload`] and [`~DiffusionPipeline.enable_model_cpu_offload`].
+
+Call [`~DiffusionPipeline.reset_device_map`] to reset the `device_map` of a pipeline. This is also necessary if you want to use methods like `to()`, [`~DiffusionPipeline.enable_sequential_cpu_offload`], and [`~DiffusionPipeline.enable_model_cpu_offload`] on a pipeline that was device-mapped.
+
+```py
+pipeline.reset_device_map()
+```
+
+Once a pipeline has been device-mapped, you can also access its device map via `hf_device_map`:
+
+```py
+print(pipeline.hf_device_map)
+```
+
+An example device map would look like so:
+
+
+```bash
+{'unet': 1, 'vae': 1, 'safety_checker': 0, 'text_encoder': 0}
+```
--- a/docs/source/en/tutorials/using_peft_for_inference.md
+++ b/docs/source/en/tutorials/using_peft_for_inference.md
@@ -10,625 +10,218 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
 specific language governing permissions and limitations under the License.
 -->

-# LoRA
+[[open-in-colab]]

-[LoRA (Low-Rank Adaptation)](https://huggingface.co/papers/2106.09685) is a method for quickly training a model for a new task. It works by freezing the original model weights and adding a small number of *new* trainable parameters. This means it is significantly faster and cheaper to adapt an existing model to new tasks, such as generating images in a new style.
+# Load LoRAs for inference

-LoRA checkpoints are typically only a couple hundred MBs in size, so they're very lightweight and easy to store. Load these smaller set of weights into an existing base model with [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] and specify the file name.
+There are many adapter types (with [LoRAs](https://huggingface.co/docs/peft/conceptual_guides/adapter#low-rank-adaptation-lora) being the most popular) trained in different styles to achieve different effects. You can even combine multiple adapters to create new and unique images.

-<hfoptions id="usage">
-<hfoption id="text-to-image">
+In this tutorial, you'll learn how to easily load and manage adapters for inference with the 🤗 [PEFT](https://huggingface.co/docs/peft/index) integration in 🤗 Diffusers. You'll use LoRA as the main adapter technique, so you'll see the terms LoRA and adapter used interchangeably.

-```py
-import torch
-from diffusers import AutoPipelineForText2Image
+Let's first install all the required libraries.

-pipeline = AutoPipelineForText2Image.from_pretrained(
-    "stabilityai/stable-diffusion-xl-base-1.0",
-    torch_dtype=torch.float16
-).to("cuda")
-pipeline.load_lora_weights(
-    "ostris/super-cereal-sdxl-lora",
-    weight_name="cereal_box_sdxl_v1.safetensors",
-    adapter_name="cereal"
-)
-pipeline("bears, pizza bites").images[0]
+```bash
+!pip install -q transformers accelerate peft diffusers
 ```

-</hfoption>
-<hfoption id="text-to-video">
+Now, load a pipeline with a [Stable Diffusion XL (SDXL)](../api/pipelines/stable_diffusion/stable_diffusion_xl) checkpoint:

-```py
-import torch
-from diffusers import LTXConditionPipeline
-from diffusers.utils import export_to_video, load_image
-
-pipeline = LTXConditionPipeline.from_pretrained(
-    "Lightricks/LTX-Video-0.9.5", torch_dtype=torch.bfloat16
-)
-
-pipeline.load_lora_weights(
-    "Lightricks/LTX-Video-Cakeify-LoRA",
-    weight_name="ltxv_095_cakeify_lora.safetensors",
-    adapter_name="cakeify"
-)
-pipeline.set_adapters("cakeify")
-
-# use "CAKEIFY" to trigger the LoRA
-prompt = "CAKEIFY a person using a knife to cut a cake shaped like a Pikachu plushie"
-image = load_image("https://huggingface.co/Lightricks/LTX-Video-Cakeify-LoRA/resolve/main/assets/images/pikachu.png")
-
-video = pipeline(
-    prompt=prompt,
-    image=image,
-    width=576,
-    height=576,
-    num_frames=161,
-    decode_timestep=0.03,
-    decode_noise_scale=0.025,
-    num_inference_steps=50,
-).frames[0]
-export_to_video(video, "output.mp4", fps=26)
-```
-
-</hfoption>
-</hfoptions>
-
-The [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] method is the preferred way to load LoRA weights into the UNet and text encoder because it can handle cases where:
-
- the LoRA weights don't have separate UNet and text encoder identifiers
- the LoRA weights have separate UNet and text encoder identifiers
-
-The [`~loaders.PeftAdapterMixin.load_lora_adapter`] method is used to directly load a LoRA adapter at the *model-level*, as long as the model is a Diffusers model that is a subclass of [`PeftAdapterMixin`]. It builds and prepares the necessary model configuration for the adapter. This method also loads the LoRA adapter into the UNet.
-
-For example, if you're only loading a LoRA into the UNet, [`~loaders.PeftAdapterMixin.load_lora_adapter`] ignores the text encoder keys. Use the `prefix` parameter to filter and load the appropriate state dicts, `"unet"` to load.
-
-```py
-import torch
-from diffusers import AutoPipelineForText2Image
-
-pipeline = AutoPipelineForText2Image.from_pretrained(
-    "stabilityai/stable-diffusion-xl-base-1.0",
-    torch_dtype=torch.float16
-).to("cuda")
-pipeline.unet.load_lora_adapter(
-    "jbilcke-hf/sdxl-cinematic-1",
-    weight_name="pytorch_lora_weights.safetensors",
-    adapter_name="cinematic"
-    prefix="unet"
-)
-# use cnmt in the prompt to trigger the LoRA
-pipeline("A cute cnmt eating a slice of pizza, stunning color scheme, masterpiece, illustration").images[0]
-```
-
-## torch.compile
-
-[torch.compile](../optimization/torch2.0#torchcompile) speeds up inference by compiling the PyTorch model to use optimized kernels. Before compiling, the LoRA weights need to be fused into the base model and unloaded first.
-
-```py
-import torch
+```python
 from diffusers import DiffusionPipeline
-
-# load base model and LoRA
-pipeline = DiffusionPipeline.from_pretrained(
-    "stabilityai/stable-diffusion-xl-base-1.0",
-    torch_dtype=torch.float16
-).to("cuda")
-pipeline.load_lora_weights(
-    "ostris/ikea-instructions-lora-sdxl",
-    weight_name="ikea_instructions_xl_v1_5.safetensors",
-    adapter_name="ikea"
-)
-
-# activate LoRA and set adapter weight
-pipeline.set_adapters("ikea", adapter_weights=0.7)
-
-# fuse LoRAs and unload weights
-pipeline.fuse_lora(adapter_names=["ikea"], lora_scale=1.0)
-pipeline.unload_lora_weights()
-```
-
-Typically, the UNet is compiled because its the most compute intensive component of the pipeline.
-
-```py
-pipeline.unet.to(memory_format=torch.channels_last)
-pipeline.unet = torch.compile(pipeline.unet, mode="reduce-overhead", fullgraph=True)
-
-pipeline("A bowl of ramen shaped like a cute kawaii bear").images[0]
-```
-
-Refer to the [hotswapping](#hotswapping) section to learn how to avoid recompilation when working with compiled models and multiple LoRAs.
-
-## Weight scale
-
-The `scale` parameter is used to control how much of a LoRA to apply. A value of `0` is equivalent to only using the base model weights and a value of `1` is equivalent to fully using the LoRA.
-
-<hfoptions id="weight-scale">
-<hfoption id="simple use case">
-
-For simple use cases, you can pass `cross_attention_kwargs={"scale": 1.0}` to the pipeline.
-
-```py
 import torch
-from diffusers import AutoPipelineForText2Image

-pipeline = AutoPipelineForText2Image.from_pretrained(
-    "stabilityai/stable-diffusion-xl-base-1.0",
-    torch_dtype=torch.float16
-).to("cuda")
-pipeline.load_lora_weights(
-    "ostris/super-cereal-sdxl-lora",
-    weight_name="cereal_box_sdxl_v1.safetensors",
-    adapter_name="cereal"
-)
-pipeline("bears, pizza bites", cross_attention_kwargs={"scale": 1.0}).images[0]
+pipe_id = "stabilityai/stable-diffusion-xl-base-1.0"
+pipe = DiffusionPipeline.from_pretrained(pipe_id, torch_dtype=torch.float16).to("cuda")
 ```

-</hfoption>
-<hfoption id="finer control">
+Next, load a [CiroN2022/toy-face](https://huggingface.co/CiroN2022/toy-face) adapter with the [`~diffusers.loaders.StableDiffusionXLLoraLoaderMixin.load_lora_weights`] method. With the 🤗 PEFT integration, you can assign a specific `adapter_name` to the checkpoint, which lets you easily switch between different LoRA checkpoints. Let's call this adapter `"toy"`.

-> [!WARNING]
-> The [`~loaders.PeftAdapterMixin.set_adapters`] method only scales attention weights. If a LoRA has ResNets or down and upsamplers, these components keep a scale value of `1.0`.
+```python
+pipe.load_lora_weights("CiroN2022/toy-face", weight_name="toy_face_sdxl.safetensors", adapter_name="toy")
+```

-For finer control over each individual component of the UNet or text encoder, pass a dictionary instead. In the example below, the `"down"` block in the UNet is scaled by 0.9 and you can further specify in the `"up"` block the scales of the transformers in `"block_0"` and `"block_1"`. If a block like `"mid"` isn't specified, the default value `1.0` is used.
+Make sure to include the token `toy_face` in the prompt and then you can perform inference:

-```py
-import torch
-from diffusers import AutoPipelineForText2Image
+```python
+prompt = "toy_face of a hacker with a hoodie"

-pipeline = AutoPipelineForText2Image.from_pretrained(
-    "stabilityai/stable-diffusion-xl-base-1.0",
-    torch_dtype=torch.float16
-).to("cuda")
-pipeline.load_lora_weights(
-    "ostris/super-cereal-sdxl-lora",
-    weight_name="cereal_box_sdxl_v1.safetensors",
-    adapter_name="cereal"
-)
-scales = {
-    "text_encoder": 0.5,
-    "text_encoder_2": 0.5,
+lora_scale = 0.9
+image = pipe(
+    prompt, num_inference_steps=30, cross_attention_kwargs={"scale": lora_scale}, generator=torch.manual_seed(0)
+).images[0]
+image
+```
+
+![toy-face](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/peft_integration/diffusers_peft_lora_inference_8_1.png)
+
+With the `adapter_name` parameter, it is really easy to use another adapter for inference! Load the [nerijs/pixel-art-xl](https://huggingface.co/nerijs/pixel-art-xl) adapter that has been fine-tuned to generate pixel art images and call it `"pixel"`.
+
+The pipeline automatically sets the first loaded adapter (`"toy"`) as the active adapter, but you can activate the `"pixel"` adapter with the [`~loaders.peft.PeftAdapterMixin.set_adapters`] method:
+
+```python
+pipe.load_lora_weights("nerijs/pixel-art-xl", weight_name="pixel-art-xl.safetensors", adapter_name="pixel")
+pipe.set_adapters("pixel")
+```
+
+Make sure you include the token `pixel art` in your prompt to generate a pixel art image:
+
+```python
+prompt = "a hacker with a hoodie, pixel art"
+image = pipe(
+    prompt, num_inference_steps=30, cross_attention_kwargs={"scale": lora_scale}, generator=torch.manual_seed(0)
+).images[0]
+image
+```
+
+![pixel-art](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/peft_integration/diffusers_peft_lora_inference_12_1.png)
+
+<Tip>
+
+By default, if the most up-to-date versions of PEFT and Transformers are detected, `low_cpu_mem_usage` is set to `True` to speed up the loading time of LoRA checkpoints.
+
+</Tip>
+
+## Merge adapters
+
+You can also merge different adapter checkpoints for inference to blend their styles together.
+
+Once again, use the [`~loaders.peft.PeftAdapterMixin.set_adapters`] method to activate the `pixel` and `toy` adapters and specify the weights for how they should be merged.
+
+```python
+pipe.set_adapters(["pixel", "toy"], adapter_weights=[0.5, 1.0])
+```
+
+<Tip>
+
+LoRA checkpoints in the diffusion community are almost always obtained with [DreamBooth](https://huggingface.co/docs/diffusers/main/en/training/dreambooth). DreamBooth training often relies on "trigger" words in the input text prompts in order for the generation results to look as expected. When you combine multiple LoRA checkpoints, it's important to ensure the trigger words for the corresponding LoRA checkpoints are present in the input text prompts.
+
+</Tip>
+
+Remember to use the trigger words for [CiroN2022/toy-face](https://hf.co/CiroN2022/toy-face) and [nerijs/pixel-art-xl](https://hf.co/nerijs/pixel-art-xl) (these are found in their repositories) in the prompt to generate an image.
+
+```python
+prompt = "toy_face of a hacker with a hoodie, pixel art"
+image = pipe(
+    prompt, num_inference_steps=30, cross_attention_kwargs={"scale": 1.0}, generator=torch.manual_seed(0)
+).images[0]
+image
+```
+
+![toy-face-pixel-art](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/peft_integration/diffusers_peft_lora_inference_16_1.png)
+
+Impressive! As you can see, the model generated an image that mixed the characteristics of both adapters.
+
+> [!TIP]
+> Through its PEFT integration, Diffusers also offers more efficient merging methods which you can learn about in the [Merge LoRAs](../using-diffusers/merge_loras) guide!
+
+To return to only using one adapter, use the [`~loaders.peft.PeftAdapterMixin.set_adapters`] method to activate the `"toy"` adapter:
+
+```python
+pipe.set_adapters("toy")
+
+prompt = "toy_face of a hacker with a hoodie"
+lora_scale = 0.9
+image = pipe(
+    prompt, num_inference_steps=30, cross_attention_kwargs={"scale": lora_scale}, generator=torch.manual_seed(0)
+).images[0]
+image
+```
+
+Or, to disable all adapters entirely, use the [`~loaders.peft.PeftAdapterMixin.disable_lora`] method to return to the base model.
+
+```python
+pipe.disable_lora()
+
+prompt = "toy_face of a hacker with a hoodie"
+image = pipe(prompt, num_inference_steps=30, generator=torch.manual_seed(0)).images[0]
+image
+```
+
+![no-lora](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/peft_integration/diffusers_peft_lora_inference_20_1.png)
+
+### Customize adapters strength
+
+For even more customization, you can control how strongly the adapter affects each part of the pipeline. For this, pass a dictionary with the control strengths (called "scales") to [`~loaders.peft.PeftAdapterMixin.set_adapters`].
+
+For example, here's how you can turn on the adapter for the `down` parts, but turn it off for the `mid` and `up` parts:
+```python
+pipe.enable_lora()  # enable lora again, after we disabled it above
+prompt = "toy_face of a hacker with a hoodie, pixel art"
+adapter_weight_scales = { "unet": { "down": 1, "mid": 0, "up": 0} }
+pipe.set_adapters("pixel", adapter_weight_scales)
+image = pipe(prompt, num_inference_steps=30, generator=torch.manual_seed(0)).images[0]
+image
+```
+
+![block-lora-text-and-down](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/peft_integration/diffusers_peft_lora_inference_block_down.png)
+
+Let's see how turning off the `down` part and turning on the `mid` and `up` parts, respectively, changes the image.
+```python
+adapter_weight_scales = { "unet": { "down": 0, "mid": 1, "up": 0} }
+pipe.set_adapters("pixel", adapter_weight_scales)
+image = pipe(prompt, num_inference_steps=30, generator=torch.manual_seed(0)).images[0]
+image
+```
+
+![block-lora-text-and-mid](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/peft_integration/diffusers_peft_lora_inference_block_mid.png)
+
+```python
+adapter_weight_scales = { "unet": { "down": 0, "mid": 0, "up": 1} }
+pipe.set_adapters("pixel", adapter_weight_scales)
+image = pipe(prompt, num_inference_steps=30, generator=torch.manual_seed(0)).images[0]
+image
+```
+
+![block-lora-text-and-up](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/peft_integration/diffusers_peft_lora_inference_block_up.png)
+
+Looks cool!
+
+This is a really powerful feature. You can use it to control the adapter strengths down to the per-transformer level. And you can even use it for multiple adapters.
+```python
+adapter_weight_scales_toy = 0.5
+adapter_weight_scales_pixel = {
    "unet": {
-        "down": 0.9,
+        "down": 0.9,  # all transformers in the down-part will use scale 0.9
+        # "mid"  # because, in this example, "mid" is not given, all transformers in the mid part will use the default scale 1.0
        "up": {
-            "block_0": 0.6,
-            "block_1": [0.4, 0.8, 1.0],
+            "block_0": 0.6,  # all 3 transformers in the 0th block in the up-part will use scale 0.6
+            "block_1": [0.4, 0.8, 1.0],  # the 3 transformers in the 1st block in the up-part will use scales 0.4, 0.8 and 1.0 respectively
        }
    }
 }
-pipeline.set_adapters("cereal", scales)
-pipeline("bears, pizza bites").images[0]
+pipe.set_adapters(["toy", "pixel"], [adapter_weight_scales_toy, adapter_weight_scales_pixel])
+image = pipe(prompt, num_inference_steps=30, generator=torch.manual_seed(0)).images[0]
+image
 ```

-</hfoption>
-</hfoptions>
+![block-lora-mixed](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/peft_integration/diffusers_peft_lora_inference_block_mixed.png)

-## Hotswapping
+## Manage adapters

-Hotswapping LoRAs is an efficient way to work with multiple LoRAs while avoiding accumulating memory from multiple calls to [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] and in some cases, recompilation, if a model is compiled. This workflow requires a loaded LoRA because the new LoRA weights are swapped in place for the existing loaded LoRA.
+You have attached multiple adapters in this tutorial, and if you're feeling a bit lost on what adapters have been attached to the pipeline's components, use the [`~diffusers.loaders.StableDiffusionLoraLoaderMixin.get_active_adapters`] method to check the list of active adapters:

 ```py
-import torch
-from diffusers import DiffusionPipeline
-
-# load base model and LoRAs
-pipeline = DiffusionPipeline.from_pretrained(
-    "stabilityai/stable-diffusion-xl-base-1.0",
-    torch_dtype=torch.float16
-).to("cuda")
-pipeline.load_lora_weights(
-    "ostris/ikea-instructions-lora-sdxl",
-    weight_name="ikea_instructions_xl_v1_5.safetensors",
-    adapter_name="ikea"
-)
+active_adapters = pipe.get_active_adapters()
+active_adapters
+["toy", "pixel"]
 ```

-> [!WARNING]
-> Hotswapping is unsupported for LoRAs that target the text encoder.
-
-Set `hotswap=True` in [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] to swap the second LoRA. Use the `adapter_name` parameter to indicate which LoRA to swap (`default_0` is the default name).
+You can also get the active adapters of each pipeline component with [`~diffusers.loaders.StableDiffusionLoraLoaderMixin.get_list_adapters`]:

 ```py
-pipeline.load_lora_weights(
-    "lordjia/by-feng-zikai",
-    hotswap=True,
-    adapter_name="ikea"
-)
+list_adapters_component_wise = pipe.get_list_adapters()
+list_adapters_component_wise
+{"text_encoder": ["toy", "pixel"], "unet": ["toy", "pixel"], "text_encoder_2": ["toy", "pixel"]}
 ```

-### Compiled models
-
-For compiled models, use [`~loaders.lora_base.LoraBaseMixin.enable_lora_hotswap`] to avoid recompilation when hotswapping LoRAs. This method should be called *before* loading the first LoRA and `torch.compile` should be called *after* loading the first LoRA.
-
-> [!TIP]
-> The [`~loaders.lora_base.LoraBaseMixin.enable_lora_hotswap`] method isn't always necessary if the second LoRA targets the identical LoRA ranks and scales as the first LoRA.
-
-Within [`~loaders.lora_base.LoraBaseMixin.enable_lora_hotswap`], the `target_rank` parameter is important for setting the rank for all LoRA adapters. Setting it to `max_rank` sets it to the highest value. For LoRAs with different ranks, you set it to a higher rank value. The default rank value is 128.
+The [`~loaders.peft.PeftAdapterMixin.delete_adapters`] function completely removes an adapter and its LoRA layers from a model.

 ```py
-import torch
-from diffusers import DiffusionPipeline
-
-# load base model and LoRAs
-pipeline = DiffusionPipeline.from_pretrained(
-    "stabilityai/stable-diffusion-xl-base-1.0",
-    torch_dtype=torch.float16
-).to("cuda")
-# 1. enable_lora_hotswap
-pipeline.enable_lora_hotswap(target_rank=max_rank)
-pipeline.load_lora_weights(
-    "ostris/ikea-instructions-lora-sdxl",
-    weight_name="ikea_instructions_xl_v1_5.safetensors",
-    adapter_name="ikea"
-)
-# 2. torch.compile
-pipeline.unet = torch.compile(pipeline.unet, mode="reduce-overhead", fullgraph=True)
-
-# 3. hotswap
-pipeline.load_lora_weights(
-    "lordjia/by-feng-zikai",
-    hotswap=True,
-    adapter_name="ikea"
-)
+pipe.delete_adapters("toy")
+pipe.get_active_adapters()
+["pixel"]
 ```

-> [!TIP]
-> Move your code inside the `with torch._dynamo.config.patch(error_on_recompile=True)` context manager to detect if a model was recompiled. If a model is recompiled despite following all the steps above, please open an [issue](https://github.com/huggingface/diffusers/issues) with a reproducible example.
+## PeftInputAutocastDisableHook

-There are still scenarios where recompulation is unavoidable, such as when the hotswapped LoRA targets more layers than the initial adapter. Try to load the LoRA that targets the most layers *first*. For more details about this limitation, refer to the PEFT [hotswapping](https://huggingface.co/docs/peft/main/en/package_reference/hotswap#peft.utils.hotswap.hotswap_adapter) docs.
-
-## Merge
-
-The weights from each LoRA can be merged together to produce a blend of multiple existing styles. There are several methods for merging LoRAs, each of which differ in *how* the weights are merged (may affect generation quality).
-
-### set_adapters
-
-The [`~loaders.PeftAdapterMixin.set_adapters`] method merges LoRAs by concatenating their weighted matrices. Pass the LoRA names to [`~loaders.PeftAdapterMixin.set_adapters`] and use the `adapter_weights` parameter to control the scaling of each LoRA. For example, if `adapter_weights=[0.5, 0.5]`, the output is an average of both LoRAs.
-
-> [!TIP]
-> The `"scale"` parameter determines how much of the merged LoRA to apply. See the [Weight scale](#weight-scale) section for more details.
-
-```py
-import torch
-from diffusers import DiffusionPipeline
-
-pipeline = DiffusionPipeline.from_pretrained(
-    "stabilityai/stable-diffusion-xl-base-1.0",
-    torch_dtype=torch.float16
-).to("cuda")
-pipeline.load_lora_weights(
-    "ostris/ikea-instructions-lora-sdxl",
-    weight_name="ikea_instructions_xl_v1_5.safetensors",
-    adapter_name="ikea"
-)
-pipeline.load_lora_weights(
-    "lordjia/by-feng-zikai",
-    weight_name="fengzikai_v1.0_XL.safetensors",
-    adapter_name="feng"
-)
-pipeline.set_adapters(["ikea", "feng"], adapter_weights=[0.7, 0.8])
-# use by Feng Zikai to activate the lordjia/by-feng-zikai LoRA
-pipeline("A bowl of ramen shaped like a cute kawaii bear, by Feng Zikai", cross_attention_kwargs={"scale": 1.0}).images[0]
-```
-
-<div class="flex justify-center">
-    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/lora_merge_set_adapters.png"/>
-</div>
-
-### add_weighted_adapter
-
-> [!TIP]
-> This is an experimental method and you can refer to PEFTs [Model merging](https://huggingface.co/docs/peft/developer_guides/model_merging) for more details. Take a look at this [issue](https://github.com/huggingface/diffusers/issues/6892) if you're interested in the motivation and design behind this integration.
-
-The [`~peft.LoraModel.add_weighted_adapter`] method enables more efficient merging methods like [TIES](https://huggingface.co/papers/2306.01708) or [DARE](https://huggingface.co/papers/2311.03099). These merging methods remove redundant and potentially interfering parameters from merged models. Keep in mind the LoRA ranks need to have identical ranks to be merged.
-
-Make sure the latest stable version of Diffusers and PEFT is installed.
-
-```bash
-pip install -U -q diffusers peft
-```
-
-Load a UNET that corresponds to the LoRA UNet.
-
-```py
-import copy
-import torch
-from diffusers import AutoModel, DiffusionPipeline
-from peft import get_peft_model, LoraConfig, PeftModel
-
-unet = AutoModel.from_pretrained(
-    "stabilityai/stable-diffusion-xl-base-1.0",
-    torch_dtype=torch.float16,
-    use_safetensors=True,
-    variant="fp16",
-    subfolder="unet",
-).to("cuda")
-```
-
-Load a pipeline, pass the UNet to it, and load a LoRA.
-
-```py
-pipeline = DiffusionPipeline.from_pretrained(
-    "stabilityai/stable-diffusion-xl-base-1.0",
-    variant="fp16",
-    torch_dtype=torch.float16,
-    unet=unet
-).to("cuda")
-pipeline.load_lora_weights(
-    "ostris/ikea-instructions-lora-sdxl",
-    weight_name="ikea_instructions_xl_v1_5.safetensors",
-    adapter_name="ikea"
-)
-```
-
-Create a [`~peft.PeftModel`] from the LoRA checkpoint by combining the first UNet you loaded and the LoRA UNet from the pipeline.
-
-```py
-sdxl_unet = copy.deepcopy(unet)
-ikea_peft_model = get_peft_model(
-    sdxl_unet,
-    pipeline.unet.peft_config["ikea"],
-    adapter_name="ikea"
-)
-
-original_state_dict = {f"base_model.model.{k}": v for k, v in pipeline.unet.state_dict().items()}
-ikea_peft_model.load_state_dict(original_state_dict, strict=True)
-```
-
-> [!TIP]
-> You can save and reuse the `ikea_peft_model` by pushing it to the Hub as shown below.
-> ```py
-> ikea_peft_model.push_to_hub("ikea_peft_model", token=TOKEN)
-> ```
-
-Repeat this process and create a [`~peft.PeftModel`] for the second LoRA.
-
-```py
-pipeline.delete_adapters("ikea")
-sdxl_unet.delete_adapters("ikea")
-
-pipeline.load_lora_weights(
-    "lordjia/by-feng-zikai",
-    weight_name="fengzikai_v1.0_XL.safetensors",
-    adapter_name="feng"
-)
-pipeline.set_adapters(adapter_names="feng")
-
-feng_peft_model = get_peft_model(
-    sdxl_unet,
-    pipeline.unet.peft_config["feng"],
-    adapter_name="feng"
-)
-
-original_state_dict = {f"base_model.model.{k}": v for k, v in pipe.unet.state_dict().items()}
-feng_peft_model.load_state_dict(original_state_dict, strict=True)
-```
-
-Load a base UNet model and load the adapters.
-
-```py
-base_unet = AutoModel.from_pretrained(
-    "stabilityai/stable-diffusion-xl-base-1.0",
-    torch_dtype=torch.float16,
-    use_safetensors=True,
-    variant="fp16",
-    subfolder="unet",
-).to("cuda")
-
-model = PeftModel.from_pretrained(
-    base_unet,
-    "stevhliu/ikea_peft_model",
-    use_safetensors=True,
-    subfolder="ikea",
-    adapter_name="ikea"
-)
-model.load_adapter(
-    "stevhliu/feng_peft_model",
-    use_safetensors=True,
-    subfolder="feng",
-    adapter_name="feng"
-)
-```
-
-Merge the LoRAs with [`~peft.LoraModel.add_weighted_adapter`] and specify how you want to merge them with `combination_type`. The example below uses the `"dare_linear"` method (refer to this [blog post](https://huggingface.co/blog/peft_merging) to learn more about these merging methods), which randomly prunes some weights and then performs a weighted sum of the tensors based on the set weightage of each LoRA in `weights`.
-
-Activate the merged LoRAs with [`~loaders.PeftAdapterMixin.set_adapters`].
-
-```py
-model.add_weighted_adapter(
-    adapters=["ikea", "feng"],
-    combination_type="dare_linear",
-    weights=[1.0, 1.0],
-    adapter_name="ikea-feng"
-)
-model.set_adapters("ikea-feng")
-
-pipeline = DiffusionPipeline.from_pretrained(
-    "stabilityai/stable-diffusion-xl-base-1.0",
-    unet=model,
-    variant="fp16",
-    torch_dtype=torch.float16,
-).to("cuda")
-pipeline("A bowl of ramen shaped like a cute kawaii bear, by Feng Zikai").images[0]
-```
-
-<div class="flex justify-center">
-    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/ikea-feng-dare-linear.png"/>
-</div>
-
-### fuse_lora
-
-The [`~loaders.lora_base.LoraBaseMixin.fuse_lora`] method fuses the LoRA weights directly with the original UNet and text encoder weights of the underlying model. This reduces the overhead of loading the underlying model for each LoRA because it only loads the model once, which lowers memory usage and increases inference speed.
-
-```py
-import torch
-from diffusers import DiffusionPipeline
-
-pipeline = DiffusionPipeline.from_pretrained(
-    "stabilityai/stable-diffusion-xl-base-1.0",
-    torch_dtype=torch.float16
-).to("cuda")
-pipeline.load_lora_weights(
-    "ostris/ikea-instructions-lora-sdxl",
-    weight_name="ikea_instructions_xl_v1_5.safetensors",
-    adapter_name="ikea"
-)
-pipeline.load_lora_weights(
-    "lordjia/by-feng-zikai",
-    weight_name="fengzikai_v1.0_XL.safetensors",
-    adapter_name="feng"
-)
-pipeline.set_adapters(["ikea", "feng"], adapter_weights=[0.7, 0.8])
-```
-
-Call [`~loaders.lora_base.LoraBaseMixin.fuse_lora`] to fuse them. The `lora_scale` parameter controls how much to scale the output by with the LoRA weights. It is important to make this adjustment now because passing `scale` to `cross_attention_kwargs` won't work in the pipeline.
-
-```py
-pipeline.fuse_lora(adapter_names=["ikea", "feng"], lora_scale=1.0)
-```
-
-Unload the LoRA weights since they're already fused with the underlying model. Save the fused pipeline with either [`~DiffusionPipeline.save_pretrained`] to save it locally or [`~PushToHubMixin.push_to_hub`] to save it to the Hub.
-
-<hfoptions id="save">
-<hfoption id="save locally">
-
-```py
-pipeline.unload_lora_weights()
-pipeline.save_pretrained("path/to/fused-pipeline")
-```
-
-</hfoption>
-<hfoption id="save to Hub">
-
-```py
-pipeline.unload_lora_weights()
-pipeline.push_to_hub("fused-ikea-feng")
-```
-
-</hfoption>
-</hfoptions>
-
-The fused pipeline can now be quickly loaded for inference without requiring each LoRA to be separately loaded.
-
-```py
-pipeline = DiffusionPipeline.from_pretrained(
-    "username/fused-ikea-feng", torch_dtype=torch.float16,
-).to("cuda")
-pipeline("A bowl of ramen shaped like a cute kawaii bear, by Feng Zikai").images[0]
-```
-
-Use [`~loaders.LoraLoaderMixin.unfuse_lora`] to restore the underlying models weights, for example, if you want to use a different `lora_scale` value. You can only unfuse if there is a single LoRA fused. For example, it won't work with the pipeline from above because there are multiple fused LoRAs. In these cases, you'll need to reload the entire model.
-
-```py
-pipeline.unfuse_lora()
-```
-
-<div class="flex justify-center">
-    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/fuse_lora.png"/>
-</div>
-
-## Manage
-
-Diffusers provides several methods to help you manage working with LoRAs. These methods can be especially useful if you're working with multiple LoRAs.
-
-### set_adapters
-
-[`~loaders.PeftAdapterMixin.set_adapters`] also activates the current LoRA to use if there are multiple active LoRAs. This allows you to switch between different LoRAs by specifying their name.
-
-```py
-import torch
-from diffusers import DiffusionPipeline
-
-pipeline = DiffusionPipeline.from_pretrained(
-    "stabilityai/stable-diffusion-xl-base-1.0",
-    torch_dtype=torch.float16
-).to("cuda")
-pipeline.load_lora_weights(
-    "ostris/ikea-instructions-lora-sdxl",
-    weight_name="ikea_instructions_xl_v1_5.safetensors",
-    adapter_name="ikea"
-)
-pipeline.load_lora_weights(
-    "lordjia/by-feng-zikai",
-    weight_name="fengzikai_v1.0_XL.safetensors",
-    adapter_name="feng"
-)
-# activates the feng LoRA instead of the ikea LoRA
-pipeline.set_adapters("feng")
-```
-
-### save_lora_adapter
-
-Save an adapter with [`~loaders.PeftAdapterMixin.save_lora_adapter`].
-
-```py
-import torch
-from diffusers import AutoPipelineForText2Image
-
-pipeline = AutoPipelineForText2Image.from_pretrained(
-    "stabilityai/stable-diffusion-xl-base-1.0",
-    torch_dtype=torch.float16
-).to("cuda")
-pipeline.unet.load_lora_adapter(
-    "jbilcke-hf/sdxl-cinematic-1",
-    weight_name="pytorch_lora_weights.safetensors",
-    adapter_name="cinematic"
-    prefix="unet"
-)
-pipeline.save_lora_adapter("path/to/save", adapter_name="cinematic")
-```
-
-### unload_lora_weights
-
-The [`~loaders.lora_base.LoraBaseMixin.unload_lora_weights`] method unloads any LoRA weights in the pipeline to restore the underlying model weights.
-
-```py
-pipeline.unload_lora_weights()
-```
-
-### disable_lora
-
-The [`~loaders.PeftAdapterMixin.disable_lora`] method disables all LoRAs (but they're still kept on the pipeline) and restores the pipeline to the underlying model weights.
-
-```py
-pipeline.disable_lora()
-```
-
-### get_active_adapters
-
-The [`~loaders.lora_base.LoraBaseMixin.get_active_adapters`] method returns a list of active LoRAs attached to a pipeline.
-
-```py
-pipeline.get_active_adapters()
-["cereal", "ikea"]
-```
-
-### get_list_adapters
-
-The [`~loaders.lora_base.LoraBaseMixin.get_list_adapters`] method returns the active LoRAs for each component in the pipeline.
-
-```py
-pipeline.get_list_adapters()
-{"unet": ["cereal", "ikea"], "text_encoder_2": ["cereal"]}
-```
-
-### delete_adapters
-
-The [`~loaders.PeftAdapterMixin.delete_adapters`] method completely removes a LoRA and its layers from a model.
-
-```py
-pipeline.delete_adapters("ikea")
-```
-
-## Resources
-
-Browse the [LoRA Studio](https://lorastudio.co/models) for different LoRAs to use or you can upload your favorite LoRAs from Civitai to the Hub with the Space below.
-
-<iframe
-	src="https://multimodalart-civitai-to-hf.hf.space"
-	frameborder="0"
-	width="850"
-	height="450"
-></iframe>
-
-You can find additional LoRAs in the [FLUX LoRA the Explorer](https://huggingface.co/spaces/multimodalart/flux-lora-the-explorer) and [LoRA the Explorer](https://huggingface.co/spaces/multimodalart/LoraTheExplorer) Spaces.
+[[autodoc]] hooks.layerwise_casting.PeftInputAutocastDisableHook
--- a/docs/source/en/using-diffusers/callback.md
+++ b/docs/source/en/using-diffusers/callback.md
@@ -157,84 +157,6 @@ pipeline(
 )
 ```

-## IP Adapter Cutoff
-
-IP Adapter is an image prompt adapter that can be used for diffusion models without any changes to the underlying model. We can use the IP Adapter Cutoff Callback to disable the IP Adapter after a certain number of steps. To set up the callback, you need to specify the number of denoising steps after which the callback comes into effect. You can do so by using either one of these two arguments:
-
- `cutoff_step_ratio`: Float number with the ratio of the steps.
- `cutoff_step_index`: Integer number with the exact number of the step.
-
-We need to download the diffusion model and load the ip_adapter for it as follows:
-
-```py
-from diffusers import AutoPipelineForText2Image
-from diffusers.utils import load_image
-import torch
-
-pipeline = AutoPipelineForText2Image.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16).to("cuda")
-pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="sdxl_models", weight_name="ip-adapter_sdxl.bin")
-pipeline.set_ip_adapter_scale(0.6)
-```
-The setup for the callback should look something like this:
-
-```py
-
-from diffusers import AutoPipelineForText2Image
-from diffusers.callbacks import IPAdapterScaleCutoffCallback
-from diffusers.utils import load_image
-import torch
- 
-
-pipeline = AutoPipelineForText2Image.from_pretrained(
-    "stabilityai/stable-diffusion-xl-base-1.0", 
-    torch_dtype=torch.float16
-).to("cuda")
-
-
-pipeline.load_ip_adapter(
-    "h94/IP-Adapter", 
-    subfolder="sdxl_models", 
-    weight_name="ip-adapter_sdxl.bin"
-)
-
-pipeline.set_ip_adapter_scale(0.6)
-
-
-callback = IPAdapterScaleCutoffCallback(
-    cutoff_step_ratio=None, 
-    cutoff_step_index=5
-)
-
-image = load_image(
-    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/ip_adapter_diner.png"
-)
-
-generator = torch.Generator(device="cuda").manual_seed(2628670641)
-
-images = pipeline(
-    prompt="a tiger sitting in a chair drinking orange juice",
-    ip_adapter_image=image,
-    negative_prompt="deformed, ugly, wrong proportion, low res, bad anatomy, worst quality, low quality",
-    generator=generator,
-    num_inference_steps=50,
-    callback_on_step_end=callback,
-).images
-
-images[0].save("custom_callback_img.png")
-```
-
-<div class="flex gap-4">
-  <div>
-    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/without_callback.png" alt="generated image of a tiger sitting in a chair drinking orange juice" />
-    <figcaption class="mt-2 text-center text-sm text-gray-500">without IPAdapterScaleCutoffCallback</figcaption>
-  </div>
-  <div>
-    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/with_callback2.png" alt="generated image of a tiger sitting in a chair drinking orange juice with ip adapter callback" />
-    <figcaption class="mt-2 text-center text-sm text-gray-500">with IPAdapterScaleCutoffCallback</figcaption>
-  </div>
-</div>
-
-
 ## Display image after each generation step

 > [!TIP]
--- a/docs/source/en/using-diffusers/controlnet.md
+++ b/docs/source/en/using-diffusers/controlnet.md
@@ -12,28 +12,46 @@ specific language governing permissions and limitations under the License.

 # ControlNet

-[ControlNet](https://huggingface.co/papers/2302.05543) is an adapter that enables controllable generation such as generating an image of a cat in a *specific pose* or following the lines in a sketch of a *specific* cat. It works by adding a smaller network of "zero convolution" layers and progressively training these to avoid disrupting with the original model. The original model parameters are frozen to avoid retraining it.
+ControlNet is a type of model for controlling image diffusion models by conditioning the model with an additional input image. There are many types of conditioning inputs (canny edge, user sketching, human pose, depth, and more) you can use to control a diffusion model. This is hugely useful because it affords you greater control over image generation, making it easier to generate specific images without experimenting with different text prompts or denoising values as much.

-A ControlNet is conditioned on extra visual information or "structural controls" (canny edge, depth maps, human pose, etc.) that can be combined with text prompts to generate images that are guided by the visual input.
+<Tip>

-> [!TIP]
-> ControlNets are available to many models such as [Flux](../api/pipelines/controlnet_flux), [Hunyuan-DiT](../api/pipelines/controlnet_hunyuandit), [Stable Diffusion 3](../api/pipelines/controlnet_sd3), and more. The examples in this guide use Flux and Stable Diffusion XL.
+Check out Section 3.5 of the [ControlNet](https://huggingface.co/papers/2302.05543) paper v1 for a list of ControlNet implementations on various conditioning inputs. You can find the official Stable Diffusion ControlNet conditioned models on [lllyasviel](https://huggingface.co/lllyasviel)'s Hub profile, and more [community-trained](https://huggingface.co/models?other=stable-diffusion&other=controlnet) ones on the Hub.

-Load a ControlNet conditioned on a specific control, such as canny edge, and pass it to the pipeline in [`~DiffusionPipeline.from_pretrained`].
+For Stable Diffusion XL (SDXL) ControlNet models, you can find them on the 🤗 [Diffusers](https://huggingface.co/diffusers) Hub organization, or you can browse [community-trained](https://huggingface.co/models?other=stable-diffusion-xl&other=controlnet) ones on the Hub.

-<hfoptions id="usage">
-<hfoption id="text-to-image">
+</Tip>

-Generate a canny image with [opencv-python](https://github.com/opencv/opencv-python).
+A ControlNet model has two sets of weights (or blocks) connected by a zero-convolution layer:
+
+- a *locked copy* keeps everything a large pretrained diffusion model has learned
+- a *trainable copy* is trained on the additional conditioning input
+
+Since the locked copy preserves the pretrained model, training and implementing a ControlNet on a new conditioning input is as fast as finetuning any other model because you aren't training the model from scratch.
+
+This guide will show you how to use ControlNet for text-to-image, image-to-image, inpainting, and more! There are many types of ControlNet conditioning inputs to choose from, but in this guide we'll only focus on several of them. Feel free to experiment with other conditioning inputs!
+
+Before you begin, make sure you have the following libraries installed:

 ```py
+# uncomment to install the necessary libraries in Colab
+#!pip install -q diffusers transformers accelerate opencv-python
+```
+
+## Text-to-image
+
+For text-to-image, you normally pass a text prompt to the model. But with ControlNet, you can specify an additional conditioning input. Let's condition the model with a canny image, a white outline of an image on a black background. This way, the ControlNet can use the canny image as a control to guide the model to generate an image with the same outline.
+
+Load an image and use the [opencv-python](https://github.com/opencv/opencv-python) library to extract the canny image:
+
+```py
+from diffusers.utils import load_image, make_image_grid
+from PIL import Image
 import cv2
 import numpy as np
-from PIL import Image
-from diffusers.utils import load_image

 original_image = load_image(
-    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/non-enhanced-prompt.png"
+    "https://hf.co/datasets/huggingface/documentation-images/resolve/main/diffusers/input_image_vermeer.png"
 )

 image = np.array(original_image)
@@ -47,300 +65,523 @@ image = np.concatenate([image, image, image], axis=2)
 canny_image = Image.fromarray(image)
 ```

-Pass the canny image to the pipeline. Use the `controlnet_conditioning_scale` parameter to determine how much weight to assign to the control.
-
-```py
-import torch
-from diffusers.utils import load_image
-from diffusers import FluxControlNetPipeline, FluxControlNetModel
-
-controlnet = FluxControlNetModel.from_pretrained(
-    "InstantX/FLUX.1-dev-Controlnet-Canny", torch_dtype=torch.bfloat16
-)
-pipeline = FluxControlNetPipeline.from_pretrained(
-    "black-forest-labs/FLUX.1-dev", controlnet=controlnet, torch_dtype=torch.bfloat16
-).to("cuda")
-
-prompt = """
-A photorealistic overhead image of a cat reclining sideways in a flamingo pool floatie holding a margarita. 
-The cat is floating leisurely in the pool and completely relaxed and happy.
-"""
-
-pipeline(
-    prompt, 
-    control_image=canny_image,
-    controlnet_conditioning_scale=0.5,
-    num_inference_steps=50, 
-    guidance_scale=3.5,
-).images[0]
-```
-
-<div style="display: flex; gap: 10px; justify-content: space-around; align-items: flex-end;">
-  <figure>
-    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/non-enhanced-prompt.png" width="300" alt="Generated image (prompt only)"/>
-    <figcaption style="text-align: center;">original image</figcaption>
-  </figure>
-  <figure>
-    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/canny-cat.png" width="300" alt="Control image (Canny edges)"/>
-    <figcaption style="text-align: center;">canny image</figcaption>
-  </figure>
-  <figure>
-    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/canny-cat-generated.png" width="300" alt="Generated image (ControlNet + prompt)"/>
-    <figcaption style="text-align: center;">generated image</figcaption>
-  </figure>
+<div class="flex gap-4">
+  <div>
+    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/input_image_vermeer.png"/>
+    <figcaption class="mt-2 text-center text-sm text-gray-500">original image</figcaption>
+  </div>
+  <div>
+    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/vermeer_canny_edged.png"/>
+    <figcaption class="mt-2 text-center text-sm text-gray-500">canny image</figcaption>
+  </div>
 </div>

+Next, load a ControlNet model conditioned on canny edge detection and pass it to the [`StableDiffusionControlNetPipeline`]. Use the faster [`UniPCMultistepScheduler`] and enable model offloading to speed up inference and reduce memory usage.

-</hfoption>
-<hfoption id="image-to-image">
+```py
+from diffusers import StableDiffusionControlNetPipeline, ControlNetModel, UniPCMultistepScheduler
+import torch

-Generate a depth map with a depth estimation pipeline from Transformers.
+controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny", torch_dtype=torch.float16, use_safetensors=True)
+pipe = StableDiffusionControlNetPipeline.from_pretrained(
+    "stable-diffusion-v1-5/stable-diffusion-v1-5", controlnet=controlnet, torch_dtype=torch.float16, use_safetensors=True
+)
+
+pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
+pipe.enable_model_cpu_offload()
+```
+
+Now pass your prompt and canny image to the pipeline:
+
+```py
+output = pipe(
+    "the mona lisa", image=canny_image
+).images[0]
+make_image_grid([original_image, canny_image, output], rows=1, cols=3)
+```
+
+<div class="flex justify-center">
+  <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/controlnet-text2img.png"/>
+</div>
+
+## Image-to-image
+
+For image-to-image, you'd typically pass an initial image and a prompt to the pipeline to generate a new image. With ControlNet, you can pass an additional conditioning input to guide the model. Let's condition the model with a depth map, an image which contains spatial information. This way, the ControlNet can use the depth map as a control to guide the model to generate an image that preserves spatial information.
+
+You'll use the [`StableDiffusionControlNetImg2ImgPipeline`] for this task, which is different from the [`StableDiffusionControlNetPipeline`] because it allows you to pass an initial image as the starting point for the image generation process.
+
+Load an image and use the `depth-estimation` [`~transformers.Pipeline`] from 🤗 Transformers to extract the depth map of an image:

 ```py
 import torch
 import numpy as np
-from PIL import Image
-from transformers import DPTImageProcessor, DPTForDepthEstimation
-from diffusers import ControlNetModel, StableDiffusionXLControlNetImg2ImgPipeline, AutoencoderKL
-from diffusers.utils import load_image

+from transformers import pipeline
+from diffusers.utils import load_image, make_image_grid

-depth_estimator = DPTForDepthEstimation.from_pretrained("Intel/dpt-hybrid-midas").to("cuda")
-feature_extractor = DPTImageProcessor.from_pretrained("Intel/dpt-hybrid-midas")
-
-def get_depth_map(image):
-    image = feature_extractor(images=image, return_tensors="pt").pixel_values.to("cuda")
-    with torch.no_grad(), torch.autocast("cuda"):
-        depth_map = depth_estimator(image).predicted_depth
-
-    depth_map = torch.nn.functional.interpolate(
-        depth_map.unsqueeze(1),
-        size=(1024, 1024),
-        mode="bicubic",
-        align_corners=False,
-    )
-    depth_min = torch.amin(depth_map, dim=[1, 2, 3], keepdim=True)
-    depth_max = torch.amax(depth_map, dim=[1, 2, 3], keepdim=True)
-    depth_map = (depth_map - depth_min) / (depth_max - depth_min)
-    image = torch.cat([depth_map] * 3, dim=1)
-    image = image.permute(0, 2, 3, 1).cpu().numpy()[0]
-    image = Image.fromarray((image * 255.0).clip(0, 255).astype(np.uint8))
-    return image
-
-depth_image = get_depth_map(image)
-```
-
-Pass the depth map to the pipeline. Use the `controlnet_conditioning_scale` parameter to determine how much weight to assign to the control.
-
-```py
-controlnet = ControlNetModel.from_pretrained(
-    "diffusers/controlnet-depth-sdxl-1.0-small",
-    torch_dtype=torch.float16,
-)
-vae = AutoencoderKL.from_pretrained("madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16)
-pipeline = StableDiffusionXLControlNetImg2ImgPipeline.from_pretrained(
-    "stabilityai/stable-diffusion-xl-base-1.0",
-    controlnet=controlnet,
-    vae=vae,
-    torch_dtype=torch.float16,
-).to("cuda")
-
-prompt = """
-A photorealistic overhead image of a cat reclining sideways in a flamingo pool floatie holding a margarita. 
-The cat is floating leisurely in the pool and completely relaxed and happy.
-"""
 image = load_image(
-    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/non-enhanced-prompt.png"
-).resize((1024, 1024))
-controlnet_conditioning_scale = 0.5 
-pipeline(
-    prompt,
-    image=image,
-    control_image=depth_image,
-    controlnet_conditioning_scale=controlnet_conditioning_scale,
-    strength=0.99,
-    num_inference_steps=100,
-).images[0]
-```
-
-<div style="display: flex; gap: 10px; justify-content: space-around; align-items: flex-end;">
-  <figure>
-    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/non-enhanced-prompt.png" width="300" alt="Generated image (prompt only)"/>
-    <figcaption style="text-align: center;">original image</figcaption>
-  </figure>
-  <figure>
-    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/sdxl_depth_image.png" width="300" alt="Control image (Canny edges)"/>
-    <figcaption style="text-align: center;">depth map</figcaption>
-  </figure>
-  <figure> 
-    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/sdxl_depth_cat.png" width="300" alt="Generated image (ControlNet + prompt)"/>
-    <figcaption style="text-align: center;">generated image</figcaption>
-  </figure>
-</div>
-
-</hfoption>
-<hfoption id="inpainting">
-
-Generate a mask image and convert it to a tensor to mark the pixels in the original image as masked if the corresponding pixel in the mask image is over a certain threshold.
-
-```py
-import cv2
-import torch
-import numpy as np
-from PIL import Image
-from diffusers.utils import load_image
-from diffusers import StableDiffusionXLControlNetInpaintPipeline, ControlNetModel
-
-init_image = load_image(
-    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/non-enhanced-prompt.png"
+    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/controlnet-img2img.jpg"
 )
-init_image = init_image.resize((1024, 1024))
-mask_image = load_image(
-    "/content/cat_mask.png"
-)
-mask_image = mask_image.resize((1024, 1024))

-def make_canny_condition(image):
+def get_depth_map(image, depth_estimator):
+    image = depth_estimator(image)["depth"]
    image = np.array(image)
-    image = cv2.Canny(image, 100, 200)
    image = image[:, :, None]
    image = np.concatenate([image, image, image], axis=2)
-    image = Image.fromarray(image)
-    return image
+    detected_map = torch.from_numpy(image).float() / 255.0
+    depth_map = detected_map.permute(2, 0, 1)
+    return depth_map

-control_image = make_canny_condition(init_image)
+depth_estimator = pipeline("depth-estimation")
+depth_map = get_depth_map(image, depth_estimator).unsqueeze(0).half().to("cuda")
 ```

-Pass the mask and control image to the pipeline. Use the `controlnet_conditioning_scale` parameter to determine how much weight to assign to the control.
+Next, load a ControlNet model conditioned on depth maps and pass it to the [`StableDiffusionControlNetImg2ImgPipeline`]. Use the faster [`UniPCMultistepScheduler`] and enable model offloading to speed up inference and reduce memory usage.

 ```py
-controlnet = ControlNetModel.from_pretrained(
-    "diffusers/controlnet-canny-sdxl-1.0", torch_dtype=torch.float16
+from diffusers import StableDiffusionControlNetImg2ImgPipeline, ControlNetModel, UniPCMultistepScheduler
+import torch
+
+controlnet = ControlNetModel.from_pretrained("lllyasviel/control_v11f1p_sd15_depth", torch_dtype=torch.float16, use_safetensors=True)
+pipe = StableDiffusionControlNetImg2ImgPipeline.from_pretrained(
+    "stable-diffusion-v1-5/stable-diffusion-v1-5", controlnet=controlnet, torch_dtype=torch.float16, use_safetensors=True
 )
-pipeline = StableDiffusionXLControlNetInpaintPipeline.from_pretrained(
-    "stabilityai/stable-diffusion-xl-base-1.0", controlnet=controlnet, torch_dtype=torch.float16
+
+pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
+pipe.enable_model_cpu_offload()
+```
+
+Now pass your prompt, initial image, and depth map to the pipeline:
+
+```py
+output = pipe(
+    "lego batman and robin", image=image, control_image=depth_map,
+).images[0]
+make_image_grid([image, output], rows=1, cols=2)
+```
+
+<div class="flex gap-4">
+  <div>
+    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/controlnet-img2img.jpg"/>
+    <figcaption class="mt-2 text-center text-sm text-gray-500">original image</figcaption>
+  </div>
+  <div>
+    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/controlnet-img2img-2.png"/>
+    <figcaption class="mt-2 text-center text-sm text-gray-500">generated image</figcaption>
+  </div>
+</div>
+
+## Inpainting
+
+For inpainting, you need an initial image, a mask image, and a prompt describing what to replace the mask with. ControlNet models allow you to add another control image to condition a model with. Let's condition the model with an inpainting mask. This way, the ControlNet can use the inpainting mask as a control to guide the model to generate an image within the mask area.
+
+Load an initial image and a mask image:
+
+```py
+from diffusers.utils import load_image, make_image_grid
+
+init_image = load_image(
+    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/controlnet-inpaint.jpg"
 )
-pipeline(
-    "a cute and fluffy bunny rabbit",
-    num_inference_steps=100,
-    strength=0.99,
-    controlnet_conditioning_scale=0.5,
+init_image = init_image.resize((512, 512))
+
+mask_image = load_image(
+    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/controlnet-inpaint-mask.jpg"
+)
+mask_image = mask_image.resize((512, 512))
+make_image_grid([init_image, mask_image], rows=1, cols=2)
+```
+
+Create a function to prepare the control image from the initial and mask images. It creates a tensor in which each pixel of `init_image` is marked as masked (set to `-1.0`) if the corresponding pixel in `mask_image` is above the `0.5` threshold.
+
+```py
+import numpy as np
+import torch
+
+def make_inpaint_condition(image, image_mask):
+    image = np.array(image.convert("RGB")).astype(np.float32) / 255.0
+    image_mask = np.array(image_mask.convert("L")).astype(np.float32) / 255.0
+
+    assert image.shape[0:2] == image_mask.shape[0:2]
+    image[image_mask > 0.5] = -1.0  # set as masked pixel
+    image = np.expand_dims(image, 0).transpose(0, 3, 1, 2)
+    image = torch.from_numpy(image)
+    return image
+
+control_image = make_inpaint_condition(init_image, mask_image)
+```
+
+<div class="flex gap-4">
+  <div>
+    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/controlnet-inpaint.jpg"/>
+    <figcaption class="mt-2 text-center text-sm text-gray-500">original image</figcaption>
+  </div>
+  <div>
+    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/controlnet-inpaint-mask.jpg"/>
+    <figcaption class="mt-2 text-center text-sm text-gray-500">mask image</figcaption>
+  </div>
+</div>
+
+Load a ControlNet model conditioned on inpainting and pass it to the [`StableDiffusionControlNetInpaintPipeline`]. Use the faster [`UniPCMultistepScheduler`] and enable model offloading to speed up inference and reduce memory usage.
+
+```py
+from diffusers import StableDiffusionControlNetInpaintPipeline, ControlNetModel, UniPCMultistepScheduler
+
+controlnet = ControlNetModel.from_pretrained("lllyasviel/control_v11p_sd15_inpaint", torch_dtype=torch.float16, use_safetensors=True)
+pipe = StableDiffusionControlNetInpaintPipeline.from_pretrained(
+    "stable-diffusion-v1-5/stable-diffusion-v1-5", controlnet=controlnet, torch_dtype=torch.float16, use_safetensors=True
+)
+
+pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
+pipe.enable_model_cpu_offload()
+```
+
+Now pass your prompt, initial image, mask image, and control image to the pipeline:
+
+```py
+output = pipe(
+    "corgi face with large ears, detailed, pixar, animated, disney",
+    num_inference_steps=20,
+    eta=1.0,
    image=init_image,
    mask_image=mask_image,
    control_image=control_image,
 ).images[0]
+make_image_grid([init_image, mask_image, output], rows=1, cols=3)
 ```

-<div style="display: flex; gap: 10px; justify-content: space-around; align-items: flex-end;">
-  <figure>
-    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/non-enhanced-prompt.png" width="300" alt="Generated image (prompt only)"/>
-    <figcaption style="text-align: center;">original image</figcaption>
-  </figure>
-  <figure>
-    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/cat_mask.png" width="300" alt="Control image (Canny edges)"/>
-    <figcaption style="text-align: center;">mask image</figcaption>
-  </figure>
-  <figure> 
-    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/sdxl_rabbit_inpaint.png" width="300" alt="Generated image (ControlNet + prompt)"/>
-    <figcaption style="text-align: center;">generated image</figcaption>
-  </figure>
+<div class="flex justify-center">
+  <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/controlnet-inpaint-result.png"/>
 </div>

-</hfoption>
-</hfoptions>
+## Guess mode

-## Multi-ControlNet
+[Guess mode](https://github.com/lllyasviel/ControlNet/discussions/188) does not require supplying a prompt to a ControlNet at all! This forces the ControlNet encoder to do its best to "guess" the contents of the input control map (depth map, pose estimation, canny edge, etc.).

-You can compose multiple ControlNet conditionings, such as canny image and a depth map, to create a *MultiControlNet*. For the best rersults, you should mask conditionings so they don't overlap and experiment with different `controlnet_conditioning_scale` parameters to adjust how much weight is assigned to each control input.
+Guess mode adjusts the scale of the output residuals from a ControlNet by a fixed ratio depending on the block depth. The shallowest `DownBlock` corresponds to 0.1, and as the blocks get deeper, the scale increases exponentially such that the scale of the `MidBlock` output becomes 1.0.

-The example below composes a canny image and depth map.
+<Tip>

-Pass the ControlNets as a list to the pipeline and resize the images to the expected input size.
+Guess mode does not have any impact on prompt conditioning and you can still provide a prompt if you want.
+
+</Tip>
+
+Set `guess_mode=True` in the pipeline, and it is [recommended](https://github.com/lllyasviel/ControlNet#guess-mode--non-prompt-mode) to set the `guidance_scale` value between 3.0 and 5.0.

 ```py
+from diffusers import StableDiffusionControlNetPipeline, ControlNetModel
+from diffusers.utils import load_image, make_image_grid
+import numpy as np
 import torch
+from PIL import Image
+import cv2
+
+controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny", use_safetensors=True)
+pipe = StableDiffusionControlNetPipeline.from_pretrained("stable-diffusion-v1-5/stable-diffusion-v1-5", controlnet=controlnet, use_safetensors=True).to("cuda")
+
+original_image = load_image("https://huggingface.co/takuma104/controlnet_dev/resolve/main/bird_512x512.png")
+
+image = np.array(original_image)
+
+low_threshold = 100
+high_threshold = 200
+
+image = cv2.Canny(image, low_threshold, high_threshold)
+image = image[:, :, None]
+image = np.concatenate([image, image, image], axis=2)
+canny_image = Image.fromarray(image)
+
+image = pipe("", image=canny_image, guess_mode=True, guidance_scale=3.0).images[0]
+make_image_grid([original_image, canny_image, image], rows=1, cols=3)
+```
+
+<div class="flex gap-4">
+  <div>
+    <img class="rounded-xl" src="https://huggingface.co/takuma104/controlnet_dev/resolve/main/gen_compare_guess_mode/output_images/diffusers/output_bird_canny_0.png"/>
+    <figcaption class="mt-2 text-center text-sm text-gray-500">regular mode with prompt</figcaption>
+  </div>
+  <div>
+    <img class="rounded-xl" src="https://huggingface.co/takuma104/controlnet_dev/resolve/main/gen_compare_guess_mode/output_images/diffusers/output_bird_canny_0_gm.png"/>
+    <figcaption class="mt-2 text-center text-sm text-gray-500">guess mode without prompt</figcaption>
+  </div>
+</div>
+
+## ControlNet with Stable Diffusion XL
+
+There aren't too many ControlNet models compatible with Stable Diffusion XL (SDXL) at the moment, but we've trained two full-sized ControlNet models for SDXL conditioned on canny edge detection and depth maps. We're also experimenting with creating smaller versions of these SDXL-compatible ControlNet models so they are easier to run on resource-constrained hardware. You can find these checkpoints on the [🤗 Diffusers Hub organization](https://huggingface.co/diffusers)!
+
+Let's use an SDXL ControlNet conditioned on canny images to generate an image. Start by loading an image and preparing the canny image:
+
+```py
 from diffusers import StableDiffusionXLControlNetPipeline, ControlNetModel, AutoencoderKL
+from diffusers.utils import load_image, make_image_grid
+from PIL import Image
+import cv2
+import numpy as np
+import torch
+
+original_image = load_image(
+    "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/hf-logo.png"
+)
+
+image = np.array(original_image)
+
+low_threshold = 100
+high_threshold = 200
+
+image = cv2.Canny(image, low_threshold, high_threshold)
+image = image[:, :, None]
+image = np.concatenate([image, image, image], axis=2)
+canny_image = Image.fromarray(image)
+make_image_grid([original_image, canny_image], rows=1, cols=2)
+```
+
+<div class="flex gap-4">
+  <div>
+    <img class="rounded-xl" src="https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/hf-logo.png"/>
+    <figcaption class="mt-2 text-center text-sm text-gray-500">original image</figcaption>
+  </div>
+  <div>
+    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/hf-logo-canny.png"/>
+    <figcaption class="mt-2 text-center text-sm text-gray-500">canny image</figcaption>
+  </div>
+</div>
+
+Load an SDXL ControlNet model conditioned on canny edge detection and pass it to the [`StableDiffusionXLControlNetPipeline`]. You can also enable model offloading to reduce memory usage.
+
+```py
+controlnet = ControlNetModel.from_pretrained(
+    "diffusers/controlnet-canny-sdxl-1.0",
+    torch_dtype=torch.float16,
+    use_safetensors=True
+)
+vae = AutoencoderKL.from_pretrained("madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16, use_safetensors=True)
+pipe = StableDiffusionXLControlNetPipeline.from_pretrained(
+    "stabilityai/stable-diffusion-xl-base-1.0",
+    controlnet=controlnet,
+    vae=vae,
+    torch_dtype=torch.float16,
+    use_safetensors=True
+)
+pipe.enable_model_cpu_offload()
+```
+
+Now pass your prompt (and optionally a negative prompt if you're using one) and canny image to the pipeline:
+
+<Tip>
+
+The [`controlnet_conditioning_scale`](https://huggingface.co/docs/diffusers/main/en/api/pipelines/controlnet#diffusers.StableDiffusionControlNetPipeline.__call__.controlnet_conditioning_scale) parameter determines how much weight to assign to the conditioning inputs. A value of 0.5 is recommended for good generalization, but feel free to experiment with this number!
+
+</Tip>
+
+```py
+prompt = "aerial view, a futuristic research complex in a bright foggy jungle, hard lighting"
+negative_prompt = 'low quality, bad quality, sketches'
+
+image = pipe(
+    prompt,
+    negative_prompt=negative_prompt,
+    image=canny_image,
+    controlnet_conditioning_scale=0.5,
+).images[0]
+make_image_grid([original_image, canny_image, image], rows=1, cols=3)
+```
+
+<div class="flex justify-center">
+    <img class="rounded-xl" src="https://huggingface.co/diffusers/controlnet-canny-sdxl-1.0/resolve/main/out_hug_lab_7.png"/>
+</div>
+
+You can use [`StableDiffusionXLControlNetPipeline`] in guess mode as well by setting `guess_mode=True`:
+
+```py
+from diffusers import StableDiffusionXLControlNetPipeline, ControlNetModel, AutoencoderKL
+from diffusers.utils import load_image, make_image_grid
+import numpy as np
+import torch
+import cv2
+from PIL import Image
+
+prompt = "aerial view, a futuristic research complex in a bright foggy jungle, hard lighting"
+negative_prompt = "low quality, bad quality, sketches"
+
+original_image = load_image(
+    "https://hf.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/hf-logo.png"
+)
+
+controlnet = ControlNetModel.from_pretrained(
+    "diffusers/controlnet-canny-sdxl-1.0", torch_dtype=torch.float16, use_safetensors=True
+)
+vae = AutoencoderKL.from_pretrained("madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16, use_safetensors=True)
+pipe = StableDiffusionXLControlNetPipeline.from_pretrained(
+    "stabilityai/stable-diffusion-xl-base-1.0", controlnet=controlnet, vae=vae, torch_dtype=torch.float16, use_safetensors=True
+)
+pipe.enable_model_cpu_offload()
+
+image = np.array(original_image)
+image = cv2.Canny(image, 100, 200)
+image = image[:, :, None]
+image = np.concatenate([image, image, image], axis=2)
+canny_image = Image.fromarray(image)
+
+image = pipe(
+    prompt, negative_prompt=negative_prompt, controlnet_conditioning_scale=0.5, image=canny_image, guess_mode=True,
+).images[0]
+make_image_grid([original_image, canny_image, image], rows=1, cols=3)
+```
+
+<Tip>
+
+You can use a refiner model with `StableDiffusionXLControlNetPipeline` to improve image quality, just like you can with a regular `StableDiffusionXLPipeline`.
+See the [Refine image quality](./sdxl#refine-image-quality) section to learn how to use the refiner model.
+Make sure to use `StableDiffusionXLControlNetPipeline` and pass `image` and `controlnet_conditioning_scale`.
+
+```py
+base = StableDiffusionXLControlNetPipeline(...)
+image = base(
+    prompt=prompt,
+    controlnet_conditioning_scale=0.5,
+    image=canny_image,
+    num_inference_steps=40,
+    denoising_end=0.8,
+    output_type="latent",
+).images
+# rest exactly as with StableDiffusionXLPipeline
+```
+
+</Tip>
+
+## MultiControlNet
+
+<Tip>
+
+Replace the SDXL model with a model like [stable-diffusion-v1-5/stable-diffusion-v1-5](https://huggingface.co/stable-diffusion-v1-5/stable-diffusion-v1-5) to use multiple conditioning inputs with Stable Diffusion models.
+
+</Tip>
+
+You can compose multiple ControlNet conditionings from different image inputs to create a *MultiControlNet*. To get better results, it is often helpful to:
+
+1. mask conditionings such that they don't overlap (for example, mask the area of a canny image where the pose conditioning is located)
+2. experiment with the [`controlnet_conditioning_scale`](https://huggingface.co/docs/diffusers/main/en/api/pipelines/controlnet#diffusers.StableDiffusionControlNetPipeline.__call__.controlnet_conditioning_scale) parameter to determine how much weight to assign to each conditioning input
+
+In this example, you'll combine a canny image and a human pose estimation image to generate a new image.
+
+Prepare the canny image conditioning:
+
+```py
+from diffusers.utils import load_image, make_image_grid
+from PIL import Image
+import numpy as np
+import cv2
+
+original_image = load_image(
+    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/landscape.png"
+)
+image = np.array(original_image)
+
+low_threshold = 100
+high_threshold = 200
+
+image = cv2.Canny(image, low_threshold, high_threshold)
+
+# zero out middle columns of image where pose will be overlaid
+zero_start = image.shape[1] // 4
+zero_end = zero_start + image.shape[1] // 2
+image[:, zero_start:zero_end] = 0
+
+image = image[:, :, None]
+image = np.concatenate([image, image, image], axis=2)
+canny_image = Image.fromarray(image)
+make_image_grid([original_image, canny_image], rows=1, cols=2)
+```
+
+<div class="flex gap-4">
+  <div>
+    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/landscape.png"/>
+    <figcaption class="mt-2 text-center text-sm text-gray-500">original image</figcaption>
+  </div>
+  <div>
+    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/blog/controlnet/landscape_canny_masked.png"/>
+    <figcaption class="mt-2 text-center text-sm text-gray-500">canny image</figcaption>
+  </div>
+</div>
+
+For human pose estimation, install [controlnet_aux](https://github.com/patrickvonplaten/controlnet_aux):
+
+```py
+# uncomment to install the necessary library in Colab
+#!pip install -q controlnet-aux
+```
+
+Prepare the human pose estimation conditioning:
+
+```py
+from controlnet_aux import OpenposeDetector
+
+openpose = OpenposeDetector.from_pretrained("lllyasviel/ControlNet")
+original_image = load_image(
+    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/person.png"
+)
+openpose_image = openpose(original_image)
+make_image_grid([original_image, openpose_image], rows=1, cols=2)
+```
+
+<div class="flex gap-4">
+  <div>
+    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/person.png"/>
+    <figcaption class="mt-2 text-center text-sm text-gray-500">original image</figcaption>
+  </div>
+  <div>
+    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/blog/controlnet/person_pose.png"/>
+    <figcaption class="mt-2 text-center text-sm text-gray-500">human pose image</figcaption>
+  </div>
+</div>
+
+Load a list of ControlNet models that correspond to each conditioning, and pass them to the [`StableDiffusionXLControlNetPipeline`]. Use the faster [`UniPCMultistepScheduler`] and enable model offloading to reduce memory usage.
+
+```py
+from diffusers import StableDiffusionXLControlNetPipeline, ControlNetModel, AutoencoderKL, UniPCMultistepScheduler
+import torch

 controlnets = [
    ControlNetModel.from_pretrained(
-        "diffusers/controlnet-depth-sdxl-1.0-small", torch_dtype=torch.float16
+        "thibaud/controlnet-openpose-sdxl-1.0", torch_dtype=torch.float16
    ),
    ControlNetModel.from_pretrained(
-        "diffusers/controlnet-canny-sdxl-1.0", torch_dtype=torch.float16,
+        "diffusers/controlnet-canny-sdxl-1.0", torch_dtype=torch.float16, use_safetensors=True
    ),
 ]

-vae = AutoencoderKL.from_pretrained("madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16)
-pipeline = StableDiffusionXLControlNetPipeline.from_pretrained(
-    "stabilityai/stable-diffusion-xl-base-1.0", controlnet=controlnets, vae=vae, torch_dtype=torch.float16
-).to("cuda")
-
-prompt = """
-a relaxed rabbit sitting on a striped towel next to a pool with a tropical drink nearby, 
-bright sunny day, vacation scene, 35mm photograph, film, professional, 4k, highly detailed
-"""
-negative_prompt = "lowres, bad anatomy, worst quality, low quality, deformed, ugly"
-
-images = [canny_image.resize((1024, 1024)), depth_image.resize((1024, 1024))]
-
-pipeline(
-    prompt,
-    negative_prompt=negative_prompt,
-    image=images,
-    num_inference_steps=100,
-    controlnet_conditioning_scale=[0.5, 0.5],
-    strength=0.7,
-).images[0]
+vae = AutoencoderKL.from_pretrained("madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16, use_safetensors=True)
+pipe = StableDiffusionXLControlNetPipeline.from_pretrained(
+    "stabilityai/stable-diffusion-xl-base-1.0", controlnet=controlnets, vae=vae, torch_dtype=torch.float16, use_safetensors=True
+)
+pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
+pipe.enable_model_cpu_offload()
 ```

-<div style="display: flex; gap: 10px; justify-content: space-around; align-items: flex-end;">
-  <figure>
-    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/canny-cat.png" width="300" alt="Generated image (prompt only)"/>
-    <figcaption style="text-align: center;">canny image</figcaption>
-  </figure>
-  <figure>
-    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/multicontrolnet_depth.png" width="300" alt="Control image (Canny edges)"/>
-    <figcaption style="text-align: center;">depth map</figcaption>
-  </figure>
-  <figure> 
-    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/sdxl_multi_controlnet.png" width="300" alt="Generated image (ControlNet + prompt)"/>
-    <figcaption style="text-align: center;">generated image</figcaption>
-  </figure>
-</div>
-
-## guess_mode
-
-[Guess mode](https://github.com/lllyasviel/ControlNet/discussions/188) generates an image from **only** the control input (canny edge, depth map, pose, etc.) and without guidance from a prompt. It adjusts the scale of the ControlNet's output residuals by a fixed ratio depending on block depth. The earlier `DownBlock` is only scaled by `0.1` and the `MidBlock` is fully scaled by `1.0`.
+Now you can pass your prompt (and an optional negative prompt, if you're using one), the canny image, and the pose image to the pipeline:

 ```py
-import torch
-from diffusers.utils import load_iamge
-from diffusers import StableDiffusionXLControlNetPipeline, ControlNetModel
+prompt = "a giant standing in a fantasy landscape, best quality"
+negative_prompt = "monochrome, lowres, bad anatomy, worst quality, low quality"

-controlnet = ControlNetModel.from_pretrained(
-  "diffusers/controlnet-canny-sdxl-1.0", torch_dtype=torch.float16
-)
-pipeline = StableDiffusionXLControlNetPipeline.from_pretrained(
-  "stabilityai/stable-diffusion-xl-base-1.0",
-  controlnet=controlnet,
-  torch_dtype=torch.float16
-).to("cuda")
+generator = torch.manual_seed(1)

-canny_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/canny-cat.png")
-pipeline(
-  "",
-  image=canny_image,
-  guess_mode=True
-).images[0]
+images = [openpose_image.resize((1024, 1024)), canny_image.resize((1024, 1024))]
+
+images = pipe(
+    prompt,
+    image=images,
+    num_inference_steps=25,
+    generator=generator,
+    negative_prompt=negative_prompt,
+    num_images_per_prompt=3,
+    controlnet_conditioning_scale=[1.0, 0.8],
+).images
+make_image_grid([original_image, canny_image, openpose_image,
+                images[0].resize((512, 512)), images[1].resize((512, 512)), images[2].resize((512, 512))], rows=2, cols=3)
 ```

-<div style="display: flex; gap: 10px; justify-content: space-around; align-items: flex-end;">
-  <figure>
-    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/canny-cat.png" width="300" alt="Control image (Canny edges)"/>
-    <figcaption style="text-align: center;">canny image</figcaption>
-  </figure>
-  <figure>
-    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/guess_mode.png" width="300" alt="Generated image (Guess mode)"/>
-    <figcaption style="text-align: center;">generated image</figcaption>
-  </figure>
-</div>
+<div class="flex justify-center">
+	<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/multicontrolnet.png"/>
+</div>
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Sayak Paul	6d0d52d46c	Merge branch 'main' into pipeline-fetcher	2025-02-21 14:41:37 +05:30
DN6	9e56d656df	update	2025-02-21 14:00:02 +05:30
DN6	7d7e18b9cc	update	2025-02-21 13:17:33 +05:30