up

add dynamic cfg
remove dynamic guidance scale
2026-04-26 01:41:29 +08:00 · 2024-08-06 07:20:35 +02:00 · 2024-08-06 03:59:06 +02:00 · 2024-08-05 17:52:13 +02:00 · 2024-08-05 22:20:06 +08:00 · 2024-08-05 22:08:50 +08:00
593 changed files with 5420 additions and 82179 deletions
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@@ -7,7 +7,6 @@ on:

 env:
  DIFFUSERS_IS_CI: yes
-  HF_HUB_ENABLE_HF_TRANSFER: 1
  HF_HOME: /mnt/cache
  OMP_NUM_THREADS: 8
  MKL_NUM_THREADS: 8
@@ -51,7 +50,7 @@ jobs:

      - name: Test suite reports artifacts
        if: ${{ always() }}
-        uses: actions/upload-artifact@v4
+        uses: actions/upload-artifact@v2
        with:
          name: benchmark_test_reports
          path: benchmarks/benchmark_outputs
--- a/.github/workflows/mirror_community_pipeline.yml
+++ b/.github/workflows/mirror_community_pipeline.yml
@@ -25,7 +25,7 @@ jobs:
    env:
      SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL_COMMUNITY_MIRROR }}

-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-latest
    steps:
      # Checkout to correct ref
      #   If workflow dispatch
--- a/.github/workflows/nightly_tests.yml
+++ b/.github/workflows/nightly_tests.yml
@@ -43,7 +43,7 @@ jobs:

      - name: Pipeline Tests Artifacts
        if: ${{ always() }}
-        uses: actions/upload-artifact@v4
+        uses: actions/upload-artifact@v2
        with:
          name: test-pipelines.json
          path: reports
@@ -72,14 +72,14 @@ jobs:
        run: |
          python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
          python -m uv pip install -e [quality,test]
-          pip uninstall accelerate -y && python -m uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git
+          python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate.git
          python -m uv pip install pytest-reportlog
      - name: Environment
        run: |
          python utils/print_env.py
      - name: Pipeline CUDA Test
        env:
-          HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
          # https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
          CUBLAS_WORKSPACE_CONFIG: :16:8
        run: |
@@ -95,7 +95,7 @@ jobs:
          cat reports/tests_pipeline_${{ matrix.module }}_cuda_failures_short.txt
      - name: Test suite reports artifacts
        if: ${{ always() }}
-        uses: actions/upload-artifact@v4
+        uses: actions/upload-artifact@v2
        with:
          name: pipeline_${{ matrix.module }}_test_reports
          path: reports
@@ -116,7 +116,6 @@ jobs:
      run:
        shell: bash
    strategy:
-      fail-fast: false
      max-parallel: 2
      matrix:
        module: [models, schedulers, lora, others, single_file, examples]
@@ -130,8 +129,8 @@ jobs:
      run: |
        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
        python -m uv pip install -e [quality,test]
+        python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate.git
        python -m uv pip install peft@git+https://github.com/huggingface/peft.git
-        pip uninstall accelerate -y && python -m uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git
        python -m uv pip install pytest-reportlog
    - name: Environment
      run: python utils/print_env.py
@@ -139,7 +138,7 @@ jobs:
    - name: Run nightly PyTorch CUDA tests for non-pipeline modules
      if: ${{ matrix.module != 'examples'}}
      env:
-        HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
+        HF_TOKEN: ${{ secrets.HF_TOKEN }}
        # https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
        CUBLAS_WORKSPACE_CONFIG: :16:8
      run: |
@@ -152,7 +151,7 @@ jobs:
    - name: Run nightly example tests with Torch
      if: ${{ matrix.module == 'examples' }}
      env:
-        HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
+        HF_TOKEN: ${{ secrets.HF_TOKEN }}
        # https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
        CUBLAS_WORKSPACE_CONFIG: :16:8
      run: |
@@ -169,7 +168,7 @@ jobs:

    - name: Test suite reports artifacts
      if: ${{ always() }}
-      uses: actions/upload-artifact@v4
+      uses: actions/upload-artifact@v2
      with:
        name: torch_${{ matrix.module }}_cuda_test_reports
        path: reports
@@ -180,62 +179,6 @@ jobs:
        pip install slack_sdk tabulate
        python utils/log_reports.py >> $GITHUB_STEP_SUMMARY

-  run_big_gpu_torch_tests:
-    name: Torch tests on big GPU
-    strategy:
-      fail-fast: false
-      max-parallel: 2
-    runs-on:
-      group: aws-g6e-xlarge-plus
-    container:
-      image: diffusers/diffusers-pytorch-cuda
-      options: --shm-size "16gb" --ipc host --gpus 0
-    steps:
-      - name: Checkout diffusers
-        uses: actions/checkout@v3
-        with:
-          fetch-depth: 2
-      - name: NVIDIA-SMI
-        run: nvidia-smi
-      - name: Install dependencies
-        run: |
-          python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
-          python -m uv pip install -e [quality,test]
-          python -m uv pip install peft@git+https://github.com/huggingface/peft.git
-          pip uninstall accelerate -y && python -m uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git
-          python -m uv pip install pytest-reportlog
-      - name: Environment
-        run: |
-          python utils/print_env.py
-      - name: Selected Torch CUDA Test on big GPU
-        env:
-          HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
-          # https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
-          CUBLAS_WORKSPACE_CONFIG: :16:8
-          BIG_GPU_MEMORY: 40
-        run: |
-          python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
-            -m "big_gpu_with_torch_cuda" \
-            --make-reports=tests_big_gpu_torch_cuda \
-            --report-log=tests_big_gpu_torch_cuda.log \
-            tests/
-      - name: Failure short reports
-        if: ${{ failure() }}
-        run: |
-          cat reports/tests_big_gpu_torch_cuda_stats.txt
-          cat reports/tests_big_gpu_torch_cuda_failures_short.txt
-      - name: Test suite reports artifacts
-        if: ${{ always() }}
-        uses: actions/upload-artifact@v4
-        with:
-          name: torch_cuda_big_gpu_test_reports
-          path: reports
-      - name: Generate Report and Notify Channel
-        if: always()
-        run: |
-          pip install slack_sdk tabulate
-          python utils/log_reports.py >> $GITHUB_STEP_SUMMARY
-
  run_flax_tpu_tests:
    name: Nightly Flax TPU Tests
    runs-on: docker-tpu
@@ -257,7 +200,7 @@ jobs:
      run: |
        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
        python -m uv pip install -e [quality,test]
-        pip uninstall accelerate -y && python -m uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git
+        python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate.git
        python -m uv pip install pytest-reportlog

    - name: Environment
@@ -265,7 +208,7 @@ jobs:

    - name: Run nightly Flax TPU tests
      env:
-        HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
+        HF_TOKEN: ${{ secrets.HF_TOKEN }}
      run: |
        python -m pytest -n 0 \
          -s -v -k "Flax" \
@@ -281,7 +224,7 @@ jobs:

    - name: Test suite reports artifacts
      if: ${{ always() }}
-      uses: actions/upload-artifact@v4
+      uses: actions/upload-artifact@v2
      with:
        name: flax_tpu_test_reports
        path: reports
@@ -313,14 +256,14 @@ jobs:
      run: |
        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
        python -m uv pip install -e [quality,test]
-        pip uninstall accelerate -y && python -m uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git
+        python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate.git
        python -m uv pip install pytest-reportlog
    - name: Environment
      run: python utils/print_env.py

    - name: Run Nightly ONNXRuntime CUDA tests
      env:
-        HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
+        HF_TOKEN: ${{ secrets.HF_TOKEN }}
      run: |
        python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
          -s -v -k "Onnx" \
@@ -336,9 +279,9 @@ jobs:

    - name: Test suite reports artifacts
      if: ${{ always() }}
-      uses: actions/upload-artifact@v4
+      uses: actions/upload-artifact@v2
      with:
-        name: tests_onnx_cuda_reports
+        name: ${{ matrix.config.report }}_test_reports
        path: reports

    - name: Generate Report and Notify Channel
@@ -347,118 +290,64 @@ jobs:
        pip install slack_sdk tabulate
        python utils/log_reports.py >> $GITHUB_STEP_SUMMARY

-# M1 runner currently not well supported
-# TODO: (Dhruv) add these back when we setup better testing for Apple Silicon
-#  run_nightly_tests_apple_m1:
-#    name: Nightly PyTorch MPS tests on MacOS
-#    runs-on: [ self-hosted, apple-m1 ]
-#    if: github.event_name == 'schedule'
-#
-#    steps:
-#      - name: Checkout diffusers
-#        uses: actions/checkout@v3
-#        with:
-#          fetch-depth: 2
-#
-#      - name: Clean checkout
-#        shell: arch -arch arm64 bash {0}
-#        run: |
-#          git clean -fxd
-#      - name: Setup miniconda
-#        uses: ./.github/actions/setup-miniconda
-#        with:
-#          python-version: 3.9
-#
-#      - name: Install dependencies
-#        shell: arch -arch arm64 bash {0}
-#        run: |
-#          ${CONDA_RUN} python -m pip install --upgrade pip uv
-#          ${CONDA_RUN} python -m uv pip install -e [quality,test]
-#          ${CONDA_RUN} python -m uv pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cpu
-#          ${CONDA_RUN} python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate
-#          ${CONDA_RUN} python -m uv pip install pytest-reportlog
-#      - name: Environment
-#        shell: arch -arch arm64 bash {0}
-#        run: |
-#          ${CONDA_RUN} python utils/print_env.py
-#      - name: Run nightly PyTorch tests on M1 (MPS)
-#        shell: arch -arch arm64 bash {0}
-#        env:
-#          HF_HOME: /System/Volumes/Data/mnt/cache
-#          HF_TOKEN: ${{ secrets.HF_TOKEN }}
-#        run: |
-#          ${CONDA_RUN} python -m pytest -n 1 -s -v --make-reports=tests_torch_mps \
-#            --report-log=tests_torch_mps.log \
-#            tests/
-#      - name: Failure short reports
-#        if: ${{ failure() }}
-#        run: cat reports/tests_torch_mps_failures_short.txt
-#
-#      - name: Test suite reports artifacts
-#        if: ${{ always() }}
-#        uses: actions/upload-artifact@v4
-#        with:
-#          name: torch_mps_test_reports
-#          path: reports
-#
-#      - name: Generate Report and Notify Channel
-#        if: always()
-#        run: |
-#          pip install slack_sdk tabulate
-#          python utils/log_reports.py >> $GITHUB_STEP_SUMMARY  run_nightly_tests_apple_m1:
-#    name: Nightly PyTorch MPS tests on MacOS
-#    runs-on: [ self-hosted, apple-m1 ]
-#    if: github.event_name == 'schedule'
-#
-#    steps:
-#      - name: Checkout diffusers
-#        uses: actions/checkout@v3
-#        with:
-#          fetch-depth: 2
-#
-#      - name: Clean checkout
-#        shell: arch -arch arm64 bash {0}
-#        run: |
-#          git clean -fxd
-#      - name: Setup miniconda
-#        uses: ./.github/actions/setup-miniconda
-#        with:
-#          python-version: 3.9
-#
-#      - name: Install dependencies
-#        shell: arch -arch arm64 bash {0}
-#        run: |
-#          ${CONDA_RUN} python -m pip install --upgrade pip uv
-#          ${CONDA_RUN} python -m uv pip install -e [quality,test]
-#          ${CONDA_RUN} python -m uv pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cpu
-#          ${CONDA_RUN} python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate
-#          ${CONDA_RUN} python -m uv pip install pytest-reportlog
-#      - name: Environment
-#        shell: arch -arch arm64 bash {0}
-#        run: |
-#          ${CONDA_RUN} python utils/print_env.py
-#      - name: Run nightly PyTorch tests on M1 (MPS)
-#        shell: arch -arch arm64 bash {0}
-#        env:
-#          HF_HOME: /System/Volumes/Data/mnt/cache
-#          HF_TOKEN: ${{ secrets.HF_TOKEN }}
-#        run: |
-#          ${CONDA_RUN} python -m pytest -n 1 -s -v --make-reports=tests_torch_mps \
-#            --report-log=tests_torch_mps.log \
-#            tests/
-#      - name: Failure short reports
-#        if: ${{ failure() }}
-#        run: cat reports/tests_torch_mps_failures_short.txt
-#
-#      - name: Test suite reports artifacts
-#        if: ${{ always() }}
-#        uses: actions/upload-artifact@v4
-#        with:
-#          name: torch_mps_test_reports
-#          path: reports
-#
-#      - name: Generate Report and Notify Channel
-#        if: always()
-#        run: |
-#          pip install slack_sdk tabulate
-#          python utils/log_reports.py >> $GITHUB_STEP_SUMMARY
+  run_nightly_tests_apple_m1:
+    name: Nightly PyTorch MPS tests on MacOS
+    runs-on: [ self-hosted, apple-m1 ]
+    if: github.event_name == 'schedule'
+
+    steps:
+      - name: Checkout diffusers
+        uses: actions/checkout@v3
+        with:
+          fetch-depth: 2
+
+      - name: Clean checkout
+        shell: arch -arch arm64 bash {0}
+        run: |
+          git clean -fxd
+
+      - name: Setup miniconda
+        uses: ./.github/actions/setup-miniconda
+        with:
+          python-version: 3.9
+
+      - name: Install dependencies
+        shell: arch -arch arm64 bash {0}
+        run: |
+          ${CONDA_RUN} python -m pip install --upgrade pip uv
+          ${CONDA_RUN} python -m uv pip install -e [quality,test]
+          ${CONDA_RUN} python -m uv pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cpu
+          ${CONDA_RUN} python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate
+          ${CONDA_RUN} python -m uv pip install pytest-reportlog
+
+      - name: Environment
+        shell: arch -arch arm64 bash {0}
+        run: |
+          ${CONDA_RUN} python utils/print_env.py
+
+      - name: Run nightly PyTorch tests on M1 (MPS)
+        shell: arch -arch arm64 bash {0}
+        env:
+          HF_HOME: /System/Volumes/Data/mnt/cache
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
+        run: |
+          ${CONDA_RUN} python -m pytest -n 1 -s -v --make-reports=tests_torch_mps \
+            --report-log=tests_torch_mps.log \
+            tests/
+
+      - name: Failure short reports
+        if: ${{ failure() }}
+        run: cat reports/tests_torch_mps_failures_short.txt
+
+      - name: Test suite reports artifacts
+        if: ${{ always() }}
+        uses: actions/upload-artifact@v2
+        with:
+          name: torch_mps_test_reports
+          path: reports
+
+      - name: Generate Report and Notify Channel
+        if: always()
+        run: |
+          pip install slack_sdk tabulate
+          python utils/log_reports.py >> $GITHUB_STEP_SUMMARY
--- a/.github/workflows/notify_slack_about_release.yml
+++ b/.github/workflows/notify_slack_about_release.yml
@@ -7,7 +7,7 @@ on:

 jobs:
  build:
-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-latest

    steps:
    - uses: actions/checkout@v3
--- a/.github/workflows/pr_dependency_test.yml
+++ b/.github/workflows/pr_dependency_test.yml
@@ -16,7 +16,7 @@ concurrency:

 jobs:
  check_dependencies:
-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
      - name: Set up Python
--- a/.github/workflows/pr_flax_dependency_test.yml
+++ b/.github/workflows/pr_flax_dependency_test.yml
@@ -16,7 +16,7 @@ concurrency:

 jobs:
  check_flax_dependencies:
-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
      - name: Set up Python
--- a/.github/workflows/pr_test_fetcher.yml
+++ b/.github/workflows/pr_test_fetcher.yml
@@ -171,7 +171,7 @@ jobs:

    - name: Test suite reports artifacts
      if: ${{ always() }}
-      uses: actions/upload-artifact@v4
+      uses: actions/upload-artifact@v2
      with:
        name: pr_${{ matrix.config.report }}_test_reports
        path: reports
--- a/.github/workflows/pr_test_peft_backend.yml
+++ b/.github/workflows/pr_test_peft_backend.yml
@@ -20,7 +20,7 @@ env:

 jobs:
  check_code_quality:
-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
      - name: Set up Python
@@ -40,7 +40,7 @@ jobs:

  check_repository_consistency:
    needs: check_code_quality
-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
      - name: Set up Python
@@ -92,14 +92,12 @@ jobs:
      run: |
        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
        python -m uv pip install -e [quality,test]
-        # TODO (sayakpaul, DN6): revisit `--no-deps`
        if [ "${{ matrix.lib-versions }}" == "main" ]; then
-            python -m pip install -U peft@git+https://github.com/huggingface/peft.git --no-deps
-            python -m uv pip install -U transformers@git+https://github.com/huggingface/transformers.git --no-deps
-            pip uninstall accelerate -y && python -m uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git --no-deps
+            python -m pip install -U peft@git+https://github.com/huggingface/peft.git
+            python -m uv pip install -U transformers@git+https://github.com/huggingface/transformers.git
+            python -m uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git
        else
-            python -m uv pip install -U peft --no-deps
-            python -m uv pip install -U transformers accelerate --no-deps
+            python -m uv pip install -U peft transformers accelerate
        fi

    - name: Environment
@@ -112,23 +110,23 @@ jobs:
        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
        python -m pytest -n 4 --max-worker-restart=0 --dist=loadfile \
          -s -v \
-          --make-reports=tests_${{ matrix.lib-versions }} \
+          --make-reports=tests_${{ matrix.config.report }} \
          tests/lora/
        python -m pytest -n 4 --max-worker-restart=0 --dist=loadfile \
          -s -v \
-          --make-reports=tests_models_lora_${{ matrix.lib-versions }} \
+          --make-reports=tests_models_lora_${{ matrix.config.report }} \
          tests/models/ -k "lora"


    - name: Failure short reports
      if: ${{ failure() }}
      run: |
-        cat reports/tests_${{ matrix.lib-versions }}_failures_short.txt
-        cat reports/tests_models_lora_${{ matrix.lib-versions }}_failures_short.txt
+        cat reports/tests_${{ matrix.config.report }}_failures_short.txt
+        cat reports/tests_models_lora_${{ matrix.config.report }}_failures_short.txt

    - name: Test suite reports artifacts
      if: ${{ always() }}
-      uses: actions/upload-artifact@v4
+      uses: actions/upload-artifact@v2
      with:
-        name: pr_${{ matrix.lib-versions }}_test_reports
+        name: pr_${{ matrix.config.report }}_test_reports
        path: reports
--- a/.github/workflows/pr_tests.yml
+++ b/.github/workflows/pr_tests.yml
@@ -22,14 +22,13 @@ concurrency:

 env:
  DIFFUSERS_IS_CI: yes
-  HF_HUB_ENABLE_HF_TRANSFER: 1
  OMP_NUM_THREADS: 4
  MKL_NUM_THREADS: 4
  PYTEST_TIMEOUT: 60

 jobs:
  check_code_quality:
-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
      - name: Set up Python
@@ -49,7 +48,7 @@ jobs:

  check_repository_consistency:
    needs: check_code_quality
-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
      - name: Set up Python
@@ -169,9 +168,9 @@ jobs:

    - name: Test suite reports artifacts
      if: ${{ always() }}
-      uses: actions/upload-artifact@v4
+      uses: actions/upload-artifact@v2
      with:
-        name: pr_${{ matrix.config.framework }}_${{ matrix.config.report }}_test_reports
+        name: pr_${{ matrix.config.report }}_test_reports
        path: reports

  run_staging_tests:
@@ -230,7 +229,7 @@ jobs:

    - name: Test suite reports artifacts
      if: ${{ always() }}
-      uses: actions/upload-artifact@v4
+      uses: actions/upload-artifact@v2
      with:
        name: pr_${{ matrix.config.report }}_test_reports
        path: reports
--- a/.github/workflows/pr_torch_dependency_test.yml
+++ b/.github/workflows/pr_torch_dependency_test.yml
@@ -16,7 +16,7 @@ concurrency:

 jobs:
  check_torch_dependencies:
-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
      - name: Set up Python
--- a/.github/workflows/push_tests.yml
+++ b/.github/workflows/push_tests.yml
@@ -1,7 +1,6 @@
-name: Fast GPU Tests on main
+name: Slow Tests on main

 on:
-  workflow_dispatch:
  push:
    branches:
      - main
@@ -14,7 +13,6 @@ env:
  DIFFUSERS_IS_CI: yes
  OMP_NUM_THREADS: 8
  MKL_NUM_THREADS: 8
-  HF_HUB_ENABLE_HF_TRANSFER: 1
  PYTEST_TIMEOUT: 600
  PIPELINE_USAGE_CUTOFF: 50000

@@ -47,7 +45,7 @@ jobs:
          echo "pipeline_test_matrix=$matrix" >> $GITHUB_OUTPUT
      - name: Pipeline Tests Artifacts
        if: ${{ always() }}
-        uses: actions/upload-artifact@v4
+        uses: actions/upload-artifact@v2
        with:
          name: test-pipelines.json
          path: reports
@@ -77,11 +75,11 @@ jobs:
        run: |
          python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
          python -m uv pip install -e [quality,test]
-          pip uninstall accelerate -y && python -m uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git
+          python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate.git
      - name: Environment
        run: |
          python utils/print_env.py
-      - name: PyTorch CUDA checkpoint tests on Ubuntu
+      - name: Slow PyTorch CUDA checkpoint tests on Ubuntu
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
          # https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
@@ -98,7 +96,7 @@ jobs:
          cat reports/tests_pipeline_${{ matrix.module }}_cuda_failures_short.txt
      - name: Test suite reports artifacts
        if: ${{ always() }}
-        uses: actions/upload-artifact@v4
+        uses: actions/upload-artifact@v2
        with:
          name: pipeline_${{ matrix.module }}_test_reports
          path: reports
@@ -114,8 +112,6 @@ jobs:
      run:
        shell: bash
    strategy:
-      fail-fast: false
-      max-parallel: 2
      matrix:
        module: [models, schedulers, lora, others, single_file]
    steps:
@@ -128,8 +124,8 @@ jobs:
      run: |
        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
        python -m uv pip install -e [quality,test]
+        python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate.git
        python -m uv pip install peft@git+https://github.com/huggingface/peft.git
-        pip uninstall accelerate -y && python -m uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git

    - name: Environment
      run: |
@@ -143,20 +139,20 @@ jobs:
      run: |
        python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
          -s -v -k "not Flax and not Onnx" \
-          --make-reports=tests_torch_cuda_${{ matrix.module }} \
+          --make-reports=tests_torch_cuda \
          tests/${{ matrix.module }}

    - name: Failure short reports
      if: ${{ failure() }}
      run: |
-        cat reports/tests_torch_cuda_${{ matrix.module }}_stats.txt
-        cat reports/tests_torch_cuda_${{ matrix.module }}_failures_short.txt
+        cat reports/tests_torch_cuda_stats.txt
+        cat reports/tests_torch_cuda_failures_short.txt

    - name: Test suite reports artifacts
      if: ${{ always() }}
-      uses: actions/upload-artifact@v4
+      uses: actions/upload-artifact@v2
      with:
-        name: torch_cuda_test_reports_${{ matrix.module }}
+        name: torch_cuda_test_reports
        path: reports

  flax_tpu_tests:
@@ -178,13 +174,13 @@ jobs:
      run: |
        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
        python -m uv pip install -e [quality,test]
-        pip uninstall accelerate -y && python -m uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git
+        python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate.git

    - name: Environment
      run: |
        python utils/print_env.py

-    - name: Run Flax TPU tests
+    - name: Run slow Flax TPU tests
      env:
        HF_TOKEN: ${{ secrets.HF_TOKEN }}
      run: |
@@ -201,7 +197,7 @@ jobs:

    - name: Test suite reports artifacts
      if: ${{ always() }}
-      uses: actions/upload-artifact@v4
+      uses: actions/upload-artifact@v2
      with:
        name: flax_tpu_test_reports
        path: reports
@@ -226,13 +222,13 @@ jobs:
      run: |
        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
        python -m uv pip install -e [quality,test]
-        pip uninstall accelerate -y && python -m uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git
+        python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate.git

    - name: Environment
      run: |
        python utils/print_env.py

-    - name: Run ONNXRuntime CUDA tests
+    - name: Run slow ONNXRuntime CUDA tests
      env:
        HF_TOKEN: ${{ secrets.HF_TOKEN }}
      run: |
@@ -249,7 +245,7 @@ jobs:

    - name: Test suite reports artifacts
      if: ${{ always() }}
-      uses: actions/upload-artifact@v4
+      uses: actions/upload-artifact@v2
      with:
        name: onnx_cuda_test_reports
        path: reports
@@ -292,7 +288,7 @@ jobs:

    - name: Test suite reports artifacts
      if: ${{ always() }}
-      uses: actions/upload-artifact@v4
+      uses: actions/upload-artifact@v2
      with:
        name: torch_compile_test_reports
        path: reports
@@ -334,7 +330,7 @@ jobs:

    - name: Test suite reports artifacts
      if: ${{ always() }}
-      uses: actions/upload-artifact@v4
+      uses: actions/upload-artifact@v2
      with:
        name: torch_xformers_test_reports
        path: reports
@@ -385,7 +381,7 @@ jobs:

    - name: Test suite reports artifacts
      if: ${{ always() }}
-      uses: actions/upload-artifact@v4
+      uses: actions/upload-artifact@v2
      with:
        name: examples_test_reports
        path: reports
--- a/.github/workflows/push_tests_fast.yml
+++ b/.github/workflows/push_tests_fast.yml
@@ -18,7 +18,6 @@ env:
  HF_HOME: /mnt/cache
  OMP_NUM_THREADS: 8
  MKL_NUM_THREADS: 8
-  HF_HUB_ENABLE_HF_TRANSFER: 1
  PYTEST_TIMEOUT: 600
  RUN_SLOW: no

@@ -120,7 +119,7 @@ jobs:

    - name: Test suite reports artifacts
      if: ${{ always() }}
-      uses: actions/upload-artifact@v4
+      uses: actions/upload-artifact@v2
      with:
        name: pr_${{ matrix.config.report }}_test_reports
        path: reports
--- a/.github/workflows/push_tests_mps.yml
+++ b/.github/workflows/push_tests_mps.yml
@@ -13,7 +13,6 @@ env:
  HF_HOME: /mnt/cache
  OMP_NUM_THREADS: 8
  MKL_NUM_THREADS: 8
-  HF_HUB_ENABLE_HF_TRANSFER: 1
  PYTEST_TIMEOUT: 600
  RUN_SLOW: no

@@ -70,7 +69,7 @@ jobs:

    - name: Test suite reports artifacts
      if: ${{ always() }}
-      uses: actions/upload-artifact@v4
+      uses: actions/upload-artifact@v2
      with:
        name: pr_torch_mps_test_reports
        path: reports
--- a/.github/workflows/pypi_publish.yaml
+++ b/.github/workflows/pypi_publish.yaml
@@ -10,7 +10,7 @@ on:

 jobs:
  find-and-checkout-latest-branch:
-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-latest
    outputs:
      latest_branch: ${{ steps.set_latest_branch.outputs.latest_branch }}
    steps:
@@ -36,7 +36,7 @@ jobs:

  release:
    needs: find-and-checkout-latest-branch
-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-latest

    steps:
      - name: Checkout Repo
--- a/.github/workflows/release_tests_fast.yml
+++ b/.github/workflows/release_tests_fast.yml
@@ -1,389 +0,0 @@
-# Duplicate workflow to push_tests.yml that is meant to run on release/patch branches as a final check
-# Creating a duplicate workflow here is simpler than adding complex path/branch parsing logic to push_tests.yml
-# Needs to be updated if push_tests.yml updated
-name: (Release) Fast GPU Tests on main
-
-on:
-  push:
-    branches:
-      - "v*.*.*-release"
-      - "v*.*.*-patch"
-
-env:
-  DIFFUSERS_IS_CI: yes
-  OMP_NUM_THREADS: 8
-  MKL_NUM_THREADS: 8
-  PYTEST_TIMEOUT: 600
-  PIPELINE_USAGE_CUTOFF: 50000
-
-jobs:
-  setup_torch_cuda_pipeline_matrix:
-    name: Setup Torch Pipelines CUDA Slow Tests Matrix
-    runs-on:
-      group: aws-general-8-plus
-    container:
-      image: diffusers/diffusers-pytorch-cpu
-    outputs:
-      pipeline_test_matrix: ${{ steps.fetch_pipeline_matrix.outputs.pipeline_test_matrix }}
-    steps:
-      - name: Checkout diffusers
-        uses: actions/checkout@v3
-        with:
-          fetch-depth: 2
-      - name: Install dependencies
-        run: |
-          python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
-          python -m uv pip install -e [quality,test]
-      - name: Environment
-        run: |
-          python utils/print_env.py
-      - name: Fetch Pipeline Matrix
-        id: fetch_pipeline_matrix
-        run: |
-          matrix=$(python utils/fetch_torch_cuda_pipeline_test_matrix.py)
-          echo $matrix
-          echo "pipeline_test_matrix=$matrix" >> $GITHUB_OUTPUT
-      - name: Pipeline Tests Artifacts
-        if: ${{ always() }}
-        uses: actions/upload-artifact@v4
-        with:
-          name: test-pipelines.json
-          path: reports
-
-  torch_pipelines_cuda_tests:
-    name: Torch Pipelines CUDA Tests
-    needs: setup_torch_cuda_pipeline_matrix
-    strategy:
-      fail-fast: false
-      max-parallel: 8
-      matrix:
-        module: ${{ fromJson(needs.setup_torch_cuda_pipeline_matrix.outputs.pipeline_test_matrix) }}
-    runs-on:
-      group: aws-g4dn-2xlarge
-    container:
-      image: diffusers/diffusers-pytorch-cuda
-      options: --shm-size "16gb" --ipc host --gpus 0
-    steps:
-      - name: Checkout diffusers
-        uses: actions/checkout@v3
-        with:
-          fetch-depth: 2
-      - name: NVIDIA-SMI
-        run: |
-          nvidia-smi
-      - name: Install dependencies
-        run: |
-          python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
-          python -m uv pip install -e [quality,test]
-          pip uninstall accelerate -y && python -m uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git
-      - name: Environment
-        run: |
-          python utils/print_env.py
-      - name: Slow PyTorch CUDA checkpoint tests on Ubuntu
-        env:
-          HF_TOKEN: ${{ secrets.HF_TOKEN }}
-          # https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
-          CUBLAS_WORKSPACE_CONFIG: :16:8
-        run: |
-          python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
-            -s -v -k "not Flax and not Onnx" \
-            --make-reports=tests_pipeline_${{ matrix.module }}_cuda \
-            tests/pipelines/${{ matrix.module }}
-      - name: Failure short reports
-        if: ${{ failure() }}
-        run: |
-          cat reports/tests_pipeline_${{ matrix.module }}_cuda_stats.txt
-          cat reports/tests_pipeline_${{ matrix.module }}_cuda_failures_short.txt
-      - name: Test suite reports artifacts
-        if: ${{ always() }}
-        uses: actions/upload-artifact@v4
-        with:
-          name: pipeline_${{ matrix.module }}_test_reports
-          path: reports
-
-  torch_cuda_tests:
-    name: Torch CUDA Tests
-    runs-on:
-      group: aws-g4dn-2xlarge
-    container:
-      image: diffusers/diffusers-pytorch-cuda
-      options: --shm-size "16gb" --ipc host --gpus 0
-    defaults:
-      run:
-        shell: bash
-    strategy:
-      fail-fast: false
-      max-parallel: 2
-      matrix:
-        module: [models, schedulers, lora, others, single_file]
-    steps:
-    - name: Checkout diffusers
-      uses: actions/checkout@v3
-      with:
-        fetch-depth: 2
-
-    - name: Install dependencies
-      run: |
-        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
-        python -m uv pip install -e [quality,test]
-        python -m uv pip install peft@git+https://github.com/huggingface/peft.git
-        pip uninstall accelerate -y && python -m uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git
-
-    - name: Environment
-      run: |
-        python utils/print_env.py
-
-    - name: Run PyTorch CUDA tests
-      env:
-        HF_TOKEN: ${{ secrets.HF_TOKEN }}
-        # https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
-        CUBLAS_WORKSPACE_CONFIG: :16:8
-      run: |
-        python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
-          -s -v -k "not Flax and not Onnx" \
-          --make-reports=tests_torch_${{ matrix.module }}_cuda \
-          tests/${{ matrix.module }}
-
-    - name: Failure short reports
-      if: ${{ failure() }}
-      run: |
-        cat reports/tests_torch_${{ matrix.module }}_cuda_stats.txt
-        cat reports/tests_torch_${{ matrix.module }}_cuda_failures_short.txt
-
-    - name: Test suite reports artifacts
-      if: ${{ always() }}
-      uses: actions/upload-artifact@v4
-      with:
-        name: torch_cuda_${{ matrix.module }}_test_reports
-        path: reports
-
-  flax_tpu_tests:
-    name: Flax TPU Tests
-    runs-on: docker-tpu
-    container:
-      image: diffusers/diffusers-flax-tpu
-      options: --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ --privileged
-    defaults:
-      run:
-        shell: bash
-    steps:
-    - name: Checkout diffusers
-      uses: actions/checkout@v3
-      with:
-        fetch-depth: 2
-
-    - name: Install dependencies
-      run: |
-        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
-        python -m uv pip install -e [quality,test]
-        pip uninstall accelerate -y && python -m uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git
-
-    - name: Environment
-      run: |
-        python utils/print_env.py
-
-    - name: Run slow Flax TPU tests
-      env:
-        HF_TOKEN: ${{ secrets.HF_TOKEN }}
-      run: |
-        python -m pytest -n 0 \
-          -s -v -k "Flax" \
-          --make-reports=tests_flax_tpu \
-          tests/
-
-    - name: Failure short reports
-      if: ${{ failure() }}
-      run: |
-        cat reports/tests_flax_tpu_stats.txt
-        cat reports/tests_flax_tpu_failures_short.txt
-
-    - name: Test suite reports artifacts
-      if: ${{ always() }}
-      uses: actions/upload-artifact@v4
-      with:
-        name: flax_tpu_test_reports
-        path: reports
-
-  onnx_cuda_tests:
-    name: ONNX CUDA Tests
-    runs-on:
-      group: aws-g4dn-2xlarge
-    container:
-      image: diffusers/diffusers-onnxruntime-cuda
-      options: --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ --gpus 0
-    defaults:
-      run:
-        shell: bash
-    steps:
-    - name: Checkout diffusers
-      uses: actions/checkout@v3
-      with:
-        fetch-depth: 2
-
-    - name: Install dependencies
-      run: |
-        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
-        python -m uv pip install -e [quality,test]
-        pip uninstall accelerate -y && python -m uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git
-
-    - name: Environment
-      run: |
-        python utils/print_env.py
-
-    - name: Run slow ONNXRuntime CUDA tests
-      env:
-        HF_TOKEN: ${{ secrets.HF_TOKEN }}
-      run: |
-        python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
-          -s -v -k "Onnx" \
-          --make-reports=tests_onnx_cuda \
-          tests/
-
-    - name: Failure short reports
-      if: ${{ failure() }}
-      run: |
-        cat reports/tests_onnx_cuda_stats.txt
-        cat reports/tests_onnx_cuda_failures_short.txt
-
-    - name: Test suite reports artifacts
-      if: ${{ always() }}
-      uses: actions/upload-artifact@v4
-      with:
-        name: onnx_cuda_test_reports
-        path: reports
-
-  run_torch_compile_tests:
-    name: PyTorch Compile CUDA tests
-
-    runs-on:
-      group: aws-g4dn-2xlarge
-
-    container:
-      image: diffusers/diffusers-pytorch-compile-cuda
-      options: --gpus 0 --shm-size "16gb" --ipc host
-
-    steps:
-    - name: Checkout diffusers
-      uses: actions/checkout@v3
-      with:
-        fetch-depth: 2
-
-    - name: NVIDIA-SMI
-      run: |
-        nvidia-smi
-    - name: Install dependencies
-      run: |
-        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
-        python -m uv pip install -e [quality,test,training]
-    - name: Environment
-      run: |
-        python utils/print_env.py
-    - name: Run example tests on GPU
-      env:
-        HF_TOKEN: ${{ secrets.HF_TOKEN }}
-        RUN_COMPILE: yes
-      run: |
-        python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile -s -v -k "compile" --make-reports=tests_torch_compile_cuda tests/
-    - name: Failure short reports
-      if: ${{ failure() }}
-      run: cat reports/tests_torch_compile_cuda_failures_short.txt
-
-    - name: Test suite reports artifacts
-      if: ${{ always() }}
-      uses: actions/upload-artifact@v4
-      with:
-        name: torch_compile_test_reports
-        path: reports
-
-  run_xformers_tests:
-    name: PyTorch xformers CUDA tests
-
-    runs-on:
-      group: aws-g4dn-2xlarge
-
-    container:
-      image: diffusers/diffusers-pytorch-xformers-cuda
-      options: --gpus 0 --shm-size "16gb" --ipc host
-
-    steps:
-    - name: Checkout diffusers
-      uses: actions/checkout@v3
-      with:
-        fetch-depth: 2
-
-    - name: NVIDIA-SMI
-      run: |
-        nvidia-smi
-    - name: Install dependencies
-      run: |
-        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
-        python -m uv pip install -e [quality,test,training]
-    - name: Environment
-      run: |
-        python utils/print_env.py
-    - name: Run example tests on GPU
-      env:
-        HF_TOKEN: ${{ secrets.HF_TOKEN }}
-      run: |
-        python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile -s -v -k "xformers" --make-reports=tests_torch_xformers_cuda tests/
-    - name: Failure short reports
-      if: ${{ failure() }}
-      run: cat reports/tests_torch_xformers_cuda_failures_short.txt
-
-    - name: Test suite reports artifacts
-      if: ${{ always() }}
-      uses: actions/upload-artifact@v4
-      with:
-        name: torch_xformers_test_reports
-        path: reports
-
-  run_examples_tests:
-    name: Examples PyTorch CUDA tests on Ubuntu
-
-    runs-on:
-      group: aws-g4dn-2xlarge
-
-    container:
-      image: diffusers/diffusers-pytorch-cuda
-      options: --gpus 0 --shm-size "16gb" --ipc host
-
-    steps:
-    - name: Checkout diffusers
-      uses: actions/checkout@v3
-      with:
-        fetch-depth: 2
-
-    - name: NVIDIA-SMI
-      run: |
-        nvidia-smi
-
-    - name: Install dependencies
-      run: |
-        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
-        python -m uv pip install -e [quality,test,training]
-
-    - name: Environment
-      run: |
-        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
-        python utils/print_env.py
-
-    - name: Run example tests on GPU
-      env:
-        HF_TOKEN: ${{ secrets.HF_TOKEN }}
-      run: |
-        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
-        python -m uv pip install timm
-        python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile -s -v --make-reports=examples_torch_cuda examples/
-
-    - name: Failure short reports
-      if: ${{ failure() }}
-      run: |
-        cat reports/examples_torch_cuda_stats.txt
-        cat reports/examples_torch_cuda_failures_short.txt
-
-    - name: Test suite reports artifacts
-      if: ${{ always() }}
-      uses: actions/upload-artifact@v4
-      with:
-        name: examples_test_reports
-        path: reports
--- a/.github/workflows/ssh-runner.yml
+++ b/.github/workflows/ssh-runner.yml
@@ -4,13 +4,8 @@ on:
  workflow_dispatch:
    inputs:
      runner_type:
-        description: 'Type of runner to test (aws-g6-4xlarge-plus: a10, aws-g4dn-2xlarge: t4, aws-g6e-xlarge-plus: L40)'
-        type: choice
+        description: 'Type of runner to test (a10 or t4)'
        required: true
-        options:
-          - aws-g6-4xlarge-plus
-          - aws-g4dn-2xlarge
-          - aws-g6e-xlarge-plus
      docker_image:
        description: 'Name of the Docker image'
        required: true
--- a/.github/workflows/stale.yml
+++ b/.github/workflows/stale.yml
@@ -8,10 +8,7 @@ jobs:
  close_stale_issues:
    name: Close Stale Issues
    if: github.repository == 'huggingface/diffusers'
-    runs-on: ubuntu-22.04
-    permissions:
-      issues: write
-      pull-requests: write
+    runs-on: ubuntu-latest
    env:
      GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
    steps:
--- a/.github/workflows/trufflehog.yml
+++ b/.github/workflows/trufflehog.yml
@@ -5,7 +5,7 @@ name: Secret Leaks

 jobs:
  trufflehog:
-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-latest
    steps:
    - name: Checkout code
      uses: actions/checkout@v4
--- a/.github/workflows/typos.yml
+++ b/.github/workflows/typos.yml
@@ -5,7 +5,7 @@ on:

 jobs:
  build:
-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v3
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -57,7 +57,7 @@ Any question or comment related to the Diffusers library can be asked on the [di
 - ...

 Every question that is asked on the forum or on Discord actively encourages the community to publicly
-share knowledge and might very well help a beginner in the future who has the same question you're
+share knowledge and might very well help a beginner in the future that has the same question you're
 having. Please do pose any questions you might have.
 In the same spirit, you are of immense help to the community by answering such questions because this way you are publicly documenting knowledge for everybody to learn from.

@@ -503,4 +503,4 @@ $ git push --set-upstream origin your-branch-for-syncing

 ### Style guide

-For documentation strings, 🧨 Diffusers follows the [Google style](https://google.github.io/styleguide/pyguide.html).
+For documentation strings, 🧨 Diffusers follows the [Google style](https://google.github.io/styleguide/pyguide.html).
--- a/PHILOSOPHY.md
+++ b/PHILOSOPHY.md
@@ -15,7 +15,7 @@ specific language governing permissions and limitations under the License.
 🧨 Diffusers provides **state-of-the-art** pretrained diffusion models across multiple modalities.
 Its purpose is to serve as a **modular toolbox** for both inference and training.

-We aim to build a library that stands the test of time and therefore take API design very seriously.
+We aim at building a library that stands the test of time and therefore take API design very seriously.

 In a nutshell, Diffusers is built to be a natural extension of PyTorch. Therefore, most of our design choices are based on [PyTorch's Design Principles](https://pytorch.org/docs/stable/community/design.html#pytorch-design-philosophy). Let's go over the most important ones:

@@ -65,7 +65,7 @@ Pipelines are designed to be easy to use (therefore do not follow [*Simple over
 The following design principles are followed:
 - Pipelines follow the single-file policy. All pipelines can be found in individual directories under src/diffusers/pipelines. One pipeline folder corresponds to one diffusion paper/project/release. Multiple pipeline files can be gathered in one pipeline folder, as it’s done for [`src/diffusers/pipelines/stable-diffusion`](https://github.com/huggingface/diffusers/tree/main/src/diffusers/pipelines/stable_diffusion). If pipelines share similar functionality, one can make use of the [# Copied from mechanism](https://github.com/huggingface/diffusers/blob/125d783076e5bd9785beb05367a2d2566843a271/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py#L251).
 - Pipelines all inherit from [`DiffusionPipeline`].
- Every pipeline consists of different model and scheduler components, that are documented in the [`model_index.json` file](https://huggingface.co/stable-diffusion-v1-5/stable-diffusion-v1-5/blob/main/model_index.json), are accessible under the same name as attributes of the pipeline and can be shared between pipelines with [`DiffusionPipeline.components`](https://huggingface.co/docs/diffusers/main/en/api/diffusion_pipeline#diffusers.DiffusionPipeline.components) function.
+- Every pipeline consists of different model and scheduler components, that are documented in the [`model_index.json` file](https://huggingface.co/runwayml/stable-diffusion-v1-5/blob/main/model_index.json), are accessible under the same name as attributes of the pipeline and can be shared between pipelines with [`DiffusionPipeline.components`](https://huggingface.co/docs/diffusers/main/en/api/diffusion_pipeline#diffusers.DiffusionPipeline.components) function.
 - Every pipeline should be loadable via the [`DiffusionPipeline.from_pretrained`](https://huggingface.co/docs/diffusers/main/en/api/diffusion_pipeline#diffusers.DiffusionPipeline.from_pretrained) function.
 - Pipelines should be used **only** for inference.
 - Pipelines should be very readable, self-explanatory, and easy to tweak.
@@ -107,4 +107,4 @@ The following design principles are followed:
 - Every scheduler exposes the timesteps to be "looped over" via a `timesteps` attribute, which is an array of timesteps the model will be called upon.
 - The `step(...)` function takes a predicted model output and the "current" sample (x_t) and returns the "previous", slightly more denoised sample (x_t-1).
 - Given the complexity of diffusion schedulers, the `step` function does not expose all the complexity and can be a bit of a "black box".
- In almost all cases, novel schedulers shall be implemented in a new scheduling file.
+- In almost all cases, novel schedulers shall be implemented in a new scheduling file.
--- a/README.md
+++ b/README.md
@@ -73,7 +73,7 @@ Generating outputs is super easy with 🤗 Diffusers. To generate an image from
 from diffusers import DiffusionPipeline
 import torch

-pipeline = DiffusionPipeline.from_pretrained("stable-diffusion-v1-5/stable-diffusion-v1-5", torch_dtype=torch.float16)
+pipeline = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16)
 pipeline.to("cuda")
 pipeline("An image of a squirrel in Picasso style").images[0]
 ```
@@ -144,7 +144,7 @@ Also, say 👋 in our public Discord channel <a href="https://discord.gg/G7tWnz9
  <tr style="border-top: 2px solid black">
    <td>Text-to-Image</td>
    <td><a href="https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/text2img">Stable Diffusion Text-to-Image</a></td>
-      <td><a href="https://huggingface.co/stable-diffusion-v1-5/stable-diffusion-v1-5"> stable-diffusion-v1-5/stable-diffusion-v1-5 </a></td>
+      <td><a href="https://huggingface.co/runwayml/stable-diffusion-v1-5"> runwayml/stable-diffusion-v1-5 </a></td>
  </tr>
  <tr>
    <td>Text-to-Image</td>
@@ -174,7 +174,7 @@ Also, say 👋 in our public Discord channel <a href="https://discord.gg/G7tWnz9
  <tr>
    <td>Text-guided Image-to-Image</td>
    <td><a href="https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/img2img">Stable Diffusion Image-to-Image</a></td>
-      <td><a href="https://huggingface.co/stable-diffusion-v1-5/stable-diffusion-v1-5"> stable-diffusion-v1-5/stable-diffusion-v1-5 </a></td>
+      <td><a href="https://huggingface.co/runwayml/stable-diffusion-v1-5"> runwayml/stable-diffusion-v1-5 </a></td>
  </tr>
  <tr style="border-top: 2px solid black">
    <td>Text-guided Image Inpainting</td>
@@ -202,7 +202,6 @@ Also, say 👋 in our public Discord channel <a href="https://discord.gg/G7tWnz9

 - https://github.com/microsoft/TaskMatrix
 - https://github.com/invoke-ai/InvokeAI
- https://github.com/InstantID/InstantID
 - https://github.com/apple/ml-stable-diffusion
 - https://github.com/Sanster/lama-cleaner
 - https://github.com/IDEA-Research/Grounded-Segment-Anything
--- a/benchmarks/base_classes.py
+++ b/benchmarks/base_classes.py
@@ -34,7 +34,7 @@ from utils import (  # noqa: E402


 RESOLUTION_MAPPING = {
-    "Lykon/DreamShaper": (512, 512),
+    "runwayml/stable-diffusion-v1-5": (512, 512),
    "lllyasviel/sd-controlnet-canny": (512, 512),
    "diffusers/controlnet-canny-sdxl-1.0": (1024, 1024),
    "TencentARC/t2iadapter_canny_sd14v1": (512, 512),
@@ -268,7 +268,7 @@ class IPAdapterTextToImageBenchmark(TextToImageBenchmark):
 class ControlNetBenchmark(TextToImageBenchmark):
    pipeline_class = StableDiffusionControlNetPipeline
    aux_network_class = ControlNetModel
-    root_ckpt = "Lykon/DreamShaper"
+    root_ckpt = "runwayml/stable-diffusion-v1-5"

    url = "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/benchmarking/canny_image_condition.png"
    image = load_image(url).convert("RGB")
@@ -311,7 +311,7 @@ class ControlNetSDXLBenchmark(ControlNetBenchmark):
 class T2IAdapterBenchmark(ControlNetBenchmark):
    pipeline_class = StableDiffusionAdapterPipeline
    aux_network_class = T2IAdapter
-    root_ckpt = "Lykon/DreamShaper"
+    root_ckpt = "CompVis/stable-diffusion-v1-4"

    url = "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/benchmarking/canny_for_adapter.png"
    image = load_image(url).convert("L")
--- a/benchmarks/benchmark_ip_adapters.py
+++ b/benchmarks/benchmark_ip_adapters.py
@@ -7,8 +7,7 @@ from base_classes import IPAdapterTextToImageBenchmark  # noqa: E402


 IP_ADAPTER_CKPTS = {
-    # because original SD v1.5 has been taken down.
-    "Lykon/DreamShaper": ("h94/IP-Adapter", "ip-adapter_sd15.bin"),
+    "runwayml/stable-diffusion-v1-5": ("h94/IP-Adapter", "ip-adapter_sd15.bin"),
    "stabilityai/stable-diffusion-xl-base-1.0": ("h94/IP-Adapter", "ip-adapter_sdxl.bin"),
 }

@@ -18,7 +17,7 @@ if __name__ == "__main__":
    parser.add_argument(
        "--ckpt",
        type=str,
-        default="rstabilityai/stable-diffusion-xl-base-1.0",
+        default="runwayml/stable-diffusion-v1-5",
        choices=list(IP_ADAPTER_CKPTS.keys()),
    )
    parser.add_argument("--batch_size", type=int, default=1)
--- a/benchmarks/benchmark_sd_img.py
+++ b/benchmarks/benchmark_sd_img.py
@@ -11,9 +11,9 @@ if __name__ == "__main__":
    parser.add_argument(
        "--ckpt",
        type=str,
-        default="Lykon/DreamShaper",
+        default="runwayml/stable-diffusion-v1-5",
        choices=[
-            "Lykon/DreamShaper",
+            "runwayml/stable-diffusion-v1-5",
            "stabilityai/stable-diffusion-2-1",
            "stabilityai/stable-diffusion-xl-refiner-1.0",
            "stabilityai/sdxl-turbo",
--- a/benchmarks/benchmark_sd_inpainting.py
+++ b/benchmarks/benchmark_sd_inpainting.py
@@ -11,9 +11,9 @@ if __name__ == "__main__":
    parser.add_argument(
        "--ckpt",
        type=str,
-        default="Lykon/DreamShaper",
+        default="runwayml/stable-diffusion-v1-5",
        choices=[
-            "Lykon/DreamShaper",
+            "runwayml/stable-diffusion-v1-5",
            "stabilityai/stable-diffusion-2-1",
            "stabilityai/stable-diffusion-xl-base-1.0",
        ],
--- a/benchmarks/benchmark_text_to_image.py
+++ b/benchmarks/benchmark_text_to_image.py
@@ -7,7 +7,7 @@ from base_classes import TextToImageBenchmark, TurboTextToImageBenchmark  # noqa


 ALL_T2I_CKPTS = [
-    "Lykon/DreamShaper",
+    "runwayml/stable-diffusion-v1-5",
    "segmind/SSD-1B",
    "stabilityai/stable-diffusion-xl-base-1.0",
    "kandinsky-community/kandinsky-2-2-decoder",
@@ -21,7 +21,7 @@ if __name__ == "__main__":
    parser.add_argument(
        "--ckpt",
        type=str,
-        default="Lykon/DreamShaper",
+        default="runwayml/stable-diffusion-v1-5",
        choices=ALL_T2I_CKPTS,
    )
    parser.add_argument("--batch_size", type=int, default=1)
--- a/benchmarks/push_results.py
+++ b/benchmarks/push_results.py
@@ -3,7 +3,7 @@ import sys

 import pandas as pd
 from huggingface_hub import hf_hub_download, upload_file
-from huggingface_hub.utils import EntryNotFoundError
+from huggingface_hub.utils._errors import EntryNotFoundError


 sys.path.append(".")
--- a/docker/diffusers-flax-cpu/Dockerfile
+++ b/docker/diffusers-flax-cpu/Dockerfile
@@ -43,7 +43,6 @@ RUN python3 -m pip install --no-cache-dir --upgrade pip uv==0.1.11 && \
        numpy==1.26.4 \
        scipy \
        tensorboard \
-        transformers \
-        hf_transfer
+        transformers

 CMD ["/bin/bash"]
--- a/docker/diffusers-flax-tpu/Dockerfile
+++ b/docker/diffusers-flax-tpu/Dockerfile
@@ -45,7 +45,6 @@ RUN python3 -m pip install --no-cache-dir --upgrade pip uv==0.1.11 && \
        numpy==1.26.4 \
        scipy \
        tensorboard \
-        transformers \
-        hf_transfer
+        transformers

 CMD ["/bin/bash"]
--- a/docker/diffusers-onnxruntime-cpu/Dockerfile
+++ b/docker/diffusers-onnxruntime-cpu/Dockerfile
@@ -43,7 +43,6 @@ RUN python3 -m pip install --no-cache-dir --upgrade pip uv==0.1.11 && \
        numpy==1.26.4 \
        scipy \
        tensorboard \
-        transformers \
-        hf_transfer
+        transformers

 CMD ["/bin/bash"]
--- a/docker/diffusers-onnxruntime-cuda/Dockerfile
+++ b/docker/diffusers-onnxruntime-cuda/Dockerfile
@@ -28,7 +28,7 @@ ENV PATH="/opt/venv/bin:$PATH"
 # pre-install the heavy dependencies (these can later be overridden by the deps from setup.py)
 RUN python3.10 -m pip install --no-cache-dir --upgrade pip uv==0.1.11 && \
    python3.10 -m uv pip install --no-cache-dir \
-        "torch<2.5.0" \
+        torch \
        torchvision \
        torchaudio \
        "onnxruntime-gpu>=1.13.1" \
@@ -44,7 +44,6 @@ RUN python3.10 -m pip install --no-cache-dir --upgrade pip uv==0.1.11 && \
        numpy==1.26.4 \
        scipy \
        tensorboard \
-        transformers \
-        hf_transfer
+        transformers

 CMD ["/bin/bash"]
--- a/docker/diffusers-pytorch-compile-cuda/Dockerfile
+++ b/docker/diffusers-pytorch-compile-cuda/Dockerfile
@@ -29,7 +29,7 @@ ENV PATH="/opt/venv/bin:$PATH"
 # pre-install the heavy dependencies (these can later be overridden by the deps from setup.py)
 RUN python3.10 -m pip install --no-cache-dir --upgrade pip uv==0.1.11 && \
    python3.10 -m uv pip install --no-cache-dir \
-    "torch<2.5.0" \
+    torch \
    torchvision \
    torchaudio \
    invisible_watermark && \
@@ -44,7 +44,6 @@ RUN python3.10 -m pip install --no-cache-dir --upgrade pip uv==0.1.11 && \
    numpy==1.26.4 \
    scipy \
    tensorboard \
-    transformers \
-    hf_transfer
+    transformers

 CMD ["/bin/bash"]
--- a/docker/diffusers-pytorch-cpu/Dockerfile
+++ b/docker/diffusers-pytorch-cpu/Dockerfile
@@ -29,7 +29,7 @@ ENV PATH="/opt/venv/bin:$PATH"
 # pre-install the heavy dependencies (these can later be overridden by the deps from setup.py)
 RUN python3.10 -m pip install --no-cache-dir --upgrade pip uv==0.1.11 && \
    python3.10 -m uv pip install --no-cache-dir \
-        "torch<2.5.0" \
+        torch \
        torchvision \
        torchaudio \
        invisible_watermark \
@@ -44,7 +44,6 @@ RUN python3.10 -m pip install --no-cache-dir --upgrade pip uv==0.1.11 && \
        numpy==1.26.4 \
        scipy \
        tensorboard \
-        transformers matplotlib  \
-        hf_transfer
+        transformers matplotlib

 CMD ["/bin/bash"]
--- a/docker/diffusers-pytorch-cuda/Dockerfile
+++ b/docker/diffusers-pytorch-cuda/Dockerfile
@@ -29,7 +29,7 @@ ENV PATH="/opt/venv/bin:$PATH"
 # pre-install the heavy dependencies (these can later be overridden by the deps from setup.py)
 RUN python3.10 -m pip install --no-cache-dir --upgrade pip uv==0.1.11 && \
    python3.10 -m uv pip install --no-cache-dir \
-    "torch<2.5.0" \
+    torch \
    torchvision \
    torchaudio \
    invisible_watermark && \
@@ -45,7 +45,6 @@ RUN python3.10 -m pip install --no-cache-dir --upgrade pip uv==0.1.11 && \
    scipy \
    tensorboard \
    transformers \
-    pytorch-lightning  \
-    hf_transfer
+    pytorch-lightning

 CMD ["/bin/bash"]
--- a/docker/diffusers-pytorch-xformers-cuda/Dockerfile
+++ b/docker/diffusers-pytorch-xformers-cuda/Dockerfile
@@ -29,7 +29,7 @@ ENV PATH="/opt/venv/bin:$PATH"
 # pre-install the heavy dependencies (these can later be overridden by the deps from setup.py)
 RUN python3.10 -m pip install --no-cache-dir --upgrade pip uv==0.1.11 && \
    python3.10 -m pip install --no-cache-dir \
-        "torch<2.5.0" \
+        torch \
        torchvision \
        torchaudio \
        invisible_watermark && \
@@ -45,7 +45,6 @@ RUN python3.10 -m pip install --no-cache-dir --upgrade pip uv==0.1.11 && \
        scipy \
        tensorboard \
        transformers \
-        xformers  \
-        hf_transfer
+        xformers

 CMD ["/bin/bash"]
--- a/docs/source/en/_toctree.yml
+++ b/docs/source/en/_toctree.yml
@@ -56,7 +56,7 @@
  - local: using-diffusers/overview_techniques
    title: Overview
  - local: training/distributed_inference
-    title: Distributed inference
+    title: Distributed inference with multiple GPUs
  - local: using-diffusers/merge_loras
    title: Merge LoRAs
  - local: using-diffusers/scheduler_features
@@ -75,8 +75,6 @@
    title: Outpainting
  title: Advanced inference
 - sections:
-  - local: using-diffusers/cogvideox
-    title: CogVideoX
  - local: using-diffusers/sdxl
    title: Stable Diffusion XL
  - local: using-diffusers/sdxl_turbo
@@ -131,8 +129,6 @@
      title: T2I-Adapters
    - local: training/instructpix2pix
      title: InstructPix2Pix
-    - local: training/cogvideox
-      title: CogVideoX
    title: Models
  - isExpanded: false
    sections:
@@ -150,12 +146,6 @@
      title: Reinforcement learning training with DDPO
    title: Methods
  title: Training
- sections:
-  - local: quantization/overview
-    title: Getting Started
-  - local: quantization/bitsandbytes
-    title: bitsandbytes
-  title: Quantization Methods
 - sections:
  - local: optimization/fp16
    title: Speed up inference
@@ -171,8 +161,6 @@
    title: DeepCache
  - local: optimization/tgate
    title: TGATE
-  - local: optimization/xdit
-    title: xDiT
  - sections:
    - local: using-diffusers/stable_diffusion_jax_how_to
      title: JAX/Flax
@@ -188,8 +176,6 @@
      title: Metal Performance Shaders (MPS)
    - local: optimization/habana
      title: Habana Gaudi
-    - local: optimization/neuron
-      title: AWS Neuron
    title: Optimized hardware
  title: Accelerate inference and reduce memory
 - sections:
@@ -204,10 +190,6 @@
  - local: conceptual/evaluation
    title: Evaluating Diffusion Models
  title: Conceptual Guides
- sections:
-  - local: community_projects
-    title: Projects built with Diffusers
-  title: Community Projects
 - sections:
  - isExpanded: false
    sections:
@@ -217,8 +199,6 @@
      title: Logging
    - local: api/outputs
      title: Outputs
-    - local: api/quantization
-      title: Quantization
    title: Main Classes
  - isExpanded: false
    sections:
@@ -239,95 +219,67 @@
    sections:
    - local: api/models/overview
      title: Overview
-    - sections:
-      - local: api/models/controlnet
-        title: ControlNetModel
-      - local: api/models/controlnet_flux
-        title: FluxControlNetModel
-      - local: api/models/controlnet_hunyuandit
-        title: HunyuanDiT2DControlNetModel
-      - local: api/models/controlnet_sd3
-        title: SD3ControlNetModel
-      - local: api/models/controlnet_sparsectrl
-        title: SparseControlNetModel
-      title: ControlNets
-    - sections:
-      - local: api/models/allegro_transformer3d
-        title: AllegroTransformer3DModel
-      - local: api/models/aura_flow_transformer2d
-        title: AuraFlowTransformer2DModel
-      - local: api/models/cogvideox_transformer3d
-        title: CogVideoXTransformer3DModel
-      - local: api/models/cogview3plus_transformer2d
-        title: CogView3PlusTransformer2DModel
-      - local: api/models/dit_transformer2d
-        title: DiTTransformer2DModel
-      - local: api/models/flux_transformer
-        title: FluxTransformer2DModel
-      - local: api/models/hunyuan_transformer2d
-        title: HunyuanDiT2DModel
-      - local: api/models/latte_transformer3d
-        title: LatteTransformer3DModel
-      - local: api/models/lumina_nextdit2d
-        title: LuminaNextDiT2DModel
-      - local: api/models/mochi_transformer3d
-        title: MochiTransformer3DModel
-      - local: api/models/pixart_transformer2d
-        title: PixArtTransformer2DModel
-      - local: api/models/prior_transformer
-        title: PriorTransformer
-      - local: api/models/sd3_transformer2d
-        title: SD3Transformer2DModel
-      - local: api/models/stable_audio_transformer
-        title: StableAudioDiTModel
-      - local: api/models/transformer2d
-        title: Transformer2DModel
-      - local: api/models/transformer_temporal
-        title: TransformerTemporalModel
-      title: Transformers
-    - sections:
-      - local: api/models/stable_cascade_unet
-        title: StableCascadeUNet
-      - local: api/models/unet
-        title: UNet1DModel
-      - local: api/models/unet2d
-        title: UNet2DModel
-      - local: api/models/unet2d-cond
-        title: UNet2DConditionModel
-      - local: api/models/unet3d-cond
-        title: UNet3DConditionModel
-      - local: api/models/unet-motion
-        title: UNetMotionModel
-      - local: api/models/uvit2d
-        title: UViT2DModel
-      title: UNets
-    - sections:
-      - local: api/models/autoencoderkl
-        title: AutoencoderKL
-      - local: api/models/autoencoderkl_allegro
-        title: AutoencoderKLAllegro
-      - local: api/models/autoencoderkl_cogvideox
-        title: AutoencoderKLCogVideoX
-      - local: api/models/autoencoderkl_mochi
-        title: AutoencoderKLMochi
-      - local: api/models/asymmetricautoencoderkl
-        title: AsymmetricAutoencoderKL
-      - local: api/models/consistency_decoder_vae
-        title: ConsistencyDecoderVAE
-      - local: api/models/autoencoder_oobleck
-        title: Oobleck AutoEncoder
-      - local: api/models/autoencoder_tiny
-        title: Tiny AutoEncoder
-      - local: api/models/vq
-        title: VQModel
-      title: VAEs
+    - local: api/models/unet
+      title: UNet1DModel
+    - local: api/models/unet2d
+      title: UNet2DModel
+    - local: api/models/unet2d-cond
+      title: UNet2DConditionModel
+    - local: api/models/unet3d-cond
+      title: UNet3DConditionModel
+    - local: api/models/unet-motion
+      title: UNetMotionModel
+    - local: api/models/uvit2d
+      title: UViT2DModel
+    - local: api/models/vq
+      title: VQModel
+    - local: api/models/autoencoderkl
+      title: AutoencoderKL
+    - local: api/models/asymmetricautoencoderkl
+      title: AsymmetricAutoencoderKL
+    - local: api/models/autoencoder_tiny
+      title: Tiny AutoEncoder
+    - local: api/models/autoencoder_oobleck
+      title: Oobleck AutoEncoder
+    - local: api/models/consistency_decoder_vae
+      title: ConsistencyDecoderVAE
+    - local: api/models/transformer2d
+      title: Transformer2DModel
+    - local: api/models/pixart_transformer2d
+      title: PixArtTransformer2DModel
+    - local: api/models/dit_transformer2d
+      title: DiTTransformer2DModel
+    - local: api/models/hunyuan_transformer2d
+      title: HunyuanDiT2DModel
+    - local: api/models/aura_flow_transformer2d
+      title: AuraFlowTransformer2DModel
+    - local: api/models/flux_transformer
+      title: FluxTransformer2DModel
+    - local: api/models/latte_transformer3d
+      title: LatteTransformer3DModel
+    - local: api/models/lumina_nextdit2d
+      title: LuminaNextDiT2DModel
+    - local: api/models/transformer_temporal
+      title: TransformerTemporalModel
+    - local: api/models/sd3_transformer2d
+      title: SD3Transformer2DModel
+    - local: api/models/stable_audio_transformer
+      title: StableAudioDiTModel
+    - local: api/models/prior_transformer
+      title: PriorTransformer
+    - local: api/models/controlnet
+      title: ControlNetModel
+    - local: api/models/controlnet_hunyuandit
+      title: HunyuanDiT2DControlNetModel
+    - local: api/models/controlnet_sd3
+      title: SD3ControlNetModel
+    - local: api/models/controlnet_sparsectrl
+      title: SparseControlNetModel
    title: Models
  - isExpanded: false
    sections:
    - local: api/pipelines/overview
      title: Overview
-    - local: api/pipelines/allegro
-      title: Allegro
    - local: api/pipelines/amused
      title: aMUSEd
    - local: api/pipelines/animatediff
@@ -344,16 +296,10 @@
      title: AutoPipeline
    - local: api/pipelines/blip_diffusion
      title: BLIP-Diffusion
-    - local: api/pipelines/cogvideox
-      title: CogVideoX
-    - local: api/pipelines/cogview3
-      title: CogView3
    - local: api/pipelines/consistency_models
      title: Consistency Models
    - local: api/pipelines/controlnet
      title: ControlNet
-    - local: api/pipelines/controlnet_flux
-      title: ControlNet with Flux.1
    - local: api/pipelines/controlnet_hunyuandit
      title: ControlNet with Hunyuan-DiT
    - local: api/pipelines/controlnet_sd3
@@ -404,8 +350,6 @@
      title: Lumina-T2X
    - local: api/pipelines/marigold
      title: Marigold
-    - local: api/pipelines/mochi
-      title: Mochi
    - local: api/pipelines/panorama
      title: MultiDiffusion
    - local: api/pipelines/musicldm
--- a/docs/source/en/api/loaders/single_file.md
+++ b/docs/source/en/api/loaders/single_file.md
@@ -22,6 +22,7 @@ The [`~loaders.FromSingleFileMixin.from_single_file`] method allows you to load:

 ## Supported pipelines

+- [`CogVideoXPipeline`]
 - [`StableDiffusionPipeline`]
 - [`StableDiffusionImg2ImgPipeline`]
 - [`StableDiffusionInpaintPipeline`]
@@ -49,9 +50,9 @@ The [`~loaders.FromSingleFileMixin.from_single_file`] method allows you to load:
 - [`UNet2DConditionModel`]
 - [`StableCascadeUNet`]
 - [`AutoencoderKL`]
+- [`AutoencoderKLCogVideoX`]
 - [`ControlNetModel`]
 - [`SD3Transformer2DModel`]
- [`FluxTransformer2DModel`]

 ## FromSingleFileMixin

--- a/docs/source/en/api/models/allegro_transformer3d.md
+++ b/docs/source/en/api/models/allegro_transformer3d.md
@@ -1,30 +0,0 @@
-<!-- Copyright 2024 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License. -->
-
-# AllegroTransformer3DModel
-
-A Diffusion Transformer model for 3D data from [Allegro](https://github.com/rhymes-ai/Allegro) was introduced in [Allegro: Open the Black Box of Commercial-Level Video Generation Model](https://huggingface.co/papers/2410.15458) by RhymesAI.
-
-The model can be loaded with the following code snippet.
-
-```python
-from diffusers import AllegroTransformer3DModel
-
-vae = AllegroTransformer3DModel.from_pretrained("rhymes-ai/Allegro", subfolder="transformer", torch_dtype=torch.bfloat16).to("cuda")
-```
-
-## AllegroTransformer3DModel
-
-[[autodoc]] AllegroTransformer3DModel
-
-## Transformer2DModelOutput
-
-[[autodoc]] models.modeling_outputs.Transformer2DModelOutput
--- a/docs/source/en/api/models/autoencoderkl_allegro.md
+++ b/docs/source/en/api/models/autoencoderkl_allegro.md
@@ -1,37 +0,0 @@
-<!-- Copyright 2024 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License. -->
-
-# AutoencoderKLAllegro
-
-The 3D variational autoencoder (VAE) model with KL loss used in [Allegro](https://github.com/rhymes-ai/Allegro) was introduced in [Allegro: Open the Black Box of Commercial-Level Video Generation Model](https://huggingface.co/papers/2410.15458) by RhymesAI.
-
-The model can be loaded with the following code snippet.
-
-```python
-from diffusers import AutoencoderKLAllegro
-
-vae = AutoencoderKLCogVideoX.from_pretrained("rhymes-ai/Allegro", subfolder="vae", torch_dtype=torch.float32).to("cuda")
-```
-
-## AutoencoderKLAllegro
-
-[[autodoc]] AutoencoderKLAllegro
-    - decode
-    - encode
-    - all
-
-## AutoencoderKLOutput
-
-[[autodoc]] models.autoencoders.autoencoder_kl.AutoencoderKLOutput
-
-## DecoderOutput
-
-[[autodoc]] models.autoencoders.vae.DecoderOutput
--- a/docs/source/en/api/models/autoencoderkl_cogvideox.md
+++ b/docs/source/en/api/models/autoencoderkl_cogvideox.md
@@ -11,14 +11,18 @@ specific language governing permissions and limitations under the License. -->

 # AutoencoderKLCogVideoX

-The 3D variational autoencoder (VAE) model with KL loss used in [CogVideoX](https://github.com/THUDM/CogVideo) was introduced in [CogVideoX: Text-to-Video Diffusion Models with An Expert Transformer](https://github.com/THUDM/CogVideo/blob/main/resources/CogVideoX.pdf) by Tsinghua University & ZhipuAI.
+The 3D variational autoencoder (VAE) model with KL loss used in CogVideoX.

-The model can be loaded with the following code snippet.
+## Loading from the original format

-```python
+By default, the [`AutoencoderKLCogVideoX`] should be loaded with [`~ModelMixin.from_pretrained`], but it can also be loaded from the original format using [`FromOriginalModelMixin.from_single_file`] as follows:
+
+```py
 from diffusers import AutoencoderKLCogVideoX

-vae = AutoencoderKLCogVideoX.from_pretrained("THUDM/CogVideoX-2b", subfolder="vae", torch_dtype=torch.float16).to("cuda")
+url = "THUDM/CogVideoX-2b"  # can also be a local file
+model = AutoencoderKLCogVideoX.from_single_file(url)
+
 ```

 ## AutoencoderKLCogVideoX
@@ -28,10 +32,38 @@ vae = AutoencoderKLCogVideoX.from_pretrained("THUDM/CogVideoX-2b", subfolder="va
    - encode
    - all

-## AutoencoderKLOutput
+## CogVideoXSafeConv3d

-[[autodoc]] models.autoencoders.autoencoder_kl.AutoencoderKLOutput
+[[autodoc]] CogVideoXSafeConv3d

-## DecoderOutput
+## CogVideoXCausalConv3d

-[[autodoc]] models.autoencoders.vae.DecoderOutput
+[[autodoc]] CogVideoXCausalConv3d
+
+## CogVideoXSpatialNorm3D
+
+[[autodoc]] CogVideoXSpatialNorm3D
+
+## CogVideoXResnetBlock3D
+
+[[autodoc]] CogVideoXResnetBlock3D
+
+## CogVideoXDownBlock3D
+
+[[autodoc]] CogVideoXDownBlock3D
+
+## CogVideoXMidBlock3D
+
+[[autodoc]] CogVideoXMidBlock3D
+
+## CogVideoXUpBlock3D
+
+[[autodoc]] CogVideoXUpBlock3D
+
+## CogVideoXEncoder3D
+
+[[autodoc]] CogVideoXEncoder3D
+
+## CogVideoXDecoder3D
+
+[[autodoc]] CogVideoXDecoder3D
--- a/docs/source/en/api/models/autoencoderkl_mochi.md
+++ b/docs/source/en/api/models/autoencoderkl_mochi.md
@@ -1,32 +0,0 @@
-<!-- Copyright 2024 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License. -->
-
-# AutoencoderKLMochi
-
-The 3D variational autoencoder (VAE) model with KL loss used in [Mochi](https://github.com/genmoai/models) was introduced in [Mochi 1 Preview](https://huggingface.co/genmo/mochi-1-preview) by Tsinghua University & ZhipuAI.
-
-The model can be loaded with the following code snippet.
-
-```python
-from diffusers import AutoencoderKLMochi
-
-vae = AutoencoderKLMochi.from_pretrained("genmo/mochi-1-preview", subfolder="vae", torch_dtype=torch.float32).to("cuda")
-```
-
-## AutoencoderKLMochi
-
-[[autodoc]] AutoencoderKLMochi
-    - decode
-    - all
-
-## DecoderOutput
-
-[[autodoc]] models.autoencoders.vae.DecoderOutput
--- a/docs/source/en/api/models/cogvideox_transformer3d.md
+++ b/docs/source/en/api/models/cogvideox_transformer3d.md
@@ -9,22 +9,10 @@ Unless required by applicable law or agreed to in writing, software distributed
 an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
 specific language governing permissions and limitations under the License. -->

-# CogVideoXTransformer3DModel
+## CogVideoXTransformer3DModel

-A Diffusion Transformer model for 3D data from [CogVideoX](https://github.com/THUDM/CogVideo) was introduced in [CogVideoX: Text-to-Video Diffusion Models with An Expert Transformer](https://github.com/THUDM/CogVideo/blob/main/resources/CogVideoX.pdf) by Tsinghua University & ZhipuAI.
-
-The model can be loaded with the following code snippet.
-
-```python
-from diffusers import CogVideoXTransformer3DModel
-
-vae = CogVideoXTransformer3DModel.from_pretrained("THUDM/CogVideoX-2b", subfolder="transformer", torch_dtype=torch.float16).to("cuda")
-```
+A Diffusion Transformer model for 3D data from [CogVideoX](https://github.com/THUDM/CogVideo).

 ## CogVideoXTransformer3DModel

 [[autodoc]] CogVideoXTransformer3DModel
-
-## Transformer2DModelOutput
-
-[[autodoc]] models.modeling_outputs.Transformer2DModelOutput
--- a/docs/source/en/api/models/cogview3plus_transformer2d.md
+++ b/docs/source/en/api/models/cogview3plus_transformer2d.md
@@ -1,30 +0,0 @@
-<!--Copyright 2024 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License. -->
-
-# CogView3PlusTransformer2DModel
-
-A Diffusion Transformer model for 2D data from [CogView3Plus](https://github.com/THUDM/CogView3) was introduced in [CogView3: Finer and Faster Text-to-Image Generation via Relay Diffusion](https://huggingface.co/papers/2403.05121) by Tsinghua University & ZhipuAI.
-
-The model can be loaded with the following code snippet.
-
-```python
-from diffusers import CogView3PlusTransformer2DModel
-
-vae = CogView3PlusTransformer2DModel.from_pretrained("THUDM/CogView3Plus-3b", subfolder="transformer", torch_dtype=torch.bfloat16).to("cuda")
-```
-
-## CogView3PlusTransformer2DModel
-
-[[autodoc]] CogView3PlusTransformer2DModel
-
-## Transformer2DModelOutput
-
-[[autodoc]] models.modeling_outputs.Transformer2DModelOutput
--- a/docs/source/en/api/models/controlnet.md
+++ b/docs/source/en/api/models/controlnet.md
@@ -29,7 +29,7 @@ from diffusers import StableDiffusionControlNetPipeline, ControlNetModel
 url = "https://huggingface.co/lllyasviel/ControlNet-v1-1/blob/main/control_v11p_sd15_canny.pth"  # can also be a local path
 controlnet = ControlNetModel.from_single_file(url)

-url = "https://huggingface.co/stable-diffusion-v1-5/stable-diffusion-v1-5/blob/main/v1-5-pruned.safetensors"  # can also be a local path
+url = "https://huggingface.co/runwayml/stable-diffusion-v1-5/blob/main/v1-5-pruned.safetensors"  # can also be a local path
 pipe = StableDiffusionControlNetPipeline.from_single_file(url, controlnet=controlnet)
 ```

--- a/docs/source/en/api/models/controlnet_flux.md
+++ b/docs/source/en/api/models/controlnet_flux.md
@@ -1,45 +0,0 @@
-<!--Copyright 2024 The HuggingFace Team and The InstantX Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-->
-
-# FluxControlNetModel
-
-FluxControlNetModel is an implementation of ControlNet for Flux.1.
-
-The ControlNet model was introduced in [Adding Conditional Control to Text-to-Image Diffusion Models](https://huggingface.co/papers/2302.05543) by Lvmin Zhang, Anyi Rao, Maneesh Agrawala. It provides a greater degree of control over text-to-image generation by conditioning the model on additional inputs such as edge maps, depth maps, segmentation maps, and keypoints for pose detection.
-
-The abstract from the paper is:
-
-*We present ControlNet, a neural network architecture to add spatial conditioning controls to large, pretrained text-to-image diffusion models. ControlNet locks the production-ready large diffusion models, and reuses their deep and robust encoding layers pretrained with billions of images as a strong backbone to learn a diverse set of conditional controls. The neural architecture is connected with "zero convolutions" (zero-initialized convolution layers) that progressively grow the parameters from zero and ensure that no harmful noise could affect the finetuning. We test various conditioning controls, eg, edges, depth, segmentation, human pose, etc, with Stable Diffusion, using single or multiple conditions, with or without prompts. We show that the training of ControlNets is robust with small (<50k) and large (>1m) datasets. Extensive results show that ControlNet may facilitate wider applications to control image diffusion models.*
-
-## Loading from the original format
-
-By default the [`FluxControlNetModel`] should be loaded with [`~ModelMixin.from_pretrained`].
-
-```py
-from diffusers import FluxControlNetPipeline
-from diffusers.models import FluxControlNetModel, FluxMultiControlNetModel
-
-controlnet = FluxControlNetModel.from_pretrained("InstantX/FLUX.1-dev-Controlnet-Canny")
-pipe = FluxControlNetPipeline.from_pretrained("black-forest-labs/FLUX.1-dev", controlnet=controlnet)
-
-controlnet = FluxControlNetModel.from_pretrained("InstantX/FLUX.1-dev-Controlnet-Canny")
-controlnet = FluxMultiControlNetModel([controlnet])
-pipe = FluxControlNetPipeline.from_pretrained("black-forest-labs/FLUX.1-dev", controlnet=controlnet)
-```
-
-## FluxControlNetModel
-
-[[autodoc]] FluxControlNetModel
-
-## FluxControlNetOutput
-
-[[autodoc]] models.controlnet_flux.FluxControlNetOutput
--- a/docs/source/en/api/models/mochi_transformer3d.md
+++ b/docs/source/en/api/models/mochi_transformer3d.md
@@ -1,30 +0,0 @@
-<!-- Copyright 2024 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License. -->
-
-# MochiTransformer3DModel
-
-A Diffusion Transformer model for 3D video-like data was introduced in [Mochi-1 Preview](https://huggingface.co/genmo/mochi-1-preview) by Genmo.
-
-The model can be loaded with the following code snippet.
-
-```python
-from diffusers import MochiTransformer3DModel
-
-vae = MochiTransformer3DModel.from_pretrained("genmo/mochi-1-preview", subfolder="transformer", torch_dtype=torch.float16).to("cuda")
-```
-
-## MochiTransformer3DModel
-
-[[autodoc]] MochiTransformer3DModel
-
-## Transformer2DModelOutput
-
-[[autodoc]] models.modeling_outputs.Transformer2DModelOutput
--- a/docs/source/en/api/models/stable_cascade_unet.md
+++ b/docs/source/en/api/models/stable_cascade_unet.md
@@ -1,19 +0,0 @@
-<!--Copyright 2024 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-->
-
-# StableCascadeUNet
-
-A UNet model from the [Stable Cascade pipeline](../pipelines/stable_cascade.md).
-
-## StableCascadeUNet
-
-[[autodoc]] models.unets.unet_stable_cascade.StableCascadeUNet
--- a/docs/source/en/api/pipelines/allegro.md
+++ b/docs/source/en/api/pipelines/allegro.md
@@ -1,34 +0,0 @@
-<!-- Copyright 2024 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License. -->
-
-# Allegro
-
-[Allegro: Open the Black Box of Commercial-Level Video Generation Model](https://huggingface.co/papers/2410.15458) from RhymesAI, by Yuan Zhou, Qiuyue Wang, Yuxuan Cai, Huan Yang.
-
-The abstract from the paper is:
-
-*Significant advancements have been made in the field of video generation, with the open-source community contributing a wealth of research papers and tools for training high-quality models. However, despite these efforts, the available information and resources remain insufficient for achieving commercial-level performance. In this report, we open the black box and introduce Allegro, an advanced video generation model that excels in both quality and temporal consistency. We also highlight the current limitations in the field and present a comprehensive methodology for training high-performance, commercial-level video generation models, addressing key aspects such as data, model architecture, training pipeline, and evaluation. Our user study shows that Allegro surpasses existing open-source models and most commercial models, ranking just behind Hailuo and Kling. Code: https://github.com/rhymes-ai/Allegro , Model: https://huggingface.co/rhymes-ai/Allegro , Gallery: https://rhymes.ai/allegro_gallery .*
-
-<Tip>
-
-Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers.md) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading.md#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines.
-
-</Tip>
-
-## AllegroPipeline
-
-[[autodoc]] AllegroPipeline
-  - all
-  - __call__
-
-## AllegroPipelineOutput
-
-[[autodoc]] pipelines.allegro.pipeline_output.AllegroPipelineOutput
--- a/docs/source/en/api/pipelines/animatediff.md
+++ b/docs/source/en/api/pipelines/animatediff.md
@@ -29,7 +29,6 @@ The abstract of the paper is the following:
 | [AnimateDiffSparseControlNetPipeline](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/animatediff/pipeline_animatediff_sparsectrl.py) | *Controlled Video-to-Video Generation with AnimateDiff using SparseCtrl* |
 | [AnimateDiffSDXLPipeline](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/animatediff/pipeline_animatediff_sdxl.py) | *Video-to-Video Generation with AnimateDiff* |
 | [AnimateDiffVideoToVideoPipeline](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py) | *Video-to-Video Generation with AnimateDiff* |
-| [AnimateDiffVideoToVideoControlNetPipeline](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video_controlnet.py) | *Video-to-Video Generation with AnimateDiff using ControlNet* |

 ## Available checkpoints

@@ -519,97 +518,6 @@ Here are some sample outputs:
    </tr>
 </table>

-
-
-### AnimateDiffVideoToVideoControlNetPipeline
-
-AnimateDiff can be used together with ControlNets to enhance video-to-video generation by allowing for precise control over the output. ControlNet was introduced in [Adding Conditional Control to Text-to-Image Diffusion Models](https://huggingface.co/papers/2302.05543) by Lvmin Zhang, Anyi Rao, and Maneesh Agrawala, and allows you to condition Stable Diffusion with an additional control image to ensure that the spatial information is preserved throughout the video. 
-
-This pipeline allows you to condition your generation both on the original video and on a sequence of control images.
-
-```python
-import torch
-from PIL import Image
-from tqdm.auto import tqdm
-
-from controlnet_aux.processor import OpenposeDetector
-from diffusers import AnimateDiffVideoToVideoControlNetPipeline
-from diffusers.utils import export_to_gif, load_video
-from diffusers import AutoencoderKL, ControlNetModel, MotionAdapter, LCMScheduler
-
-# Load the ControlNet
-controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-openpose", torch_dtype=torch.float16)
-# Load the motion adapter
-motion_adapter = MotionAdapter.from_pretrained("wangfuyun/AnimateLCM")
-# Load SD 1.5 based finetuned model
-vae = AutoencoderKL.from_pretrained("stabilityai/sd-vae-ft-mse", torch_dtype=torch.float16)
-pipe = AnimateDiffVideoToVideoControlNetPipeline.from_pretrained(
-    "SG161222/Realistic_Vision_V5.1_noVAE",
-    motion_adapter=motion_adapter,
-    controlnet=controlnet,
-    vae=vae,
-).to(device="cuda", dtype=torch.float16)
-
-# Enable LCM to speed up inference
-pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config, beta_schedule="linear")
-pipe.load_lora_weights("wangfuyun/AnimateLCM", weight_name="AnimateLCM_sd15_t2v_lora.safetensors", adapter_name="lcm-lora")
-pipe.set_adapters(["lcm-lora"], [0.8])
-
-video = load_video("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/dance.gif")
-video = [frame.convert("RGB") for frame in video]
-
-prompt = "astronaut in space, dancing"
-negative_prompt = "bad quality, worst quality, jpeg artifacts, ugly"
-
-# Create controlnet preprocessor
-open_pose = OpenposeDetector.from_pretrained("lllyasviel/Annotators").to("cuda")
-
-# Preprocess controlnet images
-conditioning_frames = []
-for frame in tqdm(video):
-    conditioning_frames.append(open_pose(frame))
-
-strength = 0.8
-with torch.inference_mode():
-    video = pipe(
-        video=video,
-        prompt=prompt,
-        negative_prompt=negative_prompt,
-        num_inference_steps=10,
-        guidance_scale=2.0,
-        controlnet_conditioning_scale=0.75,
-        conditioning_frames=conditioning_frames,
-        strength=strength,
-        generator=torch.Generator().manual_seed(42),
-    ).frames[0]
-
-video = [frame.resize(conditioning_frames[0].size) for frame in video]
-export_to_gif(video, f"animatediff_vid2vid_controlnet.gif", fps=8)
-```
-
-Here are some sample outputs:
-
-<table align="center">
-    <tr>
-      <th align="center">Source Video</th>
-      <th align="center">Output Video</th>
-    </tr>
-    <tr>
-        <td align="center">
-          anime girl, dancing
-          <br />
-          <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/dance.gif" alt="anime girl, dancing" />
-        </td>
-        <td align="center">
-          astronaut in space, dancing
-          <br/>
-          <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/animatediff_vid2vid_controlnet.gif" alt="astronaut in space, dancing" />
-        </td>
-    </tr>
-</table>
-
-**The lights and composition were transferred from the Source Video.**
-
 ## Using Motion LoRAs

 Motion LoRAs are a collection of LoRAs that work with the `guoyww/animatediff-motion-adapter-v1-5-2` checkpoint. These LoRAs are responsible for adding specific types of motion to the animations.
@@ -914,89 +822,6 @@ export_to_gif(frames, "animatelcm-motion-lora.gif")
    </tr>
 </table>

-## Using FreeNoise
-
-[FreeNoise: Tuning-Free Longer Video Diffusion via Noise Rescheduling](https://arxiv.org/abs/2310.15169) by Haonan Qiu, Menghan Xia, Yong Zhang, Yingqing He, Xintao Wang, Ying Shan, Ziwei Liu.
-
-FreeNoise is a sampling mechanism that can generate longer videos with short-video generation models by employing noise-rescheduling, temporal attention over sliding windows, and weighted averaging of latent frames. It also can be used with multiple prompts to allow for interpolated video generations. More details are available in the paper.
-
-The currently supported AnimateDiff pipelines that can be used with FreeNoise are:
- [`AnimateDiffPipeline`]
- [`AnimateDiffControlNetPipeline`]
- [`AnimateDiffVideoToVideoPipeline`]
- [`AnimateDiffVideoToVideoControlNetPipeline`]
-
-In order to use FreeNoise, a single line needs to be added to the inference code after loading your pipelines.
-
-```diff
-+ pipe.enable_free_noise()
-```
-
-After this, either a single prompt could be used, or multiple prompts can be passed as a dictionary of integer-string pairs. The integer keys of the dictionary correspond to the frame index at which the influence of that prompt would be maximum. Each frame index should map to a single string prompt. The prompts for intermediate frame indices, that are not passed in the dictionary, are created by interpolating between the frame prompts that are passed. By default, simple linear interpolation is used. However, you can customize this behaviour with a callback to the `prompt_interpolation_callback` parameter when enabling FreeNoise.
-
-Full example:
-
-```python
-import torch
-from diffusers import AutoencoderKL, AnimateDiffPipeline, LCMScheduler, MotionAdapter
-from diffusers.utils import export_to_video, load_image
-
-# Load pipeline
-dtype = torch.float16
-motion_adapter = MotionAdapter.from_pretrained("wangfuyun/AnimateLCM", torch_dtype=dtype)
-vae = AutoencoderKL.from_pretrained("stabilityai/sd-vae-ft-mse", torch_dtype=dtype)
-
-pipe = AnimateDiffPipeline.from_pretrained("emilianJR/epiCRealism", motion_adapter=motion_adapter, vae=vae, torch_dtype=dtype)
-pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config, beta_schedule="linear")
-
-pipe.load_lora_weights(
-    "wangfuyun/AnimateLCM", weight_name="AnimateLCM_sd15_t2v_lora.safetensors", adapter_name="lcm_lora"
-)
-pipe.set_adapters(["lcm_lora"], [0.8])
-
-# Enable FreeNoise for long prompt generation
-pipe.enable_free_noise(context_length=16, context_stride=4)
-pipe.to("cuda")
-
-# Can be a single prompt, or a dictionary with frame timesteps
-prompt = {
-    0: "A caterpillar on a leaf, high quality, photorealistic",
-    40: "A caterpillar transforming into a cocoon, on a leaf, near flowers, photorealistic",
-    80: "A cocoon on a leaf, flowers in the background, photorealistic",
-    120: "A cocoon maturing and a butterfly being born, flowers and leaves visible in the background, photorealistic",
-    160: "A beautiful butterfly, vibrant colors, sitting on a leaf, flowers in the background, photorealistic",
-    200: "A beautiful butterfly, flying away in a forest, photorealistic",
-    240: "A cyberpunk butterfly, neon lights, glowing",
-}
-negative_prompt = "bad quality, worst quality, jpeg artifacts"
-
-# Run inference
-output = pipe(
-    prompt=prompt,
-    negative_prompt=negative_prompt,
-    num_frames=256,
-    guidance_scale=2.5,
-    num_inference_steps=10,
-    generator=torch.Generator("cpu").manual_seed(0),
-)
-
-# Save video
-frames = output.frames[0]
-export_to_video(frames, "output.mp4", fps=16)
-```
-
-### FreeNoise memory savings
-
-Since FreeNoise processes multiple frames together, there are parts in the modeling where the memory required exceeds that available on normal consumer GPUs. The main memory bottlenecks that we identified are spatial and temporal attention blocks, upsampling and downsampling blocks, resnet blocks and feed-forward layers. Since most of these blocks operate effectively only on the channel/embedding dimension, one can perform chunked inference across the batch dimensions. The batch dimensions in AnimateDiff are either spatial (`[B x F, H x W, C]`) or temporal (`[B x H x W, F, C]`) in nature (note that it may seem counter-intuitive, but the batch dimensions here are correct, because spatial blocks process across the `B x F` dimension while the temporal blocks process across the `B x H x W` dimension). We introduce a `SplitInferenceModule` that makes it easier to chunk across any dimension and perform inference. This saves a lot of memory but comes at the cost of requiring more time for inference.
-
-```diff
-# Load pipeline and adapters
-# ...
-+ pipe.enable_free_noise_split_inference()
-+ pipe.unet.enable_forward_chunking(16)
-```
-
-The call to `pipe.enable_free_noise_split_inference` method accepts two parameters: `spatial_split_size` (defaults to `256`) and `temporal_split_size` (defaults to `16`). These can be configured based on how much VRAM you have available. A lower split size results in lower memory usage but slower inference, whereas a larger split size results in faster inference at the cost of more memory.

 ## Using `from_single_file` with the MotionAdapter

@@ -1041,12 +866,6 @@ pipe = AnimateDiffPipeline.from_pretrained("emilianJR/epiCRealism", motion_adapt
  - all
  - __call__

-## AnimateDiffVideoToVideoControlNetPipeline
-
-[[autodoc]] AnimateDiffVideoToVideoControlNetPipeline
-  - all
-  - __call__
-
 ## AnimateDiffPipelineOutput

 [[autodoc]] pipelines.animatediff.AnimateDiffPipelineOutput
--- a/docs/source/en/api/pipelines/cogvideox.md
+++ b/docs/source/en/api/pipelines/cogvideox.md
@@ -10,16 +10,18 @@
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
-# limitations under the License.
+# limitations under the License. 
+
+## TODO: The paper is still being written.
 -->

 # CogVideoX

-[CogVideoX: Text-to-Video Diffusion Models with An Expert Transformer](https://arxiv.org/abs/2408.06072) from Tsinghua University & ZhipuAI, by Zhuoyi Yang, Jiayan Teng, Wendi Zheng, Ming Ding, Shiyu Huang, Jiazheng Xu, Yuanming Yang, Wenyi Hong, Xiaohan Zhang, Guanyu Feng, Da Yin, Xiaotao Gu, Yuxuan Zhang, Weihan Wang, Yean Cheng, Ting Liu, Bin Xu, Yuxiao Dong, Jie Tang.
+[TODO]() from Tsinghua University & ZhipuAI.

 The abstract from the paper is:

-*We introduce CogVideoX, a large-scale diffusion transformer model designed for generating videos based on text prompts. To efficently model video data, we propose to levearge a 3D Variational Autoencoder (VAE) to compresses videos along both spatial and temporal dimensions. To improve the text-video alignment, we propose an expert transformer with the expert adaptive LayerNorm to facilitate the deep fusion between the two modalities. By employing a progressive training technique, CogVideoX is adept at producing coherent, long-duration videos characterized by significant motion. In addition, we develop an effectively text-video data processing pipeline that includes various data preprocessing strategies and a video captioning method. It significantly helps enhance the performance of CogVideoX, improving both generation quality and semantic alignment. Results show that CogVideoX demonstrates state-of-the-art performance across both multiple machine metrics and human evaluations. The model weight of CogVideoX-2B is publicly available at https://github.com/THUDM/CogVideo.*
+The paper is still being written.

 <Tip>

@@ -27,20 +29,7 @@ Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers.m

 </Tip>

-This pipeline was contributed by [zRzRzRzRzRzRzR](https://github.com/zRzRzRzRzRzRzR). The original codebase can be found [here](https://huggingface.co/THUDM). The original weights can be found under [hf.co/THUDM](https://huggingface.co/THUDM).
-
-There are two models available that can be used with the text-to-video and video-to-video CogVideoX pipelines:
- [`THUDM/CogVideoX-2b`](https://huggingface.co/THUDM/CogVideoX-2b): The recommended dtype for running this model is `fp16`.
- [`THUDM/CogVideoX-5b`](https://huggingface.co/THUDM/CogVideoX-5b): The recommended dtype for running this model is `bf16`.
-
-There is one model available that can be used with the image-to-video CogVideoX pipeline:
- [`THUDM/CogVideoX-5b-I2V`](https://huggingface.co/THUDM/CogVideoX-5b-I2V): The recommended dtype for running this model is `bf16`.
-
-There are two models that support pose controllable generation (by the [Alibaba-PAI](https://huggingface.co/alibaba-pai) team):
- [`alibaba-pai/CogVideoX-Fun-V1.1-2b-Pose`](https://huggingface.co/alibaba-pai/CogVideoX-Fun-V1.1-2b-Pose): The recommended dtype for running this model is `bf16`.
- [`alibaba-pai/CogVideoX-Fun-V1.1-5b-Pose`](https://huggingface.co/alibaba-pai/CogVideoX-Fun-V1.1-5b-Pose): The recommended dtype for running this model is `bf16`.
-
-## Inference
+### Inference

 Use [`torch.compile`](https://huggingface.co/docs/diffusers/main/en/tutorials/fast_diffusion#torchcompile) to reduce the inference latency.

@@ -48,86 +37,43 @@ First, load the pipeline:

 ```python
 import torch
-from diffusers import CogVideoXPipeline, CogVideoXImageToVideoPipeline
-from diffusers.utils import export_to_video,load_image
-pipe = CogVideoXPipeline.from_pretrained("THUDM/CogVideoX-5b").to("cuda") # or "THUDM/CogVideoX-2b" 
+from diffusers import CogVideoXPipeline
+
+pipeline = CogVideoXPipeline.from_pretrained(
+    "THUDM/CogVideoX-2b", torch_dtype=torch.float16
+).to("cuda")
 ```

-If you are using the image-to-video pipeline, load it as follows:
+Then change the memory layout of the pipeline's `transformer` and `vae` components to `torch.channels_last`:

 ```python
-pipe = CogVideoXImageToVideoPipeline.from_pretrained("THUDM/CogVideoX-5b-I2V").to("cuda")
+pipeline.transformer.to(memory_format=torch.channels_last)
+pipeline.vae.to(memory_format=torch.channels_last)
 ```

-Then change the memory layout of the pipelines `transformer` component to `torch.channels_last`:
+Finally, compile the components and run inference:

 ```python
-pipe.transformer.to(memory_format=torch.channels_last)
-```
+pipeline.transformer = torch.compile(pipeline.transformer)
+pipeline.vae.decode = torch.compile(pipeline.vae.decode)

-Compile the components and run inference:
-
-```python
-pipe.transformer = torch.compile(pipeline.transformer, mode="max-autotune", fullgraph=True)
-
-# CogVideoX works well with long and well-described prompts
+# CogVideoX works very well with long and well-described prompts
 prompt = "A panda, dressed in a small, red jacket and a tiny hat, sits on a wooden stool in a serene bamboo forest. The panda's fluffy paws strum a miniature acoustic guitar, producing soft, melodic tunes. Nearby, a few other pandas gather, watching curiously and some clapping in rhythm. Sunlight filters through the tall bamboo, casting a gentle glow on the scene. The panda's face is expressive, showing concentration and joy as it plays. The background includes a small, flowing stream and vibrant green foliage, enhancing the peaceful and magical atmosphere of this unique musical performance."
-video = pipe(prompt=prompt, guidance_scale=6, num_inference_steps=50).frames[0]
+video = pipeline(prompt=prompt, guidance_scale=6, num_inference_steps=50).frames[0]
 ```

-The [T2V benchmark](https://gist.github.com/a-r-r-o-w/5183d75e452a368fd17448fcc810bd3f) results on an 80GB A100 machine are:
+The [benchmark](TODO: link) results on an 80GB A100 machine are:

 ```
-Without torch.compile(): Average inference time: 96.89 seconds.
-With torch.compile(): Average inference time: 76.27 seconds.
+Without torch.compile(): Average inference time: TODO seconds.
+With torch.compile(): Average inference time: TODO seconds.
 ```

-### Memory optimization
-
-CogVideoX-2b requires about 19 GB of GPU memory to decode 49 frames (6 seconds of video at 8 FPS) with output resolution 720x480 (W x H), which makes it not possible to run on consumer GPUs or free-tier T4 Colab. The following memory optimizations could be used to reduce the memory footprint. For replication, you can refer to [this](https://gist.github.com/a-r-r-o-w/3959a03f15be5c9bd1fe545b09dfcc93) script.
-
- `pipe.enable_model_cpu_offload()`:
-  - Without enabling cpu offloading, memory usage is `33 GB`
-  - With enabling cpu offloading, memory usage is `19 GB`
- `pipe.enable_sequential_cpu_offload()`:
-  - Similar to `enable_model_cpu_offload` but can significantly reduce memory usage at the cost of slow inference
-  - When enabled, memory usage is under `4 GB`
- `pipe.vae.enable_tiling()`:
-  - With enabling cpu offloading and tiling, memory usage is `11 GB`
- `pipe.vae.enable_slicing()`
-
-### Quantized inference
-
-[torchao](https://github.com/pytorch/ao) and [optimum-quanto](https://github.com/huggingface/optimum-quanto/) can be used to quantize the text encoder, transformer and VAE modules to lower the memory requirements. This makes it possible to run the model on a free-tier T4 Colab or lower VRAM GPUs!
-
-It is also worth noting that torchao quantization is fully compatible with [torch.compile](/optimization/torch2.0#torchcompile), which allows for much faster inference speed. Additionally, models can be serialized and stored in a quantized datatype to save disk space with torchao. Find examples and benchmarks in the gists below.
- [torchao](https://gist.github.com/a-r-r-o-w/4d9732d17412888c885480c6521a9897)
- [quanto](https://gist.github.com/a-r-r-o-w/31be62828b00a9292821b85c1017effa)
-
 ## CogVideoXPipeline

 [[autodoc]] CogVideoXPipeline
  - all
  - __call__

-## CogVideoXImageToVideoPipeline
-
-[[autodoc]] CogVideoXImageToVideoPipeline
-  - all
-  - __call__
-
-## CogVideoXVideoToVideoPipeline
-
-[[autodoc]] CogVideoXVideoToVideoPipeline
-  - all
-  - __call__
-
-## CogVideoXFunControlPipeline
-
-[[autodoc]] CogVideoXFunControlPipeline
-  - all
-  - __call__
-
 ## CogVideoXPipelineOutput
-
-[[autodoc]] pipelines.cogvideo.pipeline_output.CogVideoXPipelineOutput
+[[autodoc]] pipelines.cogvideo.pipeline_output.CogVideoXPipelineOutput
--- a/docs/source/en/api/pipelines/cogview3.md
+++ b/docs/source/en/api/pipelines/cogview3.md
@@ -1,40 +0,0 @@
-<!--Copyright 2024 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-->
-
-# CogView3Plus
-
-[CogView3: Finer and Faster Text-to-Image Generation via Relay Diffusion](https://huggingface.co/papers/2403.05121) from Tsinghua University & ZhipuAI, by Wendi Zheng, Jiayan Teng, Zhuoyi Yang, Weihan Wang, Jidong Chen, Xiaotao Gu, Yuxiao Dong, Ming Ding, Jie Tang.
-
-The abstract from the paper is:
-
-*Recent advancements in text-to-image generative systems have been largely driven by diffusion models. However, single-stage text-to-image diffusion models still face challenges, in terms of computational efficiency and the refinement of image details. To tackle the issue, we propose CogView3, an innovative cascaded framework that enhances the performance of text-to-image diffusion. CogView3 is the first model implementing relay diffusion in the realm of text-to-image generation, executing the task by first creating low-resolution images and subsequently applying relay-based super-resolution. This methodology not only results in competitive text-to-image outputs but also greatly reduces both training and inference costs. Our experimental results demonstrate that CogView3 outperforms SDXL, the current state-of-the-art open-source text-to-image diffusion model, by 77.0% in human evaluations, all while requiring only about 1/2 of the inference time. The distilled variant of CogView3 achieves comparable performance while only utilizing 1/10 of the inference time by SDXL.*
-
-<Tip>
-
-Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers.md) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading.md#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines.
-
-</Tip>
-
-This pipeline was contributed by [zRzRzRzRzRzRzR](https://github.com/zRzRzRzRzRzRzR). The original codebase can be found [here](https://huggingface.co/THUDM). The original weights can be found under [hf.co/THUDM](https://huggingface.co/THUDM).
-
-## CogView3PlusPipeline
-
-[[autodoc]] CogView3PlusPipeline
-  - all
-  - __call__
-
-## CogView3PipelineOutput
-
-[[autodoc]] pipelines.cogview3.pipeline_output.CogView3PipelineOutput
--- a/docs/source/en/api/pipelines/controlnet_flux.md
+++ b/docs/source/en/api/pipelines/controlnet_flux.md
@@ -1,56 +0,0 @@
-<!--Copyright 2024 The HuggingFace Team, The InstantX Team, and the XLabs Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-->
-
-# ControlNet with Flux.1
-
-FluxControlNetPipeline is an implementation of ControlNet for Flux.1.
-
-ControlNet was introduced in [Adding Conditional Control to Text-to-Image Diffusion Models](https://huggingface.co/papers/2302.05543) by Lvmin Zhang, Anyi Rao, and Maneesh Agrawala.
-
-With a ControlNet model, you can provide an additional control image to condition and control Stable Diffusion generation. For example, if you provide a depth map, the ControlNet model generates an image that'll preserve the spatial information from the depth map. It is a more flexible and accurate way to control the image generation process.
-
-The abstract from the paper is:
-
-*We present ControlNet, a neural network architecture to add spatial conditioning controls to large, pretrained text-to-image diffusion models. ControlNet locks the production-ready large diffusion models, and reuses their deep and robust encoding layers pretrained with billions of images as a strong backbone to learn a diverse set of conditional controls. The neural architecture is connected with "zero convolutions" (zero-initialized convolution layers) that progressively grow the parameters from zero and ensure that no harmful noise could affect the finetuning. We test various conditioning controls, eg, edges, depth, segmentation, human pose, etc, with Stable Diffusion, using single or multiple conditions, with or without prompts. We show that the training of ControlNets is robust with small (<50k) and large (>1m) datasets. Extensive results show that ControlNet may facilitate wider applications to control image diffusion models.*
-
-This controlnet code is implemented by [The InstantX Team](https://huggingface.co/InstantX). You can find pre-trained checkpoints for Flux-ControlNet in the table below:
-
-
-| ControlNet type | Developer | Link |
-| -------- | ---------- | ---- |
-| Canny | [The InstantX Team](https://huggingface.co/InstantX) | [Link](https://huggingface.co/InstantX/FLUX.1-dev-Controlnet-Canny) |
-| Depth | [The InstantX Team](https://huggingface.co/InstantX) | [Link](https://huggingface.co/Shakker-Labs/FLUX.1-dev-ControlNet-Depth) |
-| Union | [The InstantX Team](https://huggingface.co/InstantX) | [Link](https://huggingface.co/InstantX/FLUX.1-dev-Controlnet-Union) |
-
-XLabs ControlNets are also supported, which was contributed by the [XLabs team](https://huggingface.co/XLabs-AI).
-
-| ControlNet type | Developer | Link |
-| -------- | ---------- | ---- |
-| Canny | [The XLabs Team](https://huggingface.co/XLabs-AI) | [Link](https://huggingface.co/XLabs-AI/flux-controlnet-canny-diffusers) |
-| Depth | [The XLabs Team](https://huggingface.co/XLabs-AI) | [Link](https://huggingface.co/XLabs-AI/flux-controlnet-depth-diffusers) |
-| HED | [The XLabs Team](https://huggingface.co/XLabs-AI) | [Link](https://huggingface.co/XLabs-AI/flux-controlnet-hed-diffusers) |
-
-
-<Tip>
-
-Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-components-across-pipelines) section to learn how to efficiently load the same components into multiple pipelines.
-
-</Tip>
-
-## FluxControlNetPipeline
-[[autodoc]] FluxControlNetPipeline
-	- all
-	- __call__
-
-
-## FluxPipelineOutput
-[[autodoc]] pipelines.flux.pipeline_output.FluxPipelineOutput
--- a/docs/source/en/api/pipelines/controlnet_sd3.md
+++ b/docs/source/en/api/pipelines/controlnet_sd3.md
@@ -1,4 +1,4 @@
-<!--Copyright 2024 The HuggingFace Team. All rights reserved.
+<!--Copyright 2023 The HuggingFace Team and The InstantX Team. All rights reserved.

 Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 the License. You may obtain a copy of the License at
@@ -22,16 +22,7 @@ The abstract from the paper is:

 *We present ControlNet, a neural network architecture to add spatial conditioning controls to large, pretrained text-to-image diffusion models. ControlNet locks the production-ready large diffusion models, and reuses their deep and robust encoding layers pretrained with billions of images as a strong backbone to learn a diverse set of conditional controls. The neural architecture is connected with "zero convolutions" (zero-initialized convolution layers) that progressively grow the parameters from zero and ensure that no harmful noise could affect the finetuning. We test various conditioning controls, eg, edges, depth, segmentation, human pose, etc, with Stable Diffusion, using single or multiple conditions, with or without prompts. We show that the training of ControlNets is robust with small (<50k) and large (>1m) datasets. Extensive results show that ControlNet may facilitate wider applications to control image diffusion models.*

-This controlnet code is mainly implemented by [The InstantX Team](https://huggingface.co/InstantX). The inpainting-related code was developed by [The Alimama Creative Team](https://huggingface.co/alimama-creative). You can find pre-trained checkpoints for SD3-ControlNet in the table below:
-
-
-| ControlNet type | Developer | Link |
-| -------- | ---------- | ---- |
-| Canny | [The InstantX Team](https://huggingface.co/InstantX) | [Link](https://huggingface.co/InstantX/SD3-Controlnet-Canny) |
-| Pose | [The InstantX Team](https://huggingface.co/InstantX) | [Link](https://huggingface.co/InstantX/SD3-Controlnet-Pose) |
-| Tile | [The InstantX Team](https://huggingface.co/InstantX) | [Link](https://huggingface.co/InstantX/SD3-Controlnet-Tile) |
-| Inpainting | [The AlimamaCreative Team](https://huggingface.co/alimama-creative) | [link](https://huggingface.co/alimama-creative/SD3-Controlnet-Inpainting) |
-
+This code is implemented by [The InstantX Team](https://huggingface.co/InstantX). You can find pre-trained checkpoints for SD3-ControlNet on [The InstantX Team](https://huggingface.co/InstantX) Hub profile.

 <Tip>

@@ -44,10 +35,5 @@ Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers)
 	- all
 	- __call__

-## StableDiffusion3ControlNetInpaintingPipeline
-[[autodoc]] pipelines.controlnet_sd3.pipeline_stable_diffusion_3_controlnet_inpainting.StableDiffusion3ControlNetInpaintingPipeline
-	- all
-	- __call__
-
 ## StableDiffusion3PipelineOutput
 [[autodoc]] pipelines.stable_diffusion_3.pipeline_output.StableDiffusion3PipelineOutput
--- a/docs/source/en/api/pipelines/flux.md
+++ b/docs/source/en/api/pipelines/flux.md
@@ -37,7 +37,7 @@ Both checkpoints have slightly difference usage which we detail below.

 ```python
 import torch
-from diffusers import FluxPipeline
+from diffusers import  FluxPipeline

 pipe = FluxPipeline.from_pretrained("black-forest-labs/FLUX.1-schnell", torch_dtype=torch.bfloat16)
 pipe.enable_model_cpu_offload()
@@ -61,7 +61,7 @@ out.save("image.png")

 ```python
 import torch
-from diffusers import FluxPipeline
+from diffusers import  FluxPipeline

 pipe = FluxPipeline.from_pretrained("black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16)
 pipe.enable_model_cpu_offload()
@@ -77,114 +77,8 @@ out = pipe(
 out.save("image.png")
 ```

-## Running FP16 inference
-Flux can generate high-quality images with FP16 (i.e. to accelerate inference on Turing/Volta GPUs) but produces different outputs compared to FP32/BF16. The issue is that some activations in the text encoders have to be clipped when running in FP16, which affects the overall image. Forcing text encoders to run with FP32 inference thus removes this output difference. See [here](https://github.com/huggingface/diffusers/pull/9097#issuecomment-2272292516) for details.
-
-FP16 inference code:
-```python
-import torch
-from diffusers import FluxPipeline
-
-pipe = FluxPipeline.from_pretrained("black-forest-labs/FLUX.1-schnell", torch_dtype=torch.bfloat16) # can replace schnell with dev
-# to run on low vram GPUs (i.e. between 4 and 32 GB VRAM)
-pipe.enable_sequential_cpu_offload()
-pipe.vae.enable_slicing()
-pipe.vae.enable_tiling()
-
-pipe.to(torch.float16) # casting here instead of in the pipeline constructor because doing so in the constructor loads all models into CPU memory at once
-
-prompt = "A cat holding a sign that says hello world"
-out = pipe(
-    prompt=prompt,
-    guidance_scale=0.,
-    height=768,
-    width=1360,
-    num_inference_steps=4,
-    max_sequence_length=256,
-).images[0]
-out.save("image.png")
-```
-
-## Single File Loading for the `FluxTransformer2DModel`
-
-The `FluxTransformer2DModel` supports loading checkpoints in the original format shipped by Black Forest Labs. This is also useful when trying to load finetunes or quantized versions of the models that have been published by the community.
-
-<Tip>
-`FP8` inference can be brittle depending on the GPU type, CUDA version, and `torch` version that you are using. It is recommended that you use the `optimum-quanto` library in order to run FP8 inference on your machine.
-</Tip>
-
-The following example demonstrates how to run Flux with less than 16GB of VRAM.
-
-First install `optimum-quanto`
-
-```shell
-pip install optimum-quanto
-```
-
-Then run the following example
-
-```python
-import torch
-from diffusers import FluxTransformer2DModel, FluxPipeline
-from transformers import T5EncoderModel, CLIPTextModel
-from optimum.quanto import freeze, qfloat8, quantize
-
-bfl_repo = "black-forest-labs/FLUX.1-dev"
-dtype = torch.bfloat16
-
-transformer = FluxTransformer2DModel.from_single_file("https://huggingface.co/Kijai/flux-fp8/blob/main/flux1-dev-fp8.safetensors", torch_dtype=dtype)
-quantize(transformer, weights=qfloat8)
-freeze(transformer)
-
-text_encoder_2 = T5EncoderModel.from_pretrained(bfl_repo, subfolder="text_encoder_2", torch_dtype=dtype)
-quantize(text_encoder_2, weights=qfloat8)
-freeze(text_encoder_2)
-
-pipe = FluxPipeline.from_pretrained(bfl_repo, transformer=None, text_encoder_2=None, torch_dtype=dtype)
-pipe.transformer = transformer
-pipe.text_encoder_2 = text_encoder_2
-
-pipe.enable_model_cpu_offload()
-
-prompt = "A cat holding a sign that says hello world"
-image = pipe(
-    prompt,
-    guidance_scale=3.5,
-    output_type="pil",
-    num_inference_steps=20,
-    generator=torch.Generator("cpu").manual_seed(0)
-).images[0]
-
-image.save("flux-fp8-dev.png")
-```
-
 ## FluxPipeline

 [[autodoc]] FluxPipeline
 	- all
-	- __call__
-
-## FluxImg2ImgPipeline
-
-[[autodoc]] FluxImg2ImgPipeline
-	- all
-	- __call__
-
-## FluxInpaintPipeline
-
-[[autodoc]] FluxInpaintPipeline
-	- all
-	- __call__
-
-
-## FluxControlNetInpaintPipeline
-
-[[autodoc]] FluxControlNetInpaintPipeline
-	- all
-	- __call__
-
-## FluxControlNetImg2ImgPipeline
-
-[[autodoc]] FluxControlNetImg2ImgPipeline
-	- all
-	- __call__
+	- __call__
--- a/docs/source/en/api/pipelines/kolors.md
+++ b/docs/source/en/api/pipelines/kolors.md
@@ -14,7 +14,7 @@ specific language governing permissions and limitations under the License.

 ![](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/kolors/kolors_header_collage.png)

-Kolors is a large-scale text-to-image generation model based on latent diffusion, developed by [the Kuaishou Kolors team](https://github.com/Kwai-Kolors/Kolors). Trained on billions of text-image pairs, Kolors exhibits significant advantages over both open-source and closed-source models in visual quality, complex semantic accuracy, and text rendering for both Chinese and English characters. Furthermore, Kolors supports both Chinese and English inputs, demonstrating strong performance in understanding and generating Chinese-specific content. For more details, please refer to this [technical report](https://github.com/Kwai-Kolors/Kolors/blob/master/imgs/Kolors_paper.pdf).
+Kolors is a large-scale text-to-image generation model based on latent diffusion, developed by [the Kuaishou Kolors team](mailto:kwai-kolors@kuaishou.com). Trained on billions of text-image pairs, Kolors exhibits significant advantages over both open-source and closed-source models in visual quality, complex semantic accuracy, and text rendering for both Chinese and English characters. Furthermore, Kolors supports both Chinese and English inputs, demonstrating strong performance in understanding and generating Chinese-specific content. For more details, please refer to this [technical report](https://github.com/Kwai-Kolors/Kolors/blob/master/imgs/Kolors_paper.pdf).

 The abstract from the technical report is:

@@ -74,7 +74,7 @@ image_encoder = CLIPVisionModelWithProjection.from_pretrained(

 pipe = KolorsPipeline.from_pretrained(
    "Kwai-Kolors/Kolors-diffusers", image_encoder=image_encoder, torch_dtype=torch.float16, variant="fp16"
-)
+).to("cuda")
 pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config, use_karras_sigmas=True)

 pipe.load_ip_adapter(
@@ -105,11 +105,3 @@ image.save("kolors_ipa_sample.png")

 - all
 - __call__
-
-## KolorsImg2ImgPipeline
-
-[[autodoc]] KolorsImg2ImgPipeline
-
- all
- __call__
-
--- a/docs/source/en/api/pipelines/mochi.md
+++ b/docs/source/en/api/pipelines/mochi.md
@@ -1,36 +0,0 @@
-<!-- Copyright 2024 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-->
-
-# Mochi
-
-[Mochi 1 Preview](https://huggingface.co/genmo/mochi-1-preview) from Genmo.
-
-*Mochi 1 preview is an open state-of-the-art video generation model with high-fidelity motion and strong prompt adherence in preliminary evaluation. This model dramatically closes the gap between closed and open video generation systems. The model is released under a permissive Apache 2.0 license.*
-
-<Tip>
-
-Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers.md) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading.md#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines.
-
-</Tip>
-
-## MochiPipeline
-
-[[autodoc]] MochiPipeline
-  - all
-  - __call__
-
-## MochiPipelineOutput
-
-[[autodoc]] pipelines.mochi.pipeline_output.MochiPipelineOutput
--- a/docs/source/en/api/pipelines/overview.md
+++ b/docs/source/en/api/pipelines/overview.md
@@ -30,64 +30,63 @@ The table below lists all the pipelines currently available in 🤗 Diffusers an

 | Pipeline | Tasks |
 |---|---|
-| [aMUSEd](amused) | text2image |
+| [AltDiffusion](alt_diffusion) | image2image |
 | [AnimateDiff](animatediff) | text2video |
 | [Attend-and-Excite](attend_and_excite) | text2image |
+| [Audio Diffusion](audio_diffusion) | image2audio |
 | [AudioLDM](audioldm) | text2audio |
 | [AudioLDM2](audioldm2) | text2audio |
-| [AuraFlow](auraflow) | text2image |
 | [BLIP Diffusion](blip_diffusion) | text2image |
-| [CogVideoX](cogvideox) | text2video |
 | [Consistency Models](consistency_models) | unconditional image generation |
 | [ControlNet](controlnet) | text2image, image2image, inpainting |
-| [ControlNet with Flux.1](controlnet_flux) | text2image |
-| [ControlNet with Hunyuan-DiT](controlnet_hunyuandit) | text2image |
-| [ControlNet with Stable Diffusion 3](controlnet_sd3) | text2image |
 | [ControlNet with Stable Diffusion XL](controlnet_sdxl) | text2image |
 | [ControlNet-XS](controlnetxs) | text2image |
 | [ControlNet-XS with Stable Diffusion XL](controlnetxs_sdxl) | text2image |
+| [Cycle Diffusion](cycle_diffusion) | image2image |
 | [Dance Diffusion](dance_diffusion) | unconditional audio generation |
 | [DDIM](ddim) | unconditional image generation |
 | [DDPM](ddpm) | unconditional image generation |
 | [DeepFloyd IF](deepfloyd_if) | text2image, image2image, inpainting, super-resolution |
 | [DiffEdit](diffedit) | inpainting |
 | [DiT](dit) | text2image |
-| [Flux](flux) | text2image |
-| [Hunyuan-DiT](hunyuandit) | text2image |
-| [I2VGen-XL](i2vgenxl) | text2video |
+| [GLIGEN](stable_diffusion/gligen) | text2image |
 | [InstructPix2Pix](pix2pix) | image editing |
 | [Kandinsky 2.1](kandinsky) | text2image, image2image, inpainting, interpolation |
 | [Kandinsky 2.2](kandinsky_v22) | text2image, image2image, inpainting |
 | [Kandinsky 3](kandinsky3) | text2image, image2image |
-| [Kolors](kolors) | text2image |
 | [Latent Consistency Models](latent_consistency_models) | text2image |
 | [Latent Diffusion](latent_diffusion) | text2image, super-resolution |
-| [Latte](latte) | text2image |
+| [LDM3D](stable_diffusion/ldm3d_diffusion) | text2image, text-to-3D, text-to-pano, upscaling |
 | [LEDITS++](ledits_pp) | image editing |
-| [Lumina-T2X](lumina) | text2image |
-| [Marigold](marigold) | depth |
 | [MultiDiffusion](panorama) | text2image |
 | [MusicLDM](musicldm) | text2audio |
-| [PAG](pag) | text2image |
 | [Paint by Example](paint_by_example) | inpainting |
-| [PIA](pia) | image2video |
+| [ParaDiGMS](paradigms) | text2image |
+| [Pix2Pix Zero](pix2pix_zero) | image editing |
 | [PixArt-α](pixart) | text2image |
-| [PixArt-Σ](pixart_sigma) | text2image |
+| [PNDM](pndm) | unconditional image generation |
+| [RePaint](repaint) | inpainting |
+| [Score SDE VE](score_sde_ve) | unconditional image generation |
 | [Self-Attention Guidance](self_attention_guidance) | text2image |
 | [Semantic Guidance](semantic_stable_diffusion) | text2image |
 | [Shap-E](shap_e) | text-to-3D, image-to-3D |
+| [Spectrogram Diffusion](spectrogram_diffusion) |  |
 | [Stable Audio](stable_audio) | text2audio |
-| [Stable Cascade](stable_cascade) | text2image |
 | [Stable Diffusion](stable_diffusion/overview) | text2image, image2image, depth2image, inpainting, image variation, latent upscaler, super-resolution |
+| [Stable Diffusion Model Editing](model_editing) | model editing |
 | [Stable Diffusion XL](stable_diffusion/stable_diffusion_xl) | text2image, image2image, inpainting |
 | [Stable Diffusion XL Turbo](stable_diffusion/sdxl_turbo) | text2image, image2image, inpainting |
 | [Stable unCLIP](stable_unclip) | text2image, image variation |
+| [Stochastic Karras VE](stochastic_karras_ve) | unconditional image generation |
 | [T2I-Adapter](stable_diffusion/adapter) | text2image |
 | [Text2Video](text_to_video) | text2video, video2video |
 | [Text2Video-Zero](text_to_video_zero) | text2video |
 | [unCLIP](unclip) | text2image, image variation |
+| [Unconditional Latent Diffusion](latent_diffusion_uncond) | unconditional image generation |
 | [UniDiffuser](unidiffuser) | text2image, image2text, image variation, text variation, unconditional image generation, unconditional audio generation |
 | [Value-guided planning](value_guided_sampling) | value guided sampling |
+| [Versatile Diffusion](versatile_diffusion) | text2image, image variation |
+| [VQ Diffusion](vq_diffusion) | text2image |
 | [Wuerstchen](wuerstchen) | text2image |

 ## DiffusionPipeline
--- a/docs/source/en/api/pipelines/pag.md
+++ b/docs/source/en/api/pipelines/pag.md
@@ -20,7 +20,7 @@ The abstract from the paper is:

 *Recent studies have demonstrated that diffusion models are capable of generating high-quality samples, but their quality heavily depends on sampling guidance techniques, such as classifier guidance (CG) and classifier-free guidance (CFG). These techniques are often not applicable in unconditional generation or in various downstream tasks such as image restoration. In this paper, we propose a novel sampling guidance, called Perturbed-Attention Guidance (PAG), which improves diffusion sample quality across both unconditional and conditional settings, achieving this without requiring additional training or the integration of external modules. PAG is designed to progressively enhance the structure of samples throughout the denoising process. It involves generating intermediate samples with degraded structure by substituting selected self-attention maps in diffusion U-Net with an identity matrix, by considering the self-attention mechanisms' ability to capture structural information, and guiding the denoising process away from these degraded samples. In both ADM and Stable Diffusion, PAG surprisingly improves sample quality in conditional and even unconditional scenarios. Moreover, PAG significantly improves the baseline performance in various downstream tasks where existing guidances such as CG or CFG cannot be fully utilized, including ControlNet with empty prompts and image restoration such as inpainting and deblurring.*

-PAG can be used by specifying the `pag_applied_layers` as a parameter when instantiating a PAG pipeline. It can be a single string or a list of strings. Each string can be a unique layer identifier or a regular expression to identify one or more layers.
+PAG can be used by specifying the `pag_applied_layers` as a parameter when instantiating a PAG pipeline. It can be a single string or a list of strings. Each string can be a unique layer identifier or a regular expression to identify one or more layers.

 - Full identifier as a normal string: `down_blocks.2.attentions.0.transformer_blocks.0.attn1.processor`
 - Full identifier as a RegEx: `down_blocks.2.(attentions|motion_modules).0.transformer_blocks.0.attn1.processor`
@@ -43,26 +43,13 @@ Since RegEx is supported as a way for matching layer identifiers, it is crucial
  - all
  - __call__

-## KolorsPAGPipeline
-[[autodoc]] KolorsPAGPipeline
-  - all
-  - __call__
-
 ## StableDiffusionPAGPipeline
 [[autodoc]] StableDiffusionPAGPipeline
 	- all
 	- __call__

-## StableDiffusionPAGImg2ImgPipeline
-[[autodoc]] StableDiffusionPAGImg2ImgPipeline
-	- all
-	- __call__
-
 ## StableDiffusionControlNetPAGPipeline
 [[autodoc]] StableDiffusionControlNetPAGPipeline
-
-## StableDiffusionControlNetPAGInpaintPipeline
-[[autodoc]] StableDiffusionControlNetPAGInpaintPipeline
 	- all
 	- __call__

@@ -86,16 +73,6 @@ Since RegEx is supported as a way for matching layer identifiers, it is crucial
 	- all
 	- __call__

-## StableDiffusionXLControlNetPAGImg2ImgPipeline
-[[autodoc]] StableDiffusionXLControlNetPAGImg2ImgPipeline
-	- all
-	- __call__
-
-## StableDiffusion3PAGPipeline
-[[autodoc]] StableDiffusion3PAGPipeline
-	- all
-	- __call__
-

 ## PixArtSigmaPAGPipeline
 [[autodoc]] PixArtSigmaPAGPipeline
--- a/docs/source/en/api/pipelines/stable_audio.md
+++ b/docs/source/en/api/pipelines/stable_audio.md
@@ -21,7 +21,7 @@ Stable Audio is trained on a corpus of around 48k audio recordings, where around
 The abstract of the paper is the following:
 *Open generative models are vitally important for the community, allowing for fine-tunes and serving as baselines when presenting new models. However, most current text-to-audio models are private and not accessible for artists and researchers to build upon. Here we describe the architecture and training process of a new open-weights text-to-audio model trained with Creative Commons data. Our evaluation shows that the model's performance is competitive with the state-of-the-art across various metrics. Notably, the reported FDopenl3 results (measuring the realism of the generations) showcase its potential for high-quality stereo sound synthesis at 44.1kHz.*

-This pipeline was contributed by [Yoach Lacombe](https://huggingface.co/ylacombe). The original codebase can be found at [Stability-AI/stable-audio-tools](https://github.com/Stability-AI/stable-audio-tools).
+This pipeline was contributed by [Yoach Lacombe](https://huggingface.co/ylacombe). The original codebase can be found at [Stability-AI/stable-audio-tools](https://github.com/Stability-AI/stable-audio-tools).

 ## Tips

--- a/docs/source/en/api/pipelines/stable_diffusion/inpaint.md
+++ b/docs/source/en/api/pipelines/stable_diffusion/inpaint.md
@@ -19,7 +19,7 @@ The Stable Diffusion model can also be applied to inpainting which lets you edit
 It is recommended to use this pipeline with checkpoints that have been specifically fine-tuned for inpainting, such
 as [runwayml/stable-diffusion-inpainting](https://huggingface.co/runwayml/stable-diffusion-inpainting). Default
 text-to-image Stable Diffusion checkpoints, such as
-[stable-diffusion-v1-5/stable-diffusion-v1-5](https://huggingface.co/stable-diffusion-v1-5/stable-diffusion-v1-5) are also compatible but they might be less performant.
+[runwayml/stable-diffusion-v1-5](https://huggingface.co/runwayml/stable-diffusion-v1-5) are also compatible but they might be less performant.

 <Tip>

--- a/docs/source/en/api/pipelines/stable_diffusion/overview.md
+++ b/docs/source/en/api/pipelines/stable_diffusion/overview.md
@@ -203,7 +203,7 @@ from diffusers import StableDiffusionImg2ImgPipeline
 import gradio as gr


-pipe = StableDiffusionImg2ImgPipeline.from_pretrained("stable-diffusion-v1-5/stable-diffusion-v1-5")
+pipe = StableDiffusionImg2ImgPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")

 gr.Interface.from_pipeline(pipe).launch()
 ```
--- a/docs/source/en/api/pipelines/stable_diffusion/stable_diffusion_3.md
+++ b/docs/source/en/api/pipelines/stable_diffusion/stable_diffusion_3.md
@@ -54,11 +54,6 @@ image = pipe(
 image.save("sd3_hello_world.png")
 ```

-**Note:** Stable Diffusion 3.5 can also be run using the SD3 pipeline, and all mentioned optimizations and techniques apply to it as well. In total there are three official models in the SD3 family:
- [`stabilityai/stable-diffusion-3-medium-diffusers`](https://huggingface.co/stabilityai/stable-diffusion-3-medium-diffusers)
- [`stabilityai/stable-diffusion-3.5-large`](https://huggingface.co/stabilityai/stable-diffusion-3-5-large)
- [`stabilityai/stable-diffusion-3.5-large-turbo`](https://huggingface.co/stabilityai/stable-diffusion-3-5-large-turbo)
-
 ## Memory Optimisations for SD3

 SD3 uses three text encoders, one if which is the very large T5-XXL model. This makes it challenging to run the model on GPUs with less than 24GB of VRAM, even when using `fp16` precision. The following section outlines a few memory optimizations in Diffusers that make it easier to run SD3 on low resource hardware.
@@ -313,26 +308,6 @@ image = pipe("a picture of a cat holding a sign that says hello world").images[0
 image.save('sd3-single-file-t5-fp8.png')
 ```

-### Loading the single file checkpoint for the Stable Diffusion 3.5 Transformer Model
-
-```python
-import torch
-from diffusers import SD3Transformer2DModel, StableDiffusion3Pipeline
-
-transformer = SD3Transformer2DModel.from_single_file(
-    "https://huggingface.co/stabilityai/stable-diffusion-3.5-large-turbo/blob/main/sd3.5_large.safetensors",
-    torch_dtype=torch.bfloat16,
-)
-pipe = StableDiffusion3Pipeline.from_pretrained(
-    "stabilityai/stable-diffusion-3.5-large",
-    transformer=transformer,
-    torch_dtype=torch.bfloat16,
-)
-pipe.enable_model_cpu_offload()
-image = pipe("a cat holding a sign that says hello world").images[0]
-image.save("sd35.png")
-```
-
 ## StableDiffusion3Pipeline

 [[autodoc]] StableDiffusion3Pipeline
--- a/docs/source/en/api/pipelines/text_to_video_zero.md
+++ b/docs/source/en/api/pipelines/text_to_video_zero.md
@@ -40,9 +40,8 @@ To generate a video from prompt, run the following Python code:
 ```python
 import torch
 from diffusers import TextToVideoZeroPipeline
-import imageio

-model_id = "stable-diffusion-v1-5/stable-diffusion-v1-5"
+model_id = "runwayml/stable-diffusion-v1-5"
 pipe = TextToVideoZeroPipeline.from_pretrained(model_id, torch_dtype=torch.float16).to("cuda")

 prompt = "A panda is playing guitar on times square"
@@ -64,7 +63,7 @@ import torch
 from diffusers import TextToVideoZeroPipeline
 import numpy as np

-model_id = "stable-diffusion-v1-5/stable-diffusion-v1-5"
+model_id = "runwayml/stable-diffusion-v1-5"
 pipe = TextToVideoZeroPipeline.from_pretrained(model_id, torch_dtype=torch.float16).to("cuda")
 seed = 0
 video_length = 24  #24 ÷ 4fps = 6 seconds
@@ -138,7 +137,7 @@ To generate a video from prompt with additional pose control
    from diffusers import StableDiffusionControlNetPipeline, ControlNetModel
    from diffusers.pipelines.text_to_video_synthesis.pipeline_text_to_video_zero import CrossFrameAttnProcessor

-    model_id = "stable-diffusion-v1-5/stable-diffusion-v1-5"
+    model_id = "runwayml/stable-diffusion-v1-5"
    controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-openpose", torch_dtype=torch.float16)
    pipe = StableDiffusionControlNetPipeline.from_pretrained(
        model_id, controlnet=controlnet, torch_dtype=torch.float16
--- a/docs/source/en/api/quantization.md
+++ b/docs/source/en/api/quantization.md
@@ -1,33 +0,0 @@
-<!--Copyright 2024 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-->
-
-# Quantization
-
-Quantization techniques reduce memory and computational costs by representing weights and activations with lower-precision data types like 8-bit integers (int8). This enables loading larger models you normally wouldn't be able to fit into memory, and speeding up inference. Diffusers supports 8-bit and 4-bit quantization with [bitsandbytes](https://huggingface.co/docs/bitsandbytes/en/index).
-
-Quantization techniques that aren't supported in Transformers can be added with the [`DiffusersQuantizer`] class.
-
-<Tip>
-
-Learn how to quantize models in the [Quantization](../quantization/overview) guide.
-
-</Tip>
-
-
-## BitsAndBytesConfig
-
-[[autodoc]] BitsAndBytesConfig
-
-## DiffusersQuantizer
-
-[[autodoc]] quantizers.base.DiffusersQuantizer
--- a/docs/source/en/api/schedulers/overview.md
+++ b/docs/source/en/api/schedulers/overview.md
@@ -45,15 +45,6 @@ Many schedulers are implemented from the [k-diffusion](https://github.com/crowso
 | N/A                 | [`DEISMultistepScheduler`]          |                                                                                                               |
 | N/A                 | [`UniPCMultistepScheduler`]         |                                                                                                               |

-## Noise schedules and schedule types
-| A1111/k-diffusion        | 🤗 Diffusers                                                               |
-|--------------------------|----------------------------------------------------------------------------|
-| Karras                   | init with `use_karras_sigmas=True`                                         |
-| sgm_uniform              | init with `timestep_spacing="trailing"`                                    |
-| simple                   | init with `timestep_spacing="trailing"`                                    |
-| exponential              | init with `timestep_spacing="linspace"`, `use_exponential_sigmas=True`     |
-| beta                     | init with `timestep_spacing="linspace"`, `use_beta_sigmas=True`            |
-
 All schedulers are built from the base [`SchedulerMixin`] class which implements low level utilities shared by all schedulers.

 ## SchedulerMixin
--- a/docs/source/en/community_projects.md
+++ b/docs/source/en/community_projects.md
@@ -1,82 +0,0 @@
-<!--Copyright 2024 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-->
-
-# Community Projects
-
-Welcome to Community Projects. This space is dedicated to showcasing the incredible work and innovative applications created by our vibrant community using the `diffusers` library.
-
-This section aims to:
-
- Highlight diverse and inspiring projects built with `diffusers`
- Foster knowledge sharing within our community
- Provide real-world examples of how `diffusers` can be leveraged
-
-Happy exploring, and thank you for being part of the Diffusers community!
-
-<table>
-    <tr>
-        <th>Project Name</th>
-        <th>Description</th>
-    </tr>
-  <tr style="border-top: 2px solid black">
-    <td><a href="https://github.com/carson-katri/dream-textures"> dream-textures </a></td>
-    <td>Stable Diffusion built-in to Blender</td>
-  </tr>
-  <tr style="border-top: 2px solid black">
-    <td><a href="https://github.com/megvii-research/HiDiffusion"> HiDiffusion </a></td>
-    <td>Increases the resolution and speed of your diffusion model by only adding a single line of code</td>
-  </tr>
-  <tr style="border-top: 2px solid black">
-    <td><a href="https://github.com/lllyasviel/IC-Light"> IC-Light </a></td>
-    <td>IC-Light is a project to manipulate the illumination of images</td>
-  </tr>
-  <tr style="border-top: 2px solid black">
-    <td><a href="https://github.com/InstantID/InstantID"> InstantID </a></td>
-    <td>InstantID : Zero-shot Identity-Preserving Generation in Seconds</td>
-  </tr>
-  <tr style="border-top: 2px solid black">
-    <td><a href="https://github.com/Sanster/IOPaint"> IOPaint </a></td>
-    <td>Image inpainting tool powered by SOTA AI Model. Remove any unwanted object, defect, people from your pictures or erase and replace(powered by stable diffusion) any thing on your pictures.</td>
-  </tr>
-  <tr style="border-top: 2px solid black">
-    <td><a href="https://github.com/bmaltais/kohya_ss"> Kohya </a></td>
-    <td>Gradio GUI for Kohya's Stable Diffusion trainers</td>
-  </tr>
-  <tr style="border-top: 2px solid black">
-    <td><a href="https://github.com/magic-research/magic-animate"> MagicAnimate </a></td>
-    <td>MagicAnimate: Temporally Consistent Human Image Animation using Diffusion Model</td>
-  </tr>
-  <tr style="border-top: 2px solid black">
-    <td><a href="https://github.com/levihsu/OOTDiffusion"> OOTDiffusion </a></td>
-    <td>Outfitting Fusion based Latent Diffusion for Controllable Virtual Try-on</td>
-  </tr>
-  <tr style="border-top: 2px solid black">
-    <td><a href="https://github.com/vladmandic/automatic"> SD.Next </a></td>
-    <td>SD.Next: Advanced Implementation of Stable Diffusion and other Diffusion-based generative image models</td>
-  </tr>
-  <tr style="border-top: 2px solid black">
-    <td><a href="https://github.com/ashawkey/stable-dreamfusion"> stable-dreamfusion </a></td>
-    <td>Text-to-3D & Image-to-3D & Mesh Exportation with NeRF + Diffusion</td>
-  </tr>
-  <tr style="border-top: 2px solid black">
-    <td><a href="https://github.com/HVision-NKU/StoryDiffusion"> StoryDiffusion </a></td>
-    <td>StoryDiffusion can create a magic story by generating consistent images and videos.</td>
-  </tr>
-  <tr style="border-top: 2px solid black">
-    <td><a href="https://github.com/cumulo-autumn/StreamDiffusion"> StreamDiffusion </a></td>
-    <td>A Pipeline-Level Solution for Real-Time Interactive Generation</td>
-  </tr>
-  <tr style="border-top: 2px solid black">
-    <td><a href="https://github.com/Netwrck/stable-diffusion-server"> Stable Diffusion Server </a></td>
-    <td>A server configured for Inpainting/Generation/img2img with one stable diffusion model</td>
-  </tr>
-</table>
--- a/docs/source/en/conceptual/evaluation.md
+++ b/docs/source/en/conceptual/evaluation.md
@@ -92,7 +92,7 @@ images = sd_pipeline(sample_prompts, num_images_per_prompt=1, generator=generato

 ![parti-prompts-14](https://huggingface.co/datasets/diffusers/docs-images/resolve/main/evaluation_diffusion_models/parti-prompts-14.png)

-We can also set `num_images_per_prompt` accordingly to compare different images for the same prompt. Running the same pipeline but with a different checkpoint ([v1-5](https://huggingface.co/stable-diffusion-v1-5/stable-diffusion-v1-5)), yields:
+We can also set `num_images_per_prompt` accordingly to compare different images for the same prompt. Running the same pipeline but with a different checkpoint ([v1-5](https://huggingface.co/runwayml/stable-diffusion-v1-5)), yields:

 ![parti-prompts-15](https://huggingface.co/datasets/diffusers/docs-images/resolve/main/evaluation_diffusion_models/parti-prompts-15.png)

@@ -177,10 +177,10 @@ generator = torch.manual_seed(seed)
 images = sd_pipeline(prompts, num_images_per_prompt=1, generator=generator, output_type="np").images
 ```

-Then we load the [v1-5 checkpoint](https://huggingface.co/stable-diffusion-v1-5/stable-diffusion-v1-5) to generate images:
+Then we load the [v1-5 checkpoint](https://huggingface.co/runwayml/stable-diffusion-v1-5) to generate images:

 ```python
-model_ckpt_1_5 = "stable-diffusion-v1-5/stable-diffusion-v1-5"
+model_ckpt_1_5 = "runwayml/stable-diffusion-v1-5"
 sd_pipeline_1_5 = StableDiffusionPipeline.from_pretrained(model_ckpt_1_5, torch_dtype=weight_dtype).to(device)

 images_1_5 = sd_pipeline_1_5(prompts, num_images_per_prompt=1, generator=generator, output_type="np").images
@@ -198,7 +198,7 @@ print(f"CLIP Score with v-1-5: {sd_clip_score_1_5}")
 # CLIP Score with v-1-5: 36.2137
 ```

-It seems like the [v1-5](https://huggingface.co/stable-diffusion-v1-5/stable-diffusion-v1-5) checkpoint performs better than its predecessor. Note, however, that the number of prompts we used to compute the CLIP scores is quite low. For a more practical evaluation, this number should be way higher, and the prompts should be diverse.
+It seems like the [v1-5](https://huggingface.co/runwayml/stable-diffusion-v1-5) checkpoint performs better than its predecessor. Note, however, that the number of prompts we used to compute the CLIP scores is quite low. For a more practical evaluation, this number should be way higher, and the prompts should be diverse.

 <Tip warning={true}>

--- a/docs/source/en/conceptual/philosophy.md
+++ b/docs/source/en/conceptual/philosophy.md
@@ -65,7 +65,7 @@ Pipelines are designed to be easy to use (therefore do not follow [*Simple over
 The following design principles are followed:
 - Pipelines follow the single-file policy. All pipelines can be found in individual directories under src/diffusers/pipelines. One pipeline folder corresponds to one diffusion paper/project/release. Multiple pipeline files can be gathered in one pipeline folder, as it’s done for [`src/diffusers/pipelines/stable-diffusion`](https://github.com/huggingface/diffusers/tree/main/src/diffusers/pipelines/stable_diffusion). If pipelines share similar functionality, one can make use of the [# Copied from mechanism](https://github.com/huggingface/diffusers/blob/125d783076e5bd9785beb05367a2d2566843a271/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py#L251).
 - Pipelines all inherit from [`DiffusionPipeline`].
- Every pipeline consists of different model and scheduler components, that are documented in the [`model_index.json` file](https://huggingface.co/stable-diffusion-v1-5/stable-diffusion-v1-5/blob/main/model_index.json), are accessible under the same name as attributes of the pipeline and can be shared between pipelines with [`DiffusionPipeline.components`](https://huggingface.co/docs/diffusers/main/en/api/diffusion_pipeline#diffusers.DiffusionPipeline.components) function.
+- Every pipeline consists of different model and scheduler components, that are documented in the [`model_index.json` file](https://huggingface.co/runwayml/stable-diffusion-v1-5/blob/main/model_index.json), are accessible under the same name as attributes of the pipeline and can be shared between pipelines with [`DiffusionPipeline.components`](https://huggingface.co/docs/diffusers/main/en/api/diffusion_pipeline#diffusers.DiffusionPipeline.components) function.
 - Every pipeline should be loadable via the [`DiffusionPipeline.from_pretrained`](https://huggingface.co/docs/diffusers/main/en/api/diffusion_pipeline#diffusers.DiffusionPipeline.from_pretrained) function.
 - Pipelines should be used **only** for inference.
 - Pipelines should be very readable, self-explanatory, and easy to tweak.
--- a/docs/source/en/optimization/coreml.md
+++ b/docs/source/en/optimization/coreml.md
@@ -95,17 +95,17 @@ print(f"Model downloaded at {model_path}")
 Once you have downloaded a snapshot of the model, you can test it using Apple's Python script.

 ```shell
-python -m python_coreml_stable_diffusion.pipeline --prompt "a photo of an astronaut riding a horse on mars" -i ./models/coreml-stable-diffusion-v1-4_original_packages/original/packages -o </path/to/output/image> --compute-unit CPU_AND_GPU --seed 93
+python -m python_coreml_stable_diffusion.pipeline --prompt "a photo of an astronaut riding a horse on mars" -i ./models/coreml-stable-diffusion-v1-4_original_packages/original/packages -o </path/to/output/image> --compute-unit CPU_AND_GPU --seed 93
 ```

 Pass the path of the downloaded checkpoint with `-i` flag to the script. `--compute-unit` indicates the hardware you want to allow for inference. It must be one of the following options: `ALL`, `CPU_AND_GPU`, `CPU_ONLY`, `CPU_AND_NE`. You may also provide an optional output path, and a seed for reproducibility.

 The inference script assumes you're using the original version of the Stable Diffusion model, `CompVis/stable-diffusion-v1-4`. If you use another model, you *have* to specify its Hub id in the inference command line, using the `--model-version` option. This works for models already supported and custom models you trained or fine-tuned yourself.

-For example, if you want to use [`stable-diffusion-v1-5/stable-diffusion-v1-5`](https://huggingface.co/stable-diffusion-v1-5/stable-diffusion-v1-5):
+For example, if you want to use [`runwayml/stable-diffusion-v1-5`](https://huggingface.co/runwayml/stable-diffusion-v1-5):

 ```shell
-python -m python_coreml_stable_diffusion.pipeline --prompt "a photo of an astronaut riding a horse on mars" --compute-unit ALL -o output --seed 93 -i models/coreml-stable-diffusion-v1-5_original_packages --model-version stable-diffusion-v1-5/stable-diffusion-v1-5
+python -m python_coreml_stable_diffusion.pipeline --prompt "a photo of an astronaut riding a horse on mars" --compute-unit ALL -o output --seed 93 -i models/coreml-stable-diffusion-v1-5_original_packages --model-version runwayml/stable-diffusion-v1-5
 ```

 ## Core ML inference in Swift
--- a/docs/source/en/optimization/deepcache.md
+++ b/docs/source/en/optimization/deepcache.md
@@ -23,7 +23,7 @@ Then load and enable the [`DeepCacheSDHelper`](https://github.com/horseee/DeepCa
 ```diff
  import torch
  from diffusers import StableDiffusionPipeline
-  pipe = StableDiffusionPipeline.from_pretrained('stable-diffusion-v1-5/stable-diffusion-v1-5', torch_dtype=torch.float16).to("cuda")
+  pipe = StableDiffusionPipeline.from_pretrained('runwayml/stable-diffusion-v1-5', torch_dtype=torch.float16).to("cuda")

 + from DeepCache import DeepCacheSDHelper
 + helper = DeepCacheSDHelper(pipe=pipe)
--- a/docs/source/en/optimization/fp16.md
+++ b/docs/source/en/optimization/fp16.md
@@ -47,7 +47,7 @@ import torch
 from diffusers import DiffusionPipeline

 pipe = DiffusionPipeline.from_pretrained(
-    "stable-diffusion-v1-5/stable-diffusion-v1-5",
+    "runwayml/stable-diffusion-v1-5",
    torch_dtype=torch.float16,
    use_safetensors=True,
 )
@@ -125,5 +125,3 @@ image
    <figcaption class="mt-2 text-center text-sm text-gray-500">distilled Stable Diffusion + Tiny AutoEncoder</figcaption>
  </div>
 </div>
-
-More tiny autoencoder models for other Stable Diffusion models, like Stable Diffusion 3, are available from [madebyollin](https://huggingface.co/madebyollin).
--- a/docs/source/en/optimization/habana.md
+++ b/docs/source/en/optimization/habana.md
@@ -61,7 +61,7 @@ For more information, check out 🤗 Optimum Habana's [documentation](https://hu

 We benchmarked Habana's first-generation Gaudi and Gaudi2 with the [Habana/stable-diffusion](https://huggingface.co/Habana/stable-diffusion) and [Habana/stable-diffusion-2](https://huggingface.co/Habana/stable-diffusion-2) Gaudi configurations (mixed precision bf16/fp32) to demonstrate their performance.

-For [Stable Diffusion v1.5](https://huggingface.co/stable-diffusion-v1-5/stable-diffusion-v1-5) on 512x512 images:
+For [Stable Diffusion v1.5](https://huggingface.co/runwayml/stable-diffusion-v1-5) on 512x512 images:

 |                        | Latency (batch size = 1) | Throughput  |
 | ---------------------- |:------------------------:|:---------------------------:|
--- a/docs/source/en/optimization/memory.md
+++ b/docs/source/en/optimization/memory.md
@@ -41,7 +41,7 @@ import torch
 from diffusers import StableDiffusionPipeline

 pipe = StableDiffusionPipeline.from_pretrained(
-    "stable-diffusion-v1-5/stable-diffusion-v1-5",
+    "runwayml/stable-diffusion-v1-5",
    torch_dtype=torch.float16,
    use_safetensors=True,
 )
@@ -66,7 +66,7 @@ import torch
 from diffusers import StableDiffusionPipeline, UniPCMultistepScheduler

 pipe = StableDiffusionPipeline.from_pretrained(
-    "stable-diffusion-v1-5/stable-diffusion-v1-5",
+    "runwayml/stable-diffusion-v1-5",
    torch_dtype=torch.float16,
    use_safetensors=True,
 )
@@ -92,7 +92,7 @@ import torch
 from diffusers import StableDiffusionPipeline

 pipe = StableDiffusionPipeline.from_pretrained(
-    "stable-diffusion-v1-5/stable-diffusion-v1-5",
+    "runwayml/stable-diffusion-v1-5",
    torch_dtype=torch.float16,
    use_safetensors=True,
 )
@@ -140,7 +140,7 @@ import torch
 from diffusers import StableDiffusionPipeline

 pipe = StableDiffusionPipeline.from_pretrained(
-    "stable-diffusion-v1-5/stable-diffusion-v1-5",
+    "runwayml/stable-diffusion-v1-5",
    torch_dtype=torch.float16,
    use_safetensors=True,
 )
@@ -201,7 +201,7 @@ def generate_inputs():


 pipe = StableDiffusionPipeline.from_pretrained(
-    "stable-diffusion-v1-5/stable-diffusion-v1-5",
+    "runwayml/stable-diffusion-v1-5",
    torch_dtype=torch.float16,
    use_safetensors=True,
 ).to("cuda")
@@ -265,7 +265,7 @@ class UNet2DConditionOutput:


 pipe = StableDiffusionPipeline.from_pretrained(
-    "stable-diffusion-v1-5/stable-diffusion-v1-5",
+    "runwayml/stable-diffusion-v1-5",
    torch_dtype=torch.float16,
    use_safetensors=True,
 ).to("cuda")
@@ -315,7 +315,7 @@ from diffusers import DiffusionPipeline
 import torch

 pipe = DiffusionPipeline.from_pretrained(
-    "stable-diffusion-v1-5/stable-diffusion-v1-5",
+    "runwayml/stable-diffusion-v1-5",
    torch_dtype=torch.float16,
    use_safetensors=True,
 ).to("cuda")
--- a/docs/source/en/optimization/mps.md
+++ b/docs/source/en/optimization/mps.md
@@ -24,7 +24,7 @@ The `mps` backend uses PyTorch's `.to()` interface to move the Stable Diffusion
 ```python
 from diffusers import DiffusionPipeline

-pipe = DiffusionPipeline.from_pretrained("stable-diffusion-v1-5/stable-diffusion-v1-5")
+pipe = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
 pipe = pipe.to("mps")

 # Recommended if your computer has < 64 GB of RAM
@@ -46,7 +46,7 @@ If you're using **PyTorch 1.13**, you need to "prime" the pipeline with an addit
 ```diff
  from diffusers import DiffusionPipeline

-  pipe = DiffusionPipeline.from_pretrained("stable-diffusion-v1-5/stable-diffusion-v1-5").to("mps")
+  pipe = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5").to("mps")
  pipe.enable_attention_slicing()

  prompt = "a photo of an astronaut riding a horse on mars"
@@ -67,7 +67,7 @@ To prevent this from happening, we recommend *attention slicing* to reduce memor
 from diffusers import DiffusionPipeline
 import torch

-pipeline = DiffusionPipeline.from_pretrained("stable-diffusion-v1-5/stable-diffusion-v1-5", torch_dtype=torch.float16, variant="fp16", use_safetensors=True).to("mps")
+pipeline = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16, variant="fp16", use_safetensors=True).to("mps")
 pipeline.enable_attention_slicing()
 ```

--- a/docs/source/en/optimization/neuron.md
+++ b/docs/source/en/optimization/neuron.md
@@ -1,61 +0,0 @@
-<!--Copyright 2024 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-->
-
-# AWS Neuron
-
-Diffusers functionalities are available on [AWS Inf2 instances](https://aws.amazon.com/ec2/instance-types/inf2/), which are EC2 instances powered by [Neuron machine learning accelerators](https://aws.amazon.com/machine-learning/inferentia/). These instances aim to provide better compute performance (higher throughput, lower latency) with good cost-efficiency, making them good candidates for AWS users to deploy diffusion models to production.
-
-[Optimum Neuron](https://huggingface.co/docs/optimum-neuron/en/index) is the interface between Hugging Face libraries and AWS Accelerators, including AWS [Trainium](https://aws.amazon.com/machine-learning/trainium/) and AWS [Inferentia](https://aws.amazon.com/machine-learning/inferentia/). It supports many of the features in Diffusers with similar APIs, so it is easier to learn if you're already familiar with Diffusers. Once you have created an AWS Inf2 instance, install Optimum Neuron.
-
-```bash
-python -m pip install --upgrade-strategy eager optimum[neuronx]
-```
-
-<Tip>
-
-We provide pre-built [Hugging Face Neuron Deep Learning AMI](https://aws.amazon.com/marketplace/pp/prodview-gr3e6yiscria2) (DLAMI) and Optimum Neuron containers for Amazon SageMaker. It's recommended to correctly set up your environment.
-
-</Tip>
-
-The example below demonstrates how to generate images with the Stable Diffusion XL model on an inf2.8xlarge instance (you can switch to cheaper inf2.xlarge instances once the model is compiled). To generate some images, use the [`~optimum.neuron.NeuronStableDiffusionXLPipeline`] class, which is similar to the [`StableDiffusionXLPipeline`] class in Diffusers.
-
-Unlike Diffusers, you need to compile models in the pipeline to the Neuron format, `.neuron`. Launch the following command to export the model to the `.neuron` format.
-
-```bash
-optimum-cli export neuron --model stabilityai/stable-diffusion-xl-base-1.0 \
-  --batch_size 1 \
-  --height 1024 `# height in pixels of generated image, eg. 768, 1024` \
-  --width 1024 `# width in pixels of generated image, eg. 768, 1024` \
-  --num_images_per_prompt 1 `# number of images to generate per prompt, defaults to 1` \
-  --auto_cast matmul `# cast only matrix multiplication operations` \
-  --auto_cast_type bf16 `# cast operations from FP32 to BF16` \
-  sd_neuron_xl/
-```
-
-Now generate some images with the pre-compiled SDXL model.
-
-```python
->>> from optimum.neuron import NeuronStableDiffusionXLPipeline
-
->>> stable_diffusion_xl = NeuronStableDiffusionXLPipeline.from_pretrained("sd_neuron_xl/")
->>> prompt = "a pig with wings flying in floating US dollar banknotes in the air, skyscrapers behind, warm color palette, muted colors, detailed, 8k"
->>> image = stable_diffusion_xl(prompt).images[0]
-```
-
-<img
-  src="https://huggingface.co/datasets/Jingya/document_images/resolve/main/optimum/neuron/sdxl_pig.png"
-  width="256"
-  height="256"
-  alt="peggy generated by sdxl on inf2"
-/>
-
-Feel free to check out more guides and examples on different use cases from the Optimum Neuron [documentation](https://huggingface.co/docs/optimum-neuron/en/inference_tutorials/stable_diffusion#generate-images-with-stable-diffusion-models-on-aws-inferentia)!
--- a/docs/source/en/optimization/onnx.md
+++ b/docs/source/en/optimization/onnx.md
@@ -27,7 +27,7 @@ To load and run inference, use the [`~optimum.onnxruntime.ORTStableDiffusionPipe
 ```python
 from optimum.onnxruntime import ORTStableDiffusionPipeline

-model_id = "stable-diffusion-v1-5/stable-diffusion-v1-5"
+model_id = "runwayml/stable-diffusion-v1-5"
 pipeline = ORTStableDiffusionPipeline.from_pretrained(model_id, export=True)
 prompt = "sailing ship in storm by Leonardo da Vinci"
 image = pipeline(prompt).images[0]
@@ -44,7 +44,7 @@ To export the pipeline in the ONNX format offline and use it later for inference
 use the [`optimum-cli export`](https://huggingface.co/docs/optimum/main/en/exporters/onnx/usage_guides/export_a_model#exporting-a-model-to-onnx-using-the-cli) command:

 ```bash
-optimum-cli export onnx --model stable-diffusion-v1-5/stable-diffusion-v1-5 sd_v15_onnx/
+optimum-cli export onnx --model runwayml/stable-diffusion-v1-5 sd_v15_onnx/
 ```

 Then to perform inference (you don't have to specify `export=True` again):
--- a/docs/source/en/optimization/open_vino.md
+++ b/docs/source/en/optimization/open_vino.md
@@ -29,7 +29,7 @@ To load and run inference, use the [`~optimum.intel.OVStableDiffusionPipeline`].
 ```python
 from optimum.intel import OVStableDiffusionPipeline

-model_id = "stable-diffusion-v1-5/stable-diffusion-v1-5"
+model_id = "runwayml/stable-diffusion-v1-5"
 pipeline = OVStableDiffusionPipeline.from_pretrained(model_id, export=True)
 prompt = "sailing ship in storm by Rembrandt"
 image = pipeline(prompt).images[0]
--- a/docs/source/en/optimization/tome.md
+++ b/docs/source/en/optimization/tome.md
@@ -28,7 +28,7 @@ You can use ToMe from the [`tomesd`](https://github.com/dbolya/tomesd) library w
  import tomesd

  pipeline = StableDiffusionPipeline.from_pretrained(
-        "stable-diffusion-v1-5/stable-diffusion-v1-5", torch_dtype=torch.float16, use_safetensors=True,
+        "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16, use_safetensors=True,
  ).to("cuda")
 + tomesd.apply_patch(pipeline, ratio=0.5)

--- a/docs/source/en/optimization/torch2.0.md
+++ b/docs/source/en/optimization/torch2.0.md
@@ -34,7 +34,7 @@ However, if you want to explicitly enable it, you can set a [`DiffusionPipeline`
  from diffusers import DiffusionPipeline
 + from diffusers.models.attention_processor import AttnProcessor2_0

-  pipe = DiffusionPipeline.from_pretrained("stable-diffusion-v1-5/stable-diffusion-v1-5", torch_dtype=torch.float16, use_safetensors=True).to("cuda")
+  pipe = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16, use_safetensors=True).to("cuda")
 + pipe.unet.set_attn_processor(AttnProcessor2_0())

  prompt = "a photo of an astronaut riding a horse on mars"
@@ -49,7 +49,7 @@ In some cases - such as making the pipeline more deterministic or converting it
  import torch
  from diffusers import DiffusionPipeline

-  pipe = DiffusionPipeline.from_pretrained("stable-diffusion-v1-5/stable-diffusion-v1-5", torch_dtype=torch.float16, use_safetensors=True).to("cuda")
+  pipe = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16, use_safetensors=True).to("cuda")
 + pipe.unet.set_default_attn_processor()

  prompt = "a photo of an astronaut riding a horse on mars"
@@ -64,7 +64,7 @@ The `torch.compile` function can often provide an additional speed-up to your Py
 from diffusers import DiffusionPipeline
 import torch

-pipe = DiffusionPipeline.from_pretrained("stable-diffusion-v1-5/stable-diffusion-v1-5", torch_dtype=torch.float16, use_safetensors=True).to("cuda")
+pipe = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16, use_safetensors=True).to("cuda")
 pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)
 images = pipe(prompt, num_inference_steps=steps, num_images_per_prompt=batch_size).images[0]
 ```
@@ -92,7 +92,7 @@ Expand the dropdown below to find the code used to benchmark each pipeline:
 from diffusers import DiffusionPipeline
 import torch

-path = "stable-diffusion-v1-5/stable-diffusion-v1-5"
+path = "runwayml/stable-diffusion-v1-5"

 run_compile = True  # Set True / False

@@ -122,7 +122,7 @@ url = "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/assets/st
 init_image = load_image(url)
 init_image = init_image.resize((512, 512))

-path = "stable-diffusion-v1-5/stable-diffusion-v1-5"
+path = "runwayml/stable-diffusion-v1-5"

 run_compile = True  # Set True / False

@@ -183,7 +183,7 @@ url = "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/assets/st
 init_image = load_image(url)
 init_image = init_image.resize((512, 512))

-path = "stable-diffusion-v1-5/stable-diffusion-v1-5"
+path = "runwayml/stable-diffusion-v1-5"

 run_compile = True  # Set True / False
 controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny", torch_dtype=torch.float16, use_safetensors=True)
--- a/docs/source/en/optimization/xdit.md
+++ b/docs/source/en/optimization/xdit.md
@@ -1,121 +0,0 @@
-# xDiT
-
-[xDiT](https://github.com/xdit-project/xDiT) is an inference engine designed for the large scale parallel deployment of Diffusion Transformers (DiTs). xDiT provides a suite of efficient parallel approaches for Diffusion Models, as well as GPU kernel accelerations.
-
-There are four parallel methods supported in xDiT, including [Unified Sequence Parallelism](https://arxiv.org/abs/2405.07719), [PipeFusion](https://arxiv.org/abs/2405.14430), CFG parallelism and data parallelism. The four parallel methods in xDiT can be configured in a hybrid manner, optimizing communication patterns to best suit the underlying network hardware.
-
-Optimization orthogonal to parallelization focuses on accelerating single GPU performance. In addition to utilizing well-known Attention optimization libraries, we leverage compilation acceleration technologies such as torch.compile and onediff.
-
-The overview of xDiT is shown as follows.
-
-<div class="flex justify-center">
-    <img src="https://huggingface.co/datasets/xDiT/documentation-images/resolve/main/methods/xdit_overview.png">
-</div>
-You can install xDiT using the following command:
-
-
-```bash
-pip install xfuser
-```
-
-Here's an example of using xDiT to accelerate inference of a Diffusers model.
-
-```diff
- import torch
- from diffusers import StableDiffusion3Pipeline
-
- from xfuser import xFuserArgs, xDiTParallel
- from xfuser.config import FlexibleArgumentParser
- from xfuser.core.distributed import get_world_group
-
- def main():
-+    parser = FlexibleArgumentParser(description="xFuser Arguments")
-+    args = xFuserArgs.add_cli_args(parser).parse_args()
-+    engine_args = xFuserArgs.from_cli_args(args)
-+    engine_config, input_config = engine_args.create_config()
-
-     local_rank = get_world_group().local_rank
-     pipe = StableDiffusion3Pipeline.from_pretrained(
-         pretrained_model_name_or_path=engine_config.model_config.model,
-         torch_dtype=torch.float16,
-     ).to(f"cuda:{local_rank}")
-    
-# do anything you want with pipeline here
-
-+    pipe = xDiTParallel(pipe, engine_config, input_config)
-
-     pipe(
-         height=input_config.height,
-         width=input_config.height,
-         prompt=input_config.prompt,
-         num_inference_steps=input_config.num_inference_steps,
-         output_type=input_config.output_type,
-         generator=torch.Generator(device="cuda").manual_seed(input_config.seed),
-     )
-
-+    if input_config.output_type == "pil":
-+        pipe.save("results", "stable_diffusion_3")
-
-if __name__ == "__main__":
-    main()
-
-```
-
-As you can see, we only need to use xFuserArgs from xDiT to get configuration parameters, and pass these parameters along with the pipeline object from the Diffusers library into xDiTParallel to complete the parallelization of a specific pipeline in Diffusers.
-
-xDiT runtime parameters can be viewed in the command line using `-h`, and you can refer to this [usage](https://github.com/xdit-project/xDiT?tab=readme-ov-file#2-usage) example for more details.
-
-xDiT needs to be launched using torchrun to support its multi-node, multi-GPU parallel capabilities. For example, the following command can be used for 8-GPU parallel inference:
-
-```bash
-torchrun --nproc_per_node=8 ./inference.py --model models/FLUX.1-dev --data_parallel_degree 2 --ulysses_degree 2 --ring_degree 2 --prompt "A snowy mountain" "A small dog" --num_inference_steps 50
-```
-
-## Supported models
-
-A subset of Diffusers models are supported in xDiT, such as Flux.1, Stable Diffusion 3, etc. The latest supported models can be found [here](https://github.com/xdit-project/xDiT?tab=readme-ov-file#-supported-dits).
-
-## Benchmark
-We tested different models on various machines, and here is some of the benchmark data.
-
-### Flux.1-schnell
-<div class="flex justify-center">
-    <img src="https://huggingface.co/datasets/xDiT/documentation-images/resolve/main/performance/flux/Flux-2k-L40.png">
-</div>
-
-
-<div class="flex justify-center">
-    <img src="https://huggingface.co/datasets/xDiT/documentation-images/resolve/main/performance/flux/Flux-2K-A100.png">
-</div>
-
-### Stable Diffusion 3
-<div class="flex justify-center">
-    <img src="https://huggingface.co/datasets/xDiT/documentation-images/resolve/main/performance/sd3/L40-SD3.png">
-</div>
-
-<div class="flex justify-center">
-    <img src="https://huggingface.co/datasets/xDiT/documentation-images/resolve/main/performance/sd3/A100-SD3.png">
-</div>
-
-### HunyuanDiT
-<div class="flex justify-center">
-    <img src="https://huggingface.co/datasets/xDiT/documentation-images/resolve/main/performance/hunuyuandit/L40-HunyuanDiT.png">
-</div>
-
-<div class="flex justify-center">
-    <img src="https://huggingface.co/datasets/xDiT/documentation-images/resolve/main/performance/hunuyuandit/V100-HunyuanDiT.png">
-</div>
-
-<div class="flex justify-center">
-    <img src="https://huggingface.co/datasets/xDiT/documentation-images/resolve/main/performance/hunuyuandit/T4-HunyuanDiT.png">
-</div>
-
-More detailed performance metric can be found on our [github page](https://github.com/xdit-project/xDiT?tab=readme-ov-file#perf).
-
-## Reference
-
-[xDiT-project](https://github.com/xdit-project/xDiT)
-
-[USP: A Unified Sequence Parallelism Approach for Long Context Generative AI](https://arxiv.org/abs/2405.07719)
-
-[PipeFusion: Displaced Patch Pipeline Parallelism for Inference of Diffusion Transformer Models](https://arxiv.org/abs/2405.14430)
--- a/docs/source/en/quantization/bitsandbytes.md
+++ b/docs/source/en/quantization/bitsandbytes.md
@@ -1,260 +0,0 @@
-<!--Copyright 2024 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-->
-
-# bitsandbytes
-
-[bitsandbytes](https://huggingface.co/docs/bitsandbytes/index) is the easiest option for quantizing a model to 8 and 4-bit. 8-bit quantization multiplies outliers in fp16 with non-outliers in int8, converts the non-outlier values back to fp16, and then adds them together to return the weights in fp16. This reduces the degradative effect outlier values have on a model's performance.
-
-4-bit quantization compresses a model even further, and it is commonly used with [QLoRA](https://hf.co/papers/2305.14314) to finetune quantized LLMs.
-
-
-To use bitsandbytes, make sure you have the following libraries installed:
-
-```bash
-pip install diffusers transformers accelerate bitsandbytes -U
-```
-
-Now you can quantize a model by passing a [`BitsAndBytesConfig`] to [`~ModelMixin.from_pretrained`]. This works for any model in any modality, as long as it supports loading with [Accelerate](https://hf.co/docs/accelerate/index) and contains `torch.nn.Linear` layers.
-
-<hfoptions id="bnb">
-<hfoption id="8-bit">
-
-Quantizing a model in 8-bit halves the memory-usage:
-
-```py
-from diffusers import FluxTransformer2DModel, BitsAndBytesConfig
-
-quantization_config = BitsAndBytesConfig(load_in_8bit=True)
-
-model_8bit = FluxTransformer2DModel.from_pretrained(
-    "black-forest-labs/FLUX.1-dev", 
-    subfolder="transformer",
-    quantization_config=quantization_config
-)
-```
-
-By default, all the other modules such as `torch.nn.LayerNorm` are converted to `torch.float16`. You can change the data type of these modules with the `torch_dtype` parameter if you want:
-
-```py
-from diffusers import FluxTransformer2DModel, BitsAndBytesConfig
-
-quantization_config = BitsAndBytesConfig(load_in_8bit=True)
-
-model_8bit = FluxTransformer2DModel.from_pretrained(
-    "black-forest-labs/FLUX.1-dev", 
-    subfolder="transformer",
-    quantization_config=quantization_config,
-    torch_dtype=torch.float32
-)
-model_8bit.transformer_blocks.layers[-1].norm2.weight.dtype
-```
-
-Once a model is quantized, you can push the model to the Hub with the [`~ModelMixin.push_to_hub`] method. The quantization `config.json` file is pushed first, followed by the quantized model weights. You can also save the serialized 4-bit models locally with [`~ModelMixin.save_pretrained`].
-
-</hfoption>
-<hfoption id="4-bit">
-
-Quantizing a model in 4-bit reduces your memory-usage by 4x:
-
-```py
-from diffusers import FluxTransformer2DModel, BitsAndBytesConfig
-
-quantization_config = BitsAndBytesConfig(load_in_4bit=True)
-
-model_4bit = FluxTransformer2DModel.from_pretrained(
-    "black-forest-labs/FLUX.1-dev", 
-    subfolder="transformer",
-    quantization_config=quantization_config
-)
-```
-
-By default, all the other modules such as `torch.nn.LayerNorm` are converted to `torch.float16`. You can change the data type of these modules with the `torch_dtype` parameter if you want:
-
-```py
-from diffusers import FluxTransformer2DModel, BitsAndBytesConfig
-
-quantization_config = BitsAndBytesConfig(load_in_4bit=True)
-
-model_4bit = FluxTransformer2DModel.from_pretrained(
-    "black-forest-labs/FLUX.1-dev", 
-    subfolder="transformer",
-    quantization_config=quantization_config,
-    torch_dtype=torch.float32
-)
-model_4bit.transformer_blocks.layers[-1].norm2.weight.dtype
-```
-
-Call [`~ModelMixin.push_to_hub`] after loading it in 4-bit precision. You can also save the serialized 4-bit models locally with [`~ModelMixin.save_pretrained`].  
-
-</hfoption>
-</hfoptions>
-
-<Tip warning={true}>
-
-Training with 8-bit and 4-bit weights are only supported for training *extra* parameters.
-
-</Tip>
-
-Check your memory footprint with the `get_memory_footprint` method:
-
-```py
-print(model.get_memory_footprint())
-```
-
-Quantized models can be loaded from the [`~ModelMixin.from_pretrained`] method without needing to specify the `quantization_config` parameters:
-
-```py
-from diffusers import FluxTransformer2DModel, BitsAndBytesConfig
-
-quantization_config = BitsAndBytesConfig(load_in_4bit=True)
-
-model_4bit = FluxTransformer2DModel.from_pretrained(
-    "hf-internal-testing/flux.1-dev-nf4-pkg", subfolder="transformer"
-)
-```
-
-## 8-bit (LLM.int8() algorithm)
-
-<Tip>
-
-Learn more about the details of 8-bit quantization in this [blog post](https://huggingface.co/blog/hf-bitsandbytes-integration)!
-
-</Tip>
-
-This section explores some of the specific features of 8-bit models, such as outlier thresholds and skipping module conversion.
-
-### Outlier threshold
-
-An "outlier" is a hidden state value greater than a certain threshold, and these values are computed in fp16. While the values are usually normally distributed ([-3.5, 3.5]), this distribution can be very different for large models ([-60, 6] or [6, 60]). 8-bit quantization works well for values ~5, but beyond that, there is a significant performance penalty. A good default threshold value is 6, but a lower threshold may be needed for more unstable models (small models or finetuning).
-
-To find the best threshold for your model, we recommend experimenting with the `llm_int8_threshold` parameter in [`BitsAndBytesConfig`]:
-
-```py
-from diffusers import FluxTransformer2DModel, BitsAndBytesConfig
-
-quantization_config = BitsAndBytesConfig(
-    load_in_8bit=True, llm_int8_threshold=10,
-)
-
-model_8bit = FluxTransformer2DModel.from_pretrained(
-    "black-forest-labs/FLUX.1-dev",
-    subfolder="transformer",
-    quantization_config=quantization_config,
-)
-```
-
-### Skip module conversion
-
-For some models, you don't need to quantize every module to 8-bit which can actually cause instability. For example, for diffusion models like [Stable Diffusion 3](../api/pipelines/stable_diffusion/stable_diffusion_3), the `proj_out` module can be skipped using the `llm_int8_skip_modules` parameter in [`BitsAndBytesConfig`]:
-
-```py
-from diffusers import SD3Transformer2DModel, BitsAndBytesConfig
-
-quantization_config = BitsAndBytesConfig(
-    load_in_8bit=True, llm_int8_skip_modules=["proj_out"],
-)
-
-model_8bit = SD3Transformer2DModel.from_pretrained(
-    "stabilityai/stable-diffusion-3-medium-diffusers",
-    subfolder="transformer",
-    quantization_config=quantization_config,
-)
-```
-
-
-## 4-bit (QLoRA algorithm)
-
-<Tip>
-
-Learn more about its details in this [blog post](https://huggingface.co/blog/4bit-transformers-bitsandbytes).
-
-</Tip>
-
-This section explores some of the specific features of 4-bit models, such as changing the compute data type, using the Normal Float 4 (NF4) data type, and using nested quantization.
-
-
-### Compute data type
-
-To speedup computation, you can change the data type from float32 (the default value) to bf16 using the `bnb_4bit_compute_dtype` parameter in [`BitsAndBytesConfig`]:
-
-```py
-import torch
-from diffusers import BitsAndBytesConfig
-
-quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16)
-```
-
-### Normal Float 4 (NF4)
-
-NF4 is a 4-bit data type from the [QLoRA](https://hf.co/papers/2305.14314) paper, adapted for weights initialized from a normal distribution. You should use NF4 for training 4-bit base models. This can be configured with the `bnb_4bit_quant_type` parameter in the [`BitsAndBytesConfig`]:
-
-```py
-from diffusers import BitsAndBytesConfig
-
-nf4_config = BitsAndBytesConfig(
-    load_in_4bit=True,
-    bnb_4bit_quant_type="nf4",
-)
-
-model_nf4 = SD3Transformer2DModel.from_pretrained(
-    "stabilityai/stable-diffusion-3-medium-diffusers",
-    subfolder="transformer",
-    quantization_config=nf4_config,
-)
-```
-
-For inference, the `bnb_4bit_quant_type` does not have a huge impact on performance. However, to remain consistent with the model weights, you should use the `bnb_4bit_compute_dtype` and `torch_dtype` values.
-
-### Nested quantization
-
-Nested quantization is a technique that can save additional memory at no additional performance cost. This feature performs a second quantization of the already quantized weights to save an additional 0.4 bits/parameter. 
-
-```py
-from diffusers import BitsAndBytesConfig
-
-double_quant_config = BitsAndBytesConfig(
-    load_in_4bit=True,
-    bnb_4bit_use_double_quant=True,
-)
-
-double_quant_model = SD3Transformer2DModel.from_pretrained(
-    "stabilityai/stable-diffusion-3-medium-diffusers",
-    subfolder="transformer",
-    quantization_config=double_quant_config,
-)
-```
-
-## Dequantizing `bitsandbytes` models
-
-Once quantized, you can dequantize the model to the original precision but this might result in a small quality loss of the model. Make sure you have enough GPU RAM to fit the dequantized model. 
-
-```python
-from diffusers import BitsAndBytesConfig
-
-double_quant_config = BitsAndBytesConfig(
-    load_in_4bit=True,
-    bnb_4bit_use_double_quant=True,
-)
-
-double_quant_model = SD3Transformer2DModel.from_pretrained(
-    "stabilityai/stable-diffusion-3-medium-diffusers",
-    subfolder="transformer",
-    quantization_config=double_quant_config,
-)
-model.dequantize()
-```
-
-## Resources
-
-* [End-to-end notebook showing Flux.1 Dev inference in a free-tier Colab](https://gist.github.com/sayakpaul/c76bd845b48759e11687ac550b99d8b4)
-* [Training](https://gist.github.com/sayakpaul/05afd428bc089b47af7c016e42004527)
--- a/docs/source/en/quantization/overview.md
+++ b/docs/source/en/quantization/overview.md
@@ -1,35 +0,0 @@
-<!--Copyright 2024 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-->
-
-# Quantization
-
-Quantization techniques focus on representing data with less information while also trying to not lose too much accuracy. This often means converting a data type to represent the same information with fewer bits. For example, if your model weights are stored as 32-bit floating points and they're quantized to 16-bit floating points, this halves the model size which makes it easier to store and reduces memory-usage. Lower precision can also speedup inference because it takes less time to perform calculations with fewer bits.
-
-<Tip>
-
-Interested in adding a new quantization method to Transformers? Refer to the [Contribute new quantization method guide](https://huggingface.co/docs/transformers/main/en/quantization/contribute) to learn more about adding a new quantization method.
-
-</Tip>
-
-<Tip>
-
-If you are new to the quantization field, we recommend you to check out these beginner-friendly courses about quantization in collaboration with DeepLearning.AI:
-
-* [Quantization Fundamentals with Hugging Face](https://www.deeplearning.ai/short-courses/quantization-fundamentals-with-hugging-face/)
-* [Quantization in Depth](https://www.deeplearning.ai/short-courses/quantization-in-depth/)
-
-</Tip>
-
-## When to use what?
-
-This section will be expanded once Diffusers has multiple quantization backends. Currently, we only support `bitsandbytes`. [This resource](https://huggingface.co/docs/transformers/main/en/quantization/overview#when-to-use-what) provides a good overview of the pros and cons of different quantization techniques. 
--- a/docs/source/en/quicktour.md
+++ b/docs/source/en/quicktour.md
@@ -54,7 +54,7 @@ The [`DiffusionPipeline`] is the easiest way to use a pretrained diffusion syste

 Start by creating an instance of a [`DiffusionPipeline`] and specify which pipeline checkpoint you would like to download.
 You can use the [`DiffusionPipeline`] for any [checkpoint](https://huggingface.co/models?library=diffusers&sort=downloads) stored on the Hugging Face Hub.
-In this quicktour, you'll load the [`stable-diffusion-v1-5`](https://huggingface.co/stable-diffusion-v1-5/stable-diffusion-v1-5) checkpoint for text-to-image generation.
+In this quicktour, you'll load the [`stable-diffusion-v1-5`](https://huggingface.co/runwayml/stable-diffusion-v1-5) checkpoint for text-to-image generation.

 <Tip warning={true}>

@@ -67,7 +67,7 @@ Load the model with the [`~DiffusionPipeline.from_pretrained`] method:
 ```python
 >>> from diffusers import DiffusionPipeline

->>> pipeline = DiffusionPipeline.from_pretrained("stable-diffusion-v1-5/stable-diffusion-v1-5", use_safetensors=True)
+>>> pipeline = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", use_safetensors=True)
 ```

 The [`DiffusionPipeline`] downloads and caches all modeling, tokenization, and scheduling components. You'll see that the Stable Diffusion pipeline is composed of the [`UNet2DConditionModel`] and [`PNDMScheduler`] among other things:
@@ -124,7 +124,7 @@ You can also use the pipeline locally. The only difference is you need to downlo

 ```bash
 !git lfs install
-!git clone https://huggingface.co/stable-diffusion-v1-5/stable-diffusion-v1-5
+!git clone https://huggingface.co/runwayml/stable-diffusion-v1-5
 ```

 Then load the saved weights into the pipeline:
@@ -142,7 +142,7 @@ Different schedulers come with different denoising speeds and quality trade-offs
 ```py
 >>> from diffusers import EulerDiscreteScheduler

->>> pipeline = DiffusionPipeline.from_pretrained("stable-diffusion-v1-5/stable-diffusion-v1-5", use_safetensors=True)
+>>> pipeline = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", use_safetensors=True)
 >>> pipeline.scheduler = EulerDiscreteScheduler.from_config(pipeline.scheduler.config)
 ```

--- a/docs/source/en/stable_diffusion.md
+++ b/docs/source/en/stable_diffusion.md
@@ -20,12 +20,12 @@ This is why it's important to get the most *computational* (speed) and *memory*

 This tutorial walks you through how to generate faster and better with the [`DiffusionPipeline`].

-Begin by loading the [`stable-diffusion-v1-5/stable-diffusion-v1-5`](https://huggingface.co/stable-diffusion-v1-5/stable-diffusion-v1-5) model:
+Begin by loading the [`runwayml/stable-diffusion-v1-5`](https://huggingface.co/runwayml/stable-diffusion-v1-5) model:

 ```python
 from diffusers import DiffusionPipeline

-model_id = "stable-diffusion-v1-5/stable-diffusion-v1-5"
+model_id = "runwayml/stable-diffusion-v1-5"
 pipeline = DiffusionPipeline.from_pretrained(model_id, use_safetensors=True)
 ```

@@ -238,7 +238,7 @@ Pretty impressive! Let's tweak the second image - corresponding to the `Generato
 ```python
 prompts = [
    "portrait photo of the oldest warrior chief, tribal panther make up, blue on red, side profile, looking away, serious eyes 50mm portrait photography, hard rim lighting photography--beta --ar 2:3  --beta --upbeta",
-    "portrait photo of an old warrior chief, tribal panther make up, blue on red, side profile, looking away, serious eyes 50mm portrait photography, hard rim lighting photography--beta --ar 2:3  --beta --upbeta",
+    "portrait photo of an old warrior chief, tribal panther make up, blue on red, side profile, looking away, serious eyes 50mm portrait photography, hard rim lighting photography--beta --ar 2:3  --beta --upbeta",
    "portrait photo of a warrior chief, tribal panther make up, blue on red, side profile, looking away, serious eyes 50mm portrait photography, hard rim lighting photography--beta --ar 2:3  --beta --upbeta",
    "portrait photo of a young warrior chief, tribal panther make up, blue on red, side profile, looking away, serious eyes 50mm portrait photography, hard rim lighting photography--beta --ar 2:3  --beta --upbeta",
 ]
--- a/docs/source/en/training/adapt_a_model.md
+++ b/docs/source/en/training/adapt_a_model.md
@@ -6,12 +6,12 @@ This guide will show you how to adapt a pretrained text-to-image model for inpai

 ## Configure UNet2DConditionModel parameters

-A [`UNet2DConditionModel`] by default accepts 4 channels in the [input sample](https://huggingface.co/docs/diffusers/v0.16.0/en/api/models#diffusers.UNet2DConditionModel.in_channels). For example, load a pretrained text-to-image model like [`stable-diffusion-v1-5/stable-diffusion-v1-5`](https://huggingface.co/stable-diffusion-v1-5/stable-diffusion-v1-5) and take a look at the number of `in_channels`:
+A [`UNet2DConditionModel`] by default accepts 4 channels in the [input sample](https://huggingface.co/docs/diffusers/v0.16.0/en/api/models#diffusers.UNet2DConditionModel.in_channels). For example, load a pretrained text-to-image model like [`runwayml/stable-diffusion-v1-5`](https://huggingface.co/runwayml/stable-diffusion-v1-5) and take a look at the number of `in_channels`:

 ```py
 from diffusers import StableDiffusionPipeline

-pipeline = StableDiffusionPipeline.from_pretrained("stable-diffusion-v1-5/stable-diffusion-v1-5", use_safetensors=True)
+pipeline = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", use_safetensors=True)
 pipeline.unet.config["in_channels"]
 4
 ```
@@ -33,7 +33,7 @@ Initialize a [`UNet2DConditionModel`] with the pretrained text-to-image model we
 ```py
 from diffusers import UNet2DConditionModel

-model_id = "stable-diffusion-v1-5/stable-diffusion-v1-5"
+model_id = "runwayml/stable-diffusion-v1-5"
 unet = UNet2DConditionModel.from_pretrained(
    model_id,
    subfolder="unet",
--- a/docs/source/en/training/cogvideox.md
+++ b/docs/source/en/training/cogvideox.md
@@ -1,291 +0,0 @@
-<!--Copyright 2024 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-->
-# CogVideoX
-
-CogVideoX is a text-to-video generation model focused on creating more coherent videos aligned with a prompt. It achieves this using several methods.
-
- a 3D variational autoencoder that compresses videos spatially and temporally, improving compression rate and video accuracy.
-
- an expert transformer block to help align text and video, and a 3D full attention module for capturing and creating spatially and temporally accurate videos.
-
-The actual test of the video instruction dimension found that CogVideoX has good effects on consistent theme, dynamic information, consistent background, object information, smooth motion, color, scene, appearance style, and temporal style but cannot achieve good results with human action, spatial relationship, and multiple objects.
-
-Finetuning with Diffusers can help make up for these poor results. 
-
-## Data Preparation
-
-The training scripts accepts data in two formats.  
-
-The first format is suited for small-scale training, and the second format uses a CSV format, which is more appropriate for streaming data for large-scale training. In the future, Diffusers will support the `<Video>` tag.
-
-### Small format
-
-Two files where one file contains line-separated prompts and another file contains line-separated paths to video data (the path to video files must be relative to the path you pass when specifying `--instance_data_root`). Let's take a look at an example to understand this better!
-
-Assume you've specified `--instance_data_root` as `/dataset`, and that this directory contains the files: `prompts.txt` and `videos.txt`.
-
-The `prompts.txt` file should contain line-separated prompts:
-
-```
-A black and white animated sequence featuring a rabbit, named Rabbity Ribfried, and an anthropomorphic goat in a musical, playful environment, showcasing their evolving interaction.
-A black and white animated sequence on a ship's deck features a bulldog character, named Bully Bulldoger, showcasing exaggerated facial expressions and body language. The character progresses from confident to focused, then to strained and distressed, displaying a range of emotions as it navigates challenges. The ship's interior remains static in the background, with minimalistic details such as a bell and open door. The character's dynamic movements and changing expressions drive the narrative, with no camera movement to distract from its evolving reactions and physical gestures.
-...
-```
-
-The `videos.txt` file should contain line-separate paths to video files. Note that the path should be _relative_ to the `--instance_data_root` directory.
-
-```
-videos/00000.mp4
-videos/00001.mp4
-...
-```
-
-Overall, this is how your dataset would look like if you ran the `tree` command on the dataset root directory:
-
-```
-/dataset
-├── prompts.txt
-├── videos.txt
-├── videos
-    ├── videos/00000.mp4
-    ├── videos/00001.mp4
-    ├── ...
-```
-
-When using this format, the `--caption_column` must be `prompts.txt` and `--video_column` must be `videos.txt`.
-
-### Stream format
-
-You could use a single CSV file. For the sake of this example, assume you have a `metadata.csv` file. The expected format is:
-
-```
-<CAPTION_COLUMN>,<PATH_TO_VIDEO_COLUMN>
-"""A black and white animated sequence featuring a rabbit, named Rabbity Ribfried, and an anthropomorphic goat in a musical, playful environment, showcasing their evolving interaction.""","""00000.mp4"""
-"""A black and white animated sequence on a ship's deck features a bulldog character, named Bully Bulldoger, showcasing exaggerated facial expressions and body language. The character progresses from confident to focused, then to strained and distressed, displaying a range of emotions as it navigates challenges. The ship's interior remains static in the background, with minimalistic details such as a bell and open door. The character's dynamic movements and changing expressions drive the narrative, with no camera movement to distract from its evolving reactions and physical gestures.""","""00001.mp4"""
-...
-```
-
-In this case, the `--instance_data_root` should be the location where the videos are stored and `--dataset_name` should be either a path to local folder or a [`~datasets.load_dataset`] compatible dataset hosted on the Hub. Assuming you have videos of Minecraft gameplay at `https://huggingface.co/datasets/my-awesome-username/minecraft-videos`, you would have to specify `my-awesome-username/minecraft-videos`.
-
-When using this format, the `--caption_column` must be `<CAPTION_COLUMN>` and `--video_column` must be `<PATH_TO_VIDEO_COLUMN>`.
-
-You are not strictly restricted to the CSV format. Any format works as long as the `load_dataset` method supports the file format to load a basic `<PATH_TO_VIDEO_COLUMN>` and `<CAPTION_COLUMN>`. The reason for going through these dataset organization gymnastics for loading video data is because `load_dataset` does not fully support all kinds of video formats.
-
-> [!NOTE]
-> CogVideoX works best with long and descriptive LLM-augmented prompts for video generation. We recommend pre-processing your videos by first generating a summary using a VLM and then augmenting the prompts with an LLM. To generate the above captions, we use [MiniCPM-V-26](https://huggingface.co/openbmb/MiniCPM-V-2_6) and [Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct). A very barebones and no-frills example for this is available [here](https://gist.github.com/a-r-r-o-w/4dee20250e82f4e44690a02351324a4a). The official recommendation for augmenting prompts is [ChatGLM](https://huggingface.co/THUDM?search_models=chatglm) and a length of 50-100 words is considered good.
-
->![NOTE]
-> It is expected that your dataset is already pre-processed. If not, some basic pre-processing can be done by playing with the following parameters:
-> `--height`, `--width`, `--fps`, `--max_num_frames`, `--skip_frames_start` and `--skip_frames_end`.
-> Presently, all videos in your dataset should contain the same number of video frames when using a training batch size > 1.
-
-<!-- TODO: Implement frame packing in future to address above issue. -->
-
-## Training
-
-You need to setup your development environment by installing the necessary requirements. The following packages are required:
- Torch 2.0 or above based on the training features you are utilizing (might require latest or nightly versions for quantized/deepspeed training)
- `pip install diffusers transformers accelerate peft huggingface_hub` for all things modeling and training related
- `pip install datasets decord` for loading video training data
- `pip install bitsandbytes` for using 8-bit Adam or AdamW optimizers for memory-optimized training
- `pip install wandb` optionally for monitoring training logs
- `pip install deepspeed` optionally for [DeepSpeed](https://github.com/microsoft/DeepSpeed) training
- `pip install prodigyopt` optionally if you would like to use the Prodigy optimizer for training
-
-To make sure you can successfully run the latest versions of the example scripts, we highly recommend **installing from source** and keeping the install up to date as we update the example scripts frequently and install some example-specific requirements. To do this, execute the following steps in a new virtual environment:
-
-Before running the script, make sure you install the library from source:
-```bash
-git clone https://github.com/huggingface/diffusers
-cd diffusers
-pip install -e .
-```
-
- 
-
-Then navigate to the example folder containing the training script and install the required dependencies for the script you're using:
-
- PyTorch
-
-```bash
-cd examples/cogvideo
-pip install -r requirements.txt
-```
-
-And initialize an [🤗 Accelerate](https://github.com/huggingface/accelerate/) environment with:
-
-```bash
-accelerate config
-```
-
-Or for a default accelerate configuration without answering questions about your environment
-
-```bash
-accelerate config default
-```
-
-Or if your environment doesn't support an interactive shell (e.g., a notebook)
-
-```python
-from accelerate.utils import write_basic_config
-write_basic_config()
-```
-
-When running `accelerate config`, if you use torch.compile, there can be dramatic speedups. The PEFT library is used as a backend for LoRA training, so make sure to have `peft>=0.6.0` installed in your environment.
-
-If you would like to push your model to the Hub after training is completed with a neat model card, make sure you're logged in:
-
-```bash
-huggingface-cli login
-
-# Alternatively, you could upload your model manually using:
-# huggingface-cli upload my-cool-account-name/my-cool-lora-name /path/to/awesome/lora
-```
-
-Make sure your data is prepared as described in [Data Preparation](#data-preparation). When ready, you can begin training!
-
-Assuming you are training on 50 videos of a similar concept, we have found 1500-2000 steps to work well. The official recommendation, however, is 100 videos with a total of 4000 steps. Assuming you are training on a single GPU with a `--train_batch_size` of `1`:
- 1500 steps on 50 videos would correspond to `30` training epochs
- 4000 steps on 100 videos would correspond to `40` training epochs
-
-```bash
-#!/bin/bash
-
-GPU_IDS="0"
-
-accelerate launch --gpu_ids $GPU_IDS examples/cogvideo/train_cogvideox_lora.py \
-  --pretrained_model_name_or_path THUDM/CogVideoX-2b \
-  --cache_dir <CACHE_DIR> \
-  --instance_data_root <PATH_TO_WHERE_VIDEO_FILES_ARE_STORED> \
-  --dataset_name my-awesome-name/my-awesome-dataset \
-  --caption_column <CAPTION_COLUMN> \
-  --video_column <PATH_TO_VIDEO_COLUMN> \
-  --id_token <ID_TOKEN> \
-  --validation_prompt "<ID_TOKEN> Spiderman swinging over buildings:::A panda, dressed in a small, red jacket and a tiny hat, sits on a wooden stool in a serene bamboo forest. The panda's fluffy paws strum a miniature acoustic guitar, producing soft, melodic tunes. Nearby, a few other pandas gather, watching curiously and some clapping in rhythm. Sunlight filters through the tall bamboo, casting a gentle glow on the scene. The panda's face is expressive, showing concentration and joy as it plays. The background includes a small, flowing stream and vibrant green foliage, enhancing the peaceful and magical atmosphere of this unique musical performance" \
-  --validation_prompt_separator ::: \
-  --num_validation_videos 1 \
-  --validation_epochs 10 \
-  --seed 42 \
-  --rank 64 \
-  --lora_alpha 64 \
-  --mixed_precision fp16 \
-  --output_dir /raid/aryan/cogvideox-lora \
-  --height 480 --width 720 --fps 8 --max_num_frames 49 --skip_frames_start 0 --skip_frames_end 0 \
-  --train_batch_size 1 \
-  --num_train_epochs 30 \
-  --checkpointing_steps 1000 \
-  --gradient_accumulation_steps 1 \
-  --learning_rate 1e-3 \
-  --lr_scheduler cosine_with_restarts \
-  --lr_warmup_steps 200 \
-  --lr_num_cycles 1 \
-  --enable_slicing \
-  --enable_tiling \
-  --optimizer Adam \
-  --adam_beta1 0.9 \
-  --adam_beta2 0.95 \
-  --max_grad_norm 1.0 \
-  --report_to wandb
-```
-
-To better track our training experiments, we're using the following flags in the command above:
-* `--report_to wandb` will ensure the training runs are tracked on Weights and Biases. To use it, be sure to install `wandb` with `pip install wandb`.
-* `validation_prompt` and `validation_epochs` to allow the script to do a few validation inference runs. This allows us to qualitatively check if the training is progressing as expected.
-
-Setting the `<ID_TOKEN>` is not necessary. From some limited experimentation, we found it works better (as it resembles [Dreambooth](https://huggingface.co/docs/diffusers/en/training/dreambooth) training) than without. When provided, the `<ID_TOKEN>` is appended to the beginning of each prompt. So, if your `<ID_TOKEN>` was `"DISNEY"` and your prompt was `"Spiderman swinging over buildings"`, the effective prompt used in training would be `"DISNEY Spiderman swinging over buildings"`. When not provided, you would either be training without any additional token or could augment your dataset to apply the token where you wish before starting the training.
-
-> [!NOTE]
-> You can pass `--use_8bit_adam` to reduce the memory requirements of training.
-
-> [!IMPORTANT]
-> The following settings have been tested at the time of adding CogVideoX LoRA training support:
-> - Our testing was primarily done on CogVideoX-2b. We will work on CogVideoX-5b and CogVideoX-5b-I2V soon
-> - One dataset comprised of 70 training videos of resolutions `200 x 480 x 720` (F x H x W). From this, by using frame skipping in data preprocessing, we created two smaller 49-frame and 16-frame datasets for faster experimentation and because the maximum limit recommended by the CogVideoX team is 49 frames. Out of the 70 videos, we created three groups of 10, 25 and 50 videos. All videos were similar in nature of the concept being trained.
-> - 25+ videos worked best for training new concepts and styles.
-> - We found that it is better to train with an identifier token that can be specified as `--id_token`. This is similar to Dreambooth-like training but normal finetuning without such a token works too.
-> - Trained concept seemed to work decently well when combined with completely unrelated prompts. We expect even better results if CogVideoX-5B is finetuned.
-> - The original repository uses a `lora_alpha` of `1`. We found this not suitable in many runs, possibly due to difference in modeling backends and training settings. Our recommendation is to set to the `lora_alpha` to either `rank` or `rank // 2`.
-> - If you're training on data whose captions generate bad results with the original model, a `rank` of 64 and above is good and also the recommendation by the team behind CogVideoX. If the generations are already moderately good on your training captions, a `rank` of 16/32 should work. We found that setting the rank too low, say `4`, is not ideal and doesn't produce promising results.
-> - The authors of CogVideoX recommend 4000 training steps and 100 training videos overall to achieve the best result. While that might yield the best results, we found from our limited experimentation that 2000 steps and 25 videos could also be sufficient.
-> - When using the Prodigy opitimizer for training, one can follow the recommendations from [this](https://huggingface.co/blog/sdxl_lora_advanced_script) blog. Prodigy tends to overfit quickly. From my very limited testing, I found a learning rate of `0.5` to be suitable in addition to `--prodigy_use_bias_correction`, `prodigy_safeguard_warmup` and `--prodigy_decouple`.
-> - The recommended learning rate by the CogVideoX authors and from our experimentation with Adam/AdamW is between `1e-3` and `1e-4` for a dataset of 25+ videos.
->
-> Note that our testing is not exhaustive due to limited time for exploration. Our recommendation would be to play around with the different knobs and dials to find the best settings for your data.
-
-<!-- TODO: Test finetuning with CogVideoX-5b and CogVideoX-5b-I2V and update scripts accordingly -->
-
-## Inference
-
-Once you have trained a lora model, the inference can be done simply loading the lora weights into the `CogVideoXPipeline`.
-
-```python
-import torch
-from diffusers import CogVideoXPipeline
-from diffusers.utils import export_to_video
-
-pipe = CogVideoXPipeline.from_pretrained("THUDM/CogVideoX-2b", torch_dtype=torch.float16)
-# pipe.load_lora_weights("/path/to/lora/weights", adapter_name="cogvideox-lora") # Or,
-pipe.load_lora_weights("my-awesome-hf-username/my-awesome-lora-name", adapter_name="cogvideox-lora") # If loading from the HF Hub
-pipe.to("cuda")
-
-# Assuming lora_alpha=32 and rank=64 for training. If different, set accordingly
-pipe.set_adapters(["cogvideox-lora"], [32 / 64])
-
-prompt = "A vast, shimmering ocean flows gracefully under a twilight sky, its waves undulating in a mesmerizing dance of blues and greens. The surface glints with the last rays of the setting sun, casting golden highlights that ripple across the water. Seagulls soar above, their cries blending with the gentle roar of the waves. The horizon stretches infinitely, where the ocean meets the sky in a seamless blend of hues. Close-ups reveal the intricate patterns of the waves, capturing the fluidity and dynamic beauty of the sea in motion."
-frames = pipe(prompt, guidance_scale=6, use_dynamic_cfg=True).frames[0]
-export_to_video(frames, "output.mp4", fps=8)
-```
-
-
-## Reduce memory usage
-
-While testing using the diffusers library, all optimizations included in the diffusers library were enabled. This
-scheme has not been tested for actual memory usage on devices outside of **NVIDIA A100 / H100** architectures.
-Generally, this scheme can be adapted to all **NVIDIA Ampere architecture** and above devices. If optimizations are
-disabled, memory consumption will multiply, with peak memory usage being about 3 times the value in the table.
-However, speed will increase by about 3-4 times. You can selectively disable some optimizations, including:
-
-```
-pipe.enable_sequential_cpu_offload()
-pipe.vae.enable_slicing()
-pipe.vae.enable_tiling()
-```
-
-+ For multi-GPU inference, the `enable_sequential_cpu_offload()` optimization needs to be disabled.
-+ Using INT8 models will slow down inference, which is done to accommodate lower-memory GPUs while maintaining minimal
-  video quality loss, though inference speed will significantly decrease.
-+ The CogVideoX-2B model was trained in `FP16` precision, and all CogVideoX-5B models were trained in `BF16` precision.
-  We recommend using the precision in which the model was trained for inference.
-+ [PytorchAO](https://github.com/pytorch/ao) and [Optimum-quanto](https://github.com/huggingface/optimum-quanto/) can be
-  used to quantize the text encoder, transformer, and VAE modules to reduce the memory requirements of CogVideoX. This
-  allows the model to run on free T4 Colabs or GPUs with smaller memory! Also, note that TorchAO quantization is fully
-  compatible with `torch.compile`, which can significantly improve inference speed. FP8 precision must be used on
-  devices with NVIDIA H100 and above, requiring source installation of `torch`, `torchao`, `diffusers`, and `accelerate`
-  Python packages. CUDA 12.4 is recommended.
-+ The inference speed tests also used the above memory optimization scheme. Without memory optimization, inference speed
-  increases by about 10%. Only the `diffusers` version of the model supports quantization.
-+ The model only supports English input; other languages can be translated into English for use via large model
-  refinement.
-+ The memory usage of model fine-tuning is tested in an `8 * H100` environment, and the program automatically
-  uses `Zero 2` optimization. If a specific number of GPUs is marked in the table, that number or more GPUs must be used
-  for fine-tuning.
-
-
- | **Attribute**                        | **CogVideoX-2B**                                                       | **CogVideoX-5B**                                                       |
-| ------------------------------------ | ---------------------------------------------------------------------- | ---------------------------------------------------------------------- |
-| **Model Name**                       | CogVideoX-2B                                                           | CogVideoX-5B                                                           |
-| **Inference Precision**              | FP16* (Recommended), BF16, FP32, FP8*, INT8, Not supported INT4         | BF16 (Recommended), FP16, FP32, FP8*, INT8, Not supported INT4         |
-| **Single GPU Inference VRAM**        | FP16: Using diffusers 12.5GB* INT8: Using diffusers with torchao 7.8GB* | BF16: Using diffusers 20.7GB* INT8: Using diffusers with torchao 11.4GB* |
-| **Multi GPU Inference VRAM**         | FP16: Using diffusers 10GB*                                             | BF16: Using diffusers 15GB*                                             |
-| **Inference Speed**                  | Single A100: ~90 seconds, Single H100: ~45 seconds                      | Single A100: ~180 seconds, Single H100: ~90 seconds                     |
-| **Fine-tuning Precision**            | FP16                                                                   | BF16                                                                   |
-| **Fine-tuning VRAM Consumption**     | 47 GB (bs=1, LORA) 61 GB (bs=2, LORA) 62GB (bs=1, SFT)                 | 63 GB (bs=1, LORA) 80 GB (bs=2, LORA) 75GB (bs=1, SFT)                 |
--- a/docs/source/en/training/controlnet.md
+++ b/docs/source/en/training/controlnet.md
@@ -276,7 +276,7 @@ That's it! You don't need to add any additional parameters to your training comm
 <hfoption id="PyTorch">

 ```bash
-export MODEL_DIR="stable-diffusion-v1-5/stable-diffusion-v1-5"
+export MODEL_DIR="runwayml/stable-diffusion-v1-5"
 export OUTPUT_DIR="path/to/save/model"

 accelerate launch train_controlnet.py \
--- a/docs/source/en/training/create_dataset.md
+++ b/docs/source/en/training/create_dataset.md
@@ -78,7 +78,7 @@ Now the dataset is available for training by passing the dataset name to the `--

 ```bash
 accelerate launch --mixed_precision="fp16"  train_text_to_image.py \
-  --pretrained_model_name_or_path="stable-diffusion-v1-5/stable-diffusion-v1-5" \
+  --pretrained_model_name_or_path="runwayml/stable-diffusion-v1-5" \
  --dataset_name="name_of_your_dataset" \
  <other-arguments>
 ```
--- a/docs/source/en/training/distributed_inference.md
+++ b/docs/source/en/training/distributed_inference.md
@@ -10,7 +10,7 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
 specific language governing permissions and limitations under the License.
 -->

-# Distributed inference
+# Distributed inference with multiple GPUs

 On distributed setups, you can run inference across multiple GPUs with 🤗 [Accelerate](https://huggingface.co/docs/accelerate/index) or [PyTorch Distributed](https://pytorch.org/tutorials/beginner/dist_overview.html), which is useful for generating with multiple prompts in parallel.

@@ -30,7 +30,7 @@ from accelerate import PartialState
 from diffusers import DiffusionPipeline

 pipeline = DiffusionPipeline.from_pretrained(
-    "stable-diffusion-v1-5/stable-diffusion-v1-5", torch_dtype=torch.float16, use_safetensors=True
+    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16, use_safetensors=True
 )
 distributed_state = PartialState()
 pipeline.to(distributed_state.device)
@@ -48,7 +48,7 @@ accelerate launch run_distributed.py --num_processes=2

 <Tip>

-Refer to this minimal example [script](https://gist.github.com/sayakpaul/cfaebd221820d7b43fae638b4dfa01ba) for running inference across multiple GPUs. To learn more, take a look at the [Distributed Inference with 🤗 Accelerate](https://huggingface.co/docs/accelerate/en/usage_guides/distributed_inference#distributed-inference-with-accelerate) guide.
+To learn more, take a look at the [Distributed Inference with 🤗 Accelerate](https://huggingface.co/docs/accelerate/en/usage_guides/distributed_inference#distributed-inference-with-accelerate) guide.

 </Tip>

@@ -66,7 +66,7 @@ import torch.multiprocessing as mp
 from diffusers import DiffusionPipeline

 sd = DiffusionPipeline.from_pretrained(
-    "stable-diffusion-v1-5/stable-diffusion-v1-5", torch_dtype=torch.float16, use_safetensors=True
+    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16, use_safetensors=True
 )
 ```

@@ -108,132 +108,4 @@ torchrun run_distributed.py --nproc_per_node=2
 ```

 > [!TIP]
-> You can use `device_map` within a [`DiffusionPipeline`] to distribute its model-level components on multiple devices. Refer to the [Device placement](../tutorials/inference_with_big_models#device-placement) guide to learn more.
-
-## Model sharding
-
-Modern diffusion systems such as [Flux](../api/pipelines/flux) are very large and have multiple models. For example, [Flux.1-Dev](https://hf.co/black-forest-labs/FLUX.1-dev) is made up of two text encoders - [T5-XXL](https://hf.co/google/t5-v1_1-xxl) and [CLIP-L](https://hf.co/openai/clip-vit-large-patch14) - a [diffusion transformer](../api/models/flux_transformer), and a [VAE](../api/models/autoencoderkl). With a model this size, it can be challenging to run inference on consumer GPUs.
-
-Model sharding is a technique that distributes models across GPUs when the models don't fit on a single GPU. The example below assumes two 16GB GPUs are available for inference.
-
-Start by computing the text embeddings with the text encoders. Keep the text encoders on two GPUs by setting `device_map="balanced"`. The `balanced` strategy evenly distributes the model on all available GPUs. Use the `max_memory` parameter to allocate the maximum amount of memory for each text encoder on each GPU.
-
-> [!TIP]
-> **Only** load the text encoders for this step! The diffusion transformer and VAE are loaded in a later step to preserve memory.
-
-```py
-from diffusers import FluxPipeline
-import torch
-
-prompt = "a photo of a dog with cat-like look"
-
-pipeline = FluxPipeline.from_pretrained(
-    "black-forest-labs/FLUX.1-dev",
-    transformer=None,
-    vae=None,
-    device_map="balanced",
-    max_memory={0: "16GB", 1: "16GB"},
-    torch_dtype=torch.bfloat16
-)
-with torch.no_grad():
-    print("Encoding prompts.")
-    prompt_embeds, pooled_prompt_embeds, text_ids = pipeline.encode_prompt(
-        prompt=prompt, prompt_2=None, max_sequence_length=512
-    )
-```
-
-Once the text embeddings are computed, remove them from the GPU to make space for the diffusion transformer.
-
-```py
-import gc 
-
-def flush():
-    gc.collect()
-    torch.cuda.empty_cache()
-    torch.cuda.reset_max_memory_allocated()
-    torch.cuda.reset_peak_memory_stats()
-
-del pipeline.text_encoder
-del pipeline.text_encoder_2
-del pipeline.tokenizer
-del pipeline.tokenizer_2
-del pipeline
-
-flush()
-```
-
-Load the diffusion transformer next which has 12.5B parameters. This time, set `device_map="auto"` to automatically distribute the model across two 16GB GPUs. The `auto` strategy is backed by [Accelerate](https://hf.co/docs/accelerate/index) and available as a part of the [Big Model Inference](https://hf.co/docs/accelerate/concept_guides/big_model_inference) feature. It starts by distributing a model across the fastest device first (GPU) before moving to slower devices like the CPU and hard drive if needed. The trade-off of storing model parameters on slower devices is slower inference latency.
-
-```py
-from diffusers import FluxTransformer2DModel
-import torch 
-
-transformer = FluxTransformer2DModel.from_pretrained(
-    "black-forest-labs/FLUX.1-dev", 
-    subfolder="transformer",
-    device_map="auto",
-    torch_dtype=torch.bfloat16
-)
-```
-
-> [!TIP]
-> At any point, you can try `print(pipeline.hf_device_map)` to see how the various models are distributed across devices. This is useful for tracking the device placement of the models. You can also try `print(transformer.hf_device_map)` to see how the transformer model is sharded across devices.
-
-Add the transformer model to the pipeline for denoising, but set the other model-level components like the text encoders and VAE to `None` because you don't need them yet.
-
-```py
-pipeline = FluxPipeline.from_pretrained(
-    "black-forest-labs/FLUX.1-dev",
-    text_encoder=None,
-    text_encoder_2=None,
-    tokenizer=None,
-    tokenizer_2=None,
-    vae=None,
-    transformer=transformer,
-    torch_dtype=torch.bfloat16
-)
-
-print("Running denoising.")
-height, width = 768, 1360
-latents = pipeline(
-    prompt_embeds=prompt_embeds,
-    pooled_prompt_embeds=pooled_prompt_embeds,
-    num_inference_steps=50,
-    guidance_scale=3.5,
-    height=height,
-    width=width,
-    output_type="latent",
-).images
-```
-
-Remove the pipeline and transformer from memory as they're no longer needed.
-
-```py
-del pipeline.transformer
-del pipeline
-
-flush()
-```
-
-Finally, decode the latents with the VAE into an image. The VAE is typically small enough to be loaded on a single GPU.
-
-```py
-from diffusers import AutoencoderKL
-from diffusers.image_processor import VaeImageProcessor
-import torch 
-
-vae = AutoencoderKL.from_pretrained(ckpt_id, subfolder="vae", torch_dtype=torch.bfloat16).to("cuda")
-vae_scale_factor = 2 ** (len(vae.config.block_out_channels))
-image_processor = VaeImageProcessor(vae_scale_factor=vae_scale_factor)
-
-with torch.no_grad():
-    print("Running decoding.")
-    latents = FluxPipeline._unpack_latents(latents, height, width, vae_scale_factor)
-    latents = (latents / vae.config.scaling_factor) + vae.config.shift_factor
-
-    image = vae.decode(latents, return_dict=False)[0]
-    image = image_processor.postprocess(image, output_type="pil")
-    image[0].save("split_transformer.png")
-```
-
-By selectively loading and unloading the models you need at a given stage and sharding the largest models across multiple GPUs, it is possible to run inference with large models on consumer GPUs.
+> You can use `device_map` within a [`DiffusionPipeline`] to distribute its model-level components on multiple devices. Refer to the [Device placement](../tutorials/inference_with_big_models#device-placement) guide to learn more.
--- a/docs/source/en/training/dreambooth.md
+++ b/docs/source/en/training/dreambooth.md
@@ -315,7 +315,7 @@ That's it! You don't need to add any additional parameters to your training comm
 <hfoption id="PyTorch">

 ```bash
-export MODEL_NAME="stable-diffusion-v1-5/stable-diffusion-v1-5"
+export MODEL_NAME="runwayml/stable-diffusion-v1-5"
 export INSTANCE_DIR="./dog"
 export OUTPUT_DIR="path_to_saved_model"

@@ -374,7 +374,7 @@ unet = UNet2DConditionModel.from_pretrained("path/to/model/checkpoint-100/unet")
 text_encoder = CLIPTextModel.from_pretrained("path/to/model/checkpoint-100/checkpoint-100/text_encoder")

 pipeline = DiffusionPipeline.from_pretrained(
-    "stable-diffusion-v1-5/stable-diffusion-v1-5", unet=unet, text_encoder=text_encoder, dtype=torch.float16,
+    "runwayml/stable-diffusion-v1-5", unet=unet, text_encoder=text_encoder, dtype=torch.float16,
 ).to("cuda")

 image = pipeline("A photo of sks dog in a bucket", num_inference_steps=50, guidance_scale=7.5).images[0]
--- a/docs/source/en/training/instructpix2pix.md
+++ b/docs/source/en/training/instructpix2pix.md
@@ -14,7 +14,7 @@ specific language governing permissions and limitations under the License.

 [InstructPix2Pix](https://hf.co/papers/2211.09800) is a Stable Diffusion model trained to edit images from human-provided instructions. For example, your prompt can be "turn the clouds rainy" and the model will edit the input image accordingly. This model is conditioned on the text prompt (or editing instruction) and the input image.

-This guide will explore the [train_instruct_pix2pix.py](https://github.com/huggingface/diffusers/blob/main/examples/instruct_pix2pix/train_instruct_pix2pix.py) training script to help you become familiar with it, and how you can adapt it for your own use case.
+This guide will explore the [train_instruct_pix2pix.py](https://github.com/huggingface/diffusers/blob/main/examples/instruct_pix2pix/train_instruct_pix2pix.py) training script to help you become familiar with it, and how you can adapt it for your own use-case.

 Before running the script, make sure you install the library from source:

@@ -117,7 +117,7 @@ optimizer = optimizer_cls(
 )
 ```

-Next, the edited images and edit instructions are [preprocessed](https://github.com/huggingface/diffusers/blob/64603389da01082055a901f2883c4810d1144edb/examples/instruct_pix2pix/train_instruct_pix2pix.py#L624) and [tokenized](https://github.com/huggingface/diffusers/blob/64603389da01082055a901f2883c4810d1144edb/examples/instruct_pix2pix/train_instruct_pix2pix.py#L610C24-L610C24). It is important the same image transformations are applied to the original and edited images.
+Next, the edited images and and edit instructions are [preprocessed](https://github.com/huggingface/diffusers/blob/64603389da01082055a901f2883c4810d1144edb/examples/instruct_pix2pix/train_instruct_pix2pix.py#L624) and [tokenized](https://github.com/huggingface/diffusers/blob/64603389da01082055a901f2883c4810d1144edb/examples/instruct_pix2pix/train_instruct_pix2pix.py#L610C24-L610C24). It is important the same image transformations are applied to the original and edited images.

 ```py
 def preprocess_train(examples):
@@ -249,4 +249,4 @@ The SDXL training script is discussed in more detail in the [SDXL training](sdxl

 Congratulations on training your own InstructPix2Pix model! 🥳 To learn more about the model, it may be helpful to:

- Read the [Instruction-tuning Stable Diffusion with InstructPix2Pix](https://huggingface.co/blog/instruction-tuning-sd) blog post to learn more about some experiments we've done with InstructPix2Pix, dataset preparation, and results for different instructions.
+- Read the [Instruction-tuning Stable Diffusion with InstructPix2Pix](https://huggingface.co/blog/instruction-tuning-sd) blog post to learn more about some experiments we've done with InstructPix2Pix, dataset preparation, and results for different instructions.
--- a/docs/source/en/training/lcm_distill.md
+++ b/docs/source/en/training/lcm_distill.md
@@ -193,7 +193,7 @@ Now you're ready to launch the training script and start distilling!
 For this guide, you'll use the `--train_shards_path_or_url` to specify the path to the [Conceptual Captions 12M](https://github.com/google-research-datasets/conceptual-12m) dataset stored on the Hub [here](https://huggingface.co/datasets/laion/conceptual-captions-12m-webdataset). Set the `MODEL_DIR` environment variable to the name of the teacher model and `OUTPUT_DIR` to where you want to save the model.

 ```bash
-export MODEL_DIR="stable-diffusion-v1-5/stable-diffusion-v1-5"
+export MODEL_DIR="runwayml/stable-diffusion-v1-5"
 export OUTPUT_DIR="path/to/saved/model"

 accelerate launch train_lcm_distill_sd_wds.py \
@@ -225,7 +225,7 @@ from diffusers import UNet2DConditionModel, DiffusionPipeline, LCMScheduler
 import torch

 unet = UNet2DConditionModel.from_pretrained("your-username/your-model", torch_dtype=torch.float16, variant="fp16")
-pipeline = DiffusionPipeline.from_pretrained("stable-diffusion-v1-5/stable-diffusion-v1-5", unet=unet, torch_dtype=torch.float16, variant="fp16")
+pipeline = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", unet=unet, torch_dtype=torch.float16, variant="fp16")

 pipeline.scheduler = LCMScheduler.from_config(pipeline.scheduler.config)
 pipeline.to("cuda")
--- a/docs/source/en/training/lora.md
+++ b/docs/source/en/training/lora.md
@@ -184,7 +184,7 @@ A full training run takes ~5 hours on a 2080 Ti GPU with 11GB of VRAM.
 </Tip>

 ```bash
-export MODEL_NAME="stable-diffusion-v1-5/stable-diffusion-v1-5"
+export MODEL_NAME="runwayml/stable-diffusion-v1-5"
 export OUTPUT_DIR="/sddata/finetune/lora/naruto"
 export HUB_MODEL_ID="naruto-lora"
 export DATASET_NAME="lambdalabs/naruto-blip-captions"
@@ -218,7 +218,7 @@ Once training has been completed, you can use your model for inference:
 from diffusers import AutoPipelineForText2Image
 import torch

-pipeline = AutoPipelineForText2Image.from_pretrained("stable-diffusion-v1-5/stable-diffusion-v1-5", torch_dtype=torch.float16).to("cuda")
+pipeline = AutoPipelineForText2Image.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16).to("cuda")
 pipeline.load_lora_weights("path/to/lora/model", weight_name="pytorch_lora_weights.safetensors")
 image = pipeline("A naruto with blue eyes").images[0]
 ```
--- a/docs/source/en/training/text2image.md
+++ b/docs/source/en/training/text2image.md
@@ -167,7 +167,7 @@ To train on a local dataset, set the `TRAIN_DIR` and `OUTPUT_DIR` environment va
 </Tip>

 ```bash
-export MODEL_NAME="stable-diffusion-v1-5/stable-diffusion-v1-5"
+export MODEL_NAME="runwayml/stable-diffusion-v1-5"
 export dataset_name="lambdalabs/naruto-blip-captions"

 accelerate launch --mixed_precision="fp16"  train_text_to_image.py \
@@ -201,7 +201,7 @@ To train on a local dataset, set the `TRAIN_DIR` and `OUTPUT_DIR` environment va
 </Tip>

 ```bash
-export MODEL_NAME="stable-diffusion-v1-5/stable-diffusion-v1-5"
+export MODEL_NAME="runwayml/stable-diffusion-v1-5"
 export dataset_name="lambdalabs/naruto-blip-captions"

 python train_text_to_image_flax.py \
--- a/docs/source/en/training/text_inversion.md
+++ b/docs/source/en/training/text_inversion.md
@@ -193,7 +193,7 @@ One more thing before you launch the script. If you're interested in following a
 <hfoption id="PyTorch">

 ```bash
-export MODEL_NAME="stable-diffusion-v1-5/stable-diffusion-v1-5"
+export MODEL_NAME="runwayml/stable-diffusion-v1-5"
 export DATA_DIR="./cat"

 accelerate launch textual_inversion.py \
@@ -248,7 +248,7 @@ After training is complete, you can use your newly trained model for inference l
 from diffusers import StableDiffusionPipeline
 import torch

-pipeline = StableDiffusionPipeline.from_pretrained("stable-diffusion-v1-5/stable-diffusion-v1-5", torch_dtype=torch.float16).to("cuda")
+pipeline = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16).to("cuda")
 pipeline.load_textual_inversion("sd-concepts-library/cat-toy")
 image = pipeline("A <cat-toy> train", num_inference_steps=50).images[0]
 image.save("cat-train.png")
--- a/docs/source/en/tutorials/inference_with_big_models.md
+++ b/docs/source/en/tutorials/inference_with_big_models.md
@@ -90,8 +90,8 @@ from diffusers import DiffusionPipeline
 import torch

 pipeline = DiffusionPipeline.from_pretrained(
-    "stable-diffusion-v1-5/stable-diffusion-v1-5", torch_dtype=torch.float16, use_safetensors=True,
-+    "stable-diffusion-v1-5/stable-diffusion-v1-5", torch_dtype=torch.float16, use_safetensors=True, device_map="balanced"
+-    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16, use_safetensors=True,
+    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16, use_safetensors=True, device_map="balanced"
 )
 image = pipeline("a dog").images[0]
 image
@@ -105,7 +105,7 @@ import torch

 max_memory = {0:"1GB", 1:"1GB"}
 pipeline = DiffusionPipeline.from_pretrained(
-    "stable-diffusion-v1-5/stable-diffusion-v1-5",
+    "runwayml/stable-diffusion-v1-5",
    torch_dtype=torch.float16,
    use_safetensors=True,
    device_map="balanced",
--- a/docs/source/en/tutorials/using_peft_for_inference.md
+++ b/docs/source/en/tutorials/using_peft_for_inference.md
@@ -34,7 +34,7 @@ pipe_id = "stabilityai/stable-diffusion-xl-base-1.0"
 pipe = DiffusionPipeline.from_pretrained(pipe_id, torch_dtype=torch.float16).to("cuda")
 ```

-Next, load a [CiroN2022/toy-face](https://huggingface.co/CiroN2022/toy-face) adapter with the [`~diffusers.loaders.StableDiffusionXLLoraLoaderMixin.load_lora_weights`] method. With the 🤗 PEFT integration, you can assign a specific `adapter_name` to the checkpoint, which lets you easily switch between different LoRA checkpoints. Let's call this adapter `"toy"`.
+Next, load a [CiroN2022/toy-face](https://huggingface.co/CiroN2022/toy-face) adapter with the [`~diffusers.loaders.StableDiffusionXLLoraLoaderMixin.load_lora_weights`] method. With the 🤗 PEFT integration, you can assign a specific `adapter_name` to the checkpoint, which let's you easily switch between different LoRA checkpoints. Let's call this adapter `"toy"`.

 ```python
 pipe.load_lora_weights("CiroN2022/toy-face", weight_name="toy_face_sdxl.safetensors", adapter_name="toy")
@@ -75,12 +75,6 @@ image

 ![pixel-art](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/peft_integration/diffusers_peft_lora_inference_12_1.png)

-<Tip>
-
-By default, if the most up-to-date versions of PEFT and Transformers are detected, `low_cpu_mem_usage` is set to `True` to speed up the loading time of LoRA checkpoints. 
-
-</Tip>
-
 ## Merge adapters

 You can also merge different adapter checkpoints for inference to blend their styles together.
--- a/docs/source/en/using-diffusers/callback.md
+++ b/docs/source/en/using-diffusers/callback.md
@@ -12,7 +12,7 @@ specific language governing permissions and limitations under the License.

 # Pipeline callbacks

-The denoising loop of a pipeline can be modified with custom defined functions using the `callback_on_step_end` parameter. The callback function is executed at the end of each step, and modifies the pipeline attributes and variables for the next step. This is really useful for *dynamically* adjusting certain pipeline attributes or modifying tensor variables. This versatility allows for interesting use cases such as changing the prompt embeddings at each timestep, assigning different weights to the prompt embeddings, and editing the guidance scale. With callbacks, you can implement new features without modifying the underlying code!
+The denoising loop of a pipeline can be modified with custom defined functions using the `callback_on_step_end` parameter. The callback function is executed at the end of each step, and modifies the pipeline attributes and variables for the next step. This is really useful for *dynamically* adjusting certain pipeline attributes or modifying tensor variables. This versatility allows for interesting use-cases such as changing the prompt embeddings at each timestep, assigning different weights to the prompt embeddings, and editing the guidance scale. With callbacks, you can implement new features without modifying the underlying code!

 > [!TIP]
 > 🤗 Diffusers currently only supports `callback_on_step_end`, but feel free to open a [feature request](https://github.com/huggingface/diffusers/issues/new/choose) if you have a cool use-case and require a callback function with a different execution point!
@@ -75,7 +75,7 @@ out.images[0].save("official_callback.png")
    <figcaption class="mt-2 text-center text-sm text-gray-500">without SDXLCFGCutoffCallback</figcaption>
  </div>
  <div>
-    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/with_cfg_callback.png" alt="generated image of a sports car at the road with cfg callback" />
+    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/with_cfg_callback.png" alt="generated image of a a sports car at the road with cfg callback" />
    <figcaption class="mt-2 text-center text-sm text-gray-500">with SDXLCFGCutoffCallback</figcaption>
  </div>
 </div>
@@ -109,7 +109,7 @@ Now, you can pass the callback function to the `callback_on_step_end` parameter
 import torch
 from diffusers import StableDiffusionPipeline

-pipeline = StableDiffusionPipeline.from_pretrained("stable-diffusion-v1-5/stable-diffusion-v1-5", torch_dtype=torch.float16)
+pipeline = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16)
 pipeline = pipeline.to("cuda")

 prompt = "a photo of an astronaut riding a horse on mars"
@@ -139,7 +139,7 @@ In this example, the diffusion process is stopped after 10 steps even though `nu
 ```python
 from diffusers import StableDiffusionPipeline

-pipeline = StableDiffusionPipeline.from_pretrained("stable-diffusion-v1-5/stable-diffusion-v1-5")
+pipeline = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
 pipeline.enable_model_cpu_offload()
 num_inference_steps = 50

@@ -171,13 +171,14 @@ def latents_to_rgb(latents):
    weights = (
        (60, -60, 25, -70),
        (60,  -5, 15, -50),
-        (60,  10, -5, -35),
+        (60,  10, -5, -35)
    )

    weights_tensor = torch.t(torch.tensor(weights, dtype=latents.dtype).to(latents.device))
    biases_tensor = torch.tensor((150, 140, 130), dtype=latents.dtype).to(latents.device)
    rgb_tensor = torch.einsum("...lxy,lr -> ...rxy", latents, weights_tensor) + biases_tensor.unsqueeze(-1).unsqueeze(-1)
-    image_array = rgb_tensor.clamp(0, 255).byte().cpu().numpy().transpose(1, 2, 0)
+    image_array = rgb_tensor.clamp(0, 255)[0].byte().cpu().numpy()
+    image_array = image_array.transpose(1, 2, 0)

    return Image.fromarray(image_array)
 ```
@@ -188,7 +189,7 @@ def latents_to_rgb(latents):
 def decode_tensors(pipe, step, timestep, callback_kwargs):
    latents = callback_kwargs["latents"]

-    image = latents_to_rgb(latents[0])
+    image = latents_to_rgb(latents)
    image.save(f"{step}.png")

    return callback_kwargs
--- a/docs/source/en/using-diffusers/cogvideox.md
+++ b/docs/source/en/using-diffusers/cogvideox.md
@@ -1,120 +0,0 @@
-<!--Copyright 2024 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-->
-# CogVideoX
-
-CogVideoX is a text-to-video generation model focused on creating more coherent videos aligned with a prompt. It achieves this using several methods.
-
- a 3D variational autoencoder that compresses videos spatially and temporally, improving compression rate and video accuracy.
-
- an expert transformer block to help align text and video, and a 3D full attention module for capturing and creating spatially and temporally accurate videos.
- 
-
-
-## Load model checkpoints
-Model weights may be stored in separate subfolders on the Hub or locally, in which case, you should use the [`~DiffusionPipeline.from_pretrained`] method.
-
-
-```py
-from diffusers import CogVideoXPipeline, CogVideoXImageToVideoPipeline
-pipe = CogVideoXPipeline.from_pretrained(
-    "THUDM/CogVideoX-2b",
-    torch_dtype=torch.float16
-)
-
-pipe = CogVideoXImageToVideoPipeline.from_pretrained(
-    "THUDM/CogVideoX-5b-I2V",
-    torch_dtype=torch.bfloat16
-)
-
-```
-
-## Text-to-Video
-For text-to-video, pass a text prompt. By default, CogVideoX generates a 720x480 video for the best results.
-
-```py
-import torch
-from diffusers import CogVideoXPipeline
-from diffusers.utils import export_to_video
-
-prompt = "An elderly gentleman, with a serene expression, sits at the water's edge, a steaming cup of tea by his side. He is engrossed in his artwork, brush in hand, as he renders an oil painting on a canvas that's propped up against a small, weathered table. The sea breeze whispers through his silver hair, gently billowing his loose-fitting white shirt, while the salty air adds an intangible element to his masterpiece in progress. The scene is one of tranquility and inspiration, with the artist's canvas capturing the vibrant hues of the setting sun reflecting off the tranquil sea."
-
-pipe = CogVideoXPipeline.from_pretrained(
-    "THUDM/CogVideoX-5b",
-    torch_dtype=torch.bfloat16
-)
-
-pipe.enable_model_cpu_offload()
-pipe.vae.enable_tiling()
-
-video = pipe(
-    prompt=prompt,
-    num_videos_per_prompt=1,
-    num_inference_steps=50,
-    num_frames=49,
-    guidance_scale=6,
-    generator=torch.Generator(device="cuda").manual_seed(42),
-).frames[0]
-
-export_to_video(video, "output.mp4", fps=8)
-
-```
-
-
-<div class="flex justify-center">
-    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/cogvideox/cogvideox_out.gif" alt="generated image of an astronaut in a jungle"/>
-</div>
-
-
-## Image-to-Video
-
-
-You'll use the [THUDM/CogVideoX-5b-I2V](https://huggingface.co/THUDM/CogVideoX-5b-I2V)  checkpoint for this guide.
-
-```py
-import torch
-from diffusers import CogVideoXImageToVideoPipeline
-from diffusers.utils import export_to_video, load_image
-
-prompt = "A vast, shimmering ocean flows gracefully under a twilight sky, its waves undulating in a mesmerizing dance of blues and greens. The surface glints with the last rays of the setting sun, casting golden highlights that ripple across the water. Seagulls soar above, their cries blending with the gentle roar of the waves. The horizon stretches infinitely, where the ocean meets the sky in a seamless blend of hues. Close-ups reveal the intricate patterns of the waves, capturing the fluidity and dynamic beauty of the sea in motion."
-image = load_image(image="cogvideox_rocket.png")
-pipe = CogVideoXImageToVideoPipeline.from_pretrained(
-    "THUDM/CogVideoX-5b-I2V",
-    torch_dtype=torch.bfloat16
-)
- 
-pipe.vae.enable_tiling()
-pipe.vae.enable_slicing()
-
-video = pipe(
-    prompt=prompt,
-    image=image,
-    num_videos_per_prompt=1,
-    num_inference_steps=50,
-    num_frames=49,
-    guidance_scale=6,
-    generator=torch.Generator(device="cuda").manual_seed(42),
-).frames[0]
-
-export_to_video(video, "output.mp4", fps=8)
-```
-
-<div class="flex gap-4">
-  <div>
-    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/cogvideox/cogvideox_rocket.png"/>
-    <figcaption class="mt-2 text-center text-sm text-gray-500">initial image</figcaption>
-  </div>
-  <div>
-    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/cogvideox/cogvideox_outrocket.gif"/>
-    <figcaption class="mt-2 text-center text-sm text-gray-500">generated video</figcaption>
-  </div>
-</div>
-
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
yiyixuxu	78c125f76f	up	2024-08-06 07:20:35 +02:00
yiyixuxu	9e3b0fe107	add dynamic cfg	2024-08-06 03:59:06 +02:00
Aryan	878f609aa5	remove dynamic guidance scale	2024-08-05 17:52:13 +02:00
zR	32da2e7673	Update lora_conversion_utils.py	2024-08-05 22:20:06 +08:00
zR	9a0b906518	restore	2024-08-05 22:08:50 +08:00
Aryan	b3428ad5f5	Merge branch 'main' into cogvideox-2b	2024-08-05 18:09:09 +05:30
Aryan	70a54a8230	use num_frames instead of num_seconds	2024-08-05 14:38:42 +02:00
yiyixuxu	6f4e60b58b	Merge branch 'cogvideox-2b' of github.com:zRzRzRzRzRzRzR/diffusers into cogvideo-dpm	2024-08-05 14:06:15 +02:00
yiyixuxu	e4d65ccdd7	update	2024-08-05 13:57:42 +02:00
zR	22dcceb858	messages	2024-08-05 19:17:16 +08:00
zR	9c6b8894ff	rename unsample and add some docs	2024-08-05 17:57:54 +08:00
zR	cf7369d418	fix some error	2024-08-05 16:45:41 +08:00
Aryan	123ecef2b9	Merge branch 'main' into cogvideox-2b	2024-08-05 12:35:10 +05:30
zR	511c9ef560	merge remote branch	2024-08-05 14:02:36 +08:00
zR	fb6130fe90	Merge branch 'cogvideox-common-draft-2' of github.com:huggingface/diffusers-new-model-addition-thudm into cogvideox-2b	2024-08-05 14:01:49 +08:00
zR	2d9602cc96	add CogVideoX team, Tsinghua University & ZhipuAI	2024-08-05 13:55:25 +08:00
zR	1b1b737acb	cogvideox branch	2024-08-05 11:28:31 +08:00
Aryan	311845fc77	update docs	2024-08-04 23:46:50 +02:00
Aryan	01c2dff338	update docs	2024-08-04 23:45:56 +02:00
Aryan	03580c07b9	remove cogvideox-specific attention processor	2024-08-04 23:37:31 +02:00
Aryan	fd11c0fbee	Merge branch 'main' into cogvideox-common-draft-2	2024-08-04 23:29:18 +02:00
Aryan	92c8c00756	make fix-copies	2024-08-04 23:29:11 +02:00
Aryan	5781e017dd	Merge branch 'huggingface:main' into main	2024-08-05 02:22:36 +05:30
sayakpaul	90aa8be534	add workflow for rebasing with upstream automatically.	2024-08-04 22:50:51 +02:00
Sayak Paul	2f1b7870e2	Revert "add workflow to rebase with upstream main nightly."	2024-08-04 22:50:51 +02:00
sayakpaul	7360ea1d03	add upstream	2024-08-04 22:50:51 +02:00
sayakpaul	ba1855c07e	add workflow to rebase with upstream main nightly.	2024-08-04 22:50:51 +02:00
Aryan	1b1b26b65c	make style	2024-08-04 16:29:57 +02:00
Aryan	312f7dc4fd	apply suggestions from review	2024-08-04 16:29:35 +02:00
zR	ba4223ac3b	remove attenstion mask	2024-08-04 15:52:37 +08:00
zR	6988cc3a86	new schedule with dpm	2024-08-04 14:26:00 +08:00
Aryan	fa7fa9cced	make inference 2-3x faster (by fixing the bug i introduced) 🚀😎	2024-08-03 18:48:54 +02:00
Aryan	61c6da076a	remove chunked ff code; reuse and refactor to support temb directly in adalayernorm	2024-08-03 17:43:09 +02:00
zR	c7ee165c4f	Update downsampling.py	2024-08-03 22:22:28 +08:00
zR	d99528be94	Restore the timesteps parameter	2024-08-03 16:04:46 +08:00
zR	fd0831c52c	timestep fix	2024-08-03 15:11:12 +08:00
zR	477e12b235	fix	2024-08-03 15:08:07 +08:00
zR	b42b079213	fix some comment	2024-08-03 13:46:57 +08:00
zR	21509aa7f5	fp16 problem	2024-08-03 00:34:55 +08:00
Aryan	65f6211f1f	Merge pull request #4 from huggingface/cogvideox-refactor-to-diffusers CogVideoX pipeline, refactor modeling to diffusers format, bug fixes	2024-08-02 21:20:34 +05:30
Aryan	3def90523d	update	2024-08-02 17:40:36 +02:00
Aryan	551c884acd	remove debug prints	2024-08-02 17:40:23 +02:00
zR	ec53a30a0e	schedule	2024-08-02 22:38:58 +08:00
zR	71e7c82ae8	vae problem fix	2024-08-02 16:23:24 +08:00
Aryan	c33dd0213b	remove incorrect copied from	2024-08-02 01:04:35 +02:00
Aryan	e12458e16c	make style	2024-08-02 01:03:37 +02:00
Aryan	77558f31bf	add pipeline docs	2024-08-02 01:03:02 +02:00
Aryan	41da084fbe	remove debug prints	2024-08-02 00:53:09 +02:00
Aryan	4c2e8870e6	add cogvideo specific attn processor	2024-08-02 00:51:15 +02:00
Aryan	fe6f5d6419	ensure tokenizer config correctly uses 226 as text length	2024-08-01 22:06:08 +02:00
Aryan	d0b8db2b11	update	2024-08-01 21:12:49 +02:00
zR	351d1f009e	remove 0.transformer_blocks.encoder.embed_tokens.weight	2024-08-01 23:31:03 +08:00
zR	a31db5f952	using with 226 instead of 225 of final weight	2024-08-01 22:58:27 +08:00
Aryan	03ee7cd109	add pipeline implementation	2024-08-01 13:52:58 +02:00
Aryan	712ddbeac6	make style	2024-08-01 12:18:18 +02:00
Aryan	03c28eef5b	modeling fixes	2024-08-01 12:18:01 +02:00
Aryan	e05f83479c	refactor vae	2024-08-01 06:05:49 +02:00
zR	bb4740ce29	add clear_fake_cp_cache	2024-08-01 00:26:59 +08:00
zR	2956866ef4	Merge branch 'cogvideox-common-draft-2' of https://github.com/huggingface/diffusers-new-model-addition-thudm into cogvideox-common-draft-2	2024-07-31 21:41:16 +08:00
zR	4498cfc98c	add doc draft	2024-07-31 21:41:09 +08:00
Aryan	a449ceb3ef	update conversion script	2024-07-31 14:40:16 +02:00
Aryan	45f7127ade	fix bug in handling long prompts	2024-07-31 14:28:46 +02:00
Aryan	73469f9562	make fix-copies	2024-07-31 14:13:56 +02:00
Aryan	d45d199b99	make style	2024-07-31 14:13:38 +02:00
Aryan	e67cc5ae47	implement encode prompt	2024-07-31 14:12:22 +02:00
Aryan	470815cefa	minor refactor	2024-07-31 13:38:23 +02:00
Aryan	5f183bfe27	reorder upsampling/downsampling blocks in order of invocation	2024-07-31 13:33:57 +02:00
Aryan	c43a8f5b2b	minor factor and repositioning of code in order of invocation	2024-07-31 13:27:15 +02:00
Aryan	9f9d0cbb83	verify CogVideoXSpatialNorm3D implementation	2024-07-31 13:14:39 +02:00
Aryan	2be7469821	groups->norm_num_groups	2024-07-31 12:44:18 +02:00
Aryan	3ae9413966	undo unnecessary changes added on cogvideo-vae by mistake	2024-07-31 12:42:03 +02:00
Aryan	ec9508c83b	Merge branch 'main' into cogvideox-common-draft-2	2024-07-31 12:40:26 +02:00
Aryan	6bcafcbaa6	make fix-copies	2024-07-31 12:02:58 +02:00
Aryan	b3052807e5	add skeleton for pipeline	2024-07-31 12:02:38 +02:00
Aryan	73b041e7a9	Merge branch 'cogvideox' into cogvideox-common-draft-1	2024-07-31 11:41:28 +02:00
zR	1c661ce3d4	remove deriving and using nn.module	2024-07-31 17:19:22 +08:00
zR	8fe54bcd26	add	2024-07-31 16:42:47 +08:00
zR	ee40f0e1ca	follow review guide	2024-07-31 14:31:21 +08:00
sayakpaul	0980f4dcd2	add workflow for rebasing with upstream automatically.	2024-07-31 08:35:22 +05:30
Sayak Paul	71bcb1e1c5	Revert "add workflow to rebase with upstream main nightly."	2024-07-31 08:35:22 +05:30
sayakpaul	dfeb32975d	add upstream	2024-07-31 08:35:22 +05:30
sayakpaul	d83c1f8447	add workflow to rebase with upstream main nightly.	2024-07-31 08:35:22 +05:30
Aryan	21a0fc1b0d	make style	2024-07-31 02:55:08 +02:00
Aryan	16967589d8	remove debug prints	2024-07-31 02:54:41 +02:00
Aryan	d963b1aaa4	update conversion script for latest modeling changes	2024-07-31 02:54:32 +02:00
Aryan	e982881716	refactor	2024-07-31 02:52:53 +02:00
Aryan	cb5348a0c2	fix nasty bug in 3d sincos pos embeds	2024-07-31 01:26:32 +02:00
zR	aff72ec5dc	Update autoencoder_kl3d.py	2024-07-30 22:43:09 +08:00
zR	dc7e6e814f	fix error	2024-07-30 22:33:34 +08:00
zR	a3d827fb8d	rename	2024-07-30 22:00:53 +08:00
zR	84ff56eb90	fix with some review guide	2024-07-30 21:05:04 +08:00
Aryan	45cb1f92d3	fix layernorms	2024-07-30 14:57:57 +02:00
Aryan	59e6669f6d	fix attention mask	2024-07-30 11:44:42 +02:00
Aryan	bb917755ee	add imports	2024-07-30 09:04:45 +02:00
Aryan	bd6efd5fe4	initial draft of cogvideo transformer	2024-07-30 09:02:06 +02:00
zR	c341786f3e	vae draft	2024-07-30 13:25:08 +08:00
zR	c8e5491be0	Create autoencoder_kl3d.py	2024-07-30 13:15:58 +08:00