Merge branch 'main' into qwenimage-lru-cache-bypass

add attentionmixin to qwen image (#12219)
Support ControlNet for Qwen-Image (#12215)
2025-12-07 21:14:44 +08:00 · 2025-08-23 08:34:44 +05:30 · 2025-08-23 04:48:32 +05:30 · 2025-08-22 11:00:01 -10:00 · 2025-08-22 13:01:24 -07:00 · 2025-08-22 10:42:13 -07:00
1605 changed files with 87956 additions and 23606 deletions
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@@ -11,20 +11,21 @@ env:
  HF_HOME: /mnt/cache
  OMP_NUM_THREADS: 8
  MKL_NUM_THREADS: 8
  BASE_PATH: benchmark_outputs
 jobs:
-  torch_pipelines_cuda_benchmark_tests:
+  torch_models_cuda_benchmark_tests:
    env:
      SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL_BENCHMARK }}
-    name: Torch Core Pipelines CUDA Benchmarking Tests
+    name: Torch Core Models CUDA Benchmarking Tests
    strategy:
      fail-fast: false
      max-parallel: 1
    runs-on:
-      group: aws-g6-4xlarge-plus
+      group: aws-g6e-4xlarge
    container:
-      image: diffusers/diffusers-pytorch-compile-cuda
+      image: diffusers/diffusers-pytorch-cuda
-      options: --shm-size "16gb" --ipc host --gpus 0
+      options: --shm-size "16gb" --ipc host --gpus all
    steps:
      - name: Checkout diffusers
        uses: actions/checkout@v3
@@ -35,27 +36,47 @@ jobs:
          nvidia-smi
      - name: Install dependencies
        run: |
          apt update
          apt install -y libpq-dev postgresql-client
          python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
          python -m uv pip install -e [quality,test]
-          python -m uv pip install pandas peft
+          python -m uv pip install -r benchmarks/requirements.txt
          python -m uv pip uninstall transformers && python -m uv pip install transformers==4.48.0
      - name: Environment
        run: |
          python utils/print_env.py
      - name: Diffusers Benchmarking
        env:
-            HF_TOKEN: ${{ secrets.DIFFUSERS_BOT_TOKEN }}
+          HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
            BASE_PATH: benchmark_outputs
        run: |
-          export TOTAL_GPU_MEMORY=$(python -c "import torch; print(torch.cuda.get_device_properties(0).total_memory / (1024**3))")
+          cd benchmarks && python run_all.py
-          cd benchmarks && mkdir ${BASE_PATH} && python run_all.py && python push_results.py
+
      - name: Push results to the Hub
        env: 
          HF_TOKEN: ${{ secrets.DIFFUSERS_BOT_TOKEN }}
        run: |
          cd benchmarks && python push_results.py
          mkdir $BASE_PATH && cp *.csv $BASE_PATH
      - name: Test suite reports artifacts
        if: ${{ always() }}
        uses: actions/upload-artifact@v4
        with:
          name: benchmark_test_reports
-          path: benchmarks/benchmark_outputs
+          path: benchmarks/${{ env.BASE_PATH }}
      # TODO: enable this once the connection problem has been resolved.
      - name: Update benchmarking results to DB
        env:
          PGDATABASE: metrics
          PGHOST: ${{ secrets.DIFFUSERS_BENCHMARKS_PGHOST }}
          PGUSER: transformers_benchmarks
          PGPASSWORD: ${{ secrets.DIFFUSERS_BENCHMARKS_PGPASSWORD }}
          BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
        run: |
          git config --global --add safe.directory /__w/diffusers/diffusers
          commit_id=$GITHUB_SHA
          commit_msg=$(git show -s --format=%s "$commit_id" | cut -c1-70)
          cd benchmarks && python populate_into_db.py "$BRANCH_NAME" "$commit_id" "$commit_msg"
      - name: Report success status
        if: ${{ success() }}
--- a/.github/workflows/build_docker_images.yml
+++ b/.github/workflows/build_docker_images.yml
@@ -38,9 +38,16 @@ jobs:
          token: ${{ secrets.GITHUB_TOKEN }}
      - name: Build Changed Docker Images
        env: 
          CHANGED_FILES: ${{ steps.file_changes.outputs.all }}
        run: |
-          CHANGED_FILES="${{ steps.file_changes.outputs.all }}"
+          echo "$CHANGED_FILES"
-          for FILE in $CHANGED_FILES; do
+          for FILE in $CHANGED_FILES; do 
            # skip anything that isn't still on disk
            if [[ ! -f "$FILE" ]]; then
              echo "Skipping removed file $FILE"
              continue
            fi           
            if [[ "$FILE" == docker/*Dockerfile ]]; then
              DOCKER_PATH="${FILE%/Dockerfile}"
              DOCKER_TAG=$(basename "$DOCKER_PATH")
@@ -65,13 +72,9 @@ jobs:
        image-name:
          - diffusers-pytorch-cpu
          - diffusers-pytorch-cuda
-          - diffusers-pytorch-compile-cuda
+          - diffusers-pytorch-cuda
          - diffusers-pytorch-xformers-cuda
          - diffusers-pytorch-minimum-cuda
          - diffusers-flax-cpu
          - diffusers-flax-tpu
          - diffusers-onnxruntime-cpu
          - diffusers-onnxruntime-cuda
          - diffusers-doc-builder
    steps:
--- a/.github/workflows/mirror_community_pipeline.yml
+++ b/.github/workflows/mirror_community_pipeline.yml
@@ -79,14 +79,14 @@ jobs:
      # Check secret is set
      - name: whoami
-        run: huggingface-cli whoami
+        run: hf auth whoami
        env:
            HF_TOKEN: ${{ secrets.HF_TOKEN_MIRROR_COMMUNITY_PIPELINES }}
      # Push to HF! (under subfolder based on checkout ref)
      # https://huggingface.co/datasets/diffusers/community-pipelines-mirror
      - name: Mirror community pipeline to HF
-        run: huggingface-cli upload diffusers/community-pipelines-mirror ./examples/community ${PATH_IN_REPO} --repo-type dataset
+        run: hf upload diffusers/community-pipelines-mirror ./examples/community ${PATH_IN_REPO} --repo-type dataset
        env:
            PATH_IN_REPO: ${{ env.PATH_IN_REPO }}
            HF_TOKEN: ${{ secrets.HF_TOKEN_MIRROR_COMMUNITY_PIPELINES }}
--- a/.github/workflows/nightly_tests.yml
+++ b/.github/workflows/nightly_tests.yml
@@ -13,8 +13,9 @@ env:
  PYTEST_TIMEOUT: 600
  RUN_SLOW: yes
  RUN_NIGHTLY: yes
-  PIPELINE_USAGE_CUTOFF: 5000
+  PIPELINE_USAGE_CUTOFF: 0
  SLACK_API_TOKEN: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
  CONSOLIDATED_REPORT_PATH: consolidated_test_report.md
 jobs:
  setup_torch_cuda_pipeline_matrix:
@@ -60,7 +61,7 @@ jobs:
      group: aws-g4dn-2xlarge
    container:
      image: diffusers/diffusers-pytorch-cuda
-      options: --shm-size "16gb" --ipc host --gpus 0
+      options: --shm-size "16gb" --ipc host --gpus all
    steps:
      - name: Checkout diffusers
        uses: actions/checkout@v3
@@ -99,11 +100,6 @@ jobs:
        with:
          name: pipeline_${{ matrix.module }}_test_reports
          path: reports
      - name: Generate Report and Notify Channel
        if: always()
        run: |
          pip install slack_sdk tabulate
          python utils/log_reports.py >> $GITHUB_STEP_SUMMARY
  run_nightly_tests_for_other_torch_modules:
    name: Nightly Torch CUDA Tests
@@ -111,7 +107,7 @@ jobs:
      group: aws-g4dn-2xlarge
    container:
      image: diffusers/diffusers-pytorch-cuda
-      options: --shm-size "16gb" --ipc host --gpus 0
+      options: --shm-size "16gb" --ipc host --gpus all
    defaults:
      run:
        shell: bash
@@ -142,7 +138,6 @@ jobs:
        HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
        # https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
        CUBLAS_WORKSPACE_CONFIG: :16:8
        RUN_COMPILE: yes
      run: |
        python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
          -s -v -k "not Flax and not Onnx" \
@@ -175,12 +170,6 @@ jobs:
        name: torch_${{ matrix.module }}_cuda_test_reports
        path: reports
    - name: Generate Report and Notify Channel
      if: always()
      run: |
        pip install slack_sdk tabulate
        python utils/log_reports.py >> $GITHUB_STEP_SUMMARY
  run_torch_compile_tests:
    name: PyTorch Compile CUDA tests
@@ -188,8 +177,8 @@ jobs:
      group: aws-g4dn-2xlarge
    container:
-      image: diffusers/diffusers-pytorch-compile-cuda
+      image: diffusers/diffusers-pytorch-cuda
-      options: --gpus 0 --shm-size "16gb" --ipc host
+      options: --gpus all --shm-size "16gb" --ipc host
    steps:
    - name: Checkout diffusers
@@ -224,12 +213,6 @@ jobs:
        name: torch_compile_test_reports
        path: reports
    - name: Generate Report and Notify Channel
      if: always()
      run: |
        pip install slack_sdk tabulate
        python utils/log_reports.py >> $GITHUB_STEP_SUMMARY
  run_big_gpu_torch_tests:
    name: Torch tests on big GPU
    strategy:
@@ -239,7 +222,7 @@ jobs:
      group: aws-g6e-xlarge-plus
    container:
      image: diffusers/diffusers-pytorch-cuda
-      options: --shm-size "16gb" --ipc host --gpus 0
+      options: --shm-size "16gb" --ipc host --gpus all
    steps:
      - name: Checkout diffusers
        uses: actions/checkout@v3
@@ -265,7 +248,7 @@ jobs:
          BIG_GPU_MEMORY: 40
        run: |
          python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
-            -m "big_gpu_with_torch_cuda" \
+            -m "big_accelerator" \
            --make-reports=tests_big_gpu_torch_cuda \
            --report-log=tests_big_gpu_torch_cuda.log \
            tests/
@@ -280,19 +263,14 @@ jobs:
        with:
          name: torch_cuda_big_gpu_test_reports
          path: reports
-      - name: Generate Report and Notify Channel
+
        if: always()
        run: |
          pip install slack_sdk tabulate
          python utils/log_reports.py >> $GITHUB_STEP_SUMMARY
  torch_minimum_version_cuda_tests:
    name: Torch Minimum Version CUDA Tests
    runs-on:
      group: aws-g4dn-2xlarge
    container:
      image: diffusers/diffusers-pytorch-minimum-cuda
-      options: --shm-size "16gb" --ipc host --gpus 0
+      options: --shm-size "16gb" --ipc host --gpus all
    defaults:
      run:
        shell: bash
@@ -342,132 +320,20 @@ jobs:
        with:
          name: torch_minimum_version_cuda_test_reports
          path: reports
  run_flax_tpu_tests:
    name: Nightly Flax TPU Tests
    runs-on:
      group: gcp-ct5lp-hightpu-8t
    if: github.event_name == 'schedule'
    container:
      image: diffusers/diffusers-flax-tpu
      options: --shm-size "16gb" --ipc host --privileged ${{ vars.V5_LITEPOD_8_ENV}} -v /mnt/hf_cache:/mnt/hf_cache
    defaults:
      run:
        shell: bash
    steps:
    - name: Checkout diffusers
      uses: actions/checkout@v3
      with:
        fetch-depth: 2
    - name: Install dependencies
      run: |
        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
        python -m uv pip install -e [quality,test]
        pip uninstall accelerate -y && python -m uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git
        python -m uv pip install pytest-reportlog
    - name: Environment
      run: python utils/print_env.py
    - name: Run nightly Flax TPU tests
      env:
        HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
      run: |
        python -m pytest -n 0 \
          -s -v -k "Flax" \
          --make-reports=tests_flax_tpu \
          --report-log=tests_flax_tpu.log \
          tests/
    - name: Failure short reports
      if: ${{ failure() }}
      run: |
        cat reports/tests_flax_tpu_stats.txt
        cat reports/tests_flax_tpu_failures_short.txt
    - name: Test suite reports artifacts
      if: ${{ always() }}
      uses: actions/upload-artifact@v4
      with:
        name: flax_tpu_test_reports
        path: reports
    - name: Generate Report and Notify Channel
      if: always()
      run: |
        pip install slack_sdk tabulate
        python utils/log_reports.py >> $GITHUB_STEP_SUMMARY
  run_nightly_onnx_tests:
    name: Nightly ONNXRuntime CUDA tests on Ubuntu
    runs-on:
      group: aws-g4dn-2xlarge
    container:
      image: diffusers/diffusers-onnxruntime-cuda
      options: --gpus 0 --shm-size "16gb" --ipc host
    steps:
    - name: Checkout diffusers
      uses: actions/checkout@v3
      with:
        fetch-depth: 2
    - name: NVIDIA-SMI
      run: nvidia-smi
    - name: Install dependencies
      run: |
        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
        python -m uv pip install -e [quality,test]
        pip uninstall accelerate -y && python -m uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git
        python -m uv pip install pytest-reportlog
    - name: Environment
      run: python utils/print_env.py
    - name: Run Nightly ONNXRuntime CUDA tests
      env:
        HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
      run: |
        python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
          -s -v -k "Onnx" \
          --make-reports=tests_onnx_cuda \
          --report-log=tests_onnx_cuda.log \
          tests/
    - name: Failure short reports
      if: ${{ failure() }}
      run: |
        cat reports/tests_onnx_cuda_stats.txt
        cat reports/tests_onnx_cuda_failures_short.txt
    - name: Test suite reports artifacts
      if: ${{ always() }}
      uses: actions/upload-artifact@v4
      with:
        name: tests_onnx_cuda_reports
        path: reports
    - name: Generate Report and Notify Channel
      if: always()
      run: |
        pip install slack_sdk tabulate
        python utils/log_reports.py >> $GITHUB_STEP_SUMMARY
  run_nightly_quantization_tests:
    name: Torch quantization nightly tests
    strategy:
      fail-fast: false
      max-parallel: 2
-      matrix: 
+      matrix:
        config:
          - backend: "bitsandbytes"
            test_location: "bnb"
            additional_deps: ["peft"]
          - backend: "gguf"
            test_location: "gguf"
-            additional_deps: ["peft"]
+            additional_deps: ["peft", "kernels"]
          - backend: "torchao"
            test_location: "torchao"
            additional_deps: []
@@ -478,7 +344,7 @@ jobs:
      group: aws-g6e-xlarge-plus
    container:
      image: diffusers/diffusers-pytorch-cuda
-      options: --shm-size "20gb" --ipc host --gpus 0
+      options: --shm-size "20gb" --ipc host --gpus all
    steps:
      - name: Checkout diffusers
        uses: actions/checkout@v3
@@ -520,12 +386,7 @@ jobs:
        with:
          name: torch_cuda_${{ matrix.config.backend }}_reports
          path: reports
-      - name: Generate Report and Notify Channel
+          
        if: always()
        run: |
          pip install slack_sdk tabulate
          python utils/log_reports.py >> $GITHUB_STEP_SUMMARY
  run_nightly_pipeline_level_quantization_tests:
    name: Torch quantization nightly tests
    strategy:
@@ -535,7 +396,7 @@ jobs:
      group: aws-g6e-xlarge-plus
    container:
      image: diffusers/diffusers-pytorch-cuda
-      options: --shm-size "20gb" --ipc host --gpus 0
+      options: --shm-size "20gb" --ipc host --gpus all
    steps:
      - name: Checkout diffusers
        uses: actions/checkout@v3
@@ -574,12 +435,66 @@ jobs:
        with:
          name: torch_cuda_pipeline_level_quant_reports
          path: reports
-      - name: Generate Report and Notify Channel
+
-        if: always()
+  generate_consolidated_report:
    name: Generate Consolidated Test Report
    needs: [
      run_nightly_tests_for_torch_pipelines,
      run_nightly_tests_for_other_torch_modules,
      run_torch_compile_tests,
      run_big_gpu_torch_tests,
      run_nightly_quantization_tests,
      run_nightly_pipeline_level_quantization_tests,
      # run_nightly_onnx_tests,
      torch_minimum_version_cuda_tests,
      # run_flax_tpu_tests
    ]
    if: always()
    runs-on:
      group: aws-general-8-plus
    container:
      image: diffusers/diffusers-pytorch-cpu
    steps:
      - name: Checkout diffusers
        uses: actions/checkout@v3
        with:
          fetch-depth: 2
      - name: Create reports directory
        run: mkdir -p combined_reports
      - name: Download all test reports
        uses: actions/download-artifact@v4
        with:
          path: artifacts
      - name: Prepare reports
        run: |
          # Move all report files to a single directory for processing
          find artifacts -name "*.txt" -exec cp {} combined_reports/ \;
      - name: Install dependencies
        run: |
          pip install -e .[test]
          pip install slack_sdk tabulate
-          python utils/log_reports.py >> $GITHUB_STEP_SUMMARY
+
-  
+      - name: Generate consolidated report
        run: |
          python utils/consolidated_test_report.py \
            --reports_dir combined_reports \
            --output_file $CONSOLIDATED_REPORT_PATH \
            --slack_channel_name diffusers-ci-nightly
      - name: Show consolidated report
        run: |
          cat $CONSOLIDATED_REPORT_PATH >> $GITHUB_STEP_SUMMARY
      - name: Upload consolidated report
        uses: actions/upload-artifact@v4
        with:
          name: consolidated_test_report
          path: ${{ env.CONSOLIDATED_REPORT_PATH }}
 # M1 runner currently not well supported
 # TODO: (Dhruv) add these back when we setup better testing for Apple Silicon
 #  run_nightly_tests_apple_m1:
--- a/.github/workflows/pr_modular_tests.yml
+++ b/.github/workflows/pr_modular_tests.yml
@@ -0,0 +1,141 @@
 name: Fast PR tests for Modular
 on:
  pull_request:
    branches: [main]
    paths:
      - "src/diffusers/modular_pipelines/**.py"
      - "src/diffusers/models/modeling_utils.py"
      - "src/diffusers/models/model_loading_utils.py"
      - "src/diffusers/pipelines/pipeline_utils.py"
      - "src/diffusers/pipeline_loading_utils.py"
      - "src/diffusers/loaders/lora_base.py"
      - "src/diffusers/loaders/lora_pipeline.py"
      - "src/diffusers/loaders/peft.py"
      - "tests/modular_pipelines/**.py"
      - ".github/**.yml"
      - "utils/**.py"
      - "setup.py"
  push:
    branches:
      - ci-*
 concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
  cancel-in-progress: true
 env:
  DIFFUSERS_IS_CI: yes
  HF_HUB_ENABLE_HF_TRANSFER: 1
  OMP_NUM_THREADS: 4
  MKL_NUM_THREADS: 4
  PYTEST_TIMEOUT: 60
 jobs:
  check_code_quality:
    runs-on: ubuntu-22.04
    steps:
      - uses: actions/checkout@v3
      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: "3.10"
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install .[quality]
      - name: Check quality
        run: make quality
      - name: Check if failure
        if: ${{ failure() }}
        run: |
          echo "Quality check failed. Please ensure the right dependency versions are installed with 'pip install -e .[quality]' and run 'make style && make quality'" >> $GITHUB_STEP_SUMMARY
  check_repository_consistency:
    needs: check_code_quality
    runs-on: ubuntu-22.04
    steps:
      - uses: actions/checkout@v3
      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: "3.10"
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install .[quality]
      - name: Check repo consistency
        run: |
          python utils/check_copies.py
          python utils/check_dummies.py
          python utils/check_support_list.py
          make deps_table_check_updated
      - name: Check if failure
        if: ${{ failure() }}
        run: |
          echo "Repo consistency check failed. Please ensure the right dependency versions are installed with 'pip install -e .[quality]' and run 'make fix-copies'" >> $GITHUB_STEP_SUMMARY
  run_fast_tests:
    needs: [check_code_quality, check_repository_consistency]
    strategy:
      fail-fast: false
      matrix:
        config:
          - name: Fast PyTorch Modular Pipeline CPU tests
            framework: pytorch_pipelines
            runner: aws-highmemory-32-plus
            image: diffusers/diffusers-pytorch-cpu
            report: torch_cpu_modular_pipelines
    name: ${{ matrix.config.name }}
    runs-on:
      group: ${{ matrix.config.runner }}
    container:
      image: ${{ matrix.config.image }}
      options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/
    defaults:
      run:
        shell: bash
    steps:
    - name: Checkout diffusers
      uses: actions/checkout@v3
      with:
        fetch-depth: 2
    - name: Install dependencies
      run: |
        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
        python -m uv pip install -e [quality,test]
        pip uninstall transformers -y && python -m uv pip install -U transformers@git+https://github.com/huggingface/transformers.git --no-deps
        pip uninstall accelerate -y && python -m uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git --no-deps
    - name: Environment
      run: |
        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
        python utils/print_env.py
    - name: Run fast PyTorch Pipeline CPU tests
      if: ${{ matrix.config.framework == 'pytorch_pipelines' }}
      run: |
        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
        python -m pytest -n 8 --max-worker-restart=0 --dist=loadfile \
          -s -v -k "not Flax and not Onnx" \
          --make-reports=tests_${{ matrix.config.report }} \
          tests/modular_pipelines
    - name: Failure short reports
      if: ${{ failure() }}
      run: cat reports/tests_${{ matrix.config.report }}_failures_short.txt
    - name: Test suite reports artifacts
      if: ${{ always() }}
      uses: actions/upload-artifact@v4
      with:
        name: pr_${{ matrix.config.framework }}_${{ matrix.config.report }}_test_reports
        path: reports
--- a/.github/workflows/pr_style_bot.yml
+++ b/.github/workflows/pr_style_bot.yml
@@ -14,4 +14,4 @@ jobs:
    with:
      python_quality_dependencies: "[quality]"
    secrets:
-      bot_token: ${{ secrets.GITHUB_TOKEN }}
+      bot_token: ${{ secrets.HF_STYLE_BOT_ACTION }}
--- a/.github/workflows/pr_tests.yml
+++ b/.github/workflows/pr_tests.yml
@@ -87,11 +87,6 @@ jobs:
            runner: aws-general-8-plus
            image: diffusers/diffusers-pytorch-cpu
            report: torch_cpu_models_schedulers
          - name: Fast Flax CPU tests
            framework: flax
            runner: aws-general-8-plus
            image: diffusers/diffusers-flax-cpu
            report: flax_cpu
          - name: PyTorch Example CPU tests
            framework: pytorch_examples
            runner: aws-general-8-plus
@@ -147,15 +142,6 @@ jobs:
          --make-reports=tests_${{ matrix.config.report }} \
          tests/models tests/schedulers tests/others
    - name: Run fast Flax TPU tests
      if: ${{ matrix.config.framework == 'flax' }}
      run: |
        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
        python -m pytest -n 4 --max-worker-restart=0 --dist=loadfile \
          -s -v -k "Flax" \
          --make-reports=tests_${{ matrix.config.report }} \
          tests
    - name: Run example PyTorch CPU tests
      if: ${{ matrix.config.framework == 'pytorch_examples' }}
      run: |
@@ -291,8 +277,8 @@ jobs:
    - name: Failure short reports
      if: ${{ failure() }}
      run: |
-        cat reports/tests_lora_failures_short.txt
+        cat reports/tests_peft_main_failures_short.txt
-        cat reports/tests_models_lora_failures_short.txt
+        cat reports/tests_models_lora_peft_main_failures_short.txt
    - name: Test suite reports artifacts
      if: ${{ always() }}
--- a/.github/workflows/pr_tests_gpu.yml
+++ b/.github/workflows/pr_tests_gpu.yml
@@ -13,6 +13,7 @@ on:
      - "src/diffusers/loaders/peft.py"
      - "tests/pipelines/test_pipelines_common.py"
      - "tests/models/test_modeling_common.py"
      - "examples/**/*.py"
  workflow_dispatch:
 concurrency:
@@ -117,7 +118,7 @@ jobs:
      group: aws-g4dn-2xlarge
    container:
      image: diffusers/diffusers-pytorch-cuda
-      options: --shm-size "16gb" --ipc host --gpus 0
+      options: --shm-size "16gb" --ipc host --gpus all
    steps:
      - name: Checkout diffusers
        uses: actions/checkout@v3
@@ -182,13 +183,13 @@ jobs:
      group: aws-g4dn-2xlarge
    container:
      image: diffusers/diffusers-pytorch-cuda
-      options: --shm-size "16gb" --ipc host --gpus 0
+      options: --shm-size "16gb" --ipc host --gpus all
    defaults:
      run:
        shell: bash
    strategy:
      fail-fast: false
-      max-parallel: 2
+      max-parallel: 4
      matrix:
        module: [models, schedulers, lora, others]
    steps:
@@ -252,7 +253,7 @@ jobs:
    container:
      image: diffusers/diffusers-pytorch-cuda
-      options: --gpus 0 --shm-size "16gb" --ipc host
+      options: --gpus all --shm-size "16gb" --ipc host
    steps:
    - name: Checkout diffusers
      uses: actions/checkout@v3
--- a/.github/workflows/push_tests.yml
+++ b/.github/workflows/push_tests.yml
@@ -64,7 +64,7 @@ jobs:
      group: aws-g4dn-2xlarge
    container:
      image: diffusers/diffusers-pytorch-cuda
-      options: --shm-size "16gb" --ipc host --gpus 0
+      options: --shm-size "16gb" --ipc host --gpus all
    steps:
      - name: Checkout diffusers
        uses: actions/checkout@v3
@@ -109,7 +109,7 @@ jobs:
      group: aws-g4dn-2xlarge
    container:
      image: diffusers/diffusers-pytorch-cuda
-      options: --shm-size "16gb" --ipc host --gpus 0
+      options: --shm-size "16gb" --ipc host --gpus all
    defaults:
      run:
        shell: bash
@@ -159,102 +159,6 @@ jobs:
        name: torch_cuda_test_reports_${{ matrix.module }}
        path: reports
  flax_tpu_tests:
    name: Flax TPU Tests
    runs-on:
      group: gcp-ct5lp-hightpu-8t
    container:
      image: diffusers/diffusers-flax-tpu
      options: --shm-size "16gb" --ipc host --privileged ${{ vars.V5_LITEPOD_8_ENV}} -v /mnt/hf_cache:/mnt/hf_cache 
    defaults:
      run:
        shell: bash
    steps:
    - name: Checkout diffusers
      uses: actions/checkout@v3
      with:
        fetch-depth: 2
    - name: Install dependencies
      run: |
        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
        python -m uv pip install -e [quality,test]
        pip uninstall accelerate -y && python -m uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git
    - name: Environment
      run: |
        python utils/print_env.py
    - name: Run Flax TPU tests
      env:
        HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
      run: |
        python -m pytest -n 0 \
          -s -v -k "Flax" \
          --make-reports=tests_flax_tpu \
          tests/
    - name: Failure short reports
      if: ${{ failure() }}
      run: |
        cat reports/tests_flax_tpu_stats.txt
        cat reports/tests_flax_tpu_failures_short.txt
    - name: Test suite reports artifacts
      if: ${{ always() }}
      uses: actions/upload-artifact@v4
      with:
        name: flax_tpu_test_reports
        path: reports
  onnx_cuda_tests:
    name: ONNX CUDA Tests
    runs-on:
      group: aws-g4dn-2xlarge
    container:
      image: diffusers/diffusers-onnxruntime-cuda
      options: --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ --gpus 0
    defaults:
      run:
        shell: bash
    steps:
    - name: Checkout diffusers
      uses: actions/checkout@v3
      with:
        fetch-depth: 2
    - name: Install dependencies
      run: |
        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
        python -m uv pip install -e [quality,test]
        pip uninstall accelerate -y && python -m uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git
    - name: Environment
      run: |
        python utils/print_env.py
    - name: Run ONNXRuntime CUDA tests
      env:
        HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
      run: |
        python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
          -s -v -k "Onnx" \
          --make-reports=tests_onnx_cuda \
          tests/
    - name: Failure short reports
      if: ${{ failure() }}
      run: |
        cat reports/tests_onnx_cuda_stats.txt
        cat reports/tests_onnx_cuda_failures_short.txt
    - name: Test suite reports artifacts
      if: ${{ always() }}
      uses: actions/upload-artifact@v4
      with:
        name: onnx_cuda_test_reports
        path: reports
  run_torch_compile_tests:
    name: PyTorch Compile CUDA tests
@@ -262,8 +166,8 @@ jobs:
      group: aws-g4dn-2xlarge
    container:
-      image: diffusers/diffusers-pytorch-compile-cuda
+      image: diffusers/diffusers-pytorch-cuda
-      options: --gpus 0 --shm-size "16gb" --ipc host
+      options: --gpus all --shm-size "16gb" --ipc host
    steps:
    - name: Checkout diffusers
@@ -306,7 +210,7 @@ jobs:
    container:
      image: diffusers/diffusers-pytorch-xformers-cuda
-      options: --gpus 0 --shm-size "16gb" --ipc host
+      options: --gpus all --shm-size "16gb" --ipc host
    steps:
    - name: Checkout diffusers
@@ -348,7 +252,7 @@ jobs:
    container:
      image: diffusers/diffusers-pytorch-cuda
-      options: --gpus 0 --shm-size "16gb" --ipc host
+      options: --gpus all --shm-size "16gb" --ipc host
    steps:
    - name: Checkout diffusers
      uses: actions/checkout@v3
--- a/.github/workflows/push_tests_fast.yml
+++ b/.github/workflows/push_tests_fast.yml
@@ -33,16 +33,6 @@ jobs:
            runner: aws-general-8-plus
            image: diffusers/diffusers-pytorch-cpu
            report: torch_cpu
          - name: Fast Flax CPU tests on Ubuntu
            framework: flax
            runner: aws-general-8-plus
            image: diffusers/diffusers-flax-cpu
            report: flax_cpu
          - name: Fast ONNXRuntime CPU tests on Ubuntu
            framework: onnxruntime
            runner: aws-general-8-plus
            image: diffusers/diffusers-onnxruntime-cpu
            report: onnx_cpu
          - name: PyTorch Example CPU tests on Ubuntu
            framework: pytorch_examples
            runner: aws-general-8-plus
@@ -87,24 +77,6 @@ jobs:
          --make-reports=tests_${{ matrix.config.report }} \
          tests/
    - name: Run fast Flax TPU tests
      if: ${{ matrix.config.framework == 'flax' }}
      run: |
        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
        python -m pytest -n 4 --max-worker-restart=0 --dist=loadfile \
          -s -v -k "Flax" \
          --make-reports=tests_${{ matrix.config.report }} \
          tests/
    - name: Run fast ONNXRuntime CPU tests
      if: ${{ matrix.config.framework == 'onnxruntime' }}
      run: |
        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
        python -m pytest -n 4 --max-worker-restart=0 --dist=loadfile \
          -s -v -k "Onnx" \
          --make-reports=tests_${{ matrix.config.report }} \
          tests/
    - name: Run example PyTorch CPU tests
      if: ${{ matrix.config.framework == 'pytorch_examples' }}
      run: |
--- a/.github/workflows/push_tests_mps.yml
+++ b/.github/workflows/push_tests_mps.yml
@@ -1,12 +1,7 @@
 name: Fast mps tests on main
 on:
-  push:
+  workflow_dispatch:
    branches:
      - main
    paths:
      - "src/diffusers/**.py"
      - "tests/**.py"
 env:
  DIFFUSERS_IS_CI: yes
--- a/.github/workflows/release_tests_fast.yml
+++ b/.github/workflows/release_tests_fast.yml
@@ -62,7 +62,7 @@ jobs:
      group: aws-g4dn-2xlarge
    container:
      image: diffusers/diffusers-pytorch-cuda
-      options: --shm-size "16gb" --ipc host --gpus 0
+      options: --shm-size "16gb" --ipc host --gpus all
    steps:
      - name: Checkout diffusers
        uses: actions/checkout@v3
@@ -107,7 +107,7 @@ jobs:
      group: aws-g4dn-2xlarge
    container:
      image: diffusers/diffusers-pytorch-cuda
-      options: --shm-size "16gb" --ipc host --gpus 0
+      options: --shm-size "16gb" --ipc host --gpus all
    defaults:
      run:
        shell: bash
@@ -163,7 +163,7 @@ jobs:
      group: aws-g4dn-2xlarge
    container:
      image: diffusers/diffusers-pytorch-minimum-cuda
-      options: --shm-size "16gb" --ipc host --gpus 0
+      options: --shm-size "16gb" --ipc host --gpus all
    defaults:
      run:
        shell: bash
@@ -213,101 +213,6 @@ jobs:
        with:
          name: torch_minimum_version_cuda_test_reports
          path: reports
  # Runs the tests selected by `-k "Flax"` inside the diffusers-flax-tpu
  # container on the `docker-tpu` runner and publishes the resulting reports.
  flax_tpu_tests:
    name: Flax TPU Tests
    runs-on: docker-tpu
    container:
      image: diffusers/diffusers-flax-tpu
      options: --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ --privileged
    defaults:
      run:
        shell: bash
    steps:
    - name: Checkout diffusers
      uses: actions/checkout@v3
      with:
        fetch-depth: 2
    # Installs the package with its quality/test extras, then replaces
    # accelerate with a build taken from its GitHub repository.
    - name: Install dependencies
      run: |
        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
        python -m uv pip install -e [quality,test]
        pip uninstall accelerate -y && python -m uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git
    - name: Environment
      run: |
        python utils/print_env.py
    # HF_TOKEN is exposed to this step only.
    - name: Run slow Flax TPU tests
      env:
        HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
      run: |
        python -m pytest -n 0 \
          -s -v -k "Flax" \
          --make-reports=tests_flax_tpu \
          tests/
    # Only when an earlier step failed: print the summary files from reports/
    # (presumably produced by --make-reports above).
    - name: Failure short reports
      if: ${{ failure() }}
      run: |
        cat reports/tests_flax_tpu_stats.txt
        cat reports/tests_flax_tpu_failures_short.txt
    # Uploaded regardless of the outcome of the previous steps.
    - name: Test suite reports artifacts
      if: ${{ always() }}
      uses: actions/upload-artifact@v4
      with:
        name: flax_tpu_test_reports
        path: reports
  # Runs the tests selected by `-k "Onnx"` inside the diffusers-onnxruntime-cuda
  # container on the aws-g4dn-2xlarge runner group and publishes the reports.
  onnx_cuda_tests:
    name: ONNX CUDA Tests
    runs-on:
      group: aws-g4dn-2xlarge
    container:
      image: diffusers/diffusers-onnxruntime-cuda
      options: --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ --gpus 0
    defaults:
      run:
        shell: bash
    steps:
    - name: Checkout diffusers
      uses: actions/checkout@v3
      with:
        fetch-depth: 2
    # Installs the package with its quality/test extras, then replaces
    # accelerate with a build taken from its GitHub repository.
    - name: Install dependencies
      run: |
        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
        python -m uv pip install -e [quality,test]
        pip uninstall accelerate -y && python -m uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git
    - name: Environment
      run: |
        python utils/print_env.py
    # HF_TOKEN is exposed to this step only.
    - name: Run slow ONNXRuntime CUDA tests
      env:
        HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
      run: |
        python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
          -s -v -k "Onnx" \
          --make-reports=tests_onnx_cuda \
          tests/
    # Only when an earlier step failed: print the summary files from reports/
    # (presumably produced by --make-reports above).
    - name: Failure short reports
      if: ${{ failure() }}
      run: |
        cat reports/tests_onnx_cuda_stats.txt
        cat reports/tests_onnx_cuda_failures_short.txt
    # Uploaded regardless of the outcome of the previous steps.
    - name: Test suite reports artifacts
      if: ${{ always() }}
      uses: actions/upload-artifact@v4
      with:
        name: onnx_cuda_test_reports
        path: reports
  run_torch_compile_tests:
    name: PyTorch Compile CUDA tests
@@ -316,8 +221,8 @@ jobs:
      group: aws-g4dn-2xlarge
    container:
-      image: diffusers/diffusers-pytorch-compile-cuda
+      image: diffusers/diffusers-pytorch-cuda
-      options: --gpus 0 --shm-size "16gb" --ipc host
+      options: --gpus all --shm-size "16gb" --ipc host
    steps:
    - name: Checkout diffusers
@@ -360,7 +265,7 @@ jobs:
    container:
      image: diffusers/diffusers-pytorch-xformers-cuda
-      options: --gpus 0 --shm-size "16gb" --ipc host
+      options: --gpus all --shm-size "16gb" --ipc host
    steps:
    - name: Checkout diffusers
@@ -402,7 +307,7 @@ jobs:
    container:
      image: diffusers/diffusers-pytorch-cuda
-      options: --gpus 0 --shm-size "16gb" --ipc host
+      options: --gpus all --shm-size "16gb" --ipc host
    steps:
    - name: Checkout diffusers
--- a/.github/workflows/run_tests_from_a_pr.yml
+++ b/.github/workflows/run_tests_from_a_pr.yml
@@ -30,7 +30,7 @@ jobs:
      group: aws-g4dn-2xlarge
    container:
      image: ${{ github.event.inputs.docker_image }}
-      options: --gpus 0 --privileged --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+      options: --gpus all --privileged --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    steps:
      - name: Validate test files input
--- a/.github/workflows/ssh-runner.yml
+++ b/.github/workflows/ssh-runner.yml
@@ -31,7 +31,7 @@ jobs:
      group: "${{ github.event.inputs.runner_type }}"
    container:
      image: ${{ github.event.inputs.docker_image }}
-      options: --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface/diffusers:/mnt/cache/ --gpus 0 --privileged
+      options: --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface/diffusers:/mnt/cache/ --gpus all --privileged
    steps:
      - name: Checkout diffusers
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -1,4 +1,4 @@
-<!--Copyright 2024 The HuggingFace Team. All rights reserved.
+<!--Copyright 2025 The HuggingFace Team. All rights reserved.
 Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 the License. You may obtain a copy of the License at
--- a/PHILOSOPHY.md
+++ b/PHILOSOPHY.md
@@ -1,4 +1,4 @@
-<!--Copyright 2024 The HuggingFace Team. All rights reserved.
+<!--Copyright 2025 The HuggingFace Team. All rights reserved.
 Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 the License. You may obtain a copy of the License at
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -0,0 +1,69 @@
 # Diffusers Benchmarks
 Welcome to Diffusers Benchmarks. These benchmarks are used to obtain latency and memory information of the most popular models across different scenarios such as:
 * Base case i.e., when using `torch.bfloat16` and `torch.nn.functional.scaled_dot_product_attention`.
 * Base + `torch.compile()`
 * NF4 quantization
 * Layerwise upcasting
 Instead of full diffusion pipelines, only the forward pass of the respective model classes (such as `FluxTransformer2DModel`) is tested with the real checkpoints (such as `"black-forest-labs/FLUX.1-dev"`). 
 The entrypoint to running all the currently available benchmarks is in `run_all.py`. However, one can run the individual benchmarks, too, e.g., `python benchmarking_flux.py`. It should produce a CSV file containing various information about the benchmarks run.
 The benchmarks are run on a weekly basis and the CI is defined in [benchmark.yml](../.github/workflows/benchmark.yml).
 ## Running the benchmarks manually
 First set up `torch` and install `diffusers` from the root of the directory:
 ```sh
 pip install -e ".[quality,test]"
 ```
 Then make sure the other dependencies are installed:
 ```sh
 cd benchmarks/
 pip install -r requirements.txt
 ```
 We need to be authenticated to access some of the checkpoints used during benchmarking:
 ```sh
 hf auth login
 ```
 We use an L40 GPU with 128GB RAM to run the benchmark CI. As such, the benchmarks are configured to run on NVIDIA GPUs. So, make sure you have access to a similar machine (or modify the benchmarking scripts accordingly).
 Then you can either launch the entire benchmarking suite by running:
 ```sh
 python run_all.py
 ```
 Or, you can run the individual benchmarks.
 ## Customizing the benchmarks
 We define "scenarios" to cover the most common ways in which these models are used. You can
 define a new scenario by modifying an existing benchmark file:
 ```py
 BenchmarkScenario(
    name=f"{CKPT_ID}-bnb-8bit",
    model_cls=FluxTransformer2DModel,
    model_init_kwargs={
        "pretrained_model_name_or_path": CKPT_ID,
        "torch_dtype": torch.bfloat16,
        "subfolder": "transformer",
        "quantization_config": BitsAndBytesConfig(load_in_8bit=True),
    },
    get_model_input_dict=partial(get_input_dict, device=torch_device, dtype=torch.bfloat16),
    model_init_fn=model_init_fn,
 )
 ```
 You can also configure a new model-level benchmark and add it to the existing suite. To do so, just defining a valid benchmarking file like `benchmarking_flux.py` should be enough.
 Happy benchmarking 🧨
--- a/tests/pipelines/amused/init.py
+++ b/tests/pipelines/amused/init.py
--- a/benchmarks/base_classes.py
+++ b/benchmarks/base_classes.py
@@ -1,346 +0,0 @@
 import os
 import sys
 import torch
 from diffusers import (
    AutoPipelineForImage2Image,
    AutoPipelineForInpainting,
    AutoPipelineForText2Image,
    ControlNetModel,
    LCMScheduler,
    StableDiffusionAdapterPipeline,
    StableDiffusionControlNetPipeline,
    StableDiffusionXLAdapterPipeline,
    StableDiffusionXLControlNetPipeline,
    T2IAdapter,
    WuerstchenCombinedPipeline,
 )
 from diffusers.utils import load_image
 sys.path.append(".")
 from utils import (  # noqa: E402
    BASE_PATH,
    PROMPT,
    BenchmarkInfo,
    benchmark_fn,
    bytes_to_giga_bytes,
    flush,
    generate_csv_dict,
    write_to_csv,
 )
 # Per-checkpoint size tuple. The image-conditioned benchmarks below pass the
 # entry for `args.ckpt` straight to `image.resize(...)`, so a checkpoint that
 # is missing here raises `KeyError` when such a benchmark is constructed.
 RESOLUTION_MAPPING = {
    "Lykon/DreamShaper": (512, 512),
    "lllyasviel/sd-controlnet-canny": (512, 512),
    "diffusers/controlnet-canny-sdxl-1.0": (1024, 1024),
    "TencentARC/t2iadapter_canny_sd14v1": (512, 512),
    "TencentARC/t2i-adapter-canny-sdxl-1.0": (1024, 1024),
    "stabilityai/stable-diffusion-2-1": (768, 768),
    "stabilityai/stable-diffusion-xl-base-1.0": (1024, 1024),
    "stabilityai/stable-diffusion-xl-refiner-1.0": (1024, 1024),
    "stabilityai/sdxl-turbo": (512, 512),
 }
 class BaseBenchmak:
    """Common scaffolding for the pipeline benchmarks.

    ``run_inference`` and ``benchmark`` are placeholders that raise
    ``NotImplementedError``. ``get_result_filepath`` reads ``self.pipe``, which
    this base class never assigns, so subclasses have to set it.
    """

    pipeline_class = None

    def __init__(self, args):
        super().__init__()

    def run_inference(self, args):
        raise NotImplementedError

    def benchmark(self, args):
        raise NotImplementedError

    def get_result_filepath(self, args):
        """Return the CSV path under ``BASE_PATH`` for this checkpoint, pipeline class and run settings."""
        pipe_name = str(self.pipe.__class__.__name__)
        ckpt_slug = args.ckpt.replace("/", "_")
        run_suffix = (
            f"-bs@{args.batch_size}-steps@{args.num_inference_steps}"
            f"-mco@{args.model_cpu_offload}-compile@{args.run_compile}.csv"
        )
        return os.path.join(BASE_PATH, f"{ckpt_slug}_{pipe_name}{run_suffix}")
 class TextToImageBenchmark(BaseBenchmak):
    """Benchmark for a text-to-image pipeline loaded in fp16 and moved to CUDA.

    With ``args.run_compile`` set, selected sub-modules are wrapped with
    ``torch.compile``: ``decoder``/``vqgan`` for ``WuerstchenCombinedPipeline``,
    otherwise ``unet`` and, when present, ``movq``.
    """

    pipeline_class = AutoPipelineForText2Image

    def __init__(self, args):
        compile_opts = {"mode": "reduce-overhead", "fullgraph": True}
        pipe = self.pipeline_class.from_pretrained(args.ckpt, torch_dtype=torch.float16).to("cuda")

        if args.run_compile:
            if isinstance(pipe, WuerstchenCombinedPipeline):
                print("Run torch compile")
                pipe.decoder = torch.compile(pipe.decoder, **compile_opts)
                pipe.vqgan = torch.compile(pipe.vqgan, **compile_opts)
            else:
                pipe.unet.to(memory_format=torch.channels_last)
                print("Run torch compile")
                pipe.unet = torch.compile(pipe.unet, **compile_opts)
                # Guard: `movq` may be missing from the pipeline or set to None.
                if getattr(pipe, "movq", None) is not None:
                    pipe.movq.to(memory_format=torch.channels_last)
                    pipe.movq = torch.compile(pipe.movq, **compile_opts)

        pipe.set_progress_bar_config(disable=True)
        self.pipe = pipe

    def run_inference(self, pipe, args):
        """Call ``pipe`` once with the shared prompt; the output is discarded."""
        call_kwargs = {
            "prompt": PROMPT,
            "num_inference_steps": args.num_inference_steps,
            "num_images_per_prompt": args.batch_size,
        }
        _ = pipe(**call_kwargs)

    def benchmark(self, args):
        """Time ``run_inference``, read peak CUDA memory and write one CSV row for ``args.ckpt``."""
        flush()
        pipe_name = str(self.pipe.__class__.__name__)
        print(f"[INFO] {pipe_name}: Running benchmark with: {vars(args)}\n")

        elapsed = benchmark_fn(self.run_inference, self.pipe, args)  # in seconds.
        peak_memory = bytes_to_giga_bytes(torch.cuda.max_memory_allocated())  # in GBs.
        info = BenchmarkInfo(time=elapsed, memory=peak_memory)
        flush()

        row = generate_csv_dict(pipeline_cls=pipe_name, ckpt=args.ckpt, args=args, benchmark_info=info)
        out_path = self.get_result_filepath(args)
        write_to_csv(out_path, row)
        print(f"Logs written to: {out_path}")
        flush()
 class TurboTextToImageBenchmark(TextToImageBenchmark):
    """Text-to-image benchmark that calls the pipeline with ``guidance_scale=0.0``."""

    def __init__(self, args):
        super().__init__(args)

    def run_inference(self, pipe, args):
        call_kwargs = {
            "prompt": PROMPT,
            "num_inference_steps": args.num_inference_steps,
            "num_images_per_prompt": args.batch_size,
            "guidance_scale": 0.0,
        }
        _ = pipe(**call_kwargs)
 class LCMLoRATextToImageBenchmark(TextToImageBenchmark):
    """Text-to-image benchmark with the ``lora_id`` weights fused into the pipeline.

    The result file name and the CSV ``ckpt`` field use ``lora_id`` rather
    than ``args.ckpt``.
    """

    lora_id = "latent-consistency/lcm-lora-sdxl"

    def __init__(self, args):
        super().__init__(args)
        pipe = self.pipe
        # Load, fuse, then unload the LoRA weights before replacing the scheduler.
        pipe.load_lora_weights(self.lora_id)
        pipe.fuse_lora()
        pipe.unload_lora_weights()
        pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config)

    def get_result_filepath(self, args):
        """Return the CSV path under ``BASE_PATH``, keyed by ``lora_id``."""
        pipe_name = str(self.pipe.__class__.__name__)
        lora_slug = self.lora_id.replace("/", "_")
        run_suffix = (
            f"-bs@{args.batch_size}-steps@{args.num_inference_steps}"
            f"-mco@{args.model_cpu_offload}-compile@{args.run_compile}.csv"
        )
        return os.path.join(BASE_PATH, f"{lora_slug}_{pipe_name}{run_suffix}")

    def run_inference(self, pipe, args):
        call_kwargs = {
            "prompt": PROMPT,
            "num_inference_steps": args.num_inference_steps,
            "num_images_per_prompt": args.batch_size,
            "guidance_scale": 1.0,
        }
        _ = pipe(**call_kwargs)

    def benchmark(self, args):
        """Time ``run_inference``, read peak CUDA memory and write one CSV row for ``lora_id``."""
        flush()
        pipe_name = str(self.pipe.__class__.__name__)
        print(f"[INFO] {pipe_name}: Running benchmark with: {vars(args)}\n")

        elapsed = benchmark_fn(self.run_inference, self.pipe, args)  # in seconds.
        peak_memory = bytes_to_giga_bytes(torch.cuda.max_memory_allocated())  # in GBs.
        info = BenchmarkInfo(time=elapsed, memory=peak_memory)
        flush()

        row = generate_csv_dict(pipeline_cls=pipe_name, ckpt=self.lora_id, args=args, benchmark_info=info)
        out_path = self.get_result_filepath(args)
        write_to_csv(out_path, row)
        print(f"Logs written to: {out_path}")
        flush()
 class ImageToImageBenchmark(TextToImageBenchmark):
    """Image-to-image benchmark.

    The source image is fetched once when the class body executes; each
    instance keeps a copy resized to the ``RESOLUTION_MAPPING`` entry for
    ``args.ckpt``.
    """

    pipeline_class = AutoPipelineForImage2Image
    url = "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/benchmarking/1665_Girl_with_a_Pearl_Earring.jpg"
    image = load_image(url).convert("RGB")

    def __init__(self, args):
        super().__init__(args)
        target_size = RESOLUTION_MAPPING[args.ckpt]
        # Shadows the class-level image with a per-instance resized copy.
        self.image = self.image.resize(target_size)

    def run_inference(self, pipe, args):
        call_kwargs = {
            "prompt": PROMPT,
            "image": self.image,
            "num_inference_steps": args.num_inference_steps,
            "num_images_per_prompt": args.batch_size,
        }
        _ = pipe(**call_kwargs)
 class TurboImageToImageBenchmark(ImageToImageBenchmark):
    """Image-to-image benchmark that calls the pipeline with ``guidance_scale=0.0`` and ``strength=0.5``."""

    def __init__(self, args):
        super().__init__(args)

    def run_inference(self, pipe, args):
        call_kwargs = {
            "prompt": PROMPT,
            "image": self.image,
            "num_inference_steps": args.num_inference_steps,
            "num_images_per_prompt": args.batch_size,
            "guidance_scale": 0.0,
            "strength": 0.5,
        }
        _ = pipe(**call_kwargs)
 class InpaintingBenchmark(ImageToImageBenchmark):
    """Inpainting benchmark: adds a mask image, resized like the source image."""

    pipeline_class = AutoPipelineForInpainting
    mask_url = "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/benchmarking/overture-creations-5sI6fQgYIuo_mask.png"
    mask = load_image(mask_url).convert("RGB")

    def __init__(self, args):
        super().__init__(args)
        target_size = RESOLUTION_MAPPING[args.ckpt]
        self.image = self.image.resize(target_size)
        self.mask = self.mask.resize(target_size)

    def run_inference(self, pipe, args):
        call_kwargs = {
            "prompt": PROMPT,
            "image": self.image,
            "mask_image": self.mask,
            "num_inference_steps": args.num_inference_steps,
            "num_images_per_prompt": args.batch_size,
        }
        _ = pipe(**call_kwargs)
 class IPAdapterTextToImageBenchmark(TextToImageBenchmark):
    """Text-to-image benchmark with an IP-Adapter loaded into the pipeline.

    ``args.ip_adapter_id`` is indexed as ``[0]`` (first argument to
    ``load_ip_adapter``) and ``[1]`` (weight file name).
    """

    url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/load_neg_embed.png"
    image = load_image(url)

    def __init__(self, args):
        pipe = self.pipeline_class.from_pretrained(args.ckpt, torch_dtype=torch.float16).to("cuda")

        adapter_source = args.ip_adapter_id[0]
        weight_file = args.ip_adapter_id[1]
        # The subfolder is picked from the weight file name.
        subfolder = "sdxl_models" if "sdxl" in weight_file else "models"
        pipe.load_ip_adapter(adapter_source, subfolder=subfolder, weight_name=weight_file)

        if args.run_compile:
            pipe.unet.to(memory_format=torch.channels_last)
            print("Run torch compile")
            pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)

        pipe.set_progress_bar_config(disable=True)
        self.pipe = pipe

    def run_inference(self, pipe, args):
        call_kwargs = {
            "prompt": PROMPT,
            "ip_adapter_image": self.image,
            "num_inference_steps": args.num_inference_steps,
            "num_images_per_prompt": args.batch_size,
        }
        _ = pipe(**call_kwargs)
 class ControlNetBenchmark(TextToImageBenchmark):
    """ControlNet benchmark.

    ``args.ckpt`` selects the auxiliary network checkpoint; the base pipeline
    is always loaded from ``root_ckpt``.
    """

    pipeline_class = StableDiffusionControlNetPipeline
    aux_network_class = ControlNetModel
    root_ckpt = "Lykon/DreamShaper"
    url = "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/benchmarking/canny_image_condition.png"
    image = load_image(url).convert("RGB")

    def __init__(self, args):
        half = torch.float16
        controlnet = self.aux_network_class.from_pretrained(args.ckpt, torch_dtype=half)
        pipe = self.pipeline_class.from_pretrained(self.root_ckpt, controlnet=controlnet, torch_dtype=half).to("cuda")
        pipe.set_progress_bar_config(disable=True)
        self.pipe = pipe

        if args.run_compile:
            compile_opts = {"mode": "reduce-overhead", "fullgraph": True}
            pipe.unet.to(memory_format=torch.channels_last)
            pipe.controlnet.to(memory_format=torch.channels_last)
            print("Run torch compile")
            pipe.unet = torch.compile(pipe.unet, **compile_opts)
            pipe.controlnet = torch.compile(pipe.controlnet, **compile_opts)

        target_size = RESOLUTION_MAPPING[args.ckpt]
        self.image = self.image.resize(target_size)

    def run_inference(self, pipe, args):
        call_kwargs = {
            "prompt": PROMPT,
            "image": self.image,
            "num_inference_steps": args.num_inference_steps,
            "num_images_per_prompt": args.batch_size,
        }
        _ = pipe(**call_kwargs)
 class ControlNetSDXLBenchmark(ControlNetBenchmark):
    """ControlNet benchmark that loads ``StableDiffusionXLControlNetPipeline`` from the SDXL base checkpoint."""

    root_ckpt = "stabilityai/stable-diffusion-xl-base-1.0"
    pipeline_class = StableDiffusionXLControlNetPipeline

    def __init__(self, args):
        super().__init__(args)
 class T2IAdapterBenchmark(ControlNetBenchmark):
    """T2I-Adapter benchmark.

    ``args.ckpt`` selects the adapter checkpoint; the base pipeline is loaded
    from ``root_ckpt``. ``run_inference`` is inherited from
    ``ControlNetBenchmark``.
    """

    pipeline_class = StableDiffusionAdapterPipeline
    aux_network_class = T2IAdapter
    root_ckpt = "Lykon/DreamShaper"
    url = "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/benchmarking/canny_for_adapter.png"
    image = load_image(url).convert("L")

    def __init__(self, args):
        half = torch.float16
        adapter = self.aux_network_class.from_pretrained(args.ckpt, torch_dtype=half)
        pipe = self.pipeline_class.from_pretrained(self.root_ckpt, adapter=adapter, torch_dtype=half).to("cuda")
        pipe.set_progress_bar_config(disable=True)
        self.pipe = pipe

        if args.run_compile:
            compile_opts = {"mode": "reduce-overhead", "fullgraph": True}
            pipe.unet.to(memory_format=torch.channels_last)
            pipe.adapter.to(memory_format=torch.channels_last)
            print("Run torch compile")
            pipe.unet = torch.compile(pipe.unet, **compile_opts)
            pipe.adapter = torch.compile(pipe.adapter, **compile_opts)

        target_size = RESOLUTION_MAPPING[args.ckpt]
        self.image = self.image.resize(target_size)
 class T2IAdapterSDXLBenchmark(T2IAdapterBenchmark):
    """T2I-Adapter benchmark that loads ``StableDiffusionXLAdapterPipeline`` from the SDXL base checkpoint."""

    root_ckpt = "stabilityai/stable-diffusion-xl-base-1.0"
    pipeline_class = StableDiffusionXLAdapterPipeline
    url = "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/benchmarking/canny_for_adapter_sdxl.png"
    # Unlike the parent class, the image is used as loaded (no `.convert(...)`).
    image = load_image(url)

    def __init__(self, args):
        super().__init__(args)
--- a/benchmarks/benchmark_controlnet.py
+++ b/benchmarks/benchmark_controlnet.py
@@ -1,26 +0,0 @@
 import argparse
 import sys
 sys.path.append(".")
 from base_classes import ControlNetBenchmark, ControlNetSDXLBenchmark  # noqa: E402
 if __name__ == "__main__":
    # CLI entry point: choose the benchmark class from --ckpt and run it once.
    SD_CANNY_CKPT = "lllyasviel/sd-controlnet-canny"
    SDXL_CANNY_CKPT = "diffusers/controlnet-canny-sdxl-1.0"

    cli = argparse.ArgumentParser()
    cli.add_argument("--ckpt", type=str, default=SD_CANNY_CKPT, choices=[SD_CANNY_CKPT, SDXL_CANNY_CKPT])
    cli.add_argument("--batch_size", type=int, default=1)
    cli.add_argument("--num_inference_steps", type=int, default=50)
    for switch in ("--model_cpu_offload", "--run_compile"):
        cli.add_argument(switch, action="store_true")
    args = cli.parse_args()

    if args.ckpt == SD_CANNY_CKPT:
        benchmark_pipe = ControlNetBenchmark(args)
    else:
        benchmark_pipe = ControlNetSDXLBenchmark(args)
    benchmark_pipe.benchmark(args)
--- a/benchmarks/benchmark_ip_adapters.py
+++ b/benchmarks/benchmark_ip_adapters.py
@@ -1,33 +0,0 @@
 import argparse
 import sys
 sys.path.append(".")
 from base_classes import IPAdapterTextToImageBenchmark  # noqa: E402
 # Base checkpoint -> (IP-Adapter repository, weight file name).
 IP_ADAPTER_CKPTS = {
    # because original SD v1.5 has been taken down.
    "Lykon/DreamShaper": ("h94/IP-Adapter", "ip-adapter_sd15.bin"),
    "stabilityai/stable-diffusion-xl-base-1.0": ("h94/IP-Adapter", "ip-adapter_sdxl.bin"),
 }
 # Must be a key of IP_ADAPTER_CKPTS. argparse does not validate a default
 # against `choices`, so a misspelt default only surfaces later as a KeyError
 # in the IP_ADAPTER_CKPTS lookup below.
 DEFAULT_CKPT = "stabilityai/stable-diffusion-xl-base-1.0"
 if __name__ == "__main__":
    # CLI entry point: benchmark the selected base checkpoint with its IP-Adapter.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--ckpt",
        type=str,
        default=DEFAULT_CKPT,
        choices=list(IP_ADAPTER_CKPTS.keys()),
    )
    parser.add_argument("--batch_size", type=int, default=1)
    parser.add_argument("--num_inference_steps", type=int, default=50)
    parser.add_argument("--model_cpu_offload", action="store_true")
    parser.add_argument("--run_compile", action="store_true")
    args = parser.parse_args()
    args.ip_adapter_id = IP_ADAPTER_CKPTS[args.ckpt]
    benchmark_pipe = IPAdapterTextToImageBenchmark(args)
    # Relabel the checkpoint only after the pipeline is built, so the label
    # affects the reported results and not what gets loaded.
    args.ckpt = f"{args.ckpt} (IP-Adapter)"
    benchmark_pipe.benchmark(args)
--- a/benchmarks/benchmark_sd_img.py
+++ b/benchmarks/benchmark_sd_img.py
@@ -1,29 +0,0 @@
 import argparse
 import sys
 sys.path.append(".")
 from base_classes import ImageToImageBenchmark, TurboImageToImageBenchmark  # noqa: E402
 if __name__ == "__main__":
    # CLI entry point: checkpoints whose id contains "turbo" use the turbo benchmark class.
    IMG2IMG_CKPTS = [
        "Lykon/DreamShaper",
        "stabilityai/stable-diffusion-2-1",
        "stabilityai/stable-diffusion-xl-refiner-1.0",
        "stabilityai/sdxl-turbo",
    ]

    cli = argparse.ArgumentParser()
    cli.add_argument("--ckpt", type=str, default="Lykon/DreamShaper", choices=IMG2IMG_CKPTS)
    cli.add_argument("--batch_size", type=int, default=1)
    cli.add_argument("--num_inference_steps", type=int, default=50)
    for switch in ("--model_cpu_offload", "--run_compile"):
        cli.add_argument(switch, action="store_true")
    args = cli.parse_args()

    benchmark_cls = TurboImageToImageBenchmark if "turbo" in args.ckpt else ImageToImageBenchmark
    benchmark_pipe = benchmark_cls(args)
    benchmark_pipe.benchmark(args)
--- a/benchmarks/benchmark_sd_inpainting.py
+++ b/benchmarks/benchmark_sd_inpainting.py
@@ -1,28 +0,0 @@
 import argparse
 import sys
 sys.path.append(".")
 from base_classes import InpaintingBenchmark  # noqa: E402
 if __name__ == "__main__":
    # CLI entry point: run the inpainting benchmark for the selected checkpoint.
    INPAINTING_CKPTS = [
        "Lykon/DreamShaper",
        "stabilityai/stable-diffusion-2-1",
        "stabilityai/stable-diffusion-xl-base-1.0",
    ]

    cli = argparse.ArgumentParser()
    cli.add_argument("--ckpt", type=str, default="Lykon/DreamShaper", choices=INPAINTING_CKPTS)
    cli.add_argument("--batch_size", type=int, default=1)
    cli.add_argument("--num_inference_steps", type=int, default=50)
    for switch in ("--model_cpu_offload", "--run_compile"):
        cli.add_argument(switch, action="store_true")
    args = cli.parse_args()

    benchmark_pipe = InpaintingBenchmark(args)
    benchmark_pipe.benchmark(args)
--- a/benchmarks/benchmark_t2i_adapter.py
+++ b/benchmarks/benchmark_t2i_adapter.py
@@ -1,28 +0,0 @@
 import argparse
 import sys
 sys.path.append(".")
 from base_classes import T2IAdapterBenchmark, T2IAdapterSDXLBenchmark  # noqa: E402
 if __name__ == "__main__":
    # CLI entry point: choose the adapter benchmark class from --ckpt and run it once.
    SD_ADAPTER_CKPT = "TencentARC/t2iadapter_canny_sd14v1"
    SDXL_ADAPTER_CKPT = "TencentARC/t2i-adapter-canny-sdxl-1.0"

    cli = argparse.ArgumentParser()
    cli.add_argument("--ckpt", type=str, default=SD_ADAPTER_CKPT, choices=[SD_ADAPTER_CKPT, SDXL_ADAPTER_CKPT])
    cli.add_argument("--batch_size", type=int, default=1)
    cli.add_argument("--num_inference_steps", type=int, default=50)
    for switch in ("--model_cpu_offload", "--run_compile"):
        cli.add_argument(switch, action="store_true")
    args = cli.parse_args()

    if args.ckpt == SD_ADAPTER_CKPT:
        benchmark_pipe = T2IAdapterBenchmark(args)
    else:
        benchmark_pipe = T2IAdapterSDXLBenchmark(args)
    benchmark_pipe.benchmark(args)
--- a/benchmarks/benchmark_t2i_lcm_lora.py
+++ b/benchmarks/benchmark_t2i_lcm_lora.py
@@ -1,23 +0,0 @@
 import argparse
 import sys
 sys.path.append(".")
 from base_classes import LCMLoRATextToImageBenchmark  # noqa: E402
 if __name__ == "__main__":
    # CLI entry point: run the LCM-LoRA benchmark (defaults to 4 inference steps).
    cli = argparse.ArgumentParser()
    cli.add_argument("--ckpt", type=str, default="stabilityai/stable-diffusion-xl-base-1.0")
    cli.add_argument("--batch_size", type=int, default=1)
    cli.add_argument("--num_inference_steps", type=int, default=4)
    for switch in ("--model_cpu_offload", "--run_compile"):
        cli.add_argument(switch, action="store_true")
    args = cli.parse_args()

    benchmark_pipe = LCMLoRATextToImageBenchmark(args)
    benchmark_pipe.benchmark(args)
--- a/benchmarks/benchmark_text_to_image.py
+++ b/benchmarks/benchmark_text_to_image.py
@@ -1,40 +0,0 @@
 import argparse
 import sys
 sys.path.append(".")
 from base_classes import TextToImageBenchmark, TurboTextToImageBenchmark  # noqa: E402
 # Checkpoints accepted by --ckpt.
 ALL_T2I_CKPTS = [
    "Lykon/DreamShaper",
    "segmind/SSD-1B",
    "stabilityai/stable-diffusion-xl-base-1.0",
    "kandinsky-community/kandinsky-2-2-decoder",
    "warp-ai/wuerstchen",
    "stabilityai/sdxl-turbo",
 ]
 if __name__ == "__main__":
    # CLI entry point: checkpoints whose id contains "turbo" use the turbo benchmark class.
    cli = argparse.ArgumentParser()
    cli.add_argument("--ckpt", type=str, default="Lykon/DreamShaper", choices=ALL_T2I_CKPTS)
    cli.add_argument("--batch_size", type=int, default=1)
    cli.add_argument("--num_inference_steps", type=int, default=50)
    for switch in ("--model_cpu_offload", "--run_compile"):
        cli.add_argument(switch, action="store_true")
    args = cli.parse_args()

    benchmark_cls = TurboTextToImageBenchmark if "turbo" in args.ckpt else TextToImageBenchmark
    benchmark_pipe = benchmark_cls(args)
    benchmark_pipe.benchmark(args)
--- a/benchmarks/benchmarking_flux.py
+++ b/benchmarks/benchmarking_flux.py
@@ -0,0 +1,98 @@
 from functools import partial
 import torch
 from benchmarking_utils import BenchmarkMixin, BenchmarkScenario, model_init_fn
 from diffusers import BitsAndBytesConfig, FluxTransformer2DModel
 from diffusers.utils.testing_utils import torch_device
 CKPT_ID = "black-forest-labs/FLUX.1-dev"
 RESULT_FILENAME = "flux.csv"
 def get_input_dict(**device_dtype_kwargs):
    """Build the keyword arguments for one benchmarked forward pass of the Flux transformer.

    Args:
        **device_dtype_kwargs: forwarded unchanged to every tensor constructor
            (the scenarios in this file pass ``device`` and ``dtype``).

    Returns:
        dict with keys ``hidden_states``, ``encoder_hidden_states``, ``img_ids``,
        ``txt_ids``, ``pooled_projections``, ``timestep`` and ``guidance``.
    """
    # resolution: 1024x1024
    # maximum sequence length 512
    image_seq_len = 4096
    text_seq_len = 512
    hidden_states = torch.randn(1, image_seq_len, 64, **device_dtype_kwargs)
    encoder_hidden_states = torch.randn(1, text_seq_len, 4096, **device_dtype_kwargs)
    pooled_prompt_embeds = torch.randn(1, 768, **device_dtype_kwargs)
    # One id row per token of the matching sequence. These two sizes used to be
    # swapped (512 rows for the image ids, 4096 for the text ids).
    image_ids = torch.ones(image_seq_len, 3, **device_dtype_kwargs)
    text_ids = torch.ones(text_seq_len, 3, **device_dtype_kwargs)
    timestep = torch.tensor([1.0], **device_dtype_kwargs)
    guidance = torch.tensor([1.0], **device_dtype_kwargs)
    return {
        "hidden_states": hidden_states,
        "encoder_hidden_states": encoder_hidden_states,
        "img_ids": image_ids,
        "txt_ids": text_ids,
        "pooled_projections": pooled_prompt_embeds,
        "timestep": timestep,
        "guidance": guidance,
    }
 if __name__ == "__main__":
    # Every scenario loads the same transformer checkpoint in bf16 and gets the
    # same input factory; they differ only in how the model is initialised.

    def _init_kwargs(**extra):
        # Fresh dict per scenario; `extra` entries are appended after the shared keys.
        return {
            "pretrained_model_name_or_path": CKPT_ID,
            "torch_dtype": torch.bfloat16,
            "subfolder": "transformer",
            **extra,
        }

    def _input_factory():
        # Fresh partial per scenario.
        return partial(get_input_dict, device=torch_device, dtype=torch.bfloat16)

    scenarios = [
        BenchmarkScenario(
            name=f"{CKPT_ID}-bf16",
            model_cls=FluxTransformer2DModel,
            model_init_kwargs=_init_kwargs(),
            get_model_input_dict=_input_factory(),
            model_init_fn=model_init_fn,
            compile_kwargs={"fullgraph": True},
        ),
        BenchmarkScenario(
            name=f"{CKPT_ID}-bnb-nf4",
            model_cls=FluxTransformer2DModel,
            model_init_kwargs=_init_kwargs(
                quantization_config=BitsAndBytesConfig(
                    load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16, bnb_4bit_quant_type="nf4"
                ),
            ),
            get_model_input_dict=_input_factory(),
            model_init_fn=model_init_fn,
        ),
        BenchmarkScenario(
            name=f"{CKPT_ID}-layerwise-upcasting",
            model_cls=FluxTransformer2DModel,
            model_init_kwargs=_init_kwargs(),
            get_model_input_dict=_input_factory(),
            model_init_fn=partial(model_init_fn, layerwise_upcasting=True),
        ),
        BenchmarkScenario(
            name=f"{CKPT_ID}-group-offload-leaf",
            model_cls=FluxTransformer2DModel,
            model_init_kwargs=_init_kwargs(),
            get_model_input_dict=_input_factory(),
            model_init_fn=partial(
                model_init_fn,
                group_offload_kwargs={
                    "onload_device": torch_device,
                    "offload_device": torch.device("cpu"),
                    "offload_type": "leaf_level",
                    "use_stream": True,
                    "non_blocking": True,
                },
            ),
        ),
    ]

    # Method name spelling ("bencmarks") is how this call site reaches benchmarking_utils.
    BenchmarkMixin().run_bencmarks_and_collate(scenarios, filename=RESULT_FILENAME)
--- a/benchmarks/benchmarking_ltx.py
+++ b/benchmarks/benchmarking_ltx.py
@@ -0,0 +1,80 @@
 from functools import partial
 import torch
 from benchmarking_utils import BenchmarkMixin, BenchmarkScenario, model_init_fn
 from diffusers import LTXVideoTransformer3DModel
 from diffusers.utils.testing_utils import torch_device
 # Checkpoint passed as `pretrained_model_name_or_path` (with `subfolder="transformer"`) in every scenario below.
 CKPT_ID = "Lightricks/LTX-Video-0.9.7-dev"
 # CSV file handed to `run_bencmarks_and_collate` as the destination for this script's results.
 RESULT_FILENAME = "ltx.csv"
 def get_input_dict(**device_dtype_kwargs):
    """Build the keyword inputs used to benchmark `LTXVideoTransformer3DModel`.

    Sizes correspond to a 512x704 video with 161 frames and a `max_sequence_length`
    of 256. `device_dtype_kwargs` is forwarded unchanged to every tensor factory
    call (the scripts pass `device` and `dtype`).
    """
    num_video_tokens = 7392
    num_text_tokens = 256
    # Entries are evaluated in the order written, so the random generator is consumed
    # in the same sequence: hidden_states, encoder_hidden_states, video_coords.
    return {
        "hidden_states": torch.randn(1, num_video_tokens, 128, **device_dtype_kwargs),
        "encoder_hidden_states": torch.randn(1, num_text_tokens, 4096, **device_dtype_kwargs),
        "encoder_attention_mask": torch.ones(1, num_text_tokens, **device_dtype_kwargs),
        "timestep": torch.tensor([1.0], **device_dtype_kwargs),
        "video_coords": torch.randn(1, 3, num_video_tokens, **device_dtype_kwargs),
    }
 if __name__ == "__main__":
    # Every scenario loads the same checkpoint in bf16 and is fed bf16 inputs on `torch_device`;
    # the scenarios differ only in how the model is initialized and whether compilation is measured.
    def _checkpoint_kwargs():
        # A fresh dict per scenario, so no two scenarios share one mutable mapping.
        return {
            "pretrained_model_name_or_path": CKPT_ID,
            "torch_dtype": torch.bfloat16,
            "subfolder": "transformer",
        }
    def _bf16_inputs():
        return partial(get_input_dict, device=torch_device, dtype=torch.bfloat16)
    # Baseline: default init; the only scenario that also requests a compiled measurement.
    plain_bf16 = BenchmarkScenario(
        name=f"{CKPT_ID}-bf16",
        model_cls=LTXVideoTransformer3DModel,
        model_init_kwargs=_checkpoint_kwargs(),
        get_model_input_dict=_bf16_inputs(),
        model_init_fn=model_init_fn,
        compile_kwargs={"fullgraph": True},
    )
    # Same checkpoint, initialized with layerwise casting enabled.
    layerwise = BenchmarkScenario(
        name=f"{CKPT_ID}-layerwise-upcasting",
        model_cls=LTXVideoTransformer3DModel,
        model_init_kwargs=_checkpoint_kwargs(),
        get_model_input_dict=_bf16_inputs(),
        model_init_fn=partial(model_init_fn, layerwise_upcasting=True),
    )
    # Same checkpoint, initialized with leaf-level group offloading to the CPU.
    leaf_offload = BenchmarkScenario(
        name=f"{CKPT_ID}-group-offload-leaf",
        model_cls=LTXVideoTransformer3DModel,
        model_init_kwargs=_checkpoint_kwargs(),
        get_model_input_dict=_bf16_inputs(),
        model_init_fn=partial(
            model_init_fn,
            group_offload_kwargs={
                "onload_device": torch_device,
                "offload_device": torch.device("cpu"),
                "offload_type": "leaf_level",
                "use_stream": True,
                "non_blocking": True,
            },
        ),
    )
    scenarios = [plain_bf16, layerwise, leaf_offload]
    runner = BenchmarkMixin()
    runner.run_bencmarks_and_collate(scenarios, filename=RESULT_FILENAME)
--- a/benchmarks/benchmarking_sdxl.py
+++ b/benchmarks/benchmarking_sdxl.py
@@ -0,0 +1,82 @@
 from functools import partial
 import torch
 from benchmarking_utils import BenchmarkMixin, BenchmarkScenario, model_init_fn
 from diffusers import UNet2DConditionModel
 from diffusers.utils.testing_utils import torch_device
 # Checkpoint passed as `pretrained_model_name_or_path` (with `subfolder="unet"`) in every scenario below.
 CKPT_ID = "stabilityai/stable-diffusion-xl-base-1.0"
 # CSV file handed to `run_bencmarks_and_collate` as the destination for this script's results.
 RESULT_FILENAME = "sdxl.csv"
 def get_input_dict(**device_dtype_kwargs):
    """Build the keyword inputs used to benchmark the SDXL `UNet2DConditionModel`.

    Sizes correspond to height 1024, width 1024 and a `max_sequence_length` of 77.
    `device_dtype_kwargs` is forwarded unchanged to every tensor factory call
    (the scripts pass `device` and `dtype`).
    """
    # Entries are evaluated in the order written, so the random generator is consumed
    # in the same sequence: sample, encoder_hidden_states, text_embeds.
    return {
        "sample": torch.randn(1, 4, 128, 128, **device_dtype_kwargs),
        "encoder_hidden_states": torch.randn(1, 77, 2048, **device_dtype_kwargs),
        "timestep": torch.tensor([1.0], **device_dtype_kwargs),
        "added_cond_kwargs": {
            "text_embeds": torch.randn(1, 1280, **device_dtype_kwargs),
            "time_ids": torch.ones(1, 6, **device_dtype_kwargs),
        },
    }
 if __name__ == "__main__":
    # Every scenario loads the same checkpoint in bf16 and is fed bf16 inputs on `torch_device`;
    # the scenarios differ only in how the model is initialized and whether compilation is measured.
    def _checkpoint_kwargs():
        # A fresh dict per scenario, so no two scenarios share one mutable mapping.
        return {
            "pretrained_model_name_or_path": CKPT_ID,
            "torch_dtype": torch.bfloat16,
            "subfolder": "unet",
        }
    def _bf16_inputs():
        return partial(get_input_dict, device=torch_device, dtype=torch.bfloat16)
    # Baseline: default init; the only scenario that also requests a compiled measurement.
    plain_bf16 = BenchmarkScenario(
        name=f"{CKPT_ID}-bf16",
        model_cls=UNet2DConditionModel,
        model_init_kwargs=_checkpoint_kwargs(),
        get_model_input_dict=_bf16_inputs(),
        model_init_fn=model_init_fn,
        compile_kwargs={"fullgraph": True},
    )
    # Same checkpoint, initialized with layerwise casting enabled.
    layerwise = BenchmarkScenario(
        name=f"{CKPT_ID}-layerwise-upcasting",
        model_cls=UNet2DConditionModel,
        model_init_kwargs=_checkpoint_kwargs(),
        get_model_input_dict=_bf16_inputs(),
        model_init_fn=partial(model_init_fn, layerwise_upcasting=True),
    )
    # Same checkpoint, initialized with leaf-level group offloading to the CPU.
    leaf_offload = BenchmarkScenario(
        name=f"{CKPT_ID}-group-offload-leaf",
        model_cls=UNet2DConditionModel,
        model_init_kwargs=_checkpoint_kwargs(),
        get_model_input_dict=_bf16_inputs(),
        model_init_fn=partial(
            model_init_fn,
            group_offload_kwargs={
                "onload_device": torch_device,
                "offload_device": torch.device("cpu"),
                "offload_type": "leaf_level",
                "use_stream": True,
                "non_blocking": True,
            },
        ),
    )
    scenarios = [plain_bf16, layerwise, leaf_offload]
    runner = BenchmarkMixin()
    runner.run_bencmarks_and_collate(scenarios, filename=RESULT_FILENAME)
--- a/benchmarks/benchmarking_utils.py
+++ b/benchmarks/benchmarking_utils.py
@@ -0,0 +1,244 @@
 import gc
 import inspect
 import logging
 import os
 import queue
 import threading
 from contextlib import nullcontext
 from dataclasses import dataclass
 from typing import Any, Callable, Dict, Optional, Union
 import pandas as pd
 import torch
 import torch.utils.benchmark as benchmark
 from diffusers.models.modeling_utils import ModelMixin
 from diffusers.utils.testing_utils import require_torch_gpu, torch_device
 # Configure the root logger at INFO level with a timestamped format.
 logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s: %(message)s")
 logger = logging.getLogger(__name__)
 # Number of untimed forward passes `_run_phase` executes before timing starts.
 NUM_WARMUP_ROUNDS = 5
 def benchmark_fn(f, *args, **kwargs):
    """Time `f(*args, **kwargs)` and return the mean runtime as a float with 3 decimals.

    The measurement uses `torch.utils.benchmark.Timer.blocked_autorange` with a
    single thread.
    """
    timer = benchmark.Timer(
        stmt="f(*args, **kwargs)",
        globals={"f": f, "args": args, "kwargs": kwargs},
        num_threads=1,
    )
    mean_runtime = timer.blocked_autorange().mean
    # Round-trip through a 3-decimal string, exactly like the CSV values are expected to look.
    return float(format(mean_runtime, ".3f"))
 def flush():
    """Release unreferenced Python objects and cached CUDA memory, then reset CUDA peak-memory stats.

    `torch.cuda.reset_max_memory_allocated()` is no longer called: it is deprecated
    and only forwards to `torch.cuda.reset_peak_memory_stats()`, which is invoked here
    directly, so the extra call added nothing but a deprecation warning.
    """
    gc.collect()
    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats()
 # Adapted from https://github.com/lucasb-eyer/cnn_vit_benchmarks/blob/15b665ff758e8062131353076153905cae00a71f/main.py
 def calculate_flops(model, input_dict):
    """Return the FLOP count of one `model` forward pass on `input_dict`.

    MACs are measured with `torchprofile.profile_macs` and doubled. A missing
    `torchprofile` installation propagates as `ModuleNotFoundError`.
    """
    from torchprofile import profile_macs
    # `profile_macs` takes positional args only, so the keyword inputs are laid out
    # in the order of `forward`'s positional parameters, with defaults filled in.
    forward_sig = inspect.signature(model.forward)
    positional_kinds = (
        inspect.Parameter.POSITIONAL_ONLY,
        inspect.Parameter.POSITIONAL_OR_KEYWORD,
    )
    positional_names = []
    for param in forward_sig.parameters.values():
        if param.kind in positional_kinds and param.name != "self":
            positional_names.append(param.name)
    call_args = forward_sig.bind_partial(**input_dict)
    call_args.apply_defaults()
    ordered_args = tuple(call_args.arguments[name] for name in positional_names)
    model.eval()
    with torch.no_grad():
        mac_count = profile_macs(model, ordered_args)
    # 1 MAC operation = 2 FLOPs (1 multiplication + 1 addition)
    return 2 * mac_count
 def calculate_params(model):
    """Return the total number of elements across all of `model`'s parameters."""
    total = 0
    for param in model.parameters():
        total += param.numel()
    return total
 # Users can define their own in case this doesn't suffice. For most cases,
 # it should be sufficient.
 def model_init_fn(model_cls, group_offload_kwargs=None, layerwise_upcasting=False, **init_kwargs):
    model = model_cls.from_pretrained(**init_kwargs).eval()
    if group_offload_kwargs and isinstance(group_offload_kwargs, dict):
        model.enable_group_offload(**group_offload_kwargs)
    else:
        model.to(torch_device)
    if layerwise_upcasting:
        model.enable_layerwise_casting(
            storage_dtype=torch.float8_e4m3fn, compute_dtype=init_kwargs.get("torch_dtype", torch.bfloat16)
        )
    return model
# Description of a single benchmark: which model to load, how to load it, and what to feed it.
@dataclass
 class BenchmarkScenario:
    # Label logged while running and stored in the "scenario" column of the results.
    name: str
    # Model class; handed to the init function as its first argument.
    model_cls: ModelMixin
    # Keyword arguments forwarded to the init function alongside `model_cls`.
    model_init_kwargs: Dict[str, Any]
    # Called as `model_init_fn(model_cls, **model_init_kwargs)` and must return the model to time.
    model_init_fn: Callable
    # Zero-argument callable returning the dict that is splatted into the model call.
    get_model_input_dict: Callable
    # When truthy, an extra phase runs after `model.compile(**compile_kwargs)`.
    compile_kwargs: Optional[Dict[str, Any]] = None
@require_torch_gpu
 class BenchmarkMixin:
    def pre_benchmark(self):
        flush()
        torch.compiler.reset()
    def post_benchmark(self, model):
        model.cpu()
        flush()
        torch.compiler.reset()
    @torch.no_grad()
    def run_benchmark(self, scenario: BenchmarkScenario):
        # 0) Basic stats
        logger.info(f"Running scenario: {scenario.name}.")
        try:
            model = model_init_fn(scenario.model_cls, **scenario.model_init_kwargs)
            num_params = round(calculate_params(model) / 1e9, 2)
            try:
                flops = round(calculate_flops(model, input_dict=scenario.get_model_input_dict()) / 1e9, 2)
            except Exception as e:
                logger.info(f"Problem in calculating FLOPs:\n{e}")
                flops = None
            model.cpu()
            del model
        except Exception as e:
            logger.info(f"Error while initializing the model and calculating FLOPs:\n{e}")
            return {}
        self.pre_benchmark()
        # 1) plain stats
        results = {}
        plain = None
        try:
            plain = self._run_phase(
                model_cls=scenario.model_cls,
                init_fn=scenario.model_init_fn,
                init_kwargs=scenario.model_init_kwargs,
                get_input_fn=scenario.get_model_input_dict,
                compile_kwargs=None,
            )
        except Exception as e:
            logger.info(f"Benchmark could not be run with the following error:\n{e}")
            return results
        # 2) compiled stats (if any)
        compiled = {"time": None, "memory": None}
        if scenario.compile_kwargs:
            try:
                compiled = self._run_phase(
                    model_cls=scenario.model_cls,
                    init_fn=scenario.model_init_fn,
                    init_kwargs=scenario.model_init_kwargs,
                    get_input_fn=scenario.get_model_input_dict,
                    compile_kwargs=scenario.compile_kwargs,
                )
            except Exception as e:
                logger.info(f"Compilation benchmark could not be run with the following error\n: {e}")
                if plain is None:
                    return results
        # 3) merge
        result = {
            "scenario": scenario.name,
            "model_cls": scenario.model_cls.__name__,
            "num_params_B": num_params,
            "flops_G": flops,
            "time_plain_s": plain["time"],
            "mem_plain_GB": plain["memory"],
            "time_compile_s": compiled["time"],
            "mem_compile_GB": compiled["memory"],
        }
        if scenario.compile_kwargs:
            result["fullgraph"] = scenario.compile_kwargs.get("fullgraph", False)
            result["mode"] = scenario.compile_kwargs.get("mode", "default")
        else:
            result["fullgraph"], result["mode"] = None, None
        return result
    def run_bencmarks_and_collate(self, scenarios: Union[BenchmarkScenario, list[BenchmarkScenario]], filename: str):
        if not isinstance(scenarios, list):
            scenarios = [scenarios]
        record_queue = queue.Queue()
        stop_signal = object()
        def _writer_thread():
            while True:
                item = record_queue.get()
                if item is stop_signal:
                    break
                df_row = pd.DataFrame([item])
                write_header = not os.path.exists(filename)
                df_row.to_csv(filename, mode="a", header=write_header, index=False)
                record_queue.task_done()
            record_queue.task_done()
        writer = threading.Thread(target=_writer_thread, daemon=True)
        writer.start()
        for s in scenarios:
            try:
                record = self.run_benchmark(s)
                if record:
                    record_queue.put(record)
                else:
                    logger.info(f"Record empty from scenario: {s.name}.")
            except Exception as e:
                logger.info(f"Running scenario ({s.name}) led to error:\n{e}")
        record_queue.put(stop_signal)
        logger.info(f"Results serialized to {filename=}.")
    def _run_phase(
        self,
        *,
        model_cls: ModelMixin,
        init_fn: Callable,
        init_kwargs: Dict[str, Any],
        get_input_fn: Callable,
        compile_kwargs: Optional[Dict[str, Any]],
    ) -> Dict[str, float]:
        # setup
        self.pre_benchmark()
        # init & (optional) compile
        model = init_fn(model_cls, **init_kwargs)
        if compile_kwargs:
            model.compile(**compile_kwargs)
        # build inputs
        inp = get_input_fn()
        # measure
        run_ctx = torch._inductor.utils.fresh_inductor_cache() if compile_kwargs else nullcontext()
        with run_ctx:
            for _ in range(NUM_WARMUP_ROUNDS):
                _ = model(**inp)
            time_s = benchmark_fn(lambda m, d: m(**d), model, inp)
        mem_gb = torch.cuda.max_memory_allocated() / (1024**3)
        mem_gb = round(mem_gb, 2)
        # teardown
        self.post_benchmark(model)
        del model
        return {"time": time_s, "memory": mem_gb}
--- a/benchmarks/benchmarking_wan.py
+++ b/benchmarks/benchmarking_wan.py
@@ -0,0 +1,74 @@
 from functools import partial
 import torch
 from benchmarking_utils import BenchmarkMixin, BenchmarkScenario, model_init_fn
 from diffusers import WanTransformer3DModel
 from diffusers.utils.testing_utils import torch_device
 # Checkpoint passed as `pretrained_model_name_or_path` (with `subfolder="transformer"`) in every scenario below.
 CKPT_ID = "Wan-AI/Wan2.1-T2V-14B-Diffusers"
 # CSV file handed to `run_bencmarks_and_collate` as the destination for this script's results.
 RESULT_FILENAME = "wan.csv"
 def get_input_dict(**device_dtype_kwargs):
    """Build the keyword inputs used to benchmark `WanTransformer3DModel`.

    Sizes correspond to height 480, width 832, 81 frames and a
    `max_sequence_length` of 512. `device_dtype_kwargs` is forwarded unchanged to
    every tensor factory call (the scripts pass `device` and `dtype`).
    """
    latent_shape = (1, 16, 21, 60, 104)
    text_shape = (1, 512, 4096)
    inputs = {}
    # Same creation order as the returned keys, so the random generator is consumed identically.
    inputs["hidden_states"] = torch.randn(*latent_shape, **device_dtype_kwargs)
    inputs["encoder_hidden_states"] = torch.randn(*text_shape, **device_dtype_kwargs)
    inputs["timestep"] = torch.tensor([1.0], **device_dtype_kwargs)
    return inputs
 if __name__ == "__main__":
    # Every scenario loads the same checkpoint in bf16 and is fed bf16 inputs on `torch_device`;
    # the scenarios differ only in how the model is initialized and whether compilation is measured.
    def _checkpoint_kwargs():
        # A fresh dict per scenario, so no two scenarios share one mutable mapping.
        return {
            "pretrained_model_name_or_path": CKPT_ID,
            "torch_dtype": torch.bfloat16,
            "subfolder": "transformer",
        }
    def _bf16_inputs():
        return partial(get_input_dict, device=torch_device, dtype=torch.bfloat16)
    # Baseline: default init; the only scenario that also requests a compiled measurement.
    plain_bf16 = BenchmarkScenario(
        name=f"{CKPT_ID}-bf16",
        model_cls=WanTransformer3DModel,
        model_init_kwargs=_checkpoint_kwargs(),
        get_model_input_dict=_bf16_inputs(),
        model_init_fn=model_init_fn,
        compile_kwargs={"fullgraph": True},
    )
    # Same checkpoint, initialized with layerwise casting enabled.
    layerwise = BenchmarkScenario(
        name=f"{CKPT_ID}-layerwise-upcasting",
        model_cls=WanTransformer3DModel,
        model_init_kwargs=_checkpoint_kwargs(),
        get_model_input_dict=_bf16_inputs(),
        model_init_fn=partial(model_init_fn, layerwise_upcasting=True),
    )
    # Same checkpoint, initialized with leaf-level group offloading to the CPU.
    leaf_offload = BenchmarkScenario(
        name=f"{CKPT_ID}-group-offload-leaf",
        model_cls=WanTransformer3DModel,
        model_init_kwargs=_checkpoint_kwargs(),
        get_model_input_dict=_bf16_inputs(),
        model_init_fn=partial(
            model_init_fn,
            group_offload_kwargs={
                "onload_device": torch_device,
                "offload_device": torch.device("cpu"),
                "offload_type": "leaf_level",
                "use_stream": True,
                "non_blocking": True,
            },
        ),
    )
    scenarios = [plain_bf16, layerwise, leaf_offload]
    runner = BenchmarkMixin()
    runner.run_bencmarks_and_collate(scenarios, filename=RESULT_FILENAME)
--- a/benchmarks/populate_into_db.py
+++ b/benchmarks/populate_into_db.py
@@ -0,0 +1,166 @@
 import argparse
 import os
 import sys
 import gpustat
 import pandas as pd
 import psycopg2
 import psycopg2.extras
 from psycopg2.extensions import register_adapter
 from psycopg2.extras import Json
 # Register psycopg2's `Json` wrapper as the adapter for `dict` values passed as query parameters.
 register_adapter(dict, Json)
 # CSV file read below and turned into measurement rows.
 FINAL_CSV_FILENAME = "collated_results.csv"
 # https://github.com/huggingface/transformers/blob/593e29c5e2a9b17baec010e8dc7c1431fed6e841/benchmark/init_db.sql#L27
 # Table receiving one row per benchmark run (see `_init_benchmark`).
 BENCHMARKS_TABLE_NAME = "benchmarks"
 # Table receiving one row per CSV record, keyed by `benchmark_id`.
 MEASUREMENTS_TABLE_NAME = "model_measurements"
 def _init_benchmark(conn, branch, commit_id, commit_msg):
    """Insert a row describing this run into the benchmarks table and return its `benchmark_id`.

    The row stores the repository name, the given branch/commit details and a
    metadata dict holding the name of the first GPU reported by `gpustat`.
    The insert is not committed here.
    """
    first_gpu = gpustat.GPUStatCollection.new_query()[0]
    metadata = {"gpu_name": first_gpu["name"]}
    row_values = ("huggingface/diffusers", branch, commit_id, commit_msg, metadata)
    with conn.cursor() as cur:
        cur.execute(
            f"INSERT INTO {BENCHMARKS_TABLE_NAME} (repository, branch, commit_id, commit_message, metadata) VALUES (%s, %s, %s, %s, %s) RETURNING benchmark_id",
            row_values,
        )
        inserted_row = cur.fetchone()
        benchmark_id = inserted_row[0]
        print(f"Initialised benchmark #{benchmark_id}")
        return benchmark_id
 def parse_args():
    """Parse the three positional CLI arguments: `branch`, `commit_id` and `commit_msg`."""
    parser = argparse.ArgumentParser()
    positional_help = (
        ("branch", "The branch name on which the benchmarking is performed."),
        ("commit_id", "The commit hash on which the benchmarking is performed."),
        ("commit_msg", "The commit message associated with the commit, truncated to 70 characters."),
    )
    for arg_name, help_text in positional_help:
        parser.add_argument(arg_name, type=str, help=help_text)
    return parser.parse_args()
 if __name__ == "__main__":
    # Flow: connect -> register this run in the benchmarks table -> insert one measurements
    # row per CSV record -> commit. Any failure prints the error and exits with status 1.
    args = parse_args()
    try:
        conn = psycopg2.connect(
            host=os.getenv("PGHOST"),
            database=os.getenv("PGDATABASE"),
            user=os.getenv("PGUSER"),
            password=os.getenv("PGPASSWORD"),
        )
        print("DB connection established successfully.")
    except Exception as e:
        print(f"Problem during DB init: {e}")
        sys.exit(1)
    try:
        benchmark_id = _init_benchmark(
            conn=conn,
            branch=args.branch,
            commit_id=args.commit_id,
            commit_msg=args.commit_msg,
        )
    except Exception as e:
        print(f"Problem during initializing benchmark: {e}")
        # Do not leave the connection open on the way out.
        conn.close()
        sys.exit(1)
    # Helper to cast values (or None) given a dtype
    def _cast_value(val, dtype: str):
        if pd.isna(val):
            return None
        if dtype == "text":
            return str(val).strip()
        if dtype == "float":
            try:
                return float(val)
            except (TypeError, ValueError):
                # Unparseable or non-numeric values are stored as NULL instead of aborting the upload.
                return None
        if dtype == "bool":
            s = str(val).strip().lower()
            if s in ("true", "t", "yes", "1"):
                return True
            if s in ("false", "f", "no", "0"):
                return False
            if val in (1, 1.0):
                return True
            if val in (0, 0.0):
                return False
            return None
        return val
    # CSV column -> cast applied before storing it in the measurements JSON.
    # A column missing from the CSV (e.g. "github_sha") yields None via `row.get`.
    column_dtypes = (
        ("scenario", "text"),
        ("model_cls", "text"),
        ("num_params_B", "float"),
        ("flops_G", "float"),
        ("time_plain_s", "float"),
        ("mem_plain_GB", "float"),
        ("time_compile_s", "float"),
        ("mem_compile_GB", "float"),
        ("fullgraph", "bool"),
        ("mode", "text"),
        ("github_sha", "text"),
    )
    cur = conn.cursor()
    try:
        df = pd.read_csv(FINAL_CSV_FILENAME)
        rows_to_insert = []
        for _, row in df.iterrows():
            measurements = {column: _cast_value(row.get(column), dtype) for column, dtype in column_dtypes}
            rows_to_insert.append((benchmark_id, measurements))
        # Batch-insert all rows
        insert_sql = f"""
        INSERT INTO {MEASUREMENTS_TABLE_NAME} (
            benchmark_id,
            measurements
        )
        VALUES (%s, %s);
        """
        psycopg2.extras.execute_batch(cur, insert_sql, rows_to_insert)
        conn.commit()
    except Exception as e:
        print(f"Exception: {e}")
        sys.exit(1)
    finally:
        # Runs on success and on failure (including the `sys.exit` above), so the cursor
        # and connection are always released; closing without a commit discards pending work.
        cur.close()
        conn.close()
--- a/benchmarks/push_results.py
+++ b/benchmarks/push_results.py
@@ -1,19 +1,19 @@
-import glob
+import os
 import sys
 import pandas as pd
 from huggingface_hub import hf_hub_download, upload_file
 from huggingface_hub.utils import EntryNotFoundError
-sys.path.append(".")
+REPO_ID = "diffusers/benchmarks"
 from utils import BASE_PATH, FINAL_CSV_FILE, GITHUB_SHA, REPO_ID, collate_csv  # noqa: E402
 def has_previous_benchmark() -> str:
    from run_all import FINAL_CSV_FILENAME
    csv_path = None
    try:
-        csv_path = hf_hub_download(repo_id=REPO_ID, repo_type="dataset", filename=FINAL_CSV_FILE)
+        csv_path = hf_hub_download(repo_id=REPO_ID, repo_type="dataset", filename=FINAL_CSV_FILENAME)
    except EntryNotFoundError:
        csv_path = None
    return csv_path
@@ -26,46 +26,50 @@ def filter_float(value):
 def push_to_hf_dataset():
-    all_csvs = sorted(glob.glob(f"{BASE_PATH}/*.csv"))
+    from run_all import FINAL_CSV_FILENAME, GITHUB_SHA
    collate_csv(all_csvs, FINAL_CSV_FILE)
    # If there's an existing benchmark file, we should report the changes.
    csv_path = has_previous_benchmark()
    if csv_path is not None:
-        current_results = pd.read_csv(FINAL_CSV_FILE)
+        current_results = pd.read_csv(FINAL_CSV_FILENAME)
        previous_results = pd.read_csv(csv_path)
        numeric_columns = current_results.select_dtypes(include=["float64", "int64"]).columns
        numeric_columns = [
            c for c in numeric_columns if c not in ["batch_size", "num_inference_steps", "actual_gpu_memory (gbs)"]
        ]
        for column in numeric_columns:
-            previous_results[column] = previous_results[column].map(lambda x: filter_float(x))
+            # get previous values as floats, aligned to current index
            prev_vals = previous_results[column].map(filter_float).reindex(current_results.index)
-            # Calculate the percentage change
+            # get current values as floats
-            current_results[column] = current_results[column].astype(float)
+            curr_vals = current_results[column].astype(float)
            previous_results[column] = previous_results[column].astype(float)
            percent_change = ((current_results[column] - previous_results[column]) / previous_results[column]) * 100
-            # Format the values with '+' or '-' sign and append to original values
+            # stringify the current values
-            current_results[column] = current_results[column].map(str) + percent_change.map(
+            curr_str = curr_vals.map(str)
-                lambda x: f" ({'+' if x > 0 else ''}{x:.2f}%)"
+
            # build an appendage only when prev exists and differs
            append_str = prev_vals.where(prev_vals.notnull() & (prev_vals != curr_vals), other=pd.NA).map(
                lambda x: f" ({x})" if pd.notnull(x) else ""
            )
            # There might be newly added rows. So, filter out the NaNs.
            current_results[column] = current_results[column].map(lambda x: x.replace(" (nan%)", ""))
-        # Overwrite the current result file.
+            # combine
-        current_results.to_csv(FINAL_CSV_FILE, index=False)
+            current_results[column] = curr_str + append_str
        os.remove(FINAL_CSV_FILENAME)
        current_results.to_csv(FINAL_CSV_FILENAME, index=False)
    commit_message = f"upload from sha: {GITHUB_SHA}" if GITHUB_SHA is not None else "upload benchmark results"
    upload_file(
        repo_id=REPO_ID,
-        path_in_repo=FINAL_CSV_FILE,
+        path_in_repo=FINAL_CSV_FILENAME,
-        path_or_fileobj=FINAL_CSV_FILE,
+        path_or_fileobj=FINAL_CSV_FILENAME,
        repo_type="dataset",
        commit_message=commit_message,
    )
    upload_file(
        repo_id="diffusers/benchmark-analyzer",
        path_in_repo=FINAL_CSV_FILENAME,
        path_or_fileobj=FINAL_CSV_FILENAME,
        repo_type="space",
        commit_message=commit_message,
    )
 if __name__ == "__main__":
--- a/benchmarks/requirements.txt
+++ b/benchmarks/requirements.txt
@@ -0,0 +1,6 @@
 pandas 
 psutil
 gpustat
 torchprofile
 bitsandbytes
 psycopg2==2.9.9
--- a/benchmarks/run_all.py
+++ b/benchmarks/run_all.py
@@ -1,101 +1,84 @@
 import glob
 import logging
 import os
 import subprocess
-import sys
+
-from typing import List
+import pandas as pd
-sys.path.append(".")
+logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s: %(message)s")
-from benchmark_text_to_image import ALL_T2I_CKPTS  # noqa: E402
+logger = logging.getLogger(__name__)
-
+PATTERN = "benchmarking_*.py"
-PATTERN = "benchmark_*.py"
+FINAL_CSV_FILENAME = "collated_results.csv"
 GITHUB_SHA = os.getenv("GITHUB_SHA", None)
 class SubprocessCallException(Exception):
    pass
-# Taken from `test_examples_utils.py`
+def run_command(command: list[str], return_stdout=False):
 def run_command(command: List[str], return_stdout=False):
    """
    Runs `command` with `subprocess.check_output` and will potentially return the `stdout`. Will also properly capture
    if an error occurred while running `command`
    """
    try:
        output = subprocess.check_output(command, stderr=subprocess.STDOUT)
-        if return_stdout:
+        if return_stdout and hasattr(output, "decode"):
-            if hasattr(output, "decode"):
+            return output.decode("utf-8")
                output = output.decode("utf-8")
            return output
    except subprocess.CalledProcessError as e:
-        raise SubprocessCallException(
+        raise SubprocessCallException(f"Command `{' '.join(command)}` failed with:\n{e.output.decode()}") from e
            f"Command `{' '.join(command)}` failed with the following error:\n\n{e.output.decode()}"
        ) from e
-def main():
+def merge_csvs(final_csv: str = "collated_results.csv"):
-    python_files = glob.glob(PATTERN)
+    all_csvs = glob.glob("*.csv")
    all_csvs = [f for f in all_csvs if f != final_csv]
    if not all_csvs:
        logger.info("No result CSVs found to merge.")
        return
-    for file in python_files:
+    df_list = []
-        print(f"****** Running file: {file} ******")
+    for f in all_csvs:
-
+        try:
-        # Run with canonical settings.
+            d = pd.read_csv(f)
-        if file != "benchmark_text_to_image.py" and file != "benchmark_ip_adapters.py":
+        except pd.errors.EmptyDataError:
-            command = f"python {file}"
+            # If a file existed but was zero‐bytes or corrupted, skip it
            run_command(command.split())
            command += " --run_compile"
            run_command(command.split())
    # Run variants.
    for file in python_files:
        # See: https://github.com/pytorch/pytorch/issues/129637
        if file == "benchmark_ip_adapters.py":
            continue
        df_list.append(d)
-        if file == "benchmark_text_to_image.py":
+    if not df_list:
-            for ckpt in ALL_T2I_CKPTS:
+        logger.info("All result CSVs were empty or invalid; nothing to merge.")
-                command = f"python {file} --ckpt {ckpt}"
+        return
-                if "turbo" in ckpt:
+    final_df = pd.concat(df_list, ignore_index=True)
-                    command += " --num_inference_steps 1"
+    if GITHUB_SHA is not None:
        final_df["github_sha"] = GITHUB_SHA
    final_df.to_csv(final_csv, index=False)
    logger.info(f"Merged {len(all_csvs)} partial CSVs → {final_csv}.")
                run_command(command.split())
-                command += " --run_compile"
+def run_scripts():
-                run_command(command.split())
+    python_files = sorted(glob.glob(PATTERN))
    python_files = [f for f in python_files if f != "benchmarking_utils.py"]
-        elif file == "benchmark_sd_img.py":
+    for file in python_files:
-            for ckpt in ["stabilityai/stable-diffusion-xl-refiner-1.0", "stabilityai/sdxl-turbo"]:
+        script_name = file.split(".py")[0].split("_")[-1]  # example: benchmarking_foo.py -> foo
-                command = f"python {file} --ckpt {ckpt}"
+        logger.info(f"\n****** Running file: {file} ******")
-                if ckpt == "stabilityai/sdxl-turbo":
+        partial_csv = f"{script_name}.csv"
-                    command += " --num_inference_steps 2"
+        if os.path.exists(partial_csv):
            logger.info(f"Found {partial_csv}. Removing for safer numbers and duplication.")
            os.remove(partial_csv)
-                run_command(command.split())
+        command = ["python", file]
-                command += " --run_compile"
+        try:
-                run_command(command.split())
+            run_command(command)
            logger.info(f"→ {file} finished normally.")
        except SubprocessCallException as e:
            logger.info(f"Error running {file}:\n{e}")
        finally:
            logger.info(f"→ Merging partial CSVs after {file} …")
            merge_csvs(final_csv=FINAL_CSV_FILENAME)
-        elif file in ["benchmark_sd_inpainting.py", "benchmark_ip_adapters.py"]:
+    logger.info(f"\nAll scripts attempted. Final collated CSV: {FINAL_CSV_FILENAME}")
            sdxl_ckpt = "stabilityai/stable-diffusion-xl-base-1.0"
            command = f"python {file} --ckpt {sdxl_ckpt}"
            run_command(command.split())
            command += " --run_compile"
            run_command(command.split())
        elif file in ["benchmark_controlnet.py", "benchmark_t2i_adapter.py"]:
            sdxl_ckpt = (
                "diffusers/controlnet-canny-sdxl-1.0"
                if "controlnet" in file
                else "TencentARC/t2i-adapter-canny-sdxl-1.0"
            )
            command = f"python {file} --ckpt {sdxl_ckpt}"
            run_command(command.split())
            command += " --run_compile"
            run_command(command.split())
 if __name__ == "__main__":
-    main()
+    run_scripts()
--- a/benchmarks/utils.py
+++ b/benchmarks/utils.py
@@ -1,98 +0,0 @@
 import argparse
 import csv
 import gc
 import os
 from dataclasses import dataclass
 from typing import Dict, List, Union
 import torch
 import torch.utils.benchmark as benchmark
 # Value of the GITHUB_SHA environment variable (None when unset); stored in the "github_sha" column.
 GITHUB_SHA = os.getenv("GITHUB_SHA", None)
 # Column names, in order, used as the header of every CSV written by this module.
 BENCHMARK_FIELDS = [
    "pipeline_cls",
    "ckpt_id",
    "batch_size",
    "num_inference_steps",
    "model_cpu_offload",
    "run_compile",
    "time (secs)",
    "memory (gbs)",
    "actual_gpu_memory (gbs)",
    "github_sha",
 ]
 PROMPT = "ghibli style, a fantasy landscape with castles"
 BASE_PATH = os.getenv("BASE_PATH", ".")
 # Total memory of GPU 0 in GiB. The TOTAL_GPU_MEMORY environment variable takes precedence; the
 # device is only queried when the variable is absent, so an explicit override never touches CUDA.
 _TOTAL_GPU_MEMORY_OVERRIDE = os.getenv("TOTAL_GPU_MEMORY")
 TOTAL_GPU_MEMORY = (
    float(_TOTAL_GPU_MEMORY_OVERRIDE)
    if _TOTAL_GPU_MEMORY_OVERRIDE is not None
    else torch.cuda.get_device_properties(0).total_memory / (1024**3)
 )
 REPO_ID = "diffusers/benchmarks"
 FINAL_CSV_FILE = "collated_results.csv"
@dataclass
 class BenchmarkInfo:
    """Pair of time and memory measurements consumed by `generate_csv_dict`."""
    # Written to the "time (secs)" CSV column.
    time: float
    # Written to the "memory (gbs)" CSV column.
    memory: float
 def flush():
    """Release unused memory and reset CUDA peak-memory statistics.

    Runs the Python garbage collector, empties PyTorch's CUDA caching allocator, and resets the
    peak memory counters.
    """
    gc.collect()
    torch.cuda.empty_cache()
    # `torch.cuda.reset_max_memory_allocated()` is deprecated and simply forwards to
    # `reset_peak_memory_stats()`, so one call resets every peak counter.
    torch.cuda.reset_peak_memory_stats()
 def bytes_to_giga_bytes(bytes):
    """Convert a byte count to GiB and format it as a string with three decimal places."""
    gib = bytes / (1024**3)
    return f"{gib:.3f}"
 def benchmark_fn(f, *args, **kwargs):
    """Time `f(*args, **kwargs)` with `torch.utils.benchmark` and return the mean as a string.

    The callable and its arguments are handed to the timer through `globals`, the timer uses as
    many threads as `torch.get_num_threads()` reports, and the mean of `blocked_autorange()` is
    formatted with three decimal places.
    """
    timer = benchmark.Timer(
        stmt="f(*args, **kwargs)",
        globals={"f": f, "args": args, "kwargs": kwargs},
        num_threads=torch.get_num_threads(),
    )
    measurement = timer.blocked_autorange()
    return f"{measurement.mean:.3f}"
 def generate_csv_dict(
    pipeline_cls: str, ckpt: str, args: argparse.Namespace, benchmark_info: BenchmarkInfo
 ) -> Dict[str, Union[str, bool, float]]:
    """Build one CSV row (as a dict) from the run configuration and its measurements for later serialization.

    Args:
        pipeline_cls: Stored under "pipeline_cls".
        ckpt: Stored under "ckpt_id".
        args: Namespace providing `batch_size`, `num_inference_steps`, `model_cpu_offload`
            and `run_compile`.
        benchmark_info: Supplies the "time (secs)" and "memory (gbs)" values.

    Returns:
        A dictionary with the same keys as `BENCHMARK_FIELDS`.
    """
    # Module-level values that do not depend on the arguments.
    total_gpu_memory = f"{(TOTAL_GPU_MEMORY):.3f}"
    return {
        "pipeline_cls": pipeline_cls,
        "ckpt_id": ckpt,
        "batch_size": args.batch_size,
        "num_inference_steps": args.num_inference_steps,
        "model_cpu_offload": args.model_cpu_offload,
        "run_compile": args.run_compile,
        "time (secs)": benchmark_info.time,
        "memory (gbs)": benchmark_info.memory,
        "actual_gpu_memory (gbs)": total_gpu_memory,
        "github_sha": GITHUB_SHA,
    }
 def write_to_csv(file_name: str, data_dict: Dict[str, Union[str, bool, float]]):
    """Write `data_dict` to `file_name` as a CSV holding a header line and a single row.

    The file is opened in write mode, so any existing content is replaced. Columns follow
    `BENCHMARK_FIELDS`.
    """
    with open(file_name, mode="w", newline="") as handle:
        row_writer = csv.DictWriter(handle, fieldnames=BENCHMARK_FIELDS)
        row_writer.writeheader()
        row_writer.writerow(data_dict)
 def collate_csv(input_files: List[str], output_file: str):
    """Collate multiple identically structured CSVs into a single CSV file.

    Args:
        input_files: Paths of the CSV files to read, in the order their rows should appear.
        output_file: Path of the CSV to create; it is opened in write mode and gets a single
            header built from `BENCHMARK_FIELDS`.
    """
    with open(output_file, mode="w", newline="") as outfile:
        writer = csv.DictWriter(outfile, fieldnames=BENCHMARK_FIELDS)
        writer.writeheader()
        for file in input_files:
            # The csv module asks for newline="" on files it reads as well as writes, so that
            # newlines embedded in quoted fields are parsed correctly.
            with open(file, mode="r", newline="") as infile:
                writer.writerows(csv.DictReader(infile))
--- a/docker/diffusers-doc-builder/Dockerfile
+++ b/docker/diffusers-doc-builder/Dockerfile
@@ -47,6 +47,10 @@ RUN python3.10 -m pip install --no-cache-dir --upgrade pip uv==0.1.11 && \
        tensorboard \
        transformers \
        matplotlib \
-        setuptools==69.5.1
+        setuptools==69.5.1 \
        bitsandbytes \
        torchao \
        gguf \
        optimum-quanto
 CMD ["/bin/bash"]
--- a/docker/diffusers-pytorch-compile-cuda/Dockerfile
+++ b/docker/diffusers-pytorch-compile-cuda/Dockerfile
@@ -1,50 +0,0 @@
 # CUDA 12.1 runtime image with Python 3.10 and the heavy Python dependencies pre-installed.
 FROM nvidia/cuda:12.1.0-runtime-ubuntu20.04
 LABEL maintainer="Hugging Face"
 LABEL repository="diffusers"
 # Keep apt from prompting during the build.
 ENV DEBIAN_FRONTEND=noninteractive
 # The deadsnakes PPA provides the python3.10 packages installed below.
 RUN apt-get -y update \
    && apt-get install -y software-properties-common \
    && add-apt-repository ppa:deadsnakes/ppa
 # Use apt-get (as above) rather than apt, whose CLI is not meant for scripted use.
 RUN apt-get install -y bash \
    build-essential \
    git \
    git-lfs \
    curl \
    ca-certificates \
    libsndfile1-dev \
    libgl1 \
    python3.10 \
    python3.10-dev \
    python3-pip \
    python3.10-venv && \
    rm -rf /var/lib/apt/lists
 # make sure to use venv
 RUN python3.10 -m venv /opt/venv
 ENV PATH="/opt/venv/bin:$PATH"
 # pre-install the heavy dependencies (these can later be overridden by the deps from setup.py)
 RUN python3.10 -m pip install --no-cache-dir --upgrade pip uv==0.1.11 && \
    python3.10 -m uv pip install --no-cache-dir \
    torch \
    torchvision \
    torchaudio \
    invisible_watermark && \
    python3.10 -m pip install --no-cache-dir \
    accelerate \
    datasets \
    hf-doc-builder \
    huggingface-hub \
    hf_transfer \
    Jinja2 \
    librosa \
    numpy==1.26.4 \
    scipy \
    tensorboard \
    transformers
 CMD ["/bin/bash"]
--- a/docs/TRANSLATING.md
+++ b/docs/TRANSLATING.md
@@ -1,4 +1,4 @@
-<!--Copyright 2024 The HuggingFace Team. All rights reserved.
+<!--Copyright 2025 The HuggingFace Team. All rights reserved.
 Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 the License. You may obtain a copy of the License at
--- a/docs/source/en/_toctree.yml
+++ b/docs/source/en/_toctree.yml
@@ -1,38 +1,39 @@
- sections:
+- title: Get started
  sections:
  - local: index
-    title: 🧨 Diffusers
+    title: Diffusers
  - local: quicktour
    title: Quicktour
  - local: stable_diffusion
    title: Effective and efficient diffusion
  - local: installation
    title: Installation
-  title: Get started
+  - local: quicktour
- sections:
+    title: Quickstart
-  - local: tutorials/tutorial_overview
+  - local: stable_diffusion
-    title: Overview
+    title: Basic performance
-  - local: using-diffusers/write_own_pipeline
+
-    title: Understanding pipelines, models and schedulers
+- title: DiffusionPipeline
-  - local: tutorials/autopipeline
+  isExpanded: false
-    title: AutoPipeline
+  sections:
  - local: tutorials/basic_training
    title: Train a diffusion model
  - local: tutorials/fast_diffusion
    title: Accelerate inference of text-to-image diffusion models
  title: Tutorials
 - sections:
  - local: using-diffusers/loading
    title: Load pipelines
  - local: tutorials/autopipeline
    title: AutoPipeline
  - local: using-diffusers/custom_pipeline_overview
-    title: Load community pipelines and components
+    title: Community pipelines and components
  - local: using-diffusers/callback
    title: Pipeline callbacks
  - local: using-diffusers/reusing_seeds
    title: Reproducible pipelines
  - local: using-diffusers/schedulers
    title: Load schedulers and models
  - local: using-diffusers/scheduler_features
    title: Scheduler features
  - local: using-diffusers/other-formats
    title: Model files and layouts
  - local: using-diffusers/push_to_hub
    title: Push files to the Hub
-  title: Load pipelines and adapters
+
- sections:
+- title: Adapters
  isExpanded: false
  sections:
  - local: tutorials/using_peft_for_inference
    title: LoRA
  - local: using-diffusers/ip_adapter
@@ -45,27 +46,16 @@
    title: DreamBooth
  - local: using-diffusers/textual_inversion_inference
    title: Textual inversion
-  title: Adapters
+
 - title: Inference
  isExpanded: false
- sections:
+  sections:
-  - local: using-diffusers/unconditional_image_generation
+  - local: using-diffusers/weighted_prompts
-    title: Unconditional image generation
+    title: Prompt techniques
  - local: using-diffusers/conditional_image_generation
    title: Text-to-image
  - local: using-diffusers/img2img
    title: Image-to-image
  - local: using-diffusers/inpaint
    title: Inpainting
  - local: using-diffusers/text-img2vid
    title: Video generation
  - local: using-diffusers/depth2img
    title: Depth-to-image
  title: Generative tasks
 - sections:
  - local: using-diffusers/overview_techniques
    title: Overview
  - local: using-diffusers/create_a_server
    title: Create a server
  - local: using-diffusers/batched_inference
    title: Batch inference
  - local: training/distributed_inference
    title: Distributed inference
  - local: using-diffusers/scheduler_features
@@ -76,14 +66,38 @@
    title: Reproducible pipelines
  - local: using-diffusers/image_quality
    title: Controlling image quality
-  - local: using-diffusers/weighted_prompts
+
-    title: Prompt techniques
+- title: Inference optimization
-  title: Inference techniques
+  isExpanded: false
- sections:
+  sections:
-  - local: advanced_inference/outpaint
+  - local: optimization/fp16
-    title: Outpainting
+    title: Accelerate inference
-  title: Advanced inference
+  - local: optimization/cache
- sections:
+    title: Caching
  - local: optimization/memory
    title: Reduce memory usage
  - local: optimization/speed-memory-optims
    title: Compile and offloading quantized models
  - title: Community optimizations
    sections:
    - local: optimization/pruna
      title: Pruna
    - local: optimization/xformers
      title: xFormers
    - local: optimization/tome
      title: Token merging
    - local: optimization/deepcache
      title: DeepCache
    - local: optimization/tgate
      title: TGATE
    - local: optimization/xdit
      title: xDiT
    - local: optimization/para_attn
      title: ParaAttention
 - title: Hybrid Inference
  isExpanded: false
  sections:
  - local: hybrid_inference/overview
    title: Overview
  - local: hybrid_inference/vae_decode
@@ -92,10 +106,112 @@
    title: VAE Encode
  - local: hybrid_inference/api_reference
    title: API Reference
-  title: Hybrid Inference
+
- sections:
+- title: Modular Diffusers
-  - local: using-diffusers/cogvideox
+  isExpanded: false
-    title: CogVideoX
+  sections:
  - local: modular_diffusers/overview
    title: Overview
  - local: modular_diffusers/quickstart
    title: Quickstart
  - local: modular_diffusers/modular_diffusers_states
    title: States
  - local: modular_diffusers/pipeline_block
    title: ModularPipelineBlocks
  - local: modular_diffusers/sequential_pipeline_blocks
    title: SequentialPipelineBlocks
  - local: modular_diffusers/loop_sequential_pipeline_blocks
    title: LoopSequentialPipelineBlocks
  - local: modular_diffusers/auto_pipeline_blocks
    title: AutoPipelineBlocks
  - local: modular_diffusers/modular_pipeline
    title: ModularPipeline
  - local: modular_diffusers/components_manager
    title: ComponentsManager
  - local: modular_diffusers/guiders
    title: Guiders
 - title: Training
  isExpanded: false
  sections:
  - local: training/overview
    title: Overview
  - local: training/create_dataset
    title: Create a dataset for training
  - local: training/adapt_a_model
    title: Adapt a model to a new task
  - local: tutorials/basic_training
    title: Train a diffusion model
  - title: Models
    sections:
    - local: training/unconditional_training
      title: Unconditional image generation
    - local: training/text2image
      title: Text-to-image
    - local: training/sdxl
      title: Stable Diffusion XL
    - local: training/kandinsky
      title: Kandinsky 2.2
    - local: training/wuerstchen
      title: Wuerstchen
    - local: training/controlnet
      title: ControlNet
    - local: training/t2i_adapters
      title: T2I-Adapters
    - local: training/instructpix2pix
      title: InstructPix2Pix
    - local: training/cogvideox
      title: CogVideoX
  - title: Methods
    sections:
    - local: training/text_inversion
      title: Textual Inversion
    - local: training/dreambooth
      title: DreamBooth
    - local: training/lora
      title: LoRA
    - local: training/custom_diffusion
      title: Custom Diffusion
    - local: training/lcm_distill
      title: Latent Consistency Distillation
    - local: training/ddpo
      title: Reinforcement learning training with DDPO
 - title: Quantization
  isExpanded: false
  sections:
  - local: quantization/overview
    title: Getting started
  - local: quantization/bitsandbytes
    title: bitsandbytes
  - local: quantization/gguf
    title: gguf
  - local: quantization/torchao
    title: torchao
  - local: quantization/quanto
    title: quanto
 - title: Model accelerators and hardware
  isExpanded: false
  sections:
  - local: using-diffusers/stable_diffusion_jax_how_to
    title: JAX/Flax
  - local: optimization/onnx
    title: ONNX
  - local: optimization/open_vino
    title: OpenVINO
  - local: optimization/coreml
    title: Core ML
  - local: optimization/mps
    title: Metal Performance Shaders (MPS)
  - local: optimization/habana
    title: Intel Gaudi
  - local: optimization/neuron
    title: AWS Neuron
 - title: Specific pipeline examples
  isExpanded: false
  sections:
  - local: using-diffusers/consisid
    title: ConsisID
  - local: using-diffusers/sdxl
@@ -120,102 +236,30 @@
    title: Stable Video Diffusion
  - local: using-diffusers/marigold_usage
    title: Marigold Computer Vision
-  title: Specific pipeline examples
+
- sections:
+- title: Resources
-  - local: training/overview
+  isExpanded: false
-    title: Overview
+  sections:
-  - local: training/create_dataset
+  - title: Task recipes
    title: Create a dataset for training
  - local: training/adapt_a_model
    title: Adapt a model to a new task
  - isExpanded: false
    sections:
-    - local: training/unconditional_training
+    - local: using-diffusers/unconditional_image_generation
      title: Unconditional image generation
-    - local: training/text2image
+    - local: using-diffusers/conditional_image_generation
      title: Text-to-image
-    - local: training/sdxl
+    - local: using-diffusers/img2img
-      title: Stable Diffusion XL
+      title: Image-to-image
-    - local: training/kandinsky
+    - local: using-diffusers/inpaint
-      title: Kandinsky 2.2
+      title: Inpainting
-    - local: training/wuerstchen
+    - local: advanced_inference/outpaint
-      title: Wuerstchen
+      title: Outpainting
-    - local: training/controlnet
+    - local: using-diffusers/text-img2vid
-      title: ControlNet
+      title: Video generation
-    - local: training/t2i_adapters
+    - local: using-diffusers/depth2img
-      title: T2I-Adapters
+      title: Depth-to-image
-    - local: training/instructpix2pix
+  - local: using-diffusers/write_own_pipeline
-      title: InstructPix2Pix
+    title: Understanding pipelines, models and schedulers
-    - local: training/cogvideox
+  - local: community_projects
-      title: CogVideoX
+    title: Projects built with Diffusers
    title: Models
  - isExpanded: false
    sections:
    - local: training/text_inversion
      title: Textual Inversion
    - local: training/dreambooth
      title: DreamBooth
    - local: training/lora
      title: LoRA
    - local: training/custom_diffusion
      title: Custom Diffusion
    - local: training/lcm_distill
      title: Latent Consistency Distillation
    - local: training/ddpo
      title: Reinforcement learning training with DDPO
    title: Methods
  title: Training
 - sections:
  - local: quantization/overview
    title: Getting Started
  - local: quantization/bitsandbytes
    title: bitsandbytes
  - local: quantization/gguf
    title: gguf
  - local: quantization/torchao
    title: torchao
  - local: quantization/quanto
    title: quanto
  title: Quantization Methods
 - sections:
  - local: optimization/fp16
    title: Accelerate inference
  - local: optimization/memory
    title: Reduce memory usage
  - local: optimization/torch2.0
    title: PyTorch 2.0
  - local: optimization/xformers
    title: xFormers
  - local: optimization/tome
    title: Token merging
  - local: optimization/deepcache
    title: DeepCache
  - local: optimization/tgate
    title: TGATE
  - local: optimization/xdit
    title: xDiT
  - local: optimization/para_attn
    title: ParaAttention
  - sections:
    - local: using-diffusers/stable_diffusion_jax_how_to
      title: JAX/Flax
    - local: optimization/onnx
      title: ONNX
    - local: optimization/open_vino
      title: OpenVINO
    - local: optimization/coreml
      title: Core ML
    title: Optimized model formats
  - sections:
    - local: optimization/mps
      title: Metal Performance Shaders (MPS)
    - local: optimization/habana
      title: Habana Gaudi
    - local: optimization/neuron
      title: AWS Neuron
    title: Optimized hardware
  title: Accelerate inference and reduce memory
 - sections:
  - local: conceptual/philosophy
    title: Philosophy
  - local: using-diffusers/controlling_generation
@@ -226,13 +270,11 @@
    title: Diffusers' Ethical Guidelines
  - local: conceptual/evaluation
    title: Evaluating Diffusion Models
-  title: Conceptual Guides
+
- sections:
+- title: API
-  - local: community_projects
+  isExpanded: false
-    title: Projects built with Diffusers
+  sections:
-  title: Community Projects
+  - title: Main Classes
 - sections:
  - isExpanded: false
    sections:
    - local: api/configuration
      title: Configuration
@@ -242,8 +284,19 @@
      title: Outputs
    - local: api/quantization
      title: Quantization
-    title: Main Classes
+  - title: Modular
-  - isExpanded: false
+    sections:
    - local: api/modular_diffusers/pipeline
      title: Pipeline
    - local: api/modular_diffusers/pipeline_blocks
      title: Blocks
    - local: api/modular_diffusers/pipeline_states
      title: States
    - local: api/modular_diffusers/pipeline_components
      title: Components and configs
    - local: api/modular_diffusers/guiders
      title: Guiders
  - title: Loaders
    sections:
    - local: api/loaders/ip_adapter
      title: IP-Adapter
@@ -259,14 +312,14 @@
      title: SD3Transformer2D
    - local: api/loaders/peft
      title: PEFT
-    title: Loaders
+  - title: Models
  - isExpanded: false
    sections:
    - local: api/models/overview
      title: Overview
    - local: api/models/auto_model
      title: AutoModel
-    - sections:
+    - title: ControlNets
      sections:
      - local: api/models/controlnet
        title: ControlNetModel
      - local: api/models/controlnet_union
@@ -281,12 +334,16 @@
        title: SD3ControlNetModel
      - local: api/models/controlnet_sparsectrl
        title: SparseControlNetModel
-      title: ControlNets
+    - title: Transformers
-    - sections:
+      sections:
      - local: api/models/allegro_transformer3d
        title: AllegroTransformer3DModel
      - local: api/models/aura_flow_transformer2d
        title: AuraFlowTransformer2DModel
      - local: api/models/bria_transformer
        title: BriaTransformer2DModel
      - local: api/models/chroma_transformer
        title: ChromaTransformer2DModel
      - local: api/models/cogvideox_transformer3d
        title: CogVideoXTransformer3DModel
      - local: api/models/cogview3plus_transformer2d
@@ -325,10 +382,14 @@
        title: PixArtTransformer2DModel
      - local: api/models/prior_transformer
        title: PriorTransformer
      - local: api/models/qwenimage_transformer2d
        title: QwenImageTransformer2DModel
      - local: api/models/sana_transformer2d
        title: SanaTransformer2DModel
      - local: api/models/sd3_transformer2d
        title: SD3Transformer2DModel
      - local: api/models/skyreels_v2_transformer_3d
        title: SkyReelsV2Transformer3DModel
      - local: api/models/stable_audio_transformer
        title: StableAudioDiTModel
      - local: api/models/transformer2d
@@ -337,8 +398,8 @@
        title: TransformerTemporalModel
      - local: api/models/wan_transformer_3d
        title: WanTransformer3DModel
-      title: Transformers
+    - title: UNets
-    - sections:
+      sections:
      - local: api/models/stable_cascade_unet
        title: StableCascadeUNet
      - local: api/models/unet
@@ -353,8 +414,8 @@
        title: UNetMotionModel
      - local: api/models/uvit2d
        title: UViT2DModel
-      title: UNets
+    - title: VAEs
-    - sections:
+      sections:
      - local: api/models/asymmetricautoencoderkl
        title: AsymmetricAutoencoderKL
      - local: api/models/autoencoder_dc
@@ -375,6 +436,8 @@
        title: AutoencoderKLMagvit
      - local: api/models/autoencoderkl_mochi
        title: AutoencoderKLMochi
      - local: api/models/autoencoderkl_qwenimage
        title: AutoencoderKLQwenImage
      - local: api/models/autoencoder_kl_wan
        title: AutoencoderKLWan
      - local: api/models/consistency_decoder_vae
@@ -385,9 +448,7 @@
        title: Tiny AutoEncoder
      - local: api/models/vq
        title: VQModel
-      title: VAEs
+  - title: Pipelines
    title: Models
  - isExpanded: false
    sections:
    - local: api/pipelines/overview
      title: Overview
@@ -409,6 +470,10 @@
      title: AutoPipeline
    - local: api/pipelines/blip_diffusion
      title: BLIP-Diffusion
    - local: api/pipelines/bria_3_2
      title: Bria 3.2
    - local: api/pipelines/chroma
      title: Chroma
    - local: api/pipelines/cogvideox
      title: CogVideoX
    - local: api/pipelines/cogview3
@@ -511,6 +576,8 @@
      title: PixArt-α
    - local: api/pipelines/pixart_sigma
      title: PixArt-Σ
    - local: api/pipelines/qwenimage
      title: QwenImage
    - local: api/pipelines/sana
      title: Sana
    - local: api/pipelines/sana_sprint
@@ -521,11 +588,14 @@
      title: Semantic Guidance
    - local: api/pipelines/shap_e
      title: Shap-E
    - local: api/pipelines/skyreels_v2
      title: SkyReels-V2
    - local: api/pipelines/stable_audio
      title: Stable Audio
    - local: api/pipelines/stable_cascade
      title: Stable Cascade
-    - sections:
+    - title: Stable Diffusion
      sections:
      - local: api/pipelines/stable_diffusion/overview
        title: Overview
      - local: api/pipelines/stable_diffusion/depth2img
@@ -562,7 +632,6 @@
        title: T2I-Adapter
      - local: api/pipelines/stable_diffusion/text2img
        title: Text-to-image
      title: Stable Diffusion
    - local: api/pipelines/stable_unclip
      title: Stable unCLIP
    - local: api/pipelines/text_to_video
@@ -581,8 +650,7 @@
      title: Wan
    - local: api/pipelines/wuerstchen
      title: Wuerstchen
-    title: Pipelines
+  - title: Schedulers
  - isExpanded: false
    sections:
    - local: api/schedulers/overview
      title: Overview
@@ -652,8 +720,7 @@
      title: UniPCMultistepScheduler
    - local: api/schedulers/vq_diffusion
      title: VQDiffusionScheduler
-    title: Schedulers
+  - title: Internal classes
  - isExpanded: false
    sections:
    - local: api/internal_classes_overview
      title: Overview
@@ -671,5 +738,3 @@
      title: VAE Image Processor
    - local: api/video_processor
      title: Video Processor
    title: Internal classes
  title: API
--- a/docs/source/en/advanced_inference/outpaint.md
+++ b/docs/source/en/advanced_inference/outpaint.md
@@ -1,4 +1,4 @@
-<!--Copyright 2024 The HuggingFace Team. All rights reserved.
+<!--Copyright 2025 The HuggingFace Team. All rights reserved.
 Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 the License. You may obtain a copy of the License at
--- a/docs/source/en/api/activations.md
+++ b/docs/source/en/api/activations.md
@@ -1,4 +1,4 @@
-<!--Copyright 2024 The HuggingFace Team. All rights reserved.
+<!--Copyright 2025 The HuggingFace Team. All rights reserved.
 Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 the License. You may obtain a copy of the License at
--- a/docs/source/en/api/attnprocessor.md
+++ b/docs/source/en/api/attnprocessor.md
@@ -1,4 +1,4 @@
-<!--Copyright 2024 The HuggingFace Team. All rights reserved.
+<!--Copyright 2025 The HuggingFace Team. All rights reserved.
 Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 the License. You may obtain a copy of the License at
--- a/docs/source/en/api/cache.md
+++ b/docs/source/en/api/cache.md
@@ -1,4 +1,4 @@
-<!-- Copyright 2024 The HuggingFace Team. All rights reserved.
+<!-- Copyright 2025 The HuggingFace Team. All rights reserved.
 Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 the License. You may obtain a copy of the License at
@@ -11,72 +11,26 @@ specific language governing permissions and limitations under the License. -->
 # Caching methods
-## Pyramid Attention Broadcast
+Cache methods speed up diffusion transformers by storing and reusing intermediate outputs of specific layers, such as attention and feedforward layers, instead of recalculating them at each inference step.
-[Pyramid Attention Broadcast](https://huggingface.co/papers/2408.12588) from Xuanlei Zhao, Xiaolong Jin, Kai Wang, Yang You.
+## CacheMixin
 Pyramid Attention Broadcast (PAB) is a method that speeds up inference in diffusion models by systematically skipping attention computations between successive inference steps and reusing cached attention states. The attention states are not very different between successive inference steps. The most prominent difference is in the spatial attention blocks, not as much in the temporal attention blocks, and finally the least in the cross attention blocks. Therefore, many cross attention computation blocks can be skipped, followed by the temporal and spatial attention blocks. By combining other techniques like sequence parallelism and classifier-free guidance parallelism, PAB achieves near real-time video generation.
 Enable PAB with [`~PyramidAttentionBroadcastConfig`] on any pipeline. For some benchmarks, refer to [this](https://github.com/huggingface/diffusers/pull/9562) pull request.
 ```python
 import torch
 from diffusers import CogVideoXPipeline, PyramidAttentionBroadcastConfig
 pipe = CogVideoXPipeline.from_pretrained("THUDM/CogVideoX-5b", torch_dtype=torch.bfloat16)
 pipe.to("cuda")
 # Increasing the value of `spatial_attention_timestep_skip_range[0]` or decreasing the value of
 # `spatial_attention_timestep_skip_range[1]` will decrease the interval in which pyramid attention
 # broadcast is active, leading to slower inference speeds. However, large intervals can lead to
 # poorer quality of generated videos.
 config = PyramidAttentionBroadcastConfig(
    spatial_attention_block_skip_range=2,
    spatial_attention_timestep_skip_range=(100, 800),
    current_timestep_callback=lambda: pipe.current_timestep,
 )
 pipe.transformer.enable_cache(config)
 ```
 ## Faster Cache
 [FasterCache](https://huggingface.co/papers/2410.19355) from Zhengyao Lv, Chenyang Si, Junhao Song, Zhenyu Yang, Yu Qiao, Ziwei Liu, Kwan-Yee K. Wong.
 FasterCache is a method that speeds up inference in diffusion transformers by:
 - Reusing attention states between successive inference steps, due to high similarity between them
 - Skipping unconditional branch prediction used in classifier-free guidance by revealing redundancies between unconditional and conditional branch outputs for the same timestep, and therefore approximating the unconditional branch output using the conditional branch output
 ```python
 import torch
 from diffusers import CogVideoXPipeline, FasterCacheConfig
 pipe = CogVideoXPipeline.from_pretrained("THUDM/CogVideoX-5b", torch_dtype=torch.bfloat16)
 pipe.to("cuda")
 config = FasterCacheConfig(
    spatial_attention_block_skip_range=2,
    spatial_attention_timestep_skip_range=(-1, 681),
    current_timestep_callback=lambda: pipe.current_timestep,
    attention_weight_callback=lambda _: 0.3,
    unconditional_batch_skip_range=5,
    unconditional_batch_timestep_skip_range=(-1, 781),
    tensor_format="BFCHW",
 )
 pipe.transformer.enable_cache(config)
 ```
 ### CacheMixin
 [[autodoc]] CacheMixin
-### PyramidAttentionBroadcastConfig
+## PyramidAttentionBroadcastConfig
 [[autodoc]] PyramidAttentionBroadcastConfig
 [[autodoc]] apply_pyramid_attention_broadcast
-### FasterCacheConfig
+## FasterCacheConfig
 [[autodoc]] FasterCacheConfig
 [[autodoc]] apply_faster_cache
 ### FirstBlockCacheConfig
 [[autodoc]] FirstBlockCacheConfig
 [[autodoc]] apply_first_block_cache
--- a/docs/source/en/api/configuration.md
+++ b/docs/source/en/api/configuration.md
@@ -1,4 +1,4 @@
-<!--Copyright 2024 The HuggingFace Team. All rights reserved.
+<!--Copyright 2025 The HuggingFace Team. All rights reserved.
 Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 the License. You may obtain a copy of the License at
@@ -16,7 +16,7 @@ Schedulers from [`~schedulers.scheduling_utils.SchedulerMixin`] and models from
 <Tip>
-To use private or [gated](https://huggingface.co/docs/hub/models-gated#gated-models) models, log-in with `huggingface-cli login`.
+To use private or [gated](https://huggingface.co/docs/hub/models-gated#gated-models) models, log-in with `hf auth login`.
 </Tip>
--- a/docs/source/en/api/image_processor.md
+++ b/docs/source/en/api/image_processor.md
@@ -1,4 +1,4 @@
-<!--Copyright 2024 The HuggingFace Team. All rights reserved.
+<!--Copyright 2025 The HuggingFace Team. All rights reserved.
 Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 the License. You may obtain a copy of the License at
--- a/docs/source/en/api/internal_classes_overview.md
+++ b/docs/source/en/api/internal_classes_overview.md
@@ -1,4 +1,4 @@
-<!--Copyright 2024 The HuggingFace Team. All rights reserved.
+<!--Copyright 2025 The HuggingFace Team. All rights reserved.
 Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 the License. You may obtain a copy of the License at
--- a/docs/source/en/api/loaders/ip_adapter.md
+++ b/docs/source/en/api/loaders/ip_adapter.md
@@ -1,4 +1,4 @@
-<!--Copyright 2024 The HuggingFace Team. All rights reserved.
+<!--Copyright 2025 The HuggingFace Team. All rights reserved.
 Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 the License. You may obtain a copy of the License at
--- a/docs/source/en/api/loaders/lora.md
+++ b/docs/source/en/api/loaders/lora.md
@@ -1,4 +1,4 @@
-<!--Copyright 2024 The HuggingFace Team. All rights reserved.
+<!--Copyright 2025 The HuggingFace Team. All rights reserved.
 Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 the License. You may obtain a copy of the License at
@@ -26,9 +26,11 @@ LoRA is a fast and lightweight training method that inserts and trains a signifi
 - [`HunyuanVideoLoraLoaderMixin`] provides similar functions for [HunyuanVideo](https://huggingface.co/docs/diffusers/main/en/api/pipelines/hunyuan_video).
 - [`Lumina2LoraLoaderMixin`] provides similar functions for [Lumina2](https://huggingface.co/docs/diffusers/main/en/api/pipelines/lumina2).
 - [`WanLoraLoaderMixin`] provides similar functions for [Wan](https://huggingface.co/docs/diffusers/main/en/api/pipelines/wan).
 - [`SkyReelsV2LoraLoaderMixin`] provides similar functions for [SkyReels-V2](https://huggingface.co/docs/diffusers/main/en/api/pipelines/skyreels_v2).
 - [`CogView4LoraLoaderMixin`] provides similar functions for [CogView4](https://huggingface.co/docs/diffusers/main/en/api/pipelines/cogview4).
 - [`AmusedLoraLoaderMixin`] is for the [`AmusedPipeline`].
 - [`HiDreamImageLoraLoaderMixin`] provides similar functions for [HiDream Image](https://huggingface.co/docs/diffusers/main/en/api/pipelines/hidream).
 - [`QwenImageLoraLoaderMixin`] provides similar functions for [Qwen Image](https://huggingface.co/docs/diffusers/main/en/api/pipelines/qwenimage).
 - [`LoraBaseMixin`] provides a base class with several utility methods to fuse, unfuse, unload, LoRAs and more.
 <Tip>
@@ -37,6 +39,10 @@ To learn more about how to load LoRA weights, see the [LoRA](../../using-diffuse
 </Tip>
 ## LoraBaseMixin
 [[autodoc]] loaders.lora_base.LoraBaseMixin
 ## StableDiffusionLoraLoaderMixin
 [[autodoc]] loaders.lora_pipeline.StableDiffusionLoraLoaderMixin
@@ -88,6 +94,10 @@ To learn more about how to load LoRA weights, see the [LoRA](../../using-diffuse
 [[autodoc]] loaders.lora_pipeline.WanLoraLoaderMixin
 ## SkyReelsV2LoraLoaderMixin
 [[autodoc]] loaders.lora_pipeline.SkyReelsV2LoraLoaderMixin
 ## AmusedLoraLoaderMixin
 [[autodoc]] loaders.lora_pipeline.AmusedLoraLoaderMixin
@@ -96,6 +106,10 @@ To learn more about how to load LoRA weights, see the [LoRA](../../using-diffuse
 [[autodoc]] loaders.lora_pipeline.HiDreamImageLoraLoaderMixin
 ## QwenImageLoraLoaderMixin
 [[autodoc]] loaders.lora_pipeline.QwenImageLoraLoaderMixin
 ## LoraBaseMixin
 [[autodoc]] loaders.lora_base.LoraBaseMixin
--- a/docs/source/en/api/loaders/peft.md
+++ b/docs/source/en/api/loaders/peft.md
@@ -1,4 +1,4 @@
-<!--Copyright 2024 The HuggingFace Team. All rights reserved.
+<!--Copyright 2025 The HuggingFace Team. All rights reserved.
 Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 the License. You may obtain a copy of the License at
--- a/docs/source/en/api/loaders/single_file.md
+++ b/docs/source/en/api/loaders/single_file.md
@@ -1,4 +1,4 @@
-<!--Copyright 2024 The HuggingFace Team. All rights reserved.
+<!--Copyright 2025 The HuggingFace Team. All rights reserved.
 Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 the License. You may obtain a copy of the License at
--- a/docs/source/en/api/loaders/textual_inversion.md
+++ b/docs/source/en/api/loaders/textual_inversion.md
@@ -1,4 +1,4 @@
-<!--Copyright 2024 The HuggingFace Team. All rights reserved.
+<!--Copyright 2025 The HuggingFace Team. All rights reserved.
 Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 the License. You may obtain a copy of the License at
--- a/docs/source/en/api/loaders/transformer_sd3.md
+++ b/docs/source/en/api/loaders/transformer_sd3.md
@@ -1,4 +1,4 @@
-<!--Copyright 2024 The HuggingFace Team. All rights reserved.
+<!--Copyright 2025 The HuggingFace Team. All rights reserved.
 Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 the License. You may obtain a copy of the License at
--- a/docs/source/en/api/loaders/unet.md
+++ b/docs/source/en/api/loaders/unet.md
@@ -1,4 +1,4 @@
-<!--Copyright 2024 The HuggingFace Team. All rights reserved.
+<!--Copyright 2025 The HuggingFace Team. All rights reserved.
 Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 the License. You may obtain a copy of the License at
--- a/docs/source/en/api/logging.md
+++ b/docs/source/en/api/logging.md
@@ -1,4 +1,4 @@
-<!--Copyright 2024 The HuggingFace Team. All rights reserved.
+<!--Copyright 2025 The HuggingFace Team. All rights reserved.
 Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 the License. You may obtain a copy of the License at
--- a/docs/source/en/api/models/allegro_transformer3d.md
+++ b/docs/source/en/api/models/allegro_transformer3d.md
@@ -1,4 +1,4 @@
-<!-- Copyright 2024 The HuggingFace Team. All rights reserved.
+<!-- Copyright 2025 The HuggingFace Team. All rights reserved.
 Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 the License. You may obtain a copy of the License at
--- a/docs/source/en/api/models/asymmetricautoencoderkl.md
+++ b/docs/source/en/api/models/asymmetricautoencoderkl.md
@@ -1,4 +1,4 @@
-<!--Copyright 2024 The HuggingFace Team. All rights reserved.
+<!--Copyright 2025 The HuggingFace Team. All rights reserved.
 Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 the License. You may obtain a copy of the License at
@@ -12,7 +12,7 @@ specific language governing permissions and limitations under the License.
 # AsymmetricAutoencoderKL
-Improved larger variational autoencoder (VAE) model with KL loss for inpainting task: [Designing a Better Asymmetric VQGAN for StableDiffusion](https://arxiv.org/abs/2306.04632) by Zixin Zhu, Xuelu Feng, Dongdong Chen, Jianmin Bao, Le Wang, Yinpeng Chen, Lu Yuan, Gang Hua.
+Improved larger variational autoencoder (VAE) model with KL loss for inpainting task: [Designing a Better Asymmetric VQGAN for StableDiffusion](https://huggingface.co/papers/2306.04632) by Zixin Zhu, Xuelu Feng, Dongdong Chen, Jianmin Bao, Le Wang, Yinpeng Chen, Lu Yuan, Gang Hua.
 The abstract from the paper is:
--- a/docs/source/en/api/models/aura_flow_transformer2d.md
+++ b/docs/source/en/api/models/aura_flow_transformer2d.md
@@ -1,4 +1,4 @@
-<!--Copyright 2024 The HuggingFace Team. All rights reserved.
+<!--Copyright 2025 The HuggingFace Team. All rights reserved.
 Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 the License. You may obtain a copy of the License at
--- a/docs/source/en/api/models/auto_model.md
+++ b/docs/source/en/api/models/auto_model.md
@@ -1,4 +1,4 @@
-<!--Copyright 2024 The HuggingFace Team. All rights reserved.
+<!--Copyright 2025 The HuggingFace Team. All rights reserved.
 Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 the License. You may obtain a copy of the License at
--- a/docs/source/en/api/models/autoencoder_dc.md
+++ b/docs/source/en/api/models/autoencoder_dc.md
@@ -1,4 +1,4 @@
-<!-- Copyright 2024 The HuggingFace Team. All rights reserved.
+<!-- Copyright 2025 The HuggingFace Team. All rights reserved.
 Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 the License. You may obtain a copy of the License at
--- a/docs/source/en/api/models/autoencoder_kl_hunyuan_video.md
+++ b/docs/source/en/api/models/autoencoder_kl_hunyuan_video.md
@@ -1,4 +1,4 @@
-<!-- Copyright 2024 The HuggingFace Team. All rights reserved.
+<!-- Copyright 2025 The HuggingFace Team. All rights reserved.
 Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 the License. You may obtain a copy of the License at
--- a/docs/source/en/api/models/autoencoder_kl_wan.md
+++ b/docs/source/en/api/models/autoencoder_kl_wan.md
@@ -1,4 +1,4 @@
-<!-- Copyright 2024 The HuggingFace Team. All rights reserved.
+<!-- Copyright 2025 The HuggingFace Team. All rights reserved.
 Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 the License. You may obtain a copy of the License at
--- a/docs/source/en/api/models/autoencoder_oobleck.md
+++ b/docs/source/en/api/models/autoencoder_oobleck.md
@@ -1,4 +1,4 @@
-<!--Copyright 2024 The HuggingFace Team. All rights reserved.
+<!--Copyright 2025 The HuggingFace Team. All rights reserved.
 Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 the License. You may obtain a copy of the License at
--- a/docs/source/en/api/models/autoencoder_tiny.md
+++ b/docs/source/en/api/models/autoencoder_tiny.md
@@ -1,4 +1,4 @@
-<!--Copyright 2024 The HuggingFace Team. All rights reserved.
+<!--Copyright 2025 The HuggingFace Team. All rights reserved.
 Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 the License. You may obtain a copy of the License at
--- a/docs/source/en/api/models/autoencoderkl.md
+++ b/docs/source/en/api/models/autoencoderkl.md
@@ -1,4 +1,4 @@
-<!--Copyright 2024 The HuggingFace Team. All rights reserved.
+<!--Copyright 2025 The HuggingFace Team. All rights reserved.
 Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 the License. You may obtain a copy of the License at
@@ -12,7 +12,7 @@ specific language governing permissions and limitations under the License.
 # AutoencoderKL
-The variational autoencoder (VAE) model with KL loss was introduced in [Auto-Encoding Variational Bayes](https://arxiv.org/abs/1312.6114v11) by Diederik P. Kingma and Max Welling. The model is used in 🤗 Diffusers to encode images into latents and to decode latent representations into images.
+The variational autoencoder (VAE) model with KL loss was introduced in [Auto-Encoding Variational Bayes](https://huggingface.co/papers/1312.6114v11) by Diederik P. Kingma and Max Welling. The model is used in 🤗 Diffusers to encode images into latents and to decode latent representations into images.
 The abstract from the paper is:
--- a/docs/source/en/api/models/autoencoderkl_allegro.md
+++ b/docs/source/en/api/models/autoencoderkl_allegro.md
@@ -1,4 +1,4 @@
-<!-- Copyright 2024 The HuggingFace Team. All rights reserved.
+<!-- Copyright 2025 The HuggingFace Team. All rights reserved.
 Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 the License. You may obtain a copy of the License at
--- a/docs/source/en/api/models/autoencoderkl_cogvideox.md
+++ b/docs/source/en/api/models/autoencoderkl_cogvideox.md
@@ -1,4 +1,4 @@
-<!--Copyright 2024 The HuggingFace Team. All rights reserved.
+<!--Copyright 2025 The HuggingFace Team. All rights reserved.
 Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 the License. You may obtain a copy of the License at
--- a/docs/source/en/api/models/autoencoderkl_cosmos.md
+++ b/docs/source/en/api/models/autoencoderkl_cosmos.md
@@ -1,4 +1,4 @@
-<!-- Copyright 2024 The HuggingFace Team. All rights reserved.
+<!-- Copyright 2025 The HuggingFace Team. All rights reserved.
 Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 the License. You may obtain a copy of the License at
--- a/docs/source/en/api/models/autoencoderkl_ltx_video.md
+++ b/docs/source/en/api/models/autoencoderkl_ltx_video.md
@@ -1,4 +1,4 @@
-<!-- Copyright 2024 The HuggingFace Team. All rights reserved.
+<!-- Copyright 2025 The HuggingFace Team. All rights reserved.
 Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 the License. You may obtain a copy of the License at
--- a/docs/source/en/api/models/autoencoderkl_mochi.md
+++ b/docs/source/en/api/models/autoencoderkl_mochi.md
@@ -1,4 +1,4 @@
-<!-- Copyright 2024 The HuggingFace Team. All rights reserved.
+<!-- Copyright 2025 The HuggingFace Team. All rights reserved.
 Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 the License. You may obtain a copy of the License at
--- a/docs/source/en/api/models/autoencoderkl_qwenimage.md
+++ b/docs/source/en/api/models/autoencoderkl_qwenimage.md
@@ -0,0 +1,35 @@
 <!-- Copyright 2025 The HuggingFace Team. All rights reserved.
 Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 the License. You may obtain a copy of the License at
 http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
 an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
 specific language governing permissions and limitations under the License. -->
 # AutoencoderKLQwenImage
 The model can be loaded with the following code snippet.
 ```python
 from diffusers import AutoencoderKLQwenImage
 vae = AutoencoderKLQwenImage.from_pretrained("Qwen/QwenImage-20B", subfolder="vae")
 ```
 ## AutoencoderKLQwenImage
 [[autodoc]] AutoencoderKLQwenImage
    - decode
    - encode
    - all
 ## AutoencoderKLOutput
 [[autodoc]] models.autoencoders.autoencoder_kl.AutoencoderKLOutput
 ## DecoderOutput
 [[autodoc]] models.autoencoders.vae.DecoderOutput
--- a/docs/source/en/api/models/bria_transformer.md
+++ b/docs/source/en/api/models/bria_transformer.md
@@ -0,0 +1,19 @@
 <!--Copyright 2025 The HuggingFace Team. All rights reserved.
 Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 the License. You may obtain a copy of the License at
 http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
 an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
 specific language governing permissions and limitations under the License.
 -->
 # BriaTransformer2DModel
 A modified Flux Transformer model from [Bria](https://huggingface.co/briaai/BRIA-3.2).
 ## BriaTransformer2DModel
 [[autodoc]] BriaTransformer2DModel
--- a/docs/source/en/api/models/chroma_transformer.md
+++ b/docs/source/en/api/models/chroma_transformer.md
@@ -0,0 +1,19 @@
 <!--Copyright 2025 The HuggingFace Team. All rights reserved.
 Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 the License. You may obtain a copy of the License at
 http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
 an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
 specific language governing permissions and limitations under the License.
 -->
 # ChromaTransformer2DModel
 A modified Flux Transformer model from [Chroma](https://huggingface.co/lodestones/Chroma).
 ## ChromaTransformer2DModel
 [[autodoc]] ChromaTransformer2DModel
--- a/docs/source/en/api/models/cogvideox_transformer3d.md
+++ b/docs/source/en/api/models/cogvideox_transformer3d.md
@@ -1,4 +1,4 @@
-<!--Copyright 2024 The HuggingFace Team. All rights reserved.
+<!--Copyright 2025 The HuggingFace Team. All rights reserved.
 Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 the License. You may obtain a copy of the License at
--- a/docs/source/en/api/models/cogview3plus_transformer2d.md
+++ b/docs/source/en/api/models/cogview3plus_transformer2d.md
@@ -1,4 +1,4 @@
-<!--Copyright 2024 The HuggingFace Team. All rights reserved.
+<!--Copyright 2025 The HuggingFace Team. All rights reserved.
 Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 the License. You may obtain a copy of the License at
--- a/docs/source/en/api/models/cogview4_transformer2d.md
+++ b/docs/source/en/api/models/cogview4_transformer2d.md
@@ -1,4 +1,4 @@
-<!--Copyright 2024 The HuggingFace Team. All rights reserved.
+<!--Copyright 2025 The HuggingFace Team. All rights reserved.
 Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 the License. You may obtain a copy of the License at
--- a/docs/source/en/api/models/consisid_transformer3d.md
+++ b/docs/source/en/api/models/consisid_transformer3d.md
@@ -1,4 +1,4 @@
-<!--Copyright 2024 The HuggingFace Team. All rights reserved.
+<!--Copyright 2025 The HuggingFace Team. All rights reserved.
 Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 the License. You may obtain a copy of the License at
@@ -11,7 +11,7 @@ specific language governing permissions and limitations under the License. -->
 # ConsisIDTransformer3DModel
-A Diffusion Transformer model for 3D data from [ConsisID](https://github.com/PKU-YuanGroup/ConsisID) was introduced in [Identity-Preserving Text-to-Video Generation by Frequency Decomposition](https://arxiv.org/pdf/2411.17440) by Peking University & University of Rochester & etc.
+A Diffusion Transformer model for 3D data from [ConsisID](https://github.com/PKU-YuanGroup/ConsisID) was introduced in [Identity-Preserving Text-to-Video Generation by Frequency Decomposition](https://huggingface.co/papers/2411.17440) by Peking University, University of Rochester, and others.
 The model can be loaded with the following code snippet.
--- a/docs/source/en/api/models/consistency_decoder_vae.md
+++ b/docs/source/en/api/models/consistency_decoder_vae.md
@@ -1,4 +1,4 @@
-<!--Copyright 2024 The HuggingFace Team. All rights reserved.
+<!--Copyright 2025 The HuggingFace Team. All rights reserved.
 Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 the License. You may obtain a copy of the License at
--- a/docs/source/en/api/models/controlnet.md
+++ b/docs/source/en/api/models/controlnet.md
@@ -1,4 +1,4 @@
-<!--Copyright 2024 The HuggingFace Team. All rights reserved.
+<!--Copyright 2025 The HuggingFace Team. All rights reserved.
 Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 the License. You may obtain a copy of the License at
--- a/docs/source/en/api/models/controlnet_flux.md
+++ b/docs/source/en/api/models/controlnet_flux.md
@@ -1,4 +1,4 @@
-<!--Copyright 2024 The HuggingFace Team and The InstantX Team. All rights reserved.
+<!--Copyright 2025 The HuggingFace Team and The InstantX Team. All rights reserved.
 Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 the License. You may obtain a copy of the License at
--- a/docs/source/en/api/models/controlnet_hunyuandit.md
+++ b/docs/source/en/api/models/controlnet_hunyuandit.md
@@ -1,4 +1,4 @@
-<!--Copyright 2024 The HuggingFace Team and Tencent Hunyuan Team. All rights reserved.
+<!--Copyright 2025 The HuggingFace Team and Tencent Hunyuan Team. All rights reserved.
 Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 the License. You may obtain a copy of the License at
@@ -12,7 +12,7 @@ specific language governing permissions and limitations under the License.
 # HunyuanDiT2DControlNetModel
-HunyuanDiT2DControlNetModel is an implementation of ControlNet for [Hunyuan-DiT](https://arxiv.org/abs/2405.08748).
+HunyuanDiT2DControlNetModel is an implementation of ControlNet for [Hunyuan-DiT](https://huggingface.co/papers/2405.08748).
 ControlNet was introduced in [Adding Conditional Control to Text-to-Image Diffusion Models](https://huggingface.co/papers/2302.05543) by Lvmin Zhang, Anyi Rao, and Maneesh Agrawala.
--- a/docs/source/en/api/models/controlnet_sana.md
+++ b/docs/source/en/api/models/controlnet_sana.md
@@ -1,4 +1,4 @@
-<!--Copyright 2024 The HuggingFace Team. All rights reserved.
+<!--Copyright 2025 The HuggingFace Team. All rights reserved.
 Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 the License. You may obtain a copy of the License at
--- a/docs/source/en/api/models/controlnet_sd3.md
+++ b/docs/source/en/api/models/controlnet_sd3.md
@@ -1,4 +1,4 @@
-<!--Copyright 2024 The HuggingFace Team and The InstantX Team. All rights reserved.
+<!--Copyright 2025 The HuggingFace Team and The InstantX Team. All rights reserved.
 Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 the License. You may obtain a copy of the License at
--- a/docs/source/en/api/models/controlnet_sparsectrl.md
+++ b/docs/source/en/api/models/controlnet_sparsectrl.md
@@ -1,4 +1,4 @@
-<!-- Copyright 2024 The HuggingFace Team. All rights reserved.
+<!-- Copyright 2025 The HuggingFace Team. All rights reserved.
 Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 the License. You may obtain a copy of the License at
@@ -11,11 +11,11 @@ specific language governing permissions and limitations under the License. -->
 # SparseControlNetModel
-SparseControlNetModel is an implementation of ControlNet for [AnimateDiff](https://arxiv.org/abs/2307.04725).
+SparseControlNetModel is an implementation of ControlNet for [AnimateDiff](https://huggingface.co/papers/2307.04725).
 ControlNet was introduced in [Adding Conditional Control to Text-to-Image Diffusion Models](https://huggingface.co/papers/2302.05543) by Lvmin Zhang, Anyi Rao, and Maneesh Agrawala.
-The SparseCtrl version of ControlNet was introduced in [SparseCtrl: Adding Sparse Controls to Text-to-Video Diffusion Models](https://arxiv.org/abs/2311.16933) for achieving controlled generation in text-to-video diffusion models by Yuwei Guo, Ceyuan Yang, Anyi Rao, Maneesh Agrawala, Dahua Lin, and Bo Dai.
+The SparseCtrl version of ControlNet was introduced in [SparseCtrl: Adding Sparse Controls to Text-to-Video Diffusion Models](https://huggingface.co/papers/2311.16933) for achieving controlled generation in text-to-video diffusion models by Yuwei Guo, Ceyuan Yang, Anyi Rao, Maneesh Agrawala, Dahua Lin, and Bo Dai.
 The abstract from the paper is:
--- a/docs/source/en/api/models/controlnet_union.md
+++ b/docs/source/en/api/models/controlnet_union.md
@@ -1,4 +1,4 @@
-<!--Copyright 2024 The HuggingFace Team and The InstantX Team. All rights reserved.
+<!--Copyright 2025 The HuggingFace Team and The InstantX Team. All rights reserved.
 Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 the License. You may obtain a copy of the License at
--- a/docs/source/en/api/models/cosmos_transformer3d.md
+++ b/docs/source/en/api/models/cosmos_transformer3d.md
@@ -1,4 +1,4 @@
-<!-- Copyright 2024 The HuggingFace Team. All rights reserved.
+<!-- Copyright 2025 The HuggingFace Team. All rights reserved.
 Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 the License. You may obtain a copy of the License at
--- a/docs/source/en/api/models/dit_transformer2d.md
+++ b/docs/source/en/api/models/dit_transformer2d.md
@@ -1,4 +1,4 @@
-<!--Copyright 2024 The HuggingFace Team. All rights reserved.
+<!--Copyright 2025 The HuggingFace Team. All rights reserved.
 Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 the License. You may obtain a copy of the License at
--- a/docs/source/en/api/models/flux_transformer.md
+++ b/docs/source/en/api/models/flux_transformer.md
@@ -1,4 +1,4 @@
-<!--Copyright 2024 The HuggingFace Team. All rights reserved.
+<!--Copyright 2025 The HuggingFace Team. All rights reserved.
 Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 the License. You may obtain a copy of the License at
--- a/docs/source/en/api/models/hidream_image_transformer.md
+++ b/docs/source/en/api/models/hidream_image_transformer.md
@@ -1,4 +1,4 @@
-<!-- Copyright 2024 The HuggingFace Team. All rights reserved.
+<!-- Copyright 2025 The HuggingFace Team. All rights reserved.
 Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 the License. You may obtain a copy of the License at
@@ -21,6 +21,22 @@ from diffusers import HiDreamImageTransformer2DModel
 transformer = HiDreamImageTransformer2DModel.from_pretrained("HiDream-ai/HiDream-I1-Full", subfolder="transformer", torch_dtype=torch.bfloat16)
 ```
 ## Loading GGUF quantized checkpoints for HiDream-I1
 GGUF checkpoints for the `HiDreamImageTransformer2DModel` can be loaded using `~FromOriginalModelMixin.from_single_file`:
 ```python
 import torch
 from diffusers import GGUFQuantizationConfig, HiDreamImageTransformer2DModel
 ckpt_path = "https://huggingface.co/city96/HiDream-I1-Dev-gguf/blob/main/hidream-i1-dev-Q2_K.gguf"
 transformer = HiDreamImageTransformer2DModel.from_single_file(
    ckpt_path,
    quantization_config=GGUFQuantizationConfig(compute_dtype=torch.bfloat16),
    torch_dtype=torch.bfloat16
 )
 ```
 ## HiDreamImageTransformer2DModel
 [[autodoc]] HiDreamImageTransformer2DModel
--- a/docs/source/en/api/models/hunyuan_transformer2d.md
+++ b/docs/source/en/api/models/hunyuan_transformer2d.md
@@ -1,4 +1,4 @@
-<!--Copyright 2024 The HuggingFace Team. All rights reserved.
+<!--Copyright 2025 The HuggingFace Team. All rights reserved.
 Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 the License. You may obtain a copy of the License at
--- a/docs/source/en/api/models/hunyuan_video_transformer_3d.md
+++ b/docs/source/en/api/models/hunyuan_video_transformer_3d.md
@@ -1,4 +1,4 @@
-<!-- Copyright 2024 The HuggingFace Team. All rights reserved.
+<!-- Copyright 2025 The HuggingFace Team. All rights reserved.
 Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 the License. You may obtain a copy of the License at
--- a/docs/source/en/api/models/latte_transformer3d.md
+++ b/docs/source/en/api/models/latte_transformer3d.md
@@ -1,4 +1,4 @@
-<!--Copyright 2024 The HuggingFace Team. All rights reserved.
+<!--Copyright 2025 The HuggingFace Team. All rights reserved.
 Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 the License. You may obtain a copy of the License at
--- a/docs/source/en/api/models/ltx_video_transformer3d.md
+++ b/docs/source/en/api/models/ltx_video_transformer3d.md
@@ -1,4 +1,4 @@
-<!-- Copyright 2024 The HuggingFace Team. All rights reserved.
+<!-- Copyright 2025 The HuggingFace Team. All rights reserved.
 Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 the License. You may obtain a copy of the License at
--- a/docs/source/en/api/models/lumina2_transformer2d.md
+++ b/docs/source/en/api/models/lumina2_transformer2d.md
@@ -1,4 +1,4 @@
-<!-- Copyright 2024 The HuggingFace Team. All rights reserved.
+<!-- Copyright 2025 The HuggingFace Team. All rights reserved.
 Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 the License. You may obtain a copy of the License at
--- a/docs/source/en/api/models/lumina_nextdit2d.md
+++ b/docs/source/en/api/models/lumina_nextdit2d.md
@@ -1,4 +1,4 @@
-<!--Copyright 2024 The HuggingFace Team. All rights reserved.
+<!--Copyright 2025 The HuggingFace Team. All rights reserved.
 Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 the License. You may obtain a copy of the License at
--- a/docs/source/en/api/models/mochi_transformer3d.md
+++ b/docs/source/en/api/models/mochi_transformer3d.md
@@ -1,4 +1,4 @@
-<!-- Copyright 2024 The HuggingFace Team. All rights reserved.
+<!-- Copyright 2025 The HuggingFace Team. All rights reserved.
 Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 the License. You may obtain a copy of the License at
--- a/docs/source/en/api/models/omnigen_transformer.md
+++ b/docs/source/en/api/models/omnigen_transformer.md
@@ -1,4 +1,4 @@
-<!--Copyright 2024 The HuggingFace Team. All rights reserved.
+<!--Copyright 2025 The HuggingFace Team. All rights reserved.
 Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 the License. You may obtain a copy of the License at
--- a/docs/source/en/api/models/overview.md
+++ b/docs/source/en/api/models/overview.md
@@ -1,4 +1,4 @@
-<!--Copyright 2024 The HuggingFace Team. All rights reserved.
+<!--Copyright 2025 The HuggingFace Team. All rights reserved.
 Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 the License. You may obtain a copy of the License at
--- a/docs/source/en/api/models/pixart_transformer2d.md
+++ b/docs/source/en/api/models/pixart_transformer2d.md
@@ -1,4 +1,4 @@
-<!--Copyright 2024 The HuggingFace Team. All rights reserved.
+<!--Copyright 2025 The HuggingFace Team. All rights reserved.
 Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 the License. You may obtain a copy of the License at
--- a/docs/source/en/api/models/prior_transformer.md
+++ b/docs/source/en/api/models/prior_transformer.md
@@ -1,4 +1,4 @@
-<!--Copyright 2024 The HuggingFace Team. All rights reserved.
+<!--Copyright 2025 The HuggingFace Team. All rights reserved.
 Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 the License. You may obtain a copy of the License at
--- a/Show More
+++ b/Show More