Mirror of https://github.com/huggingface/diffusers.git, synced 2025-12-10 06:24:19 +08:00

Compare commits: wan-sf-doc...v0.35.2 (166 commits)
SHA1:
b71269675e 36059182f1 9169e81609 8160289373 08782bf3bf 0f252be0ed e3d4a6b070 ad00c565b7
f27949dad9 8d1de40891 8cc528c5e7 3c50f0cdad 555b6cc34f 5b53f67f06 9918d13eba e824660436
03be15e890 85cbe589a7 4d9b82297f 76c809e2ef e682af2027 a58a4f665b 8701e8644b 58bf268261
1b48db4c8f 46a0c6aa82 421ee07e33 123506ee59 8c48ec05ed a6d2fc2c1d bc2762cce9 baa9b582f3
da096a4999 480fb357a3 38740ddbd8 72282876b2 3552279a23 f8ba5cd77a c9c8217306 135df5be9d
4a9dbd56f6 630d27fe5b f442955c6e ff9a387618 03c3f69aa5 f20aba3e87 ccf2c31188 7b10e4ae65
3c0531bc50 a8e47978c6 50e18ee698 4b17fa2a2e d45199a2f1 061163142d 5780776c8a f19421e27c
69cdc25746 cfd6ec7465 1082c46afa ba2ba9019f fa4c0e5e2e b793debd9d 377057126c 5937e11d85
9c1d4e3be1 7ea065c507 7a7a487396 4efb4db9d0 639fd12a20 69a9828f4d 11d22e0e80 9a38fab5ae
cb8e61ed2f 8e53cd959e 359b605f4b 6febc08bfc 9a2eaed002 0c71189abe 58d2b10a2e 20e0740b88
9d313fc718 f83dd5c984 c052791b5f 843e3f9346 d8854b8d54 327e251b81 dfa48831e2 94df8ef68a
203dc520a7 56d4387270 edcbe8038b c02c4a6d27 6f3ac3050f a6d9f6a1a9 284150449d 3d2f8ae99b
f36ba9f094 1c50a5f7e0 7ae6347e33 178d32dedd ef1e628729 173e1b147d e46e139f95 14725164be
638cc035e5 9db9be65f3 d87134ada4 67a8ec8bf5 cde02b061b 5dc503aa28 c6fbcf717b b9e99654e1
478df933c3 18c8f10f20 7298bdd817 9c13f86579 5c5209720e aa14f090f8 c5d6e0b537 39831599f1
b73c738392 06fd427797 48a551251d 6398fbc391 3c8b67b371 9feb946432 c90352754a 7a935a0bbe
941b7fc084 76a62ac9cc 1c6ab9e900 265840a098 9f4d997d8f b41abb2230 f33b89bafb 48a6d29550
2d3d376bc0 db715e2c8c 754fe85cac cc1f9a2ce3 737d7fc3b0 be23f7df00 86becea77f 7e3bf4aff6
de043c6044 4c20624cc6 0454fbb30b cbc8ced20f 01240fecb0 ce338d4e4a bc55b631fd 15d50f16f2
2c30287958 425a715e35 2527917528 e6639fef70 8c938fb410 f864a9a352 d6fa3298fa 6f1d6694df
0e95aa853e 5ef74fd5f6 64a9210315 d31b8cea3e 62e847db5f 470458623e
43 .github/workflows/benchmark.yml vendored

@@ -11,20 +11,21 @@ env:
  HF_HOME: /mnt/cache
  OMP_NUM_THREADS: 8
  MKL_NUM_THREADS: 8
  BASE_PATH: benchmark_outputs

jobs:
  torch_pipelines_cuda_benchmark_tests:
  torch_models_cuda_benchmark_tests:
    env:
      SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL_BENCHMARK }}
    name: Torch Core Pipelines CUDA Benchmarking Tests
    name: Torch Core Models CUDA Benchmarking Tests
    strategy:
      fail-fast: false
      max-parallel: 1
    runs-on:
      group: aws-g6-4xlarge-plus
      group: aws-g6e-4xlarge
    container:
      image: diffusers/diffusers-pytorch-cuda
      options: --shm-size "16gb" --ipc host --gpus 0
      options: --shm-size "16gb" --ipc host --gpus all
    steps:
      - name: Checkout diffusers
        uses: actions/checkout@v3
@@ -35,27 +36,47 @@ jobs:
          nvidia-smi
      - name: Install dependencies
        run: |
          apt update
          apt install -y libpq-dev postgresql-client
          python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
          python -m uv pip install -e [quality,test]
          python -m uv pip install pandas peft
          python -m uv pip uninstall transformers && python -m uv pip install transformers==4.48.0
          python -m uv pip install -r benchmarks/requirements.txt
      - name: Environment
        run: |
          python utils/print_env.py
      - name: Diffusers Benchmarking
        env:
          HF_TOKEN: ${{ secrets.DIFFUSERS_BOT_TOKEN }}
          BASE_PATH: benchmark_outputs
          HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
        run: |
          export TOTAL_GPU_MEMORY=$(python -c "import torch; print(torch.cuda.get_device_properties(0).total_memory / (1024**3))")
          cd benchmarks && mkdir ${BASE_PATH} && python run_all.py && python push_results.py
          cd benchmarks && python run_all.py

      - name: Push results to the Hub
        env:
          HF_TOKEN: ${{ secrets.DIFFUSERS_BOT_TOKEN }}
        run: |
          cd benchmarks && python push_results.py
          mkdir $BASE_PATH && cp *.csv $BASE_PATH

      - name: Test suite reports artifacts
        if: ${{ always() }}
        uses: actions/upload-artifact@v4
        with:
          name: benchmark_test_reports
          path: benchmarks/benchmark_outputs
          path: benchmarks/${{ env.BASE_PATH }}

      # TODO: enable this once the connection problem has been resolved.
      - name: Update benchmarking results to DB
        env:
          PGDATABASE: metrics
          PGHOST: ${{ secrets.DIFFUSERS_BENCHMARKS_PGHOST }}
          PGUSER: transformers_benchmarks
          PGPASSWORD: ${{ secrets.DIFFUSERS_BENCHMARKS_PGPASSWORD }}
          BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
        run: |
          git config --global --add safe.directory /__w/diffusers/diffusers
          commit_id=$GITHUB_SHA
          commit_msg=$(git show -s --format=%s "$commit_id" | cut -c1-70)
          cd benchmarks && python populate_into_db.py "$BRANCH_NAME" "$commit_id" "$commit_msg"

      - name: Report success status
        if: ${{ success() }}

@@ -79,14 +79,14 @@ jobs:

      # Check secret is set
      - name: whoami
        run: huggingface-cli whoami
        run: hf auth whoami
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN_MIRROR_COMMUNITY_PIPELINES }}

      # Push to HF! (under subfolder based on checkout ref)
      # https://huggingface.co/datasets/diffusers/community-pipelines-mirror
      - name: Mirror community pipeline to HF
        run: huggingface-cli upload diffusers/community-pipelines-mirror ./examples/community ${PATH_IN_REPO} --repo-type dataset
        run: hf upload diffusers/community-pipelines-mirror ./examples/community ${PATH_IN_REPO} --repo-type dataset
        env:
          PATH_IN_REPO: ${{ env.PATH_IN_REPO }}
          HF_TOKEN: ${{ secrets.HF_TOKEN_MIRROR_COMMUNITY_PIPELINES }}
18 .github/workflows/nightly_tests.yml vendored

@@ -61,7 +61,7 @@ jobs:
      group: aws-g4dn-2xlarge
    container:
      image: diffusers/diffusers-pytorch-cuda
      options: --shm-size "16gb" --ipc host --gpus 0
      options: --shm-size "16gb" --ipc host --gpus all
    steps:
      - name: Checkout diffusers
        uses: actions/checkout@v3
@@ -107,7 +107,7 @@ jobs:
      group: aws-g4dn-2xlarge
    container:
      image: diffusers/diffusers-pytorch-cuda
      options: --shm-size "16gb" --ipc host --gpus 0
      options: --shm-size "16gb" --ipc host --gpus all
    defaults:
      run:
        shell: bash
@@ -178,7 +178,7 @@ jobs:

    container:
      image: diffusers/diffusers-pytorch-cuda
      options: --gpus 0 --shm-size "16gb" --ipc host
      options: --gpus all --shm-size "16gb" --ipc host

    steps:
      - name: Checkout diffusers
@@ -222,7 +222,7 @@ jobs:
      group: aws-g6e-xlarge-plus
    container:
      image: diffusers/diffusers-pytorch-cuda
      options: --shm-size "16gb" --ipc host --gpus 0
      options: --shm-size "16gb" --ipc host --gpus all
    steps:
      - name: Checkout diffusers
        uses: actions/checkout@v3
@@ -248,7 +248,7 @@ jobs:
          BIG_GPU_MEMORY: 40
        run: |
          python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
            -m "big_gpu_with_torch_cuda" \
            -m "big_accelerator" \
            --make-reports=tests_big_gpu_torch_cuda \
            --report-log=tests_big_gpu_torch_cuda.log \
            tests/
@@ -270,7 +270,7 @@ jobs:
      group: aws-g4dn-2xlarge
    container:
      image: diffusers/diffusers-pytorch-minimum-cuda
      options: --shm-size "16gb" --ipc host --gpus 0
      options: --shm-size "16gb" --ipc host --gpus all
    defaults:
      run:
        shell: bash
@@ -333,7 +333,7 @@ jobs:
          additional_deps: ["peft"]
        - backend: "gguf"
          test_location: "gguf"
          additional_deps: ["peft"]
          additional_deps: ["peft", "kernels"]
        - backend: "torchao"
          test_location: "torchao"
          additional_deps: []
@@ -344,7 +344,7 @@ jobs:
      group: aws-g6e-xlarge-plus
    container:
      image: diffusers/diffusers-pytorch-cuda
      options: --shm-size "20gb" --ipc host --gpus 0
      options: --shm-size "20gb" --ipc host --gpus all
    steps:
      - name: Checkout diffusers
        uses: actions/checkout@v3
@@ -396,7 +396,7 @@ jobs:
      group: aws-g6e-xlarge-plus
    container:
      image: diffusers/diffusers-pytorch-cuda
      options: --shm-size "20gb" --ipc host --gpus 0
      options: --shm-size "20gb" --ipc host --gpus all
    steps:
      - name: Checkout diffusers
        uses: actions/checkout@v3
141 .github/workflows/pr_modular_tests.yml vendored Normal file

@@ -0,0 +1,141 @@
name: Fast PR tests for Modular

on:
  pull_request:
    branches: [main]
    paths:
      - "src/diffusers/modular_pipelines/**.py"
      - "src/diffusers/models/modeling_utils.py"
      - "src/diffusers/models/model_loading_utils.py"
      - "src/diffusers/pipelines/pipeline_utils.py"
      - "src/diffusers/pipeline_loading_utils.py"
      - "src/diffusers/loaders/lora_base.py"
      - "src/diffusers/loaders/lora_pipeline.py"
      - "src/diffusers/loaders/peft.py"
      - "tests/modular_pipelines/**.py"
      - ".github/**.yml"
      - "utils/**.py"
      - "setup.py"
  push:
    branches:
      - ci-*

concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
  cancel-in-progress: true

env:
  DIFFUSERS_IS_CI: yes
  HF_HUB_ENABLE_HF_TRANSFER: 1
  OMP_NUM_THREADS: 4
  MKL_NUM_THREADS: 4
  PYTEST_TIMEOUT: 60

jobs:
  check_code_quality:
    runs-on: ubuntu-22.04
    steps:
      - uses: actions/checkout@v3
      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: "3.10"
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install .[quality]
      - name: Check quality
        run: make quality
      - name: Check if failure
        if: ${{ failure() }}
        run: |
          echo "Quality check failed. Please ensure the right dependency versions are installed with 'pip install -e .[quality]' and run 'make style && make quality'" >> $GITHUB_STEP_SUMMARY

  check_repository_consistency:
    needs: check_code_quality
    runs-on: ubuntu-22.04
    steps:
      - uses: actions/checkout@v3
      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: "3.10"
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install .[quality]
      - name: Check repo consistency
        run: |
          python utils/check_copies.py
          python utils/check_dummies.py
          python utils/check_support_list.py
          make deps_table_check_updated
      - name: Check if failure
        if: ${{ failure() }}
        run: |
          echo "Repo consistency check failed. Please ensure the right dependency versions are installed with 'pip install -e .[quality]' and run 'make fix-copies'" >> $GITHUB_STEP_SUMMARY

  run_fast_tests:
    needs: [check_code_quality, check_repository_consistency]
    strategy:
      fail-fast: false
      matrix:
        config:
          - name: Fast PyTorch Modular Pipeline CPU tests
            framework: pytorch_pipelines
            runner: aws-highmemory-32-plus
            image: diffusers/diffusers-pytorch-cpu
            report: torch_cpu_modular_pipelines

    name: ${{ matrix.config.name }}

    runs-on:
      group: ${{ matrix.config.runner }}

    container:
      image: ${{ matrix.config.image }}
      options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/

    defaults:
      run:
        shell: bash

    steps:
      - name: Checkout diffusers
        uses: actions/checkout@v3
        with:
          fetch-depth: 2

      - name: Install dependencies
        run: |
          python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
          python -m uv pip install -e [quality,test]
          pip uninstall transformers -y && python -m uv pip install -U transformers@git+https://github.com/huggingface/transformers.git --no-deps
          pip uninstall accelerate -y && python -m uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git --no-deps

      - name: Environment
        run: |
          python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
          python utils/print_env.py

      - name: Run fast PyTorch Pipeline CPU tests
        if: ${{ matrix.config.framework == 'pytorch_pipelines' }}
        run: |
          python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
          python -m pytest -n 8 --max-worker-restart=0 --dist=loadfile \
            -s -v -k "not Flax and not Onnx" \
            --make-reports=tests_${{ matrix.config.report }} \
            tests/modular_pipelines

      - name: Failure short reports
        if: ${{ failure() }}
        run: cat reports/tests_${{ matrix.config.report }}_failures_short.txt

      - name: Test suite reports artifacts
        if: ${{ always() }}
        uses: actions/upload-artifact@v4
        with:
          name: pr_${{ matrix.config.framework }}_${{ matrix.config.report }}_test_reports
          path: reports
9 .github/workflows/pr_tests_gpu.yml vendored

@@ -13,6 +13,7 @@ on:
      - "src/diffusers/loaders/peft.py"
      - "tests/pipelines/test_pipelines_common.py"
      - "tests/models/test_modeling_common.py"
      - "examples/**/*.py"
  workflow_dispatch:

concurrency:
@@ -117,7 +118,7 @@ jobs:
      group: aws-g4dn-2xlarge
    container:
      image: diffusers/diffusers-pytorch-cuda
      options: --shm-size "16gb" --ipc host --gpus 0
      options: --shm-size "16gb" --ipc host --gpus all
    steps:
      - name: Checkout diffusers
        uses: actions/checkout@v3
@@ -182,13 +183,13 @@ jobs:
      group: aws-g4dn-2xlarge
    container:
      image: diffusers/diffusers-pytorch-cuda
      options: --shm-size "16gb" --ipc host --gpus 0
      options: --shm-size "16gb" --ipc host --gpus all
    defaults:
      run:
        shell: bash
    strategy:
      fail-fast: false
      max-parallel: 2
      max-parallel: 4
      matrix:
        module: [models, schedulers, lora, others]
    steps:
@@ -252,7 +253,7 @@ jobs:

    container:
      image: diffusers/diffusers-pytorch-cuda
      options: --gpus 0 --shm-size "16gb" --ipc host
      options: --gpus all --shm-size "16gb" --ipc host
    steps:
      - name: Checkout diffusers
        uses: actions/checkout@v3
10 .github/workflows/push_tests.yml vendored

@@ -64,7 +64,7 @@ jobs:
      group: aws-g4dn-2xlarge
    container:
      image: diffusers/diffusers-pytorch-cuda
      options: --shm-size "16gb" --ipc host --gpus 0
      options: --shm-size "16gb" --ipc host --gpus all
    steps:
      - name: Checkout diffusers
        uses: actions/checkout@v3
@@ -109,7 +109,7 @@ jobs:
      group: aws-g4dn-2xlarge
    container:
      image: diffusers/diffusers-pytorch-cuda
      options: --shm-size "16gb" --ipc host --gpus 0
      options: --shm-size "16gb" --ipc host --gpus all
    defaults:
      run:
        shell: bash
@@ -167,7 +167,7 @@ jobs:

    container:
      image: diffusers/diffusers-pytorch-cuda
      options: --gpus 0 --shm-size "16gb" --ipc host
      options: --gpus all --shm-size "16gb" --ipc host

    steps:
      - name: Checkout diffusers
@@ -210,7 +210,7 @@ jobs:

    container:
      image: diffusers/diffusers-pytorch-xformers-cuda
      options: --gpus 0 --shm-size "16gb" --ipc host
      options: --gpus all --shm-size "16gb" --ipc host

    steps:
      - name: Checkout diffusers
@@ -252,7 +252,7 @@ jobs:

    container:
      image: diffusers/diffusers-pytorch-cuda
      options: --gpus 0 --shm-size "16gb" --ipc host
      options: --gpus all --shm-size "16gb" --ipc host
    steps:
      - name: Checkout diffusers
        uses: actions/checkout@v3
12 .github/workflows/release_tests_fast.yml vendored

@@ -62,7 +62,7 @@ jobs:
      group: aws-g4dn-2xlarge
    container:
      image: diffusers/diffusers-pytorch-cuda
      options: --shm-size "16gb" --ipc host --gpus 0
      options: --shm-size "16gb" --ipc host --gpus all
    steps:
      - name: Checkout diffusers
        uses: actions/checkout@v3
@@ -107,7 +107,7 @@ jobs:
      group: aws-g4dn-2xlarge
    container:
      image: diffusers/diffusers-pytorch-cuda
      options: --shm-size "16gb" --ipc host --gpus 0
      options: --shm-size "16gb" --ipc host --gpus all
    defaults:
      run:
        shell: bash
@@ -163,7 +163,7 @@ jobs:
      group: aws-g4dn-2xlarge
    container:
      image: diffusers/diffusers-pytorch-minimum-cuda
      options: --shm-size "16gb" --ipc host --gpus 0
      options: --shm-size "16gb" --ipc host --gpus all
    defaults:
      run:
        shell: bash
@@ -222,7 +222,7 @@ jobs:

    container:
      image: diffusers/diffusers-pytorch-cuda
      options: --gpus 0 --shm-size "16gb" --ipc host
      options: --gpus all --shm-size "16gb" --ipc host

    steps:
      - name: Checkout diffusers
@@ -265,7 +265,7 @@ jobs:

    container:
      image: diffusers/diffusers-pytorch-xformers-cuda
      options: --gpus 0 --shm-size "16gb" --ipc host
      options: --gpus all --shm-size "16gb" --ipc host

    steps:
      - name: Checkout diffusers
@@ -307,7 +307,7 @@ jobs:

    container:
      image: diffusers/diffusers-pytorch-cuda
      options: --gpus 0 --shm-size "16gb" --ipc host
      options: --gpus all --shm-size "16gb" --ipc host

    steps:
      - name: Checkout diffusers
2 .github/workflows/run_tests_from_a_pr.yml vendored

@@ -30,7 +30,7 @@ jobs:
      group: aws-g4dn-2xlarge
    container:
      image: ${{ github.event.inputs.docker_image }}
      options: --gpus 0 --privileged --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
      options: --gpus all --privileged --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/

    steps:
      - name: Validate test files input
2 .github/workflows/ssh-runner.yml vendored

@@ -31,7 +31,7 @@ jobs:
      group: "${{ github.event.inputs.runner_type }}"
    container:
      image: ${{ github.event.inputs.docker_image }}
      options: --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface/diffusers:/mnt/cache/ --gpus 0 --privileged
      options: --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface/diffusers:/mnt/cache/ --gpus all --privileged

    steps:
      - name: Checkout diffusers
69 benchmarks/README.md Normal file

@@ -0,0 +1,69 @@
# Diffusers Benchmarks

Welcome to Diffusers Benchmarks. These benchmarks are used to obtain latency and memory information for the most popular models across different scenarios, such as:

* Base case, i.e., when using `torch.bfloat16` and `torch.nn.functional.scaled_dot_product_attention`.
* Base + `torch.compile()`
* NF4 quantization
* Layerwise upcasting

Instead of full diffusion pipelines, only the forward pass of the respective model classes (such as `FluxTransformer2DModel`) is tested with the real checkpoints (such as `"black-forest-labs/FLUX.1-dev"`).

The entrypoint for running all the currently available benchmarks is `run_all.py`. However, one can also run the individual benchmarks, e.g., `python benchmarking_flux.py`. It should produce a CSV file containing various information about the benchmarks run.

The benchmarks are run on a weekly basis and the CI is defined in [benchmark.yml](../.github/workflows/benchmark.yml).

## Running the benchmarks manually

First set up `torch` and install `diffusers` from the root of the repository:

```sh
pip install -e ".[quality,test]"
```

Then make sure the other dependencies are installed:

```sh
cd benchmarks/
pip install -r requirements.txt
```

We need to be authenticated to access some of the checkpoints used during benchmarking:

```sh
hf auth login
```

We use an L40 GPU with 128GB of RAM to run the benchmark CI. As such, the benchmarks are configured to run on NVIDIA GPUs. So, make sure you have access to a similar machine (or modify the benchmarking scripts accordingly).

You can then either launch the entire benchmarking suite by running:

```sh
python run_all.py
```

Or you can run the individual benchmarks.

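Each benchmark script appends one row per scenario to its CSV file. A quick way to inspect a run is shown below; this is a minimal sketch that assumes the Flux benchmark has already been executed and has written `flux.csv`, and the column names follow the result dictionary assembled in `benchmarking_utils.py` further down this page.

```py
import pandas as pd

# Columns written by BenchmarkMixin.run_benchmark include: scenario, model_cls,
# num_params_B, flops_G, time_plain_s, mem_plain_GB, time_compile_s, mem_compile_GB.
df = pd.read_csv("flux.csv")
print(df[["scenario", "time_plain_s", "mem_plain_GB", "time_compile_s", "mem_compile_GB"]])
```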
## Customizing the benchmarks

We define "scenarios" to cover the most common ways in which these models are used. You can define a new scenario by modifying an existing benchmark file:

```py
BenchmarkScenario(
    name=f"{CKPT_ID}-bnb-8bit",
    model_cls=FluxTransformer2DModel,
    model_init_kwargs={
        "pretrained_model_name_or_path": CKPT_ID,
        "torch_dtype": torch.bfloat16,
        "subfolder": "transformer",
        "quantization_config": BitsAndBytesConfig(load_in_8bit=True),
    },
    get_model_input_dict=partial(get_input_dict, device=torch_device, dtype=torch.bfloat16),
    model_init_fn=model_init_fn,
)
```

You can also configure a new model-level benchmark and add it to the existing suite. To do so, defining a valid benchmarking file like `benchmarking_flux.py` should be enough (a minimal sketch follows this README).

Happy benchmarking 🧨

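As referenced in "Customizing the benchmarks" above, here is a minimal sketch of what such a new benchmarking file looks like. It mirrors the pattern of `benchmarking_flux.py` and `benchmarking_sdxl.py` added in this diff; the checkpoint, dummy input shapes, and output file name are illustrative assumptions rather than part of the suite.

```py
# benchmarking_my_model.py -- hypothetical example following the existing pattern.
from functools import partial

import torch
from benchmarking_utils import BenchmarkMixin, BenchmarkScenario, model_init_fn

from diffusers import UNet2DConditionModel
from diffusers.utils.testing_utils import torch_device

CKPT_ID = "stabilityai/stable-diffusion-xl-base-1.0"  # illustrative checkpoint
RESULT_FILENAME = "my_model.csv"  # illustrative output file


def get_input_dict(**device_dtype_kwargs):
    # Dummy forward-pass inputs; use the real shapes expected by your model class.
    return {
        "sample": torch.randn(1, 4, 128, 128, **device_dtype_kwargs),
        "encoder_hidden_states": torch.randn(1, 77, 2048, **device_dtype_kwargs),
        "timestep": torch.tensor([1.0], **device_dtype_kwargs),
        "added_cond_kwargs": {
            "text_embeds": torch.randn(1, 1280, **device_dtype_kwargs),
            "time_ids": torch.ones(1, 6, **device_dtype_kwargs),
        },
    }


if __name__ == "__main__":
    scenarios = [
        BenchmarkScenario(
            name=f"{CKPT_ID}-bf16",
            model_cls=UNet2DConditionModel,
            model_init_kwargs={
                "pretrained_model_name_or_path": CKPT_ID,
                "torch_dtype": torch.bfloat16,
                "subfolder": "unet",
            },
            get_model_input_dict=partial(get_input_dict, device=torch_device, dtype=torch.bfloat16),
            model_init_fn=model_init_fn,
            compile_kwargs={"fullgraph": True},
        ),
    ]
    # Method name matches the definition in benchmarking_utils.py in this diff.
    BenchmarkMixin().run_bencmarks_and_collate(scenarios, filename=RESULT_FILENAME)
```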
@@ -1,346 +0,0 @@
import os
import sys

import torch

from diffusers import (
    AutoPipelineForImage2Image,
    AutoPipelineForInpainting,
    AutoPipelineForText2Image,
    ControlNetModel,
    LCMScheduler,
    StableDiffusionAdapterPipeline,
    StableDiffusionControlNetPipeline,
    StableDiffusionXLAdapterPipeline,
    StableDiffusionXLControlNetPipeline,
    T2IAdapter,
    WuerstchenCombinedPipeline,
)
from diffusers.utils import load_image


sys.path.append(".")

from utils import (  # noqa: E402
    BASE_PATH,
    PROMPT,
    BenchmarkInfo,
    benchmark_fn,
    bytes_to_giga_bytes,
    flush,
    generate_csv_dict,
    write_to_csv,
)


RESOLUTION_MAPPING = {
    "Lykon/DreamShaper": (512, 512),
    "lllyasviel/sd-controlnet-canny": (512, 512),
    "diffusers/controlnet-canny-sdxl-1.0": (1024, 1024),
    "TencentARC/t2iadapter_canny_sd14v1": (512, 512),
    "TencentARC/t2i-adapter-canny-sdxl-1.0": (1024, 1024),
    "stabilityai/stable-diffusion-2-1": (768, 768),
    "stabilityai/stable-diffusion-xl-base-1.0": (1024, 1024),
    "stabilityai/stable-diffusion-xl-refiner-1.0": (1024, 1024),
    "stabilityai/sdxl-turbo": (512, 512),
}


class BaseBenchmak:
    pipeline_class = None

    def __init__(self, args):
        super().__init__()

    def run_inference(self, args):
        raise NotImplementedError

    def benchmark(self, args):
        raise NotImplementedError

    def get_result_filepath(self, args):
        pipeline_class_name = str(self.pipe.__class__.__name__)
        name = (
            args.ckpt.replace("/", "_")
            + "_"
            + pipeline_class_name
            + f"-bs@{args.batch_size}-steps@{args.num_inference_steps}-mco@{args.model_cpu_offload}-compile@{args.run_compile}.csv"
        )
        filepath = os.path.join(BASE_PATH, name)
        return filepath


class TextToImageBenchmark(BaseBenchmak):
    pipeline_class = AutoPipelineForText2Image

    def __init__(self, args):
        pipe = self.pipeline_class.from_pretrained(args.ckpt, torch_dtype=torch.float16)
        pipe = pipe.to("cuda")

        if args.run_compile:
            if not isinstance(pipe, WuerstchenCombinedPipeline):
                pipe.unet.to(memory_format=torch.channels_last)
                print("Run torch compile")
                pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)

                if hasattr(pipe, "movq") and getattr(pipe, "movq", None) is not None:
                    pipe.movq.to(memory_format=torch.channels_last)
                    pipe.movq = torch.compile(pipe.movq, mode="reduce-overhead", fullgraph=True)
            else:
                print("Run torch compile")
                pipe.decoder = torch.compile(pipe.decoder, mode="reduce-overhead", fullgraph=True)
                pipe.vqgan = torch.compile(pipe.vqgan, mode="reduce-overhead", fullgraph=True)

        pipe.set_progress_bar_config(disable=True)
        self.pipe = pipe

    def run_inference(self, pipe, args):
        _ = pipe(
            prompt=PROMPT,
            num_inference_steps=args.num_inference_steps,
            num_images_per_prompt=args.batch_size,
        )

    def benchmark(self, args):
        flush()

        print(f"[INFO] {self.pipe.__class__.__name__}: Running benchmark with: {vars(args)}\n")

        time = benchmark_fn(self.run_inference, self.pipe, args)  # in seconds.
        memory = bytes_to_giga_bytes(torch.cuda.max_memory_allocated())  # in GBs.
        benchmark_info = BenchmarkInfo(time=time, memory=memory)

        pipeline_class_name = str(self.pipe.__class__.__name__)
        flush()
        csv_dict = generate_csv_dict(
            pipeline_cls=pipeline_class_name, ckpt=args.ckpt, args=args, benchmark_info=benchmark_info
        )
        filepath = self.get_result_filepath(args)
        write_to_csv(filepath, csv_dict)
        print(f"Logs written to: {filepath}")
        flush()


class TurboTextToImageBenchmark(TextToImageBenchmark):
    def __init__(self, args):
        super().__init__(args)

    def run_inference(self, pipe, args):
        _ = pipe(
            prompt=PROMPT,
            num_inference_steps=args.num_inference_steps,
            num_images_per_prompt=args.batch_size,
            guidance_scale=0.0,
        )


class LCMLoRATextToImageBenchmark(TextToImageBenchmark):
    lora_id = "latent-consistency/lcm-lora-sdxl"

    def __init__(self, args):
        super().__init__(args)
        self.pipe.load_lora_weights(self.lora_id)
        self.pipe.fuse_lora()
        self.pipe.unload_lora_weights()
        self.pipe.scheduler = LCMScheduler.from_config(self.pipe.scheduler.config)

    def get_result_filepath(self, args):
        pipeline_class_name = str(self.pipe.__class__.__name__)
        name = (
            self.lora_id.replace("/", "_")
            + "_"
            + pipeline_class_name
            + f"-bs@{args.batch_size}-steps@{args.num_inference_steps}-mco@{args.model_cpu_offload}-compile@{args.run_compile}.csv"
        )
        filepath = os.path.join(BASE_PATH, name)
        return filepath

    def run_inference(self, pipe, args):
        _ = pipe(
            prompt=PROMPT,
            num_inference_steps=args.num_inference_steps,
            num_images_per_prompt=args.batch_size,
            guidance_scale=1.0,
        )

    def benchmark(self, args):
        flush()

        print(f"[INFO] {self.pipe.__class__.__name__}: Running benchmark with: {vars(args)}\n")

        time = benchmark_fn(self.run_inference, self.pipe, args)  # in seconds.
        memory = bytes_to_giga_bytes(torch.cuda.max_memory_allocated())  # in GBs.
        benchmark_info = BenchmarkInfo(time=time, memory=memory)

        pipeline_class_name = str(self.pipe.__class__.__name__)
        flush()
        csv_dict = generate_csv_dict(
            pipeline_cls=pipeline_class_name, ckpt=self.lora_id, args=args, benchmark_info=benchmark_info
        )
        filepath = self.get_result_filepath(args)
        write_to_csv(filepath, csv_dict)
        print(f"Logs written to: {filepath}")
        flush()


class ImageToImageBenchmark(TextToImageBenchmark):
    pipeline_class = AutoPipelineForImage2Image
    url = "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/benchmarking/1665_Girl_with_a_Pearl_Earring.jpg"
    image = load_image(url).convert("RGB")

    def __init__(self, args):
        super().__init__(args)
        self.image = self.image.resize(RESOLUTION_MAPPING[args.ckpt])

    def run_inference(self, pipe, args):
        _ = pipe(
            prompt=PROMPT,
            image=self.image,
            num_inference_steps=args.num_inference_steps,
            num_images_per_prompt=args.batch_size,
        )


class TurboImageToImageBenchmark(ImageToImageBenchmark):
    def __init__(self, args):
        super().__init__(args)

    def run_inference(self, pipe, args):
        _ = pipe(
            prompt=PROMPT,
            image=self.image,
            num_inference_steps=args.num_inference_steps,
            num_images_per_prompt=args.batch_size,
            guidance_scale=0.0,
            strength=0.5,
        )


class InpaintingBenchmark(ImageToImageBenchmark):
    pipeline_class = AutoPipelineForInpainting
    mask_url = "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/benchmarking/overture-creations-5sI6fQgYIuo_mask.png"
    mask = load_image(mask_url).convert("RGB")

    def __init__(self, args):
        super().__init__(args)
        self.image = self.image.resize(RESOLUTION_MAPPING[args.ckpt])
        self.mask = self.mask.resize(RESOLUTION_MAPPING[args.ckpt])

    def run_inference(self, pipe, args):
        _ = pipe(
            prompt=PROMPT,
            image=self.image,
            mask_image=self.mask,
            num_inference_steps=args.num_inference_steps,
            num_images_per_prompt=args.batch_size,
        )


class IPAdapterTextToImageBenchmark(TextToImageBenchmark):
    url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/load_neg_embed.png"
    image = load_image(url)

    def __init__(self, args):
        pipe = self.pipeline_class.from_pretrained(args.ckpt, torch_dtype=torch.float16).to("cuda")
        pipe.load_ip_adapter(
            args.ip_adapter_id[0],
            subfolder="models" if "sdxl" not in args.ip_adapter_id[1] else "sdxl_models",
            weight_name=args.ip_adapter_id[1],
        )

        if args.run_compile:
            pipe.unet.to(memory_format=torch.channels_last)
            print("Run torch compile")
            pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)

        pipe.set_progress_bar_config(disable=True)
        self.pipe = pipe

    def run_inference(self, pipe, args):
        _ = pipe(
            prompt=PROMPT,
            ip_adapter_image=self.image,
            num_inference_steps=args.num_inference_steps,
            num_images_per_prompt=args.batch_size,
        )


class ControlNetBenchmark(TextToImageBenchmark):
    pipeline_class = StableDiffusionControlNetPipeline
    aux_network_class = ControlNetModel
    root_ckpt = "Lykon/DreamShaper"

    url = "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/benchmarking/canny_image_condition.png"
    image = load_image(url).convert("RGB")

    def __init__(self, args):
        aux_network = self.aux_network_class.from_pretrained(args.ckpt, torch_dtype=torch.float16)
        pipe = self.pipeline_class.from_pretrained(self.root_ckpt, controlnet=aux_network, torch_dtype=torch.float16)
        pipe = pipe.to("cuda")

        pipe.set_progress_bar_config(disable=True)
        self.pipe = pipe

        if args.run_compile:
            pipe.unet.to(memory_format=torch.channels_last)
            pipe.controlnet.to(memory_format=torch.channels_last)

            print("Run torch compile")
            pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)
            pipe.controlnet = torch.compile(pipe.controlnet, mode="reduce-overhead", fullgraph=True)

        self.image = self.image.resize(RESOLUTION_MAPPING[args.ckpt])

    def run_inference(self, pipe, args):
        _ = pipe(
            prompt=PROMPT,
            image=self.image,
            num_inference_steps=args.num_inference_steps,
            num_images_per_prompt=args.batch_size,
        )


class ControlNetSDXLBenchmark(ControlNetBenchmark):
    pipeline_class = StableDiffusionXLControlNetPipeline
    root_ckpt = "stabilityai/stable-diffusion-xl-base-1.0"

    def __init__(self, args):
        super().__init__(args)


class T2IAdapterBenchmark(ControlNetBenchmark):
    pipeline_class = StableDiffusionAdapterPipeline
    aux_network_class = T2IAdapter
    root_ckpt = "Lykon/DreamShaper"

    url = "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/benchmarking/canny_for_adapter.png"
    image = load_image(url).convert("L")

    def __init__(self, args):
        aux_network = self.aux_network_class.from_pretrained(args.ckpt, torch_dtype=torch.float16)
        pipe = self.pipeline_class.from_pretrained(self.root_ckpt, adapter=aux_network, torch_dtype=torch.float16)
        pipe = pipe.to("cuda")

        pipe.set_progress_bar_config(disable=True)
        self.pipe = pipe

        if args.run_compile:
            pipe.unet.to(memory_format=torch.channels_last)
            pipe.adapter.to(memory_format=torch.channels_last)

            print("Run torch compile")
            pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)
            pipe.adapter = torch.compile(pipe.adapter, mode="reduce-overhead", fullgraph=True)

        self.image = self.image.resize(RESOLUTION_MAPPING[args.ckpt])


class T2IAdapterSDXLBenchmark(T2IAdapterBenchmark):
    pipeline_class = StableDiffusionXLAdapterPipeline
    root_ckpt = "stabilityai/stable-diffusion-xl-base-1.0"

    url = "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/benchmarking/canny_for_adapter_sdxl.png"
    image = load_image(url)

    def __init__(self, args):
        super().__init__(args)
@@ -1,26 +0,0 @@
import argparse
import sys


sys.path.append(".")
from base_classes import ControlNetBenchmark, ControlNetSDXLBenchmark  # noqa: E402


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--ckpt",
        type=str,
        default="lllyasviel/sd-controlnet-canny",
        choices=["lllyasviel/sd-controlnet-canny", "diffusers/controlnet-canny-sdxl-1.0"],
    )
    parser.add_argument("--batch_size", type=int, default=1)
    parser.add_argument("--num_inference_steps", type=int, default=50)
    parser.add_argument("--model_cpu_offload", action="store_true")
    parser.add_argument("--run_compile", action="store_true")
    args = parser.parse_args()

    benchmark_pipe = (
        ControlNetBenchmark(args) if args.ckpt == "lllyasviel/sd-controlnet-canny" else ControlNetSDXLBenchmark(args)
    )
    benchmark_pipe.benchmark(args)

@@ -1,33 +0,0 @@
import argparse
import sys


sys.path.append(".")
from base_classes import IPAdapterTextToImageBenchmark  # noqa: E402


IP_ADAPTER_CKPTS = {
    # because original SD v1.5 has been taken down.
    "Lykon/DreamShaper": ("h94/IP-Adapter", "ip-adapter_sd15.bin"),
    "stabilityai/stable-diffusion-xl-base-1.0": ("h94/IP-Adapter", "ip-adapter_sdxl.bin"),
}


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--ckpt",
        type=str,
        default="rstabilityai/stable-diffusion-xl-base-1.0",
        choices=list(IP_ADAPTER_CKPTS.keys()),
    )
    parser.add_argument("--batch_size", type=int, default=1)
    parser.add_argument("--num_inference_steps", type=int, default=50)
    parser.add_argument("--model_cpu_offload", action="store_true")
    parser.add_argument("--run_compile", action="store_true")
    args = parser.parse_args()

    args.ip_adapter_id = IP_ADAPTER_CKPTS[args.ckpt]
    benchmark_pipe = IPAdapterTextToImageBenchmark(args)
    args.ckpt = f"{args.ckpt} (IP-Adapter)"
    benchmark_pipe.benchmark(args)

@@ -1,29 +0,0 @@
import argparse
import sys


sys.path.append(".")
from base_classes import ImageToImageBenchmark, TurboImageToImageBenchmark  # noqa: E402


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--ckpt",
        type=str,
        default="Lykon/DreamShaper",
        choices=[
            "Lykon/DreamShaper",
            "stabilityai/stable-diffusion-2-1",
            "stabilityai/stable-diffusion-xl-refiner-1.0",
            "stabilityai/sdxl-turbo",
        ],
    )
    parser.add_argument("--batch_size", type=int, default=1)
    parser.add_argument("--num_inference_steps", type=int, default=50)
    parser.add_argument("--model_cpu_offload", action="store_true")
    parser.add_argument("--run_compile", action="store_true")
    args = parser.parse_args()

    benchmark_pipe = ImageToImageBenchmark(args) if "turbo" not in args.ckpt else TurboImageToImageBenchmark(args)
    benchmark_pipe.benchmark(args)

@@ -1,28 +0,0 @@
import argparse
import sys


sys.path.append(".")
from base_classes import InpaintingBenchmark  # noqa: E402


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--ckpt",
        type=str,
        default="Lykon/DreamShaper",
        choices=[
            "Lykon/DreamShaper",
            "stabilityai/stable-diffusion-2-1",
            "stabilityai/stable-diffusion-xl-base-1.0",
        ],
    )
    parser.add_argument("--batch_size", type=int, default=1)
    parser.add_argument("--num_inference_steps", type=int, default=50)
    parser.add_argument("--model_cpu_offload", action="store_true")
    parser.add_argument("--run_compile", action="store_true")
    args = parser.parse_args()

    benchmark_pipe = InpaintingBenchmark(args)
    benchmark_pipe.benchmark(args)

@@ -1,28 +0,0 @@
import argparse
import sys


sys.path.append(".")
from base_classes import T2IAdapterBenchmark, T2IAdapterSDXLBenchmark  # noqa: E402


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--ckpt",
        type=str,
        default="TencentARC/t2iadapter_canny_sd14v1",
        choices=["TencentARC/t2iadapter_canny_sd14v1", "TencentARC/t2i-adapter-canny-sdxl-1.0"],
    )
    parser.add_argument("--batch_size", type=int, default=1)
    parser.add_argument("--num_inference_steps", type=int, default=50)
    parser.add_argument("--model_cpu_offload", action="store_true")
    parser.add_argument("--run_compile", action="store_true")
    args = parser.parse_args()

    benchmark_pipe = (
        T2IAdapterBenchmark(args)
        if args.ckpt == "TencentARC/t2iadapter_canny_sd14v1"
        else T2IAdapterSDXLBenchmark(args)
    )
    benchmark_pipe.benchmark(args)

@@ -1,23 +0,0 @@
import argparse
import sys


sys.path.append(".")
from base_classes import LCMLoRATextToImageBenchmark  # noqa: E402


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--ckpt",
        type=str,
        default="stabilityai/stable-diffusion-xl-base-1.0",
    )
    parser.add_argument("--batch_size", type=int, default=1)
    parser.add_argument("--num_inference_steps", type=int, default=4)
    parser.add_argument("--model_cpu_offload", action="store_true")
    parser.add_argument("--run_compile", action="store_true")
    args = parser.parse_args()

    benchmark_pipe = LCMLoRATextToImageBenchmark(args)
    benchmark_pipe.benchmark(args)

@@ -1,40 +0,0 @@
import argparse
import sys


sys.path.append(".")
from base_classes import TextToImageBenchmark, TurboTextToImageBenchmark  # noqa: E402


ALL_T2I_CKPTS = [
    "Lykon/DreamShaper",
    "segmind/SSD-1B",
    "stabilityai/stable-diffusion-xl-base-1.0",
    "kandinsky-community/kandinsky-2-2-decoder",
    "warp-ai/wuerstchen",
    "stabilityai/sdxl-turbo",
]


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--ckpt",
        type=str,
        default="Lykon/DreamShaper",
        choices=ALL_T2I_CKPTS,
    )
    parser.add_argument("--batch_size", type=int, default=1)
    parser.add_argument("--num_inference_steps", type=int, default=50)
    parser.add_argument("--model_cpu_offload", action="store_true")
    parser.add_argument("--run_compile", action="store_true")
    args = parser.parse_args()

    benchmark_cls = None
    if "turbo" in args.ckpt:
        benchmark_cls = TurboTextToImageBenchmark
    else:
        benchmark_cls = TextToImageBenchmark

    benchmark_pipe = benchmark_cls(args)
    benchmark_pipe.benchmark(args)
98 benchmarks/benchmarking_flux.py Normal file

@@ -0,0 +1,98 @@
from functools import partial

import torch
from benchmarking_utils import BenchmarkMixin, BenchmarkScenario, model_init_fn

from diffusers import BitsAndBytesConfig, FluxTransformer2DModel
from diffusers.utils.testing_utils import torch_device


CKPT_ID = "black-forest-labs/FLUX.1-dev"
RESULT_FILENAME = "flux.csv"


def get_input_dict(**device_dtype_kwargs):
    # resolution: 1024x1024
    # maximum sequence length 512
    hidden_states = torch.randn(1, 4096, 64, **device_dtype_kwargs)
    encoder_hidden_states = torch.randn(1, 512, 4096, **device_dtype_kwargs)
    pooled_prompt_embeds = torch.randn(1, 768, **device_dtype_kwargs)
    image_ids = torch.ones(512, 3, **device_dtype_kwargs)
    text_ids = torch.ones(4096, 3, **device_dtype_kwargs)
    timestep = torch.tensor([1.0], **device_dtype_kwargs)
    guidance = torch.tensor([1.0], **device_dtype_kwargs)

    return {
        "hidden_states": hidden_states,
        "encoder_hidden_states": encoder_hidden_states,
        "img_ids": image_ids,
        "txt_ids": text_ids,
        "pooled_projections": pooled_prompt_embeds,
        "timestep": timestep,
        "guidance": guidance,
    }


if __name__ == "__main__":
    scenarios = [
        BenchmarkScenario(
            name=f"{CKPT_ID}-bf16",
            model_cls=FluxTransformer2DModel,
            model_init_kwargs={
                "pretrained_model_name_or_path": CKPT_ID,
                "torch_dtype": torch.bfloat16,
                "subfolder": "transformer",
            },
            get_model_input_dict=partial(get_input_dict, device=torch_device, dtype=torch.bfloat16),
            model_init_fn=model_init_fn,
            compile_kwargs={"fullgraph": True},
        ),
        BenchmarkScenario(
            name=f"{CKPT_ID}-bnb-nf4",
            model_cls=FluxTransformer2DModel,
            model_init_kwargs={
                "pretrained_model_name_or_path": CKPT_ID,
                "torch_dtype": torch.bfloat16,
                "subfolder": "transformer",
                "quantization_config": BitsAndBytesConfig(
                    load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16, bnb_4bit_quant_type="nf4"
                ),
            },
            get_model_input_dict=partial(get_input_dict, device=torch_device, dtype=torch.bfloat16),
            model_init_fn=model_init_fn,
        ),
        BenchmarkScenario(
            name=f"{CKPT_ID}-layerwise-upcasting",
            model_cls=FluxTransformer2DModel,
            model_init_kwargs={
                "pretrained_model_name_or_path": CKPT_ID,
                "torch_dtype": torch.bfloat16,
                "subfolder": "transformer",
            },
            get_model_input_dict=partial(get_input_dict, device=torch_device, dtype=torch.bfloat16),
            model_init_fn=partial(model_init_fn, layerwise_upcasting=True),
        ),
        BenchmarkScenario(
            name=f"{CKPT_ID}-group-offload-leaf",
            model_cls=FluxTransformer2DModel,
            model_init_kwargs={
                "pretrained_model_name_or_path": CKPT_ID,
                "torch_dtype": torch.bfloat16,
                "subfolder": "transformer",
            },
            get_model_input_dict=partial(get_input_dict, device=torch_device, dtype=torch.bfloat16),
            model_init_fn=partial(
                model_init_fn,
                group_offload_kwargs={
                    "onload_device": torch_device,
                    "offload_device": torch.device("cpu"),
                    "offload_type": "leaf_level",
                    "use_stream": True,
                    "non_blocking": True,
                },
            ),
        ),
    ]

    runner = BenchmarkMixin()
    runner.run_bencmarks_and_collate(scenarios, filename=RESULT_FILENAME)
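For reference, the `(1, 4096, 64)` `hidden_states` shape used in `get_input_dict` above follows from Flux's latent layout. This derivation is an assumption based on the model's published architecture, not something stated in this diff: a 1024x1024 image gives a 128x128, 16-channel latent after 8x VAE downsampling, which is then packed into 2x2 patches, i.e. 64x64 = 4096 tokens of 16*2*2 = 64 channels each.

```py
# Sketch of how the dummy Flux input shape is derived
# (assumes 8x VAE downsampling and 2x2 latent packing; illustrative only).
height = width = 1024
latent_h, latent_w = height // 8, width // 8    # 128 x 128 latent
num_tokens = (latent_h // 2) * (latent_w // 2)  # 4096 packed tokens
channels_per_token = 16 * 2 * 2                 # 64 channels per token
print(num_tokens, channels_per_token)           # 4096 64
```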
80 benchmarks/benchmarking_ltx.py Normal file

@@ -0,0 +1,80 @@
from functools import partial

import torch
from benchmarking_utils import BenchmarkMixin, BenchmarkScenario, model_init_fn

from diffusers import LTXVideoTransformer3DModel
from diffusers.utils.testing_utils import torch_device


CKPT_ID = "Lightricks/LTX-Video-0.9.7-dev"
RESULT_FILENAME = "ltx.csv"


def get_input_dict(**device_dtype_kwargs):
    # 512x704 (161 frames)
    # `max_sequence_length`: 256
    hidden_states = torch.randn(1, 7392, 128, **device_dtype_kwargs)
    encoder_hidden_states = torch.randn(1, 256, 4096, **device_dtype_kwargs)
    encoder_attention_mask = torch.ones(1, 256, **device_dtype_kwargs)
    timestep = torch.tensor([1.0], **device_dtype_kwargs)
    video_coords = torch.randn(1, 3, 7392, **device_dtype_kwargs)

    return {
        "hidden_states": hidden_states,
        "encoder_hidden_states": encoder_hidden_states,
        "encoder_attention_mask": encoder_attention_mask,
        "timestep": timestep,
        "video_coords": video_coords,
    }


if __name__ == "__main__":
    scenarios = [
        BenchmarkScenario(
            name=f"{CKPT_ID}-bf16",
            model_cls=LTXVideoTransformer3DModel,
            model_init_kwargs={
                "pretrained_model_name_or_path": CKPT_ID,
                "torch_dtype": torch.bfloat16,
                "subfolder": "transformer",
            },
            get_model_input_dict=partial(get_input_dict, device=torch_device, dtype=torch.bfloat16),
            model_init_fn=model_init_fn,
            compile_kwargs={"fullgraph": True},
        ),
        BenchmarkScenario(
            name=f"{CKPT_ID}-layerwise-upcasting",
            model_cls=LTXVideoTransformer3DModel,
            model_init_kwargs={
                "pretrained_model_name_or_path": CKPT_ID,
                "torch_dtype": torch.bfloat16,
                "subfolder": "transformer",
            },
            get_model_input_dict=partial(get_input_dict, device=torch_device, dtype=torch.bfloat16),
            model_init_fn=partial(model_init_fn, layerwise_upcasting=True),
        ),
        BenchmarkScenario(
            name=f"{CKPT_ID}-group-offload-leaf",
            model_cls=LTXVideoTransformer3DModel,
            model_init_kwargs={
                "pretrained_model_name_or_path": CKPT_ID,
                "torch_dtype": torch.bfloat16,
                "subfolder": "transformer",
            },
            get_model_input_dict=partial(get_input_dict, device=torch_device, dtype=torch.bfloat16),
            model_init_fn=partial(
                model_init_fn,
                group_offload_kwargs={
                    "onload_device": torch_device,
                    "offload_device": torch.device("cpu"),
                    "offload_type": "leaf_level",
                    "use_stream": True,
                    "non_blocking": True,
                },
            ),
        ),
    ]

    runner = BenchmarkMixin()
    runner.run_bencmarks_and_collate(scenarios, filename=RESULT_FILENAME)
82 benchmarks/benchmarking_sdxl.py Normal file

@@ -0,0 +1,82 @@
from functools import partial

import torch
from benchmarking_utils import BenchmarkMixin, BenchmarkScenario, model_init_fn

from diffusers import UNet2DConditionModel
from diffusers.utils.testing_utils import torch_device


CKPT_ID = "stabilityai/stable-diffusion-xl-base-1.0"
RESULT_FILENAME = "sdxl.csv"


def get_input_dict(**device_dtype_kwargs):
    # height: 1024
    # width: 1024
    # max_sequence_length: 77
    hidden_states = torch.randn(1, 4, 128, 128, **device_dtype_kwargs)
    encoder_hidden_states = torch.randn(1, 77, 2048, **device_dtype_kwargs)
    timestep = torch.tensor([1.0], **device_dtype_kwargs)
    added_cond_kwargs = {
        "text_embeds": torch.randn(1, 1280, **device_dtype_kwargs),
        "time_ids": torch.ones(1, 6, **device_dtype_kwargs),
    }

    return {
        "sample": hidden_states,
        "encoder_hidden_states": encoder_hidden_states,
        "timestep": timestep,
        "added_cond_kwargs": added_cond_kwargs,
    }


if __name__ == "__main__":
    scenarios = [
        BenchmarkScenario(
            name=f"{CKPT_ID}-bf16",
            model_cls=UNet2DConditionModel,
            model_init_kwargs={
                "pretrained_model_name_or_path": CKPT_ID,
                "torch_dtype": torch.bfloat16,
                "subfolder": "unet",
            },
            get_model_input_dict=partial(get_input_dict, device=torch_device, dtype=torch.bfloat16),
            model_init_fn=model_init_fn,
            compile_kwargs={"fullgraph": True},
        ),
        BenchmarkScenario(
            name=f"{CKPT_ID}-layerwise-upcasting",
            model_cls=UNet2DConditionModel,
            model_init_kwargs={
                "pretrained_model_name_or_path": CKPT_ID,
                "torch_dtype": torch.bfloat16,
                "subfolder": "unet",
            },
            get_model_input_dict=partial(get_input_dict, device=torch_device, dtype=torch.bfloat16),
            model_init_fn=partial(model_init_fn, layerwise_upcasting=True),
        ),
        BenchmarkScenario(
            name=f"{CKPT_ID}-group-offload-leaf",
            model_cls=UNet2DConditionModel,
            model_init_kwargs={
                "pretrained_model_name_or_path": CKPT_ID,
                "torch_dtype": torch.bfloat16,
                "subfolder": "unet",
            },
            get_model_input_dict=partial(get_input_dict, device=torch_device, dtype=torch.bfloat16),
            model_init_fn=partial(
                model_init_fn,
                group_offload_kwargs={
                    "onload_device": torch_device,
                    "offload_device": torch.device("cpu"),
                    "offload_type": "leaf_level",
                    "use_stream": True,
                    "non_blocking": True,
                },
            ),
        ),
    ]

    runner = BenchmarkMixin()
    runner.run_bencmarks_and_collate(scenarios, filename=RESULT_FILENAME)
244
benchmarks/benchmarking_utils.py
Normal file
244
benchmarks/benchmarking_utils.py
Normal file
@@ -0,0 +1,244 @@
|
||||
import gc
|
||||
import inspect
|
||||
import logging
|
||||
import os
|
||||
import queue
|
||||
import threading
|
||||
from contextlib import nullcontext
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, Callable, Dict, Optional, Union
|
||||
|
||||
import pandas as pd
|
||||
import torch
|
||||
import torch.utils.benchmark as benchmark
|
||||
|
||||
from diffusers.models.modeling_utils import ModelMixin
|
||||
from diffusers.utils.testing_utils import require_torch_gpu, torch_device
|
||||
|
||||
|
||||
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s: %(message)s")
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
NUM_WARMUP_ROUNDS = 5
|
||||
|
||||
|
||||
def benchmark_fn(f, *args, **kwargs):
|
||||
t0 = benchmark.Timer(
|
||||
stmt="f(*args, **kwargs)",
|
||||
globals={"args": args, "kwargs": kwargs, "f": f},
|
||||
num_threads=1,
|
||||
)
|
||||
return float(f"{(t0.blocked_autorange().mean):.3f}")
|
||||
|
||||
|
||||
def flush():
|
||||
gc.collect()
|
||||
torch.cuda.empty_cache()
|
||||
torch.cuda.reset_max_memory_allocated()
|
||||
torch.cuda.reset_peak_memory_stats()
|
||||
|
||||
|
||||
# Adapted from https://github.com/lucasb-eyer/cnn_vit_benchmarks/blob/15b665ff758e8062131353076153905cae00a71f/main.py
|
||||
def calculate_flops(model, input_dict):
|
||||
try:
|
||||
from torchprofile import profile_macs
|
||||
except ModuleNotFoundError:
|
||||
raise
|
||||
|
||||
# This is a hacky way to convert the kwargs to args as `profile_macs` cries about kwargs.
|
||||
sig = inspect.signature(model.forward)
|
||||
param_names = [
|
||||
p.name
|
||||
for p in sig.parameters.values()
|
||||
if p.kind
|
||||
in (
|
||||
inspect.Parameter.POSITIONAL_ONLY,
|
||||
inspect.Parameter.POSITIONAL_OR_KEYWORD,
|
||||
)
|
||||
and p.name != "self"
|
||||
]
|
||||
bound = sig.bind_partial(**input_dict)
|
||||
bound.apply_defaults()
|
||||
args = tuple(bound.arguments[name] for name in param_names)
|
||||
|
||||
model.eval()
|
||||
with torch.no_grad():
|
||||
macs = profile_macs(model, args)
|
||||
flops = 2 * macs # 1 MAC operation = 2 FLOPs (1 multiplication + 1 addition)
|
||||
return flops
|
||||
|
||||
|
||||
def calculate_params(model):
|
||||
return sum(p.numel() for p in model.parameters())
|
||||
|
||||
|
||||
# Users can define their own in case this doesn't suffice. For most cases,
|
||||
# it should be sufficient.
|
||||
def model_init_fn(model_cls, group_offload_kwargs=None, layerwise_upcasting=False, **init_kwargs):
|
||||
model = model_cls.from_pretrained(**init_kwargs).eval()
|
||||
if group_offload_kwargs and isinstance(group_offload_kwargs, dict):
|
||||
model.enable_group_offload(**group_offload_kwargs)
|
||||
else:
|
||||
model.to(torch_device)
|
||||
if layerwise_upcasting:
|
||||
model.enable_layerwise_casting(
|
||||
storage_dtype=torch.float8_e4m3fn, compute_dtype=init_kwargs.get("torch_dtype", torch.bfloat16)
|
||||
)
|
||||
return model
|
||||
|
||||
|
||||
@dataclass
|
||||
class BenchmarkScenario:
|
||||
name: str
|
||||
model_cls: ModelMixin
|
||||
model_init_kwargs: Dict[str, Any]
|
||||
model_init_fn: Callable
|
||||
get_model_input_dict: Callable
|
||||
compile_kwargs: Optional[Dict[str, Any]] = None
|
||||
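# Minimal illustrative scenario (hypothetical checkpoint and input shapes; see
# benchmarking_wan.py below for a complete, runnable example):
#
# scenario = BenchmarkScenario(
#     name="my-model-bf16",
#     model_cls=SomeDiffusersModel,
#     model_init_kwargs={"pretrained_model_name_or_path": "org/ckpt", "torch_dtype": torch.bfloat16},
#     model_init_fn=model_init_fn,
#     get_model_input_dict=lambda: {"hidden_states": torch.randn(1, 4, 64, 64, device=torch_device)},
#     compile_kwargs={"fullgraph": True},
# )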
|
||||
|
||||
@require_torch_gpu
|
||||
class BenchmarkMixin:
|
||||
def pre_benchmark(self):
|
||||
flush()
|
||||
torch.compiler.reset()
|
||||
|
||||
def post_benchmark(self, model):
|
||||
model.cpu()
|
||||
flush()
|
||||
torch.compiler.reset()
|
||||
|
||||
@torch.no_grad()
|
||||
def run_benchmark(self, scenario: BenchmarkScenario):
|
||||
# 0) Basic stats
|
||||
logger.info(f"Running scenario: {scenario.name}.")
|
||||
try:
|
||||
model = model_init_fn(scenario.model_cls, **scenario.model_init_kwargs)
|
||||
num_params = round(calculate_params(model) / 1e9, 2)
|
||||
try:
|
||||
flops = round(calculate_flops(model, input_dict=scenario.get_model_input_dict()) / 1e9, 2)
|
||||
except Exception as e:
|
||||
logger.info(f"Problem in calculating FLOPs:\n{e}")
|
||||
flops = None
|
||||
model.cpu()
|
||||
del model
|
||||
except Exception as e:
|
||||
logger.info(f"Error while initializing the model and calculating FLOPs:\n{e}")
|
||||
return {}
|
||||
self.pre_benchmark()
|
||||
|
||||
# 1) plain stats
|
||||
results = {}
|
||||
plain = None
|
||||
try:
|
||||
plain = self._run_phase(
|
||||
model_cls=scenario.model_cls,
|
||||
init_fn=scenario.model_init_fn,
|
||||
init_kwargs=scenario.model_init_kwargs,
|
||||
get_input_fn=scenario.get_model_input_dict,
|
||||
compile_kwargs=None,
|
||||
)
|
||||
except Exception as e:
|
||||
logger.info(f"Benchmark could not be run with the following error:\n{e}")
|
||||
return results
|
||||
|
||||
# 2) compiled stats (if any)
|
||||
compiled = {"time": None, "memory": None}
|
||||
if scenario.compile_kwargs:
|
||||
try:
|
||||
compiled = self._run_phase(
|
||||
model_cls=scenario.model_cls,
|
||||
init_fn=scenario.model_init_fn,
|
||||
init_kwargs=scenario.model_init_kwargs,
|
||||
get_input_fn=scenario.get_model_input_dict,
|
||||
compile_kwargs=scenario.compile_kwargs,
|
||||
)
|
||||
except Exception as e:
|
||||
logger.info(f"Compilation benchmark could not be run with the following error\n: {e}")
|
||||
if plain is None:
|
||||
return results
|
||||
|
||||
# 3) merge
|
||||
result = {
|
||||
"scenario": scenario.name,
|
||||
"model_cls": scenario.model_cls.__name__,
|
||||
"num_params_B": num_params,
|
||||
"flops_G": flops,
|
||||
"time_plain_s": plain["time"],
|
||||
"mem_plain_GB": plain["memory"],
|
||||
"time_compile_s": compiled["time"],
|
||||
"mem_compile_GB": compiled["memory"],
|
||||
}
|
||||
if scenario.compile_kwargs:
|
||||
result["fullgraph"] = scenario.compile_kwargs.get("fullgraph", False)
|
||||
result["mode"] = scenario.compile_kwargs.get("mode", "default")
|
||||
else:
|
||||
result["fullgraph"], result["mode"] = None, None
|
||||
return result
|
||||
|
||||
def run_benchmarks_and_collate(self, scenarios: Union[BenchmarkScenario, list[BenchmarkScenario]], filename: str):
|
||||
if not isinstance(scenarios, list):
|
||||
scenarios = [scenarios]
|
||||
record_queue = queue.Queue()
|
||||
stop_signal = object()
|
||||
|
||||
def _writer_thread():
|
||||
while True:
|
||||
item = record_queue.get()
|
||||
if item is stop_signal:
|
||||
break
|
||||
df_row = pd.DataFrame([item])
|
||||
write_header = not os.path.exists(filename)
|
||||
df_row.to_csv(filename, mode="a", header=write_header, index=False)
|
||||
record_queue.task_done()
|
||||
|
||||
record_queue.task_done()
|
||||
|
||||
writer = threading.Thread(target=_writer_thread, daemon=True)
|
||||
writer.start()
|
||||
|
||||
for s in scenarios:
|
||||
try:
|
||||
record = self.run_benchmark(s)
|
||||
if record:
|
||||
record_queue.put(record)
|
||||
else:
|
||||
logger.info(f"Record empty from scenario: {s.name}.")
|
||||
except Exception as e:
|
||||
logger.info(f"Running scenario ({s.name}) led to error:\n{e}")
|
||||
record_queue.put(stop_signal)
# Ensure the daemon writer thread has flushed all pending records before exiting.
writer.join()
|
||||
logger.info(f"Results serialized to {filename=}.")
|
||||
|
||||
def _run_phase(
|
||||
self,
|
||||
*,
|
||||
model_cls: ModelMixin,
|
||||
init_fn: Callable,
|
||||
init_kwargs: Dict[str, Any],
|
||||
get_input_fn: Callable,
|
||||
compile_kwargs: Optional[Dict[str, Any]],
|
||||
) -> Dict[str, float]:
|
||||
# setup
|
||||
self.pre_benchmark()
|
||||
|
||||
# init & (optional) compile
|
||||
model = init_fn(model_cls, **init_kwargs)
|
||||
if compile_kwargs:
|
||||
model.compile(**compile_kwargs)
|
||||
|
||||
# build inputs
|
||||
inp = get_input_fn()
|
||||
|
||||
# measure
|
||||
run_ctx = torch._inductor.utils.fresh_inductor_cache() if compile_kwargs else nullcontext()
|
||||
with run_ctx:
|
||||
for _ in range(NUM_WARMUP_ROUNDS):
|
||||
_ = model(**inp)
|
||||
time_s = benchmark_fn(lambda m, d: m(**d), model, inp)
|
||||
mem_gb = torch.cuda.max_memory_allocated() / (1024**3)
|
||||
mem_gb = round(mem_gb, 2)
|
||||
|
||||
# teardown
|
||||
self.post_benchmark(model)
|
||||
del model
|
||||
return {"time": time_s, "memory": mem_gb}
|
||||
74
benchmarks/benchmarking_wan.py
Normal file
@@ -0,0 +1,74 @@
|
||||
from functools import partial
|
||||
|
||||
import torch
|
||||
from benchmarking_utils import BenchmarkMixin, BenchmarkScenario, model_init_fn
|
||||
|
||||
from diffusers import WanTransformer3DModel
|
||||
from diffusers.utils.testing_utils import torch_device
|
||||
|
||||
|
||||
CKPT_ID = "Wan-AI/Wan2.1-T2V-14B-Diffusers"
|
||||
RESULT_FILENAME = "wan.csv"
|
||||
|
||||
|
||||
def get_input_dict(**device_dtype_kwargs):
|
||||
# height: 480
|
||||
# width: 832
|
||||
# num_frames: 81
|
||||
# max_sequence_length: 512
|
||||
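# Latent shape sketch (assuming Wan's VAE uses 4x temporal and 8x spatial compression
# with 16 latent channels): frames (81 - 1) / 4 + 1 = 21, height 480 / 8 = 60, width 832 / 8 = 104.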
hidden_states = torch.randn(1, 16, 21, 60, 104, **device_dtype_kwargs)
|
||||
encoder_hidden_states = torch.randn(1, 512, 4096, **device_dtype_kwargs)
|
||||
timestep = torch.tensor([1.0], **device_dtype_kwargs)
|
||||
|
||||
return {"hidden_states": hidden_states, "encoder_hidden_states": encoder_hidden_states, "timestep": timestep}
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
scenarios = [
|
||||
BenchmarkScenario(
|
||||
name=f"{CKPT_ID}-bf16",
|
||||
model_cls=WanTransformer3DModel,
|
||||
model_init_kwargs={
|
||||
"pretrained_model_name_or_path": CKPT_ID,
|
||||
"torch_dtype": torch.bfloat16,
|
||||
"subfolder": "transformer",
|
||||
},
|
||||
get_model_input_dict=partial(get_input_dict, device=torch_device, dtype=torch.bfloat16),
|
||||
model_init_fn=model_init_fn,
|
||||
compile_kwargs={"fullgraph": True},
|
||||
),
|
||||
BenchmarkScenario(
|
||||
name=f"{CKPT_ID}-layerwise-upcasting",
|
||||
model_cls=WanTransformer3DModel,
|
||||
model_init_kwargs={
|
||||
"pretrained_model_name_or_path": CKPT_ID,
|
||||
"torch_dtype": torch.bfloat16,
|
||||
"subfolder": "transformer",
|
||||
},
|
||||
get_model_input_dict=partial(get_input_dict, device=torch_device, dtype=torch.bfloat16),
|
||||
model_init_fn=partial(model_init_fn, layerwise_upcasting=True),
|
||||
),
|
||||
BenchmarkScenario(
|
||||
name=f"{CKPT_ID}-group-offload-leaf",
|
||||
model_cls=WanTransformer3DModel,
|
||||
model_init_kwargs={
|
||||
"pretrained_model_name_or_path": CKPT_ID,
|
||||
"torch_dtype": torch.bfloat16,
|
||||
"subfolder": "transformer",
|
||||
},
|
||||
get_model_input_dict=partial(get_input_dict, device=torch_device, dtype=torch.bfloat16),
|
||||
model_init_fn=partial(
|
||||
model_init_fn,
|
||||
group_offload_kwargs={
|
||||
"onload_device": torch_device,
|
||||
"offload_device": torch.device("cpu"),
|
||||
"offload_type": "leaf_level",
|
||||
"use_stream": True,
|
||||
"non_blocking": True,
|
||||
},
|
||||
),
|
||||
),
|
||||
]
|
||||
|
||||
runner = BenchmarkMixin()
|
||||
runner.run_benchmarks_and_collate(scenarios, filename=RESULT_FILENAME)
|
||||
166
benchmarks/populate_into_db.py
Normal file
@@ -0,0 +1,166 @@
|
||||
import argparse
|
||||
import os
|
||||
import sys
|
||||
|
||||
import gpustat
|
||||
import pandas as pd
|
||||
import psycopg2
|
||||
import psycopg2.extras
|
||||
from psycopg2.extensions import register_adapter
|
||||
from psycopg2.extras import Json
|
||||
|
||||
|
||||
register_adapter(dict, Json)
|
||||
|
||||
FINAL_CSV_FILENAME = "collated_results.csv"
|
||||
# https://github.com/huggingface/transformers/blob/593e29c5e2a9b17baec010e8dc7c1431fed6e841/benchmark/init_db.sql#L27
|
||||
BENCHMARKS_TABLE_NAME = "benchmarks"
|
||||
MEASUREMENTS_TABLE_NAME = "model_measurements"
|
||||
|
||||
|
||||
def _init_benchmark(conn, branch, commit_id, commit_msg):
|
||||
gpu_stats = gpustat.GPUStatCollection.new_query()
|
||||
metadata = {"gpu_name": gpu_stats[0]["name"]}
|
||||
repository = "huggingface/diffusers"
|
||||
with conn.cursor() as cur:
|
||||
cur.execute(
|
||||
f"INSERT INTO {BENCHMARKS_TABLE_NAME} (repository, branch, commit_id, commit_message, metadata) VALUES (%s, %s, %s, %s, %s) RETURNING benchmark_id",
|
||||
(repository, branch, commit_id, commit_msg, metadata),
|
||||
)
|
||||
benchmark_id = cur.fetchone()[0]
|
||||
print(f"Initialised benchmark #{benchmark_id}")
|
||||
return benchmark_id
|
||||
|
||||
|
||||
def parse_args():
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument(
|
||||
"branch",
|
||||
type=str,
|
||||
help="The branch name on which the benchmarking is performed.",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"commit_id",
|
||||
type=str,
|
||||
help="The commit hash on which the benchmarking is performed.",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"commit_msg",
|
||||
type=str,
|
||||
help="The commit message associated with the commit, truncated to 70 characters.",
|
||||
)
|
||||
args = parser.parse_args()
|
||||
return args
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
args = parse_args()
|
||||
try:
|
||||
conn = psycopg2.connect(
|
||||
host=os.getenv("PGHOST"),
|
||||
database=os.getenv("PGDATABASE"),
|
||||
user=os.getenv("PGUSER"),
|
||||
password=os.getenv("PGPASSWORD"),
|
||||
)
|
||||
print("DB connection established successfully.")
|
||||
except Exception as e:
|
||||
print(f"Problem during DB init: {e}")
|
||||
sys.exit(1)
|
||||
|
||||
try:
|
||||
benchmark_id = _init_benchmark(
|
||||
conn=conn,
|
||||
branch=args.branch,
|
||||
commit_id=args.commit_id,
|
||||
commit_msg=args.commit_msg,
|
||||
)
|
||||
except Exception as e:
|
||||
print(f"Problem during initializing benchmark: {e}")
|
||||
sys.exit(1)
|
||||
|
||||
cur = conn.cursor()
|
||||
|
||||
df = pd.read_csv(FINAL_CSV_FILENAME)
|
||||
|
||||
# Helper to cast values (or None) given a dtype
|
||||
def _cast_value(val, dtype: str):
|
||||
if pd.isna(val):
|
||||
return None
|
||||
|
||||
if dtype == "text":
|
||||
return str(val).strip()
|
||||
|
||||
if dtype == "float":
|
||||
try:
|
||||
return float(val)
|
||||
except ValueError:
|
||||
return None
|
||||
|
||||
if dtype == "bool":
|
||||
s = str(val).strip().lower()
|
||||
if s in ("true", "t", "yes", "1"):
|
||||
return True
|
||||
if s in ("false", "f", "no", "0"):
|
||||
return False
|
||||
if val in (1, 1.0):
|
||||
return True
|
||||
if val in (0, 0.0):
|
||||
return False
|
||||
return None
|
||||
|
||||
return val
|
||||
|
||||
try:
|
||||
rows_to_insert = []
|
||||
for _, row in df.iterrows():
|
||||
scenario = _cast_value(row.get("scenario"), "text")
|
||||
model_cls = _cast_value(row.get("model_cls"), "text")
|
||||
num_params_B = _cast_value(row.get("num_params_B"), "float")
|
||||
flops_G = _cast_value(row.get("flops_G"), "float")
|
||||
time_plain_s = _cast_value(row.get("time_plain_s"), "float")
|
||||
mem_plain_GB = _cast_value(row.get("mem_plain_GB"), "float")
|
||||
time_compile_s = _cast_value(row.get("time_compile_s"), "float")
|
||||
mem_compile_GB = _cast_value(row.get("mem_compile_GB"), "float")
|
||||
fullgraph = _cast_value(row.get("fullgraph"), "bool")
|
||||
mode = _cast_value(row.get("mode"), "text")
|
||||
|
||||
# If "github_sha" column exists in the CSV, cast it; else default to None
|
||||
if "github_sha" in df.columns:
|
||||
github_sha = _cast_value(row.get("github_sha"), "text")
|
||||
else:
|
||||
github_sha = None
|
||||
|
||||
measurements = {
|
||||
"scenario": scenario,
|
||||
"model_cls": model_cls,
|
||||
"num_params_B": num_params_B,
|
||||
"flops_G": flops_G,
|
||||
"time_plain_s": time_plain_s,
|
||||
"mem_plain_GB": mem_plain_GB,
|
||||
"time_compile_s": time_compile_s,
|
||||
"mem_compile_GB": mem_compile_GB,
|
||||
"fullgraph": fullgraph,
|
||||
"mode": mode,
|
||||
"github_sha": github_sha,
|
||||
}
|
||||
rows_to_insert.append((benchmark_id, measurements))
|
||||
|
||||
# Batch-insert all rows
|
||||
insert_sql = f"""
|
||||
INSERT INTO {MEASUREMENTS_TABLE_NAME} (
|
||||
benchmark_id,
|
||||
measurements
|
||||
)
|
||||
VALUES (%s, %s);
|
||||
"""
|
||||
|
||||
psycopg2.extras.execute_batch(cur, insert_sql, rows_to_insert)
|
||||
conn.commit()
|
||||
|
||||
cur.close()
|
||||
conn.close()
|
||||
except Exception as e:
|
||||
print(f"Exception: {e}")
|
||||
sys.exit(1)
|
||||
@@ -1,19 +1,19 @@
|
||||
import glob
|
||||
import sys
|
||||
import os
|
||||
|
||||
import pandas as pd
|
||||
from huggingface_hub import hf_hub_download, upload_file
|
||||
from huggingface_hub.utils import EntryNotFoundError
|
||||
|
||||
|
||||
sys.path.append(".")
|
||||
from utils import BASE_PATH, FINAL_CSV_FILE, GITHUB_SHA, REPO_ID, collate_csv # noqa: E402
|
||||
REPO_ID = "diffusers/benchmarks"
|
||||
|
||||
|
||||
def has_previous_benchmark() -> str:
|
||||
from run_all import FINAL_CSV_FILENAME
|
||||
|
||||
csv_path = None
|
||||
try:
|
||||
csv_path = hf_hub_download(repo_id=REPO_ID, repo_type="dataset", filename=FINAL_CSV_FILE)
|
||||
csv_path = hf_hub_download(repo_id=REPO_ID, repo_type="dataset", filename=FINAL_CSV_FILENAME)
|
||||
except EntryNotFoundError:
|
||||
csv_path = None
|
||||
return csv_path
|
||||
@@ -26,46 +26,50 @@ def filter_float(value):
|
||||
|
||||
|
||||
def push_to_hf_dataset():
|
||||
all_csvs = sorted(glob.glob(f"{BASE_PATH}/*.csv"))
|
||||
collate_csv(all_csvs, FINAL_CSV_FILE)
|
||||
from run_all import FINAL_CSV_FILENAME, GITHUB_SHA
|
||||
|
||||
# If there's an existing benchmark file, we should report the changes.
|
||||
csv_path = has_previous_benchmark()
|
||||
if csv_path is not None:
|
||||
current_results = pd.read_csv(FINAL_CSV_FILE)
|
||||
current_results = pd.read_csv(FINAL_CSV_FILENAME)
|
||||
previous_results = pd.read_csv(csv_path)
|
||||
|
||||
numeric_columns = current_results.select_dtypes(include=["float64", "int64"]).columns
|
||||
numeric_columns = [
|
||||
c for c in numeric_columns if c not in ["batch_size", "num_inference_steps", "actual_gpu_memory (gbs)"]
|
||||
]
|
||||
|
||||
for column in numeric_columns:
|
||||
previous_results[column] = previous_results[column].map(lambda x: filter_float(x))
|
||||
# get previous values as floats, aligned to current index
|
||||
prev_vals = previous_results[column].map(filter_float).reindex(current_results.index)
|
||||
|
||||
# Calculate the percentage change
|
||||
current_results[column] = current_results[column].astype(float)
|
||||
previous_results[column] = previous_results[column].astype(float)
|
||||
percent_change = ((current_results[column] - previous_results[column]) / previous_results[column]) * 100
|
||||
# get current values as floats
|
||||
curr_vals = current_results[column].astype(float)
|
||||
|
||||
# Format the values with '+' or '-' sign and append to original values
|
||||
current_results[column] = current_results[column].map(str) + percent_change.map(
|
||||
lambda x: f" ({'+' if x > 0 else ''}{x:.2f}%)"
|
||||
# stringify the current values
|
||||
curr_str = curr_vals.map(str)
|
||||
|
||||
# build an appendage only when prev exists and differs
|
||||
append_str = prev_vals.where(prev_vals.notnull() & (prev_vals != curr_vals), other=pd.NA).map(
|
||||
lambda x: f" ({x})" if pd.notnull(x) else ""
|
||||
)
|
||||
# There might be newly added rows. So, filter out the NaNs.
|
||||
current_results[column] = current_results[column].map(lambda x: x.replace(" (nan%)", ""))
|
||||
|
||||
# Overwrite the current result file.
|
||||
current_results.to_csv(FINAL_CSV_FILE, index=False)
|
||||
# combine
|
||||
current_results[column] = curr_str + append_str
|
||||
os.remove(FINAL_CSV_FILENAME)
|
||||
current_results.to_csv(FINAL_CSV_FILENAME, index=False)
|
||||
|
||||
commit_message = f"upload from sha: {GITHUB_SHA}" if GITHUB_SHA is not None else "upload benchmark results"
|
||||
upload_file(
|
||||
repo_id=REPO_ID,
|
||||
path_in_repo=FINAL_CSV_FILE,
|
||||
path_or_fileobj=FINAL_CSV_FILE,
|
||||
path_in_repo=FINAL_CSV_FILENAME,
|
||||
path_or_fileobj=FINAL_CSV_FILENAME,
|
||||
repo_type="dataset",
|
||||
commit_message=commit_message,
|
||||
)
|
||||
upload_file(
|
||||
repo_id="diffusers/benchmark-analyzer",
|
||||
path_in_repo=FINAL_CSV_FILENAME,
|
||||
path_or_fileobj=FINAL_CSV_FILENAME,
|
||||
repo_type="space",
|
||||
commit_message=commit_message,
|
||||
)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
6
benchmarks/requirements.txt
Normal file
@@ -0,0 +1,6 @@
|
||||
pandas
|
||||
psutil
|
||||
gpustat
|
||||
torchprofile
|
||||
bitsandbytes
|
||||
psycopg2==2.9.9
|
||||
@@ -1,101 +1,84 @@
|
||||
import glob
|
||||
import logging
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
from typing import List
|
||||
|
||||
import pandas as pd
|
||||
|
||||
|
||||
sys.path.append(".")
|
||||
from benchmark_text_to_image import ALL_T2I_CKPTS # noqa: E402
|
||||
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s: %(message)s")
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
PATTERN = "benchmark_*.py"
|
||||
PATTERN = "benchmarking_*.py"
|
||||
FINAL_CSV_FILENAME = "collated_results.csv"
|
||||
GITHUB_SHA = os.getenv("GITHUB_SHA", None)
|
||||
|
||||
|
||||
class SubprocessCallException(Exception):
|
||||
pass
|
||||
|
||||
|
||||
# Taken from `test_examples_utils.py`
|
||||
def run_command(command: List[str], return_stdout=False):
|
||||
"""
|
||||
Runs `command` with `subprocess.check_output` and will potentially return the `stdout`. Will also properly capture
|
||||
if an error occurred while running `command`
|
||||
"""
|
||||
def run_command(command: list[str], return_stdout=False):
|
||||
try:
|
||||
output = subprocess.check_output(command, stderr=subprocess.STDOUT)
|
||||
if return_stdout:
|
||||
if hasattr(output, "decode"):
|
||||
output = output.decode("utf-8")
|
||||
return output
|
||||
if return_stdout and hasattr(output, "decode"):
|
||||
return output.decode("utf-8")
|
||||
except subprocess.CalledProcessError as e:
|
||||
raise SubprocessCallException(
|
||||
f"Command `{' '.join(command)}` failed with the following error:\n\n{e.output.decode()}"
|
||||
) from e
|
||||
raise SubprocessCallException(f"Command `{' '.join(command)}` failed with:\n{e.output.decode()}") from e
|
||||
|
||||
|
||||
def main():
|
||||
python_files = glob.glob(PATTERN)
|
||||
def merge_csvs(final_csv: str = "collated_results.csv"):
|
||||
all_csvs = glob.glob("*.csv")
|
||||
all_csvs = [f for f in all_csvs if f != final_csv]
|
||||
if not all_csvs:
|
||||
logger.info("No result CSVs found to merge.")
|
||||
return
|
||||
|
||||
for file in python_files:
|
||||
print(f"****** Running file: {file} ******")
|
||||
|
||||
# Run with canonical settings.
|
||||
if file != "benchmark_text_to_image.py" and file != "benchmark_ip_adapters.py":
|
||||
command = f"python {file}"
|
||||
run_command(command.split())
|
||||
|
||||
command += " --run_compile"
|
||||
run_command(command.split())
|
||||
|
||||
# Run variants.
|
||||
for file in python_files:
|
||||
# See: https://github.com/pytorch/pytorch/issues/129637
|
||||
if file == "benchmark_ip_adapters.py":
|
||||
df_list = []
|
||||
for f in all_csvs:
|
||||
try:
|
||||
d = pd.read_csv(f)
|
||||
except pd.errors.EmptyDataError:
|
||||
# If a file existed but was zero bytes or corrupted, skip it
|
||||
continue
|
||||
df_list.append(d)
|
||||
|
||||
if file == "benchmark_text_to_image.py":
|
||||
for ckpt in ALL_T2I_CKPTS:
|
||||
command = f"python {file} --ckpt {ckpt}"
|
||||
if not df_list:
|
||||
logger.info("All result CSVs were empty or invalid; nothing to merge.")
|
||||
return
|
||||
|
||||
if "turbo" in ckpt:
|
||||
command += " --num_inference_steps 1"
|
||||
final_df = pd.concat(df_list, ignore_index=True)
|
||||
if GITHUB_SHA is not None:
|
||||
final_df["github_sha"] = GITHUB_SHA
|
||||
final_df.to_csv(final_csv, index=False)
|
||||
logger.info(f"Merged {len(all_csvs)} partial CSVs → {final_csv}.")
|
||||
|
||||
run_command(command.split())
|
||||
|
||||
command += " --run_compile"
|
||||
run_command(command.split())
|
||||
def run_scripts():
|
||||
python_files = sorted(glob.glob(PATTERN))
|
||||
python_files = [f for f in python_files if f != "benchmarking_utils.py"]
|
||||
|
||||
elif file == "benchmark_sd_img.py":
|
||||
for ckpt in ["stabilityai/stable-diffusion-xl-refiner-1.0", "stabilityai/sdxl-turbo"]:
|
||||
command = f"python {file} --ckpt {ckpt}"
|
||||
for file in python_files:
|
||||
script_name = file.split(".py")[0].split("_")[-1] # example: benchmarking_foo.py -> foo
|
||||
logger.info(f"\n****** Running file: {file} ******")
|
||||
|
||||
if ckpt == "stabilityai/sdxl-turbo":
|
||||
command += " --num_inference_steps 2"
|
||||
partial_csv = f"{script_name}.csv"
|
||||
if os.path.exists(partial_csv):
|
||||
logger.info(f"Found {partial_csv}. Removing for safer numbers and duplication.")
|
||||
os.remove(partial_csv)
|
||||
|
||||
run_command(command.split())
|
||||
command += " --run_compile"
|
||||
run_command(command.split())
|
||||
command = ["python", file]
|
||||
try:
|
||||
run_command(command)
|
||||
logger.info(f"→ {file} finished normally.")
|
||||
except SubprocessCallException as e:
|
||||
logger.info(f"Error running {file}:\n{e}")
|
||||
finally:
|
||||
logger.info(f"→ Merging partial CSVs after {file} …")
|
||||
merge_csvs(final_csv=FINAL_CSV_FILENAME)
|
||||
|
||||
elif file in ["benchmark_sd_inpainting.py", "benchmark_ip_adapters.py"]:
|
||||
sdxl_ckpt = "stabilityai/stable-diffusion-xl-base-1.0"
|
||||
command = f"python {file} --ckpt {sdxl_ckpt}"
|
||||
run_command(command.split())
|
||||
|
||||
command += " --run_compile"
|
||||
run_command(command.split())
|
||||
|
||||
elif file in ["benchmark_controlnet.py", "benchmark_t2i_adapter.py"]:
|
||||
sdxl_ckpt = (
|
||||
"diffusers/controlnet-canny-sdxl-1.0"
|
||||
if "controlnet" in file
|
||||
else "TencentARC/t2i-adapter-canny-sdxl-1.0"
|
||||
)
|
||||
command = f"python {file} --ckpt {sdxl_ckpt}"
|
||||
run_command(command.split())
|
||||
|
||||
command += " --run_compile"
|
||||
run_command(command.split())
|
||||
logger.info(f"\nAll scripts attempted. Final collated CSV: {FINAL_CSV_FILENAME}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
run_scripts()
|
||||
|
||||
@@ -1,98 +0,0 @@
|
||||
import argparse
|
||||
import csv
|
||||
import gc
|
||||
import os
|
||||
from dataclasses import dataclass
|
||||
from typing import Dict, List, Union
|
||||
|
||||
import torch
|
||||
import torch.utils.benchmark as benchmark
|
||||
|
||||
|
||||
GITHUB_SHA = os.getenv("GITHUB_SHA", None)
|
||||
BENCHMARK_FIELDS = [
|
||||
"pipeline_cls",
|
||||
"ckpt_id",
|
||||
"batch_size",
|
||||
"num_inference_steps",
|
||||
"model_cpu_offload",
|
||||
"run_compile",
|
||||
"time (secs)",
|
||||
"memory (gbs)",
|
||||
"actual_gpu_memory (gbs)",
|
||||
"github_sha",
|
||||
]
|
||||
|
||||
PROMPT = "ghibli style, a fantasy landscape with castles"
|
||||
BASE_PATH = os.getenv("BASE_PATH", ".")
|
||||
TOTAL_GPU_MEMORY = float(os.getenv("TOTAL_GPU_MEMORY", torch.cuda.get_device_properties(0).total_memory / (1024**3)))
|
||||
|
||||
REPO_ID = "diffusers/benchmarks"
|
||||
FINAL_CSV_FILE = "collated_results.csv"
|
||||
|
||||
|
||||
@dataclass
|
||||
class BenchmarkInfo:
|
||||
time: float
|
||||
memory: float
|
||||
|
||||
|
||||
def flush():
|
||||
"""Wipes off memory."""
|
||||
gc.collect()
|
||||
torch.cuda.empty_cache()
|
||||
torch.cuda.reset_max_memory_allocated()
|
||||
torch.cuda.reset_peak_memory_stats()
|
||||
|
||||
|
||||
def bytes_to_giga_bytes(bytes):
|
||||
return f"{(bytes / 1024 / 1024 / 1024):.3f}"
|
||||
|
||||
|
||||
def benchmark_fn(f, *args, **kwargs):
|
||||
t0 = benchmark.Timer(
|
||||
stmt="f(*args, **kwargs)",
|
||||
globals={"args": args, "kwargs": kwargs, "f": f},
|
||||
num_threads=torch.get_num_threads(),
|
||||
)
|
||||
return f"{(t0.blocked_autorange().mean):.3f}"
|
||||
|
||||
|
||||
def generate_csv_dict(
|
||||
pipeline_cls: str, ckpt: str, args: argparse.Namespace, benchmark_info: BenchmarkInfo
|
||||
) -> Dict[str, Union[str, bool, float]]:
|
||||
"""Packs benchmarking data into a dictionary for latter serialization."""
|
||||
data_dict = {
|
||||
"pipeline_cls": pipeline_cls,
|
||||
"ckpt_id": ckpt,
|
||||
"batch_size": args.batch_size,
|
||||
"num_inference_steps": args.num_inference_steps,
|
||||
"model_cpu_offload": args.model_cpu_offload,
|
||||
"run_compile": args.run_compile,
|
||||
"time (secs)": benchmark_info.time,
|
||||
"memory (gbs)": benchmark_info.memory,
|
||||
"actual_gpu_memory (gbs)": f"{(TOTAL_GPU_MEMORY):.3f}",
|
||||
"github_sha": GITHUB_SHA,
|
||||
}
|
||||
return data_dict
|
||||
|
||||
|
||||
def write_to_csv(file_name: str, data_dict: Dict[str, Union[str, bool, float]]):
|
||||
"""Serializes a dictionary into a CSV file."""
|
||||
with open(file_name, mode="w", newline="") as csvfile:
|
||||
writer = csv.DictWriter(csvfile, fieldnames=BENCHMARK_FIELDS)
|
||||
writer.writeheader()
|
||||
writer.writerow(data_dict)
|
||||
|
||||
|
||||
def collate_csv(input_files: List[str], output_file: str):
|
||||
"""Collates multiple identically structured CSVs into a single CSV file."""
|
||||
with open(output_file, mode="w", newline="") as outfile:
|
||||
writer = csv.DictWriter(outfile, fieldnames=BENCHMARK_FIELDS)
|
||||
writer.writeheader()
|
||||
|
||||
for file in input_files:
|
||||
with open(file, mode="r") as infile:
|
||||
reader = csv.DictReader(infile)
|
||||
for row in reader:
|
||||
writer.writerow(row)
|
||||
@@ -47,6 +47,10 @@ RUN python3.10 -m pip install --no-cache-dir --upgrade pip uv==0.1.11 && \
|
||||
tensorboard \
|
||||
transformers \
|
||||
matplotlib \
|
||||
setuptools==69.5.1
|
||||
setuptools==69.5.1 \
|
||||
bitsandbytes \
|
||||
torchao \
|
||||
gguf \
|
||||
optimum-quanto
|
||||
|
||||
CMD ["/bin/bash"]
|
||||
|
||||
@@ -1,36 +1,39 @@
|
||||
- sections:
|
||||
- title: Get started
|
||||
sections:
|
||||
- local: index
|
||||
title: 🧨 Diffusers
|
||||
- local: quicktour
|
||||
title: Quicktour
|
||||
- local: stable_diffusion
|
||||
title: Effective and efficient diffusion
|
||||
title: Diffusers
|
||||
- local: installation
|
||||
title: Installation
|
||||
title: Get started
|
||||
- sections:
|
||||
- local: tutorials/tutorial_overview
|
||||
title: Overview
|
||||
- local: using-diffusers/write_own_pipeline
|
||||
title: Understanding pipelines, models and schedulers
|
||||
- local: tutorials/autopipeline
|
||||
title: AutoPipeline
|
||||
- local: tutorials/basic_training
|
||||
title: Train a diffusion model
|
||||
title: Tutorials
|
||||
- sections:
|
||||
- local: quicktour
|
||||
title: Quickstart
|
||||
- local: stable_diffusion
|
||||
title: Basic performance
|
||||
|
||||
- title: DiffusionPipeline
|
||||
isExpanded: false
|
||||
sections:
|
||||
- local: using-diffusers/loading
|
||||
title: Load pipelines
|
||||
- local: tutorials/autopipeline
|
||||
title: AutoPipeline
|
||||
- local: using-diffusers/custom_pipeline_overview
|
||||
title: Load community pipelines and components
|
||||
- local: using-diffusers/callback
|
||||
title: Pipeline callbacks
|
||||
- local: using-diffusers/reusing_seeds
|
||||
title: Reproducible pipelines
|
||||
- local: using-diffusers/schedulers
|
||||
title: Load schedulers and models
|
||||
- local: using-diffusers/scheduler_features
|
||||
title: Scheduler features
|
||||
- local: using-diffusers/other-formats
|
||||
title: Model files and layouts
|
||||
- local: using-diffusers/push_to_hub
|
||||
title: Push files to the Hub
|
||||
title: Load pipelines and adapters
|
||||
- sections:
|
||||
|
||||
- title: Adapters
|
||||
isExpanded: false
|
||||
sections:
|
||||
- local: tutorials/using_peft_for_inference
|
||||
title: LoRA
|
||||
- local: using-diffusers/ip_adapter
|
||||
@@ -43,27 +46,16 @@
|
||||
title: DreamBooth
|
||||
- local: using-diffusers/textual_inversion_inference
|
||||
title: Textual inversion
|
||||
title: Adapters
|
||||
|
||||
- title: Inference
|
||||
isExpanded: false
|
||||
- sections:
|
||||
- local: using-diffusers/unconditional_image_generation
|
||||
title: Unconditional image generation
|
||||
- local: using-diffusers/conditional_image_generation
|
||||
title: Text-to-image
|
||||
- local: using-diffusers/img2img
|
||||
title: Image-to-image
|
||||
- local: using-diffusers/inpaint
|
||||
title: Inpainting
|
||||
- local: using-diffusers/text-img2vid
|
||||
title: Video generation
|
||||
- local: using-diffusers/depth2img
|
||||
title: Depth-to-image
|
||||
title: Generative tasks
|
||||
- sections:
|
||||
- local: using-diffusers/overview_techniques
|
||||
title: Overview
|
||||
sections:
|
||||
- local: using-diffusers/weighted_prompts
|
||||
title: Prompt techniques
|
||||
- local: using-diffusers/create_a_server
|
||||
title: Create a server
|
||||
- local: using-diffusers/batched_inference
|
||||
title: Batch inference
|
||||
- local: training/distributed_inference
|
||||
title: Distributed inference
|
||||
- local: using-diffusers/scheduler_features
|
||||
@@ -74,14 +66,38 @@
|
||||
title: Reproducible pipelines
|
||||
- local: using-diffusers/image_quality
|
||||
title: Controlling image quality
|
||||
- local: using-diffusers/weighted_prompts
|
||||
title: Prompt techniques
|
||||
title: Inference techniques
|
||||
- sections:
|
||||
- local: advanced_inference/outpaint
|
||||
title: Outpainting
|
||||
title: Advanced inference
|
||||
- sections:
|
||||
|
||||
- title: Inference optimization
|
||||
isExpanded: false
|
||||
sections:
|
||||
- local: optimization/fp16
|
||||
title: Accelerate inference
|
||||
- local: optimization/cache
|
||||
title: Caching
|
||||
- local: optimization/memory
|
||||
title: Reduce memory usage
|
||||
- local: optimization/speed-memory-optims
|
||||
title: Compile and offloading quantized models
|
||||
- title: Community optimizations
|
||||
sections:
|
||||
- local: optimization/pruna
|
||||
title: Pruna
|
||||
- local: optimization/xformers
|
||||
title: xFormers
|
||||
- local: optimization/tome
|
||||
title: Token merging
|
||||
- local: optimization/deepcache
|
||||
title: DeepCache
|
||||
- local: optimization/tgate
|
||||
title: TGATE
|
||||
- local: optimization/xdit
|
||||
title: xDiT
|
||||
- local: optimization/para_attn
|
||||
title: ParaAttention
|
||||
|
||||
- title: Hybrid Inference
|
||||
isExpanded: false
|
||||
sections:
|
||||
- local: hybrid_inference/overview
|
||||
title: Overview
|
||||
- local: hybrid_inference/vae_decode
|
||||
@@ -90,8 +106,112 @@
|
||||
title: VAE Encode
|
||||
- local: hybrid_inference/api_reference
|
||||
title: API Reference
|
||||
title: Hybrid Inference
|
||||
- sections:
|
||||
|
||||
- title: Modular Diffusers
|
||||
isExpanded: false
|
||||
sections:
|
||||
- local: modular_diffusers/overview
|
||||
title: Overview
|
||||
- local: modular_diffusers/quickstart
|
||||
title: Quickstart
|
||||
- local: modular_diffusers/modular_diffusers_states
|
||||
title: States
|
||||
- local: modular_diffusers/pipeline_block
|
||||
title: ModularPipelineBlocks
|
||||
- local: modular_diffusers/sequential_pipeline_blocks
|
||||
title: SequentialPipelineBlocks
|
||||
- local: modular_diffusers/loop_sequential_pipeline_blocks
|
||||
title: LoopSequentialPipelineBlocks
|
||||
- local: modular_diffusers/auto_pipeline_blocks
|
||||
title: AutoPipelineBlocks
|
||||
- local: modular_diffusers/modular_pipeline
|
||||
title: ModularPipeline
|
||||
- local: modular_diffusers/components_manager
|
||||
title: ComponentsManager
|
||||
- local: modular_diffusers/guiders
|
||||
title: Guiders
|
||||
|
||||
- title: Training
|
||||
isExpanded: false
|
||||
sections:
|
||||
- local: training/overview
|
||||
title: Overview
|
||||
- local: training/create_dataset
|
||||
title: Create a dataset for training
|
||||
- local: training/adapt_a_model
|
||||
title: Adapt a model to a new task
|
||||
- local: tutorials/basic_training
|
||||
title: Train a diffusion model
|
||||
- title: Models
|
||||
sections:
|
||||
- local: training/unconditional_training
|
||||
title: Unconditional image generation
|
||||
- local: training/text2image
|
||||
title: Text-to-image
|
||||
- local: training/sdxl
|
||||
title: Stable Diffusion XL
|
||||
- local: training/kandinsky
|
||||
title: Kandinsky 2.2
|
||||
- local: training/wuerstchen
|
||||
title: Wuerstchen
|
||||
- local: training/controlnet
|
||||
title: ControlNet
|
||||
- local: training/t2i_adapters
|
||||
title: T2I-Adapters
|
||||
- local: training/instructpix2pix
|
||||
title: InstructPix2Pix
|
||||
- local: training/cogvideox
|
||||
title: CogVideoX
|
||||
- title: Methods
|
||||
sections:
|
||||
- local: training/text_inversion
|
||||
title: Textual Inversion
|
||||
- local: training/dreambooth
|
||||
title: DreamBooth
|
||||
- local: training/lora
|
||||
title: LoRA
|
||||
- local: training/custom_diffusion
|
||||
title: Custom Diffusion
|
||||
- local: training/lcm_distill
|
||||
title: Latent Consistency Distillation
|
||||
- local: training/ddpo
|
||||
title: Reinforcement learning training with DDPO
|
||||
|
||||
- title: Quantization
|
||||
isExpanded: false
|
||||
sections:
|
||||
- local: quantization/overview
|
||||
title: Getting started
|
||||
- local: quantization/bitsandbytes
|
||||
title: bitsandbytes
|
||||
- local: quantization/gguf
|
||||
title: gguf
|
||||
- local: quantization/torchao
|
||||
title: torchao
|
||||
- local: quantization/quanto
|
||||
title: quanto
|
||||
|
||||
- title: Model accelerators and hardware
|
||||
isExpanded: false
|
||||
sections:
|
||||
- local: using-diffusers/stable_diffusion_jax_how_to
|
||||
title: JAX/Flax
|
||||
- local: optimization/onnx
|
||||
title: ONNX
|
||||
- local: optimization/open_vino
|
||||
title: OpenVINO
|
||||
- local: optimization/coreml
|
||||
title: Core ML
|
||||
- local: optimization/mps
|
||||
title: Metal Performance Shaders (MPS)
|
||||
- local: optimization/habana
|
||||
title: Intel Gaudi
|
||||
- local: optimization/neuron
|
||||
title: AWS Neuron
|
||||
|
||||
- title: Specific pipeline examples
|
||||
isExpanded: false
|
||||
sections:
|
||||
- local: using-diffusers/consisid
|
||||
title: ConsisID
|
||||
- local: using-diffusers/sdxl
|
||||
@@ -116,106 +236,30 @@
|
||||
title: Stable Video Diffusion
|
||||
- local: using-diffusers/marigold_usage
|
||||
title: Marigold Computer Vision
|
||||
title: Specific pipeline examples
|
||||
- sections:
|
||||
- local: training/overview
|
||||
title: Overview
|
||||
- local: training/create_dataset
|
||||
title: Create a dataset for training
|
||||
- local: training/adapt_a_model
|
||||
title: Adapt a model to a new task
|
||||
- isExpanded: false
|
||||
|
||||
- title: Resources
|
||||
isExpanded: false
|
||||
sections:
|
||||
- title: Task recipes
|
||||
sections:
|
||||
- local: training/unconditional_training
|
||||
- local: using-diffusers/unconditional_image_generation
|
||||
title: Unconditional image generation
|
||||
- local: training/text2image
|
||||
- local: using-diffusers/conditional_image_generation
|
||||
title: Text-to-image
|
||||
- local: training/sdxl
|
||||
title: Stable Diffusion XL
|
||||
- local: training/kandinsky
|
||||
title: Kandinsky 2.2
|
||||
- local: training/wuerstchen
|
||||
title: Wuerstchen
|
||||
- local: training/controlnet
|
||||
title: ControlNet
|
||||
- local: training/t2i_adapters
|
||||
title: T2I-Adapters
|
||||
- local: training/instructpix2pix
|
||||
title: InstructPix2Pix
|
||||
- local: training/cogvideox
|
||||
title: CogVideoX
|
||||
title: Models
|
||||
- isExpanded: false
|
||||
sections:
|
||||
- local: training/text_inversion
|
||||
title: Textual Inversion
|
||||
- local: training/dreambooth
|
||||
title: DreamBooth
|
||||
- local: training/lora
|
||||
title: LoRA
|
||||
- local: training/custom_diffusion
|
||||
title: Custom Diffusion
|
||||
- local: training/lcm_distill
|
||||
title: Latent Consistency Distillation
|
||||
- local: training/ddpo
|
||||
title: Reinforcement learning training with DDPO
|
||||
title: Methods
|
||||
title: Training
|
||||
- sections:
|
||||
- local: quantization/overview
|
||||
title: Getting Started
|
||||
- local: quantization/bitsandbytes
|
||||
title: bitsandbytes
|
||||
- local: quantization/gguf
|
||||
title: gguf
|
||||
- local: quantization/torchao
|
||||
title: torchao
|
||||
- local: quantization/quanto
|
||||
title: quanto
|
||||
title: Quantization Methods
|
||||
- sections:
|
||||
- local: optimization/fp16
|
||||
title: Accelerate inference
|
||||
- local: optimization/cache
|
||||
title: Caching
|
||||
- local: optimization/memory
|
||||
title: Reduce memory usage
|
||||
- local: optimization/speed-memory-optims
|
||||
title: Compile and offloading quantized models
|
||||
- local: optimization/pruna
|
||||
title: Pruna
|
||||
- local: optimization/xformers
|
||||
title: xFormers
|
||||
- local: optimization/tome
|
||||
title: Token merging
|
||||
- local: optimization/deepcache
|
||||
title: DeepCache
|
||||
- local: optimization/tgate
|
||||
title: TGATE
|
||||
- local: optimization/xdit
|
||||
title: xDiT
|
||||
- local: optimization/para_attn
|
||||
title: ParaAttention
|
||||
- sections:
|
||||
- local: using-diffusers/stable_diffusion_jax_how_to
|
||||
title: JAX/Flax
|
||||
- local: optimization/onnx
|
||||
title: ONNX
|
||||
- local: optimization/open_vino
|
||||
title: OpenVINO
|
||||
- local: optimization/coreml
|
||||
title: Core ML
|
||||
title: Optimized model formats
|
||||
- sections:
|
||||
- local: optimization/mps
|
||||
title: Metal Performance Shaders (MPS)
|
||||
- local: optimization/habana
|
||||
title: Intel Gaudi
|
||||
- local: optimization/neuron
|
||||
title: AWS Neuron
|
||||
title: Optimized hardware
|
||||
title: Accelerate inference and reduce memory
|
||||
- sections:
|
||||
- local: using-diffusers/img2img
|
||||
title: Image-to-image
|
||||
- local: using-diffusers/inpaint
|
||||
title: Inpainting
|
||||
- local: advanced_inference/outpaint
|
||||
title: Outpainting
|
||||
- local: using-diffusers/text-img2vid
|
||||
title: Video generation
|
||||
- local: using-diffusers/depth2img
|
||||
title: Depth-to-image
|
||||
- local: using-diffusers/write_own_pipeline
|
||||
title: Understanding pipelines, models and schedulers
|
||||
- local: community_projects
|
||||
title: Projects built with Diffusers
|
||||
- local: conceptual/philosophy
|
||||
title: Philosophy
|
||||
- local: using-diffusers/controlling_generation
|
||||
@@ -226,13 +270,11 @@
|
||||
title: Diffusers' Ethical Guidelines
|
||||
- local: conceptual/evaluation
|
||||
title: Evaluating Diffusion Models
|
||||
title: Conceptual Guides
|
||||
- sections:
|
||||
- local: community_projects
|
||||
title: Projects built with Diffusers
|
||||
title: Community Projects
|
||||
- sections:
|
||||
- isExpanded: false
|
||||
|
||||
- title: API
|
||||
isExpanded: false
|
||||
sections:
|
||||
- title: Main Classes
|
||||
sections:
|
||||
- local: api/configuration
|
||||
title: Configuration
|
||||
@@ -242,8 +284,19 @@
|
||||
title: Outputs
|
||||
- local: api/quantization
|
||||
title: Quantization
|
||||
title: Main Classes
|
||||
- isExpanded: false
|
||||
- title: Modular
|
||||
sections:
|
||||
- local: api/modular_diffusers/pipeline
|
||||
title: Pipeline
|
||||
- local: api/modular_diffusers/pipeline_blocks
|
||||
title: Blocks
|
||||
- local: api/modular_diffusers/pipeline_states
|
||||
title: States
|
||||
- local: api/modular_diffusers/pipeline_components
|
||||
title: Components and configs
|
||||
- local: api/modular_diffusers/guiders
|
||||
title: Guiders
|
||||
- title: Loaders
|
||||
sections:
|
||||
- local: api/loaders/ip_adapter
|
||||
title: IP-Adapter
|
||||
@@ -259,14 +312,14 @@
|
||||
title: SD3Transformer2D
|
||||
- local: api/loaders/peft
|
||||
title: PEFT
|
||||
title: Loaders
|
||||
- isExpanded: false
|
||||
- title: Models
|
||||
sections:
|
||||
- local: api/models/overview
|
||||
title: Overview
|
||||
- local: api/models/auto_model
|
||||
title: AutoModel
|
||||
- sections:
|
||||
- title: ControlNets
|
||||
sections:
|
||||
- local: api/models/controlnet
|
||||
title: ControlNetModel
|
||||
- local: api/models/controlnet_union
|
||||
@@ -281,8 +334,8 @@
|
||||
title: SD3ControlNetModel
|
||||
- local: api/models/controlnet_sparsectrl
|
||||
title: SparseControlNetModel
|
||||
title: ControlNets
|
||||
- sections:
|
||||
- title: Transformers
|
||||
sections:
|
||||
- local: api/models/allegro_transformer3d
|
||||
title: AllegroTransformer3DModel
|
||||
- local: api/models/aura_flow_transformer2d
|
||||
@@ -327,10 +380,14 @@
|
||||
title: PixArtTransformer2DModel
|
||||
- local: api/models/prior_transformer
|
||||
title: PriorTransformer
|
||||
- local: api/models/qwenimage_transformer2d
|
||||
title: QwenImageTransformer2DModel
|
||||
- local: api/models/sana_transformer2d
|
||||
title: SanaTransformer2DModel
|
||||
- local: api/models/sd3_transformer2d
|
||||
title: SD3Transformer2DModel
|
||||
- local: api/models/skyreels_v2_transformer_3d
|
||||
title: SkyReelsV2Transformer3DModel
|
||||
- local: api/models/stable_audio_transformer
|
||||
title: StableAudioDiTModel
|
||||
- local: api/models/transformer2d
|
||||
@@ -339,8 +396,8 @@
|
||||
title: TransformerTemporalModel
|
||||
- local: api/models/wan_transformer_3d
|
||||
title: WanTransformer3DModel
|
||||
title: Transformers
|
||||
- sections:
|
||||
- title: UNets
|
||||
sections:
|
||||
- local: api/models/stable_cascade_unet
|
||||
title: StableCascadeUNet
|
||||
- local: api/models/unet
|
||||
@@ -355,8 +412,8 @@
|
||||
title: UNetMotionModel
|
||||
- local: api/models/uvit2d
|
||||
title: UViT2DModel
|
||||
title: UNets
|
||||
- sections:
|
||||
- title: VAEs
|
||||
sections:
|
||||
- local: api/models/asymmetricautoencoderkl
|
||||
title: AsymmetricAutoencoderKL
|
||||
- local: api/models/autoencoder_dc
|
||||
@@ -377,6 +434,8 @@
|
||||
title: AutoencoderKLMagvit
|
||||
- local: api/models/autoencoderkl_mochi
|
||||
title: AutoencoderKLMochi
|
||||
- local: api/models/autoencoderkl_qwenimage
|
||||
title: AutoencoderKLQwenImage
|
||||
- local: api/models/autoencoder_kl_wan
|
||||
title: AutoencoderKLWan
|
||||
- local: api/models/consistency_decoder_vae
|
||||
@@ -387,9 +446,7 @@
|
||||
title: Tiny AutoEncoder
|
||||
- local: api/models/vq
|
||||
title: VQModel
|
||||
title: VAEs
|
||||
title: Models
|
||||
- isExpanded: false
|
||||
- title: Pipelines
|
||||
sections:
|
||||
- local: api/pipelines/overview
|
||||
title: Overview
|
||||
@@ -515,6 +572,8 @@
|
||||
title: PixArt-α
|
||||
- local: api/pipelines/pixart_sigma
|
||||
title: PixArt-Σ
|
||||
- local: api/pipelines/qwenimage
|
||||
title: QwenImage
|
||||
- local: api/pipelines/sana
|
||||
title: Sana
|
||||
- local: api/pipelines/sana_sprint
|
||||
@@ -525,11 +584,14 @@
|
||||
title: Semantic Guidance
|
||||
- local: api/pipelines/shap_e
|
||||
title: Shap-E
|
||||
- local: api/pipelines/skyreels_v2
|
||||
title: SkyReels-V2
|
||||
- local: api/pipelines/stable_audio
|
||||
title: Stable Audio
|
||||
- local: api/pipelines/stable_cascade
|
||||
title: Stable Cascade
|
||||
- sections:
|
||||
- title: Stable Diffusion
|
||||
sections:
|
||||
- local: api/pipelines/stable_diffusion/overview
|
||||
title: Overview
|
||||
- local: api/pipelines/stable_diffusion/depth2img
|
||||
@@ -566,7 +628,6 @@
|
||||
title: T2I-Adapter
|
||||
- local: api/pipelines/stable_diffusion/text2img
|
||||
title: Text-to-image
|
||||
title: Stable Diffusion
|
||||
- local: api/pipelines/stable_unclip
|
||||
title: Stable unCLIP
|
||||
- local: api/pipelines/text_to_video
|
||||
@@ -585,8 +646,7 @@
|
||||
title: Wan
|
||||
- local: api/pipelines/wuerstchen
|
||||
title: Wuerstchen
|
||||
title: Pipelines
|
||||
- isExpanded: false
|
||||
- title: Schedulers
|
||||
sections:
|
||||
- local: api/schedulers/overview
|
||||
title: Overview
|
||||
@@ -656,8 +716,7 @@
|
||||
title: UniPCMultistepScheduler
|
||||
- local: api/schedulers/vq_diffusion
|
||||
title: VQDiffusionScheduler
|
||||
title: Schedulers
|
||||
- isExpanded: false
|
||||
- title: Internal classes
|
||||
sections:
|
||||
- local: api/internal_classes_overview
|
||||
title: Overview
|
||||
@@ -675,5 +734,3 @@
|
||||
title: VAE Image Processor
|
||||
- local: api/video_processor
|
||||
title: Video Processor
|
||||
title: Internal classes
|
||||
title: API
|
||||
|
||||
@@ -28,3 +28,9 @@ Cache methods speedup diffusion transformers by storing and reusing intermediate
|
||||
[[autodoc]] FasterCacheConfig
|
||||
|
||||
[[autodoc]] apply_faster_cache
|
||||
|
||||
### FirstBlockCacheConfig
|
||||
|
||||
[[autodoc]] FirstBlockCacheConfig
|
||||
|
||||
[[autodoc]] apply_first_block_cache
|
||||
|
||||
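As a hedged sketch (the checkpoint, the `diffusers.hooks` import path, and the `threshold` value are illustrative assumptions rather than taken from this page), first-block caching can be applied to a pipeline's transformer roughly like this:

```python
import torch

from diffusers import FluxPipeline
from diffusers.hooks import FirstBlockCacheConfig, apply_first_block_cache

pipe = FluxPipeline.from_pretrained("black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16).to("cuda")

# Reuse cached block outputs whenever the first block's residual changes by less
# than the threshold between denoising steps.
apply_first_block_cache(pipe.transformer, FirstBlockCacheConfig(threshold=0.2))

image = pipe("a photo of an astronaut riding a horse on mars", num_inference_steps=28).images[0]
```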
@@ -16,7 +16,7 @@ Schedulers from [`~schedulers.scheduling_utils.SchedulerMixin`] and models from
|
||||
|
||||
<Tip>
|
||||
|
||||
To use private or [gated](https://huggingface.co/docs/hub/models-gated#gated-models) models, log-in with `huggingface-cli login`.
|
||||
To use private or [gated](https://huggingface.co/docs/hub/models-gated#gated-models) models, log-in with `hf auth login`.
|
||||
|
||||
</Tip>
|
||||
|
||||
|
||||
@@ -26,9 +26,11 @@ LoRA is a fast and lightweight training method that inserts and trains a signifi
|
||||
- [`HunyuanVideoLoraLoaderMixin`] provides similar functions for [HunyuanVideo](https://huggingface.co/docs/diffusers/main/en/api/pipelines/hunyuan_video).
|
||||
- [`Lumina2LoraLoaderMixin`] provides similar functions for [Lumina2](https://huggingface.co/docs/diffusers/main/en/api/pipelines/lumina2).
|
||||
- [`WanLoraLoaderMixin`] provides similar functions for [Wan](https://huggingface.co/docs/diffusers/main/en/api/pipelines/wan).
|
||||
- [`SkyReelsV2LoraLoaderMixin`] provides similar functions for [SkyReels-V2](https://huggingface.co/docs/diffusers/main/en/api/pipelines/skyreels_v2).
|
||||
- [`CogView4LoraLoaderMixin`] provides similar functions for [CogView4](https://huggingface.co/docs/diffusers/main/en/api/pipelines/cogview4).
|
||||
- [`AmusedLoraLoaderMixin`] is for the [`AmusedPipeline`].
|
||||
- [`HiDreamImageLoraLoaderMixin`] provides similar functions for [HiDream Image](https://huggingface.co/docs/diffusers/main/en/api/pipelines/hidream)
|
||||
- [`QwenImageLoraLoaderMixin`] provides similar functions for [Qwen Image](https://huggingface.co/docs/diffusers/main/en/api/pipelines/qwen)
|
||||
- [`LoraBaseMixin`] provides a base class with several utility methods to fuse, unfuse, unload, LoRAs and more.
|
||||
|
||||
<Tip>
|
||||
@@ -92,6 +94,10 @@ To learn more about how to load LoRA weights, see the [LoRA](../../using-diffuse
|
||||
|
||||
[[autodoc]] loaders.lora_pipeline.WanLoraLoaderMixin
|
||||
|
||||
## SkyReelsV2LoraLoaderMixin
|
||||
|
||||
[[autodoc]] loaders.lora_pipeline.SkyReelsV2LoraLoaderMixin
|
||||
|
||||
## AmusedLoraLoaderMixin
|
||||
|
||||
[[autodoc]] loaders.lora_pipeline.AmusedLoraLoaderMixin
|
||||
@@ -100,6 +106,10 @@ To learn more about how to load LoRA weights, see the [LoRA](../../using-diffuse
|
||||
|
||||
[[autodoc]] loaders.lora_pipeline.HiDreamImageLoraLoaderMixin
|
||||
|
||||
## WanLoraLoaderMixin
|
||||
## QwenImageLoraLoaderMixin
|
||||
|
||||
[[autodoc]] loaders.lora_pipeline.WanLoraLoaderMixin
|
||||
[[autodoc]] loaders.lora_pipeline.QwenImageLoraLoaderMixin
|
||||
|
||||
## LoraBaseMixin
|
||||
|
||||
[[autodoc]] loaders.lora_base.LoraBaseMixin
|
||||
35
docs/source/en/api/models/autoencoderkl_qwenimage.md
Normal file
@@ -0,0 +1,35 @@
|
||||
<!-- Copyright 2025 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License. -->
|
||||
|
||||
# AutoencoderKLQwenImage
|
||||
|
||||
The model can be loaded with the following code snippet.
|
||||
|
||||
```python
|
||||
from diffusers import AutoencoderKLQwenImage
|
||||
|
||||
vae = AutoencoderKLQwenImage.from_pretrained("Qwen/QwenImage-20B", subfolder="vae")
|
||||
```
|
||||
|
||||
## AutoencoderKLQwenImage
|
||||
|
||||
[[autodoc]] AutoencoderKLQwenImage
|
||||
- decode
|
||||
- encode
|
||||
- all
|
||||
|
||||
## AutoencoderKLOutput
|
||||
|
||||
[[autodoc]] models.autoencoders.autoencoder_kl.AutoencoderKLOutput
|
||||
|
||||
## DecoderOutput
|
||||
|
||||
[[autodoc]] models.autoencoders.vae.DecoderOutput
|
||||
28
docs/source/en/api/models/qwenimage_transformer2d.md
Normal file
@@ -0,0 +1,28 @@
|
||||
<!-- Copyright 2025 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License. -->
|
||||
|
||||
# QwenImageTransformer2DModel
|
||||
|
||||
The model can be loaded with the following code snippet.
|
||||
|
||||
```python
|
||||
import torch

from diffusers import QwenImageTransformer2DModel
|
||||
|
||||
transformer = QwenImageTransformer2DModel.from_pretrained("Qwen/QwenImage-20B", subfolder="transformer", torch_dtype=torch.bfloat16)
|
||||
```
|
||||
|
||||
## QwenImageTransformer2DModel
|
||||
|
||||
[[autodoc]] QwenImageTransformer2DModel
|
||||
|
||||
## Transformer2DModelOutput
|
||||
|
||||
[[autodoc]] models.modeling_outputs.Transformer2DModelOutput
|
||||
30
docs/source/en/api/models/skyreels_v2_transformer_3d.md
Normal file
@@ -0,0 +1,30 @@
|
||||
<!-- Copyright 2024 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License. -->
|
||||
|
||||
# SkyReelsV2Transformer3DModel
|
||||
|
||||
A Diffusion Transformer model for 3D video-like data was introduced in [SkyReels-V2](https://github.com/SkyworkAI/SkyReels-V2) by Skywork AI.
|
||||
|
||||
The model can be loaded with the following code snippet.
|
||||
|
||||
```python
|
||||
import torch

from diffusers import SkyReelsV2Transformer3DModel
|
||||
|
||||
transformer = SkyReelsV2Transformer3DModel.from_pretrained("Skywork/SkyReels-V2-DF-1.3B-540P-Diffusers", subfolder="transformer", torch_dtype=torch.bfloat16)
|
||||
```
|
||||
|
||||
## SkyReelsV2Transformer3DModel
|
||||
|
||||
[[autodoc]] SkyReelsV2Transformer3DModel
|
||||
|
||||
## Transformer2DModelOutput
|
||||
|
||||
[[autodoc]] models.modeling_outputs.Transformer2DModelOutput
|
||||
39
docs/source/en/api/modular_diffusers/guiders.md
Normal file
@@ -0,0 +1,39 @@
|
||||
# Guiders
|
||||
|
||||
Guiders are components in Modular Diffusers that control how the diffusion process is guided during generation. They implement various guidance techniques to improve generation quality and control.
|
||||
|
||||
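For orientation, a minimal sketch of instantiating one of the guiders documented below (the `guidance_scale` argument is an assumption based on standard classifier-free guidance, not taken from this page):

```python
from diffusers.guiders.classifier_free_guidance import ClassifierFreeGuidance

# Standard classifier-free guidance; larger values trade diversity for prompt adherence.
guider = ClassifierFreeGuidance(guidance_scale=7.0)
```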
## BaseGuidance
|
||||
|
||||
[[autodoc]] diffusers.guiders.guider_utils.BaseGuidance
|
||||
|
||||
## ClassifierFreeGuidance
|
||||
|
||||
[[autodoc]] diffusers.guiders.classifier_free_guidance.ClassifierFreeGuidance
|
||||
|
||||
## ClassifierFreeZeroStarGuidance
|
||||
|
||||
[[autodoc]] diffusers.guiders.classifier_free_zero_star_guidance.ClassifierFreeZeroStarGuidance
|
||||
|
||||
## SkipLayerGuidance
|
||||
|
||||
[[autodoc]] diffusers.guiders.skip_layer_guidance.SkipLayerGuidance
|
||||
|
||||
## SmoothedEnergyGuidance
|
||||
|
||||
[[autodoc]] diffusers.guiders.smoothed_energy_guidance.SmoothedEnergyGuidance
|
||||
|
||||
## PerturbedAttentionGuidance
|
||||
|
||||
[[autodoc]] diffusers.guiders.perturbed_attention_guidance.PerturbedAttentionGuidance
|
||||
|
||||
## AdaptiveProjectedGuidance
|
||||
|
||||
[[autodoc]] diffusers.guiders.adaptive_projected_guidance.AdaptiveProjectedGuidance
|
||||
|
||||
## AutoGuidance
|
||||
|
||||
[[autodoc]] diffusers.guiders.auto_guidance.AutoGuidance
|
||||
|
||||
## TangentialClassifierFreeGuidance
|
||||
|
||||
[[autodoc]] diffusers.guiders.tangential_classifier_free_guidance.TangentialClassifierFreeGuidance
|
||||
5
docs/source/en/api/modular_diffusers/pipeline.md
Normal file
@@ -0,0 +1,5 @@
|
||||
# Pipeline
|
||||
|
||||
## ModularPipeline
|
||||
|
||||
[[autodoc]] diffusers.modular_pipelines.modular_pipeline.ModularPipeline
|
||||
17
docs/source/en/api/modular_diffusers/pipeline_blocks.md
Normal file
@@ -0,0 +1,17 @@
|
||||
# Pipeline blocks
|
||||
|
||||
## ModularPipelineBlocks
|
||||
|
||||
[[autodoc]] diffusers.modular_pipelines.modular_pipeline.ModularPipelineBlocks
|
||||
|
||||
## SequentialPipelineBlocks
|
||||
|
||||
[[autodoc]] diffusers.modular_pipelines.modular_pipeline.SequentialPipelineBlocks
|
||||
|
||||
## LoopSequentialPipelineBlocks
|
||||
|
||||
[[autodoc]] diffusers.modular_pipelines.modular_pipeline.LoopSequentialPipelineBlocks
|
||||
|
||||
## AutoPipelineBlocks
|
||||
|
||||
[[autodoc]] diffusers.modular_pipelines.modular_pipeline.AutoPipelineBlocks
|
||||
17
docs/source/en/api/modular_diffusers/pipeline_components.md
Normal file
@@ -0,0 +1,17 @@
|
||||
# Components and configs
|
||||
|
||||
## ComponentSpec
|
||||
|
||||
[[autodoc]] diffusers.modular_pipelines.modular_pipeline.ComponentSpec
|
||||
|
||||
## ConfigSpec
|
||||
|
||||
[[autodoc]] diffusers.modular_pipelines.modular_pipeline.ConfigSpec
|
||||
|
||||
## ComponentsManager
|
||||
|
||||
[[autodoc]] diffusers.modular_pipelines.components_manager.ComponentsManager
|
||||
|
||||
## InsertableDict
|
||||
|
||||
[[autodoc]] diffusers.modular_pipelines.modular_pipeline_utils.InsertableDict
|
||||
9
docs/source/en/api/modular_diffusers/pipeline_states.md
Normal file
@@ -0,0 +1,9 @@
|
||||
# Pipeline states
|
||||
|
||||
## PipelineState
|
||||
|
||||
[[autodoc]] diffusers.modular_pipelines.modular_pipeline.PipelineState
|
||||
|
||||
## BlockState
|
||||
|
||||
[[autodoc]] diffusers.modular_pipelines.modular_pipeline.BlockState
|
||||
@@ -10,6 +10,9 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
> [!WARNING]
|
||||
> This pipeline is deprecated but it can still be used. However, we won't test the pipeline anymore and won't accept any changes to it. If you run into any issues, reinstall the last Diffusers version that supported this model.
|
||||
|
||||
# aMUSEd
|
||||
|
||||
aMUSEd was introduced in [aMUSEd: An Open MUSE Reproduction](https://huggingface.co/papers/2401.01808) by Suraj Patil, William Berman, Robin Rombach, and Patrick von Platen.
|
||||
|
||||
@@ -10,6 +10,9 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
> [!WARNING]
|
||||
> This pipeline is deprecated but it can still be used. However, we won't test the pipeline anymore and won't accept any changes to it. If you run into any issues, reinstall the last Diffusers version that supported this model.
|
||||
|
||||
# Attend-and-Excite
|
||||
|
||||
Attend-and-Excite for Stable Diffusion was proposed in [Attend-and-Excite: Attention-Based Semantic Guidance for Text-to-Image Diffusion Models](https://attendandexcite.github.io/Attend-and-Excite/) and provides textual attention control over image generation.
|
||||
|
||||
@@ -10,6 +10,9 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
> [!WARNING]
|
||||
> This pipeline is deprecated but it can still be used. However, we won't test the pipeline anymore and won't accept any changes to it. If you run into any issues, reinstall the last Diffusers version that supported this model.
|
||||
|
||||
# AudioLDM
|
||||
|
||||
AudioLDM was proposed in [AudioLDM: Text-to-Audio Generation with Latent Diffusion Models](https://huggingface.co/papers/2301.12503) by Haohe Liu et al. Inspired by [Stable Diffusion](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/overview), AudioLDM
|
||||
|
||||
@@ -10,6 +10,9 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
> [!WARNING]
|
||||
> This pipeline is deprecated but it can still be used. However, we won't test the pipeline anymore and won't accept any changes to it. If you run into any issues, reinstall the last Diffusers version that supported this model.
|
||||
|
||||
# BLIP-Diffusion
|
||||
|
||||
BLIP-Diffusion was proposed in [BLIP-Diffusion: Pre-trained Subject Representation for Controllable Text-to-Image Generation and Editing](https://huggingface.co/papers/2305.14720). It enables zero-shot subject-driven generation and control-guided zero-shot generation.
|
||||
|
||||
@@ -36,7 +36,7 @@ import torch
|
||||
from diffusers import ChromaPipeline
|
||||
|
||||
pipe = ChromaPipeline.from_pretrained("lodestones/Chroma", torch_dtype=torch.bfloat16)
|
||||
pipe.enabe_model_cpu_offload()
|
||||
pipe.enable_model_cpu_offload()
|
||||
|
||||
prompt = [
|
||||
"A high-fashion close-up portrait of a blonde woman in clear sunglasses. The image uses a bold teal and red color split for dramatic lighting. The background is a simple teal-green. The photo is sharp and well-composed, and is designed for viewing with anaglyph 3D glasses for optimal effect. It looks professionally done."
|
||||
|
||||
@@ -10,6 +10,9 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
> [!WARNING]
|
||||
> This pipeline is deprecated but it can still be used. However, we won't test the pipeline anymore and won't accept any changes to it. If you run into any issues, reinstall the last Diffusers version that supported this model.
|
||||
|
||||
# ControlNet-XS
|
||||
|
||||
<div class="flex flex-wrap space-x-1">
|
||||
|
||||
@@ -10,6 +10,9 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
> [!WARNING]
|
||||
> This pipeline is deprecated but it can still be used. However, we won't test the pipeline anymore and won't accept any changes to it. If you run into any issues, reinstall the last Diffusers version that supported this model.
|
||||
|
||||
# ControlNet-XS with Stable Diffusion XL
|
||||
|
||||
ControlNet-XS was introduced in [ControlNet-XS](https://vislearn.github.io/ControlNet-XS/) by Denis Zavadski and Carsten Rother. It is based on the observation that the control model in the [original ControlNet](https://huggingface.co/papers/2302.05543) can be made much smaller and still produce good results.
|
||||
|
||||
@@ -10,6 +10,9 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
> [!WARNING]
|
||||
> This pipeline is deprecated but it can still be used. However, we won't test the pipeline anymore and won't accept any changes to it. If you run into any issues, reinstall the last Diffusers version that supported this model.
|
||||
|
||||
# Dance Diffusion
|
||||
|
||||
[Dance Diffusion](https://github.com/Harmonai-org/sample-generator) is by Zach Evans.
|
||||
|
||||
@@ -10,6 +10,9 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
> [!WARNING]
|
||||
> This pipeline is deprecated but it can still be used. However, we won't test the pipeline anymore and won't accept any changes to it. If you run into any issues, reinstall the last Diffusers version that supported this model.
|
||||
|
||||
# DiffEdit
|
||||
|
||||
[DiffEdit: Diffusion-based semantic image editing with mask guidance](https://huggingface.co/papers/2210.11427) is by Guillaume Couairon, Jakob Verbeek, Holger Schwenk, and Matthieu Cord.
|
||||
|
||||
@@ -25,6 +25,8 @@ Original model checkpoints for Flux can be found [here](https://huggingface.co/b
|
||||
|
||||
Flux can be quite expensive to run on consumer hardware devices. However, you can perform a suite of optimizations to run it faster and in a more memory-friendly manner. Check out [this section](https://huggingface.co/blog/sd3#memory-optimizations-for-sd3) for more details. Additionally, Flux can benefit from quantization for memory efficiency with a trade-off in inference latency. Refer to [this blog post](https://huggingface.co/blog/quanto-diffusers) to learn more. For an exhaustive list of resources, check out [this gist](https://gist.github.com/sayakpaul/b664605caf0aa3bf8585ab109dd5ac9c).
|
||||
|
||||
[Caching](../../optimization/cache) may also speed up inference by storing and reusing intermediate outputs.
|
||||
|
||||
</Tip>
|
||||
|
||||
Flux comes in the following variants:
|
||||
|
||||
@@ -18,7 +18,7 @@
|
||||
|
||||
<Tip>
|
||||
|
||||
Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines.
|
||||
[Caching](../../optimization/cache) may also speed up inference by storing and reusing intermediate outputs.
|
||||
|
||||
</Tip>
|
||||
|
||||
|
||||
@@ -10,6 +10,9 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
> [!WARNING]
|
||||
> This pipeline is deprecated but it can still be used. However, we won't test the pipeline anymore and won't accept any changes to it. If you run into any issues, reinstall the last Diffusers version that supported this model.
|
||||
|
||||
# I2VGen-XL
|
||||
|
||||
[I2VGen-XL: High-Quality Image-to-Video Synthesis via Cascaded Diffusion Models](https://hf.co/papers/2311.04145.pdf) by Shiwei Zhang, Jiayu Wang, Yingya Zhang, Kang Zhao, Hangjie Yuan, Zhiwu Qin, Xiang Wang, Deli Zhao, and Jingren Zhou.
|
||||
|
||||
@@ -88,7 +88,7 @@ export_to_video(video, "output.mp4", fps=24)
|
||||
</hfoption>
|
||||
<hfoption id="inference speed">
|
||||
|
||||
[Compilation](../../optimization/fp16#torchcompile) is slow the first time but subsequent calls to the pipeline are faster.
|
||||
[Compilation](../../optimization/fp16#torchcompile) is slow the first time but subsequent calls to the pipeline are faster. [Caching](../../optimization/cache) may also speed up inference by storing and reusing intermediate outputs.
|
||||
|
||||
```py
|
||||
import torch
|
||||
|
||||
@@ -10,6 +10,9 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
> [!WARNING]
|
||||
> This pipeline is deprecated but it can still be used. However, we won't test the pipeline anymore and won't accept any changes to it. If you run into any issues, reinstall the last Diffusers version that supported this model.
|
||||
|
||||
# MusicLDM
|
||||
|
||||
MusicLDM was proposed in [MusicLDM: Enhancing Novelty in Text-to-Music Generation Using Beat-Synchronous Mixup Strategies](https://huggingface.co/papers/2308.01546) by Ke Chen, Yusong Wu, Haohe Liu, Marianna Nezhurina, Taylor Berg-Kirkpatrick, Shlomo Dubnov.
|
||||
|
||||
@@ -10,6 +10,9 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
> [!WARNING]
|
||||
> This pipeline is deprecated but it can still be used. However, we won't test the pipeline anymore and won't accept any changes to it. If you run into any issues, reinstall the last Diffusers version that supported this model.
|
||||
|
||||
# Paint by Example
|
||||
|
||||
[Paint by Example: Exemplar-based Image Editing with Diffusion Models](https://huggingface.co/papers/2211.13227) is by Binxin Yang, Shuyang Gu, Bo Zhang, Ting Zhang, Xuejin Chen, Xiaoyan Sun, Dong Chen, Fang Wen.
|
||||
|
||||
@@ -10,6 +10,9 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
> [!WARNING]
|
||||
> This pipeline is deprecated but it can still be used. However, we won't test the pipeline anymore and won't accept any changes to it. If you run into any issues, reinstall the last Diffusers version that supported this model.
|
||||
|
||||
# MultiDiffusion
|
||||
|
||||
<div class="flex flex-wrap space-x-1">
|
||||
|
||||
@@ -10,6 +10,9 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
> [!WARNING]
|
||||
> This pipeline is deprecated but it can still be used. However, we won't test the pipeline anymore and won't accept any changes to it. If you run into any issues, reinstall the last Diffusers version that supported this model.
|
||||
|
||||
# Image-to-Video Generation with PIA (Personalized Image Animator)
|
||||
|
||||
<div class="flex flex-wrap space-x-1">
|
||||
|
||||
125 docs/source/en/api/pipelines/qwenimage.md Normal file
@@ -0,0 +1,125 @@

<!-- Copyright 2025 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License. -->

# QwenImage

<div class="flex flex-wrap space-x-1">
  <img alt="LoRA" src="https://img.shields.io/badge/LoRA-d8b4fe?style=flat"/>
</div>

Qwen-Image from the Qwen team is an image generation foundation model in the Qwen series that achieves significant advances in complex text rendering and precise image editing. Experiments show strong general capabilities in both image generation and editing, with exceptional performance in text rendering, especially for Chinese.

Qwen-Image comes in the following variants:

| model type | model id |
|:----------:|:--------:|
| Qwen-Image | [`Qwen/Qwen-Image`](https://huggingface.co/Qwen/Qwen-Image) |
| Qwen-Image-Edit | [`Qwen/Qwen-Image-Edit`](https://huggingface.co/Qwen/Qwen-Image-Edit) |

<Tip>

[Caching](../../optimization/cache) may also speed up inference by storing and reusing intermediate outputs.

</Tip>

## LoRA for faster inference

Use a LoRA from `lightx2v/Qwen-Image-Lightning` to speed up inference by reducing the number of steps. Refer to the code snippet below:

<details>
<summary>Code</summary>

```py
from diffusers import DiffusionPipeline, FlowMatchEulerDiscreteScheduler
import torch
import math

ckpt_id = "Qwen/Qwen-Image"

# From
# https://github.com/ModelTC/Qwen-Image-Lightning/blob/342260e8f5468d2f24d084ce04f55e101007118b/generate_with_diffusers.py#L82C9-L97C10
scheduler_config = {
    "base_image_seq_len": 256,
    "base_shift": math.log(3),  # We use shift=3 in distillation
    "invert_sigmas": False,
    "max_image_seq_len": 8192,
    "max_shift": math.log(3),  # We use shift=3 in distillation
    "num_train_timesteps": 1000,
    "shift": 1.0,
    "shift_terminal": None,  # set shift_terminal to None
    "stochastic_sampling": False,
    "time_shift_type": "exponential",
    "use_beta_sigmas": False,
    "use_dynamic_shifting": True,
    "use_exponential_sigmas": False,
    "use_karras_sigmas": False,
}
scheduler = FlowMatchEulerDiscreteScheduler.from_config(scheduler_config)
pipe = DiffusionPipeline.from_pretrained(
    ckpt_id, scheduler=scheduler, torch_dtype=torch.bfloat16
).to("cuda")
pipe.load_lora_weights(
    "lightx2v/Qwen-Image-Lightning", weight_name="Qwen-Image-Lightning-8steps-V1.0.safetensors"
)

prompt = "a tiny astronaut hatching from an egg on the moon, Ultra HD, 4K, cinematic composition."
negative_prompt = " "
image = pipe(
    prompt=prompt,
    negative_prompt=negative_prompt,
    width=1024,
    height=1024,
    num_inference_steps=8,
    true_cfg_scale=1.0,
    generator=torch.manual_seed(0),
).images[0]
image.save("qwen_fewsteps.png")
```

</details>

<Tip>

The `guidance_scale` parameter in the pipeline exists to support future guidance-distilled models; passing `guidance_scale` to the current pipeline has no effect. To enable classifier-free guidance, pass a `true_cfg_scale` greater than 1 together with a `negative_prompt`; even an empty negative prompt like " " enables the classifier-free guidance computations.

</Tip>
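
A minimal sketch of enabling classifier-free guidance with this pipeline; the `true_cfg_scale` value below is illustrative, not prescriptive:

```py
import torch
from diffusers import DiffusionPipeline

pipe = DiffusionPipeline.from_pretrained("Qwen/Qwen-Image", torch_dtype=torch.bfloat16).to("cuda")

image = pipe(
    prompt="a tiny astronaut hatching from an egg on the moon",
    negative_prompt=" ",   # a negative prompt (even a blank one) switches on the CFG path
    true_cfg_scale=4.0,    # classifier-free guidance strength; values > 1 enable CFG
    # guidance_scale is ignored for this checkpoint; it is reserved for guidance-distilled models
).images[0]
image.save("qwen_cfg.png")
```
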

## QwenImagePipeline

[[autodoc]] QwenImagePipeline
  - all
  - __call__

## QwenImageImg2ImgPipeline

[[autodoc]] QwenImageImg2ImgPipeline
  - all
  - __call__

## QwenImageInpaintPipeline

[[autodoc]] QwenImageInpaintPipeline
  - all
  - __call__

## QwenImageEditPipeline

[[autodoc]] QwenImageEditPipeline
  - all
  - __call__

## QwenImagePipelineOutput

[[autodoc]] pipelines.qwenimage.pipeline_output.QwenImagePipelineOutput

@@ -10,6 +10,9 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
> [!WARNING]
|
||||
> This pipeline is deprecated but it can still be used. However, we won't test the pipeline anymore and won't accept any changes to it. If you run into any issues, reinstall the last Diffusers version that supported this model.
|
||||
|
||||
# Self-Attention Guidance
|
||||
|
||||
[Improving Sample Quality of Diffusion Models Using Self-Attention Guidance](https://huggingface.co/papers/2210.00939) is by Susung Hong et al.
|
||||
|
||||
@@ -10,6 +10,9 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
> [!WARNING]
|
||||
> This pipeline is deprecated but it can still be used. However, we won't test the pipeline anymore and won't accept any changes to it. If you run into any issues, reinstall the last Diffusers version that supported this model.
|
||||
|
||||
# Semantic Guidance
|
||||
|
||||
Semantic Guidance for Diffusion Models was proposed in [SEGA: Instructing Text-to-Image Models using Semantic Guidance](https://huggingface.co/papers/2301.12247) and provides strong semantic control over image generation.
|
||||
|
||||
367 docs/source/en/api/pipelines/skyreels_v2.md Normal file
@@ -0,0 +1,367 @@
|
||||
<!-- Copyright 2024 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License. -->
|
||||
|
||||
<div style="float: right;">
|
||||
<div class="flex flex-wrap space-x-1">
|
||||
<a href="https://huggingface.co/docs/diffusers/main/en/tutorials/using_peft_for_inference" target="_blank" rel="noopener">
|
||||
<img alt="LoRA" src="https://img.shields.io/badge/LoRA-d8b4fe?style=flat"/>
|
||||
</a>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
# SkyReels-V2: Infinite-length Film Generative model
|
||||
|
||||
[SkyReels-V2](https://huggingface.co/papers/2504.13074) by the SkyReels Team.
|
||||
|
||||
*Recent advances in video generation have been driven by diffusion models and autoregressive frameworks, yet critical challenges persist in harmonizing prompt adherence, visual quality, motion dynamics, and duration: compromises in motion dynamics to enhance temporal visual quality, constrained video duration (5-10 seconds) to prioritize resolution, and inadequate shot-aware generation stemming from general-purpose MLLMs' inability to interpret cinematic grammar, such as shot composition, actor expressions, and camera motions. These intertwined limitations hinder realistic long-form synthesis and professional film-style generation. To address these limitations, we propose SkyReels-V2, an Infinite-length Film Generative Model, that synergizes Multi-modal Large Language Model (MLLM), Multi-stage Pretraining, Reinforcement Learning, and Diffusion Forcing Framework. Firstly, we design a comprehensive structural representation of video that combines the general descriptions by the Multi-modal LLM and the detailed shot language by sub-expert models. Aided with human annotation, we then train a unified Video Captioner, named SkyCaptioner-V1, to efficiently label the video data. Secondly, we establish progressive-resolution pretraining for the fundamental video generation, followed by a four-stage post-training enhancement: Initial concept-balanced Supervised Fine-Tuning (SFT) improves baseline quality; Motion-specific Reinforcement Learning (RL) training with human-annotated and synthetic distortion data addresses dynamic artifacts; Our diffusion forcing framework with non-decreasing noise schedules enables long-video synthesis in an efficient search space; Final high-quality SFT refines visual fidelity. All the code and models are available at [this https URL](https://github.com/SkyworkAI/SkyReels-V2).*
|
||||
|
||||
You can find all the original SkyReels-V2 checkpoints under the [Skywork](https://huggingface.co/collections/Skywork/skyreels-v2-6801b1b93df627d441d0d0d9) organization.
|
||||
|
||||
The following SkyReels-V2 models are supported in Diffusers:
|
||||
- [SkyReels-V2 DF 1.3B - 540P](https://huggingface.co/Skywork/SkyReels-V2-DF-1.3B-540P-Diffusers)
|
||||
- [SkyReels-V2 DF 14B - 540P](https://huggingface.co/Skywork/SkyReels-V2-DF-14B-540P-Diffusers)
|
||||
- [SkyReels-V2 DF 14B - 720P](https://huggingface.co/Skywork/SkyReels-V2-DF-14B-720P-Diffusers)
|
||||
- [SkyReels-V2 T2V 14B - 540P](https://huggingface.co/Skywork/SkyReels-V2-T2V-14B-540P-Diffusers)
|
||||
- [SkyReels-V2 T2V 14B - 720P](https://huggingface.co/Skywork/SkyReels-V2-T2V-14B-720P-Diffusers)
|
||||
- [SkyReels-V2 I2V 1.3B - 540P](https://huggingface.co/Skywork/SkyReels-V2-I2V-1.3B-540P-Diffusers)
|
||||
- [SkyReels-V2 I2V 14B - 540P](https://huggingface.co/Skywork/SkyReels-V2-I2V-14B-540P-Diffusers)
|
||||
- [SkyReels-V2 I2V 14B - 720P](https://huggingface.co/Skywork/SkyReels-V2-I2V-14B-720P-Diffusers)
|
||||
- [SkyReels-V2 FLF2V 1.3B - 540P](https://huggingface.co/Skywork/SkyReels-V2-FLF2V-1.3B-540P-Diffusers)
|
||||
|
||||
> [!TIP]
|
||||
> Click on the SkyReels-V2 models in the right sidebar for more examples of video generation.
|
||||
|
||||
### A _Visual_ Demonstration
|
||||
|
||||
An example with these parameters:
|
||||
base_num_frames=97, num_frames=97, num_inference_steps=30, ar_step=5, causal_block_size=5
|
||||
|
||||
vae_scale_factor_temporal -> 4
|
||||
num_latent_frames: (97-1)//vae_scale_factor_temporal+1 = 25 frames -> 5 blocks of 5 frames each
|
||||
|
||||
base_num_latent_frames = (97-1)//vae_scale_factor_temporal+1 = 25 → blocks = 25//5 = 5 blocks
|
||||
These 5 blocks mean the maximum context length of the model is 25 frames in the latent space.
|
||||
|
||||
Asynchronous Processing Timeline:
|
||||
┌─────────────────────────────────────────────────────────────────┐
|
||||
│ Steps: 1 6 11 16 21 26 31 36 41 46 50 │
|
||||
│ Block 1: [■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■] │
|
||||
│ Block 2: [■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■] │
|
||||
│ Block 3: [■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■] │
|
||||
│ Block 4: [■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■] │
|
||||
│ Block 5: [■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■] │
|
||||
└─────────────────────────────────────────────────────────────────┘
|
||||
|
||||
For Long Videos (num_frames > base_num_frames):
|
||||
base_num_frames acts as the "sliding window size" for processing long videos.
|
||||
|
||||
Example: 257-frame video with base_num_frames=97, overlap_history=17
|
||||
┌──── Iteration 1 (frames 1-97) ────┐
|
||||
│ Processing window: 97 frames │ → 5 blocks, async processing
|
||||
│ Generates: frames 1-97 │
|
||||
└───────────────────────────────────┘
|
||||
┌────── Iteration 2 (frames 81-177) ──────┐
|
||||
│ Processing window: 97 frames │
|
||||
│ Overlap: 17 frames (81-97) from prev │ → 5 blocks, async processing
|
||||
│ Generates: frames 98-177 │
|
||||
└─────────────────────────────────────────┘
|
||||
┌────── Iteration 3 (frames 161-257) ──────┐
|
||||
│ Processing window: 97 frames │
|
||||
│ Overlap: 17 frames (161-177) from prev │ → 5 blocks, async processing
|
||||
│ Generates: frames 178-257 │
|
||||
└──────────────────────────────────────────┘
|
||||
|
||||
Each iteration independently runs the asynchronous processing with its own 5 blocks.
|
||||
base_num_frames controls:
|
||||
1. Memory usage (larger window = more VRAM)
|
||||
2. Model context length (must match training constraints)
|
||||
3. Number of blocks per iteration (base_num_latent_frames // causal_block_size)
|
||||
|
||||
Each block takes 30 steps to complete denoising.
|
||||
Block N starts at step: 1 + (N-1) x ar_step
|
||||
Total steps: 30 + (5-1) x 5 = 50 steps
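
The same numbers can be reproduced with a few lines of plain Python; this is just arithmetic on the example parameters above, not a diffusers API call:

```py
# Arithmetic sketch of the asynchronous ("diffusion forcing") schedule described above.
num_frames = 97
vae_scale_factor_temporal = 4
num_inference_steps = 30
ar_step = 5
causal_block_size = 5

num_latent_frames = (num_frames - 1) // vae_scale_factor_temporal + 1   # 25
num_blocks = num_latent_frames // causal_block_size                      # 5

# Block i (1-indexed) starts denoising at this global step:
block_start_steps = [1 + (i - 1) * ar_step for i in range(1, num_blocks + 1)]  # [1, 6, 11, 16, 21]

# Every block needs num_inference_steps denoising steps, so the last block finishes at:
total_steps = num_inference_steps + (num_blocks - 1) * ar_step            # 50
print(num_latent_frames, num_blocks, block_start_steps, total_steps)
```
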
|
||||
Synchronous mode (ar_step=0) would process all blocks/frames simultaneously:
|
||||
┌──────────────────────────────────────────────┐
|
||||
│ Steps: 1 ... 30 │
|
||||
│ All blocks: [■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■] │
|
||||
└──────────────────────────────────────────────┘
|
||||
Total steps: 30 steps
|
||||
|
||||
|
||||
An example on how the step matrix is constructed for asynchronous processing:
|
||||
Given the parameters: (num_inference_steps=30, flow_shift=8, num_frames=97, ar_step=5, causal_block_size=5)
|
||||
- num_latent_frames = (97 frames - 1) // (4 temporal downsampling) + 1 = 25
|
||||
- step_template = [999, 995, 991, 986, 980, 975, 969, 963, 956, 948,
|
||||
941, 932, 922, 912, 901, 888, 874, 859, 841, 822,
|
||||
799, 773, 743, 708, 666, 615, 551, 470, 363, 216]
|
||||
|
||||
The algorithm creates a 50x25 step_matrix where:
|
||||
- Row 1: [999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999]
|
||||
- Row 2: [995, 995, 995, 995, 995, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999]
|
||||
- Row 3: [991, 991, 991, 991, 991, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999]
|
||||
- ...
|
||||
- Row 7: [969, 969, 969, 969, 969, 995, 995, 995, 995, 995, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999]
|
||||
- ...
|
||||
- Row 21: [799, 799, 799, 799, 799, 888, 888, 888, 888, 888, 941, 941, 941, 941, 941, 975, 975, 975, 975, 975, 999, 999, 999, 999, 999]
|
||||
- ...
|
||||
- Row 35: [ 0, 0, 0, 0, 0, 216, 216, 216, 216, 216, 666, 666, 666, 666, 666, 822, 822, 822, 822, 822, 901, 901, 901, 901, 901]
|
||||
- ...
|
||||
- Row 42: [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 551, 551, 551, 551, 551, 773, 773, 773, 773, 773]
|
||||
- ...
|
||||
- Row 50: [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 216, 216, 216, 216, 216]
|
||||
|
||||
Detailed Row 6 Analysis:
|
||||
- step_matrix[5]: [ 975, 975, 975, 975, 975, 999, 999, 999, 999, 999, 999, ..., 999]
|
||||
- step_index[5]: [ 6, 6, 6, 6, 6, 1, 1, 1, 1, 1, 0, ..., 0]
|
||||
- step_update_mask[5]: [True,True,True,True,True,True,True,True,True,True,False, ...,False]
|
||||
- valid_interval[5]: (0, 25)
|
||||
|
||||
Key Pattern: Block i lags behind Block i-1 by exactly ar_step=5 timesteps, creating the
|
||||
staggered "diffusion forcing" effect where later blocks condition on cleaner earlier blocks.
|
||||
|
||||
### Text-to-Video Generation
|
||||
|
||||
The example below demonstrates how to generate a video from text.
|
||||
|
||||
<hfoptions id="T2V usage">
|
||||
<hfoption id="T2V memory">
|
||||
|
||||
Refer to the [Reduce memory usage](../../optimization/memory) guide for more details about the various memory saving techniques.
|
||||
|
||||
From the original repo:
|
||||
>You can use --ar_step 5 to enable asynchronous inference. When asynchronous inference, --causal_block_size 5 is recommended while it is not supposed to be set for synchronous generation... Asynchronous inference will take more steps to diffuse the whole sequence which means it will be SLOWER than synchronous mode. In our experiments, asynchronous inference may improve the instruction following and visual consistent performance.
|
||||
|
||||
```py
|
||||
# pip install ftfy
|
||||
import torch
|
||||
from diffusers import AutoModel, SkyReelsV2DiffusionForcingPipeline, UniPCMultistepScheduler
|
||||
from diffusers.utils import export_to_video
|
||||
|
||||
vae = AutoModel.from_pretrained("Skywork/SkyReels-V2-DF-14B-540P-Diffusers", subfolder="vae", torch_dtype=torch.float32)
|
||||
transformer = AutoModel.from_pretrained("Skywork/SkyReels-V2-DF-14B-540P-Diffusers", subfolder="transformer", torch_dtype=torch.bfloat16)
|
||||
|
||||
pipeline = SkyReelsV2DiffusionForcingPipeline.from_pretrained(
|
||||
"Skywork/SkyReels-V2-DF-14B-540P-Diffusers",
|
||||
vae=vae,
|
||||
transformer=transformer,
|
||||
torch_dtype=torch.bfloat16
|
||||
)
|
||||
flow_shift = 8.0 # 8.0 for T2V, 5.0 for I2V
|
||||
pipeline.scheduler = UniPCMultistepScheduler.from_config(pipeline.scheduler.config, flow_shift=flow_shift)
|
||||
pipeline = pipeline.to("cuda")
|
||||
|
||||
prompt = "A cat and a dog baking a cake together in a kitchen. The cat is carefully measuring flour, while the dog is stirring the batter with a wooden spoon. The kitchen is cozy, with sunlight streaming through the window."
|
||||
|
||||
output = pipeline(
|
||||
prompt=prompt,
|
||||
num_inference_steps=30,
|
||||
height=544, # 720 for 720P
|
||||
width=960, # 1280 for 720P
|
||||
num_frames=97,
|
||||
base_num_frames=97, # 121 for 720P
|
||||
ar_step=5, # Controls asynchronous inference (0 for synchronous mode)
|
||||
causal_block_size=5, # Number of frames in each block for asynchronous processing
|
||||
overlap_history=None, # Number of frames to overlap for smooth transitions in long videos; 17 for long video generations
|
||||
addnoise_condition=20, # Improves consistency in long video generation
|
||||
).frames[0]
|
||||
export_to_video(output, "T2V.mp4", fps=24, quality=8)
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
### First-Last-Frame-to-Video Generation
|
||||
|
||||
The example below demonstrates how to use the image-to-video pipeline to generate a video using a text description, a starting frame, and an ending frame.
|
||||
|
||||
<hfoptions id="FLF2V usage">
|
||||
<hfoption id="usage">
|
||||
|
||||
```python
|
||||
import numpy as np
|
||||
import torch
|
||||
import torchvision.transforms.functional as TF
|
||||
from diffusers import AutoencoderKLWan, SkyReelsV2DiffusionForcingImageToVideoPipeline, UniPCMultistepScheduler
|
||||
from diffusers.utils import export_to_video, load_image
|
||||
|
||||
|
||||
model_id = "Skywork/SkyReels-V2-DF-14B-720P-Diffusers"
|
||||
vae = AutoencoderKLWan.from_pretrained(model_id, subfolder="vae", torch_dtype=torch.float32)
|
||||
pipeline = SkyReelsV2DiffusionForcingImageToVideoPipeline.from_pretrained(
|
||||
model_id, vae=vae, torch_dtype=torch.bfloat16
|
||||
)
|
||||
flow_shift = 5.0 # 8.0 for T2V, 5.0 for I2V
|
||||
pipeline.scheduler = UniPCMultistepScheduler.from_config(pipeline.scheduler.config, flow_shift=flow_shift)
|
||||
pipeline.to("cuda")
|
||||
|
||||
first_frame = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/flf2v_input_first_frame.png")
|
||||
last_frame = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/flf2v_input_last_frame.png")
|
||||
|
||||
def aspect_ratio_resize(image, pipeline, max_area=720 * 1280):
|
||||
aspect_ratio = image.height / image.width
|
||||
mod_value = pipeline.vae_scale_factor_spatial * pipeline.transformer.config.patch_size[1]
|
||||
height = round(np.sqrt(max_area * aspect_ratio)) // mod_value * mod_value
|
||||
width = round(np.sqrt(max_area / aspect_ratio)) // mod_value * mod_value
|
||||
image = image.resize((width, height))
|
||||
return image, height, width
|
||||
|
||||
def center_crop_resize(image, height, width):
|
||||
# Calculate resize ratio to match first frame dimensions
|
||||
resize_ratio = max(width / image.width, height / image.height)
|
||||
|
||||
# Resize the image
|
||||
width = round(image.width * resize_ratio)
|
||||
height = round(image.height * resize_ratio)
|
||||
size = [width, height]
|
||||
image = TF.center_crop(image, size)
|
||||
|
||||
return image, height, width
|
||||
|
||||
first_frame, height, width = aspect_ratio_resize(first_frame, pipeline)
|
||||
if last_frame.size != first_frame.size:
|
||||
last_frame, _, _ = center_crop_resize(last_frame, height, width)
|
||||
|
||||
prompt = "CG animation style, a small blue bird takes off from the ground, flapping its wings. The bird's feathers are delicate, with a unique pattern on its chest. The background shows a blue sky with white clouds under bright sunshine. The camera follows the bird upward, capturing its flight and the vastness of the sky from a close-up, low-angle perspective."
|
||||
|
||||
output = pipeline(
|
||||
image=first_frame, last_image=last_frame, prompt=prompt, height=height, width=width, guidance_scale=5.0
|
||||
).frames[0]
|
||||
export_to_video(output, "output.mp4", fps=24, quality=8)
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
|
||||
### Video-to-Video Generation
|
||||
|
||||
<hfoptions id="V2V usage">
|
||||
<hfoption id="usage">
|
||||
|
||||
`SkyReelsV2DiffusionForcingVideoToVideoPipeline` extends a given video.
|
||||
|
||||
```python
|
||||
import numpy as np
|
||||
import torch
|
||||
import torchvision.transforms.functional as TF
|
||||
from diffusers import AutoencoderKLWan, SkyReelsV2DiffusionForcingVideoToVideoPipeline, UniPCMultistepScheduler
|
||||
from diffusers.utils import export_to_video, load_video
|
||||
|
||||
|
||||
model_id = "Skywork/SkyReels-V2-DF-14B-540P-Diffusers"
|
||||
vae = AutoencoderKLWan.from_pretrained(model_id, subfolder="vae", torch_dtype=torch.float32)
|
||||
pipeline = SkyReelsV2DiffusionForcingVideoToVideoPipeline.from_pretrained(
|
||||
model_id, vae=vae, torch_dtype=torch.bfloat16
|
||||
)
|
||||
flow_shift = 5.0 # 8.0 for T2V, 5.0 for I2V
|
||||
pipeline.scheduler = UniPCMultistepScheduler.from_config(pipeline.scheduler.config, flow_shift=flow_shift)
|
||||
pipeline.to("cuda")
|
||||
|
||||
video = load_video("input_video.mp4")
|
||||
|
||||
prompt = "CG animation style, a small blue bird takes off from the ground, flapping its wings. The bird's feathers are delicate, with a unique pattern on its chest. The background shows a blue sky with white clouds under bright sunshine. The camera follows the bird upward, capturing its flight and the vastness of the sky from a close-up, low-angle perspective."
|
||||
|
||||
output = pipeline(
|
||||
video=video, prompt=prompt, height=544, width=960, guidance_scale=5.0,
|
||||
num_inference_steps=30, num_frames=257, base_num_frames=97#, ar_step=5, causal_block_size=5,
|
||||
).frames[0]
|
||||
export_to_video(output, "output.mp4", fps=24, quality=8)
|
||||
# Total frames will be the number of frames of given video + 257
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
|
||||
## Notes
|
||||
|
||||
- SkyReels-V2 supports LoRAs with [`~loaders.SkyReelsV2LoraLoaderMixin.load_lora_weights`].
|
||||
|
||||
<details>
|
||||
<summary>Show example code</summary>
|
||||
|
||||
```py
|
||||
# pip install ftfy
|
||||
import torch
|
||||
from diffusers import AutoModel, SkyReelsV2DiffusionForcingPipeline
|
||||
from diffusers.utils import export_to_video
|
||||
|
||||
vae = AutoModel.from_pretrained(
|
||||
"Skywork/SkyReels-V2-DF-1.3B-540P-Diffusers", subfolder="vae", torch_dtype=torch.float32
|
||||
)
|
||||
pipeline = SkyReelsV2DiffusionForcingPipeline.from_pretrained(
|
||||
"Skywork/SkyReels-V2-DF-1.3B-540P-Diffusers", vae=vae, torch_dtype=torch.bfloat16
|
||||
)
|
||||
pipeline.to("cuda")
|
||||
|
||||
pipeline.load_lora_weights("benjamin-paine/steamboat-willie-1.3b", adapter_name="steamboat-willie")
|
||||
pipeline.set_adapters("steamboat-willie")
|
||||
|
||||
pipeline.enable_model_cpu_offload()
|
||||
|
||||
# use "steamboat willie style" to trigger the LoRA
|
||||
prompt = """
|
||||
steamboat willie style, golden era animation, The camera rushes from far to near in a low-angle shot,
|
||||
revealing a white ferret on a log. It plays, leaps into the water, and emerges, as the camera zooms in
|
||||
for a close-up. Water splashes berry bushes nearby, while moss, snow, and leaves blanket the ground.
|
||||
Birch trees and a light blue sky frame the scene, with ferns in the foreground. Side lighting casts dynamic
|
||||
shadows and warm highlights. Medium composition, front view, low angle, with depth of field.
|
||||
"""
|
||||
|
||||
output = pipeline(
|
||||
prompt=prompt,
|
||||
num_frames=97,
|
||||
guidance_scale=6.0,
|
||||
).frames[0]
|
||||
export_to_video(output, "output.mp4", fps=24)
|
||||
```
|
||||
|
||||
</details>
|
||||
|
||||
|
||||
## SkyReelsV2DiffusionForcingPipeline
|
||||
|
||||
[[autodoc]] SkyReelsV2DiffusionForcingPipeline
|
||||
- all
|
||||
- __call__
|
||||
|
||||
## SkyReelsV2DiffusionForcingImageToVideoPipeline
|
||||
|
||||
[[autodoc]] SkyReelsV2DiffusionForcingImageToVideoPipeline
|
||||
- all
|
||||
- __call__
|
||||
|
||||
## SkyReelsV2DiffusionForcingVideoToVideoPipeline
|
||||
|
||||
[[autodoc]] SkyReelsV2DiffusionForcingVideoToVideoPipeline
|
||||
- all
|
||||
- __call__
|
||||
|
||||
## SkyReelsV2Pipeline
|
||||
|
||||
[[autodoc]] SkyReelsV2Pipeline
|
||||
- all
|
||||
- __call__
|
||||
|
||||
## SkyReelsV2ImageToVideoPipeline
|
||||
|
||||
[[autodoc]] SkyReelsV2ImageToVideoPipeline
|
||||
- all
|
||||
- __call__
|
||||
|
||||
## SkyReelsV2PipelineOutput
|
||||
|
||||
[[autodoc]] pipelines.skyreels_v2.pipeline_output.SkyReelsV2PipelineOutput
|
||||
@@ -10,6 +10,9 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
> [!WARNING]
|
||||
> This pipeline is deprecated but it can still be used. However, we won't test the pipeline anymore and won't accept any changes to it. If you run into any issues, reinstall the last Diffusers version that supported this model.
|
||||
|
||||
# GLIGEN (Grounded Language-to-Image Generation)
|
||||
|
||||
The GLIGEN model was created by researchers and engineers from [University of Wisconsin-Madison, Columbia University, and Microsoft](https://github.com/gligen/GLIGEN). The [`StableDiffusionGLIGENPipeline`] and [`StableDiffusionGLIGENTextImagePipeline`] can generate photorealistic images conditioned on grounding inputs. Along with text and bounding boxes with [`StableDiffusionGLIGENPipeline`], if input images are given, [`StableDiffusionGLIGENTextImagePipeline`] can insert objects described by text at the region defined by bounding boxes. Otherwise, it'll generate an image described by the caption/prompt and insert objects described by text at the region defined by bounding boxes. It's trained on COCO2014D and COCO2014CD datasets, and the model uses a frozen CLIP ViT-L/14 text encoder to condition itself on grounding inputs.
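
A rough sketch of grounded generation is shown below; the checkpoint, phrases, and box coordinates are illustrative, and the `gligen_*` argument names follow the pipeline's documented call signature:

```py
import torch
from diffusers import StableDiffusionGLIGENPipeline

pipe = StableDiffusionGLIGENPipeline.from_pretrained(
    "masterful/gligen-1-4-generation-text-box", torch_dtype=torch.float16
).to("cuda")

# Place each grounded phrase inside its bounding box (xyxy, normalized to [0, 1]).
image = pipe(
    prompt="a waterfall and a modern high speed train in a beautiful forest with fall foliage",
    gligen_phrases=["a waterfall", "a modern high speed train"],
    gligen_boxes=[[0.1387, 0.2051, 0.4277, 0.7090], [0.4980, 0.4355, 0.8516, 0.7266]],
    gligen_scheduled_sampling_beta=1,
    num_inference_steps=50,
).images[0]
image.save("gligen.png")
```
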
|
||||
|
||||
@@ -10,6 +10,9 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
> [!WARNING]
|
||||
> This pipeline is deprecated but it can still be used. However, we won't test the pipeline anymore and won't accept any changes to it. If you run into any issues, reinstall the last Diffusers version that supported this model.
|
||||
|
||||
# K-Diffusion
|
||||
|
||||
[k-diffusion](https://github.com/crowsonkb/k-diffusion) is a popular library created by [Katherine Crowson](https://github.com/crowsonkb/). We provide `StableDiffusionKDiffusionPipeline` and `StableDiffusionXLKDiffusionPipeline` that allow you to run Stable Diffusion with samplers from k-diffusion.
|
||||
|
||||
@@ -10,6 +10,9 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
> [!WARNING]
|
||||
> This pipeline is deprecated but it can still be used. However, we won't test the pipeline anymore and won't accept any changes to it. If you run into any issues, reinstall the last Diffusers version that supported this model.
|
||||
|
||||
# Text-to-(RGB, depth)
|
||||
|
||||
<div class="flex flex-wrap space-x-1">
|
||||
|
||||
@@ -31,7 +31,7 @@ _As the model is gated, before using it with diffusers you first need to go to t
|
||||
Use the command below to log in:
|
||||
|
||||
```bash
|
||||
huggingface-cli login
|
||||
hf auth login
|
||||
```
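
If you prefer to authenticate from Python instead of the CLI, `huggingface_hub` provides a `login` helper; the token shown in the comment is a placeholder:

```py
from huggingface_hub import login

# Prompts for a token interactively; alternatively pass login(token="hf_...") with your own token.
login()
```
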
|
||||
|
||||
<Tip>
|
||||
|
||||
@@ -10,6 +10,9 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
> [!WARNING]
|
||||
> This pipeline is deprecated but it can still be used. However, we won't test the pipeline anymore and won't accept any changes to it. If you run into any issues, reinstall the last Diffusers version that supported this model.
|
||||
|
||||
# Safe Stable Diffusion
|
||||
|
||||
Safe Stable Diffusion was proposed in [Safe Latent Diffusion: Mitigating Inappropriate Degeneration in Diffusion Models](https://huggingface.co/papers/2211.05105) and mitigates inappropriate degeneration from Stable Diffusion models because they're trained on unfiltered web-crawled datasets. For instance Stable Diffusion may unexpectedly generate nudity, violence, images depicting self-harm, and otherwise offensive content. Safe Stable Diffusion is an extension of Stable Diffusion that drastically reduces this type of content.
|
||||
|
||||
@@ -10,11 +10,8 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
<Tip warning={true}>
|
||||
|
||||
🧪 This pipeline is for research purposes only.
|
||||
|
||||
</Tip>
|
||||
> [!WARNING]
|
||||
> This pipeline is deprecated but it can still be used. However, we won't test the pipeline anymore and won't accept any changes to it. If you run into any issues, reinstall the last Diffusers version that supported this model.
|
||||
|
||||
# Text-to-video
|
||||
|
||||
|
||||
@@ -10,6 +10,9 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
> [!WARNING]
|
||||
> This pipeline is deprecated but it can still be used. However, we won't test the pipeline anymore and won't accept any changes to it. If you run into any issues, reinstall the last Diffusers version that supported this model.
|
||||
|
||||
# Text2Video-Zero
|
||||
|
||||
<div class="flex flex-wrap space-x-1">
|
||||
|
||||
@@ -7,6 +7,9 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
> [!WARNING]
|
||||
> This pipeline is deprecated but it can still be used. However, we won't test the pipeline anymore and won't accept any changes to it. If you run into any issues, reinstall the last Diffusers version that supported this model.
|
||||
|
||||
# unCLIP
|
||||
|
||||
[Hierarchical Text-Conditional Image Generation with CLIP Latents](https://huggingface.co/papers/2204.06125) is by Aditya Ramesh, Prafulla Dhariwal, Alex Nichol, Casey Chu, Mark Chen. The unCLIP model in 🤗 Diffusers comes from kakaobrain's [karlo](https://github.com/kakaobrain/karlo).
|
||||
|
||||
@@ -10,6 +10,9 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
> [!WARNING]
|
||||
> This pipeline is deprecated but it can still be used. However, we won't test the pipeline anymore and won't accept any changes to it. If you run into any issues, reinstall the last Diffusers version that supported this model.
|
||||
|
||||
# UniDiffuser
|
||||
|
||||
<div class="flex flex-wrap space-x-1">
|
||||
|
||||
@@ -29,6 +29,7 @@
|
||||
You can find all the original Wan2.1 checkpoints under the [Wan-AI](https://huggingface.co/Wan-AI) organization.
|
||||
|
||||
The following Wan models are supported in Diffusers:
|
||||
|
||||
- [Wan 2.1 T2V 1.3B](https://huggingface.co/Wan-AI/Wan2.1-T2V-1.3B-Diffusers)
|
||||
- [Wan 2.1 T2V 14B](https://huggingface.co/Wan-AI/Wan2.1-T2V-14B-Diffusers)
|
||||
- [Wan 2.1 I2V 14B - 480P](https://huggingface.co/Wan-AI/Wan2.1-I2V-14B-480P-Diffusers)
|
||||
@@ -36,6 +37,9 @@ The following Wan models are supported in Diffusers:
|
||||
- [Wan 2.1 FLF2V 14B - 720P](https://huggingface.co/Wan-AI/Wan2.1-FLF2V-14B-720P-diffusers)
|
||||
- [Wan 2.1 VACE 1.3B](https://huggingface.co/Wan-AI/Wan2.1-VACE-1.3B-diffusers)
|
||||
- [Wan 2.1 VACE 14B](https://huggingface.co/Wan-AI/Wan2.1-VACE-14B-diffusers)
|
||||
- [Wan 2.2 T2V 14B](https://huggingface.co/Wan-AI/Wan2.2-T2V-A14B-Diffusers)
|
||||
- [Wan 2.2 I2V 14B](https://huggingface.co/Wan-AI/Wan2.2-I2V-A14B-Diffusers)
|
||||
- [Wan 2.2 TI2V 5B](https://huggingface.co/Wan-AI/Wan2.2-TI2V-5B-Diffusers)
|
||||
|
||||
> [!TIP]
|
||||
> Click on the Wan2.1 models in the right sidebar for more examples of video generation.
|
||||
@@ -115,7 +119,7 @@ export_to_video(output, "output.mp4", fps=16)
|
||||
</hfoption>
|
||||
<hfoption id="T2V inference speed">
|
||||
|
||||
[Compilation](../../optimization/fp16#torchcompile) is slow the first time but subsequent calls to the pipeline are faster.
|
||||
[Compilation](../../optimization/fp16#torchcompile) is slow the first time but subsequent calls to the pipeline are faster. [Caching](../../optimization/cache) may also speed up inference by storing and reusing intermediate outputs.
|
||||
|
||||
```py
|
||||
# pip install ftfy
|
||||
@@ -327,6 +331,10 @@ The general rule of thumb to keep in mind when preparing inputs for the VACE pip
|
||||
|
||||
- Try lower `shift` values (`2.0` to `5.0`) for lower resolution videos and higher `shift` values (`7.0` to `12.0`) for higher resolution videos.

- Wan 2.1 and 2.2 support using [LightX2V LoRAs](https://huggingface.co/Kijai/WanVideo_comfy/tree/main/Lightx2v) to speed up inference. Using them on Wan 2.2 is slightly more involved. Refer to [this code snippet](https://github.com/huggingface/diffusers/pull/12040#issuecomment-3144185272) to learn more.

- Wan 2.2 has two denoisers. By default, LoRAs are only loaded into the first denoiser. One can set `load_into_transformer_2=True` to load LoRAs into the second denoiser. Refer to [this](https://github.com/huggingface/diffusers/pull/12074#issue-3292620048) example and [this](https://github.com/huggingface/diffusers/pull/12074#issuecomment-3155896144) one to learn more.
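
A rough sketch of the last note, assuming the LoRA repository id is a placeholder and using the `load_into_transformer_2` flag described above:

```py
import torch
from diffusers import WanPipeline

pipe = WanPipeline.from_pretrained("Wan-AI/Wan2.2-T2V-A14B-Diffusers", torch_dtype=torch.bfloat16)

# By default the LoRA only lands in the first denoiser ...
pipe.load_lora_weights("your-org/your-wan22-lora", adapter_name="custom")
# ... so load it again with load_into_transformer_2=True for the second denoiser.
pipe.load_lora_weights("your-org/your-wan22-lora", adapter_name="custom_2", load_into_transformer_2=True)
```
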
## WanPipeline
|
||||
|
||||
[[autodoc]] WanPipeline
|
||||
|
||||
@@ -12,6 +12,9 @@ specific language governing permissions and limitations under the License.
|
||||
|
||||
# Würstchen
|
||||
|
||||
> [!WARNING]
|
||||
> This pipeline is deprecated but it can still be used. However, we won't test the pipeline anymore and won't accept any changes to it. If you run into any issues, reinstall the last Diffusers version that supported this model.
|
||||
|
||||
<div class="flex flex-wrap space-x-1">
|
||||
<img alt="LoRA" src="https://img.shields.io/badge/LoRA-d8b4fe?style=flat"/>
|
||||
</div>
|
||||
|
||||
@@ -27,19 +27,19 @@ Learn how to quantize models in the [Quantization](../quantization/overview) gui
|
||||
|
||||
## BitsAndBytesConfig
|
||||
|
||||
[[autodoc]] BitsAndBytesConfig
|
||||
[[autodoc]] quantizers.quantization_config.BitsAndBytesConfig
|
||||
|
||||
## GGUFQuantizationConfig
|
||||
|
||||
[[autodoc]] GGUFQuantizationConfig
|
||||
[[autodoc]] quantizers.quantization_config.GGUFQuantizationConfig
|
||||
|
||||
## QuantoConfig
|
||||
|
||||
[[autodoc]] QuantoConfig
|
||||
[[autodoc]] quantizers.quantization_config.QuantoConfig
|
||||
|
||||
## TorchAoConfig
|
||||
|
||||
[[autodoc]] TorchAoConfig
|
||||
[[autodoc]] quantizers.quantization_config.TorchAoConfig
|
||||
|
||||
## DiffusersQuantizer
|
||||
|
||||
|
||||
@@ -12,37 +12,24 @@ specific language governing permissions and limitations under the License.
|
||||
|
||||
<p align="center">
|
||||
<br>
|
||||
<img src="https://raw.githubusercontent.com/huggingface/diffusers/77aadfee6a891ab9fcfb780f87c693f7a5beeb8e/docs/source/imgs/diffusers_library.jpg" width="400"/>
|
||||
<img src="https://raw.githubusercontent.com/huggingface/diffusers/77aadfee6a891ab9fcfb780f87c693f7a5beeb8e/docs/source/imgs/diffusers_library.jpg" width="400" style="border: none;"/>
|
||||
<br>
|
||||
</p>
|
||||
|
||||
# Diffusers
|
||||
|
||||
🤗 Diffusers is the go-to library for state-of-the-art pretrained diffusion models for generating images, audio, and even 3D structures of molecules. Whether you're looking for a simple inference solution or want to train your own diffusion model, 🤗 Diffusers is a modular toolbox that supports both. Our library is designed with a focus on [usability over performance](conceptual/philosophy#usability-over-performance), [simple over easy](conceptual/philosophy#simple-over-easy), and [customizability over abstractions](conceptual/philosophy#tweakable-contributorfriendly-over-abstraction).
|
||||
Diffusers is a library of state-of-the-art pretrained diffusion models for generating videos, images, and audio.
|
||||
|
||||
The library has three main components:
|
||||
The library revolves around the [`DiffusionPipeline`], an API designed for:
|
||||
|
||||
- State-of-the-art diffusion pipelines for inference with just a few lines of code. There are many pipelines in 🤗 Diffusers, check out the table in the pipeline [overview](api/pipelines/overview) for a complete list of available pipelines and the task they solve.
|
||||
- Interchangeable [noise schedulers](api/schedulers/overview) for balancing trade-offs between generation speed and quality.
|
||||
- Pretrained [models](api/models) that can be used as building blocks, and combined with schedulers, for creating your own end-to-end diffusion systems.
|
||||
- easy inference with only a few lines of code
|
||||
- flexibility to mix-and-match pipeline components (models, schedulers)
|
||||
- loading and using adapters like LoRA
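
As a minimal sketch of that workflow (the checkpoint below is just one of many available on the Hub):

```py
import torch
from diffusers import DiffusionPipeline

# Load a pretrained text-to-image pipeline and generate an image in a few lines.
pipe = DiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
).to("cuda")
image = pipe("an astronaut riding a horse on the moon").images[0]
image.save("astronaut.png")
```
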
<div class="mt-10">
|
||||
<div class="w-full flex flex-col space-y-4 md:space-y-0 md:grid md:grid-cols-2 md:gap-y-4 md:gap-x-5">
|
||||
<a class="!no-underline border dark:border-gray-700 p-5 rounded-lg shadow hover:shadow-lg" href="./tutorials/tutorial_overview"
|
||||
><div class="w-full text-center bg-gradient-to-br from-blue-400 to-blue-500 rounded-lg py-1.5 font-semibold mb-5 text-white text-lg leading-relaxed">Tutorials</div>
|
||||
<p class="text-gray-700">Learn the fundamental skills you need to start generating outputs, build your own diffusion system, and train a diffusion model. We recommend starting here if you're using 🤗 Diffusers for the first time!</p>
|
||||
</a>
|
||||
<a class="!no-underline border dark:border-gray-700 p-5 rounded-lg shadow hover:shadow-lg" href="./using-diffusers/loading_overview"
|
||||
><div class="w-full text-center bg-gradient-to-br from-indigo-400 to-indigo-500 rounded-lg py-1.5 font-semibold mb-5 text-white text-lg leading-relaxed">How-to guides</div>
|
||||
<p class="text-gray-700">Practical guides for helping you load pipelines, models, and schedulers. You'll also learn how to use pipelines for specific tasks, control how outputs are generated, optimize for inference speed, and different training techniques.</p>
|
||||
</a>
|
||||
<a class="!no-underline border dark:border-gray-700 p-5 rounded-lg shadow hover:shadow-lg" href="./conceptual/philosophy"
|
||||
><div class="w-full text-center bg-gradient-to-br from-pink-400 to-pink-500 rounded-lg py-1.5 font-semibold mb-5 text-white text-lg leading-relaxed">Conceptual guides</div>
|
||||
<p class="text-gray-700">Understand why the library was designed the way it was, and learn more about the ethical guidelines and safety implementations for using the library.</p>
|
||||
</a>
|
||||
<a class="!no-underline border dark:border-gray-700 p-5 rounded-lg shadow hover:shadow-lg" href="./api/models/overview"
|
||||
><div class="w-full text-center bg-gradient-to-br from-purple-400 to-purple-500 rounded-lg py-1.5 font-semibold mb-5 text-white text-lg leading-relaxed">Reference</div>
|
||||
<p class="text-gray-700">Technical descriptions of how 🤗 Diffusers classes and methods work.</p>
|
||||
</a>
|
||||
</div>
|
||||
</div>
|
||||
Diffusers also comes with optimizations - such as offloading and quantization - to ensure even the largest models are accessible on memory-constrained devices. If memory is not an issue, Diffusers supports torch.compile to boost inference speed.
|
||||
|
||||
Get started right away with a Diffusers model on the [Hub](https://huggingface.co/models?library=diffusers&sort=trending) today!
|
||||
|
||||
## Learn
|
||||
|
||||
If you're a beginner, we recommend starting with the [Hugging Face Diffusion Models Course](https://huggingface.co/learn/diffusion-course/unit0/1). You'll learn the theory behind diffusion models, and learn how to use the Diffusers library to generate images, fine-tune your own models, and more.
|
||||
|
||||
@@ -12,183 +12,156 @@ specific language governing permissions and limitations under the License.
|
||||
|
||||
# Installation
|
||||
|
||||
🤗 Diffusers is tested on Python 3.8+, PyTorch 1.7.0+, and Flax. Follow the installation instructions below for the deep learning library you are using:
|
||||
Diffusers is tested on Python 3.8+, PyTorch 1.4+, and Flax 0.4.1+. Follow the installation instructions for the deep learning library you're using, [PyTorch](https://pytorch.org/get-started/locally/) or [Flax](https://flax.readthedocs.io/en/latest/).
|
||||
|
||||
- [PyTorch](https://pytorch.org/get-started/locally/) installation instructions
|
||||
- [Flax](https://flax.readthedocs.io/en/latest/) installation instructions
|
||||
|
||||
## Install with pip
|
||||
|
||||
You should install 🤗 Diffusers in a [virtual environment](https://docs.python.org/3/library/venv.html).
|
||||
If you're unfamiliar with Python virtual environments, take a look at this [guide](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/).
|
||||
A virtual environment makes it easier to manage different projects and avoid compatibility issues between dependencies.
|
||||
|
||||
Create a virtual environment with Python or [uv](https://docs.astral.sh/uv/) (refer to [Installation](https://docs.astral.sh/uv/getting-started/installation/) for installation instructions), a fast Rust-based Python package and project manager.
|
||||
|
||||
<hfoptions id="install">
|
||||
<hfoption id="uv">
|
||||
Create a [virtual environment](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/) for easier management of separate projects and to avoid compatibility issues between dependencies. Use [uv](https://docs.astral.sh/uv/), a Rust-based Python package and project manager, to create a virtual environment and install Diffusers.
|
||||
|
||||
```bash
|
||||
uv venv my-env
|
||||
source my-env/bin/activate
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="Python">
|
||||
Install Diffusers with one of the following methods.
|
||||
|
||||
<hfoptions id="install">
|
||||
<hfoption id="pip">
|
||||
|
||||
PyTorch only supports Python 3.8 - 3.11 on Windows.
|
||||
|
||||
```bash
|
||||
python -m venv my-env
|
||||
source my-env/bin/activate
|
||||
uv pip install diffusers["torch"] transformers
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
You should also install 🤗 Transformers because 🤗 Diffusers relies on its models.
|
||||
|
||||
|
||||
<frameworkcontent>
|
||||
<pt>
|
||||
|
||||
PyTorch only supports Python 3.8 - 3.11 on Windows. Install Diffusers with uv.
|
||||
|
||||
```bash
|
||||
uv pip install diffusers["torch"] transformers
|
||||
```
|
||||
|
||||
You can also install Diffusers with pip.
|
||||
|
||||
```bash
|
||||
pip install diffusers["torch"] transformers
|
||||
```
|
||||
|
||||
</pt>
|
||||
<jax>
|
||||
|
||||
Install Diffusers with uv.
|
||||
Use the command below for Flax.
|
||||
|
||||
```bash
|
||||
uv pip install diffusers["flax"] transformers
|
||||
```
|
||||
|
||||
You can also install Diffusers with pip.
|
||||
|
||||
```bash
|
||||
pip install diffusers["flax"] transformers
|
||||
```
|
||||
|
||||
</jax>
|
||||
</frameworkcontent>
|
||||
|
||||
## Install with conda
|
||||
|
||||
After activating your virtual environment, with `conda` (maintained by the community):
|
||||
</hfoption>
|
||||
<hfoption id="conda">
|
||||
|
||||
```bash
|
||||
conda install -c conda-forge diffusers
|
||||
```
|
||||
|
||||
## Install from source

Before installing 🤗 Diffusers from source, make sure you have PyTorch and 🤗 Accelerate installed.

To install 🤗 Accelerate:

```bash
pip install accelerate
# or with uv
uv pip install accelerate
```

Then install 🤗 Diffusers from source:

```bash
pip install git+https://github.com/huggingface/diffusers
# or with uv
uv pip install git+https://github.com/huggingface/diffusers
```

This command installs the bleeding edge `main` version rather than the latest `stable` version. The `main` version is useful for staying up-to-date with the latest developments, for instance, if a bug has been fixed since the last official release but a new release hasn't been rolled out yet. However, this means the `main` version may not always be stable. We strive to keep the `main` version operational, and most issues are usually resolved within a few hours or a day. If you run into a problem, please open an [Issue](https://github.com/huggingface/diffusers/issues/new/choose) so we can fix it even sooner!
|
||||
|
||||
## Editable install

An editable install is recommended for development workflows or if you're using the `main` version of the source code. A special link is created between the cloned repository and the Python library paths, which avoids reinstalling a package after every change. You will need an editable install if you'd like to:

* Use the `main` version of the source code.
* Contribute to 🤗 Diffusers and need to test changes in the code.

Clone the repository and install 🤗 Diffusers with the following commands.

<hfoptions id="editable">
<hfoption id="PyTorch">

```bash
git clone https://github.com/huggingface/diffusers.git
cd diffusers
uv pip install -e ".[torch]"
```

</hfoption>
<hfoption id="Flax">

```bash
git clone https://github.com/huggingface/diffusers.git
cd diffusers
uv pip install -e ".[flax]"
```

</hfoption>
</hfoptions>

You can also run `pip install -e ".[torch]"` or `pip install -e ".[flax]"` if you prefer pip over uv.

These commands link the folder you cloned the repository to with your Python library paths. Python will now look inside the folder you cloned to in addition to the normal library paths. For example, if your Python packages are typically installed in `~/anaconda3/envs/main/lib/python3.10/site-packages/`, Python will also search the `~/diffusers/` folder you cloned to.
|
||||
|
||||
> [!WARNING]
> You must keep the `diffusers` folder if you want to keep using the library with an editable install.
|
||||
|
||||
Update your cloned repository to the latest version of Diffusers with the command below.
|
||||
|
||||
```bash
|
||||
cd ~/diffusers/
|
||||
git pull
|
||||
```
|
||||
|
||||
Your Python environment will find the `main` version of 🤗 Diffusers on the next run.
|
||||
|
||||
## Cache
|
||||
|
||||
Model weights and files are downloaded from the Hub to a cache, which is usually your home directory. Change the cache location with the [HF_HOME](https://huggingface.co/docs/huggingface_hub/package_reference/environment_variables#hfhome) or [HF_HUB_CACHE](https://huggingface.co/docs/huggingface_hub/package_reference/environment_variables#hfhubcache) environment variables, or by configuring the `cache_dir` parameter in methods like [`~DiffusionPipeline.from_pretrained`].
|
||||
|
||||
|
||||
<hfoptions id="cache">
|
||||
<hfoption id="env variable">
|
||||
|
||||
```bash
|
||||
export HF_HOME="/path/to/your/cache"
|
||||
export HF_HUB_CACHE="/path/to/your/hub/cache"
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="from_pretrained">
|
||||
|
||||
```py
|
||||
from diffusers import DiffusionPipeline
|
||||
|
||||
pipeline = DiffusionPipeline.from_pretrained(
|
||||
"black-forest-labs/FLUX.1-dev",
|
||||
cache_dir="/path/to/your/cache"
|
||||
)
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
Cached files allow you to use Diffusers offline. Set the [HF_HUB_OFFLINE](https://huggingface.co/docs/huggingface_hub/package_reference/environment_variables#hfhuboffline) environment variable to `1` to prevent Diffusers from connecting to the internet.
|
||||
|
||||
```shell
|
||||
export HF_HUB_OFFLINE=1
|
||||
```
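
If you prefer to control this per call instead of globally, [`~DiffusionPipeline.from_pretrained`] also accepts a `local_files_only` argument that restricts loading to files already in the cache.

```py
from diffusers import DiffusionPipeline

# only use files that were previously downloaded to the cache
pipeline = DiffusionPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-dev",
    local_files_only=True
)
```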
|
||||
|
||||
For more details about managing and cleaning the cache, take a look at the [Understand caching](https://huggingface.co/docs/huggingface_hub/guides/manage-cache) guide.
|
||||
|
||||
## Telemetry logging
|
||||
|
||||
Diffusers gathers telemetry information during [`~DiffusionPipeline.from_pretrained`] requests. The data gathered includes the Diffusers and PyTorch/Flax version, the requested model or pipeline class, and the path to a pretrained checkpoint if it is hosted on the Hugging Face Hub.
|
||||
|
||||
This usage data helps us debug issues and prioritize new features.
|
||||
Telemetry is only sent when loading models and pipelines from the Hub,
|
||||
and it is not collected if you're loading local files.
|
||||
|
||||
We understand that not everyone wants to share additional information, and we respect your privacy. Opt out and disable telemetry collection with the [HF_HUB_DISABLE_TELEMETRY](https://huggingface.co/docs/huggingface_hub/package_reference/environment_variables#hfhubdisabletelemetry) environment variable.
|
||||
<hfoptions id="telemetry">
|
||||
<hfoption id="Linux/macOS">
|
||||
|
||||
```bash
|
||||
export HF_HUB_DISABLE_TELEMETRY=1
|
||||
```
|
||||
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="Windows">
|
||||
|
||||
```bash
|
||||
set HF_HUB_DISABLE_TELEMETRY=1
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
156 docs/source/en/modular_diffusers/auto_pipeline_blocks.md Normal file
@@ -0,0 +1,156 @@
|
||||
<!--Copyright 2025 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# AutoPipelineBlocks
|
||||
|
||||
[`~modular_pipelines.AutoPipelineBlocks`] are a multi-block type containing blocks that support different workflows. It automatically selects which sub-blocks to run based on the input provided at runtime. This is typically used to package multiple workflows - text-to-image, image-to-image, inpaint - into a single pipeline for convenience.
|
||||
|
||||
This guide shows how to create [`~modular_pipelines.AutoPipelineBlocks`].
|
||||
|
||||
Create three [`~modular_pipelines.ModularPipelineBlocks`] for text-to-image, image-to-image, and inpainting. These represent the different workflows available in the pipeline.
|
||||
|
||||
<hfoptions id="auto">
|
||||
<hfoption id="text-to-image">
|
||||
|
||||
```py
|
||||
import torch
|
||||
from diffusers.modular_pipelines import ModularPipelineBlocks, InputParam, OutputParam
|
||||
|
||||
class TextToImageBlock(ModularPipelineBlocks):
|
||||
model_name = "text2img"
|
||||
|
||||
@property
|
||||
def inputs(self):
|
||||
return [InputParam(name="prompt")]
|
||||
|
||||
@property
|
||||
def intermediate_outputs(self):
|
||||
return []
|
||||
|
||||
@property
|
||||
def description(self):
|
||||
return "I'm a text-to-image workflow!"
|
||||
|
||||
def __call__(self, components, state):
|
||||
block_state = self.get_block_state(state)
|
||||
print("running the text-to-image workflow")
|
||||
# Add your text-to-image logic here
|
||||
# For example: generate image from prompt
|
||||
self.set_block_state(state, block_state)
|
||||
return components, state
|
||||
```
|
||||
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="image-to-image">
|
||||
|
||||
```py
|
||||
class ImageToImageBlock(ModularPipelineBlocks):
|
||||
model_name = "img2img"
|
||||
|
||||
@property
|
||||
def inputs(self):
|
||||
return [InputParam(name="prompt"), InputParam(name="image")]
|
||||
|
||||
@property
|
||||
def intermediate_outputs(self):
|
||||
return []
|
||||
|
||||
@property
|
||||
def description(self):
|
||||
return "I'm an image-to-image workflow!"
|
||||
|
||||
def __call__(self, components, state):
|
||||
block_state = self.get_block_state(state)
|
||||
print("running the image-to-image workflow")
|
||||
# Add your image-to-image logic here
|
||||
# For example: transform input image based on prompt
|
||||
self.set_block_state(state, block_state)
|
||||
return components, state
|
||||
```
|
||||
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="inpaint">
|
||||
|
||||
```py
|
||||
class InpaintBlock(ModularPipelineBlocks):
|
||||
model_name = "inpaint"
|
||||
|
||||
@property
|
||||
def inputs(self):
|
||||
return [InputParam(name="prompt"), InputParam(name="image"), InputParam(name="mask")]
|
||||
|
||||
@property
|
||||
def intermediate_outputs(self):
|
||||
return []
|
||||
|
||||
@property
|
||||
def description(self):
|
||||
return "I'm an inpaint workflow!"
|
||||
|
||||
def __call__(self, components, state):
|
||||
block_state = self.get_block_state(state)
|
||||
print("running the inpaint workflow")
|
||||
# Add your inpainting logic here
|
||||
# For example: fill masked areas based on prompt
|
||||
self.set_block_state(state, block_state)
|
||||
return components, state
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
Create an [`~modular_pipelines.AutoPipelineBlocks`] class that includes a list of the sub-block classes and their corresponding block names.
|
||||
|
||||
You also need to include `block_trigger_inputs`, a list of input names that trigger the corresponding block. If a trigger input is provided at runtime, then that block is selected to run. Use `None` to specify the default block to run if no trigger inputs are detected.
|
||||
|
||||
Lastly, it is important to include a `description` that clearly explains which inputs trigger which workflow. This helps users understand how to run specific workflows.
|
||||
|
||||
```py
|
||||
from diffusers.modular_pipelines import AutoPipelineBlocks
|
||||
|
||||
class AutoImageBlocks(AutoPipelineBlocks):
|
||||
# List of sub-block classes to choose from
|
||||
block_classes = [InpaintBlock, ImageToImageBlock, TextToImageBlock]
|
||||
# Names for each block in the same order
|
||||
block_names = ["inpaint", "img2img", "text2img"]
|
||||
# Trigger inputs that determine which block to run
|
||||
# - "mask" triggers inpaint workflow
|
||||
# - "image" triggers img2img workflow (but only if mask is not provided)
|
||||
# - if none of above, runs the text2img workflow (default)
|
||||
block_trigger_inputs = ["mask", "image", None]
|
||||
# Description is extremely important for AutoPipelineBlocks
|
||||
|
||||
@property
def description(self):
|
||||
return (
|
||||
"Pipeline generates images given different types of conditions!\n"
|
||||
+ "This is an auto pipeline block that works for text2img, img2img and inpainting tasks.\n"
|
||||
+ " - inpaint workflow is run when `mask` is provided.\n"
|
||||
+ " - img2img workflow is run when `image` is provided (but only when `mask` is not provided).\n"
|
||||
+ " - text2img workflow is run when neither `image` nor `mask` is provided.\n"
|
||||
)
|
||||
```
|
||||
|
||||
It is **very** important to include a `description` to avoid any confusion over how to run a block and what inputs are required. While [`~modular_pipelines.AutoPipelineBlocks`] are convenient, their conditional logic may be difficult to figure out if it isn't properly explained.
|
||||
|
||||
Create an instance of `AutoImageBlocks`.
|
||||
|
||||
```py
|
||||
auto_blocks = AutoImageBlocks()
|
||||
```
|
||||
|
||||
For more complex compositions, such as nested [`~modular_pipelines.AutoPipelineBlocks`] blocks used as sub-blocks in larger pipelines, use the [`~modular_pipelines.SequentialPipelineBlocks.get_execution_blocks`] method to extract the block that is actually run based on your input.
|
||||
|
||||
```py
|
||||
auto_blocks.get_execution_blocks("mask")
|
||||
```
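
You can call it with other trigger inputs as well to compare which workflow each one selects. This is a small sketch that assumes the toy blocks defined above.

```py
# "mask" selects the inpaint branch, "image" the img2img branch
print(auto_blocks.get_execution_blocks("mask"))
print(auto_blocks.get_execution_blocks("image"))
```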
|
||||
190 docs/source/en/modular_diffusers/components_manager.md Normal file
@@ -0,0 +1,190 @@
|
||||
<!--Copyright 2025 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# ComponentsManager
|
||||
|
||||
The [`ComponentsManager`] is a model registry and management system for Modular Diffusers. It adds and tracks models, stores useful metadata (model size, device placement, adapters), prevents duplicate model instances, and supports offloading.
|
||||
|
||||
This guide will show you how to use [`ComponentsManager`] to manage components and device memory.
|
||||
|
||||
## Add a component
|
||||
|
||||
The [`ComponentsManager`] should be created alongside a [`ModularPipeline`] in either [`~ModularPipeline.from_pretrained`] or [`~ModularPipelineBlocks.init_pipeline`].
|
||||
|
||||
> [!TIP]
|
||||
> The `collection` parameter is optional but makes it easier to organize and manage components.
|
||||
|
||||
<hfoptions id="create">
|
||||
<hfoption id="from_pretrained">
|
||||
|
||||
```py
|
||||
from diffusers import ModularPipeline, ComponentsManager
|
||||
|
||||
comp = ComponentsManager()
|
||||
pipe = ModularPipeline.from_pretrained("YiYiXu/modular-demo-auto", components_manager=comp, collection="test1")
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="init_pipeline">
|
||||
|
||||
```py
|
||||
from diffusers import ComponentsManager
|
||||
from diffusers.modular_pipelines import SequentialPipelineBlocks
|
||||
from diffusers.modular_pipelines.stable_diffusion_xl import TEXT2IMAGE_BLOCKS
|
||||
|
||||
t2i_blocks = SequentialPipelineBlocks.from_blocks_dict(TEXT2IMAGE_BLOCKS)
|
||||
|
||||
modular_repo_id = "YiYiXu/modular-loader-t2i-0704"
|
||||
components = ComponentsManager()
|
||||
t2i_pipeline = t2i_blocks.init_pipeline(modular_repo_id, components_manager=components)
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
Components are only loaded and registered when using [`~ModularPipeline.load_components`] or [`~ModularPipeline.load_default_components`]. The example below loads the first pipeline's components with [`~ModularPipeline.load_default_components`], then creates a second pipeline that reuses all of them and assigns it to a different collection.
|
||||
|
||||
```py
|
||||
pipe.load_default_components()
|
||||
pipe2 = ModularPipeline.from_pretrained("YiYiXu/modular-demo-auto", components_manager=comp, collection="test2")
|
||||
```
|
||||
|
||||
Use the [`~ModularPipeline.null_component_names`] property to identify any components that need to be loaded, retrieve them with [`~ComponentsManager.get_components_by_names`], and then call [`~ModularPipeline.update_components`] to add the missing components.
|
||||
|
||||
```py
|
||||
pipe2.null_component_names
|
||||
['text_encoder', 'text_encoder_2', 'tokenizer', 'tokenizer_2', 'image_encoder', 'unet', 'vae', 'scheduler', 'controlnet']
|
||||
|
||||
comp_dict = comp.get_components_by_names(names=pipe2.null_component_names)
|
||||
pipe2.update_components(**comp_dict)
|
||||
```
|
||||
|
||||
To add individual components, use the [`~ComponentsManager.add`] method. This registers a component with a unique id.
|
||||
|
||||
```py
|
||||
from diffusers import AutoModel
|
||||
|
||||
text_encoder = AutoModel.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", subfolder="text_encoder")
|
||||
component_id = comp.add("text_encoder", text_encoder)
|
||||
comp
|
||||
```
|
||||
|
||||
Use [`~ComponentsManager.remove`] to remove a component using their id.
|
||||
|
||||
```py
|
||||
comp.remove("text_encoder_139917733042864")
|
||||
```
|
||||
|
||||
## Retrieve a component
|
||||
|
||||
The [`ComponentsManager`] provides several methods to retrieve registered components.
|
||||
|
||||
### get_one
|
||||
|
||||
The [`~ComponentsManager.get_one`] method returns a single component and supports pattern matching for the `name` parameter. If multiple components match, [`~ComponentsManager.get_one`] raises an error.
|
||||
|
||||
| Pattern | Example | Description |
|
||||
|-------------|----------------------------------|-------------------------------------------|
|
||||
| exact | `comp.get_one(name="unet")` | exact name match |
|
||||
| wildcard | `comp.get_one(name="unet*")` | names starting with "unet" |
|
||||
| exclusion | `comp.get_one(name="!unet")` | exclude components named "unet" |
|
||||
| or | `comp.get_one(name="unet|vae")` | name is "unet" or "vae" |
|
||||
|
||||
[`~ComponentsManager.get_one`] also filters components by the `collection` argument or `load_id` argument.
|
||||
|
||||
```py
|
||||
comp.get_one(name="unet", collection="sdxl")
|
||||
```
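
For example, filtering by `load_id` pins a component down by its loading specification. This is a brief sketch; the exact `load_id` string (repo, subfolder, variant, and revision joined by `|`) depends on how the component was loaded.

```py
# load_id encodes repo|subfolder|variant|revision for this illustrative example
comp.get_one(load_id="stabilityai/stable-diffusion-xl-base-1.0|unet|null|null")
```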
|
||||
|
||||
### get_components_by_names
|
||||
|
||||
The [`~ComponentsManager.get_components_by_names`] method accepts a list of names and returns a dictionary mapping names to components. This is especially useful with [`ModularPipeline`] since they provide lists of required component names and the returned dictionary can be passed directly to [`~ModularPipeline.update_components`].
|
||||
|
||||
```py
|
||||
component_dict = comp.get_components_by_names(names=["text_encoder", "unet", "vae"])
|
||||
{"text_encoder": component1, "unet": component2, "vae": component3}
|
||||
```
|
||||
|
||||
## Duplicate detection
|
||||
|
||||
It is recommended to load model components with [`ComponentSpec`], which assigns each component a unique id that encodes its loading parameters. This allows [`ComponentsManager`] to automatically detect and prevent duplicate model instances even when different objects represent the same underlying checkpoint.
|
||||
|
||||
```py
|
||||
from diffusers import AutoModel, ComponentSpec, ComponentsManager
|
||||
from transformers import CLIPTextModel
|
||||
|
||||
comp = ComponentsManager()
|
||||
|
||||
# Create ComponentSpec for the first text encoder
|
||||
spec = ComponentSpec(name="text_encoder", repo="stabilityai/stable-diffusion-xl-base-1.0", subfolder="text_encoder", type_hint=AutoModel)
|
||||
# Create ComponentSpec for a duplicate text encoder (it is same checkpoint, from the same repo/subfolder)
|
||||
spec_duplicated = ComponentSpec(name="text_encoder_duplicated", repo="stabilityai/stable-diffusion-xl-base-1.0", subfolder="text_encoder", type_hint=CLIPTextModel)
|
||||
|
||||
# Load and add both components - the manager will detect they're the same model
|
||||
comp.add("text_encoder", spec.load())
|
||||
comp.add("text_encoder_duplicated", spec_duplicated.load())
|
||||
```
|
||||
|
||||
This returns a warning with instructions for removing the duplicate.
|
||||
|
||||
```py
|
||||
ComponentsManager: adding component 'text_encoder_duplicated_139917580682672', but it has duplicate load_id 'stabilityai/stable-diffusion-xl-base-1.0|text_encoder|null|null' with existing components: text_encoder_139918506246832. To remove a duplicate, call `components_manager.remove('<component_id>')`.
|
||||
'text_encoder_duplicated_139917580682672'
|
||||
```
|
||||
|
||||
You could also add a component without using [`ComponentSpec`] and duplicate detection still works in most cases even if you're adding the same component under a different name.
|
||||
|
||||
However, [`ComponentsManager`] can't detect duplicates when you load the same checkpoint into different objects without [`ComponentSpec`]. In this case, you should load the model with [`ComponentSpec`].
|
||||
|
||||
```py
|
||||
text_encoder_2 = AutoModel.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", subfolder="text_encoder")
|
||||
comp.add("text_encoder", text_encoder_2)
|
||||
'text_encoder_139917732983664'
|
||||
```
|
||||
|
||||
## Collections
|
||||
|
||||
Collections are labels assigned to components for better organization and management. Add a component to a collection with the `collection` argument in [`~ComponentsManager.add`].
|
||||
|
||||
Only one component per name is allowed in each collection. Adding a second component with the same name automatically removes the first component.
|
||||
|
||||
```py
|
||||
from diffusers import AutoModel, ComponentSpec, ComponentsManager
|
||||
|
||||
comp = ComponentsManager()
|
||||
# Create ComponentSpec for the first UNet
|
||||
spec = ComponentSpec(name="unet", repo="stabilityai/stable-diffusion-xl-base-1.0", subfolder="unet", type_hint=AutoModel)
|
||||
# Create ComponentSpec for a different UNet
|
||||
spec2 = ComponentSpec(name="unet", repo="RunDiffusion/Juggernaut-XL-v9", subfolder="unet", type_hint=AutoModel, variant="fp16")
|
||||
|
||||
# Add both UNets to the same collection - the second one will replace the first
|
||||
comp.add("unet", spec.load(), collection="sdxl")
|
||||
comp.add("unet", spec2.load(), collection="sdxl")
|
||||
```
|
||||
|
||||
This makes it convenient to work with node-based systems because you can:
|
||||
|
||||
- Mark all models as loaded from one node with the `collection` label.
|
||||
- Automatically replace models when new checkpoints are loaded under the same name.
|
||||
- Batch delete all models in a collection when a node is removed.
|
||||
|
||||
## Offloading
|
||||
|
||||
The [`~ComponentsManager.enable_auto_cpu_offload`] method is a global offloading strategy that works across all models regardless of which pipeline is using them. Once enabled, you don't need to worry about device placement if you add or remove components.
|
||||
|
||||
```py
|
||||
comp.enable_auto_cpu_offload(device="cuda")
|
||||
```
|
||||
|
||||
All models begin on the CPU and [`ComponentsManager`] moves them to the appropriate device right before they're needed, and moves other models back to the CPU when GPU memory is low.
|
||||
|
||||
You can set your own rules for which models to offload first.
|
||||
175 docs/source/en/modular_diffusers/guiders.md Normal file
@@ -0,0 +1,175 @@
|
||||
<!--Copyright 2025 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# Guiders
|
||||
|
||||
[Classifier-free guidance](https://huggingface.co/papers/2207.12598) steers model generation toward outputs that better match a prompt, and is commonly used to improve generation quality, control, and adherence to prompts. There are different types of guidance methods, and in Diffusers, they are known as *guiders*. Like blocks, it is easy to switch between guiders for different use cases without rewriting the pipeline.

This guide will show you how to switch guiders, adjust guider parameters, and load and share them on the Hub.
|
||||
|
||||
## Switching guiders
|
||||
|
||||
[`ClassifierFreeGuidance`] is the default guider and is created when a pipeline is initialized with [`~ModularPipelineBlocks.init_pipeline`]. It is created with the `from_config` method, which means it doesn't require loading specifications from a modular repository, and it won't be listed in `modular_model_index.json`.
|
||||
|
||||
Use [`~ModularPipeline.get_component_spec`] to inspect a guider.
|
||||
|
||||
```py
|
||||
t2i_pipeline.get_component_spec("guider")
|
||||
ComponentSpec(name='guider', type_hint=<class 'diffusers.guiders.classifier_free_guidance.ClassifierFreeGuidance'>, description=None, config=FrozenDict([('guidance_scale', 7.5), ('guidance_rescale', 0.0), ('use_original_formulation', False), ('start', 0.0), ('stop', 1.0), ('_use_default_values', ['start', 'guidance_rescale', 'stop', 'use_original_formulation'])]), repo=None, subfolder=None, variant=None, revision=None, default_creation_method='from_config')
|
||||
```
|
||||
|
||||
Switch to a different guider by passing the new guider to [`~ModularPipeline.update_components`].
|
||||
|
||||
> [!TIP]
|
||||
> Changing guiders will return text letting you know you're changing the guider type.
|
||||
> ```bash
|
||||
> ModularPipeline.update_components: adding guider with new type: PerturbedAttentionGuidance, previous type: ClassifierFreeGuidance
|
||||
> ```
|
||||
|
||||
```py
|
||||
from diffusers import LayerSkipConfig, PerturbedAttentionGuidance
|
||||
|
||||
config = LayerSkipConfig(indices=[2, 9], fqn="mid_block.attentions.0.transformer_blocks", skip_attention=False, skip_attention_scores=True, skip_ff=False)
|
||||
guider = PerturbedAttentionGuidance(
|
||||
guidance_scale=5.0, perturbed_guidance_scale=2.5, perturbed_guidance_config=config
|
||||
)
|
||||
t2i_pipeline.update_components(guider=guider)
|
||||
```
|
||||
|
||||
Use [`~ModularPipeline.get_component_spec`] again to verify the guider type is different.
|
||||
|
||||
```py
|
||||
t2i_pipeline.get_component_spec("guider")
|
||||
ComponentSpec(name='guider', type_hint=<class 'diffusers.guiders.perturbed_attention_guidance.PerturbedAttentionGuidance'>, description=None, config=FrozenDict([('guidance_scale', 5.0), ('perturbed_guidance_scale', 2.5), ('perturbed_guidance_start', 0.01), ('perturbed_guidance_stop', 0.2), ('perturbed_guidance_layers', None), ('perturbed_guidance_config', LayerSkipConfig(indices=[2, 9], fqn='mid_block.attentions.0.transformer_blocks', skip_attention=False, skip_attention_scores=True, skip_ff=False, dropout=1.0)), ('guidance_rescale', 0.0), ('use_original_formulation', False), ('start', 0.0), ('stop', 1.0), ('_use_default_values', ['perturbed_guidance_start', 'use_original_formulation', 'perturbed_guidance_layers', 'stop', 'start', 'guidance_rescale', 'perturbed_guidance_stop']), ('_class_name', 'PerturbedAttentionGuidance'), ('_diffusers_version', '0.35.0.dev0')]), repo=None, subfolder=None, variant=None, revision=None, default_creation_method='from_config')
|
||||
```
|
||||
|
||||
## Loading custom guiders
|
||||
|
||||
A guider that is saved on the Hub with loading specifications in a `modular_model_index.json` file is treated as a `from_pretrained` component instead of a `from_config` component.
|
||||
|
||||
```json
|
||||
{
|
||||
"guider": [
|
||||
null,
|
||||
null,
|
||||
{
|
||||
"repo": "YiYiXu/modular-loader-t2i-guider",
|
||||
"revision": null,
|
||||
"subfolder": "pag_guider",
|
||||
"type_hint": [
|
||||
"diffusers",
|
||||
"PerturbedAttentionGuidance"
|
||||
],
|
||||
"variant": null
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
The guider is only created after calling [`~ModularPipeline.load_default_components`] based on the loading specification in `modular_model_index.json`.
|
||||
|
||||
```py
|
||||
t2i_pipeline = t2i_blocks.init_pipeline("YiYiXu/modular-doc-guider")
|
||||
# not created during init
|
||||
assert t2i_pipeline.guider is None
|
||||
t2i_pipeline.load_default_components()
|
||||
# loaded as PAG guider
|
||||
t2i_pipeline.guider
|
||||
```
|
||||
|
||||
|
||||
## Changing guider parameters
|
||||
|
||||
The guider parameters can be adjusted with either the [`~ComponentSpec.create`] method or with [`~ModularPipeline.update_components`]. The example below changes the `guidance_scale` value.
|
||||
|
||||
<hfoptions id="switch">
|
||||
<hfoption id="create">
|
||||
|
||||
```py
|
||||
guider_spec = t2i_pipeline.get_component_spec("guider")
|
||||
guider = guider_spec.create(guidance_scale=10)
|
||||
t2i_pipeline.update_components(guider=guider)
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="update_components">
|
||||
|
||||
```py
|
||||
guider_spec = t2i_pipeline.get_component_spec("guider")
|
||||
guider_spec.config["guidance_scale"] = 10
|
||||
t2i_pipeline.update_components(guider=guider_spec)
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
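
To confirm the update took effect, you can read the value back from the spec, reusing the [`~ModularPipeline.get_component_spec`] method shown earlier as a quick sanity check.

```py
spec = t2i_pipeline.get_component_spec("guider")
spec.config["guidance_scale"]
# expected: 10
```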
|
||||
|
||||
## Uploading custom guiders
|
||||
|
||||
Call the [`~utils.PushToHubMixin.push_to_hub`] method on a custom guider to share it to the Hub.
|
||||
|
||||
```py
|
||||
guider.push_to_hub("YiYiXu/modular-loader-t2i-guider", subfolder="pag_guider")
|
||||
```
|
||||
|
||||
To make this guider available to the pipeline, either modify the `modular_model_index.json` file or use the [`~ModularPipeline.update_components`] method.
|
||||
|
||||
<hfoptions id="upload">
|
||||
<hfoption id="modular_model_index.json">
|
||||
|
||||
Edit the `modular_model_index.json` file and add a loading specification for the guider by pointing to a folder containing the guider config.
|
||||
|
||||
```json
|
||||
{
|
||||
"guider": [
|
||||
"diffusers",
|
||||
"PerturbedAttentionGuidance",
|
||||
{
|
||||
"repo": "YiYiXu/modular-loader-t2i-guider",
|
||||
"revision": null,
|
||||
"subfolder": "pag_guider",
|
||||
"type_hint": [
|
||||
"diffusers",
|
||||
"PerturbedAttentionGuidance"
|
||||
],
|
||||
"variant": null
|
||||
}
|
||||
],
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="update_components">
|
||||
|
||||
Change the [`~ComponentSpec.default_creation_method`] to `from_pretrained` and use [`~ModularPipeline.update_components`] to update the guider and component specifications as well as the pipeline config.
|
||||
|
||||
> [!TIP]
|
||||
> Changing the creation method will return text letting you know you're changing the creation type to `from_pretrained`.
|
||||
> ```bash
|
||||
> ModularPipeline.update_components: changing the default_creation_method of guider from from_config to from_pretrained.
|
||||
> ```
|
||||
|
||||
```py
|
||||
guider_spec = t2i_pipeline.get_component_spec("guider")
|
||||
guider_spec.default_creation_method="from_pretrained"
|
||||
guider_spec.repo="YiYiXu/modular-loader-t2i-guider"
|
||||
guider_spec.subfolder="pag_guider"
|
||||
pag_guider = guider_spec.load()
|
||||
t2i_pipeline.update_components(guider=pag_guider)
|
||||
```
|
||||
|
||||
To make it the default guider for a pipeline, call [`~utils.PushToHubMixin.push_to_hub`]. This is an optional step and not necessary if you are only experimenting locally.
|
||||
|
||||
```py
|
||||
t2i_pipeline.push_to_hub("YiYiXu/modular-doc-guider")
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
@@ -0,0 +1,93 @@
|
||||
<!--Copyright 2025 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# LoopSequentialPipelineBlocks
|
||||
|
||||
[`~modular_pipelines.LoopSequentialPipelineBlocks`] are a multi-block type that composes other [`~modular_pipelines.ModularPipelineBlocks`] together in a loop. Data flows circularly, using `intermediate_inputs` and `intermediate_outputs`, and each block is run iteratively. This is typically used to create a denoising loop which is iterative by default.
|
||||
|
||||
This guide shows you how to create [`~modular_pipelines.LoopSequentialPipelineBlocks`].
|
||||
|
||||
## Loop wrapper
|
||||
|
||||
[`~modular_pipelines.LoopSequentialPipelineBlocks`] is also known as the *loop wrapper* because it defines the loop structure, iteration variables, and configuration. Within the loop wrapper, you need the following variables.
|
||||
|
||||
- `loop_inputs` are user provided values and equivalent to [`~modular_pipelines.ModularPipelineBlocks.inputs`].
|
||||
- `loop_intermediate_inputs` are intermediate variables from the [`~modular_pipelines.PipelineState`] and equivalent to [`~modular_pipelines.ModularPipelineBlocks.intermediate_inputs`].
|
||||
- `loop_intermediate_outputs` are new intermediate variables created by the block and added to the [`~modular_pipelines.PipelineState`]. It is equivalent to [`~modular_pipelines.ModularPipelineBlocks.intermediate_outputs`].
|
||||
- `__call__` method defines the loop structure and iteration logic.
|
||||
|
||||
```py
|
||||
import torch
|
||||
from diffusers.modular_pipelines import LoopSequentialPipelineBlocks, ModularPipelineBlocks, InputParam, OutputParam
|
||||
|
||||
class LoopWrapper(LoopSequentialPipelineBlocks):
|
||||
model_name = "test"
|
||||
@property
|
||||
def description(self):
|
||||
return "I'm a loop!!"
|
||||
@property
|
||||
def loop_inputs(self):
|
||||
return [InputParam(name="num_steps")]
|
||||
@torch.no_grad()
|
||||
def __call__(self, components, state):
|
||||
block_state = self.get_block_state(state)
|
||||
# Loop structure - can be customized to your needs
|
||||
for i in range(block_state.num_steps):
|
||||
# loop_step executes all registered blocks in sequence
|
||||
components, block_state = self.loop_step(components, block_state, i=i)
|
||||
self.set_block_state(state, block_state)
|
||||
return components, state
|
||||
```
|
||||
|
||||
The loop wrapper can pass additional arguments, like current iteration index, to the loop blocks.
|
||||
|
||||
## Loop blocks
|
||||
|
||||
A loop block is a [`~modular_pipelines.ModularPipelineBlocks`], but the `__call__` method behaves differently.
|
||||
|
||||
- It receives the iteration variable from the loop wrapper.
|
||||
- It works directly with the [`~modular_pipelines.BlockState`] instead of the [`~modular_pipelines.PipelineState`].
|
||||
- It doesn't require retrieving or updating the [`~modular_pipelines.BlockState`].
|
||||
|
||||
Loop blocks share the same [`~modular_pipelines.BlockState`] to allow values to accumulate and change for each iteration in the loop.
|
||||
|
||||
```py
|
||||
class LoopBlock(ModularPipelineBlocks):
|
||||
model_name = "test"
|
||||
@property
|
||||
def inputs(self):
|
||||
return [InputParam(name="x")]
|
||||
@property
|
||||
def intermediate_outputs(self):
|
||||
# outputs produced by this block
|
||||
return [OutputParam(name="x")]
|
||||
@property
|
||||
def description(self):
|
||||
return "I'm a block used inside the `LoopWrapper` class"
|
||||
def __call__(self, components, block_state, i: int):
|
||||
block_state.x += 1
|
||||
return components, block_state
|
||||
```
|
||||
|
||||
## LoopSequentialPipelineBlocks
|
||||
|
||||
Use the [`~modular_pipelines.LoopSequentialPipelineBlocks.from_blocks_dict`] method to add the loop block to the loop wrapper to create [`~modular_pipelines.LoopSequentialPipelineBlocks`].
|
||||
|
||||
```py
|
||||
loop = LoopWrapper.from_blocks_dict({"block1": LoopBlock})
|
||||
```
|
||||
|
||||
Add more loop blocks to run within each iteration with [`~modular_pipelines.LoopSequentialPipelineBlocks.from_blocks_dict`]. This allows you to modify the blocks without changing the loop logic itself.
|
||||
|
||||
```py
|
||||
loop = LoopWrapper.from_blocks_dict({"block1": LoopBlock(), "block2": LoopBlock})
|
||||
```
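
As a rough sketch of how the composed loop executes, assuming the toy `LoopWrapper` and `LoopBlock` above (which need no pretrained components), you can turn it into a pipeline and run it. The exact call signature is an assumption and may differ slightly.

```py
# sketch only: init_pipeline without a repo assumes no pretrained components are needed
pipeline = loop.init_pipeline()

# both registered LoopBlocks increment x once per step,
# so 3 steps starting from x=0 should yield 6
x = pipeline(num_steps=3, x=0, output="x")
print(x)
```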
|
||||
75 docs/source/en/modular_diffusers/modular_diffusers_states.md Normal file
@@ -0,0 +1,75 @@
|
||||
<!--Copyright 2025 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# States
|
||||
|
||||
Blocks rely on the [`~modular_pipelines.PipelineState`] and [`~modular_pipelines.BlockState`] data structures for communicating and sharing data.
|
||||
|
||||
| State | Description |
|
||||
|-------|-------------|
|
||||
| [`~modular_pipelines.PipelineState`] | Maintains the overall data required for a pipeline's execution and allows blocks to read and update its data. |
|
||||
| [`~modular_pipelines.BlockState`] | Allows each block to perform its computation with the necessary data from `inputs`. |
|
||||
|
||||
This guide explains how states work and how they connect blocks.
|
||||
|
||||
## PipelineState
|
||||
|
||||
The [`~modular_pipelines.PipelineState`] is a global state container for all blocks. It maintains the complete runtime state of the pipeline and provides a structured way for blocks to read from and write to shared data.
|
||||
|
||||
There are two dicts in [`~modular_pipelines.PipelineState`] for structuring data.
|
||||
|
||||
- The `values` dict is a **mutable** state containing a copy of user provided input values and intermediate output values generated by blocks. If a block modifies an `input`, it will be reflected in the `values` dict after calling `set_block_state`.
|
||||
|
||||
```py
|
||||
PipelineState(
|
||||
values={
|
||||
'prompt': 'a cat'
|
||||
'guidance_scale': 7.0
|
||||
'num_inference_steps': 25
|
||||
'prompt_embeds': Tensor(dtype=torch.float32, shape=torch.Size([1, 1, 1, 1]))
|
||||
'negative_prompt_embeds': None
|
||||
},
|
||||
)
|
||||
```
|
||||
|
||||
## BlockState
|
||||
|
||||
The [`~modular_pipelines.BlockState`] is a local view of the relevant variables an individual block needs from [`~modular_pipelines.PipelineState`] to perform its computations.
|
||||
|
||||
Access these variables directly as attributes like `block_state.image`.
|
||||
|
||||
```py
|
||||
BlockState(
|
||||
image: <PIL.Image.Image image mode=RGB size=512x512 at 0x7F3ECC494640>
|
||||
)
|
||||
```
|
||||
|
||||
When a block's `__call__` method is executed, it retrieves the [`~modular_pipelines.BlockState`] with `self.get_block_state(state)`, performs its operations, and updates [`~modular_pipelines.PipelineState`] with `self.set_block_state(state, block_state)`.
|
||||
|
||||
```py
|
||||
def __call__(self, components, state):
|
||||
# retrieve BlockState
|
||||
block_state = self.get_block_state(state)
|
||||
|
||||
# computation logic on inputs
|
||||
|
||||
# update PipelineState
|
||||
self.set_block_state(state, block_state)
|
||||
return components, state
|
||||
```
|
||||
|
||||
## State interaction
|
||||
|
||||
[`~modular_pipelines.PipelineState`] and [`~modular_pipelines.BlockState`] interaction is defined by a block's `inputs` and `intermediate_outputs`.
|
||||
|
||||
- `inputs`: a block can modify an input, like `block_state.image`, and the change is propagated globally to [`~modular_pipelines.PipelineState`] by calling `set_block_state`.
- `intermediate_outputs`: new variables a block creates. They are added to the [`~modular_pipelines.PipelineState`]'s `values` dict and are available to subsequent blocks or accessible to users as final outputs from the pipeline (see the sketch below).
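
A minimal sketch of how these two pieces interact, using hypothetical block, helper, and variable names: the block below reads an `image` input, modifies it, and creates a new `image_latents` intermediate output that later blocks can consume.

```py
from diffusers.modular_pipelines import ModularPipelineBlocks, InputParam, OutputParam

class EncodeImageBlock(ModularPipelineBlocks):
    model_name = "example"

    @property
    def inputs(self):
        # user-provided value read from PipelineState
        return [InputParam(name="image")]

    @property
    def intermediate_outputs(self):
        # new variable this block adds to PipelineState's values dict
        return [OutputParam(name="image_latents")]

    def __call__(self, components, state):
        block_state = self.get_block_state(state)
        # modifying an input is propagated globally by set_block_state
        block_state.image = block_state.image.convert("RGB")
        # encode() is a hypothetical helper standing in for real VAE encoding
        block_state.image_latents = encode(block_state.image)
        self.set_block_state(state, block_state)
        return components, state
```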
|
||||
358 docs/source/en/modular_diffusers/modular_pipeline.md Normal file
@@ -0,0 +1,358 @@
|
||||
<!--Copyright 2025 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# ModularPipeline
|
||||
|
||||
[`ModularPipeline`] converts [`~modular_pipelines.ModularPipelineBlocks`]'s into an executable pipeline that loads models and performs the computation steps defined in the block. It is the main interface for running a pipeline and it is very similar to the [`DiffusionPipeline`] API.
|
||||
|
||||
The main difference is that you specify the expected `output` argument when calling the pipeline.
|
||||
|
||||
<hfoptions id="example">
|
||||
<hfoption id="text-to-image">
|
||||
|
||||
```py
|
||||
import torch
|
||||
from diffusers.modular_pipelines import SequentialPipelineBlocks
|
||||
from diffusers.modular_pipelines.stable_diffusion_xl import TEXT2IMAGE_BLOCKS
|
||||
|
||||
blocks = SequentialPipelineBlocks.from_blocks_dict(TEXT2IMAGE_BLOCKS)
|
||||
|
||||
modular_repo_id = "YiYiXu/modular-loader-t2i-0704"
|
||||
pipeline = blocks.init_pipeline(modular_repo_id)
|
||||
|
||||
pipeline.load_default_components(torch_dtype=torch.float16)
|
||||
pipeline.to("cuda")
|
||||
|
||||
image = pipeline(prompt="Astronaut in a jungle, cold color palette, muted colors, detailed, 8k", output="images")[0]
|
||||
image.save("modular_t2i_out.png")
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="image-to-image">
|
||||
|
||||
```py
|
||||
import torch
|
||||
from diffusers.modular_pipelines import SequentialPipelineBlocks
|
||||
from diffusers.modular_pipelines.stable_diffusion_xl import IMAGE2IMAGE_BLOCKS
from diffusers.utils import load_image
|
||||
|
||||
blocks = SequentialPipelineBlocks.from_blocks_dict(IMAGE2IMAGE_BLOCKS)
|
||||
|
||||
modular_repo_id = "YiYiXu/modular-loader-t2i-0704"
|
||||
pipeline = blocks.init_pipeline(modular_repo_id)
|
||||
|
||||
pipeline.load_default_components(torch_dtype=torch.float16)
|
||||
pipeline.to("cuda")
|
||||
|
||||
url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/sdxl-text2img.png"
|
||||
init_image = load_image(url)
|
||||
prompt = "a dog catching a frisbee in the jungle"
|
||||
image = pipeline(prompt=prompt, image=init_image, strength=0.8, output="images")[0]
|
||||
image.save("modular_i2i_out.png")
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="inpainting">
|
||||
|
||||
```py
|
||||
import torch
|
||||
from diffusers.modular_pipelines import SequentialPipelineBlocks
|
||||
from diffusers.modular_pipelines.stable_diffusion_xl import INPAINT_BLOCKS
|
||||
from diffusers.utils import load_image
|
||||
|
||||
blocks = SequentialPipelineBlocks.from_blocks_dict(INPAINT_BLOCKS)
|
||||
|
||||
modular_repo_id = "YiYiXu/modular-loader-t2i-0704"
|
||||
pipeline = blocks.init_pipeline(modular_repo_id)
|
||||
|
||||
pipeline.load_default_components(torch_dtype=torch.float16)
|
||||
pipeline.to("cuda")
|
||||
|
||||
img_url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/sdxl-text2img.png"
|
||||
mask_url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/sdxl-inpaint-mask.png"
|
||||
|
||||
init_image = load_image(img_url)
|
||||
mask_image = load_image(mask_url)
|
||||
|
||||
prompt = "A deep sea diver floating"
|
||||
image = pipeline(prompt=prompt, image=init_image, mask_image=mask_image, strength=0.85, output="images")[0]
|
||||
image.save("moduar_inpaint_out.png")
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
This guide will show you how to create a [`ModularPipeline`] and manage the components in it.
|
||||
|
||||
## Adding blocks
|
||||
|
||||
Blocks are [`InsertableDict`] objects that can be inserted at specific positions, providing a flexible way to mix-and-match blocks.
|
||||
|
||||
Use [`~modular_pipelines.modular_pipeline_utils.InsertableDict.insert`] on either the block class or `sub_blocks` attribute to add a block.
|
||||
|
||||
```py
|
||||
# BLOCKS is a dict of block classes, so add the block class itself
|
||||
BLOCKS.insert("block_name", BlockClass, index)
|
||||
# the sub_blocks attribute contains instances, so add a block instance to it
|
||||
t2i_blocks.sub_blocks.insert("block_name", block_instance, index)
|
||||
```
|
||||
|
||||
Use [`~modular_pipelines.modular_pipeline_utils.InsertableDict.pop`] on either the block class or `sub_blocks` attribute to remove a block.
|
||||
|
||||
```py
|
||||
# remove a block class from preset
|
||||
BLOCKS.pop("text_encoder")
|
||||
# split out a block instance on its own
|
||||
text_encoder_block = t2i_blocks.sub_blocks.pop("text_encoder")
|
||||
```
|
||||
|
||||
Swap blocks by setting the existing block to the new block.
|
||||
|
||||
```py
|
||||
# Replace block class in preset
|
||||
BLOCKS["prepare_latents"] = CustomPrepareLatents
|
||||
# Replace in the sub_blocks attribute using a block instance
|
||||
t2i_blocks.sub_blocks["prepare_latents"] = CustomPrepareLatents()
|
||||
```
|
||||
|
||||
## Creating a pipeline
|
||||
|
||||
There are two ways to create a [`ModularPipeline`]. Assemble and create a pipeline from [`ModularPipelineBlocks`] or load an existing pipeline with [`~ModularPipeline.from_pretrained`].
|
||||
|
||||
You should also initialize a [`ComponentsManager`] to handle device placement, memory, and component management.
|
||||
|
||||
> [!TIP]
|
||||
> Refer to the [ComponentsManager](./components_manager) doc for more details about how it can help manage components across different workflows.
|
||||
|
||||
<hfoptions id="create">
|
||||
<hfoption id="ModularPipelineBlocks">
|
||||
|
||||
Use the [`~ModularPipelineBlocks.init_pipeline`] method to create a [`ModularPipeline`] from the component and configuration specifications. This method loads the *specifications* from a `modular_model_index.json` file, but it doesn't load the *models* yet.
|
||||
|
||||
```py
|
||||
from diffusers import ComponentsManager
|
||||
from diffusers.modular_pipelines import SequentialPipelineBlocks
|
||||
from diffusers.modular_pipelines.stable_diffusion_xl import TEXT2IMAGE_BLOCKS
|
||||
|
||||
t2i_blocks = SequentialPipelineBlocks.from_blocks_dict(TEXT2IMAGE_BLOCKS)
|
||||
|
||||
modular_repo_id = "YiYiXu/modular-loader-t2i-0704"
|
||||
components = ComponentsManager()
|
||||
t2i_pipeline = t2i_blocks.init_pipeline(modular_repo_id, components_manager=components)
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="from_pretrained">
|
||||
|
||||
The [`~ModularPipeline.from_pretrained`] method creates a [`ModularPipeline`] from a modular repository on the Hub.
|
||||
|
||||
```py
|
||||
from diffusers import ModularPipeline, ComponentsManager
|
||||
|
||||
components = ComponentsManager()
|
||||
pipeline = ModularPipeline.from_pretrained("YiYiXu/modular-loader-t2i-0704", components_manager=components)
|
||||
```
|
||||
|
||||
Add the `trust_remote_code` argument to load a custom [`ModularPipeline`].
|
||||
|
||||
```py
|
||||
from diffusers import ModularPipeline, ComponentsManager
|
||||
|
||||
components = ComponentsManager()
|
||||
modular_repo_id = "YiYiXu/modular-diffdiff-0704"
|
||||
diffdiff_pipeline = ModularPipeline.from_pretrained(modular_repo_id, trust_remote_code=True, components_manager=components)
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
## Loading components
|
||||
|
||||
A [`ModularPipeline`] doesn't automatically instantiate with components. It only loads the configuration and component specifications. You can load all components with [`~ModularPipeline.load_default_components`] or only load specific components with [`~ModularPipeline.load_components`].
|
||||
|
||||
<hfoptions id="load">
|
||||
<hfoption id="load_default_components">
|
||||
|
||||
```py
|
||||
import torch
|
||||
|
||||
t2i_pipeline.load_default_components(torch_dtype=torch.float16)
|
||||
t2i_pipeline.to("cuda")
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="load_components">
|
||||
|
||||
The example below only loads the UNet and VAE.
|
||||
|
||||
```py
|
||||
import torch
|
||||
|
||||
t2i_pipeline.load_components(names=["unet", "vae"], torch_dtype=torch.float16)
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
Print the pipeline to inspect the loaded pretrained components.
|
||||
|
||||
```py
|
||||
t2i_pipeline
|
||||
```
|
||||
|
||||
This should match the `modular_model_index.json` file from the modular repository a pipeline is initialized from. If a pipeline doesn't need a component, it won't be included even if it exists in the modular repository.
|
||||
|
||||
To modify where components are loaded from, edit the `modular_model_index.json` file in the repository and change it to your desired loading path. The example below loads a UNet from a different repository.
|
||||
|
||||
```json
|
||||
# original
|
||||
"unet": [
|
||||
null, null,
|
||||
{
|
||||
"repo": "stabilityai/stable-diffusion-xl-base-1.0",
|
||||
"subfolder": "unet",
|
||||
"variant": "fp16"
|
||||
}
|
||||
]
|
||||
|
||||
# modified
|
||||
"unet": [
|
||||
null, null,
|
||||
{
|
||||
"repo": "RunDiffusion/Juggernaut-XL-v9",
|
||||
"subfolder": "unet",
|
||||
"variant": "fp16"
|
||||
}
|
||||
]
|
||||
```
|
||||
|
||||
### Component loading status
|
||||
|
||||
The pipeline properties below provide more information about which components are loaded.
|
||||
|
||||
Use `component_names` to return all expected components.
|
||||
|
||||
```py
|
||||
t2i_pipeline.component_names
|
||||
['text_encoder', 'text_encoder_2', 'tokenizer', 'tokenizer_2', 'guider', 'scheduler', 'unet', 'vae', 'image_processor']
|
||||
```
|
||||
|
||||
Use `null_component_names` to return components that aren't loaded yet. Load these components with [`~ModularPipeline.load_components`].
|
||||
|
||||
```py
|
||||
t2i_pipeline.null_component_names
|
||||
['text_encoder', 'text_encoder_2', 'tokenizer', 'tokenizer_2', 'scheduler']
|
||||
```
|
||||
|
||||
Use `pretrained_component_names` to return components that will be loaded from pretrained models.
|
||||
|
||||
```py
|
||||
t2i_pipeline.pretrained_component_names
|
||||
['text_encoder', 'text_encoder_2', 'tokenizer', 'tokenizer_2', 'scheduler', 'unet', 'vae']
|
||||
```
|
||||
|
||||
Use `config_component_names` to return components that are created with the default config (not loaded from a modular repository). Components from a config aren't included because they are already initialized during pipeline creation. This is why they aren't listed in `null_component_names`.
|
||||
|
||||
```py
|
||||
t2i_pipeline.config_component_names
|
||||
['guider', 'image_processor']
|
||||
```
|
||||
|
||||
## Updating components
|
||||
|
||||
Components are updated differently depending on whether they are *pretrained components* or *config components*.
|
||||
|
||||
> [!WARNING]
|
||||
> A component may change from pretrained to config when updating a component. The component type is initially defined in a block's `expected_components` field.
|
||||
|
||||
A pretrained component is updated with [`ComponentSpec`] whereas a config component is updated by either passing the object directly or with [`ComponentSpec`].
|
||||
|
||||
The [`ComponentSpec`] shows `default_creation_method="from_pretrained"` for a pretrained component and `default_creation_method="from_config"` for a config component.
|
||||
|
||||
To update a pretrained component, create a [`ComponentSpec`] with the name of the component and where to load it from. Use the [`~ComponentSpec.load`] method to load the component.
|
||||
|
||||
```py
|
||||
import torch
from diffusers import ComponentSpec, UNet2DConditionModel
|
||||
|
||||
unet_spec = ComponentSpec(name="unet",type_hint=UNet2DConditionModel, repo="stabilityai/stable-diffusion-xl-base-1.0", subfolder="unet", variant="fp16")
|
||||
unet = unet_spec.load(torch_dtype=torch.float16)
|
||||
```
|
||||
|
||||
The [`~ModularPipeline.update_components`] method replaces the component with a new one.
|
||||
|
||||
```py
|
||||
t2i_pipeline.update_components(unet=unet2)
|
||||
```
|
||||
|
||||
When a component is updated, the loading specifications are also updated in the pipeline config.
|
||||
|
||||
### Component extraction and modification
|
||||
|
||||
When you use [`~ComponentSpec.load`], the new component maintains its loading specifications. This makes it possible to extract the specification and recreate the component.
|
||||
|
||||
```py
|
||||
spec = ComponentSpec.from_component("unet", unet2)
|
||||
spec
|
||||
ComponentSpec(name='unet', type_hint=<class 'diffusers.models.unets.unet_2d_condition.UNet2DConditionModel'>, description=None, config=None, repo='stabilityai/stable-diffusion-xl-base-1.0', subfolder='unet', variant='fp16', revision=None, default_creation_method='from_pretrained')
|
||||
unet2_recreated = spec.load(torch_dtype=torch.float16)
|
||||
```
|
||||
|
||||
The [`~ModularPipeline.get_component_spec`] method gets a copy of the current component specification to modify or update.
|
||||
|
||||
```py
|
||||
unet_spec = t2i_pipeline.get_component_spec("unet")
|
||||
unet_spec
|
||||
ComponentSpec(
|
||||
name='unet',
|
||||
type_hint=<class 'diffusers.models.unets.unet_2d_condition.UNet2DConditionModel'>,
|
||||
repo='RunDiffusion/Juggernaut-XL-v9',
|
||||
subfolder='unet',
|
||||
variant='fp16',
|
||||
default_creation_method='from_pretrained'
|
||||
)
|
||||
|
||||
# modify to load from a different repository
|
||||
unet_spec.repo = "stabilityai/stable-diffusion-xl-base-1.0"
|
||||
|
||||
# load component with modified spec
|
||||
unet = unet_spec.load(torch_dtype=torch.float16)
|
||||
```
|
||||
|
||||
## Modular repository
|
||||
|
||||
A repository is required if the pipeline blocks use *pretrained components*. The repository supplies loading specifications and metadata.
|
||||
|
||||
[`ModularPipeline`] specifically requires *modular repositories* (see [example repository](https://huggingface.co/YiYiXu/modular-diffdiff)) which are more flexible than a typical repository. It contains a `modular_model_index.json` file containing the following 3 elements.
|
||||
|
||||
- `library` and `class` show which library the component was loaded from and its class. If `null`, the component hasn't been loaded yet.
|
||||
- `loading_specs_dict` contains the information required to load the component such as the repository and subfolder it is loaded from.
|
||||
|
||||
Unlike standard repositories, a modular repository can fetch components from different repositories based on the `loading_specs_dict`. Components don't need to exist in the same repository.
|
||||
|
||||
A modular repository may contain custom code for loading a [`ModularPipeline`]. This allows you to use specialized blocks that aren't native to Diffusers.
|
||||
|
||||
```
|
||||
modular-diffdiff-0704/
|
||||
├── block.py # Custom pipeline blocks implementation
|
||||
├── config.json # Pipeline configuration and auto_map
|
||||
└── modular_model_index.json # Component loading specifications
|
||||
```
|
||||
|
||||
The [config.json](https://huggingface.co/YiYiXu/modular-diffdiff-0704/blob/main/config.json) file contains an `auto_map` key that points to where a custom block is defined in `block.py`.
|
||||
|
||||
```json
|
||||
{
|
||||
"_class_name": "DiffDiffBlocks",
|
||||
"auto_map": {
|
||||
"ModularPipelineBlocks": "block.DiffDiffBlocks"
|
||||
}
|
||||
}
|
||||
```
|
||||
docs/source/en/modular_diffusers/overview.md
|
||||
<!--Copyright 2025 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# Overview
|
||||
|
||||
> [!WARNING]
|
||||
> Modular Diffusers is under active development and its API may change.
|
||||
|
||||
Modular Diffusers is a unified pipeline system that simplifies your workflow with *pipeline blocks*.
|
||||
|
||||
- Blocks are reusable and you only need to create new blocks that are unique to your pipeline.
|
||||
- Blocks can be mixed and matched to adapt to or create a pipeline for a specific workflow or multiple workflows.
|
||||
|
||||
The Modular Diffusers docs are organized as shown below.
|
||||
|
||||
## Quickstart
|
||||
|
||||
- A [quickstart](./quickstart) demonstrating how to implement an example workflow with Modular Diffusers.
|
||||
|
||||
## ModularPipelineBlocks
|
||||
|
||||
- [States](./modular_diffusers_states) explains how data is shared and communicated between blocks and [`ModularPipeline`].
|
||||
- [ModularPipelineBlocks](./pipeline_block) is the most basic unit of a [`ModularPipeline`] and this guide shows you how to create one.
|
||||
- [SequentialPipelineBlocks](./sequential_pipeline_blocks) is a type of block that chains multiple blocks so they run one after another, passing data along the chain. This guide shows you how to create [`~modular_pipelines.SequentialPipelineBlocks`] and how they connect and work together.
|
||||
- [LoopSequentialPipelineBlocks](./loop_sequential_pipeline_blocks) is a type of block that runs a series of blocks in a loop. This guide shows you how to create [`~modular_pipelines.LoopSequentialPipelineBlocks`].
|
||||
- [AutoPipelineBlocks](./auto_pipeline_blocks) is a type of block that automatically chooses which blocks to run based on the input. This guide shows you how to create [`~modular_pipelines.AutoPipelineBlocks`].
|
||||
|
||||
## ModularPipeline
|
||||
|
||||
- [ModularPipeline](./modular_pipeline) shows you how to create and convert pipeline blocks into an executable [`ModularPipeline`].
|
||||
- [ComponentsManager](./components_manager) shows you how to manage and reuse components across multiple pipelines.
|
||||
- [Guiders](./guiders) shows you how to use different guidance methods in the pipeline.
|
||||
docs/source/en/modular_diffusers/pipeline_block.md
|
||||
<!--Copyright 2025 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# ModularPipelineBlocks
|
||||
|
||||
[`~modular_pipelines.ModularPipelineBlocks`] is the basic block for building a [`ModularPipeline`]. It defines what components, inputs/outputs, and computation a block should perform for a specific step in a pipeline. A [`~modular_pipelines.ModularPipelineBlocks`] connects with other blocks, using [state](./modular_diffusers_states), to enable the modular construction of workflows.
|
||||
|
||||
A [`~modular_pipelines.ModularPipelineBlocks`] on its own can't be executed. It is a blueprint for what a step should do in a pipeline. To actually run and execute a pipeline, the [`~modular_pipelines.ModularPipelineBlocks`] needs to be converted into a [`ModularPipeline`].
|
||||
|
||||
This guide will show you how to create a [`~modular_pipelines.ModularPipelineBlocks`].
|
||||
|
||||
## Inputs and outputs
|
||||
|
||||
> [!TIP]
|
||||
> Refer to the [States](./modular_diffusers_states) guide if you aren't familiar with how state works in Modular Diffusers.
|
||||
|
||||
A [`~modular_pipelines.ModularPipelineBlocks`] requires `inputs`, `intermediate_inputs`, and `intermediate_outputs`.
|
||||
|
||||
- `inputs` are values provided by the user and retrieved from the [`~modular_pipelines.PipelineState`]. A block can't modify them. This is useful when a workflow resizes an image but still needs the original image later; the [`~modular_pipelines.PipelineState`] keeps the original value available.
|
||||
|
||||
Use `InputParam` to define `inputs`.
|
||||
|
||||
```py
|
||||
from diffusers.modular_pipelines import InputParam
|
||||
|
||||
user_inputs = [
|
||||
InputParam(name="image", type_hint="PIL.Image", description="raw input image to process")
|
||||
]
|
||||
```
|
||||
|
||||
- `intermediate_inputs` are values typically created by a previous block, but they can also be provided directly if no preceding block generates them. Unlike `inputs`, `intermediate_inputs` can be modified.
|
||||
|
||||
Use `InputParam` to define `intermediate_inputs`.
|
||||
|
||||
```py
|
||||
user_intermediate_inputs = [
|
||||
InputParam(name="processed_image", type_hint="torch.Tensor", description="image that has been preprocessed and normalized"),
|
||||
]
|
||||
```
|
||||
|
||||
- `intermediate_outputs` are new values created by a block and added to the [`~modular_pipelines.PipelineState`]. The `intermediate_outputs` are available as `intermediate_inputs` for subsequent blocks or available as the final output from running the pipeline.
|
||||
|
||||
Use `OutputParam` to define `intermediate_outputs`.
|
||||
|
||||
```py
|
||||
from diffusers.modular_pipelines import OutputParam
|
||||
|
||||
user_intermediate_outputs = [
|
||||
OutputParam(name="image_latents", description="latents representing the image")
|
||||
]
|
||||
```
|
||||
|
||||
The intermediate inputs and outputs share data to connect blocks. They are accessible at any point, allowing you to track the workflow's progress.
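Putting the three definitions together, a block typically exposes them through properties. The sketch below is illustrative and assumes the property names `inputs`, `intermediate_inputs`, and `intermediate_outputs` used throughout this guide.

```py
from diffusers.modular_pipelines import ModularPipelineBlocks, InputParam, OutputParam

class ImageEncodeBlock(ModularPipelineBlocks):
    @property
    def inputs(self):
        # user-provided values, read-only for the block
        return [InputParam(name="image", type_hint="PIL.Image", description="raw input image to process")]

    @property
    def intermediate_inputs(self):
        # values produced by a previous block (or provided directly if no block creates them)
        return [InputParam(name="processed_image", type_hint="torch.Tensor", description="preprocessed and normalized image")]

    @property
    def intermediate_outputs(self):
        # new values this block adds to the pipeline state
        return [OutputParam(name="image_latents", description="latents representing the image")]
```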
|
||||
|
||||
## Computation logic
|
||||
|
||||
The computation a block performs is defined in the `__call__` method and it follows a specific structure.
|
||||
|
||||
1. Retrieve the [`~modular_pipelines.BlockState`] to get a local view of the `inputs` and `intermediate_inputs`.
|
||||
2. Implement the computation logic on the `inputs` and `intermediate_inputs`.
|
||||
3. Update [`~modular_pipelines.PipelineState`] to push changes from the local [`~modular_pipelines.BlockState`] back to the global [`~modular_pipelines.PipelineState`].
|
||||
4. Return the components and state which becomes available to the next block.
|
||||
|
||||
```py
|
||||
def __call__(self, components, state):
|
||||
# Get a local view of the state variables this block needs
|
||||
block_state = self.get_block_state(state)
|
||||
|
||||
# Your computation logic here
|
||||
# block_state contains all your inputs and intermediate_inputs
|
||||
# Access them like: block_state.image, block_state.processed_image
|
||||
|
||||
# Update the pipeline state with your updated block_states
|
||||
self.set_block_state(state, block_state)
|
||||
return components, state
|
||||
```
|
||||
|
||||
### Components and configs
|
||||
|
||||
The components and pipeline-level configs a block needs are specified in [`ComponentSpec`] and [`~modular_pipelines.ConfigSpec`].
|
||||
|
||||
- [`ComponentSpec`] contains the expected components used by a block. You need the `name` of the component and ideally a `type_hint` that specifies exactly what the component is.
|
||||
- [`~modular_pipelines.ConfigSpec`] contains pipeline-level settings that control behavior across all blocks.
|
||||
|
||||
```py
|
||||
from diffusers import ComponentSpec, ConfigSpec
|
||||
|
||||
expected_components = [
|
||||
ComponentSpec(name="unet", type_hint=UNet2DConditionModel),
|
||||
ComponentSpec(name="scheduler", type_hint=EulerDiscreteScheduler)
|
||||
]
|
||||
|
||||
expected_config = [
|
||||
ConfigSpec("force_zeros_for_empty_prompt", True)
|
||||
]
|
||||
```
|
||||
|
||||
When the blocks are converted into a pipeline, the components become available to the block as the first argument in `__call__`.
|
||||
|
||||
```py
|
||||
def __call__(self, components, state):
|
||||
# Access components using dot notation
|
||||
unet = components.unet
|
||||
vae = components.vae
|
||||
scheduler = components.scheduler
|
||||
```
|
||||
docs/source/en/modular_diffusers/quickstart.md
|
||||
<!--Copyright 2025 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# Quickstart
|
||||
|
||||
Modular Diffusers is a framework for quickly building flexible and customizable pipelines. At the core of Modular Diffusers are [`ModularPipelineBlocks`] that can be combined with other blocks to adapt to new workflows. The blocks are converted into a [`ModularPipeline`], a friendly user-facing interface developers can use.
|
||||
|
||||
This doc will show you how to implement a [Differential Diffusion](https://differential-diffusion.github.io/) pipeline with the modular framework.
|
||||
|
||||
## ModularPipelineBlocks
|
||||
|
||||
[`ModularPipelineBlocks`] are *definitions* that specify the components, inputs, outputs, and computation logic for a single step in a pipeline. There are four types of blocks.
|
||||
|
||||
- [`ModularPipelineBlocks`] is the most basic block for a single step.
|
||||
- [`SequentialPipelineBlocks`] is a multi-block that composes other blocks linearly. The outputs of one block are the inputs to the next block.
|
||||
- [`LoopSequentialPipelineBlocks`] is a multi-block that composes other blocks to run in a loop. It is designed for iterative workflows such as a denoising loop.
|
||||
- [`AutoPipelineBlocks`] is a collection of blocks for different workflows and it selects which block to run based on the input. It is designed to conveniently package multiple workflows into a single pipeline.
|
||||
|
||||
[Differential Diffusion](https://differential-diffusion.github.io/) is an image-to-image workflow. Start with the `IMAGE2IMAGE_BLOCKS` preset, a collection of `ModularPipelineBlocks` for image-to-image generation.
|
||||
|
||||
```py
|
||||
from diffusers.modular_pipelines.stable_diffusion_xl import IMAGE2IMAGE_BLOCKS
|
||||
# contents of the IMAGE2IMAGE_BLOCKS preset
IMAGE2IMAGE_BLOCKS = InsertableDict([
|
||||
("text_encoder", StableDiffusionXLTextEncoderStep),
|
||||
("image_encoder", StableDiffusionXLVaeEncoderStep),
|
||||
("input", StableDiffusionXLInputStep),
|
||||
("set_timesteps", StableDiffusionXLImg2ImgSetTimestepsStep),
|
||||
("prepare_latents", StableDiffusionXLImg2ImgPrepareLatentsStep),
|
||||
("prepare_add_cond", StableDiffusionXLImg2ImgPrepareAdditionalConditioningStep),
|
||||
("denoise", StableDiffusionXLDenoiseStep),
|
||||
("decode", StableDiffusionXLDecodeStep)
|
||||
])
|
||||
```
|
||||
|
||||
## Pipeline and block states
|
||||
|
||||
Modular Diffusers uses *state* to communicate data between blocks. There are two types of states.
|
||||
|
||||
- [`PipelineState`] is a global state that can be used to track all inputs and outputs across all blocks.
|
||||
- [`BlockState`] is a local view of relevant variables from [`PipelineState`] for an individual block.
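As a reminder, a block reads from and writes back to these states inside its `__call__` method. The sketch below simply mirrors the pattern shown in the [ModularPipelineBlocks](./pipeline_block) guide.

```py
def __call__(self, components, state):
    # pull a local BlockState view of the variables this block needs from the global PipelineState
    block_state = self.get_block_state(state)

    # ... compute with block_state.image, block_state.latents, ...

    # push the local changes back to the global PipelineState
    self.set_block_state(state, block_state)
    return components, state
```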
|
||||
|
||||
## Customizing blocks
|
||||
|
||||
[Differential Diffusion](https://differential-diffusion.github.io/) differs from standard image-to-image in its `prepare_latents` and `denoise` blocks. All the other blocks can be reused, but you'll need to modify these two.
|
||||
|
||||
Create placeholder `ModularPipelineBlocks` for `prepare_latents` and `denoise` by copying and modifying the existing ones.
|
||||
|
||||
Print the `denoise` block to see that it is composed of [`LoopSequentialPipelineBlocks`] with three sub-blocks, `before_denoiser`, `denoiser`, and `after_denoiser`. Only the `before_denoiser` sub-block needs to be modified to prepare the latent input for the denoiser based on the change map.
|
||||
|
||||
```py
|
||||
denoise_blocks = IMAGE2IMAGE_BLOCKS["denoise"]()
|
||||
print(denoise_blocks)
|
||||
```
|
||||
|
||||
Replace the `StableDiffusionXLLoopBeforeDenoiser` sub-block with the new `SDXLDiffDiffLoopBeforeDenoiser` block.
|
||||
|
||||
```py
|
||||
# Copy existing blocks as placeholders
|
||||
class SDXLDiffDiffPrepareLatentsStep(ModularPipelineBlocks):
|
||||
"""Copied from StableDiffusionXLImg2ImgPrepareLatentsStep - will modify later"""
|
||||
# ... same implementation as StableDiffusionXLImg2ImgPrepareLatentsStep
|
||||
|
||||
class SDXLDiffDiffDenoiseStep(StableDiffusionXLDenoiseLoopWrapper):
|
||||
block_classes = [SDXLDiffDiffLoopBeforeDenoiser, StableDiffusionXLLoopDenoiser, StableDiffusionXLLoopAfterDenoiser]
|
||||
block_names = ["before_denoiser", "denoiser", "after_denoiser"]
|
||||
```
|
||||
|
||||
### prepare_latents
|
||||
|
||||
The `prepare_latents` block requires the following changes.
|
||||
|
||||
- a processor component to process the change map
- new `inputs` to accept the user-provided change map, `timesteps` for precomputing all the latents, and `num_inference_steps` to create the masks that control which image regions are updated
- updated computation in the `__call__` method to process the change map, create the masks, and store them in the [`BlockState`]
|
||||
|
||||
```diff
|
||||
class SDXLDiffDiffPrepareLatentsStep(ModularPipelineBlocks):
|
||||
@property
|
||||
def expected_components(self) -> List[ComponentSpec]:
|
||||
return [
|
||||
ComponentSpec("vae", AutoencoderKL),
|
||||
ComponentSpec("scheduler", EulerDiscreteScheduler),
|
||||
+ ComponentSpec("mask_processor", VaeImageProcessor, config=FrozenDict({"do_normalize": False, "do_convert_grayscale": True}))
|
||||
]
|
||||
@property
|
||||
def inputs(self) -> List[Tuple[str, Any]]:
|
||||
return [
|
||||
InputParam("generator"),
|
||||
+ InputParam("diffdiff_map", required=True),
|
||||
- InputParam("latent_timestep", required=True, type_hint=torch.Tensor),
|
||||
+ InputParam("timesteps", type_hint=torch.Tensor),
|
||||
+ InputParam("num_inference_steps", type_hint=int),
|
||||
]
|
||||
|
||||
@property
|
||||
def intermediate_outputs(self) -> List[OutputParam]:
|
||||
return [
|
||||
+ OutputParam("original_latents", type_hint=torch.Tensor),
|
||||
+ OutputParam("diffdiff_masks", type_hint=torch.Tensor),
|
||||
]
|
||||
def __call__(self, components, state: PipelineState):
|
||||
# ... existing logic ...
|
||||
+ # Process change map and create masks
|
||||
+ diffdiff_map = components.mask_processor.preprocess(block_state.diffdiff_map, height=latent_height, width=latent_width)
|
||||
+ thresholds = torch.arange(block_state.num_inference_steps, dtype=diffdiff_map.dtype) / block_state.num_inference_steps
|
||||
+ block_state.diffdiff_masks = diffdiff_map > (thresholds + (block_state.denoising_start or 0))
|
||||
+ block_state.original_latents = block_state.latents
|
||||
```
|
||||
|
||||
### denoise
|
||||
|
||||
The `before_denoiser` sub-block requires the following changes.
|
||||
|
||||
- new `inputs` to accept a `denoising_start` parameter, and the `original_latents` and `diffdiff_masks` created by the `prepare_latents` block
- updated computation in the `__call__` method to apply Differential Diffusion
|
||||
|
||||
```diff
|
||||
class SDXLDiffDiffLoopBeforeDenoiser(ModularPipelineBlocks):
|
||||
@property
|
||||
def description(self) -> str:
|
||||
return (
|
||||
"Step within the denoising loop for differential diffusion that prepare the latent input for the denoiser"
|
||||
)
|
||||
|
||||
@property
|
||||
def inputs(self) -> List[str]:
|
||||
return [
|
||||
InputParam("latents", required=True, type_hint=torch.Tensor),
|
||||
+ InputParam("denoising_start"),
|
||||
+ InputParam("original_latents", type_hint=torch.Tensor),
|
||||
+ InputParam("diffdiff_masks", type_hint=torch.Tensor),
|
||||
]
|
||||
|
||||
def __call__(self, components, block_state, i, t):
|
||||
+ # Apply differential diffusion logic
|
||||
+ if i == 0 and block_state.denoising_start is None:
|
||||
+ block_state.latents = block_state.original_latents[:1]
|
||||
+ else:
|
||||
+ block_state.mask = block_state.diffdiff_masks[i].unsqueeze(0).unsqueeze(1)
|
||||
+ block_state.latents = block_state.original_latents[i] * block_state.mask + block_state.latents * (1 - block_state.mask)
|
||||
|
||||
# ... rest of existing logic ...
|
||||
```
|
||||
|
||||
## Assembling the blocks
|
||||
|
||||
You should have all the blocks you need at this point to create a [`ModularPipeline`].
|
||||
|
||||
Copy the existing `IMAGE2IMAGE_BLOCKS` preset. For the `set_timesteps` block, use the `set_timesteps` block from `TEXT2IMAGE_BLOCKS` instead because Differential Diffusion doesn't require a `strength` parameter.
|
||||
|
||||
Set the `prepare_latents` and `denoise` blocks to the `SDXLDiffDiffPrepareLatentsStep` and `SDXLDiffDiffDenoiseStep` blocks you just modified.
|
||||
|
||||
Call [`SequentialPipelineBlocks.from_blocks_dict`] on the blocks to create a `SequentialPipelineBlocks`.
|
||||
|
||||
```py
|
||||
DIFFDIFF_BLOCKS = IMAGE2IMAGE_BLOCKS.copy()
|
||||
DIFFDIFF_BLOCKS["set_timesteps"] = TEXT2IMAGE_BLOCKS["set_timesteps"]
|
||||
DIFFDIFF_BLOCKS["prepare_latents"] = SDXLDiffDiffPrepareLatentsStep
|
||||
DIFFDIFF_BLOCKS["denoise"] = SDXLDiffDiffDenoiseStep
|
||||
|
||||
dd_blocks = SequentialPipelineBlocks.from_blocks_dict(DIFFDIFF_BLOCKS)
|
||||
print(dd_blocks)
|
||||
```
|
||||
|
||||
## ModularPipeline
|
||||
|
||||
Convert the [`SequentialPipelineBlocks`] into a [`ModularPipeline`] with the [`ModularPipeline.init_pipeline`] method. This initializes the expected components to load from a `modular_model_index.json` file. Explicitly load the components by calling [`ModularPipeline.load_default_components`].
|
||||
|
||||
It is a good idea to pass a [`ComponentsManager`] to the pipeline to help manage the different components. Once you call [`~ModularPipeline.load_default_components`], the components are registered to the [`ComponentsManager`] and can be shared between workflows. The example below uses the `collection` argument to assign the components a `"diffdiff"` label for better organization.
|
||||
|
||||
```py
|
||||
import torch
from diffusers.modular_pipelines import ComponentsManager
|
||||
|
||||
components = ComponentsManager()

dd_pipeline = dd_blocks.init_pipeline("YiYiXu/modular-demo-auto", components_manager=components, collection="diffdiff")
dd_pipeline.load_default_components(torch_dtype=torch.float16)
|
||||
dd_pipeline.to("cuda")
|
||||
```
|
||||
|
||||
## Adding workflows
|
||||
|
||||
Other workflows can be added to the [`ModularPipeline`] to support additional features without rewriting the entire pipeline from scratch.
|
||||
|
||||
This section demonstrates how to add an IP-Adapter or ControlNet.
|
||||
|
||||
### IP-Adapter
|
||||
|
||||
Stable Diffusion XL already has a preset IP-Adapter block that you can use without any changes to the existing Differential Diffusion pipeline.
|
||||
|
||||
```py
|
||||
from diffusers.modular_pipelines.stable_diffusion_xl.encoders import StableDiffusionXLAutoIPAdapterStep
|
||||
|
||||
ip_adapter_block = StableDiffusionXLAutoIPAdapterStep()
|
||||
```
|
||||
|
||||
Use the [`sub_blocks.insert`] method to insert it into the blocks. The example below inserts the `ip_adapter_block` at position `0`. Print the blocks to see that the `ip_adapter_block` is added and that it requires an `ip_adapter_image` input. This also adds two components, the `image_encoder` and `feature_extractor`.
|
||||
|
||||
```py
|
||||
dd_blocks.sub_blocks.insert("ip_adapter", ip_adapter_block, 0)
|
||||
```
|
||||
|
||||
Call [`~ModularPipeline.init_pipeline`] to initialize a [`ModularPipeline`] and use [`~ModularPipeline.load_default_components`] to load the model components. Load and set the IP-Adapter to run the pipeline.
|
||||
|
||||
```py
|
||||
dd_pipeline = dd_blocks.init_pipeline("YiYiXu/modular-demo-auto", collection="diffdiff")
|
||||
dd_pipeline.load_default_components(torch_dtype=torch.float16)
|
||||
dd_pipeline.loader.load_ip_adapter("h94/IP-Adapter", subfolder="sdxl_models", weight_name="ip-adapter_sdxl.bin")
|
||||
dd_pipeline.loader.set_ip_adapter_scale(0.6)
|
||||
dd_pipeline = dd_pipeline.to(device)
|
||||
|
||||
ip_adapter_image = load_image("https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/diffdiff_orange.jpeg")
|
||||
image = load_image("https://huggingface.co/datasets/OzzyGT/testing-resources/resolve/main/differential/20240329211129_4024911930.png?download=true")
|
||||
mask = load_image("https://huggingface.co/datasets/OzzyGT/testing-resources/resolve/main/differential/gradient_mask.png?download=true")
|
||||
|
||||
prompt = "a green pear"
|
||||
negative_prompt = "blurry"
|
||||
generator = torch.Generator(device=device).manual_seed(42)
|
||||
|
||||
image = dd_pipeline(
|
||||
prompt=prompt,
|
||||
negative_prompt=negative_prompt,
|
||||
num_inference_steps=25,
|
||||
generator=generator,
|
||||
ip_adapter_image=ip_adapter_image,
|
||||
diffdiff_map=mask,
|
||||
image=image,
|
||||
output="images"
|
||||
)[0]
|
||||
```
|
||||
|
||||
### ControlNet
|
||||
|
||||
Stable Diffusion XL already has a preset ControlNet block that can readily be used.
|
||||
|
||||
```py
|
||||
from diffusers.modular_pipelines.stable_diffusion_xl.modular_blocks import StableDiffusionXLAutoControlNetInputStep
|
||||
|
||||
control_input_block = StableDiffusionXLAutoControlNetInputStep()
|
||||
```
|
||||
|
||||
However, it requires modifying the `denoise` block because that's where the ControlNet injects the control information into the UNet.
|
||||
|
||||
Modify the `denoise` block by replacing the `StableDiffusionXLLoopDenoiser` sub-block with the `StableDiffusionXLControlNetLoopDenoiser`.
|
||||
|
||||
```py
|
||||
class SDXLDiffDiffControlNetDenoiseStep(StableDiffusionXLDenoiseLoopWrapper):
|
||||
block_classes = [SDXLDiffDiffLoopBeforeDenoiser, StableDiffusionXLControlNetLoopDenoiser, StableDiffusionXLLoopAfterDenoiser]
|
||||
block_names = ["before_denoiser", "denoiser", "after_denoiser"]
|
||||
|
||||
controlnet_denoise_block = SDXLDiffDiffControlNetDenoiseStep()
|
||||
```
|
||||
|
||||
Insert the `controlnet_input` block and replace the `denoise` block with the new `controlnet_denoise_block`. Initialize a [`ModularPipeline`] and load the model components with [`~ModularPipeline.load_default_components`].
|
||||
|
||||
```py
|
||||
dd_blocks.sub_blocks.insert("controlnet_input", control_input_block, 7)
|
||||
dd_blocks.sub_blocks["denoise"] = controlnet_denoise_block
|
||||
|
||||
dd_pipeline = dd_blocks.init_pipeline("YiYiXu/modular-demo-auto", collection="diffdiff")
|
||||
dd_pipeline.load_default_components(torch_dtype=torch.float16)
|
||||
dd_pipeline = dd_pipeline.to(device)
|
||||
|
||||
control_image = load_image("https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/diffdiff_tomato_canny.jpeg")
|
||||
image = load_image("https://huggingface.co/datasets/OzzyGT/testing-resources/resolve/main/differential/20240329211129_4024911930.png?download=true")
|
||||
mask = load_image("https://huggingface.co/datasets/OzzyGT/testing-resources/resolve/main/differential/gradient_mask.png?download=true")
|
||||
|
||||
prompt = "a green pear"
|
||||
negative_prompt = "blurry"
|
||||
generator = torch.Generator(device=device).manual_seed(42)
|
||||
|
||||
image = dd_pipeline(
|
||||
prompt=prompt,
|
||||
negative_prompt=negative_prompt,
|
||||
num_inference_steps=25,
|
||||
generator=generator,
|
||||
control_image=control_image,
|
||||
controlnet_conditioning_scale=0.5,
|
||||
diffdiff_map=mask,
|
||||
image=image,
|
||||
output="images"
|
||||
)[0]
|
||||
```
|
||||
|
||||
### AutoPipelineBlocks
|
||||
|
||||
The Differential Diffusion, IP-Adapter, and ControlNet workflows can be bundled into a single [`ModularPipeline`] with [`AutoPipelineBlocks`]. It automatically selects which sub-blocks to run based on the inputs, like `control_image` or `ip_adapter_image`. If none of these inputs are passed, it defaults to the Differential Diffusion workflow.
|
||||
|
||||
Use `block_trigger_inputs` to only run the `SDXLDiffDiffControlNetDenoiseStep` block if a `control_image` input is provided. Otherwise, the `SDXLDiffDiffDenoiseStep` is used.
|
||||
|
||||
```py
|
||||
class SDXLDiffDiffAutoDenoiseStep(AutoPipelineBlocks):
|
||||
block_classes = [SDXLDiffDiffControlNetDenoiseStep, SDXLDiffDiffDenoiseStep]
|
||||
block_names = ["controlnet_denoise", "denoise"]
|
||||
block_trigger_inputs = ["controlnet_cond", None]
|
||||
```
|
||||
|
||||
Add the `ip_adapter` and `controlnet_input` blocks.
|
||||
|
||||
```py
|
||||
DIFFDIFF_AUTO_BLOCKS = IMAGE2IMAGE_BLOCKS.copy()
|
||||
DIFFDIFF_AUTO_BLOCKS["prepare_latents"] = SDXLDiffDiffPrepareLatentsStep
|
||||
DIFFDIFF_AUTO_BLOCKS["set_timesteps"] = TEXT2IMAGE_BLOCKS["set_timesteps"]
|
||||
DIFFDIFF_AUTO_BLOCKS["denoise"] = SDXLDiffDiffAutoDenoiseStep
|
||||
DIFFDIFF_AUTO_BLOCKS.insert("ip_adapter", StableDiffusionXLAutoIPAdapterStep, 0)
|
||||
DIFFDIFF_AUTO_BLOCKS.insert("controlnet_input",StableDiffusionXLControlNetAutoInput, 7)
|
||||
```
|
||||
|
||||
Call [`SequentialPipelineBlocks.from_blocks_dict`] to create a [`SequentialPipelineBlocks`], then create a [`ModularPipeline`] and load the model components to run it.
|
||||
|
||||
```py
|
||||
dd_auto_blocks = SequentialPipelineBlocks.from_blocks_dict(DIFFDIFF_AUTO_BLOCKS)
|
||||
dd_pipeline = dd_auto_blocks.init_pipeline("YiYiXu/modular-demo-auto", collection="diffdiff")
|
||||
dd_pipeline.load_default_components(torch_dtype=torch.float16)
|
||||
```
|
||||
|
||||
## Share
|
||||
|
||||
Add your [`ModularPipeline`] to the Hub with [`~ModularPipeline.save_pretrained`] and set the `push_to_hub` argument to `True`.
|
||||
|
||||
```py
|
||||
dd_pipeline.save_pretrained("YiYiXu/test_modular_doc", push_to_hub=True)
|
||||
```
|
||||
|
||||
Other users can load the [`ModularPipeline`] with [`~ModularPipeline.from_pretrained`].
|
||||
|
||||
```py
|
||||
import torch
|
||||
from diffusers.modular_pipelines import ModularPipeline, ComponentsManager
|
||||
|
||||
components = ComponentsManager()
|
||||
|
||||
diffdiff_pipeline = ModularPipeline.from_pretrained("YiYiXu/modular-diffdiff-0704", trust_remote_code=True, components_manager=components, collection="diffdiff")
|
||||
diffdiff_pipeline.load_default_components(torch_dtype=torch.float16)
|
||||
```
|
||||
docs/source/en/modular_diffusers/sequential_pipeline_blocks.md
|
||||
<!--Copyright 2025 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# SequentialPipelineBlocks
|
||||
|
||||
[`~modular_pipelines.SequentialPipelineBlocks`] are a multi-block type that composes other [`~modular_pipelines.ModularPipelineBlocks`] together in a sequence. Data flows linearly from one block to the next using `intermediate_inputs` and `intermediate_outputs`. Each block in [`~modular_pipelines.SequentialPipelineBlocks`] usually represents a step in the pipeline, and by combining them, you gradually build a pipeline.
|
||||
|
||||
This guide shows you how to connect two blocks into a [`~modular_pipelines.SequentialPipelineBlocks`].
|
||||
|
||||
Create two [`~modular_pipelines.ModularPipelineBlocks`]. The first block, `InputBlock`, outputs a `batch_size` value, and the second block, `ImageEncoderBlock`, uses `batch_size` as an intermediate input.
|
||||
|
||||
<hfoptions id="sequential">
|
||||
<hfoption id="InputBlock">
|
||||
|
||||
```py
|
||||
from diffusers.modular_pipelines import ModularPipelineBlocks, InputParam, OutputParam
|
||||
|
||||
class InputBlock(ModularPipelineBlocks):
|
||||
|
||||
@property
|
||||
def inputs(self):
|
||||
return [
|
||||
InputParam(name="prompt", type_hint=list, description="list of text prompts"),
|
||||
InputParam(name="num_images_per_prompt", type_hint=int, description="number of images per prompt"),
|
||||
]
|
||||
|
||||
@property
|
||||
def intermediate_outputs(self):
|
||||
return [
|
||||
OutputParam(name="batch_size", description="calculated batch size"),
|
||||
]
|
||||
|
||||
@property
|
||||
def description(self):
|
||||
return "A block that determines batch_size based on the number of prompts and num_images_per_prompt argument."
|
||||
|
||||
def __call__(self, components, state):
|
||||
block_state = self.get_block_state(state)
|
||||
batch_size = len(block_state.prompt)
|
||||
block_state.batch_size = batch_size * block_state.num_images_per_prompt
|
||||
self.set_block_state(state, block_state)
|
||||
return components, state
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="ImageEncoderBlock">
|
||||
|
||||
```py
|
||||
import torch
|
||||
from diffusers.modular_pipelines import ModularPipelineBlocks, InputParam, OutputParam
|
||||
|
||||
class ImageEncoderBlock(ModularPipelineBlocks):
|
||||
|
||||
@property
|
||||
def inputs(self):
|
||||
return [
|
||||
InputParam(name="image", type_hint="PIL.Image", description="raw input image to process"),
|
||||
InputParam(name="batch_size", type_hint=int),
|
||||
]
|
||||
|
||||
@property
|
||||
def intermediate_outputs(self):
|
||||
return [
|
||||
OutputParam(name="image_latents", description="latents representing the image"),
|
||||
]
|
||||
|
||||
@property
|
||||
def description(self):
|
||||
return "Encode raw image into its latent presentation"
|
||||
|
||||
def __call__(self, components, state):
|
||||
block_state = self.get_block_state(state)
|
||||
# Simulate processing the image
|
||||
# This will change the state of the image from a PIL image to a tensor for all blocks
|
||||
block_state.image = torch.randn(1, 3, 512, 512)
|
||||
block_state.batch_size = block_state.batch_size * 2
|
||||
block_state.image_latents = torch.randn(1, 4, 64, 64)
|
||||
self.set_block_state(state, block_state)
|
||||
return components, state
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
Connect the two blocks by defining an [`InsertableDict`] to map the block names to the block instances. Blocks are executed in the order they're registered in `blocks_dict`.
|
||||
|
||||
Use [`~modular_pipelines.SequentialPipelineBlocks.from_blocks_dict`] to create a [`~modular_pipelines.SequentialPipelineBlocks`].
|
||||
|
||||
```py
|
||||
from diffusers.modular_pipelines import SequentialPipelineBlocks, InsertableDict
|
||||
|
||||
input_block = InputBlock()
image_encoder_block = ImageEncoderBlock()

blocks_dict = InsertableDict()
|
||||
blocks_dict["input"] = input_block
|
||||
blocks_dict["image_encoder"] = image_encoder_block
|
||||
|
||||
blocks = SequentialPipelineBlocks.from_blocks_dict(blocks_dict)
|
||||
```
|
||||
|
||||
Inspect the sub-blocks in [`~modular_pipelines.SequentialPipelineBlocks`] by printing `blocks`, and for more details about the inputs and outputs, access the `doc` attribute.
|
||||
|
||||
```py
|
||||
print(blocks)
|
||||
print(blocks.doc)
|
||||
```
|
||||
@@ -174,39 +174,36 @@ Feel free to open an issue if dynamic compilation doesn't work as expected for a
|
||||
|
||||
### Regional compilation
|
||||
|
||||
|
||||
|
||||
[Regional compilation](https://docs.pytorch.org/tutorials/recipes/regional_compilation.html) trims cold-start latency by compiling **only the small, frequently-repeated block(s)** of a model, typically a Transformer layer, enabling reuse of compiled artifacts for every subsequent occurrence.
|
||||
For many diffusion architectures this delivers the *same* runtime speed-ups as full-graph compilation yet cuts compile time by **8–10 ×**.
|
||||
|
||||
|
||||
Use the [`~ModelMixin.compile_repeated_blocks`] method, a helper that wraps `torch.compile`, on any component such as the transformer model as shown below.
|
||||
|
||||
```py
|
||||
# pip install -U diffusers
|
||||
import torch
|
||||
from diffusers import StableDiffusionXLPipeline
|
||||
|
||||
|
||||
pipeline = StableDiffusionXLPipeline.from_pretrained(
|
||||
"stabilityai/stable-diffusion-xl-base-1.0",
|
||||
torch_dtype=torch.float16,
|
||||
).to("cuda")
|
||||
|
||||
|
||||
# compile only the repeated transformer layers inside the UNet
|
||||
pipeline.unet.compile_repeated_blocks(fullgraph=True)
|
||||
```
|
||||
|
||||
|
||||
|
||||
To enable regional compilation for a new model, add a `_repeated_blocks` attribute to a model class containing the class names (as strings) of the blocks you want to compile.
|
||||
|
||||
```py
|
||||
class MyUNet(ModelMixin):
|
||||
_repeated_blocks = ("Transformer2DModel",) # ← compiled by default
|
||||
```
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
> [!TIP]
|
||||
> For more regional compilation examples, see the reference [PR](https://github.com/huggingface/diffusers/pull/11705).
|
||||
|
||||
There is also a [compile_regions](https://github.com/huggingface/accelerate/blob/273799c85d849a1954a4f2e65767216eb37fa089/src/accelerate/utils/other.py#L78) method in [Accelerate](https://huggingface.co/docs/accelerate/index) that automatically selects candidate blocks in a model to compile. The remaining graph is compiled separately. This is useful for quick experiments because there aren't as many options for you to set which blocks to compile or adjust compilation flags.
|
||||
|
||||
```py
|
||||
# pip install -U accelerate
|
||||
import torch
from accelerate.utils import compile_regions
from diffusers import StableDiffusionXLPipeline

pipeline = StableDiffusionXLPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",
    torch_dtype=torch.float16,
).to("cuda")
|
||||
pipeline.unet = compile_regions(pipeline.unet, mode="reduce-overhead", fullgraph=True)
|
||||
```
|
||||
|
||||
|
||||
[`~ModelMixin.compile_repeated_blocks`] is intentionally explicit. List the blocks to repeat in `_repeated_blocks` and the helper only compiles those blocks. It offers predictable behavior and easy reasoning about cache reuse in one line of code.
|
||||
|
||||
### Graph breaks
|
||||
|
||||
@@ -242,6 +239,12 @@ The `step()` function is [called](https://github.com/huggingface/diffusers/blob/
|
||||
|
||||
In general, the `sigmas` should [stay on the CPU](https://github.com/huggingface/diffusers/blob/35a969d297cba69110d175ee79c59312b9f49e1e/src/diffusers/schedulers/scheduling_euler_discrete.py#L240) to avoid the communication sync and latency.
|
||||
|
||||
<Tip>
|
||||
|
||||
Refer to the [torch.compile and Diffusers: A Hands-On Guide to Peak Performance](https://pytorch.org/blog/torch-compile-and-diffusers-a-hands-on-guide-to-peak-performance/) blog post for maximizing performance with `torch.compile` for diffusion models.
|
||||
|
||||
</Tip>
|
||||
|
||||
### Benchmarks
|
||||
|
||||
Refer to the [diffusers/benchmarks](https://huggingface.co/datasets/diffusers/benchmarks) dataset to see inference latency and memory usage data for compiled pipelines.
|
||||
@@ -296,3 +299,11 @@ An input is projected into three subspaces, represented by the projection matric
|
||||
```py
|
||||
pipeline.fuse_qkv_projections()
|
||||
```
|
||||
|
||||
## Resources
|
||||
|
||||
- Read the [Presenting Flux Fast: Making Flux go brrr on H100s](https://pytorch.org/blog/presenting-flux-fast-making-flux-go-brrr-on-h100s/) blog post to learn more about how you can combine all of these optimizations with [TorchInductor](https://docs.pytorch.org/docs/stable/torch.compiler.html) and [AOTInductor](https://docs.pytorch.org/docs/stable/torch.compiler_aot_inductor.html) for a ~2.5x speedup using recipes from [flux-fast](https://github.com/huggingface/flux-fast).
|
||||
|
||||
These recipes support AMD hardware and [Flux.1 Kontext Dev](https://huggingface.co/black-forest-labs/FLUX.1-Kontext-dev).
|
||||
- Read the [torch.compile and Diffusers: A Hands-On Guide to Peak Performance](https://pytorch.org/blog/torch-compile-and-diffusers-a-hands-on-guide-to-peak-performance/) blog post
|
||||
to maximize performance when using `torch.compile`.
|
||||
@@ -14,6 +14,9 @@ specific language governing permissions and limitations under the License.
|
||||
|
||||
Optimizing models often involves trade-offs between [inference speed](./fp16) and [memory-usage](./memory). For instance, while [caching](./cache) can boost inference speed, it also increases memory consumption since it needs to store the outputs of intermediate attention layers. A more balanced optimization strategy combines quantizing a model, [torch.compile](./fp16#torchcompile) and various [offloading methods](./memory#offloading).
|
||||
|
||||
> [!TIP]
|
||||
> Check the [torch.compile](./fp16#torchcompile) guide to learn more about compilation and how it can be applied here. For example, regional compilation can significantly reduce compilation time without giving up any speedups.
|
||||
|
||||
For image generation, combining quantization and [model offloading](./memory#model-offloading) can often give the best trade-off between quality, speed, and memory. Group offloading is not as effective for image generation because it is usually not possible to *fully* overlap data transfer if the compute kernel finishes faster. This results in some communication overhead between the CPU and GPU.
|
||||
|
||||
For video generation, combining quantization and [group-offloading](./memory#group-offloading) tends to be better because video models are more compute-bound.
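The sketch below illustrates the image generation recipe (quantize the heaviest components, then offload the model). The model and component names are examples; the rest of this guide walks through the details.

```py
import torch
from diffusers import DiffusionPipeline
from diffusers.quantizers import PipelineQuantizationConfig

# quantize the most compute-intensive components
pipeline_quant_config = PipelineQuantizationConfig(
    quant_backend="bitsandbytes_4bit",
    quant_kwargs={"load_in_4bit": True, "bnb_4bit_quant_type": "nf4", "bnb_4bit_compute_dtype": torch.bfloat16},
    components_to_quantize=["transformer", "text_encoder_2"],
)

pipeline = DiffusionPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-dev",
    quantization_config=pipeline_quant_config,
    torch_dtype=torch.bfloat16,
)

# model offloading moves whole models on and off the GPU as they are needed
pipeline.enable_model_cpu_offload()
```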
|
||||
@@ -25,7 +28,7 @@ The table below provides a comparison of optimization strategy combinations and
|
||||
| quantization | 32.602 | 14.9453 |
|
||||
| quantization, torch.compile | 25.847 | 14.9448 |
|
||||
| quantization, torch.compile, model CPU offloading | 32.312 | 12.2369 |
|
||||
|
||||
<small>These results are benchmarked on Flux with a RTX 4090. The transformer and text_encoder components are quantized. Refer to the [benchmarking script](https://gist.github.com/sayakpaul/0db9d8eeeb3d2a0e5ed7cf0d9ca19b7d) if you're interested in evaluating your own model.</small>
|
||||
|
||||
This guide will show you how to compile and offload a quantized model with [bitsandbytes](../quantization/bitsandbytes#torchcompile). Make sure you are using [PyTorch nightly](https://pytorch.org/get-started/locally/) and the latest version of bitsandbytes.
|
||||
|
||||
|
||||
@@ -53,6 +53,16 @@ image = pipe(prompt, generator=torch.manual_seed(0)).images[0]
|
||||
image.save("flux-gguf.png")
|
||||
```
|
||||
|
||||
## Using Optimized CUDA Kernels with GGUF
|
||||
|
||||
Optimized CUDA kernels can accelerate GGUF quantized model inference by approximately 10%. This functionality requires a compatible GPU with `torch.cuda.get_device_capability` greater than 7 and the kernels library:
|
||||
|
||||
```shell
|
||||
pip install -U kernels
|
||||
```
|
||||
|
||||
Once installed, set `DIFFUSERS_GGUF_CUDA_KERNELS=true` to use optimized kernels when available. Note that CUDA kernels may introduce minor numerical differences compared to the original GGUF implementation, potentially causing subtle visual variations in generated images. To disable CUDA kernel usage, set the environment variable `DIFFUSERS_GGUF_CUDA_KERNELS=false`.
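A minimal sketch of checking the GPU requirement and opting in from Python before loading the GGUF checkpoint:

```py
import os
import torch

# the optimized kernels require a GPU with compute capability greater than 7
print(torch.cuda.get_device_capability())

# opt in to the optimized CUDA kernels (set to "false" to disable them)
os.environ["DIFFUSERS_GGUF_CUDA_KERNELS"] = "true"
```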
|
||||
|
||||
## Supported Quantization Types
|
||||
|
||||
- BF16
|
||||
@@ -67,3 +77,44 @@ image.save("flux-gguf.png")
|
||||
- Q5_K
|
||||
- Q6_K
|
||||
|
||||
## Convert to GGUF
|
||||
|
||||
Use the Space below to convert a Diffusers checkpoint into the GGUF format for inference.
|
||||
|
||||
<iframe
|
||||
src="https://diffusers-internal-dev-diffusers-to-gguf.hf.space"
|
||||
frameborder="0"
|
||||
width="850"
|
||||
height="450"
|
||||
></iframe>
|
||||
|
||||
|
||||
```py
|
||||
import torch
|
||||
|
||||
from diffusers import FluxPipeline, FluxTransformer2DModel, GGUFQuantizationConfig
|
||||
|
||||
ckpt_path = (
|
||||
"https://huggingface.co/sayakpaul/different-lora-from-civitai/blob/main/flux_dev_diffusers-q4_0.gguf"
|
||||
)
|
||||
transformer = FluxTransformer2DModel.from_single_file(
|
||||
ckpt_path,
|
||||
quantization_config=GGUFQuantizationConfig(compute_dtype=torch.bfloat16),
|
||||
config="black-forest-labs/FLUX.1-dev",
|
||||
subfolder="transformer",
|
||||
torch_dtype=torch.bfloat16,
|
||||
)
|
||||
pipe = FluxPipeline.from_pretrained(
|
||||
"black-forest-labs/FLUX.1-dev",
|
||||
transformer=transformer,
|
||||
torch_dtype=torch.bfloat16,
|
||||
)
|
||||
pipe.enable_model_cpu_offload()
|
||||
prompt = "A cat holding a sign that says hello world"
|
||||
image = pipe(prompt, generator=torch.manual_seed(0)).images[0]
|
||||
image.save("flux-gguf.png")
|
||||
```
|
||||
|
||||
When using Diffusers-format GGUF checkpoints, you must provide the model `config` path. If the model config resides in a `subfolder`, specify that as well.
|
||||
@@ -11,7 +11,7 @@ specific language governing permissions and limitations under the License.
|
||||
|
||||
-->
|
||||
|
||||
|
||||
# Getting started
|
||||
|
||||
Quantization focuses on representing data with fewer bits while also trying to preserve the precision of the original data. This often means converting a data type to represent the same information with fewer bits. For example, if your model weights are stored as 32-bit floating points and they're quantized to 16-bit floating points, this halves the model size which makes it easier to store and reduces memory usage. Lower precision can also speedup inference because it takes less time to perform calculations with fewer bits.
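As a quick back-of-the-envelope sketch (the parameter count below is just an example), halving the bits roughly halves the memory needed for the weights.

```py
num_params = 12_000_000_000  # hypothetical 12B-parameter model

print(num_params * 4 / 1e9)    # float32 -> ~48 GB
print(num_params * 2 / 1e9)    # float16/bfloat16 -> ~24 GB
print(num_params * 0.5 / 1e9)  # 4-bit -> ~6 GB
```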
|
||||
|
||||
@@ -19,19 +19,25 @@ Diffusers supports multiple quantization backends to make large diffusion models
|
||||
|
||||
## Pipeline-level quantization
|
||||
|
||||
|
||||
There are two ways to use [`~quantizers.PipelineQuantizationConfig`] depending on how much customization you want to apply to the quantization configuration.
|
||||
|
||||
|
||||
- for basic use cases, define the `quant_backend`, `quant_kwargs`, and `components_to_quantize` arguments
|
||||
- for granular quantization control, define a `quant_mapping` that provides the quantization configuration for individual model components
|
||||
|
||||
|
||||
### Basic quantization
|
||||
|
||||
Initialize [`~quantizers.PipelineQuantizationConfig`] with the following parameters.
|
||||
|
||||
- `quant_backend` specifies which quantization backend to use. Currently supported backends include: `bitsandbytes_4bit`, `bitsandbytes_8bit`, `gguf`, `quanto`, and `torchao`.
|
||||
|
||||
- `quant_kwargs` specifies the quantization arguments to use.
|
||||
|
||||
> [!TIP]
|
||||
> These `quant_kwargs` arguments are different for each backend. Refer to the [Quantization API](../api/quantization) docs to view the arguments for each backend.
|
||||
|
||||
- `components_to_quantize` specifies which components of the pipeline to quantize. Typically, you should quantize the most compute intensive components like the transformer. The text encoder is another component to consider quantizing if a pipeline has more than one such as [`FluxPipeline`]. The example below quantizes the T5 text encoder in [`FluxPipeline`] while keeping the CLIP model intact.
|
||||
|
||||
The example below loads the bitsandbytes backend with the following arguments from [`~quantizers.quantization_config.BitsAndBytesConfig`], `load_in_4bit`, `bnb_4bit_quant_type`, and `bnb_4bit_compute_dtype`.
|
||||
|
||||
```py
|
||||
import torch
|
||||
from diffusers import DiffusionPipeline
|
||||
@@ -56,13 +62,13 @@ pipe = DiffusionPipeline.from_pretrained(
|
||||
image = pipe("photo of a cute dog").images[0]
|
||||
```
|
||||
|
||||
|
||||
### Advanced quantization
|
||||
|
||||
|
||||
The `quant_mapping` argument provides more options for how to quantize each individual component in a pipeline, like combining different quantization backends.
|
||||
|
||||
Initialize [`~quantizers.PipelineQuantizationConfig`] and pass a `quant_mapping` to it. The `quant_mapping` allows you to specify the quantization options for each component in the pipeline such as the transformer and text encoder.
|
||||
|
||||
|
||||
The example below uses two quantization backends, [`~quantizers.quantization_config.QuantoConfig`] and [`transformers.BitsAndBytesConfig`], for the transformer and text encoder.
|
||||
|
||||
```py
|
||||
import torch
|
||||
@@ -85,7 +91,7 @@ pipeline_quant_config = PipelineQuantizationConfig(
|
||||
There is a separate bitsandbytes backend in [Transformers](https://huggingface.co/docs/transformers/main_classes/quantization#transformers.BitsAndBytesConfig). You need to import and use [`transformers.BitsAndBytesConfig`] for components that come from Transformers. For example, `text_encoder_2` in [`FluxPipeline`] is a [`~transformers.T5EncoderModel`] from Transformers so you need to use [`transformers.BitsAndBytesConfig`] instead of [`diffusers.BitsAndBytesConfig`].
|
||||
|
||||
> [!TIP]
|
||||
|
||||
> Use the [basic quantization](#basic-quantization) method above if you don't want to manage these distinct imports or aren't sure where each pipeline component comes from.
|
||||
|
||||
```py
|
||||
import torch
|
||||
@@ -129,4 +135,4 @@ Check out the resources below to learn more about quantization.
|
||||
|
||||
- The Transformers quantization [Overview](https://huggingface.co/docs/transformers/quantization/overview#when-to-use-what) provides an overview of the pros and cons of different quantization backends.
|
||||
|
||||
|
||||
- Read the [Exploring Quantization Backends in Diffusers](https://huggingface.co/blog/diffusers-quantization) blog post for a brief introduction to each quantization backend, how to choose a backend, and combining quantization with other memory optimizations.
|
||||
|
||||
@@ -10,314 +10,220 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
[[open-in-colab]]
|
||||
# Quickstart
|
||||
|
||||
# Quicktour
|
||||
Diffusers is a library for developers and researchers that provides an easy inference API for generating images, videos and audio, as well as the building blocks for implementing new workflows.
|
||||
|
||||
Diffusion models are trained to denoise random Gaussian noise step-by-step to generate a sample of interest, such as an image or audio. This has sparked a tremendous amount of interest in generative AI, and you have probably seen examples of diffusion generated images on the internet. 🧨 Diffusers is a library aimed at making diffusion models widely accessible to everyone.
|
||||
Diffusers provides many optimizations out-of-the-box that makes it possible to load and run large models on setups with limited memory or to accelerate inference.
|
||||
|
||||
Whether you're a developer or an everyday user, this quicktour will introduce you to 🧨 Diffusers and help you get up and generating quickly! There are three main components of the library to know about:
|
||||
This Quickstart will give you an overview of Diffusers and get you up and generating quickly.
|
||||
|
||||
* The [`DiffusionPipeline`] is a high-level end-to-end class designed to rapidly generate samples from pretrained diffusion models for inference.
|
||||
* Popular pretrained [model](./api/models) architectures and modules that can be used as building blocks for creating diffusion systems.
|
||||
* Many different [schedulers](./api/schedulers/overview) - algorithms that control how noise is added for training, and how to generate denoised images during inference.
|
||||
> [!TIP]
|
||||
> Before you begin, make sure you have a Hugging Face [account](https://huggingface.co/join) in order to use gated models like [Flux](https://huggingface.co/black-forest-labs/FLUX.1-dev).
|
||||
|
||||
The quicktour will show you how to use the [`DiffusionPipeline`] for inference, and then walk you through how to combine a model and scheduler to replicate what's happening inside the [`DiffusionPipeline`].
|
||||
|
||||
<Tip>
|
||||
|
||||
The quicktour is a simplified version of the introductory 🧨 Diffusers [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/diffusers_intro.ipynb) to help you get started quickly. If you want to learn more about 🧨 Diffusers' goal, design philosophy, and additional details about its core API, check out the notebook!
|
||||
|
||||
</Tip>
|
||||
|
||||
Before you begin, make sure you have all the necessary libraries installed:
|
||||
|
||||
```py
|
||||
# uncomment to install the necessary libraries in Colab
|
||||
#!pip install --upgrade diffusers accelerate transformers
|
||||
```
|
||||
|
||||
- [🤗 Accelerate](https://huggingface.co/docs/accelerate/index) speeds up model loading for inference and training.
|
||||
- [🤗 Transformers](https://huggingface.co/docs/transformers/index) is required to run the most popular diffusion models, such as [Stable Diffusion](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/overview).
|
||||
Follow the [Installation](./installation) guide to install Diffusers if it's not already installed.
|
||||
|
||||
## DiffusionPipeline
|
||||
|
||||
The [`DiffusionPipeline`] is the easiest way to use a pretrained diffusion system for inference. It is an end-to-end system containing the model and the scheduler. You can use the [`DiffusionPipeline`] out-of-the-box for many tasks. Take a look at the table below for some supported tasks, and for a complete list of supported tasks, check out the [🧨 Diffusers Summary](./api/pipelines/overview#diffusers-summary) table.
|
||||
A diffusion model combines multiple components to generate outputs in any modality based on an input, such as a text description, image or both.
|
||||
|
||||
| **Task** | **Description** | **Pipeline**
|
||||
|------------------------------|--------------------------------------------------------------------------------------------------------------|-----------------|
|
||||
| Unconditional Image Generation | generate an image from Gaussian noise | [unconditional_image_generation](./using-diffusers/unconditional_image_generation) |
|
||||
| Text-Guided Image Generation | generate an image given a text prompt | [conditional_image_generation](./using-diffusers/conditional_image_generation) |
|
||||
| Text-Guided Image-to-Image Translation | adapt an image guided by a text prompt | [img2img](./using-diffusers/img2img) |
|
||||
| Text-Guided Image-Inpainting | fill the masked part of an image given the image, the mask and a text prompt | [inpaint](./using-diffusers/inpaint) |
|
||||
| Text-Guided Depth-to-Image Translation | adapt parts of an image guided by a text prompt while preserving structure via depth estimation | [depth2img](./using-diffusers/depth2img) |
|
||||
For a standard text-to-image model:
|
||||
|
||||
Start by creating an instance of a [`DiffusionPipeline`] and specify which pipeline checkpoint you would like to download.
|
||||
You can use the [`DiffusionPipeline`] for any [checkpoint](https://huggingface.co/models?library=diffusers&sort=downloads) stored on the Hugging Face Hub.
|
||||
In this quicktour, you'll load the [`stable-diffusion-v1-5`](https://huggingface.co/stable-diffusion-v1-5/stable-diffusion-v1-5) checkpoint for text-to-image generation.
|
||||
1. A text encoder turns a prompt into embeddings that guide the denoising process. Some models have more than one text encoder.
|
||||
2. A scheduler contains the algorithmic specifics for gradually denoising initial random noise into clean outputs. Different schedulers affect generation speed and quality.
|
||||
3. A UNet or diffusion transformer (DiT) is the workhorse of a diffusion model.
|
||||
|
||||
<Tip warning={true}>
|
||||
At each step, it performs the denoising predictions, such as how much noise to remove or the general direction in which to steer the noise to generate better quality outputs.
|
||||
|
||||
For [Stable Diffusion](https://huggingface.co/CompVis/stable-diffusion) models, please carefully read the [license](https://huggingface.co/spaces/CompVis/stable-diffusion-license) first before running the model. 🧨 Diffusers implements a [`safety_checker`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/safety_checker.py) to prevent offensive or harmful content, but the model's improved image generation capabilities can still produce potentially harmful content.
|
||||
The UNet or DiT repeats this loop for a set amount of steps to generate the final output.
|
||||
|
||||
4. A variational autoencoder (VAE) encodes and decodes pixels to a spatially compressed latent-space. *Latents* are compressed representations of an image and are more efficient to work with. The UNet or DiT operates on latents, and the clean latents at the end are decoded back into images.
|
||||
|
||||
</Tip>
|
||||
The [`DiffusionPipeline`] packages all these components into a single class for inference. There are several arguments in [`~DiffusionPipeline.__call__`] you can change, such as `num_inference_steps`, that affect the diffusion process. Try different values and arguments to see how they change generation quality or speed.
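For a rough sense of what adjusting these arguments looks like, here is a minimal sketch. It assumes a `pipeline` has already been loaded as shown in the next code block, and the specific values are only examples.

```py
# assumes `pipeline` was loaded with DiffusionPipeline.from_pretrained as shown below
image = pipeline(
    "An image of a squirrel in Picasso style",
    num_inference_steps=30,  # fewer steps is faster, more steps can improve quality
    guidance_scale=7.5,      # how closely the output should follow the prompt
).images[0]
```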
|
||||
|
||||
Load the model with the [`~DiffusionPipeline.from_pretrained`] method:
|
||||
Load a model with [`~DiffusionPipeline.from_pretrained`] and describe what you'd like to generate. The example below uses the default argument values.
|
||||
|
||||
```python
|
||||
>>> from diffusers import DiffusionPipeline
|
||||
<hfoptions id="diffusionpipeline">
|
||||
<hfoption id="text-to-image">
|
||||
|
||||
>>> pipeline = DiffusionPipeline.from_pretrained("stable-diffusion-v1-5/stable-diffusion-v1-5", use_safetensors=True)
|
||||
```
|
||||
|
||||
The [`DiffusionPipeline`] downloads and caches all modeling, tokenization, and scheduling components. You'll see that the Stable Diffusion pipeline is composed of the [`UNet2DConditionModel`] and [`PNDMScheduler`] among other things:
|
||||
Use `.images[0]` to access the generated image output.
|
||||
|
||||
```py
|
||||
>>> pipeline
|
||||
StableDiffusionPipeline {
|
||||
"_class_name": "StableDiffusionPipeline",
|
||||
"_diffusers_version": "0.21.4",
|
||||
...,
|
||||
"scheduler": [
|
||||
"diffusers",
|
||||
"PNDMScheduler"
|
||||
],
|
||||
...,
|
||||
"unet": [
|
||||
"diffusers",
|
||||
"UNet2DConditionModel"
|
||||
],
|
||||
"vae": [
|
||||
"diffusers",
|
||||
"AutoencoderKL"
|
||||
]
|
||||
}
|
||||
import torch
|
||||
from diffusers import DiffusionPipeline
|
||||
|
||||
pipeline = DiffusionPipeline.from_pretrained(
|
||||
"Qwen/Qwen-Image", torch_dtype=torch.bfloat16, device_map="cuda"
|
||||
)
|
||||
|
||||
prompt = """
|
||||
cinematic film still of a cat sipping a margarita in a pool in Palm Springs, California
|
||||
highly detailed, high budget hollywood movie, cinemascope, moody, epic, gorgeous, film grain
|
||||
"""
|
||||
pipeline(prompt).images[0]
|
||||
```
|
||||
|
||||
We strongly recommend running the pipeline on a GPU because the model consists of roughly 1.4 billion parameters.
|
||||
You can move the generator object to a GPU, just like you would in PyTorch:
|
||||
</hfoption>
|
||||
<hfoption id="text-to-video">
|
||||
|
||||
```python
|
||||
>>> pipeline.to("cuda")
|
||||
```
|
||||
|
||||
Now you can pass a text prompt to the `pipeline` to generate an image, and then access the denoised image. By default, the image output is wrapped in a [`PIL.Image`](https://pillow.readthedocs.io/en/stable/reference/Image.html?highlight=image#the-image-class) object.
|
||||
|
||||
```python
|
||||
>>> image = pipeline("An image of a squirrel in Picasso style").images[0]
|
||||
>>> image
|
||||
```
|
||||
|
||||
<div class="flex justify-center">
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/image_of_squirrel_painting.png"/>
|
||||
</div>
|
||||
|
||||
Save the image by calling `save`:
|
||||
|
||||
```python
|
||||
>>> image.save("image_of_squirrel_painting.png")
|
||||
```
|
||||
|
||||
### Local pipeline
|
||||
|
||||
You can also use the pipeline locally. The only difference is you need to download the weights first:
|
||||
|
||||
```bash
|
||||
!git lfs install
|
||||
!git clone https://huggingface.co/stable-diffusion-v1-5/stable-diffusion-v1-5
|
||||
```
|
||||
|
||||
Then load the saved weights into the pipeline:
|
||||
|
||||
```python
|
||||
>>> pipeline = DiffusionPipeline.from_pretrained("./stable-diffusion-v1-5", use_safetensors=True)
|
||||
```
|
||||
|
||||
Now, you can run the pipeline as you would in the section above.
|
||||
|
||||
### Swapping schedulers
|
||||
|
||||
Different schedulers come with different denoising speeds and quality trade-offs. The best way to find out which one works best for you is to try them out! One of the main features of 🧨 Diffusers is to allow you to easily switch between schedulers. For example, to replace the default [`PNDMScheduler`] with the [`EulerDiscreteScheduler`], load it with the [`~diffusers.ConfigMixin.from_config`] method:
|
||||
Use `.frames[0]` to access the generated video output and [`~utils.export_to_video`] to save the video.
|
||||
|
||||
```py
|
||||
>>> from diffusers import EulerDiscreteScheduler
|
||||
import torch
|
||||
from diffusers import AutoencoderKLWan, DiffusionPipeline
|
||||
from diffusers.quantizers import PipelineQuantizationConfig
|
||||
from diffusers.utils import export_to_video
|
||||
|
||||
>>> pipeline = DiffusionPipeline.from_pretrained("stable-diffusion-v1-5/stable-diffusion-v1-5", use_safetensors=True)
|
||||
>>> pipeline.scheduler = EulerDiscreteScheduler.from_config(pipeline.scheduler.config)
|
||||
vae = AutoencoderKLWan.from_pretrained(
|
||||
"Wan-AI/Wan2.2-T2V-A14B-Diffusers",
|
||||
subfolder="vae",
|
||||
torch_dtype=torch.float32
|
||||
)
|
||||
pipeline = DiffusionPipeline.from_pretrained(
|
||||
"Wan-AI/Wan2.2-T2V-A14B-Diffusers",
|
||||
    vae=vae,
|
||||
torch_dtype=torch.bfloat16,
|
||||
device_map="cuda"
|
||||
)
|
||||
|
||||
prompt = """
|
||||
Cinematic video of a sleek cat lounging on a colorful inflatable in a crystal-clear turquoise pool in Palm Springs,
|
||||
sipping a salt-rimmed margarita through a straw. Golden-hour sunlight glows over mid-century modern homes and swaying palms.
|
||||
Shot in rich Sony a7S III: with moody, glamorous color grading, subtle lens flares, and soft vintage film grain.
|
||||
Ripples shimmer as a warm desert breeze stirs the water, blending luxury and playful charm in an epic, gorgeously composed frame.
|
||||
"""
|
||||
video = pipeline(prompt=prompt, num_frames=81, num_inference_steps=40).frames[0]
|
||||
export_to_video(video, "output.mp4", fps=16)
|
||||
```
|
||||
|
||||
Try generating an image with the new scheduler and see if you notice a difference!
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
In the next section, you'll take a closer look at the components - the model and scheduler - that make up the [`DiffusionPipeline`] and learn how to use these components to generate an image of a cat.
|
||||
## LoRA
|
||||
|
||||
## Models
|
||||
Adapters insert a small number of trainable parameters into the original base model. Only the inserted parameters are fine-tuned while the rest of the model weights remain frozen. This makes it fast and cheap to fine-tune a model on a new style. Among adapters, [LoRAs](./tutorials/using_peft_for_inference) are the most popular.
|
||||
|
||||
Most models take a noisy sample and, at each timestep, predict the *noise residual* - the difference between a less noisy image and the input image (other models learn to predict the previous sample directly, or the velocity, also known as [`v-prediction`](https://github.com/huggingface/diffusers/blob/5e5ce13e2f89ac45a0066cb3f369462a3cf1d9ef/src/diffusers/schedulers/scheduling_ddim.py#L110)). You can mix and match models to create other diffusion systems.
|
||||
|
||||
Models are initialized with the [`~ModelMixin.from_pretrained`] method, which also locally caches the model weights so it is faster the next time you load the model. For the quicktour, you'll load the [`UNet2DModel`], a basic unconditional image generation model with a checkpoint trained on cat images:
|
||||
Add a LoRA to a pipeline with the [`~loaders.QwenImageLoraLoaderMixin.load_lora_weights`] method. Some LoRAs require a special word to trigger them, such as `Realism` in the example below. Check a LoRA's model card to see if it requires a trigger word.
|
||||
|
||||
```py
|
||||
>>> from diffusers import UNet2DModel
|
||||
import torch
|
||||
from diffusers import DiffusionPipeline
|
||||
|
||||
>>> repo_id = "google/ddpm-cat-256"
|
||||
>>> model = UNet2DModel.from_pretrained(repo_id, use_safetensors=True)
|
||||
pipeline = DiffusionPipeline.from_pretrained(
|
||||
"Qwen/Qwen-Image", torch_dtype=torch.bfloat16, device_map="cuda"
|
||||
)
|
||||
pipeline.load_lora_weights(
|
||||
"flymy-ai/qwen-image-realism-lora",
|
||||
)
|
||||
|
||||
prompt = """
|
||||
super Realism cinematic film still of a cat sipping a margarita in a pool in Palm Springs in the style of umempart, California
|
||||
highly detailed, high budget hollywood movie, cinemascope, moody, epic, gorgeous, film grain
|
||||
"""
|
||||
pipeline(prompt).images[0]
|
||||
```
|
||||
|
||||
> [!TIP]
|
||||
> Use the [`AutoModel`] API to automatically select a model class if you're unsure of which one to use.
|
||||
Check out the [LoRA](./tutorials/using_peft_for_inference) docs or Adapters section to learn more.
|
||||
|
||||
To access the model parameters, call `model.config`:
|
||||
## Quantization
|
||||
|
||||
[Quantization](./quantization/overview) stores data in fewer bits to reduce memory usage. It may also speed up inference because it takes less time to perform calculations with fewer bits.
|
||||
|
||||
Diffusers provides several quantization backends and picking one depends on your use case. For example, [bitsandbytes](./quantization/bitsandbytes) and [torchao](./quantization/torchao) are both simple and easy to use for inference, but torchao supports more [quantization types](./quantization/torchao#supported-quantization-types) like fp8.
|
||||
|
||||
Configure [`PipelineQuantizationConfig`] with the backend to use, the specific arguments (refer to the [API](./api/quantization) reference for available arguments) for that backend, and which components to quantize. The example below quantizes the model to 4-bits and only uses 14.93GB of memory.
|
||||
|
||||
```py
|
||||
>>> model.config
|
||||
import torch
|
||||
from diffusers import DiffusionPipeline
|
||||
from diffusers.quantizers import PipelineQuantizationConfig
|
||||
|
||||
quant_config = PipelineQuantizationConfig(
|
||||
quant_backend="bitsandbytes_4bit",
|
||||
quant_kwargs={"load_in_4bit": True, "bnb_4bit_quant_type": "nf4", "bnb_4bit_compute_dtype": torch.bfloat16},
|
||||
components_to_quantize=["transformer", "text_encoder"],
|
||||
)
|
||||
pipeline = DiffusionPipeline.from_pretrained(
|
||||
"Qwen/Qwen-Image",
|
||||
torch_dtype=torch.bfloat16,
|
||||
quantization_config=quant_config,
|
||||
device_map="cuda"
|
||||
)
|
||||
|
||||
prompt = """
|
||||
cinematic film still of a cat sipping a margarita in a pool in Palm Springs, California
|
||||
highly detailed, high budget hollywood movie, cinemascope, moody, epic, gorgeous, film grain
|
||||
"""
|
||||
pipeline(prompt).images[0]
|
||||
print(f"Max memory reserved: {torch.cuda.max_memory_allocated() / 1024**3:.2f} GB")
|
||||
```
|
||||
|
||||
The model configuration is a 🧊 frozen 🧊 dictionary, which means those parameters can't be changed after the model is created. This is intentional and ensures that the parameters used to define the model architecture at the start remain the same, while other parameters can still be adjusted during inference.
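As a quick sanity check, you can read individual entries from the config but not reassign them. The sketch below reuses the `model` loaded from `google/ddpm-cat-256` above; the exact exception raised when you try to modify a frozen entry is an implementation detail.

```py
print(model.config.sample_size)  # 256 for the google/ddpm-cat-256 checkpoint
print(model.config.in_channels)  # 3

try:
    model.config.sample_size = 512  # reassigning a frozen entry is expected to fail
except Exception as error:
    print(f"Config is frozen: {error}")
```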
|
||||
Take a look at the [Quantization](./quantization/overview) section for more details.
|
||||
|
||||
Some of the most important parameters are:
|
||||
## Optimizations
|
||||
|
||||
* `sample_size`: the height and width dimension of the input sample.
|
||||
* `in_channels`: the number of input channels of the input sample.
|
||||
* `down_block_types` and `up_block_types`: the type of down- and upsampling blocks used to create the UNet architecture.
|
||||
* `block_out_channels`: the number of output channels of the downsampling blocks; also used in reverse order for the number of input channels of the upsampling blocks.
|
||||
* `layers_per_block`: the number of ResNet blocks present in each UNet block.
|
||||
Modern diffusion models are very large and have billions of parameters. The iterative denoising process is also computationally intensive and slow. Diffusers provides techniques for reducing memory usage and boosting inference speed. These techniques can be combined with quantization to optimize for both memory usage and inference speed.
|
||||
|
||||
To use the model for inference, create a random Gaussian noise tensor with the shape of the image. It should have a `batch` axis because the model can receive multiple random noises, a `channel` axis corresponding to the number of input channels, and a `sample_size` axis for the height and width of the image:
|
||||
### Memory usage
|
||||
|
||||
The text encoders and UNet or DiT can use up as much as ~30GB of memory, exceeding the amount available on many free-tier or consumer GPUs.
|
||||
|
||||
Offloading stores weights that aren't currently used on the CPU and only moves them to the GPU when they're needed. There are a few offloading types and the example below uses [model offloading](./optimization/memory#model-offloading). This moves an entire model, like a text encoder or transformer, to the CPU when it isn't actively being used.
|
||||
|
||||
Call [`~DiffusionPipeline.enable_model_cpu_offload`] to activate it. By combining quantization and offloading, the following example only requires ~12.54GB of memory.
|
||||
|
||||
```py
|
||||
>>> import torch
|
||||
import torch
|
||||
from diffusers import DiffusionPipeline
|
||||
from diffusers.quantizers import PipelineQuantizationConfig
|
||||
|
||||
>>> torch.manual_seed(0)
|
||||
quant_config = PipelineQuantizationConfig(
|
||||
quant_backend="bitsandbytes_4bit",
|
||||
quant_kwargs={"load_in_4bit": True, "bnb_4bit_quant_type": "nf4", "bnb_4bit_compute_dtype": torch.bfloat16},
|
||||
components_to_quantize=["transformer", "text_encoder"],
|
||||
)
|
||||
pipeline = DiffusionPipeline.from_pretrained(
|
||||
"Qwen/Qwen-Image",
|
||||
torch_dtype=torch.bfloat16,
|
||||
quantization_config=quant_config,
|
||||
device_map="cuda"
|
||||
)
|
||||
pipeline.enable_model_cpu_offload()
|
||||
|
||||
>>> noisy_sample = torch.randn(1, model.config.in_channels, model.config.sample_size, model.config.sample_size)
|
||||
>>> noisy_sample.shape
|
||||
torch.Size([1, 3, 256, 256])
|
||||
prompt = """
|
||||
cinematic film still of a cat sipping a margarita in a pool in Palm Springs, California
|
||||
highly detailed, high budget hollywood movie, cinemascope, moody, epic, gorgeous, film grain
|
||||
"""
|
||||
pipeline(prompt).images[0]
|
||||
print(f"Max memory reserved: {torch.cuda.max_memory_allocated() / 1024**3:.2f} GB")
|
||||
```
|
||||
|
||||
For inference, pass the noisy image and a `timestep` to the model. The `timestep` indicates how noisy the input image is, with more noise at the beginning and less at the end. This helps the model determine its position in the diffusion process, whether it is closer to the start or the end. Access the `sample` attribute to get the model output:
|
||||
Refer to the [Reduce memory usage](./optimization/memory) docs to learn more about other memory reducing techniques.
|
||||
|
||||
### Inference speed
|
||||
|
||||
The denoising loop performs a lot of computations and can be slow. Methods like [torch.compile](./optimization/fp16#torchcompile) increase inference speed by compiling the computations into an optimized kernel. Compilation is slow for the first generation but successive generations should be much faster.
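As a minimal sketch of compiling the full transformer directly with `torch.compile` (the compile settings here are illustrative rather than prescriptive):

```py
import torch
from diffusers import DiffusionPipeline

pipeline = DiffusionPipeline.from_pretrained(
    "Qwen/Qwen-Image", torch_dtype=torch.bfloat16, device_map="cuda"
)
# compile the entire transformer; the first call is slow while the kernels are built
pipeline.transformer = torch.compile(pipeline.transformer, fullgraph=True)

pipeline("cinematic film still of a cat sipping a margarita in a pool in Palm Springs").images[0]
```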
|
||||
|
||||
The example below uses [regional compilation](./optimization/fp16#regional-compilation) to only compile small regions of a model. It reduces cold-start latency while also providing a runtime speed up.
|
||||
|
||||
Call [`~ModelMixin.compile_repeated_blocks`] on the model to activate it.
|
||||
|
||||
```py
|
||||
>>> with torch.no_grad():
|
||||
... noisy_residual = model(sample=noisy_sample, timestep=2).sample
|
||||
import torch
|
||||
from diffusers import DiffusionPipeline
|
||||
|
||||
pipeline = DiffusionPipeline.from_pretrained(
|
||||
"Qwen/Qwen-Image", torch_dtype=torch.bfloat16, device_map="cuda"
|
||||
)
|
||||
|
||||
pipeline.transformer.compile_repeated_blocks(
|
||||
fullgraph=True,
|
||||
)
|
||||
prompt = """
|
||||
cinematic film still of a cat sipping a margarita in a pool in Palm Springs, California
|
||||
highly detailed, high budget hollywood movie, cinemascope, moody, epic, gorgeous, film grain
|
||||
"""
|
||||
pipeline(prompt).images[0]
|
||||
```
|
||||
|
||||
To generate actual examples though, you'll need a scheduler to guide the denoising process. In the next section, you'll learn how to couple a model with a scheduler.
|
||||
|
||||
## Schedulers
|
||||
|
||||
Schedulers manage going from a noisy sample to a less noisy sample given the model output - in this case, it is the `noisy_residual`.
|
||||
|
||||
<Tip>
|
||||
|
||||
🧨 Diffusers is a toolbox for building diffusion systems. While the [`DiffusionPipeline`] is a convenient way to get started with a pre-built diffusion system, you can also choose your own model and scheduler components separately to build a custom diffusion system.
|
||||
|
||||
</Tip>
|
||||
|
||||
For the quicktour, you'll instantiate the [`DDPMScheduler`] with its [`~diffusers.ConfigMixin.from_config`] method:
|
||||
|
||||
```py
|
||||
>>> from diffusers import DDPMScheduler
|
||||
|
||||
>>> scheduler = DDPMScheduler.from_pretrained(repo_id)
|
||||
>>> scheduler
|
||||
DDPMScheduler {
|
||||
"_class_name": "DDPMScheduler",
|
||||
"_diffusers_version": "0.21.4",
|
||||
"beta_end": 0.02,
|
||||
"beta_schedule": "linear",
|
||||
"beta_start": 0.0001,
|
||||
"clip_sample": true,
|
||||
"clip_sample_range": 1.0,
|
||||
"dynamic_thresholding_ratio": 0.995,
|
||||
"num_train_timesteps": 1000,
|
||||
"prediction_type": "epsilon",
|
||||
"sample_max_value": 1.0,
|
||||
"steps_offset": 0,
|
||||
"thresholding": false,
|
||||
"timestep_spacing": "leading",
|
||||
"trained_betas": null,
|
||||
"variance_type": "fixed_small"
|
||||
}
|
||||
```
|
||||
|
||||
<Tip>
|
||||
|
||||
💡 Unlike a model, a scheduler does not have trainable weights and is parameter-free!
|
||||
|
||||
</Tip>
|
||||
|
||||
Some of the most important parameters are:
|
||||
|
||||
* `num_train_timesteps`: the length of the denoising process or, in other words, the number of timesteps required to process random Gaussian noise into a data sample.
|
||||
* `beta_schedule`: the type of noise schedule to use for inference and training.
|
||||
* `beta_start` and `beta_end`: the start and end noise values for the noise schedule.
|
||||
|
||||
To predict a slightly less noisy image, pass the following to the scheduler's [`~diffusers.DDPMScheduler.step`] method: model output, `timestep`, and current `sample`.
|
||||
|
||||
```py
|
||||
>>> less_noisy_sample = scheduler.step(model_output=noisy_residual, timestep=2, sample=noisy_sample).prev_sample
|
||||
>>> less_noisy_sample.shape
|
||||
torch.Size([1, 3, 256, 256])
|
||||
```
|
||||
|
||||
The `less_noisy_sample` can be passed to the next `timestep` where it'll get even less noisy! Let's bring it all together now and visualize the entire denoising process.
|
||||
|
||||
First, create a function that postprocesses and displays the denoised image as a `PIL.Image`:
|
||||
|
||||
```py
|
||||
>>> import PIL.Image
|
||||
>>> import numpy as np
|
||||
|
||||
|
||||
>>> def display_sample(sample, i):
|
||||
... image_processed = sample.cpu().permute(0, 2, 3, 1)
|
||||
... image_processed = (image_processed + 1.0) * 127.5
|
||||
... image_processed = image_processed.numpy().astype(np.uint8)
|
||||
|
||||
... image_pil = PIL.Image.fromarray(image_processed[0])
|
||||
... display(f"Image at step {i}")
|
||||
... display(image_pil)
|
||||
```
|
||||
|
||||
To speed up the denoising process, move the input and model to a GPU:
|
||||
|
||||
```py
|
||||
>>> model.to("cuda")
|
||||
>>> noisy_sample = noisy_sample.to("cuda")
|
||||
```
|
||||
|
||||
Now create a denoising loop that predicts the residual of the less noisy sample, and computes the less noisy sample with the scheduler:
|
||||
|
||||
```py
|
||||
>>> import tqdm
|
||||
|
||||
>>> sample = noisy_sample
|
||||
|
||||
>>> for i, t in enumerate(tqdm.tqdm(scheduler.timesteps)):
|
||||
... # 1. predict noise residual
|
||||
... with torch.no_grad():
|
||||
... residual = model(sample, t).sample
|
||||
|
||||
... # 2. compute less noisy image and set x_t -> x_t-1
|
||||
... sample = scheduler.step(residual, t, sample).prev_sample
|
||||
|
||||
... # 3. optionally look at image
|
||||
... if (i + 1) % 50 == 0:
|
||||
... display_sample(sample, i + 1)
|
||||
```
|
||||
|
||||
Sit back and watch as a cat is generated from nothing but noise! 😻
|
||||
|
||||
<div class="flex justify-center">
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/diffusion-quicktour.png"/>
|
||||
</div>
|
||||
|
||||
## Next steps
|
||||
|
||||
Hopefully, you generated some cool images with 🧨 Diffusers in this quicktour! For your next steps, you can:
|
||||
|
||||
* Train or finetune a model to generate your own images in the [training](./tutorials/basic_training) tutorial.
|
||||
* See example official and community [training or finetuning scripts](https://github.com/huggingface/diffusers/tree/main/examples#-diffusers-examples) for a variety of use cases.
|
||||
* Learn more about loading, accessing, changing, and comparing schedulers in the [Using different Schedulers](./using-diffusers/schedulers) guide.
|
||||
* Explore prompt engineering, speed and memory optimizations, and tips and tricks for generating higher-quality images with the [Stable Diffusion](./stable_diffusion) guide.
|
||||
* Dive deeper into speeding up 🧨 Diffusers with guides on [optimized PyTorch on a GPU](./optimization/fp16), and inference guides for running [Stable Diffusion on Apple Silicon (M1/M2)](./optimization/mps) and [ONNX Runtime](./optimization/onnx).
|
||||
Check out the [Accelerate inference](./optimization/fp16) or [Caching](./optimization/cache) docs for more methods that speed up inference.
|
||||
@@ -10,252 +10,123 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# Effective and efficient diffusion
|
||||
|
||||
[[open-in-colab]]
|
||||
|
||||
Getting the [`DiffusionPipeline`] to generate images in a certain style or include what you want can be tricky. Often times, you have to run the [`DiffusionPipeline`] several times before you end up with an image you're happy with. But generating something out of nothing is a computationally intensive process, especially if you're running inference over and over again.
|
||||
# Basic performance
|
||||
|
||||
This is why it's important to get the most *computational* (speed) and *memory* (GPU vRAM) efficiency from the pipeline to reduce the time between inference cycles so you can iterate faster.
|
||||
Diffusion is a random process that is computationally demanding. You may need to run the [`DiffusionPipeline`] several times before getting a desired output. That's why it's important to carefully balance generation speed and memory usage in order to iterate faster.
|
||||
|
||||
This tutorial walks you through how to generate faster and better with the [`DiffusionPipeline`].
|
||||
This guide recommends some basic performance tips for using the [`DiffusionPipeline`]. Refer to the Inference Optimization docs, such as [Accelerate inference](./optimization/fp16) or [Reduce memory usage](./optimization/memory), for more detailed performance guides.
|
||||
|
||||
Begin by loading the [`stable-diffusion-v1-5/stable-diffusion-v1-5`](https://huggingface.co/stable-diffusion-v1-5/stable-diffusion-v1-5) model:
|
||||
## Memory usage
|
||||
|
||||
```python
|
||||
Reducing the amount of memory used indirectly speeds up generation and can help a model fit on device.
|
||||
|
||||
The [`~DiffusionPipeline.enable_model_cpu_offload`] method moves a model to the CPU when it is not in use to save GPU memory.
|
||||
|
||||
```py
|
||||
import torch
|
||||
from diffusers import DiffusionPipeline
|
||||
|
||||
model_id = "stable-diffusion-v1-5/stable-diffusion-v1-5"
|
||||
pipeline = DiffusionPipeline.from_pretrained(model_id, use_safetensors=True)
|
||||
pipeline = DiffusionPipeline.from_pretrained(
|
||||
"stabilityai/stable-diffusion-xl-base-1.0",
|
||||
torch_dtype=torch.bfloat16,
|
||||
device_map="cuda"
|
||||
)
|
||||
pipeline.enable_model_cpu_offload()
|
||||
|
||||
prompt = """
|
||||
cinematic film still of a cat sipping a margarita in a pool in Palm Springs, California
|
||||
highly detailed, high budget hollywood movie, cinemascope, moody, epic, gorgeous, film grain
|
||||
"""
|
||||
pipeline(prompt).images[0]
|
||||
print(f"Max memory reserved: {torch.cuda.max_memory_allocated() / 1024**3:.2f} GB")
|
||||
```
|
||||
|
||||
The example prompt you'll use is a portrait of an old warrior chief, but feel free to use your own prompt:
|
||||
## Inference speed
|
||||
|
||||
```python
|
||||
prompt = "portrait photo of a old warrior chief"
|
||||
```
|
||||
Denoising is the most computationally demanding process during diffusion. Methods that optimize this process accelerate inference speed. Try the following methods for a speedup.
|
||||
|
||||
## Speed
|
||||
- Add `device_map="cuda"` to place the pipeline on a GPU. Placing a model on an accelerator, like a GPU, increases speed because it performs computations in parallel.
|
||||
- Set `torch_dtype=torch.bfloat16` to execute the pipeline in half-precision. Reducing the data type precision increases speed because it takes less time to perform computations in a lower precision.
|
||||
|
||||
<Tip>
|
||||
|
||||
💡 If you don't have access to a GPU, you can use one for free from a GPU provider like [Colab](https://colab.research.google.com/)!
|
||||
|
||||
</Tip>
|
||||
|
||||
One of the simplest ways to speed up inference is to place the pipeline on a GPU the same way you would with any PyTorch module:
|
||||
|
||||
```python
|
||||
pipeline = pipeline.to("cuda")
|
||||
```
|
||||
|
||||
To make sure you can use the same image and improve on it, use a [`Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) and set a seed for [reproducibility](./using-diffusers/reusing_seeds):
|
||||
|
||||
```python
|
||||
```py
|
||||
import torch
|
||||
import time
|
||||
from diffusers import DiffusionPipeline, DPMSolverMultistepScheduler
|
||||
|
||||
generator = torch.Generator("cuda").manual_seed(0)
|
||||
pipeline = DiffusionPipeline.from_pretrained(
|
||||
"stabilityai/stable-diffusion-xl-base-1.0",
|
||||
torch_dtype=torch.bfloat16,
|
||||
device_map="cuda
|
||||
)
|
||||
```
|
||||
|
||||
Now you can generate an image:
|
||||
|
||||
```python
|
||||
image = pipeline(prompt, generator=generator).images[0]
|
||||
image
|
||||
```
|
||||
|
||||
<div class="flex justify-center">
|
||||
<img src="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/stable_diffusion_101/sd_101_1.png">
|
||||
</div>
|
||||
|
||||
This process took ~30 seconds on a T4 GPU (it might be faster if your allocated GPU is better than a T4). By default, the [`DiffusionPipeline`] runs inference with full `float32` precision for 50 inference steps. You can speed this up by switching to a lower precision like `float16` or running fewer inference steps.
|
||||
|
||||
Let's start by loading the model in `float16` and generate an image:
|
||||
|
||||
```python
|
||||
import torch
|
||||
|
||||
pipeline = DiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16, use_safetensors=True)
|
||||
pipeline = pipeline.to("cuda")
|
||||
generator = torch.Generator("cuda").manual_seed(0)
|
||||
image = pipeline(prompt, generator=generator).images[0]
|
||||
image
|
||||
```
|
||||
|
||||
<div class="flex justify-center">
|
||||
<img src="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/stable_diffusion_101/sd_101_2.png">
|
||||
</div>
|
||||
|
||||
This time, it only took ~11 seconds to generate the image, which is almost 3x faster than before!
|
||||
|
||||
<Tip>
|
||||
|
||||
💡 We strongly suggest always running your pipelines in `float16`, and so far, we've rarely seen any degradation in output quality.
|
||||
|
||||
</Tip>
|
||||
|
||||
Another option is to reduce the number of inference steps. Choosing a more efficient scheduler could help decrease the number of steps without sacrificing output quality. You can find which schedulers are compatible with the current model in the [`DiffusionPipeline`] by checking the `compatibles` attribute:
|
||||
|
||||
```python
|
||||
pipeline.scheduler.compatibles
|
||||
[
|
||||
diffusers.schedulers.scheduling_lms_discrete.LMSDiscreteScheduler,
|
||||
diffusers.schedulers.scheduling_unipc_multistep.UniPCMultistepScheduler,
|
||||
diffusers.schedulers.scheduling_k_dpm_2_discrete.KDPM2DiscreteScheduler,
|
||||
diffusers.schedulers.scheduling_deis_multistep.DEISMultistepScheduler,
|
||||
diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler,
|
||||
diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler,
|
||||
diffusers.schedulers.scheduling_ddpm.DDPMScheduler,
|
||||
diffusers.schedulers.scheduling_dpmsolver_singlestep.DPMSolverSinglestepScheduler,
|
||||
diffusers.schedulers.scheduling_k_dpm_2_ancestral_discrete.KDPM2AncestralDiscreteScheduler,
|
||||
diffusers.utils.dummy_torch_and_torchsde_objects.DPMSolverSDEScheduler,
|
||||
diffusers.schedulers.scheduling_heun_discrete.HeunDiscreteScheduler,
|
||||
diffusers.schedulers.scheduling_pndm.PNDMScheduler,
|
||||
diffusers.schedulers.scheduling_euler_ancestral_discrete.EulerAncestralDiscreteScheduler,
|
||||
diffusers.schedulers.scheduling_ddim.DDIMScheduler,
|
||||
]
|
||||
```
|
||||
|
||||
The Stable Diffusion model uses the [`PNDMScheduler`] by default which usually requires ~50 inference steps, but more performant schedulers like [`DPMSolverMultistepScheduler`], require only ~20 or 25 inference steps. Use the [`~ConfigMixin.from_config`] method to load a new scheduler:
|
||||
|
||||
```python
|
||||
from diffusers import DPMSolverMultistepScheduler
|
||||
- Use a faster scheduler, such as [`DPMSolverMultistepScheduler`], which only requires ~20-25 steps.
|
||||
- Set `num_inference_steps` to a lower value. Reducing the number of inference steps reduces the overall number of computations. However, this can result in lower generation quality.
|
||||
|
||||
```py
|
||||
pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config)
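# DPMSolverMultistepScheduler usually needs only ~20-25 steps instead of the default 50;
# you can also pass num_inference_steps=25 in the pipeline call below to cut computation further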
|
||||
|
||||
prompt = """
|
||||
cinematic film still of a cat sipping a margarita in a pool in Palm Springs, California
|
||||
highly detailed, high budget hollywood movie, cinemascope, moody, epic, gorgeous, film grain
|
||||
"""
|
||||
|
||||
start_time = time.perf_counter()
|
||||
image = pipeline(prompt).images[0]
|
||||
end_time = time.perf_counter()
|
||||
|
||||
print(f"Image generation took {end_time - start_time:.3f} seconds")
|
||||
```
|
||||
|
||||
Now set the `num_inference_steps` to 20:
|
||||
## Generation quality
|
||||
|
||||
```python
|
||||
generator = torch.Generator("cuda").manual_seed(0)
|
||||
image = pipeline(prompt, generator=generator, num_inference_steps=20).images[0]
|
||||
image
|
||||
```
|
||||
Many modern diffusion models deliver high-quality images out-of-the-box. However, you can still improve generation quality by trying the following.
|
||||
|
||||
<div class="flex justify-center">
|
||||
<img src="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/stable_diffusion_101/sd_101_3.png">
|
||||
</div>
|
||||
- Try a more detailed and descriptive prompt. Include details such as the image medium, subject, style, and aesthetic. A negative prompt may also help by guiding a model away from undesirable features with words like *low quality* or *blurry*.
|
||||
|
||||
Great, you've managed to cut the inference time to just 4 seconds! ⚡️
|
||||
```py
|
||||
import torch
|
||||
from diffusers import DiffusionPipeline
|
||||
|
||||
## Memory
|
||||
pipeline = DiffusionPipeline.from_pretrained(
|
||||
"stabilityai/stable-diffusion-xl-base-1.0",
|
||||
torch_dtype=torch.bfloat16,
|
||||
device_map="cuda"
|
||||
)
|
||||
|
||||
The other key to improving pipeline performance is consuming less memory, which indirectly implies more speed, since you're often trying to maximize the number of images generated per second. The easiest way to see how many images you can generate at once is to try out different batch sizes until you get an `OutOfMemoryError` (OOM).
|
||||
prompt = """
|
||||
cinematic film still of a cat sipping a margarita in a pool in Palm Springs, California
|
||||
highly detailed, high budget hollywood movie, cinemascope, moody, epic, gorgeous, film grain
|
||||
"""
|
||||
negative_prompt = "low quality, blurry, ugly, poor details"
|
||||
pipeline(prompt, negative_prompt=negative_prompt).images[0]
|
||||
```
|
||||
|
||||
Create a function that'll generate a batch of images from a list of prompts and `Generators`. Make sure to assign each `Generator` a seed so you can reuse it if it produces a good result.
|
||||
For more details about creating better prompts, take a look at the [Prompt techniques](./using-diffusers/weighted_prompts) doc.
|
||||
|
||||
```python
|
||||
def get_inputs(batch_size=1):
|
||||
generator = [torch.Generator("cuda").manual_seed(i) for i in range(batch_size)]
|
||||
prompts = batch_size * [prompt]
|
||||
num_inference_steps = 20
|
||||
- Try a different scheduler, like [`HeunDiscreteScheduler`] or [`LMSDiscreteScheduler`], that trades generation speed for quality.
|
||||
|
||||
return {"prompt": prompts, "generator": generator, "num_inference_steps": num_inference_steps}
|
||||
```
|
||||
```py
|
||||
import torch
|
||||
from diffusers import DiffusionPipeline, HeunDiscreteScheduler
|
||||
|
||||
Start with `batch_size=4` and see how much memory you've consumed:
|
||||
pipeline = DiffusionPipeline.from_pretrained(
|
||||
"stabilityai/stable-diffusion-xl-base-1.0",
|
||||
torch_dtype=torch.bfloat16,
|
||||
device_map="cuda"
|
||||
)
|
||||
pipeline.scheduler = HeunDiscreteScheduler.from_config(pipeline.scheduler.config)
|
||||
|
||||
```python
|
||||
from diffusers.utils import make_image_grid
|
||||
|
||||
images = pipeline(**get_inputs(batch_size=4)).images
|
||||
make_image_grid(images, 2, 2)
|
||||
```
|
||||
|
||||
Unless you have a GPU with more vRAM, the code above probably returned an `OOM` error! Most of the memory is taken up by the cross-attention layers. Instead of running this operation in a batch, you can run it sequentially to save a significant amount of memory. All you have to do is configure the pipeline to use the [`~DiffusionPipeline.enable_attention_slicing`] function:
|
||||
|
||||
```python
|
||||
pipeline.enable_attention_slicing()
|
||||
```
|
||||
|
||||
Now try increasing the `batch_size` to 8!
|
||||
|
||||
```python
|
||||
images = pipeline(**get_inputs(batch_size=8)).images
|
||||
make_image_grid(images, rows=2, cols=4)
|
||||
```
|
||||
|
||||
<div class="flex justify-center">
|
||||
<img src="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/stable_diffusion_101/sd_101_5.png">
|
||||
</div>
|
||||
|
||||
Whereas before you couldn't even generate a batch of 4 images, now you can generate a batch of 8 images at ~3.5 seconds per image! This is probably the fastest you can go on a T4 GPU without sacrificing quality.
|
||||
|
||||
## Quality
|
||||
|
||||
In the last two sections, you learned how to optimize the speed of your pipeline by using `fp16`, reducing the number of inference steps by using a more performant scheduler, and enabling attention slicing to reduce memory consumption. Now you're going to focus on how to improve the quality of generated images.
|
||||
|
||||
### Better checkpoints
|
||||
|
||||
The most obvious step is to use better checkpoints. The Stable Diffusion model is a good starting point, and since its official launch, several improved versions have also been released. However, using a newer version doesn't automatically mean you'll get better results. You'll still have to experiment with different checkpoints yourself, and do a little research (such as using [negative prompts](https://minimaxir.com/2022/11/stable-diffusion-negative-prompt/)) to get the best results.
|
||||
|
||||
As the field grows, there are more and more high-quality checkpoints finetuned to produce certain styles. Try exploring the [Hub](https://huggingface.co/models?library=diffusers&sort=downloads) and [Diffusers Gallery](https://huggingface.co/spaces/huggingface-projects/diffusers-gallery) to find one you're interested in!
|
||||
|
||||
### Better pipeline components
|
||||
|
||||
You can also try replacing the current pipeline components with a newer version. Let's try loading the latest [autoencoder](https://huggingface.co/stabilityai/stable-diffusion-2-1/tree/main/vae) from Stability AI into the pipeline, and generate some images:
|
||||
|
||||
```python
|
||||
from diffusers import AutoencoderKL
|
||||
|
||||
vae = AutoencoderKL.from_pretrained("stabilityai/sd-vae-ft-mse", torch_dtype=torch.float16).to("cuda")
|
||||
pipeline.vae = vae
|
||||
images = pipeline(**get_inputs(batch_size=8)).images
|
||||
make_image_grid(images, rows=2, cols=4)
|
||||
```
|
||||
|
||||
<div class="flex justify-center">
|
||||
<img src="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/stable_diffusion_101/sd_101_6.png">
|
||||
</div>
|
||||
|
||||
### Better prompt engineering
|
||||
|
||||
The text prompt you use to generate an image is super important, so much so that it is called *prompt engineering*. Some considerations to keep in mind during prompt engineering are:
|
||||
|
||||
- How are images like the one I want to generate described or tagged on the internet?
|
||||
- What additional detail can I give that steers the model towards the style I want?
|
||||
|
||||
With this in mind, let's improve the prompt to include color and higher quality details:
|
||||
|
||||
```python
|
||||
prompt += ", tribal panther make up, blue on red, side profile, looking away, serious eyes"
|
||||
prompt += " 50mm portrait photography, hard rim lighting photography--beta --ar 2:3 --beta --upbeta"
|
||||
```
|
||||
|
||||
Generate a batch of images with the new prompt:
|
||||
|
||||
```python
|
||||
images = pipeline(**get_inputs(batch_size=8)).images
|
||||
make_image_grid(images, rows=2, cols=4)
|
||||
```
|
||||
|
||||
<div class="flex justify-center">
|
||||
<img src="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/stable_diffusion_101/sd_101_7.png">
|
||||
</div>
|
||||
|
||||
Pretty impressive! Let's tweak the second image - corresponding to the `Generator` with a seed of `1` - a bit more by adding some text about the age of the subject:
|
||||
|
||||
```python
|
||||
prompts = [
|
||||
"portrait photo of the oldest warrior chief, tribal panther make up, blue on red, side profile, looking away, serious eyes 50mm portrait photography, hard rim lighting photography--beta --ar 2:3 --beta --upbeta",
|
||||
"portrait photo of an old warrior chief, tribal panther make up, blue on red, side profile, looking away, serious eyes 50mm portrait photography, hard rim lighting photography--beta --ar 2:3 --beta --upbeta",
|
||||
"portrait photo of a warrior chief, tribal panther make up, blue on red, side profile, looking away, serious eyes 50mm portrait photography, hard rim lighting photography--beta --ar 2:3 --beta --upbeta",
|
||||
"portrait photo of a young warrior chief, tribal panther make up, blue on red, side profile, looking away, serious eyes 50mm portrait photography, hard rim lighting photography--beta --ar 2:3 --beta --upbeta",
|
||||
]
|
||||
|
||||
generator = [torch.Generator("cuda").manual_seed(1) for _ in range(len(prompts))]
|
||||
images = pipeline(prompt=prompts, generator=generator, num_inference_steps=25).images
|
||||
make_image_grid(images, 2, 2)
|
||||
```
|
||||
|
||||
<div class="flex justify-center">
|
||||
<img src="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/stable_diffusion_101/sd_101_8.png">
|
||||
</div>
|
||||
prompt = """
|
||||
cinematic film still of a cat sipping a margarita in a pool in Palm Springs, California
|
||||
highly detailed, high budget hollywood movie, cinemascope, moody, epic, gorgeous, film grain
|
||||
"""
|
||||
negative_prompt = "low quality, blurry, ugly, poor details"
|
||||
pipeline(prompt, negative_prompt=negative_prompt).images[0]
|
||||
```
|
||||
|
||||
## Next steps
|
||||
|
||||
In this tutorial, you learned how to optimize a [`DiffusionPipeline`] for computational and memory efficiency as well as improving the quality of generated outputs. If you're interested in making your pipeline even faster, take a look at the following resources:
|
||||
|
||||
- Learn how [PyTorch 2.0](./optimization/fp16) and [`torch.compile`](https://pytorch.org/docs/stable/generated/torch.compile.html) can yield 5 - 300% faster inference speed. On an A100 GPU, inference can be up to 50% faster!
|
||||
- If you can't use PyTorch 2, we recommend you install [xFormers](./optimization/xformers). Its memory-efficient attention mechanism works great with PyTorch 1.13.1 for faster speed and reduced memory consumption.
|
||||
- Other optimization techniques, such as model offloading, are covered in [this guide](./optimization/fp16).
|
||||
Diffusers offers more advanced and powerful optimizations such as [group-offloading](./optimization/memory#group-offloading) and [regional compilation](./optimization/fp16#regional-compilation). To learn more about how to maximize performance, take a look at the Inference Optimization section.
|
||||
@@ -145,10 +145,10 @@ When running `accelerate config`, if you use torch.compile, there can be dramati
|
||||
If you would like to push your model to the Hub after training is completed with a neat model card, make sure you're logged in:
|
||||
|
||||
```bash
|
||||
huggingface-cli login
|
||||
hf auth login
|
||||
|
||||
# Alternatively, you could upload your model manually using:
|
||||
# huggingface-cli upload my-cool-account-name/my-cool-lora-name /path/to/awesome/lora
|
||||
# hf upload my-cool-account-name/my-cool-lora-name /path/to/awesome/lora
|
||||
```
|
||||
|
||||
Make sure your data is prepared as described in [Data Preparation](#data-preparation). When ready, you can begin training!
|
||||
|
||||
@@ -67,7 +67,7 @@ dataset = load_dataset(
|
||||
Then use the [`~datasets.Dataset.push_to_hub`] method to upload the dataset to the Hub:
|
||||
|
||||
```python
|
||||
# assuming you have run the huggingface-cli login command in a terminal
|
||||
# assuming you have run the hf auth login command in a terminal
|
||||
dataset.push_to_hub("name_of_your_dataset")
|
||||
|
||||
# if you want to push to a private repo, simply pass private=True:
|
||||
|
||||
@@ -42,7 +42,7 @@ We encourage you to share your model with the community, and in order to do that
|
||||
Or login in from the terminal:
|
||||
|
||||
```bash
|
||||
huggingface-cli login
|
||||
hf auth login
|
||||
```
|
||||
|
||||
Since the model checkpoints are quite large, install [Git-LFS](https://git-lfs.com/) to version these large files:
|
||||
|
||||
@@ -1,23 +0,0 @@
|
||||
<!--Copyright 2025 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# Overview
|
||||
|
||||
Welcome to 🧨 Diffusers! If you're new to diffusion models and generative AI, and want to learn more, then you've come to the right place. These beginner-friendly tutorials are designed to provide a gentle introduction to diffusion models and help you understand the library fundamentals - the core components and how 🧨 Diffusers is meant to be used.
|
||||
|
||||
You'll learn how to use a pipeline for inference to rapidly generate things, and then deconstruct that pipeline to really understand how to use the library as a modular toolbox for building your own diffusion systems. In the next lesson, you'll learn how to train your own diffusion model to generate what you want.
|
||||
|
||||
After completing the tutorials, you'll have gained the necessary skills to start exploring the library on your own and see how to use it for your own projects and applications.
|
||||
|
||||
Feel free to join our community on [Discord](https://discord.com/invite/JfAtkvEtRb) or the [forums](https://discuss.huggingface.co/c/discussion-related-to-httpsgithubcomhuggingfacediffusers/63) to connect and collaborate with other users and developers!
|
||||
|
||||
Let's start diffusing! 🧨
|
||||
@@ -319,6 +319,19 @@ If you expect to varied resolutions during inference with this feature, then mak
|
||||
|
||||
There are still scenarios where recompilation is unavoidable, such as when the hotswapped LoRA targets more layers than the initial adapter. Try to load the LoRA that targets the most layers *first*. For more details about this limitation, refer to the PEFT [hotswapping](https://huggingface.co/docs/peft/main/en/package_reference/hotswap#peft.utils.hotswap.hotswap_adapter) docs.
|
||||
|
||||
<details>
|
||||
<summary>Technical details of hotswapping</summary>
|
||||
|
||||
The [`~loaders.lora_base.LoraBaseMixin.enable_lora_hotswap`] method converts the LoRA scaling factor from floats to torch.tensors and pads the shape of the weights to the largest required shape to avoid reassigning the whole attribute when the data in the weights are replaced.
|
||||
|
||||
This is why the `max_rank` argument is important. The results are unchanged even when the values are padded with zeros. Computation may be slower though depending on the padding size.
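A minimal PyTorch sketch (not the library's implementation) of why zero-padding the LoRA factors leaves the result unchanged:

```py
import torch

x = torch.randn(1, 16)

# a rank-4 LoRA update
lora_A = torch.randn(16, 4)
lora_B = torch.randn(4, 16)

# pad both factors with zeros up to a larger max rank (8 here)
lora_A_padded = torch.cat([lora_A, torch.zeros(16, 4)], dim=1)
lora_B_padded = torch.cat([lora_B, torch.zeros(4, 16)], dim=0)

# the padded columns/rows contribute nothing, so the update is identical
assert torch.allclose(x @ lora_A @ lora_B, x @ lora_A_padded @ lora_B_padded, atol=1e-5)
```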
|
||||
|
||||
Since no new LoRA attributes are added, each subsequent LoRA is only allowed to target the same layers, or subset of layers, the first LoRA targets. Choosing the LoRA loading order is important because if the LoRAs target disjoint layers, you may end up creating a dummy LoRA that targets the union of all target layers.
|
||||
|
||||
For more implementation details, take a look at the [`hotswap.py`](https://github.com/huggingface/peft/blob/92d65cafa51c829484ad3d95cf71d09de57ff066/src/peft/utils/hotswap.py) file.
|
||||
|
||||
</details>
|
||||
|
||||
## Merge
|
||||
|
||||
The weights from each LoRA can be merged together to produce a blend of multiple existing styles. There are several methods for merging LoRAs, each of which differs in *how* the weights are merged (which may affect generation quality).
|
||||
@@ -673,4 +686,6 @@ Browse the [LoRA Studio](https://lorastudio.co/models) for different LoRAs to us
|
||||
height="450"
|
||||
></iframe>
|
||||
|
||||
You can find additional LoRAs in the [FLUX LoRA the Explorer](https://huggingface.co/spaces/multimodalart/flux-lora-the-explorer) and [LoRA the Explorer](https://huggingface.co/spaces/multimodalart/LoraTheExplorer) Spaces.
|
||||
|
||||
Check out the [Fast LoRA inference for Flux with Diffusers and PEFT](https://huggingface.co/blog/lora-fast) blog post to learn how to optimize LoRA inference with methods like FlashAttention-3 and fp8 quantization.
|
||||
264
docs/source/en/using-diffusers/batched_inference.md
Normal file
@@ -0,0 +1,264 @@
|
||||
<!--Copyright 2025 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# Batch inference
|
||||
|
||||
Batch inference processes multiple prompts at a time to increase throughput. It is more efficient because processing multiple prompts at once maximizes GPU usage versus processing a single prompt and underutilizing the GPU.
|
||||
|
||||
The downside is increased latency because you must wait for the entire batch to complete, and more GPU memory is required for large batches.
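One rough way to see the throughput benefit is to time a single batched call against sequential calls. The sketch below assumes a `pipeline` loaded as in the examples that follow, and the prompts are placeholders:

```py
import time

prompts = [
    "a photo of a cat",
    "a photo of a dog",
    "a photo of a bird",
]

start = time.perf_counter()
pipeline(prompt=prompts).images            # one batched call
batched = time.perf_counter() - start

start = time.perf_counter()
for prompt in prompts:
    pipeline(prompt=prompt).images         # one call per prompt
sequential = time.perf_counter() - start

print(f"batched: {len(prompts) / batched:.2f} images/s")
print(f"sequential: {len(prompts) / sequential:.2f} images/s")
```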
|
||||
|
||||
<hfoptions id="usage">
|
||||
<hfoption id="text-to-image">
|
||||
|
||||
For text-to-image, pass a list of prompts to the pipeline.
|
||||
|
||||
```py
|
||||
import torch
import matplotlib.pyplot as plt
|
||||
from diffusers import DiffusionPipeline
|
||||
|
||||
pipeline = DiffusionPipeline.from_pretrained(
|
||||
"stabilityai/stable-diffusion-xl-base-1.0",
|
||||
torch_dtype=torch.float16
|
||||
).to("cuda")
|
||||
|
||||
prompts = [
|
||||
"cinematic photo of A beautiful sunset over mountains, 35mm photograph, film, professional, 4k, highly detailed",
|
||||
"cinematic film still of a cat basking in the sun on a roof in Turkey, highly detailed, high budget hollywood movie, cinemascope, moody, epic, gorgeous, film grain",
|
||||
"pixel-art a cozy coffee shop interior, low-res, blocky, pixel art style, 8-bit graphics"
|
||||
]
|
||||
|
||||
images = pipeline(
|
||||
prompt=prompts,
|
||||
).images
|
||||
|
||||
fig, axes = plt.subplots(2, 2, figsize=(12, 12))
|
||||
axes = axes.flatten()
|
||||
|
||||
for i, image in enumerate(images):
|
||||
axes[i].imshow(image)
|
||||
axes[i].set_title(f"Image {i+1}")
|
||||
axes[i].axis('off')
|
||||
|
||||
plt.tight_layout()
|
||||
plt.show()
|
||||
```
|
||||
|
||||
To generate multiple variations of one prompt, use the `num_images_per_prompt` argument.
|
||||
|
||||
```py
|
||||
import torch
|
||||
import matplotlib.pyplot as plt
|
||||
from diffusers import DiffusionPipeline
|
||||
|
||||
pipeline = DiffusionPipeline.from_pretrained(
|
||||
"stabilityai/stable-diffusion-xl-base-1.0",
|
||||
torch_dtype=torch.float16
|
||||
).to("cuda")
|
||||
|
||||
images = pipeline(
|
||||
prompt="pixel-art a cozy coffee shop interior, low-res, blocky, pixel art style, 8-bit graphics",
|
||||
num_images_per_prompt=4
|
||||
).images
|
||||
|
||||
fig, axes = plt.subplots(2, 2, figsize=(12, 12))
|
||||
axes = axes.flatten()
|
||||
|
||||
for i, image in enumerate(images):
|
||||
axes[i].imshow(image)
|
||||
axes[i].set_title(f"Image {i+1}")
|
||||
axes[i].axis('off')
|
||||
|
||||
plt.tight_layout()
|
||||
plt.show()
|
||||
```
|
||||
|
||||
Combine both approaches to generate different variations of different prompts.
|
||||
|
||||
```py
|
||||
images = pipeline(
|
||||
prompt=prompts,
|
||||
num_images_per_prompt=2,
|
||||
).images
|
||||
|
||||
fig, axes = plt.subplots(2, 2, figsize=(12, 12))
|
||||
axes = axes.flatten()
|
||||
|
||||
for i, image in enumerate(images):
|
||||
axes[i].imshow(image)
|
||||
axes[i].set_title(f"Image {i+1}")
|
||||
axes[i].axis('off')
|
||||
|
||||
plt.tight_layout()
|
||||
plt.show()
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="image-to-image">
|
||||
|
||||
For image-to-image, pass a list of input images and prompts to the pipeline.
|
||||
|
||||
```py
|
||||
import torch
import matplotlib.pyplot as plt
|
||||
from diffusers.utils import load_image
|
||||
from diffusers import DiffusionPipeline
|
||||
|
||||
pipeline = DiffusionPipeline.from_pretrained(
|
||||
"stabilityai/stable-diffusion-xl-base-1.0",
|
||||
torch_dtype=torch.float16
|
||||
).to("cuda")
|
||||
|
||||
input_images = [
|
||||
load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint.png"),
|
||||
load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/cat.png"),
|
||||
load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/detail-prompt.png")
|
||||
]
|
||||
|
||||
prompts = [
|
||||
"cinematic photo of a beautiful sunset over mountains, 35mm photograph, film, professional, 4k, highly detailed",
|
||||
"cinematic film still of a cat basking in the sun on a roof in Turkey, highly detailed, high budget hollywood movie, cinemascope, moody, epic, gorgeous, film grain",
|
||||
"pixel-art a cozy coffee shop interior, low-res, blocky, pixel art style, 8-bit graphics"
|
||||
]
|
||||
|
||||
images = pipeline(
|
||||
prompt=prompts,
|
||||
image=input_images,
|
||||
guidance_scale=8.0,
|
||||
strength=0.5
|
||||
).images
|
||||
|
||||
fig, axes = plt.subplots(2, 2, figsize=(12, 12))
|
||||
axes = axes.flatten()
|
||||
|
||||
for i, image in enumerate(images):
|
||||
axes[i].imshow(image)
|
||||
axes[i].set_title(f"Image {i+1}")
|
||||
axes[i].axis('off')
|
||||
|
||||
plt.tight_layout()
|
||||
plt.show()
|
||||
```
|
||||
|
||||
To generate multiple variations of one prompt, use the `num_images_per_prompt` argument.
|
||||
|
||||
```py
|
||||
import torch
|
||||
import matplotlib.pyplot as plt
|
||||
from diffusers.utils import load_image
|
||||
from diffusers import DiffusionPipeline
|
||||
|
||||
pipeline = DiffusionPipeline.from_pretrained(
|
||||
"stabilityai/stable-diffusion-xl-base-1.0",
|
||||
torch_dtype=torch.float16
|
||||
).to("cuda")
|
||||
|
||||
input_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/detail-prompt.png")
|
||||
|
||||
images = pipeline(
|
||||
prompt="pixel-art a cozy coffee shop interior, low-res, blocky, pixel art style, 8-bit graphics",
|
||||
image=input_image,
|
||||
num_images_per_prompt=4
|
||||
).images
|
||||
|
||||
fig, axes = plt.subplots(2, 2, figsize=(12, 12))
|
||||
axes = axes.flatten()
|
||||
|
||||
for i, image in enumerate(images):
|
||||
axes[i].imshow(image)
|
||||
axes[i].set_title(f"Image {i+1}")
|
||||
axes[i].axis('off')
|
||||
|
||||
plt.tight_layout()
|
||||
plt.show()
|
||||
```
|
||||
|
||||
Combine both approaches to generate different variations of different prompts.
|
||||
|
||||
```py
input_images = [
    load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/cat.png"),
    load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/detail-prompt.png")
]

prompts = [
    "cinematic film still of a cat basking in the sun on a roof in Turkey, highly detailed, high budget hollywood movie, cinemascope, moody, epic, gorgeous, film grain",
    "pixel-art a cozy coffee shop interior, low-res, blocky, pixel art style, 8-bit graphics"
]

images = pipeline(
    prompt=prompts,
    image=input_images,
    num_images_per_prompt=2,
).images

fig, axes = plt.subplots(2, 2, figsize=(12, 12))
axes = axes.flatten()

for i, image in enumerate(images):
    axes[i].imshow(image)
    axes[i].set_title(f"Image {i+1}")
    axes[i].axis('off')

plt.tight_layout()
plt.show()
```

</hfoption>
</hfoptions>

## Deterministic generation

Enable reproducible batch generation by passing a list of [`Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) objects to the pipeline, tying each `Generator` to a seed so you can reuse it.

Use a list comprehension that iterates over the batch size specified in `range()` so each image in the batch gets its own unique `Generator` object.

Don't multiply a single `Generator` by the batch size. That only creates one `Generator` object, which is reused sequentially for every image in the batch, as in the snippet below.

```py
# don't do this: all three entries reference the same Generator object
generator = [torch.Generator(device="cuda").manual_seed(0)] * 3
```
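
To see why this matters, here is a quick sanity check (an illustrative addition, not from the original guide) comparing the object identities produced by list multiplication and by a list comprehension. It assumes a CUDA device is available; swap in `device="cpu"` otherwise.

```py
import torch

shared = [torch.Generator(device="cuda").manual_seed(0)] * 3
unique = [torch.Generator(device="cuda").manual_seed(i) for i in range(3)]

# list multiplication copies the reference, so every entry is the same object
print(len({id(g) for g in shared}))  # 1
# the list comprehension creates a separate Generator per image
print(len({id(g) for g in unique}))  # 3
```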

Pass the `generator` list to the pipeline.

```py
import torch
import matplotlib.pyplot as plt
from diffusers import DiffusionPipeline

pipeline = DiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",
    torch_dtype=torch.float16
).to("cuda")

generator = [torch.Generator(device="cuda").manual_seed(i) for i in range(3)]
prompts = [
    "cinematic photo of a beautiful sunset over mountains, 35mm photograph, film, professional, 4k, highly detailed",
    "cinematic film still of a cat basking in the sun on a roof in Turkey, highly detailed, high budget hollywood movie, cinemascope, moody, epic, gorgeous, film grain",
    "pixel-art a cozy coffee shop interior, low-res, blocky, pixel art style, 8-bit graphics"
]

images = pipeline(
    prompt=prompts,
    generator=generator
).images

fig, axes = plt.subplots(2, 2, figsize=(12, 12))
axes = axes.flatten()

for i, image in enumerate(images):
    axes[i].imshow(image)
    axes[i].set_title(f"Image {i+1}")
    axes[i].axis('off')

plt.tight_layout()
plt.show()
```

You can use this to iteratively select an image associated with a seed and then improve on it by crafting a more detailed prompt.

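For example (an illustrative sketch, with the seed and the refined prompt chosen arbitrarily), if the image produced by seed `2` looked most promising, reuse that seed with a more detailed prompt.

```py
# reuse the seed of the image you want to improve; seed 2 is just an example
generator = torch.Generator(device="cuda").manual_seed(2)

image = pipeline(
    prompt="pixel-art a cozy coffee shop interior at golden hour, warm lamps, plants on the windowsill, low-res, blocky, pixel art style, 8-bit graphics",
    generator=generator
).images[0]
```
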
@@ -112,6 +112,30 @@ print(pipe.transformer.dtype, pipe.vae.dtype) # (torch.bfloat16, torch.float16)

If a component is not explicitly specified in the `torch_dtype` dictionary and no `default` key is provided, it is loaded in `torch.float32`.

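For illustration, a `torch_dtype` dictionary that only pins the transformer dtype and omits `default` leaves every other component in `torch.float32`. This is a minimal sketch; the model id is assumed and not taken from this guide.

```py
import torch
from diffusers import DiffusionPipeline

# only the transformer dtype is specified and no "default" key is given,
# so the remaining components (VAE, text encoders, ...) load in torch.float32
pipe = DiffusionPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-dev",
    torch_dtype={"transformer": torch.bfloat16},
)
print(pipe.transformer.dtype, pipe.vae.dtype)  # torch.bfloat16, torch.float32
```
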
### Parallel loading

Large models are often [sharded](../training/distributed_inference#model-sharding) into smaller files so that they are easier to load. Diffusers supports loading shards in parallel to speed up the loading process.

Set the environment variables below to enable parallel loading.

- Set `HF_ENABLE_PARALLEL_LOADING` to `"YES"` to enable parallel loading of shards.
- Set `HF_PARALLEL_LOADING_WORKERS` to configure the number of parallel threads used to load shards. More workers load a model faster but use more memory.

Set the `device_map` argument to `"cuda"` to pre-allocate a large chunk of memory based on the model size. Warming up the memory allocator upfront avoids many smaller allocation calls later, which substantially reduces model load time.

```py
import os
import torch
from diffusers import DiffusionPipeline

os.environ["HF_ENABLE_PARALLEL_LOADING"] = "YES"
# optionally tune the number of loading threads, e.g.
# os.environ["HF_PARALLEL_LOADING_WORKERS"] = "8"

pipeline = DiffusionPipeline.from_pretrained(
    "Wan-AI/Wan2.2-I2V-A14B-Diffusers",
    torch_dtype=torch.bfloat16,
    device_map="cuda"
)
```

### Local pipeline

To load a pipeline locally, use [git-lfs](https://git-lfs.github.com/) to manually download a checkpoint to your local disk.

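Once the files are on disk, pass the local folder path to `DiffusionPipeline.from_pretrained` instead of a Hub repository id. A minimal sketch, assuming the SDXL checkpoint was cloned into `./stable-diffusion-xl-base-1.0`:

```py
import torch
from diffusers import DiffusionPipeline

# assumes the checkpoint was downloaded beforehand, e.g. with
#   git lfs install
#   git clone https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0
pipeline = DiffusionPipeline.from_pretrained(
    "./stable-diffusion-xl-base-1.0",
    torch_dtype=torch.float16
).to("cuda")
```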