Mirror of https://github.com/huggingface/diffusers.git (synced 2026-02-06 10:54:59 +08:00)

Comparing branches `apply-lora`...`qwen-test-`, 2 commits: ffdfe28983, 31ed009706.
@@ -114,8 +114,6 @@
        title: Guiders
      - local: modular_diffusers/custom_blocks
        title: Building Custom Blocks
      - local: modular_diffusers/mellon
        title: Using Custom Blocks with Mellon
    title: Modular Diffusers
  - isExpanded: false
    sections:

@@ -16,7 +16,7 @@ specific language governing permissions and limitations under the License.

[ModularPipelineBlocks](./pipeline_block) are the fundamental building blocks of a [`ModularPipeline`]. You can create custom blocks by defining their inputs, outputs, and computation logic. This guide demonstrates how to create and use a custom block.

> [!TIP]
> Explore the [Modular Diffusers Custom Blocks](https://huggingface.co/collections/diffusers/modular-diffusers-custom-blocks) collection for official custom blocks like Nano Banana.
## Project Structure

@@ -31,58 +31,18 @@ Your custom block project should use the following structure:

- `block.py` contains the custom block implementation
- `modular_config.json` contains the metadata needed to load the block
## Quick Start with Template

The fastest way to create a custom block is to start from our template. The template provides a pre-configured project structure with `block.py` and `modular_config.json` files, plus commented examples showing how to define components, inputs, outputs, and the `__call__` method, so you can focus on your custom logic instead of boilerplate setup.

### Download the template

```python
from diffusers import ModularPipelineBlocks

model_id = "diffusers/custom-block-template"
local_dir = model_id.split("/")[-1]

blocks = ModularPipelineBlocks.from_pretrained(
    model_id,
    trust_remote_code=True,
    local_dir=local_dir
)
```

This saves the template files to `custom-block-template/` locally. Pass a different `local_dir` to save them to a specific location.

### Edit locally

Open `block.py` and implement your custom block. The template includes commented examples showing how to define each property. See the [Florence-2 example](#example-florence-2-image-annotator) below for a complete implementation.
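To get a feel for the overall shape before diving into the full example, here is a minimal, hypothetical sketch of a filled-in `block.py`. The block name, parameter names, and the uppercasing logic are made up for illustration; the real properties are walked through step by step in the Florence-2 example below.

```python
# Illustrative only: a minimal custom block that uppercases a prompt.
from typing import List

from diffusers.modular_pipelines import (
    InputParam,
    ModularPipelineBlocks,
    OutputParam,
    PipelineState,
)


class UppercasePromptBlock(ModularPipelineBlocks):
    @property
    def inputs(self) -> List[InputParam]:
        return [
            InputParam("prompt", type_hint=str, required=True, description="Prompt to transform"),
        ]

    @property
    def intermediate_outputs(self) -> List[OutputParam]:
        return [
            OutputParam("prompt", type_hint=str, description="Uppercased prompt"),
        ]

    def __call__(self, components, state: PipelineState) -> PipelineState:
        # Read inputs from the pipeline state, compute, then write outputs back.
        block_state = self.get_block_state(state)
        block_state.prompt = block_state.prompt.upper()
        self.set_block_state(state, block_state)
        return components, state
```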
### Test your block

```python
from diffusers import ModularPipelineBlocks

blocks = ModularPipelineBlocks.from_pretrained(local_dir, trust_remote_code=True)
pipeline = blocks.init_pipeline()
output = pipeline(...)  # your inputs here
```

### Upload to the Hub

```python
pipeline.save_pretrained(local_dir, repo_id="your-username/your-block-name", push_to_hub=True)
```
## Example: Florence-2 Image Annotator

This example creates a custom block with [Florence-2](https://huggingface.co/docs/transformers/model_doc/florence2) to process an input image and generate a mask for inpainting.

### Define components

Define the components the block needs, `Florence2ForConditionalGeneration` and its processor. When defining components, specify the `name` (how you'll access it in code), `type_hint` (the model class), and `pretrained_model_name_or_path` (where to load weights from).

```py
# Inside block.py
from diffusers.modular_pipelines import ModularPipelineBlocks, ComponentSpec
from transformers import AutoProcessor, Florence2ForConditionalGeneration


class Florence2ImageAnnotatorBlock(ModularPipelineBlocks):
    @property
    def expected_components(self):
        return [
            ComponentSpec(
                name="image_annotator",
                type_hint=Florence2ForConditionalGeneration,
                pretrained_model_name_or_path="florence-community/Florence-2-base-ft",
            ),
            ComponentSpec(
                name="image_annotator_processor",
                type_hint=AutoProcessor,
                pretrained_model_name_or_path="florence-community/Florence-2-base-ft",
            ),
        ]
```
### Define inputs and outputs

Next, define the inputs and outputs of the block. The inputs include the image to annotate, the annotation task, and the annotation prompt. The outputs include the generated mask image and the raw annotations.

```py
from typing import List, Union

from PIL import Image

from diffusers.modular_pipelines import ComponentSpec, InputParam, ModularPipelineBlocks, OutputParam
from transformers import AutoProcessor, Florence2ForConditionalGeneration


class Florence2ImageAnnotatorBlock(ModularPipelineBlocks):
    # ... expected_components from above ...

    @property
    def inputs(self) -> List[InputParam]:
        return [
            InputParam(
                "image",
                type_hint=Union[Image.Image, List[Image.Image]],
                required=True,
                description="Image(s) to annotate",
            ),
            InputParam(
                "annotation_task",
                type_hint=Union[str, List[str]],
                required=True,
                default="<REFERRING_EXPRESSION_SEGMENTATION>",
                description=(
                    "Annotation task to perform on the image. Supported tasks: <OD>, "
                    "<REFERRING_EXPRESSION_SEGMENTATION>, <CAPTION>, <DETAILED_CAPTION>, "
                    "<MORE_DETAILED_CAPTION>, <DENSE_REGION_CAPTION>, "
                    "<CAPTION_TO_PHRASE_GROUNDING>, <OPEN_VOCABULARY_DETECTION>"
                ),
            ),
            InputParam(
                "annotation_prompt",
                type_hint=Union[str, List[str]],
                required=True,
                description=(
                    "Prompt providing more context for the annotation task; can be used to "
                    "detect or segment specific elements in the image"
                ),
            ),
            InputParam(
                "annotation_output_type",
                type_hint=str,
                required=True,
                default="mask_image",
                description=(
                    "Output type from the annotation predictions. Options: 'mask_image' (black and white "
                    "mask for the given image based on the task type), 'mask_overlay' (mask overlaid on the "
                    "original image), 'bounding_box' (bounding boxes drawn on the original image)"
                ),
            ),
            InputParam(
                "annotation_overlay",
                type_hint=bool,
                required=True,
                default=False,
                description="Whether to overlay the generated mask on the original image",
            ),
        ]

    @property
    def intermediate_outputs(self) -> List[OutputParam]:
        return [
            OutputParam(
                "mask_image",
                type_hint=Image.Image,
                description="Inpainting mask for the input image(s)",
            ),
            OutputParam(
                "annotations",
                type_hint=dict,
                description="Raw annotation predictions for the input image(s)",
            ),
            OutputParam(
                "image",
                type_hint=Image.Image,
                description="Annotated input image(s)",
            ),
        ]
```
### Implement the `__call__` method

The `__call__` method contains the block's logic for processing the input image and generating the mask. Access inputs via `block_state`, run your computation, and set the outputs back on `block_state`.

```py
from typing import List, Union

import numpy as np
import torch
from PIL import Image, ImageDraw

from diffusers.modular_pipelines import (
    ComponentSpec,
    InputParam,
    ModularPipelineBlocks,
    OutputParam,
    PipelineState,
)
from transformers import AutoProcessor, Florence2ForConditionalGeneration


class Florence2ImageAnnotatorBlock(ModularPipelineBlocks):
    # ... expected_components, inputs, intermediate_outputs from above ...

    def get_annotations(self, components, images, prompts, task):
        task_prompts = [task + prompt for prompt in prompts]

        inputs = components.image_annotator_processor(
            text=task_prompts, images=images, return_tensors="pt"
        ).to(components.image_annotator.device, components.image_annotator.dtype)

        generated_ids = components.image_annotator.generate(
            input_ids=inputs["input_ids"],
            pixel_values=inputs["pixel_values"],
            max_new_tokens=1024,
            early_stopping=False,
            do_sample=False,
            num_beams=3,
        )
        annotations = components.image_annotator_processor.batch_decode(
            generated_ids, skip_special_tokens=False
        )
        outputs = []
        for image, annotation in zip(images, annotations):
            outputs.append(
                components.image_annotator_processor.post_process_generation(
                    annotation, task=task, image_size=(image.width, image.height)
                )
            )
        return outputs

    def prepare_mask(self, images, annotations, overlay=False, fill="white"):
        masks = []
        for image, annotation in zip(images, annotations):
            mask_image = image.copy() if overlay else Image.new("L", image.size, 0)
            draw = ImageDraw.Draw(mask_image)

            for _, _annotation in annotation.items():
                if "polygons" in _annotation:
                    for polygon in _annotation["polygons"]:
                        polygon = np.array(polygon).reshape(-1, 2)
                        if len(polygon) < 3:
                            continue
                        polygon = polygon.reshape(-1).tolist()
                        draw.polygon(polygon, fill=fill)

                elif "bbox" in _annotation:
                    bbox = _annotation["bbox"]
                    draw.rectangle(bbox, fill="white")

            masks.append(mask_image)

        return masks

    def prepare_bounding_boxes(self, images, annotations):
        outputs = []
        for image, annotation in zip(images, annotations):
            image_copy = image.copy()
            draw = ImageDraw.Draw(image_copy)
            for _, _annotation in annotation.items():
                bbox = _annotation["bbox"]
                label = _annotation["label"]

                draw.rectangle(bbox, outline="red", width=3)
                draw.text((bbox[0], bbox[1] - 20), label, fill="red")

            outputs.append(image_copy)

        return outputs

    def prepare_inputs(self, images, prompts):
        prompts = prompts or ""

        if isinstance(images, Image.Image):
            images = [images]
        if isinstance(prompts, str):
            prompts = [prompts]

        if len(images) != len(prompts):
            raise ValueError("Number of images and annotation prompts must match.")

        return images, prompts

    @torch.no_grad()
    def __call__(self, components, state: PipelineState) -> PipelineState:
        block_state = self.get_block_state(state)

        images, annotation_task_prompt = self.prepare_inputs(
            block_state.image, block_state.annotation_prompt
        )
        task = block_state.annotation_task
        fill = "white"  # fill color used when drawing the mask polygons

        annotations = self.get_annotations(
            components, images, annotation_task_prompt, task
        )

        # ... (mask/overlay/bounding-box output handling elided in this diff) ...

        self.set_block_state(state, block_state)

        return components, state
```
> [!TIP]
> See the complete implementation at [diffusers/Florence2-image-Annotator](https://huggingface.co/diffusers/Florence2-image-Annotator).

Once we have defined our custom block, we can save it to the Hub using either the CLI or the [`push_to_hub`] method. This makes it easy to share and reuse the block with other pipelines.

<hfoptions id="share">
<hfoption id="hf CLI">

```shell
# In the folder with the `block.py` file, run:
diffusers-cli custom_block
```

Then upload the block to the Hub:

```shell
hf upload <your repo id> . .
```

</hfoption>
<hfoption id="push_to_hub">

```py
from block import Florence2ImageAnnotatorBlock

block = Florence2ImageAnnotatorBlock()
block.push_to_hub("<your repo id>")
```

</hfoption>
</hfoptions>
## Using Custom Blocks

Load a custom block with [`~ModularPipeline.from_pretrained`] and set `trust_remote_code=True`.

```py
import torch
from diffusers import ModularPipeline
from diffusers.utils import load_image

# Load the Florence-2 annotator pipeline
image_annotator = ModularPipeline.from_pretrained(
    "diffusers/Florence2-image-Annotator",
    trust_remote_code=True
)

# Check the docstring to see inputs/outputs
print(image_annotator.blocks.doc)
```

Use the block to generate a mask:

```python
image_annotator.load_components(torch_dtype=torch.bfloat16)
image_annotator.to("cuda")

image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg")
image = image.resize((1024, 1024))

prompt = ["A red car"]
annotation_task = "<REFERRING_EXPRESSION_SEGMENTATION>"
annotation_prompt = ["the car"]

mask_image = image_annotator(
    prompt=prompt,
    image=image,
    annotation_task=annotation_task,
    annotation_prompt=annotation_prompt,
    annotation_output_type="mask_image",
).images
mask_image[0].save("car-mask.png")
```

Compose it with other blocks to create a new pipeline:

```python
# Get the annotator block
annotator_block = image_annotator.blocks

# Get an inpainting workflow and insert the annotator at the beginning
inpaint_blocks = ModularPipeline.from_pretrained("Qwen/Qwen-Image").blocks.get_workflow("inpainting")
inpaint_blocks.sub_blocks.insert("image_annotator", annotator_block, 0)

# Initialize the combined pipeline
pipe = inpaint_blocks.init_pipeline()
pipe.load_components(torch_dtype=torch.float16, device="cuda")

# Now the pipeline automatically generates masks from prompts
output = pipe(
    prompt=prompt,
    image=image,
    # ... (remaining arguments elided in this diff) ...
)
output[0].save("florence-inpainting.png")
```

You can also compose the block with the preset Stable Diffusion XL inpainting blocks:

```python
from diffusers.modular_pipelines import ModularPipelineBlocks, SequentialPipelineBlocks
from diffusers.modular_pipelines.stable_diffusion_xl import INPAINT_BLOCKS

# Fetch the Florence2 image annotator block that will create our mask
image_annotator_block = ModularPipelineBlocks.from_pretrained("diffusers/florence-2-custom-block", trust_remote_code=True)

# Create our initial set of inpainting blocks
my_blocks = INPAINT_BLOCKS.copy()
# insert the annotation block before the image encoding step
my_blocks.insert("image_annotator", image_annotator_block, 1)
blocks = SequentialPipelineBlocks.from_blocks_dict(my_blocks)

repo_id = "diffusers/modular-stable-diffusion-xl-base-1.0"
pipe = blocks.init_pipeline(repo_id)
pipe.load_components(torch_dtype=torch.float16, device_map="cuda", trust_remote_code=True)
```
## Editing Custom Blocks

Edit a custom block by downloading it locally. This is the same workflow as the [Quick Start with Template](#quick-start-with-template), but starting from an existing block instead of the template. By default, custom blocks are saved in your cache directory; use the `local_dir` argument to download a custom block to a specific folder for editing.

```python
from diffusers import ModularPipelineBlocks

# Download to a local folder for editing
annotator_block = ModularPipelineBlocks.from_pretrained(
    "diffusers/Florence2-image-Annotator",
    trust_remote_code=True,
    local_dir="./my-florence-block"
)
```

Any changes made to the block files in this folder will be reflected when you load the block again. When you're ready to share your changes, upload to a new repository:

```python
pipeline = annotator_block.init_pipeline()
pipeline.save_pretrained("./my-florence-block", repo_id="your-username/my-custom-florence", push_to_hub=True)
```

## Next Steps

<hfoptions id="next">
<hfoption id="Learn block types">

This guide covered creating a single custom block. Learn how to compose multiple blocks together:

- [SequentialPipelineBlocks](./sequential_pipeline_blocks): Chain blocks to execute in sequence
- [ConditionalPipelineBlocks](./auto_pipeline_blocks): Create conditional blocks that select different execution paths
- [LoopSequentialPipelineBlocks](./loop_sequential_pipeline_blocks): Define iterative workflows like the denoising loop

</hfoption>
<hfoption id="Use in Mellon">

Make your custom block work with Mellon's visual interface. See the [Mellon Custom Blocks](./mellon) guide.

</hfoption>
<hfoption id="Explore existing blocks">

Browse the [Modular Diffusers Custom Blocks](https://huggingface.co/collections/diffusers/modular-diffusers-custom-blocks) collection for inspiration and ready-to-use blocks.

</hfoption>
</hfoptions>
@@ -1,270 +0,0 @@
<!--Copyright 2025 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
-->

## Using Custom Blocks with Mellon

[Mellon](https://github.com/cubiq/Mellon) is a visual workflow interface that integrates with Modular Diffusers and is designed for node-based workflows.

> [!WARNING]
> Mellon is in early development and not ready for production use yet. Consider this a sneak peek of how the integration works!

Custom blocks work in Mellon out of the box; you just need to add a `mellon_pipeline_config.json` to your repository. This config file tells Mellon how to render your block's parameters as UI components.

Here's what it looks like in action with the [Gemini Prompt Expander](https://huggingface.co/diffusers/gemini-prompt-expander-mellon) block.

To use a modular diffusers custom block in Mellon:

1. Drag a **Dynamic Block Node** from the ModularDiffusers section
2. Enter the `repo_id` (e.g., `diffusers/gemini-prompt-expander-mellon`)
3. Click **Load Custom Block**
4. The node transforms to show your block's inputs and outputs

Now let's walk through how to create this config for your own custom block.
## Steps to create a Mellon config

1. **Specify Mellon types for your parameters** - Each `InputParam`/`OutputParam` needs a type that tells Mellon what UI component to render (e.g., `"textbox"`, `"dropdown"`, `"image"`).
2. **Generate `mellon_pipeline_config.json`** - Use our utility to generate a config template and push it to your Hub repository.
3. **(Optional) Manually adjust the config** - Fine-tune the generated config for your specific needs.
## Specify Mellon types for parameters

Mellon types determine how each parameter renders in the UI. If you don't specify a type for a parameter, it will default to `"custom"`, which renders as a simple connection dot. You can always adjust this later in the generated config.

| Type | Input/Output | Description |
|------|--------------|-------------|
| `image` | Both | Image (PIL Image) |
| `video` | Both | Video |
| `text` | Both | Text display |
| `textbox` | Input | Text input |
| `dropdown` | Input | Dropdown selection menu |
| `slider` | Input | Slider for numeric values |
| `number` | Input | Numeric input |
| `checkbox` | Input | Boolean toggle |

For parameters that need more configuration (like dropdowns with options, or sliders with min/max values), pass a `MellonParam` instance directly instead of a string. You can use one of the class methods below, or create a fully custom one with `MellonParam(name, label, type, ...)`.

| Method | Description |
|--------|-------------|
| `MellonParam.Input.image(name)` | Image input |
| `MellonParam.Input.textbox(name, default)` | Text input as textarea |
| `MellonParam.Input.dropdown(name, options, default)` | Dropdown selection |
| `MellonParam.Input.slider(name, default, min, max, step)` | Slider for numeric values |
| `MellonParam.Input.number(name, default, min, max, step)` | Numeric input (no slider) |
| `MellonParam.Input.seed(name, default)` | Seed input with randomize button |
| `MellonParam.Input.checkbox(name, default)` | Boolean checkbox |
| `MellonParam.Input.model(name)` | Model input for diffusers components |
| `MellonParam.Output.image(name)` | Image output |
| `MellonParam.Output.video(name)` | Video output |
| `MellonParam.Output.text(name)` | Text output |
| `MellonParam.Output.model(name)` | Model output for diffusers components |

Choose one of the methods below to specify a Mellon type.
### Using `metadata` in block definitions

If you're defining a custom block from scratch, add `metadata={"mellon": "<type>"}` directly to your `InputParam` and `OutputParam` definitions. If you're editing an existing custom block from the Hub, see [Editing custom blocks](./custom_blocks#editing-custom-blocks) for how to download it locally.

```python
class GeminiPromptExpander(ModularPipelineBlocks):

    @property
    def inputs(self) -> List[InputParam]:
        return [
            InputParam(
                "prompt",
                type_hint=str,
                required=True,
                description="Prompt to use",
                metadata={"mellon": "textbox"},  # Text input
            )
        ]

    @property
    def intermediate_outputs(self) -> List[OutputParam]:
        return [
            OutputParam(
                "prompt",
                type_hint=str,
                description="Expanded prompt by the LLM",
                metadata={"mellon": "text"},  # Text output
            ),
            OutputParam(
                "old_prompt",
                type_hint=str,
                description="Old prompt provided by the user",
                # No metadata - we don't want to render this in the UI
            ),
        ]
```

For full control over UI configuration, pass a `MellonParam` instance directly:

```python
from diffusers.modular_pipelines.mellon_node_utils import MellonParam


InputParam(
    "mode",
    type_hint=str,
    default="balanced",
    metadata={"mellon": MellonParam.Input.dropdown("mode", options=["fast", "balanced", "quality"])},
)
```
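As another illustration (not from the original guide), a numeric parameter could be configured the same way with a slider. The `steps` parameter name is hypothetical, and the `MellonParam.Input.slider` call follows the positional signature listed in the table above.

```python
from diffusers.modular_pipelines import InputParam
from diffusers.modular_pipelines.mellon_node_utils import MellonParam


# Hypothetical numeric input rendered as a slider in Mellon.
# MellonParam.Input.slider(name, default, min, max, step) per the table above.
InputParam(
    "steps",
    type_hint=int,
    default=30,
    metadata={"mellon": MellonParam.Input.slider("steps", 30, 1, 100, 1)},
)
```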
### Using `input_types` and `output_types` when generating the config

If you're working with an existing pipeline or prefer to keep your block definitions clean, specify types when generating the config using the `input_types`/`output_types` arguments:

```python
from diffusers.modular_pipelines.mellon_node_utils import MellonPipelineConfig


mellon_config = MellonPipelineConfig.from_custom_block(
    blocks,
    input_types={"prompt": "textbox"},
    output_types={"prompt": "text"}
)
```

> [!NOTE]
> When both `metadata` and `input_types`/`output_types` are specified, the arguments override `metadata`.
## Generate and push the Mellon config

After adding metadata to your block, generate the default Mellon configuration template and push it to the Hub:

```python
from diffusers import ModularPipelineBlocks
from diffusers.modular_pipelines.mellon_node_utils import MellonPipelineConfig

# load your custom blocks from your local dir
blocks = ModularPipelineBlocks.from_pretrained("/path/local/folder", trust_remote_code=True)

# Generate the default config template
mellon_config = MellonPipelineConfig.from_custom_block(blocks)

# Push the default template to `repo_id`; pass the same local folder path so the config is saved locally first
mellon_config.save(
    local_dir="/path/local/folder",
    repo_id=repo_id,
    push_to_hub=True
)
```

This creates a `mellon_pipeline_config.json` file in your repository.
## Review and adjust the config

The generated template is a starting point - you may want to adjust it for your needs. Let's walk through the generated config for the Gemini Prompt Expander:

```json
{
  "label": "Gemini Prompt Expander",
  "default_repo": "",
  "default_dtype": "",
  "node_params": {
    "custom": {
      "params": {
        "prompt": {
          "label": "Prompt",
          "type": "string",
          "display": "textarea",
          "default": ""
        },
        "out_prompt": {
          "label": "Prompt",
          "type": "string",
          "display": "output"
        },
        "old_prompt": {
          "label": "Old Prompt",
          "type": "custom",
          "display": "output"
        },
        "doc": {
          "label": "Doc",
          "type": "string",
          "display": "output"
        }
      },
      "input_names": ["prompt"],
      "model_input_names": [],
      "output_names": ["out_prompt", "old_prompt", "doc"],
      "block_name": "custom",
      "node_type": "custom"
    }
  }
}
```
### Understanding the Structure

The `params` dict defines how each UI element renders. The `input_names`, `model_input_names`, and `output_names` lists map these UI elements to the underlying [`ModularPipelineBlocks`]'s I/O interface:

| Mellon Config | ModularPipelineBlocks |
|---------------|----------------------|
| `input_names` | `inputs` property |
| `model_input_names` | `expected_components` property |
| `output_names` | `intermediate_outputs` property |

In this example: `prompt` is the only input. There are no model components, and outputs include `out_prompt`, `old_prompt`, and `doc`.

Now let's look at the `params` dict:

- **`prompt`**: An input parameter with `display: "textarea"` (renders as a text input box), `label: "Prompt"` (shown in the UI), and `default: ""` (starts empty). The `type: "string"` field is important in Mellon because it determines which nodes can connect together - only matching types can be linked with "noodles".

- **`out_prompt`**: The expanded prompt output. The `out_` prefix was automatically added because the input and output share the same name (`prompt`), avoiding naming conflicts in the config. It has `display: "output"`, which renders as an output socket.

- **`old_prompt`**: Has `type: "custom"` because we didn't specify metadata. This renders as a simple dot in the UI. Since we don't actually want to expose this in the UI, we can remove it.

- **`doc`**: The documentation output, automatically added to all custom blocks.
### Making Adjustments

Remove `old_prompt` from both `params` and `output_names` because you won't need to use it.

```json
{
  "label": "Gemini Prompt Expander",
  "default_repo": "",
  "default_dtype": "",
  "node_params": {
    "custom": {
      "params": {
        "prompt": {
          "label": "Prompt",
          "type": "string",
          "display": "textarea",
          "default": ""
        },
        "out_prompt": {
          "label": "Prompt",
          "type": "string",
          "display": "output"
        },
        "doc": {
          "label": "Doc",
          "type": "string",
          "display": "output"
        }
      },
      "input_names": ["prompt"],
      "model_input_names": [],
      "output_names": ["out_prompt", "doc"],
      "block_name": "custom",
      "node_type": "custom"
    }
  }
}
```

See the final config at [diffusers/gemini-prompt-expander-mellon](https://huggingface.co/diffusers/gemini-prompt-expander-mellon).
@@ -33,14 +33,9 @@ The Modular Diffusers docs are organized as shown below.

- [SequentialPipelineBlocks](./sequential_pipeline_blocks) is a type of block that chains multiple blocks so they run one after another, passing data along the chain. This guide shows you how to create [`~modular_pipelines.SequentialPipelineBlocks`] and how they connect and work together.
- [LoopSequentialPipelineBlocks](./loop_sequential_pipeline_blocks) is a type of block that runs a series of blocks in a loop. This guide shows you how to create [`~modular_pipelines.LoopSequentialPipelineBlocks`].
- [AutoPipelineBlocks](./auto_pipeline_blocks) is a type of block that automatically chooses which blocks to run based on the input. This guide shows you how to create [`~modular_pipelines.AutoPipelineBlocks`].
- [Building Custom Blocks](./custom_blocks) shows you how to create your own custom blocks and share them on the Hub.

## ModularPipeline

- [ModularPipeline](./modular_pipeline) shows you how to create and convert pipeline blocks into an executable [`ModularPipeline`].
- [ComponentsManager](./components_manager) shows you how to manage and reuse components across multiple pipelines.
- [Guiders](./guiders) shows you how to use different guidance methods in the pipeline.

## Mellon Integration

- [Using Custom Blocks with Mellon](./mellon) shows you how to make your custom blocks work with [Mellon](https://github.com/cubiq/Mellon), a visual node-based interface for building workflows.
@@ -2321,14 +2321,8 @@ def _convert_non_diffusers_flux2_lora_to_diffusers(state_dict):
    prefix = "diffusion_model."
    original_state_dict = {k[len(prefix) :]: v for k, v in state_dict.items()}

    num_double_layers = 0
    num_single_layers = 0
    for key in original_state_dict.keys():
        if key.startswith("single_blocks."):
            num_single_layers = max(num_single_layers, int(key.split(".")[1]) + 1)
        elif key.startswith("double_blocks."):
            num_double_layers = max(num_double_layers, int(key.split(".")[1]) + 1)

    num_double_layers = 8
    num_single_layers = 48
    lora_keys = ("lora_A", "lora_B")
    attn_types = ("img_attn", "txt_attn")
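For context, the layer-counting logic in the hunk above infers the number of transformer blocks from the LoRA key names (highest block index plus one). A standalone sketch of that idea with a toy state dict follows; the key names are illustrative and not taken from a real checkpoint.

```python
# Illustrative only: infer block counts from key prefixes, as in the hunk above.
toy_state_dict = {
    "double_blocks.0.img_attn.qkv.lora_A.weight": None,
    "double_blocks.7.img_attn.qkv.lora_A.weight": None,
    "single_blocks.47.linear1.lora_A.weight": None,
}

num_double_layers = 0
num_single_layers = 0
for key in toy_state_dict:
    if key.startswith("single_blocks."):
        num_single_layers = max(num_single_layers, int(key.split(".")[1]) + 1)
    elif key.startswith("double_blocks."):
        num_double_layers = max(num_double_layers, int(key.split(".")[1]) + 1)

print(num_double_layers, num_single_layers)  # 8 48
```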
@@ -18,7 +18,7 @@ from typing import Optional, Union
from huggingface_hub.utils import validate_hf_hub_args

from ..configuration_utils import ConfigMixin
from ..utils import DIFFUSERS_LOAD_ID_FIELDS, logging
from ..utils import logging
from ..utils.dynamic_modules_utils import get_class_from_dynamic_module, resolve_trust_remote_code
@@ -220,11 +220,4 @@ class AutoModel(ConfigMixin):
            raise ValueError(f"AutoModel can't find a model linked to {orig_class_name}.")

        kwargs = {**load_config_kwargs, **kwargs}
        model = model_cls.from_pretrained(pretrained_model_or_path, **kwargs)

        load_id_kwargs = {"pretrained_model_name_or_path": pretrained_model_or_path, **kwargs}
        parts = [load_id_kwargs.get(field, "null") for field in DIFFUSERS_LOAD_ID_FIELDS]
        load_id = "|".join("null" if p is None else p for p in parts)
        model._diffusers_load_id = load_id

        return model
        return model_cls.from_pretrained(pretrained_model_or_path, **kwargs)
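For illustration, the load id built in the hunk above is just a `"|"`-joined list of loading kwargs. A minimal sketch of that join logic follows, using a hypothetical field list; the real `DIFFUSERS_LOAD_ID_FIELDS` may contain different entries.

```python
# Hypothetical field list; the real DIFFUSERS_LOAD_ID_FIELDS may differ.
FIELDS = ("pretrained_model_name_or_path", "subfolder", "variant")

load_id_kwargs = {"pretrained_model_name_or_path": "org/model", "variant": None}
parts = [load_id_kwargs.get(field, "null") for field in FIELDS]
load_id = "|".join("null" if p is None else p for p in parts)
print(load_id)  # org/model|null|null
```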
@@ -21,7 +21,7 @@ from torch.nn import functional as F
from ...configuration_utils import ConfigMixin, register_to_config
from ...loaders import PeftAdapterMixin
from ...loaders.single_file_model import FromOriginalModelMixin
from ...utils import BaseOutput, apply_lora_scale, logging
from ...utils import BaseOutput, logging
from ..attention import AttentionMixin
from ..attention_processor import (
    ADDED_KV_ATTENTION_PROCESSORS,

@@ -598,7 +598,6 @@ class ControlNetModel(ModelMixin, AttentionMixin, ConfigMixin, FromOriginalModel
        for module in self.children():
            fn_recursive_set_attention_slice(module, reversed_slice_size)

    @apply_lora_scale("cross_attention_kwargs")
    def forward(
        self,
        sample: torch.Tensor,
@@ -20,11 +20,7 @@ import torch.nn as nn
from ...configuration_utils import ConfigMixin, register_to_config
from ...loaders import PeftAdapterMixin
from ...utils import (
    BaseOutput,
    apply_lora_scale,
    logging,
)
from ...utils import USE_PEFT_BACKEND, BaseOutput, logging, scale_lora_layers, unscale_lora_layers
from ..attention import AttentionMixin
from ..controlnets.controlnet import ControlNetConditioningEmbedding, zero_module
from ..embeddings import CombinedTimestepGuidanceTextProjEmbeddings, CombinedTimestepTextProjEmbeddings, FluxPosEmbed

@@ -154,7 +150,6 @@ class FluxControlNetModel(ModelMixin, AttentionMixin, ConfigMixin, PeftAdapterMi

        return controlnet

    @apply_lora_scale("joint_attention_kwargs")
    def forward(
        self,
        hidden_states: torch.Tensor,

@@ -202,6 +197,20 @@ class FluxControlNetModel(ModelMixin, AttentionMixin, ConfigMixin, PeftAdapterMi
            If `return_dict` is True, an [`~models.transformer_2d.Transformer2DModelOutput`] is returned, otherwise a
            `tuple` where the first element is the sample tensor.
        """
        if joint_attention_kwargs is not None:
            joint_attention_kwargs = joint_attention_kwargs.copy()
            lora_scale = joint_attention_kwargs.pop("scale", 1.0)
        else:
            lora_scale = 1.0

        if USE_PEFT_BACKEND:
            # weight the lora layers by setting `lora_scale` for each PEFT layer
            scale_lora_layers(self, lora_scale)
        else:
            if joint_attention_kwargs is not None and joint_attention_kwargs.get("scale", None) is not None:
                logger.warning(
                    "Passing `scale` via `joint_attention_kwargs` when not using the PEFT backend is ineffective."
                )
        hidden_states = self.x_embedder(hidden_states)

        if self.input_hint_block is not None:

@@ -314,6 +323,10 @@ class FluxControlNetModel(ModelMixin, AttentionMixin, ConfigMixin, PeftAdapterMi
            None if len(controlnet_single_block_samples) == 0 else controlnet_single_block_samples
        )

        if USE_PEFT_BACKEND:
            # remove `lora_scale` from each PEFT layer
            unscale_lora_layers(self, lora_scale)

        if not return_dict:
            return (controlnet_block_samples, controlnet_single_block_samples)
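These hunks swap the explicit scale/unscale boilerplate for an `@apply_lora_scale(...)` decorator on `forward`. The decorator's actual implementation is not shown in this diff; the following is only a hedged sketch of how such a decorator could work, reusing the same `USE_PEFT_BACKEND`, `scale_lora_layers`, and `unscale_lora_layers` utilities that appear in the hunks above.

```python
# Sketch only: a decorator factory that could replace the explicit scale/unscale pattern.
import functools

from diffusers.utils import USE_PEFT_BACKEND, scale_lora_layers, unscale_lora_layers


def apply_lora_scale(kwargs_name):
    """Pop `scale` from the named kwargs dict, scale PEFT layers before
    calling `forward`, and unscale them afterwards."""

    def decorator(forward_fn):
        @functools.wraps(forward_fn)
        def wrapper(self, *args, **kwargs):
            attention_kwargs = kwargs.get(kwargs_name)
            if attention_kwargs is not None:
                attention_kwargs = attention_kwargs.copy()
                lora_scale = attention_kwargs.pop("scale", 1.0)
                kwargs[kwargs_name] = attention_kwargs
            else:
                lora_scale = 1.0

            if USE_PEFT_BACKEND:
                # weight the lora layers by setting `lora_scale` for each PEFT layer
                scale_lora_layers(self, lora_scale)
            try:
                return forward_fn(self, *args, **kwargs)
            finally:
                if USE_PEFT_BACKEND:
                    # remove `lora_scale` from each PEFT layer
                    unscale_lora_layers(self, lora_scale)

        return wrapper

    return decorator
```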
@@ -20,12 +20,7 @@ import torch.nn as nn
from ...configuration_utils import ConfigMixin, register_to_config
from ...loaders import FromOriginalModelMixin, PeftAdapterMixin
from ...utils import (
    BaseOutput,
    apply_lora_scale,
    deprecate,
    logging,
)
from ...utils import USE_PEFT_BACKEND, BaseOutput, deprecate, logging, scale_lora_layers, unscale_lora_layers
from ..attention import AttentionMixin
from ..cache_utils import CacheMixin
from ..controlnets.controlnet import zero_module

@@ -128,7 +123,6 @@ class QwenImageControlNetModel(

        return controlnet

    @apply_lora_scale("joint_attention_kwargs")
    def forward(
        self,
        hidden_states: torch.Tensor,

@@ -187,6 +181,20 @@ class QwenImageControlNetModel(
                standard_warn=False,
            )

        if joint_attention_kwargs is not None:
            joint_attention_kwargs = joint_attention_kwargs.copy()
            lora_scale = joint_attention_kwargs.pop("scale", 1.0)
        else:
            lora_scale = 1.0

        if USE_PEFT_BACKEND:
            # weight the lora layers by setting `lora_scale` for each PEFT layer
            scale_lora_layers(self, lora_scale)
        else:
            if joint_attention_kwargs is not None and joint_attention_kwargs.get("scale", None) is not None:
                logger.warning(
                    "Passing `scale` via `joint_attention_kwargs` when not using the PEFT backend is ineffective."
                )
        hidden_states = self.img_in(hidden_states)

        # add

@@ -248,6 +256,10 @@ class QwenImageControlNetModel(
        controlnet_block_samples = [sample * conditioning_scale for sample in controlnet_block_samples]
        controlnet_block_samples = None if len(controlnet_block_samples) == 0 else controlnet_block_samples

        if USE_PEFT_BACKEND:
            # remove `lora_scale` from each PEFT layer
            unscale_lora_layers(self, lora_scale)

        if not return_dict:
            return controlnet_block_samples
@@ -20,7 +20,7 @@ from torch import nn
from ...configuration_utils import ConfigMixin, register_to_config
from ...loaders import PeftAdapterMixin
from ...utils import BaseOutput, apply_lora_scale, logging
from ...utils import USE_PEFT_BACKEND, BaseOutput, logging, scale_lora_layers, unscale_lora_layers
from ..attention import AttentionMixin
from ..embeddings import PatchEmbed, PixArtAlphaTextProjection
from ..modeling_outputs import Transformer2DModelOutput

@@ -117,7 +117,6 @@ class SanaControlNetModel(ModelMixin, AttentionMixin, ConfigMixin, PeftAdapterMi

        self.gradient_checkpointing = False

    @apply_lora_scale("attention_kwargs")
    def forward(
        self,
        hidden_states: torch.Tensor,

@@ -130,6 +129,21 @@ class SanaControlNetModel(ModelMixin, AttentionMixin, ConfigMixin, PeftAdapterMi
        attention_kwargs: Optional[Dict[str, Any]] = None,
        return_dict: bool = True,
    ) -> Union[Tuple[torch.Tensor, ...], Transformer2DModelOutput]:
        if attention_kwargs is not None:
            attention_kwargs = attention_kwargs.copy()
            lora_scale = attention_kwargs.pop("scale", 1.0)
        else:
            lora_scale = 1.0

        if USE_PEFT_BACKEND:
            # weight the lora layers by setting `lora_scale` for each PEFT layer
            scale_lora_layers(self, lora_scale)
        else:
            if attention_kwargs is not None and attention_kwargs.get("scale", None) is not None:
                logger.warning(
                    "Passing `scale` via `attention_kwargs` when not using the PEFT backend is ineffective."
                )

        # ensure attention_mask is a bias, and give it a singleton query_tokens dimension.
        # we may have done this conversion already, e.g. if we came here via UNet2DConditionModel#forward.
        # we can tell by counting dims; if ndim == 2: it's a mask rather than a bias.

@@ -204,6 +218,10 @@ class SanaControlNetModel(ModelMixin, AttentionMixin, ConfigMixin, PeftAdapterMi
            block_res_sample = controlnet_block(block_res_sample)
            controlnet_block_res_samples = controlnet_block_res_samples + (block_res_sample,)

        if USE_PEFT_BACKEND:
            # remove `lora_scale` from each PEFT layer
            unscale_lora_layers(self, lora_scale)

        controlnet_block_res_samples = [sample * conditioning_scale for sample in controlnet_block_res_samples]

        if not return_dict:
@@ -21,7 +21,7 @@ import torch.nn as nn
from ...configuration_utils import ConfigMixin, register_to_config
from ...loaders import FromOriginalModelMixin, PeftAdapterMixin
from ...utils import apply_lora_scale, logging
from ...utils import USE_PEFT_BACKEND, logging, scale_lora_layers, unscale_lora_layers
from ..attention import AttentionMixin, JointTransformerBlock
from ..attention_processor import Attention, FusedJointAttnProcessor2_0
from ..embeddings import CombinedTimestepTextProjEmbeddings, PatchEmbed

@@ -269,7 +269,6 @@ class SD3ControlNetModel(ModelMixin, AttentionMixin, ConfigMixin, PeftAdapterMix

        return controlnet

    @apply_lora_scale("joint_attention_kwargs")
    def forward(
        self,
        hidden_states: torch.Tensor,

@@ -309,6 +308,21 @@ class SD3ControlNetModel(ModelMixin, AttentionMixin, ConfigMixin, PeftAdapterMix
            If `return_dict` is True, an [`~models.transformer_2d.Transformer2DModelOutput`] is returned, otherwise a
            `tuple` where the first element is the sample tensor.
        """
        if joint_attention_kwargs is not None:
            joint_attention_kwargs = joint_attention_kwargs.copy()
            lora_scale = joint_attention_kwargs.pop("scale", 1.0)
        else:
            lora_scale = 1.0

        if USE_PEFT_BACKEND:
            # weight the lora layers by setting `lora_scale` for each PEFT layer
            scale_lora_layers(self, lora_scale)
        else:
            if joint_attention_kwargs is not None and joint_attention_kwargs.get("scale", None) is not None:
                logger.warning(
                    "Passing `scale` via `joint_attention_kwargs` when not using the PEFT backend is ineffective."
                )

        if self.pos_embed is not None and hidden_states.ndim != 4:
            raise ValueError("hidden_states must be 4D when pos_embed is used")

@@ -368,6 +382,10 @@ class SD3ControlNetModel(ModelMixin, AttentionMixin, ConfigMixin, PeftAdapterMix
        # 6. scaling
        controlnet_block_res_samples = [sample * conditioning_scale for sample in controlnet_block_res_samples]

        if USE_PEFT_BACKEND:
            # remove `lora_scale` from each PEFT layer
            unscale_lora_layers(self, lora_scale)

        if not return_dict:
            return (controlnet_block_res_samples,)
@@ -21,7 +21,7 @@ import torch.nn.functional as F
from ...configuration_utils import ConfigMixin, register_to_config
from ...loaders import FromOriginalModelMixin, PeftAdapterMixin
from ...utils import apply_lora_scale, logging
from ...utils import USE_PEFT_BACKEND, logging, scale_lora_layers, unscale_lora_layers
from ...utils.torch_utils import maybe_allow_in_graph
from ..attention import AttentionMixin
from ..attention_processor import (

@@ -397,7 +397,6 @@ class AuraFlowTransformer2DModel(ModelMixin, AttentionMixin, ConfigMixin, PeftAd
        if self.original_attn_processors is not None:
            self.set_attn_processor(self.original_attn_processors)

    @apply_lora_scale("attention_kwargs")
    def forward(
        self,
        hidden_states: torch.FloatTensor,

@@ -406,6 +405,21 @@ class AuraFlowTransformer2DModel(ModelMixin, AttentionMixin, ConfigMixin, PeftAd
        attention_kwargs: Optional[Dict[str, Any]] = None,
        return_dict: bool = True,
    ) -> Union[Tuple[torch.Tensor], Transformer2DModelOutput]:
        if attention_kwargs is not None:
            attention_kwargs = attention_kwargs.copy()
            lora_scale = attention_kwargs.pop("scale", 1.0)
        else:
            lora_scale = 1.0

        if USE_PEFT_BACKEND:
            # weight the lora layers by setting `lora_scale` for each PEFT layer
            scale_lora_layers(self, lora_scale)
        else:
            if attention_kwargs is not None and attention_kwargs.get("scale", None) is not None:
                logger.warning(
                    "Passing `scale` via `attention_kwargs` when not using the PEFT backend is ineffective."
                )

        height, width = hidden_states.shape[-2:]

        # Apply patch embedding, timestep embedding, and project the caption embeddings.

@@ -472,6 +486,10 @@ class AuraFlowTransformer2DModel(ModelMixin, AttentionMixin, ConfigMixin, PeftAd
            shape=(hidden_states.shape[0], out_channels, height * patch_size, width * patch_size)
        )

        if USE_PEFT_BACKEND:
            # remove `lora_scale` from each PEFT layer
            unscale_lora_layers(self, lora_scale)

        if not return_dict:
            return (output,)
@@ -20,7 +20,7 @@ from torch import nn
from ...configuration_utils import ConfigMixin, register_to_config
from ...loaders import PeftAdapterMixin
from ...utils import apply_lora_scale, logging
from ...utils import USE_PEFT_BACKEND, logging, scale_lora_layers, unscale_lora_layers
from ...utils.torch_utils import maybe_allow_in_graph
from ..attention import Attention, AttentionMixin, FeedForward
from ..attention_processor import CogVideoXAttnProcessor2_0, FusedCogVideoXAttnProcessor2_0

@@ -363,7 +363,6 @@ class CogVideoXTransformer3DModel(ModelMixin, AttentionMixin, ConfigMixin, PeftA
        if self.original_attn_processors is not None:
            self.set_attn_processor(self.original_attn_processors)

    @apply_lora_scale("attention_kwargs")
    def forward(
        self,
        hidden_states: torch.Tensor,

@@ -375,6 +374,21 @@ class CogVideoXTransformer3DModel(ModelMixin, AttentionMixin, ConfigMixin, PeftA
        attention_kwargs: Optional[Dict[str, Any]] = None,
        return_dict: bool = True,
    ) -> Union[Tuple[torch.Tensor], Transformer2DModelOutput]:
        if attention_kwargs is not None:
            attention_kwargs = attention_kwargs.copy()
            lora_scale = attention_kwargs.pop("scale", 1.0)
        else:
            lora_scale = 1.0

        if USE_PEFT_BACKEND:
            # weight the lora layers by setting `lora_scale` for each PEFT layer
            scale_lora_layers(self, lora_scale)
        else:
            if attention_kwargs is not None and attention_kwargs.get("scale", None) is not None:
                logger.warning(
                    "Passing `scale` via `attention_kwargs` when not using the PEFT backend is ineffective."
                )

        batch_size, num_frames, channels, height, width = hidden_states.shape

        # 1. Time embedding

@@ -440,6 +454,10 @@ class CogVideoXTransformer3DModel(ModelMixin, AttentionMixin, ConfigMixin, PeftA
        )
        output = output.permute(0, 1, 5, 4, 2, 6, 3, 7).flatten(6, 7).flatten(4, 5).flatten(1, 2)

        if USE_PEFT_BACKEND:
            # remove `lora_scale` from each PEFT layer
            unscale_lora_layers(self, lora_scale)

        if not return_dict:
            return (output,)
        return Transformer2DModelOutput(sample=output)
@@ -20,7 +20,7 @@ from torch import nn
from ...configuration_utils import ConfigMixin, register_to_config
from ...loaders import PeftAdapterMixin
from ...utils import apply_lora_scale, logging
from ...utils import USE_PEFT_BACKEND, logging, scale_lora_layers, unscale_lora_layers
from ...utils.torch_utils import maybe_allow_in_graph
from ..attention import Attention, AttentionMixin, FeedForward
from ..attention_processor import CogVideoXAttnProcessor2_0

@@ -620,7 +620,6 @@ class ConsisIDTransformer3DModel(ModelMixin, AttentionMixin, ConfigMixin, PeftAd
            ]
        )

    @apply_lora_scale("attention_kwargs")
    def forward(
        self,
        hidden_states: torch.Tensor,

@@ -633,6 +632,21 @@ class ConsisIDTransformer3DModel(ModelMixin, AttentionMixin, ConfigMixin, PeftAd
        id_vit_hidden: Optional[torch.Tensor] = None,
        return_dict: bool = True,
    ) -> Union[Tuple[torch.Tensor], Transformer2DModelOutput]:
        if attention_kwargs is not None:
            attention_kwargs = attention_kwargs.copy()
            lora_scale = attention_kwargs.pop("scale", 1.0)
        else:
            lora_scale = 1.0

        if USE_PEFT_BACKEND:
            # weight the lora layers by setting `lora_scale` for each PEFT layer
            scale_lora_layers(self, lora_scale)
        else:
            if attention_kwargs is not None and attention_kwargs.get("scale", None) is not None:
                logger.warning(
                    "Passing `scale` via `attention_kwargs` when not using the PEFT backend is ineffective."
                )

        # fuse clip and insightface
        valid_face_emb = None
        if self.is_train_face:

@@ -706,6 +720,10 @@ class ConsisIDTransformer3DModel(ModelMixin, AttentionMixin, ConfigMixin, PeftAd
        output = hidden_states.reshape(batch_size, num_frames, height // p, width // p, -1, p, p)
        output = output.permute(0, 1, 4, 2, 5, 3, 6).flatten(5, 6).flatten(3, 4)

        if USE_PEFT_BACKEND:
            # remove `lora_scale` from each PEFT layer
            unscale_lora_layers(self, lora_scale)

        if not return_dict:
            return (output,)
        return Transformer2DModelOutput(sample=output)
@@ -20,7 +20,7 @@ from torch import nn
from ...configuration_utils import ConfigMixin, register_to_config
from ...loaders import FromOriginalModelMixin, PeftAdapterMixin
from ...utils import apply_lora_scale, logging
from ...utils import USE_PEFT_BACKEND, logging, scale_lora_layers, unscale_lora_layers
from ..attention import AttentionMixin
from ..attention_processor import (
    Attention,

@@ -414,7 +414,6 @@ class SanaTransformer2DModel(ModelMixin, AttentionMixin, ConfigMixin, PeftAdapte

        self.gradient_checkpointing = False

    @apply_lora_scale("attention_kwargs")
    def forward(
        self,
        hidden_states: torch.Tensor,

@@ -427,6 +426,21 @@ class SanaTransformer2DModel(ModelMixin, AttentionMixin, ConfigMixin, PeftAdapte
        controlnet_block_samples: Optional[Tuple[torch.Tensor]] = None,
        return_dict: bool = True,
    ) -> Union[Tuple[torch.Tensor, ...], Transformer2DModelOutput]:
        if attention_kwargs is not None:
            attention_kwargs = attention_kwargs.copy()
            lora_scale = attention_kwargs.pop("scale", 1.0)
        else:
            lora_scale = 1.0

        if USE_PEFT_BACKEND:
            # weight the lora layers by setting `lora_scale` for each PEFT layer
            scale_lora_layers(self, lora_scale)
        else:
            if attention_kwargs is not None and attention_kwargs.get("scale", None) is not None:
                logger.warning(
                    "Passing `scale` via `attention_kwargs` when not using the PEFT backend is ineffective."
                )

        # ensure attention_mask is a bias, and give it a singleton query_tokens dimension.
        # we may have done this conversion already, e.g. if we came here via UNet2DConditionModel#forward.
        # we can tell by counting dims; if ndim == 2: it's a mask rather than a bias.

@@ -513,6 +527,10 @@ class SanaTransformer2DModel(ModelMixin, AttentionMixin, ConfigMixin, PeftAdapte
        hidden_states = hidden_states.permute(0, 5, 1, 3, 2, 4)
        output = hidden_states.reshape(batch_size, -1, post_patch_height * p, post_patch_width * p)

        if USE_PEFT_BACKEND:
            # remove `lora_scale` from each PEFT layer
            unscale_lora_layers(self, lora_scale)

        if not return_dict:
            return (output,)
@@ -8,7 +8,7 @@ import torch.nn.functional as F
|
||||
|
||||
from ...configuration_utils import ConfigMixin, register_to_config
|
||||
from ...loaders import FromOriginalModelMixin, PeftAdapterMixin
|
||||
from ...utils import apply_lora_scale, logging
|
||||
from ...utils import USE_PEFT_BACKEND, logging, scale_lora_layers, unscale_lora_layers
|
||||
from ...utils.torch_utils import maybe_allow_in_graph
|
||||
from ..attention import AttentionModuleMixin, FeedForward
|
||||
from ..attention_dispatch import dispatch_attention_fn
|
||||
@@ -581,7 +581,6 @@ class BriaTransformer2DModel(ModelMixin, ConfigMixin, PeftAdapterMixin, FromOrig
|
||||
|
||||
self.gradient_checkpointing = False
|
||||
|
||||
@apply_lora_scale("attention_kwargs")
|
||||
def forward(
|
||||
self,
|
||||
hidden_states: torch.Tensor,
|
||||
@@ -622,6 +621,20 @@ class BriaTransformer2DModel(ModelMixin, ConfigMixin, PeftAdapterMixin, FromOrig
|
||||
If `return_dict` is True, an [`~models.transformer_2d.Transformer2DModelOutput`] is returned, otherwise a
|
||||
`tuple` where the first element is the sample tensor.
|
||||
"""
|
||||
if attention_kwargs is not None:
|
||||
attention_kwargs = attention_kwargs.copy()
|
||||
lora_scale = attention_kwargs.pop("scale", 1.0)
|
||||
else:
|
||||
lora_scale = 1.0
|
||||
|
||||
if USE_PEFT_BACKEND:
|
||||
# weight the lora layers by setting `lora_scale` for each PEFT layer
|
||||
scale_lora_layers(self, lora_scale)
|
||||
else:
|
||||
if attention_kwargs is not None and attention_kwargs.get("scale", None) is not None:
|
||||
logger.warning(
|
||||
"Passing `scale` via `attention_kwargs` when not using the PEFT backend is ineffective."
|
||||
)
|
||||
hidden_states = self.x_embedder(hidden_states)
|
||||
|
||||
timestep = timestep.to(hidden_states.dtype)
|
||||
@@ -702,6 +715,10 @@ class BriaTransformer2DModel(ModelMixin, ConfigMixin, PeftAdapterMixin, FromOrig
|
||||
hidden_states = self.norm_out(hidden_states, temb)
|
||||
output = self.proj_out(hidden_states)
|
||||
|
||||
if USE_PEFT_BACKEND:
|
||||
# remove `lora_scale` from each PEFT layer
|
||||
unscale_lora_layers(self, lora_scale)
|
||||
|
||||
if not return_dict:
|
||||
return (output,)
|
||||
|
||||
|
||||
@@ -22,8 +22,10 @@ from ...models.modeling_outputs import Transformer2DModelOutput
|
||||
from ...models.modeling_utils import ModelMixin
|
||||
from ...models.transformers.transformer_bria import BriaAttnProcessor
|
||||
from ...utils import (
|
||||
apply_lora_scale,
|
||||
USE_PEFT_BACKEND,
|
||||
logging,
|
||||
scale_lora_layers,
|
||||
unscale_lora_layers,
|
||||
)
|
||||
from ...utils.torch_utils import maybe_allow_in_graph
|
||||
from ..attention import AttentionModuleMixin, FeedForward
|
||||
@@ -508,7 +510,6 @@ class BriaFiboTransformer2DModel(ModelMixin, ConfigMixin, PeftAdapterMixin, From
|
||||
]
|
||||
self.caption_projection = nn.ModuleList(caption_projection)
|
||||
|
||||
@apply_lora_scale("joint_attention_kwargs")
|
||||
def forward(
|
||||
self,
|
||||
hidden_states: torch.Tensor,
|
||||
@@ -544,7 +545,20 @@ class BriaFiboTransformer2DModel(ModelMixin, ConfigMixin, PeftAdapterMixin, From
|
||||
If `return_dict` is True, an [`~models.transformer_2d.Transformer2DModelOutput`] is returned, otherwise a
|
||||
`tuple` where the first element is the sample tensor.
|
||||
"""
|
||||
if joint_attention_kwargs is not None:
|
||||
joint_attention_kwargs = joint_attention_kwargs.copy()
|
||||
lora_scale = joint_attention_kwargs.pop("scale", 1.0)
|
||||
else:
|
||||
lora_scale = 1.0
|
||||
|
||||
if USE_PEFT_BACKEND:
|
||||
# weight the lora layers by setting `lora_scale` for each PEFT layer
|
||||
scale_lora_layers(self, lora_scale)
|
||||
else:
|
||||
if joint_attention_kwargs is not None and joint_attention_kwargs.get("scale", None) is not None:
|
||||
logger.warning(
|
||||
"Passing `scale` via `joint_attention_kwargs` when not using the PEFT backend is ineffective."
|
||||
)
|
||||
hidden_states = self.x_embedder(hidden_states)
|
||||
|
||||
timestep = timestep.to(hidden_states.dtype)
|
||||
@@ -631,6 +645,10 @@ class BriaFiboTransformer2DModel(ModelMixin, ConfigMixin, PeftAdapterMixin, From
|
||||
hidden_states = self.norm_out(hidden_states, temb)
|
||||
output = self.proj_out(hidden_states)
|
||||
|
||||
if USE_PEFT_BACKEND:
|
||||
# remove `lora_scale` from each PEFT layer
|
||||
unscale_lora_layers(self, lora_scale)
|
||||
|
||||
if not return_dict:
|
||||
return (output,)
|
||||
|
||||
|
||||
@@ -21,7 +21,7 @@ import torch.nn as nn
|
||||
|
||||
from ...configuration_utils import ConfigMixin, register_to_config
|
||||
from ...loaders import FluxTransformer2DLoadersMixin, FromOriginalModelMixin, PeftAdapterMixin
|
||||
from ...utils import apply_lora_scale, deprecate, logging
|
||||
from ...utils import USE_PEFT_BACKEND, deprecate, logging, scale_lora_layers, unscale_lora_layers
|
||||
from ...utils.import_utils import is_torch_npu_available
|
||||
from ...utils.torch_utils import maybe_allow_in_graph
|
||||
from ..attention import AttentionMixin, FeedForward
|
||||
@@ -473,7 +473,6 @@ class ChromaTransformer2DModel(
|
||||
|
||||
self.gradient_checkpointing = False
|
||||
|
||||
@apply_lora_scale("joint_attention_kwargs")
|
||||
def forward(
|
||||
self,
|
||||
hidden_states: torch.Tensor,
|
||||
@@ -512,6 +511,20 @@ class ChromaTransformer2DModel(
|
||||
If `return_dict` is True, an [`~models.transformer_2d.Transformer2DModelOutput`] is returned, otherwise a
|
||||
`tuple` where the first element is the sample tensor.
|
||||
"""
|
||||
if joint_attention_kwargs is not None:
|
||||
joint_attention_kwargs = joint_attention_kwargs.copy()
|
||||
lora_scale = joint_attention_kwargs.pop("scale", 1.0)
|
||||
else:
|
||||
lora_scale = 1.0
|
||||
|
||||
if USE_PEFT_BACKEND:
|
||||
# weight the lora layers by setting `lora_scale` for each PEFT layer
|
||||
scale_lora_layers(self, lora_scale)
|
||||
else:
|
||||
if joint_attention_kwargs is not None and joint_attention_kwargs.get("scale", None) is not None:
|
||||
logger.warning(
|
||||
"Passing `scale` via `joint_attention_kwargs` when not using the PEFT backend is ineffective."
|
||||
)
|
||||
|
||||
hidden_states = self.x_embedder(hidden_states)
|
||||
|
||||
@@ -618,6 +631,10 @@ class ChromaTransformer2DModel(
|
||||
hidden_states = self.norm_out(hidden_states, temb)
|
||||
output = self.proj_out(hidden_states)
|
||||
|
||||
if USE_PEFT_BACKEND:
|
||||
# remove `lora_scale` from each PEFT layer
|
||||
unscale_lora_layers(self, lora_scale)
|
||||
|
||||
if not return_dict:
|
||||
return (output,)
|
||||
|
||||
|
||||
@@ -21,7 +21,7 @@ import torch.nn.functional as F
|
||||
|
||||
from ...configuration_utils import ConfigMixin, register_to_config
|
||||
from ...loaders import FromOriginalModelMixin, PeftAdapterMixin
|
||||
from ...utils import apply_lora_scale, deprecate, logging
|
||||
from ...utils import USE_PEFT_BACKEND, deprecate, logging, scale_lora_layers, unscale_lora_layers
|
||||
from ...utils.torch_utils import maybe_allow_in_graph
|
||||
from .._modeling_parallel import ContextParallelInput, ContextParallelOutput
|
||||
from ..attention import AttentionMixin, AttentionModuleMixin, FeedForward
|
||||
@@ -638,7 +638,6 @@ class ChronoEditTransformer3DModel(
|
||||
|
||||
self.gradient_checkpointing = False
|
||||
|
||||
@apply_lora_scale("attention_kwargs")
|
||||
def forward(
|
||||
self,
|
||||
hidden_states: torch.Tensor,
|
||||
@@ -648,6 +647,21 @@ class ChronoEditTransformer3DModel(
|
||||
return_dict: bool = True,
|
||||
attention_kwargs: Optional[Dict[str, Any]] = None,
|
||||
) -> Union[torch.Tensor, Dict[str, torch.Tensor]]:
|
||||
if attention_kwargs is not None:
|
||||
attention_kwargs = attention_kwargs.copy()
|
||||
lora_scale = attention_kwargs.pop("scale", 1.0)
|
||||
else:
|
||||
lora_scale = 1.0
|
||||
|
||||
if USE_PEFT_BACKEND:
|
||||
# weight the lora layers by setting `lora_scale` for each PEFT layer
|
||||
scale_lora_layers(self, lora_scale)
|
||||
else:
|
||||
if attention_kwargs is not None and attention_kwargs.get("scale", None) is not None:
|
||||
logger.warning(
|
||||
"Passing `scale` via `attention_kwargs` when not using the PEFT backend is ineffective."
|
||||
)
|
||||
|
||||
batch_size, num_channels, num_frames, height, width = hidden_states.shape
|
||||
p_t, p_h, p_w = self.config.patch_size
|
||||
post_patch_num_frames = num_frames // p_t
|
||||
@@ -715,6 +729,10 @@ class ChronoEditTransformer3DModel(
|
||||
hidden_states = hidden_states.permute(0, 7, 1, 4, 2, 5, 3, 6)
|
||||
output = hidden_states.flatten(6, 7).flatten(4, 5).flatten(2, 3)
|
||||
|
||||
if USE_PEFT_BACKEND:
|
||||
# remove `lora_scale` from each PEFT layer
|
||||
unscale_lora_layers(self, lora_scale)
|
||||
|
||||
if not return_dict:
|
||||
return (output,)
|
||||
|
||||
|
||||
@@ -20,7 +20,7 @@ import torch.nn.functional as F
|
||||
|
||||
from ...configuration_utils import ConfigMixin, register_to_config
|
||||
from ...loaders import PeftAdapterMixin
|
||||
from ...utils import apply_lora_scale, logging
|
||||
from ...utils import USE_PEFT_BACKEND, logging, scale_lora_layers, unscale_lora_layers
|
||||
from ...utils.torch_utils import maybe_allow_in_graph
|
||||
from ..attention import FeedForward
|
||||
from ..attention_processor import Attention
|
||||
@@ -703,7 +703,6 @@ class CogView4Transformer2DModel(ModelMixin, ConfigMixin, PeftAdapterMixin, Cach
|
||||
|
||||
self.gradient_checkpointing = False
|
||||
|
||||
@apply_lora_scale("attention_kwargs")
|
||||
def forward(
|
||||
self,
|
||||
hidden_states: torch.Tensor,
|
||||
@@ -719,6 +718,21 @@ class CogView4Transformer2DModel(ModelMixin, ConfigMixin, PeftAdapterMixin, Cach
|
||||
Union[Tuple[torch.Tensor, torch.Tensor], List[Tuple[torch.Tensor, torch.Tensor]]]
|
||||
] = None,
|
||||
) -> Union[Tuple[torch.Tensor], Transformer2DModelOutput]:
|
||||
if attention_kwargs is not None:
|
||||
attention_kwargs = attention_kwargs.copy()
|
||||
lora_scale = attention_kwargs.pop("scale", 1.0)
|
||||
else:
|
||||
lora_scale = 1.0
|
||||
|
||||
if USE_PEFT_BACKEND:
|
||||
# weight the lora layers by setting `lora_scale` for each PEFT layer
|
||||
scale_lora_layers(self, lora_scale)
|
||||
else:
|
||||
if attention_kwargs is not None and attention_kwargs.get("scale", None) is not None:
|
||||
logger.warning(
|
||||
"Passing `scale` via `attention_kwargs` when not using the PEFT backend is ineffective."
|
||||
)
|
||||
|
||||
batch_size, num_channels, height, width = hidden_states.shape
|
||||
|
||||
# 1. RoPE
|
||||
@@ -765,6 +779,10 @@ class CogView4Transformer2DModel(ModelMixin, ConfigMixin, PeftAdapterMixin, Cach
|
||||
hidden_states = hidden_states.reshape(batch_size, post_patch_height, post_patch_width, -1, p, p)
|
||||
output = hidden_states.permute(0, 3, 1, 4, 2, 5).flatten(4, 5).flatten(2, 3)
|
||||
|
||||
if USE_PEFT_BACKEND:
|
||||
# remove `lora_scale` from each PEFT layer
|
||||
unscale_lora_layers(self, lora_scale)
|
||||
|
||||
if not return_dict:
|
||||
return (output,)
|
||||
return Transformer2DModelOutput(sample=output)
|
||||
|
||||
@@ -22,7 +22,7 @@ import torch.nn.functional as F
|
||||
|
||||
from ...configuration_utils import ConfigMixin, register_to_config
|
||||
from ...loaders import FluxTransformer2DLoadersMixin, FromOriginalModelMixin, PeftAdapterMixin
|
||||
from ...utils import apply_lora_scale, logging
|
||||
from ...utils import USE_PEFT_BACKEND, logging, scale_lora_layers, unscale_lora_layers
|
||||
from ...utils.torch_utils import maybe_allow_in_graph
|
||||
from .._modeling_parallel import ContextParallelInput, ContextParallelOutput
|
||||
from ..attention import AttentionMixin, AttentionModuleMixin, FeedForward
|
||||
@@ -634,7 +634,6 @@ class FluxTransformer2DModel(
|
||||
|
||||
self.gradient_checkpointing = False
|
||||
|
||||
@apply_lora_scale("joint_attention_kwargs")
|
||||
def forward(
|
||||
self,
|
||||
hidden_states: torch.Tensor,
|
||||
@@ -676,6 +675,20 @@ class FluxTransformer2DModel(
|
||||
If `return_dict` is True, an [`~models.transformer_2d.Transformer2DModelOutput`] is returned, otherwise a
|
||||
`tuple` where the first element is the sample tensor.
|
||||
"""
|
||||
if joint_attention_kwargs is not None:
|
||||
joint_attention_kwargs = joint_attention_kwargs.copy()
|
||||
lora_scale = joint_attention_kwargs.pop("scale", 1.0)
|
||||
else:
|
||||
lora_scale = 1.0
|
||||
|
||||
if USE_PEFT_BACKEND:
|
||||
# weight the lora layers by setting `lora_scale` for each PEFT layer
|
||||
scale_lora_layers(self, lora_scale)
|
||||
else:
|
||||
if joint_attention_kwargs is not None and joint_attention_kwargs.get("scale", None) is not None:
|
||||
logger.warning(
|
||||
"Passing `scale` via `joint_attention_kwargs` when not using the PEFT backend is ineffective."
|
||||
)
|
||||
|
||||
hidden_states = self.x_embedder(hidden_states)
|
||||
|
||||
@@ -772,6 +785,10 @@ class FluxTransformer2DModel(
|
||||
hidden_states = self.norm_out(hidden_states, temb)
|
||||
output = self.proj_out(hidden_states)
|
||||
|
||||
if USE_PEFT_BACKEND:
|
||||
# remove `lora_scale` from each PEFT layer
|
||||
unscale_lora_layers(self, lora_scale)
|
||||
|
||||
if not return_dict:
|
||||
return (output,)
|
||||
|
||||
|
||||
@@ -21,7 +21,7 @@ import torch.nn.functional as F
|
||||
|
||||
from ...configuration_utils import ConfigMixin, register_to_config
|
||||
from ...loaders import FluxTransformer2DLoadersMixin, FromOriginalModelMixin, PeftAdapterMixin
|
||||
from ...utils import apply_lora_scale, logging
|
||||
from ...utils import USE_PEFT_BACKEND, logging, scale_lora_layers, unscale_lora_layers
|
||||
from .._modeling_parallel import ContextParallelInput, ContextParallelOutput
|
||||
from ..attention import AttentionMixin, AttentionModuleMixin
|
||||
from ..attention_dispatch import dispatch_attention_fn
|
||||
@@ -774,7 +774,6 @@ class Flux2Transformer2DModel(
|
||||
|
||||
self.gradient_checkpointing = False
|
||||
|
||||
@apply_lora_scale("joint_attention_kwargs")
|
||||
def forward(
|
||||
self,
|
||||
hidden_states: torch.Tensor,
|
||||
@@ -811,6 +810,20 @@ class Flux2Transformer2DModel(
|
||||
`tuple` where the first element is the sample tensor.
|
||||
"""
|
||||
# 0. Handle input arguments
|
||||
if joint_attention_kwargs is not None:
|
||||
joint_attention_kwargs = joint_attention_kwargs.copy()
|
||||
lora_scale = joint_attention_kwargs.pop("scale", 1.0)
|
||||
else:
|
||||
lora_scale = 1.0
|
||||
|
||||
if USE_PEFT_BACKEND:
|
||||
# weight the lora layers by setting `lora_scale` for each PEFT layer
|
||||
scale_lora_layers(self, lora_scale)
|
||||
else:
|
||||
if joint_attention_kwargs is not None and joint_attention_kwargs.get("scale", None) is not None:
|
||||
logger.warning(
|
||||
"Passing `scale` via `joint_attention_kwargs` when not using the PEFT backend is ineffective."
|
||||
)
|
||||
|
||||
num_txt_tokens = encoder_hidden_states.shape[1]
|
||||
|
||||
@@ -895,6 +908,10 @@ class Flux2Transformer2DModel(
|
||||
hidden_states = self.norm_out(hidden_states, temb)
|
||||
output = self.proj_out(hidden_states)
|
||||
|
||||
if USE_PEFT_BACKEND:
|
||||
# remove `lora_scale` from each PEFT layer
|
||||
unscale_lora_layers(self, lora_scale)
|
||||
|
||||
if not return_dict:
|
||||
return (output,)
|
||||
|
||||
|
||||
@@ -8,7 +8,7 @@ from ...configuration_utils import ConfigMixin, register_to_config
|
||||
from ...loaders import FromOriginalModelMixin, PeftAdapterMixin
|
||||
from ...models.modeling_outputs import Transformer2DModelOutput
|
||||
from ...models.modeling_utils import ModelMixin
|
||||
from ...utils import apply_lora_scale, deprecate, logging
|
||||
from ...utils import USE_PEFT_BACKEND, deprecate, logging, scale_lora_layers, unscale_lora_layers
|
||||
from ...utils.torch_utils import maybe_allow_in_graph
|
||||
from ..attention import Attention
|
||||
from ..embeddings import TimestepEmbedding, Timesteps
|
||||
@@ -773,7 +773,6 @@ class HiDreamImageTransformer2DModel(ModelMixin, ConfigMixin, PeftAdapterMixin,
|
||||
|
||||
return hidden_states, hidden_states_masks, img_sizes, img_ids
|
||||
|
||||
@apply_lora_scale("attention_kwargs")
|
||||
def forward(
|
||||
self,
|
||||
hidden_states: torch.Tensor,
|
||||
@@ -809,6 +808,21 @@ class HiDreamImageTransformer2DModel(ModelMixin, ConfigMixin, PeftAdapterMixin,
|
||||
"if `hidden_states_masks` is passed, `hidden_states` must be a 3D tensors with shape (batch_size, patch_height * patch_width, patch_size * patch_size * channels)"
|
||||
)
|
||||
|
||||
if attention_kwargs is not None:
|
||||
attention_kwargs = attention_kwargs.copy()
|
||||
lora_scale = attention_kwargs.pop("scale", 1.0)
|
||||
else:
|
||||
lora_scale = 1.0
|
||||
|
||||
if USE_PEFT_BACKEND:
|
||||
# weight the lora layers by setting `lora_scale` for each PEFT layer
|
||||
scale_lora_layers(self, lora_scale)
|
||||
else:
|
||||
if attention_kwargs is not None and attention_kwargs.get("scale", None) is not None:
|
||||
logger.warning(
|
||||
"Passing `scale` via `attention_kwargs` when not using the PEFT backend is ineffective."
|
||||
)
|
||||
|
||||
# spatial forward
|
||||
batch_size = hidden_states.shape[0]
|
||||
hidden_states_type = hidden_states.dtype
|
||||
@@ -919,6 +933,10 @@ class HiDreamImageTransformer2DModel(ModelMixin, ConfigMixin, PeftAdapterMixin,
|
||||
if hidden_states_masks is not None:
|
||||
hidden_states_masks = hidden_states_masks[:, :image_tokens_seq_len]
|
||||
|
||||
if USE_PEFT_BACKEND:
|
||||
# remove `lora_scale` from each PEFT layer
|
||||
unscale_lora_layers(self, lora_scale)
|
||||
|
||||
if not return_dict:
|
||||
return (output,)
|
||||
return Transformer2DModelOutput(sample=output)
|
||||
|
||||
@@ -22,7 +22,7 @@ from diffusers.loaders import FromOriginalModelMixin
|
||||
|
||||
from ...configuration_utils import ConfigMixin, register_to_config
|
||||
from ...loaders import PeftAdapterMixin
|
||||
from ...utils import apply_lora_scale, logging
|
||||
from ...utils import USE_PEFT_BACKEND, logging, scale_lora_layers, unscale_lora_layers
|
||||
from ..attention import AttentionMixin, FeedForward
|
||||
from ..attention_dispatch import dispatch_attention_fn
|
||||
from ..attention_processor import Attention
|
||||
@@ -989,7 +989,6 @@ class HunyuanVideoTransformer3DModel(
|
||||
|
||||
self.gradient_checkpointing = False
|
||||
|
||||
@apply_lora_scale("attention_kwargs")
|
||||
def forward(
|
||||
self,
|
||||
hidden_states: torch.Tensor,
|
||||
@@ -1001,6 +1000,21 @@ class HunyuanVideoTransformer3DModel(
|
||||
attention_kwargs: Optional[Dict[str, Any]] = None,
|
||||
return_dict: bool = True,
|
||||
) -> Union[Tuple[torch.Tensor], Transformer2DModelOutput]:
|
||||
if attention_kwargs is not None:
|
||||
attention_kwargs = attention_kwargs.copy()
|
||||
lora_scale = attention_kwargs.pop("scale", 1.0)
|
||||
else:
|
||||
lora_scale = 1.0
|
||||
|
||||
if USE_PEFT_BACKEND:
|
||||
# weight the lora layers by setting `lora_scale` for each PEFT layer
|
||||
scale_lora_layers(self, lora_scale)
|
||||
else:
|
||||
if attention_kwargs is not None and attention_kwargs.get("scale", None) is not None:
|
||||
logger.warning(
|
||||
"Passing `scale` via `attention_kwargs` when not using the PEFT backend is ineffective."
|
||||
)
|
||||
|
||||
batch_size, num_channels, num_frames, height, width = hidden_states.shape
|
||||
p, p_t = self.config.patch_size, self.config.patch_size_t
|
||||
post_patch_num_frames = num_frames // p_t
|
||||
@@ -1090,6 +1104,10 @@ class HunyuanVideoTransformer3DModel(
|
||||
hidden_states = hidden_states.permute(0, 4, 1, 5, 2, 6, 3, 7)
|
||||
hidden_states = hidden_states.flatten(6, 7).flatten(4, 5).flatten(2, 3)
|
||||
|
||||
if USE_PEFT_BACKEND:
|
||||
# remove `lora_scale` from each PEFT layer
|
||||
unscale_lora_layers(self, lora_scale)
|
||||
|
||||
if not return_dict:
|
||||
return (hidden_states,)
|
||||
|
||||
|
||||
@@ -22,7 +22,7 @@ from diffusers.loaders import FromOriginalModelMixin
|
||||
|
||||
from ...configuration_utils import ConfigMixin, register_to_config
|
||||
from ...loaders import PeftAdapterMixin
|
||||
from ...utils import apply_lora_scale, logging
|
||||
from ...utils import USE_PEFT_BACKEND, logging, scale_lora_layers, unscale_lora_layers
|
||||
from ..attention import AttentionMixin, FeedForward
|
||||
from ..attention_dispatch import dispatch_attention_fn
|
||||
from ..attention_processor import Attention
|
||||
@@ -620,7 +620,6 @@ class HunyuanVideo15Transformer3DModel(
|
||||
|
||||
self.gradient_checkpointing = False
|
||||
|
||||
@apply_lora_scale("attention_kwargs")
|
||||
def forward(
|
||||
self,
|
||||
hidden_states: torch.Tensor,
|
||||
@@ -634,6 +633,21 @@ class HunyuanVideo15Transformer3DModel(
|
||||
attention_kwargs: Optional[Dict[str, Any]] = None,
|
||||
return_dict: bool = True,
|
||||
) -> Union[Tuple[torch.Tensor], Transformer2DModelOutput]:
|
||||
if attention_kwargs is not None:
|
||||
attention_kwargs = attention_kwargs.copy()
|
||||
lora_scale = attention_kwargs.pop("scale", 1.0)
|
||||
else:
|
||||
lora_scale = 1.0
|
||||
|
||||
if USE_PEFT_BACKEND:
|
||||
# weight the lora layers by setting `lora_scale` for each PEFT layer
|
||||
scale_lora_layers(self, lora_scale)
|
||||
else:
|
||||
if attention_kwargs is not None and attention_kwargs.get("scale", None) is not None:
|
||||
logger.warning(
|
||||
"Passing `scale` via `attention_kwargs` when not using the PEFT backend is ineffective."
|
||||
)
|
||||
|
||||
batch_size, num_channels, num_frames, height, width = hidden_states.shape
|
||||
p_t, p_h, p_w = self.config.patch_size_t, self.config.patch_size, self.config.patch_size
|
||||
post_patch_num_frames = num_frames // p_t
|
||||
@@ -769,6 +783,10 @@ class HunyuanVideo15Transformer3DModel(
|
||||
hidden_states = hidden_states.permute(0, 4, 1, 5, 2, 6, 3, 7)
|
||||
hidden_states = hidden_states.flatten(6, 7).flatten(4, 5).flatten(2, 3)
|
||||
|
||||
if USE_PEFT_BACKEND:
|
||||
# remove `lora_scale` from each PEFT layer
|
||||
unscale_lora_layers(self, lora_scale)
|
||||
|
||||
if not return_dict:
|
||||
return (hidden_states,)
|
||||
|
||||
|
||||
@@ -20,7 +20,7 @@ import torch.nn.functional as F
|
||||
|
||||
from ...configuration_utils import ConfigMixin, register_to_config
|
||||
from ...loaders import FromOriginalModelMixin, PeftAdapterMixin
|
||||
from ...utils import apply_lora_scale, get_logger
|
||||
from ...utils import USE_PEFT_BACKEND, get_logger, scale_lora_layers, unscale_lora_layers
|
||||
from ..cache_utils import CacheMixin
|
||||
from ..embeddings import get_1d_rotary_pos_embed
|
||||
from ..modeling_outputs import Transformer2DModelOutput
|
||||
@@ -198,7 +198,6 @@ class HunyuanVideoFramepackTransformer3DModel(
|
||||
|
||||
self.gradient_checkpointing = False
|
||||
|
||||
@apply_lora_scale("attention_kwargs")
|
||||
def forward(
|
||||
self,
|
||||
hidden_states: torch.Tensor,
|
||||
@@ -218,6 +217,21 @@ class HunyuanVideoFramepackTransformer3DModel(
|
||||
attention_kwargs: Optional[Dict[str, Any]] = None,
|
||||
return_dict: bool = True,
|
||||
) -> Union[Tuple[torch.Tensor], Transformer2DModelOutput]:
|
||||
if attention_kwargs is not None:
|
||||
attention_kwargs = attention_kwargs.copy()
|
||||
lora_scale = attention_kwargs.pop("scale", 1.0)
|
||||
else:
|
||||
lora_scale = 1.0
|
||||
|
||||
if USE_PEFT_BACKEND:
|
||||
# weight the lora layers by setting `lora_scale` for each PEFT layer
|
||||
scale_lora_layers(self, lora_scale)
|
||||
else:
|
||||
if attention_kwargs is not None and attention_kwargs.get("scale", None) is not None:
|
||||
logger.warning(
|
||||
"Passing `scale` via `attention_kwargs` when not using the PEFT backend is ineffective."
|
||||
)
|
||||
|
||||
batch_size, num_channels, num_frames, height, width = hidden_states.shape
|
||||
p, p_t = self.config.patch_size, self.config.patch_size_t
|
||||
post_patch_num_frames = num_frames // p_t
|
||||
@@ -323,6 +337,10 @@ class HunyuanVideoFramepackTransformer3DModel(
|
||||
hidden_states = hidden_states.permute(0, 4, 1, 5, 2, 6, 3, 7)
|
||||
hidden_states = hidden_states.flatten(6, 7).flatten(4, 5).flatten(2, 3)
|
||||
|
||||
if USE_PEFT_BACKEND:
|
||||
# remove `lora_scale` from each PEFT layer
|
||||
unscale_lora_layers(self, lora_scale)
|
||||
|
||||
if not return_dict:
|
||||
return (hidden_states,)
|
||||
return Transformer2DModelOutput(sample=hidden_states)
|
||||
|
||||
@@ -23,7 +23,7 @@ from diffusers.loaders import FromOriginalModelMixin
|
||||
|
||||
from ...configuration_utils import ConfigMixin, register_to_config
|
||||
from ...loaders import PeftAdapterMixin
|
||||
from ...utils import apply_lora_scale, logging
|
||||
from ...utils import USE_PEFT_BACKEND, logging, scale_lora_layers, unscale_lora_layers
|
||||
from ...utils.torch_utils import maybe_allow_in_graph
|
||||
from ..attention import AttentionMixin, FeedForward
|
||||
from ..attention_dispatch import dispatch_attention_fn
|
||||
@@ -742,7 +742,6 @@ class HunyuanImageTransformer2DModel(
|
||||
|
||||
self.gradient_checkpointing = False
|
||||
|
||||
@apply_lora_scale("attention_kwargs")
|
||||
def forward(
|
||||
self,
|
||||
hidden_states: torch.Tensor,
|
||||
@@ -756,6 +755,21 @@ class HunyuanImageTransformer2DModel(
|
||||
attention_kwargs: Optional[Dict[str, Any]] = None,
|
||||
return_dict: bool = True,
|
||||
) -> Union[torch.Tensor, Dict[str, torch.Tensor]]:
|
||||
if attention_kwargs is not None:
|
||||
attention_kwargs = attention_kwargs.copy()
|
||||
lora_scale = attention_kwargs.pop("scale", 1.0)
|
||||
else:
|
||||
lora_scale = 1.0
|
||||
|
||||
if USE_PEFT_BACKEND:
|
||||
# weight the lora layers by setting `lora_scale` for each PEFT layer
|
||||
scale_lora_layers(self, lora_scale)
|
||||
else:
|
||||
if attention_kwargs is not None and attention_kwargs.get("scale", None) is not None:
|
||||
logger.warning(
|
||||
"Passing `scale` via `attention_kwargs` when not using the PEFT backend is ineffective."
|
||||
)
|
||||
|
||||
if hidden_states.ndim == 4:
|
||||
batch_size, channels, height, width = hidden_states.shape
|
||||
sizes = (height, width)
|
||||
@@ -886,6 +900,10 @@ class HunyuanImageTransformer2DModel(
|
||||
]
|
||||
hidden_states = hidden_states.reshape(*final_dims)
|
||||
|
||||
if USE_PEFT_BACKEND:
|
||||
# remove `lora_scale` from each PEFT layer
|
||||
unscale_lora_layers(self, lora_scale)
|
||||
|
||||
if not return_dict:
|
||||
return (hidden_states,)
|
||||
|
||||
|
||||
@@ -22,7 +22,7 @@ import torch.nn as nn
|
||||
|
||||
from ...configuration_utils import ConfigMixin, register_to_config
|
||||
from ...loaders import FromOriginalModelMixin, PeftAdapterMixin
|
||||
from ...utils import apply_lora_scale, deprecate, is_torch_version, logging
|
||||
from ...utils import USE_PEFT_BACKEND, deprecate, is_torch_version, logging, scale_lora_layers, unscale_lora_layers
|
||||
from ...utils.torch_utils import maybe_allow_in_graph
|
||||
from .._modeling_parallel import ContextParallelInput, ContextParallelOutput
|
||||
from ..attention import AttentionMixin, AttentionModuleMixin, FeedForward
|
||||
@@ -491,7 +491,6 @@ class LTXVideoTransformer3DModel(
|
||||
|
||||
self.gradient_checkpointing = False
|
||||
|
||||
@apply_lora_scale("attention_kwargs")
|
||||
def forward(
|
||||
self,
|
||||
hidden_states: torch.Tensor,
|
||||
@@ -506,6 +505,21 @@ class LTXVideoTransformer3DModel(
|
||||
attention_kwargs: Optional[Dict[str, Any]] = None,
|
||||
return_dict: bool = True,
|
||||
) -> torch.Tensor:
|
||||
if attention_kwargs is not None:
|
||||
attention_kwargs = attention_kwargs.copy()
|
||||
lora_scale = attention_kwargs.pop("scale", 1.0)
|
||||
else:
|
||||
lora_scale = 1.0
|
||||
|
||||
if USE_PEFT_BACKEND:
|
||||
# weight the lora layers by setting `lora_scale` for each PEFT layer
|
||||
scale_lora_layers(self, lora_scale)
|
||||
else:
|
||||
if attention_kwargs is not None and attention_kwargs.get("scale", None) is not None:
|
||||
logger.warning(
|
||||
"Passing `scale` via `attention_kwargs` when not using the PEFT backend is ineffective."
|
||||
)
|
||||
|
||||
image_rotary_emb = self.rope(hidden_states, num_frames, height, width, rope_interpolation_scale, video_coords)
|
||||
|
||||
# convert encoder_attention_mask to a bias the same way we do for attention_mask
|
||||
@@ -554,6 +568,10 @@ class LTXVideoTransformer3DModel(
|
||||
hidden_states = hidden_states * (1 + scale) + shift
|
||||
output = self.proj_out(hidden_states)
|
||||
|
||||
if USE_PEFT_BACKEND:
|
||||
# remove `lora_scale` from each PEFT layer
|
||||
unscale_lora_layers(self, lora_scale)
|
||||
|
||||
if not return_dict:
|
||||
return (output,)
|
||||
return Transformer2DModelOutput(sample=output)
|
||||
|
||||
@@ -22,7 +22,14 @@ import torch.nn as nn
|
||||
|
||||
from ...configuration_utils import ConfigMixin, register_to_config
|
||||
from ...loaders import FromOriginalModelMixin, PeftAdapterMixin
|
||||
from ...utils import BaseOutput, apply_lora_scale, is_torch_version, logging
|
||||
from ...utils import (
|
||||
USE_PEFT_BACKEND,
|
||||
BaseOutput,
|
||||
is_torch_version,
|
||||
logging,
|
||||
scale_lora_layers,
|
||||
unscale_lora_layers,
|
||||
)
|
||||
from .._modeling_parallel import ContextParallelInput, ContextParallelOutput
|
||||
from ..attention import AttentionMixin, AttentionModuleMixin, FeedForward
|
||||
from ..attention_dispatch import dispatch_attention_fn
|
||||
@@ -1094,7 +1101,6 @@ class LTX2VideoTransformer3DModel(
|
||||
|
||||
self.gradient_checkpointing = False
|
||||
|
||||
@apply_lora_scale("attention_kwargs")
|
||||
def forward(
|
||||
self,
|
||||
hidden_states: torch.Tensor,
|
||||
@@ -1165,6 +1171,21 @@ class LTX2VideoTransformer3DModel(
|
||||
`tuple` is returned where the first element is the denoised video latent patch sequence and the second
|
||||
element is the denoised audio latent patch sequence.
|
||||
"""
|
||||
if attention_kwargs is not None:
|
||||
attention_kwargs = attention_kwargs.copy()
|
||||
lora_scale = attention_kwargs.pop("scale", 1.0)
|
||||
else:
|
||||
lora_scale = 1.0
|
||||
|
||||
if USE_PEFT_BACKEND:
|
||||
# weight the lora layers by setting `lora_scale` for each PEFT layer
|
||||
scale_lora_layers(self, lora_scale)
|
||||
else:
|
||||
if attention_kwargs is not None and attention_kwargs.get("scale", None) is not None:
|
||||
logger.warning(
|
||||
"Passing `scale` via `attention_kwargs` when not using the PEFT backend is ineffective."
|
||||
)
|
||||
|
||||
# Determine timestep for audio.
|
||||
audio_timestep = audio_timestep if audio_timestep is not None else timestep
|
||||
|
||||
@@ -1320,6 +1341,10 @@ class LTX2VideoTransformer3DModel(
|
||||
audio_hidden_states = audio_hidden_states * (1 + audio_scale) + audio_shift
|
||||
audio_output = self.audio_proj_out(audio_hidden_states)
|
||||
|
||||
if USE_PEFT_BACKEND:
|
||||
# remove `lora_scale` from each PEFT layer
|
||||
unscale_lora_layers(self, lora_scale)
|
||||
|
||||
if not return_dict:
|
||||
return (output, audio_output)
|
||||
return AudioVisualModelOutput(sample=output, audio_sample=audio_output)
|
||||
|
||||
@@ -22,7 +22,7 @@ import torch.nn.functional as F
|
||||
from ...configuration_utils import ConfigMixin, register_to_config
|
||||
from ...loaders import PeftAdapterMixin
|
||||
from ...loaders.single_file_model import FromOriginalModelMixin
|
||||
from ...utils import apply_lora_scale, logging
|
||||
from ...utils import USE_PEFT_BACKEND, logging, scale_lora_layers, unscale_lora_layers
|
||||
from ..attention import LuminaFeedForward
|
||||
from ..attention_processor import Attention
|
||||
from ..embeddings import TimestepEmbedding, Timesteps, apply_rotary_emb, get_1d_rotary_pos_embed
|
||||
@@ -455,7 +455,6 @@ class Lumina2Transformer2DModel(ModelMixin, ConfigMixin, PeftAdapterMixin, FromO
|
||||
|
||||
self.gradient_checkpointing = False
|
||||
|
||||
@apply_lora_scale("attention_kwargs")
|
||||
def forward(
|
||||
self,
|
||||
hidden_states: torch.Tensor,
|
||||
@@ -465,6 +464,21 @@ class Lumina2Transformer2DModel(ModelMixin, ConfigMixin, PeftAdapterMixin, FromO
|
||||
attention_kwargs: Optional[Dict[str, Any]] = None,
|
||||
return_dict: bool = True,
|
||||
) -> Union[torch.Tensor, Transformer2DModelOutput]:
|
||||
if attention_kwargs is not None:
|
||||
attention_kwargs = attention_kwargs.copy()
|
||||
lora_scale = attention_kwargs.pop("scale", 1.0)
|
||||
else:
|
||||
lora_scale = 1.0
|
||||
|
||||
if USE_PEFT_BACKEND:
|
||||
# weight the lora layers by setting `lora_scale` for each PEFT layer
|
||||
scale_lora_layers(self, lora_scale)
|
||||
else:
|
||||
if attention_kwargs is not None and attention_kwargs.get("scale", None) is not None:
|
||||
logger.warning(
|
||||
"Passing `scale` via `attention_kwargs` when not using the PEFT backend is ineffective."
|
||||
)
|
||||
|
||||
# 1. Condition, positional & patch embedding
|
||||
batch_size, _, height, width = hidden_states.shape
|
||||
|
||||
@@ -525,6 +539,10 @@ class Lumina2Transformer2DModel(ModelMixin, ConfigMixin, PeftAdapterMixin, FromO
|
||||
)
|
||||
output = torch.stack(output, dim=0)
|
||||
|
||||
if USE_PEFT_BACKEND:
|
||||
# remove `lora_scale` from each PEFT layer
|
||||
unscale_lora_layers(self, lora_scale)
|
||||
|
||||
if not return_dict:
|
||||
return (output,)
|
||||
return Transformer2DModelOutput(sample=output)
|
||||
|
||||
@@ -21,7 +21,7 @@ import torch.nn as nn
|
||||
from ...configuration_utils import ConfigMixin, register_to_config
|
||||
from ...loaders import PeftAdapterMixin
|
||||
from ...loaders.single_file_model import FromOriginalModelMixin
|
||||
from ...utils import apply_lora_scale, logging
|
||||
from ...utils import USE_PEFT_BACKEND, logging, scale_lora_layers, unscale_lora_layers
|
||||
from ...utils.torch_utils import maybe_allow_in_graph
|
||||
from ..attention import FeedForward
|
||||
from ..attention_processor import MochiAttention, MochiAttnProcessor2_0
|
||||
@@ -404,7 +404,6 @@ class MochiTransformer3DModel(ModelMixin, ConfigMixin, PeftAdapterMixin, FromOri
|
||||
|
||||
self.gradient_checkpointing = False
|
||||
|
||||
@apply_lora_scale("attention_kwargs")
|
||||
def forward(
|
||||
self,
|
||||
hidden_states: torch.Tensor,
|
||||
@@ -414,6 +413,21 @@ class MochiTransformer3DModel(ModelMixin, ConfigMixin, PeftAdapterMixin, FromOri
|
||||
attention_kwargs: Optional[Dict[str, Any]] = None,
|
||||
return_dict: bool = True,
|
||||
) -> torch.Tensor:
|
||||
if attention_kwargs is not None:
|
||||
attention_kwargs = attention_kwargs.copy()
|
||||
lora_scale = attention_kwargs.pop("scale", 1.0)
|
||||
else:
|
||||
lora_scale = 1.0
|
||||
|
||||
if USE_PEFT_BACKEND:
|
||||
# weight the lora layers by setting `lora_scale` for each PEFT layer
|
||||
scale_lora_layers(self, lora_scale)
|
||||
else:
|
||||
if attention_kwargs is not None and attention_kwargs.get("scale", None) is not None:
|
||||
logger.warning(
|
||||
"Passing `scale` via `attention_kwargs` when not using the PEFT backend is ineffective."
|
||||
)
|
||||
|
||||
batch_size, num_channels, num_frames, height, width = hidden_states.shape
|
||||
p = self.config.patch_size
|
||||
|
||||
@@ -465,6 +479,10 @@ class MochiTransformer3DModel(ModelMixin, ConfigMixin, PeftAdapterMixin, FromOri
|
||||
hidden_states = hidden_states.permute(0, 6, 1, 2, 4, 3, 5)
|
||||
output = hidden_states.reshape(batch_size, -1, num_frames, height, width)
|
||||
|
||||
if USE_PEFT_BACKEND:
|
||||
# remove `lora_scale` from each PEFT layer
|
||||
unscale_lora_layers(self, lora_scale)
|
||||
|
||||
if not return_dict:
|
||||
return (output,)
|
||||
return Transformer2DModelOutput(sample=output)
|
||||
|
||||
@@ -24,7 +24,7 @@ import torch.nn.functional as F
|
||||
|
||||
from ...configuration_utils import ConfigMixin, register_to_config
|
||||
from ...loaders import FromOriginalModelMixin, PeftAdapterMixin
|
||||
from ...utils import apply_lora_scale, deprecate, logging
|
||||
from ...utils import USE_PEFT_BACKEND, deprecate, logging, scale_lora_layers, unscale_lora_layers
|
||||
from ...utils.torch_utils import maybe_allow_in_graph
|
||||
from .._modeling_parallel import ContextParallelInput, ContextParallelOutput
|
||||
from ..attention import AttentionMixin, FeedForward
|
||||
@@ -829,7 +829,6 @@ class QwenImageTransformer2DModel(
|
||||
self.gradient_checkpointing = False
|
||||
self.zero_cond_t = zero_cond_t
|
||||
|
||||
@apply_lora_scale("attention_kwargs")
|
||||
def forward(
|
||||
self,
|
||||
hidden_states: torch.Tensor,
|
||||
@@ -888,6 +887,20 @@ class QwenImageTransformer2DModel(
|
||||
"The mask-based approach is more flexible and supports variable-length sequences.",
|
||||
standard_warn=False,
|
||||
)
|
||||
if attention_kwargs is not None:
|
||||
attention_kwargs = attention_kwargs.copy()
|
||||
lora_scale = attention_kwargs.pop("scale", 1.0)
|
||||
else:
|
||||
lora_scale = 1.0
|
||||
|
||||
if USE_PEFT_BACKEND:
|
||||
# weight the lora layers by setting `lora_scale` for each PEFT layer
|
||||
scale_lora_layers(self, lora_scale)
|
||||
else:
|
||||
if attention_kwargs is not None and attention_kwargs.get("scale", None) is not None:
|
||||
logger.warning(
|
||||
"Passing `scale` via `joint_attention_kwargs` when not using the PEFT backend is ineffective."
|
||||
)
|
||||
|
||||
hidden_states = self.img_in(hidden_states)
|
||||
|
||||
@@ -968,6 +981,10 @@ class QwenImageTransformer2DModel(
|
||||
hidden_states = self.norm_out(hidden_states, temb)
|
||||
output = self.proj_out(hidden_states)
|
||||
|
||||
if USE_PEFT_BACKEND:
|
||||
# remove `lora_scale` from each PEFT layer
|
||||
unscale_lora_layers(self, lora_scale)
|
||||
|
||||
if not return_dict:
|
||||
return (output,)
|
||||
|
||||
|
||||
@@ -21,7 +21,7 @@ from torch import nn
|
||||
|
||||
from ...configuration_utils import ConfigMixin, register_to_config
|
||||
from ...loaders import FromOriginalModelMixin, PeftAdapterMixin
|
||||
from ...utils import apply_lora_scale, logging
|
||||
from ...utils import USE_PEFT_BACKEND, logging, scale_lora_layers, unscale_lora_layers
|
||||
from ..attention import AttentionMixin
|
||||
from ..attention_dispatch import dispatch_attention_fn
|
||||
from ..attention_processor import Attention
|
||||
@@ -570,7 +570,6 @@ class SanaVideoTransformer3DModel(ModelMixin, ConfigMixin, PeftAdapterMixin, Fro
|
||||
|
||||
self.gradient_checkpointing = False
|
||||
|
||||
@apply_lora_scale("attention_kwargs")
|
||||
def forward(
|
||||
self,
|
||||
hidden_states: torch.Tensor,
|
||||
@@ -583,6 +582,21 @@ class SanaVideoTransformer3DModel(ModelMixin, ConfigMixin, PeftAdapterMixin, Fro
|
||||
controlnet_block_samples: Optional[Tuple[torch.Tensor]] = None,
|
||||
return_dict: bool = True,
|
||||
) -> Union[Tuple[torch.Tensor, ...], Transformer2DModelOutput]:
|
||||
if attention_kwargs is not None:
|
||||
attention_kwargs = attention_kwargs.copy()
|
||||
lora_scale = attention_kwargs.pop("scale", 1.0)
|
||||
else:
|
||||
lora_scale = 1.0
|
||||
|
||||
if USE_PEFT_BACKEND:
|
||||
# weight the lora layers by setting `lora_scale` for each PEFT layer
|
||||
scale_lora_layers(self, lora_scale)
|
||||
else:
|
||||
if attention_kwargs is not None and attention_kwargs.get("scale", None) is not None:
|
||||
logger.warning(
|
||||
"Passing `scale` via `attention_kwargs` when not using the PEFT backend is ineffective."
|
||||
)
|
||||
|
||||
# ensure attention_mask is a bias, and give it a singleton query_tokens dimension.
|
||||
# we may have done this conversion already, e.g. if we came here via UNet2DConditionModel#forward.
|
||||
# we can tell by counting dims; if ndim == 2: it's a mask rather than a bias.
|
||||
@@ -681,6 +695,10 @@ class SanaVideoTransformer3DModel(ModelMixin, ConfigMixin, PeftAdapterMixin, Fro
|
||||
hidden_states = hidden_states.permute(0, 7, 1, 4, 2, 5, 3, 6)
|
||||
output = hidden_states.flatten(6, 7).flatten(4, 5).flatten(2, 3)
|
||||
|
||||
if USE_PEFT_BACKEND:
|
||||
# remove `lora_scale` from each PEFT layer
|
||||
unscale_lora_layers(self, lora_scale)
|
||||
|
||||
if not return_dict:
|
||||
return (output,)
|
||||
|
||||
|
||||
@@ -18,7 +18,7 @@ import torch.nn as nn
|
||||
|
||||
from ...configuration_utils import ConfigMixin, register_to_config
|
||||
from ...loaders import FromOriginalModelMixin, PeftAdapterMixin, SD3Transformer2DLoadersMixin
|
||||
from ...utils import apply_lora_scale, logging
|
||||
from ...utils import USE_PEFT_BACKEND, logging, scale_lora_layers, unscale_lora_layers
|
||||
from ...utils.torch_utils import maybe_allow_in_graph
|
||||
from ..attention import AttentionMixin, FeedForward, JointTransformerBlock
|
||||
from ..attention_processor import (
|
||||
@@ -245,7 +245,6 @@ class SD3Transformer2DModel(
|
||||
if self.original_attn_processors is not None:
|
||||
self.set_attn_processor(self.original_attn_processors)
|
||||
|
||||
@apply_lora_scale("joint_attention_kwargs")
|
||||
def forward(
|
||||
self,
|
||||
hidden_states: torch.Tensor,
|
||||
@@ -285,6 +284,20 @@ class SD3Transformer2DModel(
|
||||
If `return_dict` is True, an [`~models.transformer_2d.Transformer2DModelOutput`] is returned, otherwise a
|
||||
`tuple` where the first element is the sample tensor.
|
||||
"""
|
||||
if joint_attention_kwargs is not None:
|
||||
joint_attention_kwargs = joint_attention_kwargs.copy()
|
||||
lora_scale = joint_attention_kwargs.pop("scale", 1.0)
|
||||
else:
|
||||
lora_scale = 1.0
|
||||
|
||||
if USE_PEFT_BACKEND:
|
||||
# weight the lora layers by setting `lora_scale` for each PEFT layer
|
||||
scale_lora_layers(self, lora_scale)
|
||||
else:
|
||||
if joint_attention_kwargs is not None and joint_attention_kwargs.get("scale", None) is not None:
|
||||
logger.warning(
|
||||
"Passing `scale` via `joint_attention_kwargs` when not using the PEFT backend is ineffective."
|
||||
)
|
||||
|
||||
height, width = hidden_states.shape[-2:]
|
||||
|
||||
@@ -339,6 +352,10 @@ class SD3Transformer2DModel(
|
||||
shape=(hidden_states.shape[0], self.out_channels, height * patch_size, width * patch_size)
|
||||
)
|
||||
|
||||
if USE_PEFT_BACKEND:
|
||||
# remove `lora_scale` from each PEFT layer
|
||||
unscale_lora_layers(self, lora_scale)
|
||||
|
||||
if not return_dict:
|
||||
return (output,)
|
||||
|
||||
|
||||
@@ -21,7 +21,7 @@ import torch.nn.functional as F
|
||||
|
||||
from ...configuration_utils import ConfigMixin, register_to_config
|
||||
from ...loaders import FromOriginalModelMixin, PeftAdapterMixin
|
||||
from ...utils import apply_lora_scale, deprecate, logging
|
||||
from ...utils import USE_PEFT_BACKEND, deprecate, logging, scale_lora_layers, unscale_lora_layers
|
||||
from ...utils.torch_utils import maybe_allow_in_graph
|
||||
from ..attention import AttentionMixin, AttentionModuleMixin, FeedForward
|
||||
from ..attention_dispatch import dispatch_attention_fn
|
||||
@@ -630,7 +630,6 @@ class SkyReelsV2Transformer3DModel(
|
||||
|
||||
self.gradient_checkpointing = False
|
||||
|
||||
@apply_lora_scale("attention_kwargs")
|
||||
def forward(
|
||||
self,
|
||||
hidden_states: torch.Tensor,
|
||||
@@ -642,6 +641,21 @@ class SkyReelsV2Transformer3DModel(
|
||||
return_dict: bool = True,
|
||||
attention_kwargs: Optional[Dict[str, Any]] = None,
|
||||
) -> Union[torch.Tensor, Dict[str, torch.Tensor]]:
|
||||
if attention_kwargs is not None:
|
||||
attention_kwargs = attention_kwargs.copy()
|
||||
lora_scale = attention_kwargs.pop("scale", 1.0)
|
||||
else:
|
||||
lora_scale = 1.0
|
||||
|
||||
if USE_PEFT_BACKEND:
|
||||
# weight the lora layers by setting `lora_scale` for each PEFT layer
|
||||
scale_lora_layers(self, lora_scale)
|
||||
else:
|
||||
if attention_kwargs is not None and attention_kwargs.get("scale", None) is not None:
|
||||
logger.warning(
|
||||
"Passing `scale` via `attention_kwargs` when not using the PEFT backend is ineffective."
|
||||
)
|
||||
|
||||
batch_size, num_channels, num_frames, height, width = hidden_states.shape
|
||||
p_t, p_h, p_w = self.config.patch_size
|
||||
post_patch_num_frames = num_frames // p_t
|
||||
@@ -757,6 +771,10 @@ class SkyReelsV2Transformer3DModel(
|
||||
hidden_states = hidden_states.permute(0, 7, 1, 4, 2, 5, 3, 6)
|
||||
output = hidden_states.flatten(6, 7).flatten(4, 5).flatten(2, 3)
|
||||
|
||||
if USE_PEFT_BACKEND:
|
||||
# remove `lora_scale` from each PEFT layer
|
||||
unscale_lora_layers(self, lora_scale)
|
||||
|
||||
if not return_dict:
|
||||
return (output,)
|
||||
|
||||
|
||||
@@ -21,7 +21,7 @@ import torch.nn.functional as F
|
||||
|
||||
from ...configuration_utils import ConfigMixin, register_to_config
|
||||
from ...loaders import FromOriginalModelMixin, PeftAdapterMixin
|
||||
from ...utils import apply_lora_scale, deprecate, logging
|
||||
from ...utils import USE_PEFT_BACKEND, deprecate, logging, scale_lora_layers, unscale_lora_layers
|
||||
from ...utils.torch_utils import maybe_allow_in_graph
|
||||
from .._modeling_parallel import ContextParallelInput, ContextParallelOutput
|
||||
from ..attention import AttentionMixin, AttentionModuleMixin, FeedForward
|
||||
@@ -622,7 +622,6 @@ class WanTransformer3DModel(
|
||||
|
||||
self.gradient_checkpointing = False
|
||||
|
||||
@apply_lora_scale("attention_kwargs")
|
||||
def forward(
|
||||
self,
|
||||
hidden_states: torch.Tensor,
|
||||
@@ -632,6 +631,21 @@ class WanTransformer3DModel(
|
||||
return_dict: bool = True,
|
||||
attention_kwargs: Optional[Dict[str, Any]] = None,
|
||||
) -> Union[torch.Tensor, Dict[str, torch.Tensor]]:
|
||||
if attention_kwargs is not None:
|
||||
attention_kwargs = attention_kwargs.copy()
|
||||
lora_scale = attention_kwargs.pop("scale", 1.0)
|
||||
else:
|
||||
lora_scale = 1.0
|
||||
|
||||
if USE_PEFT_BACKEND:
|
||||
# weight the lora layers by setting `lora_scale` for each PEFT layer
|
||||
scale_lora_layers(self, lora_scale)
|
||||
else:
|
||||
if attention_kwargs is not None and attention_kwargs.get("scale", None) is not None:
|
||||
logger.warning(
|
||||
"Passing `scale` via `attention_kwargs` when not using the PEFT backend is ineffective."
|
||||
)
|
||||
|
||||
batch_size, num_channels, num_frames, height, width = hidden_states.shape
|
||||
p_t, p_h, p_w = self.config.patch_size
|
||||
post_patch_num_frames = num_frames // p_t
|
||||
@@ -699,6 +713,10 @@ class WanTransformer3DModel(
|
||||
hidden_states = hidden_states.permute(0, 7, 1, 4, 2, 5, 3, 6)
|
||||
output = hidden_states.flatten(6, 7).flatten(4, 5).flatten(2, 3)
|
||||
|
||||
if USE_PEFT_BACKEND:
|
||||
# remove `lora_scale` from each PEFT layer
|
||||
unscale_lora_layers(self, lora_scale)
|
||||
|
||||
if not return_dict:
|
||||
return (output,)
|
||||
|
||||
|
||||
@@ -21,7 +21,7 @@ import torch.nn.functional as F
|
||||
|
||||
from ...configuration_utils import ConfigMixin, register_to_config
|
||||
from ...loaders import FromOriginalModelMixin, PeftAdapterMixin
|
||||
from ...utils import apply_lora_scale, logging
|
||||
from ...utils import USE_PEFT_BACKEND, logging, scale_lora_layers, unscale_lora_layers
|
||||
from ..attention import AttentionMixin, AttentionModuleMixin, FeedForward
|
||||
from ..attention_dispatch import dispatch_attention_fn
|
||||
from ..cache_utils import CacheMixin
|
||||
@@ -1141,7 +1141,6 @@ class WanAnimateTransformer3DModel(
|
||||
|
||||
self.gradient_checkpointing = False
|
||||
|
||||
@apply_lora_scale("attention_kwargs")
|
||||
def forward(
|
||||
self,
|
||||
hidden_states: torch.Tensor,
|
||||
@@ -1180,6 +1179,21 @@ class WanAnimateTransformer3DModel(
|
||||
Whether to return the output as a dict or tuple.
|
||||
"""
|
||||
|
||||
if attention_kwargs is not None:
|
||||
attention_kwargs = attention_kwargs.copy()
|
||||
lora_scale = attention_kwargs.pop("scale", 1.0)
|
||||
else:
|
||||
lora_scale = 1.0
|
||||
|
||||
if USE_PEFT_BACKEND:
|
||||
# weight the lora layers by setting `lora_scale` for each PEFT layer
|
||||
scale_lora_layers(self, lora_scale)
|
||||
else:
|
||||
if attention_kwargs is not None and attention_kwargs.get("scale", None) is not None:
|
||||
logger.warning(
|
||||
"Passing `scale` via `attention_kwargs` when not using the PEFT backend is ineffective."
|
||||
)
|
||||
|
||||
# Check that shapes match up
|
||||
if pose_hidden_states is not None and pose_hidden_states.shape[2] + 1 != hidden_states.shape[2]:
|
||||
raise ValueError(
|
||||
@@ -1280,6 +1294,10 @@ class WanAnimateTransformer3DModel(
|
||||
hidden_states = hidden_states.permute(0, 7, 1, 4, 2, 5, 3, 6)
|
||||
output = hidden_states.flatten(6, 7).flatten(4, 5).flatten(2, 3)
|
||||
|
||||
if USE_PEFT_BACKEND:
|
||||
# remove `lora_scale` from each PEFT layer
|
||||
unscale_lora_layers(self, lora_scale)
|
||||
|
||||
if not return_dict:
|
||||
return (output,)
|
||||
|
||||
|
||||
@@ -20,7 +20,7 @@ import torch.nn as nn
|
||||
|
||||
from ...configuration_utils import ConfigMixin, register_to_config
|
||||
from ...loaders import FromOriginalModelMixin, PeftAdapterMixin
|
||||
from ...utils import apply_lora_scale, logging
|
||||
from ...utils import USE_PEFT_BACKEND, logging, scale_lora_layers, unscale_lora_layers
|
||||
from ..attention import AttentionMixin, FeedForward
|
||||
from ..cache_utils import CacheMixin
|
||||
from ..modeling_outputs import Transformer2DModelOutput
|
||||
@@ -261,7 +261,6 @@ class WanVACETransformer3DModel(
|
||||
|
||||
self.gradient_checkpointing = False
|
||||
|
||||
@apply_lora_scale("attention_kwargs")
|
||||
def forward(
|
||||
self,
|
||||
hidden_states: torch.Tensor,
|
||||
@@ -273,6 +272,21 @@ class WanVACETransformer3DModel(
|
||||
return_dict: bool = True,
|
||||
attention_kwargs: Optional[Dict[str, Any]] = None,
|
||||
) -> Union[torch.Tensor, Dict[str, torch.Tensor]]:
|
||||
if attention_kwargs is not None:
|
||||
attention_kwargs = attention_kwargs.copy()
|
||||
lora_scale = attention_kwargs.pop("scale", 1.0)
|
||||
else:
|
||||
lora_scale = 1.0
|
||||
|
||||
if USE_PEFT_BACKEND:
|
||||
# weight the lora layers by setting `lora_scale` for each PEFT layer
|
||||
scale_lora_layers(self, lora_scale)
|
||||
else:
|
||||
if attention_kwargs is not None and attention_kwargs.get("scale", None) is not None:
|
||||
logger.warning(
|
||||
"Passing `scale` via `attention_kwargs` when not using the PEFT backend is ineffective."
|
||||
)
|
||||
|
||||
batch_size, num_channels, num_frames, height, width = hidden_states.shape
|
||||
p_t, p_h, p_w = self.config.patch_size
|
||||
post_patch_num_frames = num_frames // p_t
|
||||
@@ -365,6 +379,10 @@ class WanVACETransformer3DModel(
|
||||
hidden_states = hidden_states.permute(0, 7, 1, 4, 2, 5, 3, 6)
|
||||
output = hidden_states.flatten(6, 7).flatten(4, 5).flatten(2, 3)
|
||||
|
||||
if USE_PEFT_BACKEND:
|
||||
# remove `lora_scale` from each PEFT layer
|
||||
unscale_lora_layers(self, lora_scale)
|
||||
|
||||
if not return_dict:
|
||||
return (output,)
|
||||
|
||||
|
||||
@@ -20,12 +20,7 @@ import torch.nn as nn
|
||||
from ...configuration_utils import ConfigMixin, register_to_config
|
||||
from ...loaders import PeftAdapterMixin, UNet2DConditionLoadersMixin
|
||||
from ...loaders.single_file_model import FromOriginalModelMixin
|
||||
from ...utils import (
|
||||
BaseOutput,
|
||||
apply_lora_scale,
|
||||
deprecate,
|
||||
logging,
|
||||
)
|
||||
from ...utils import USE_PEFT_BACKEND, BaseOutput, deprecate, logging, scale_lora_layers, unscale_lora_layers
|
||||
from ..activations import get_activation
|
||||
from ..attention import AttentionMixin
|
||||
from ..attention_processor import (
|
||||
@@ -979,7 +974,6 @@ class UNet2DConditionModel(
|
||||
encoder_hidden_states = (encoder_hidden_states, image_embeds)
|
||||
return encoder_hidden_states
|
||||
|
||||
@apply_lora_scale("cross_attention_kwargs")
|
||||
def forward(
|
||||
self,
|
||||
sample: torch.Tensor,
|
||||
@@ -1118,6 +1112,18 @@ class UNet2DConditionModel(
|
||||
cross_attention_kwargs["gligen"] = {"objs": self.position_net(**gligen_args)}
|
||||
|
||||
# 3. down
|
||||
# we're popping the `scale` instead of getting it because otherwise `scale` will be propagated
|
||||
# to the internal blocks and will raise deprecation warnings. this will be confusing for our users.
|
||||
if cross_attention_kwargs is not None:
|
||||
cross_attention_kwargs = cross_attention_kwargs.copy()
|
||||
lora_scale = cross_attention_kwargs.pop("scale", 1.0)
|
||||
else:
|
||||
lora_scale = 1.0
|
||||
|
||||
if USE_PEFT_BACKEND:
|
||||
# weight the lora layers by setting `lora_scale` for each PEFT layer
|
||||
scale_lora_layers(self, lora_scale)
|
||||
|
||||
is_controlnet = mid_block_additional_residual is not None and down_block_additional_residuals is not None
|
||||
# using new arg down_intrablock_additional_residuals for T2I-Adapters, to distinguish from controlnets
|
||||
is_adapter = down_intrablock_additional_residuals is not None
|
||||
@@ -1233,6 +1239,10 @@ class UNet2DConditionModel(
|
||||
sample = self.conv_act(sample)
|
||||
sample = self.conv_out(sample)
|
||||
|
||||
if USE_PEFT_BACKEND:
|
||||
# remove `lora_scale` from each PEFT layer
|
||||
unscale_lora_layers(self, lora_scale)
|
||||
|
||||
if not return_dict:
|
||||
return (sample,)
|
||||
|
||||
|
||||
@@ -21,7 +21,7 @@ import torch.nn.functional as F
|
||||
|
||||
from ...configuration_utils import ConfigMixin, FrozenDict, register_to_config
|
||||
from ...loaders import FromOriginalModelMixin, PeftAdapterMixin, UNet2DConditionLoadersMixin
|
||||
from ...utils import BaseOutput, apply_lora_scale, deprecate, logging
|
||||
from ...utils import BaseOutput, deprecate, logging
|
||||
from ...utils.torch_utils import apply_freeu
|
||||
from ..attention import AttentionMixin, BasicTransformerBlock
|
||||
from ..attention_processor import (
|
||||
@@ -1875,7 +1875,6 @@ class UNetMotionModel(ModelMixin, AttentionMixin, ConfigMixin, UNet2DConditionLo
|
||||
if self.original_attn_processors is not None:
|
||||
self.set_attn_processor(self.original_attn_processors)
|
||||
|
||||
@apply_lora_scale("cross_attention_kwargs")
|
||||
def forward(
|
||||
self,
|
||||
sample: torch.Tensor,
|
||||
|
||||
@@ -21,7 +21,6 @@ from torch.utils.checkpoint import checkpoint
|
||||
|
||||
from ...configuration_utils import ConfigMixin, register_to_config
|
||||
from ...loaders import PeftAdapterMixin
|
||||
from ...utils import apply_lora_scale
|
||||
from ..attention import AttentionMixin, BasicTransformerBlock, SkipFFTransformerBlock
|
||||
from ..attention_processor import (
|
||||
ADDED_KV_ATTENTION_PROCESSORS,
|
||||
@@ -147,7 +146,6 @@ class UVit2DModel(ModelMixin, AttentionMixin, ConfigMixin, PeftAdapterMixin):
|
||||
|
||||
self.gradient_checkpointing = False
|
||||
|
||||
@apply_lora_scale("cross_attention_kwargs")
|
||||
def forward(self, input_ids, encoder_hidden_states, pooled_text_emb, micro_conds, cross_attention_kwargs=None):
|
||||
encoder_hidden_states = self.encoder_proj(encoder_hidden_states)
|
||||
encoder_hidden_states = self.encoder_proj_layer_norm(encoder_hidden_states)
|
||||
|
||||
@@ -324,7 +324,6 @@ class ComponentsManager:
"has_hook",
"execution_device",
"ip_adapter",
"quantization",
]

def __init__(self):
@@ -357,9 +356,7 @@ class ComponentsManager:
ids_by_name.add(component_id)
else:
ids_by_name = set(components.keys())
if collection and collection not in self.collections:
return set()
elif collection and collection in self.collections:
if collection:
ids_by_collection = set()
for component_id, component in components.items():
if component_id in self.collections[collection]:
@@ -426,8 +423,7 @@ class ComponentsManager:

# add component to components manager
self.components[component_id] = component
if is_new_component:
self.added_time[component_id] = time.time()
self.added_time[component_id] = time.time()

if collection:
if collection not in self.collections:
@@ -764,6 +760,7 @@ class ComponentsManager:
self.model_hooks = None
self._auto_offload_enabled = False

# YiYi TODO: (1) add quantization info
def get_model_info(
self,
component_id: str,
@@ -839,17 +836,6 @@ class ComponentsManager:
if scales:
info["ip_adapter"] = summarize_dict_by_value_and_parts(scales)

# Check for quantization
hf_quantizer = getattr(component, "hf_quantizer", None)
if hf_quantizer is not None:
quant_config = hf_quantizer.quantization_config
if hasattr(quant_config, "to_diff_dict"):
info["quantization"] = quant_config.to_diff_dict()
else:
info["quantization"] = quant_config.to_dict()
else:
info["quantization"] = None

# If fields specified, filter info
if fields is not None:
return {k: v for k, v in info.items() if k in fields}
@@ -980,16 +966,12 @@ class ComponentsManager:
output += "\nAdditional Component Info:\n" + "=" * 50 + "\n"
for name in self.components:
info = self.get_model_info(name)
if info is not None and (
info.get("adapters") is not None or info.get("ip_adapter") or info.get("quantization")
):
if info is not None and (info.get("adapters") is not None or info.get("ip_adapter")):
output += f"\n{name}:\n"
if info.get("adapters") is not None:
output += f" Adapters: {info['adapters']}\n"
if info.get("ip_adapter"):
output += " IP-Adapter: Enabled\n"
if info.get("quantization"):
output += f" Quantization: {info['quantization']}\n"

return output


File diff suppressed because it is too large
@@ -2143,8 +2143,6 @@ class ModularPipeline(ConfigMixin, PushToHubMixin):
name
for name in self._component_specs.keys()
if self._component_specs[name].default_creation_method == "from_pretrained"
and self._component_specs[name].pretrained_model_name_or_path is not None
and getattr(self, name, None) is None
]
elif isinstance(names, str):
names = [names]

@@ -15,7 +15,7 @@
import inspect
import re
from collections import OrderedDict
from dataclasses import dataclass, field
from dataclasses import dataclass, field, fields
from typing import Any, Dict, List, Literal, Optional, Type, Union

import PIL.Image
@@ -23,7 +23,7 @@ import torch

from ..configuration_utils import ConfigMixin, FrozenDict
from ..loaders.single_file_utils import _is_single_file_path_or_url
from ..utils import DIFFUSERS_LOAD_ID_FIELDS, is_torch_available, logging
from ..utils import is_torch_available, logging


if is_torch_available():
@@ -186,7 +186,7 @@ class ComponentSpec:
"""
Return the names of all loading‐related fields (i.e. those whose field.metadata["loading"] is True).
"""
return DIFFUSERS_LOAD_ID_FIELDS.copy()
return [f.name for f in fields(cls) if f.metadata.get("loading", False)]

@property
def load_id(self) -> str:
@@ -198,7 +198,7 @@ class ComponentSpec:
return "null"
parts = [getattr(self, k) for k in self.loading_fields()]
parts = ["null" if p is None else p for p in parts]
return "|".join(parts)
return "|".join(p for p in parts if p)

@classmethod
def decode_load_id(cls, load_id: str) -> Dict[str, Optional[str]]:
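For orientation, `load_id` is just the loading fields joined with `|`, with unset fields serialized as `"null"`. A minimal sketch of that encoding, assuming the four fields that `DIFFUSERS_LOAD_ID_FIELDS` lists further down in this diff and using made-up values (the repo id and subfolder below are illustrative, not taken from this diff):

```python
# Illustrative only: mirrors the join logic shown above, not the real ComponentSpec class.
loading_fields = ["pretrained_model_name_or_path", "subfolder", "variant", "revision"]
spec = {
    "pretrained_model_name_or_path": "Qwen/Qwen-Image",  # hypothetical repo id
    "subfolder": "transformer",
    "variant": None,
    "revision": None,
}

parts = [spec[k] for k in loading_fields]
parts = ["null" if p is None else p for p in parts]
print("|".join(parts))                  # Qwen/Qwen-Image|transformer|null|null
print("|".join(p for p in parts if p))  # the changed return additionally drops empty parts
```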
@@ -520,7 +520,6 @@ class InputParam:
required: bool = False
description: str = ""
kwargs_type: str = None
metadata: Dict[str, Any] = None

def __repr__(self):
return f"<{self.name}: {'required' if self.required else 'optional'}, default={self.default}>"
@@ -554,7 +553,6 @@ class OutputParam:
type_hint: Any = None
description: str = ""
kwargs_type: str = None
metadata: Dict[str, Any] = None

def __repr__(self):
return (

@@ -23,7 +23,6 @@ from .constants import (
DEFAULT_HF_PARALLEL_LOADING_WORKERS,
DEPRECATED_REVISION_ARGS,
DIFFUSERS_DYNAMIC_MODULE_NAME,
DIFFUSERS_LOAD_ID_FIELDS,
FLAX_WEIGHTS_NAME,
GGUF_FILE_EXTENSION,
HF_ENABLE_PARALLEL_LOADING,
@@ -131,7 +130,6 @@ from .loading_utils import get_module_from_name, get_submodule_by_name, load_ima
from .logging import get_logger
from .outputs import BaseOutput
from .peft_utils import (
apply_lora_scale,
check_peft_version,
delete_adapter_layers,
get_adapter_name,

@@ -73,11 +73,3 @@ DECODE_ENDPOINT_HUNYUAN_VIDEO = "https://o7ywnmrahorts457.us-east-1.aws.endpoint
ENCODE_ENDPOINT_SD_V1 = "https://qc6479g0aac6qwy9.us-east-1.aws.endpoints.huggingface.cloud/"
ENCODE_ENDPOINT_SD_XL = "https://xjqqhmyn62rog84g.us-east-1.aws.endpoints.huggingface.cloud/"
ENCODE_ENDPOINT_FLUX = "https://ptccx55jz97f9zgo.us-east-1.aws.endpoints.huggingface.cloud/"


DIFFUSERS_LOAD_ID_FIELDS = [
"pretrained_model_name_or_path",
"subfolder",
"variant",
"revision",
]

@@ -16,7 +16,6 @@ PEFT utilities: Utilities related to peft library
"""

import collections
import functools
import importlib
from typing import Optional

@@ -276,59 +275,6 @@ def set_weights_and_activate_adapters(model, adapter_names, weights):
module.set_scale(adapter_name, get_module_weight(weight, module_name))


def apply_lora_scale(kwargs_name: str = "joint_attention_kwargs"):
"""
Decorator to automatically handle LoRA layer scaling/unscaling in forward methods.

This decorator extracts the `lora_scale` from the specified kwargs parameter, applies scaling before the forward
pass, and ensures unscaling happens after, even if an exception occurs.

Args:
kwargs_name (`str`, defaults to `"joint_attention_kwargs"`):
The name of the keyword argument that contains the LoRA scale. Common values include
"joint_attention_kwargs", "attention_kwargs", "cross_attention_kwargs", etc.
"""

def decorator(forward_fn):
@functools.wraps(forward_fn)
def wrapper(self, *args, **kwargs):
from . import USE_PEFT_BACKEND

lora_scale = 1.0
attention_kwargs = kwargs.get(kwargs_name)

if attention_kwargs is not None:
attention_kwargs = attention_kwargs.copy()
kwargs[kwargs_name] = attention_kwargs
lora_scale = attention_kwargs.pop("scale", 1.0)
else:
if (
not USE_PEFT_BACKEND
and attention_kwargs is not None
and attention_kwargs.get("scale", None) is not None
):
logger.warning(
f"Passing `scale` via `{kwargs_name}` when not using the PEFT backend is ineffective."
)

# Apply LoRA scaling if using PEFT backend
if USE_PEFT_BACKEND:
scale_lora_layers(self, lora_scale)

try:
# Execute the forward pass
result = forward_fn(self, *args, **kwargs)
return result
finally:
# Always unscale, even if forward pass raises an exception
if USE_PEFT_BACKEND:
unscale_lora_layers(self, lora_scale)

return wrapper

return decorator
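As a side note, here is a minimal sketch of how a `forward` decorated this way consumes the scale. The toy module and the 0.5 value are illustrative, and the import assumes a build where the helper is still exported (it is removed on one side of this diff):

```python
import torch

from diffusers.utils import apply_lora_scale  # only present on the side of this diff that keeps the helper


class ToyBlock(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.proj = torch.nn.Linear(8, 8)

    @apply_lora_scale("cross_attention_kwargs")
    def forward(self, hidden_states, cross_attention_kwargs=None):
        # By the time this body runs, the decorator has popped "scale" from
        # cross_attention_kwargs and scaled any PEFT/LoRA layers accordingly.
        return self.proj(hidden_states)


block = ToyBlock()
out = block(torch.randn(1, 8), cross_attention_kwargs={"scale": 0.5})
```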


def check_peft_version(min_version: str) -> None:
r"""
Checks if the version of PEFT is compatible.

@@ -13,49 +13,87 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import unittest

import torch

from diffusers import QwenImageTransformer2DModel
from diffusers.models.transformers.transformer_qwenimage import compute_text_seq_len_from_mask
from diffusers.utils.torch_utils import randn_tensor

from ...testing_utils import enable_full_determinism, torch_device
from ..test_modeling_common import ModelTesterMixin, TorchCompileTesterMixin
from ..testing_utils import (
AttentionTesterMixin,
BaseModelTesterConfig,
BitsAndBytesTesterMixin,
ContextParallelTesterMixin,
LoraHotSwappingForModelTesterMixin,
LoraTesterMixin,
MemoryTesterMixin,
ModelTesterMixin,
TorchAoTesterMixin,
TorchCompileTesterMixin,
TrainingTesterMixin,
)


enable_full_determinism()


class QwenImageTransformerTests(ModelTesterMixin, unittest.TestCase):
model_class = QwenImageTransformer2DModel
main_input_name = "hidden_states"
# We override the items here because the transformer under consideration is small.
model_split_percents = [0.7, 0.6, 0.6]

# Skip setting testing with default: AttnProcessor
uses_custom_attn_processor = True
class QwenImageTransformerTesterConfig(BaseModelTesterConfig):
@property
def model_class(self):
return QwenImageTransformer2DModel

@property
def dummy_input(self):
return self.prepare_dummy_input()

@property
def input_shape(self):
def output_shape(self) -> tuple[int, int]:
return (16, 16)

@property
def output_shape(self):
def input_shape(self) -> tuple[int, int]:
return (16, 16)

def prepare_dummy_input(self, height=4, width=4):
@property
def model_split_percents(self) -> list:
# We override the items here because the transformer under consideration is small.
return [0.7, 0.6, 0.6]

@property
def main_input_name(self) -> str:
return "hidden_states"

@property
def uses_custom_attn_processor(self) -> bool:
# Skip setting testing with default: AttnProcessor
return True

@property
def generator(self):
return torch.Generator("cpu").manual_seed(0)

def get_init_dict(self) -> dict[str, int | list[int]]:
return {
"patch_size": 2,
"in_channels": 16,
"out_channels": 4,
"num_layers": 2,
"attention_head_dim": 16,
"num_attention_heads": 4, # Must be divisible by 2 for Ulysses context parallel
"joint_attention_dim": 16,
"guidance_embeds": False,
"axes_dims_rope": (8, 4, 4),
}

def get_dummy_inputs(self, height: int = 4, width: int = 4) -> dict[str, torch.Tensor]:
batch_size = 1
num_latent_channels = embedding_dim = 16
sequence_length = 7
sequence_length = 8 # Must be divisible by 2 for context parallel tests
vae_scale_factor = 4

hidden_states = torch.randn((batch_size, height * width, num_latent_channels)).to(torch_device)
encoder_hidden_states = torch.randn((batch_size, sequence_length, embedding_dim)).to(torch_device)
hidden_states = randn_tensor(
(batch_size, height * width, num_latent_channels), generator=self.generator, device=torch_device
)
encoder_hidden_states = randn_tensor(
(batch_size, sequence_length, embedding_dim), generator=self.generator, device=torch_device
)
encoder_hidden_states_mask = torch.ones((batch_size, sequence_length)).to(torch_device, torch.long)
timestep = torch.tensor([1.0]).to(torch_device).expand(batch_size)
orig_height = height * 2 * vae_scale_factor
@@ -70,29 +108,12 @@ class QwenImageTransformerTests(ModelTesterMixin, unittest.TestCase):
"img_shapes": img_shapes,
}

def prepare_init_args_and_inputs_for_common(self):
init_dict = {
"patch_size": 2,
"in_channels": 16,
"out_channels": 4,
"num_layers": 2,
"attention_head_dim": 16,
"num_attention_heads": 3,
"joint_attention_dim": 16,
"guidance_embeds": False,
"axes_dims_rope": (8, 4, 4),
}

inputs_dict = self.dummy_input
return init_dict, inputs_dict

def test_gradient_checkpointing_is_applied(self):
expected_set = {"QwenImageTransformer2DModel"}
super().test_gradient_checkpointing_is_applied(expected_set=expected_set)

class TestQwenImageTransformer(QwenImageTransformerTesterConfig, ModelTesterMixin):
def test_infers_text_seq_len_from_mask(self):
"""Test that compute_text_seq_len_from_mask correctly infers sequence lengths and returns tensors."""
init_dict, inputs = self.prepare_init_args_and_inputs_for_common()
init_dict = self.get_init_dict()
inputs = self.get_dummy_inputs()
model = self.model_class(**init_dict).to(torch_device)

# Test 1: Contiguous mask with padding at the end (only first 2 tokens valid)
@@ -104,55 +125,56 @@ class QwenImageTransformerTests(ModelTesterMixin, unittest.TestCase):
)

# Verify rope_text_seq_len is returned as an int (for torch.compile compatibility)
self.assertIsInstance(rope_text_seq_len, int)
assert isinstance(rope_text_seq_len, int)

# Verify per_sample_len is computed correctly (max valid position + 1 = 2)
self.assertIsInstance(per_sample_len, torch.Tensor)
self.assertEqual(int(per_sample_len.max().item()), 2)
assert isinstance(per_sample_len, torch.Tensor)
assert int(per_sample_len.max().item()) == 2

# Verify mask is normalized to bool dtype
self.assertTrue(normalized_mask.dtype == torch.bool)
self.assertEqual(normalized_mask.sum().item(), 2) # Only 2 True values
assert normalized_mask.dtype == torch.bool
assert normalized_mask.sum().item() == 2 # Only 2 True values

# Verify rope_text_seq_len is at least the sequence length
self.assertGreaterEqual(rope_text_seq_len, inputs["encoder_hidden_states"].shape[1])
assert rope_text_seq_len >= inputs["encoder_hidden_states"].shape[1]

# Test 2: Verify model runs successfully with inferred values
inputs["encoder_hidden_states_mask"] = normalized_mask
with torch.no_grad():
output = model(**inputs)
self.assertEqual(output.sample.shape[1], inputs["hidden_states"].shape[1])
assert output.sample.shape[1] == inputs["hidden_states"].shape[1]

# Test 3: Different mask pattern (padding at beginning)
encoder_hidden_states_mask2 = inputs["encoder_hidden_states_mask"].clone()
encoder_hidden_states_mask2[:, :3] = 0 # First 3 tokens are padding
encoder_hidden_states_mask2[:, 3:] = 1 # Last 4 tokens are valid
encoder_hidden_states_mask2[:, 3:] = 1 # Last 5 tokens are valid (seq_len=8)

rope_text_seq_len2, per_sample_len2, normalized_mask2 = compute_text_seq_len_from_mask(
inputs["encoder_hidden_states"], encoder_hidden_states_mask2
)

# Max valid position is 6 (last token), so per_sample_len should be 7
self.assertEqual(int(per_sample_len2.max().item()), 7)
self.assertEqual(normalized_mask2.sum().item(), 4) # 4 True values
# Max valid position is 7 (last token), so per_sample_len should be 8
assert int(per_sample_len2.max().item()) == 8
assert normalized_mask2.sum().item() == 5 # 5 True values

# Test 4: No mask provided (None case)
rope_text_seq_len_none, per_sample_len_none, normalized_mask_none = compute_text_seq_len_from_mask(
inputs["encoder_hidden_states"], None
)
self.assertEqual(rope_text_seq_len_none, inputs["encoder_hidden_states"].shape[1])
self.assertIsInstance(rope_text_seq_len_none, int)
self.assertIsNone(per_sample_len_none)
self.assertIsNone(normalized_mask_none)
assert rope_text_seq_len_none == inputs["encoder_hidden_states"].shape[1]
assert isinstance(rope_text_seq_len_none, int)
assert per_sample_len_none is None
assert normalized_mask_none is None
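Aside: the assertions above pin down the helper's contract rather than its implementation. A rough standalone sketch of that contract (the real `compute_text_seq_len_from_mask` in `transformer_qwenimage.py` may differ in details):

```python
import torch


def sketch_compute_text_seq_len_from_mask(encoder_hidden_states, mask):
    """Approximates the behaviour the tests above encode; not the actual implementation."""
    seq_len = encoder_hidden_states.shape[1]  # plain int, keeps torch.compile happy
    if mask is None:
        return seq_len, None, None
    mask = mask.bool()                                    # normalize dtype
    positions = torch.arange(seq_len, device=mask.device)
    per_sample_len = (positions * mask).amax(dim=1) + 1   # max valid position + 1
    return seq_len, per_sample_len, mask
```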

def test_non_contiguous_attention_mask(self):
"""Test that non-contiguous masks work correctly (e.g., [1, 0, 1, 0, 1, 0, 0])"""
init_dict, inputs = self.prepare_init_args_and_inputs_for_common()
"""Test that non-contiguous masks work correctly (e.g., [1, 0, 1, 0, 1, 0, 0, 0])"""
init_dict = self.get_init_dict()
inputs = self.get_dummy_inputs()
model = self.model_class(**init_dict).to(torch_device)

# Create a non-contiguous mask pattern: valid, padding, valid, padding, etc.
encoder_hidden_states_mask = inputs["encoder_hidden_states_mask"].clone()
# Pattern: [True, False, True, False, True, False, False]
# Pattern: [True, False, True, False, True, False, False, False] (seq_len=8)
encoder_hidden_states_mask[:, 1] = 0
encoder_hidden_states_mask[:, 3] = 0
encoder_hidden_states_mask[:, 5:] = 0
@@ -160,21 +182,22 @@ class QwenImageTransformerTests(ModelTesterMixin, unittest.TestCase):
inferred_rope_len, per_sample_len, normalized_mask = compute_text_seq_len_from_mask(
inputs["encoder_hidden_states"], encoder_hidden_states_mask
)
self.assertEqual(int(per_sample_len.max().item()), 5)
self.assertEqual(inferred_rope_len, inputs["encoder_hidden_states"].shape[1])
self.assertIsInstance(inferred_rope_len, int)
self.assertTrue(normalized_mask.dtype == torch.bool)
assert int(per_sample_len.max().item()) == 5
assert inferred_rope_len == inputs["encoder_hidden_states"].shape[1]
assert isinstance(inferred_rope_len, int)
assert normalized_mask.dtype == torch.bool

inputs["encoder_hidden_states_mask"] = normalized_mask

with torch.no_grad():
output = model(**inputs)

self.assertEqual(output.sample.shape[1], inputs["hidden_states"].shape[1])
assert output.sample.shape[1] == inputs["hidden_states"].shape[1]

def test_txt_seq_lens_deprecation(self):
"""Test that passing txt_seq_lens raises a deprecation warning."""
init_dict, inputs = self.prepare_init_args_and_inputs_for_common()
init_dict = self.get_init_dict()
inputs = self.get_dummy_inputs()
model = self.model_class(**init_dict).to(torch_device)

# Prepare inputs with txt_seq_lens (deprecated parameter)
@@ -186,18 +209,24 @@ class QwenImageTransformerTests(ModelTesterMixin, unittest.TestCase):
inputs_with_deprecated["txt_seq_lens"] = txt_seq_lens

# Test that deprecation warning is raised
with self.assertWarns(FutureWarning) as warning_context:
import warnings

with warnings.catch_warnings(record=True) as w:
warnings.simplefilter("always")
with torch.no_grad():
output = model(**inputs_with_deprecated)

# Verify the warning message mentions the deprecation
warning_message = str(warning_context.warning)
self.assertIn("txt_seq_lens", warning_message)
self.assertIn("deprecated", warning_message)
self.assertIn("encoder_hidden_states_mask", warning_message)
# Verify a FutureWarning was raised
future_warnings = [x for x in w if issubclass(x.category, FutureWarning)]
assert len(future_warnings) > 0, "Expected FutureWarning to be raised"

# Verify the warning message mentions the deprecation
warning_message = str(future_warnings[0].message)
assert "txt_seq_lens" in warning_message
assert "deprecated" in warning_message

# Verify the model still works correctly despite the deprecation
self.assertEqual(output.sample.shape[1], inputs["hidden_states"].shape[1])
assert output.sample.shape[1] == inputs["hidden_states"].shape[1]

def test_layered_model_with_mask(self):
"""Test QwenImageTransformer2DModel with use_layer3d_rope=True (layered model)."""
@@ -208,7 +237,7 @@ class QwenImageTransformerTests(ModelTesterMixin, unittest.TestCase):
"out_channels": 4,
"num_layers": 2,
"attention_head_dim": 16,
"num_attention_heads": 3,
"num_attention_heads": 4, # Must be divisible by 2 for Ulysses context parallel
"joint_attention_dim": 16,
"axes_dims_rope": (8, 4, 4), # Must match attention_head_dim (8+4+4=16)
"use_layer3d_rope": True, # Enable layered RoPE
@@ -220,11 +249,11 @@ class QwenImageTransformerTests(ModelTesterMixin, unittest.TestCase):
# Verify the model uses QwenEmbedLayer3DRope
from diffusers.models.transformers.transformer_qwenimage import QwenEmbedLayer3DRope

self.assertIsInstance(model.pos_embed, QwenEmbedLayer3DRope)
assert isinstance(model.pos_embed, QwenEmbedLayer3DRope)

# Test single generation with layered structure
batch_size = 1
text_seq_len = 7
text_seq_len = 8
img_h, img_w = 4, 4
layers = 4

@@ -262,24 +291,104 @@ class QwenImageTransformerTests(ModelTesterMixin, unittest.TestCase):
additional_t_cond=addition_t_cond,
)

self.assertEqual(output.sample.shape[1], hidden_states.shape[1])
assert output.sample.shape[1] == hidden_states.shape[1]


class QwenImageTransformerCompileTests(TorchCompileTesterMixin, unittest.TestCase):
model_class = QwenImageTransformer2DModel
class TestQwenImageTransformerMemory(QwenImageTransformerTesterConfig, MemoryTesterMixin):
"""Memory optimization tests for QwenImage Transformer."""

def prepare_init_args_and_inputs_for_common(self):
return QwenImageTransformerTests().prepare_init_args_and_inputs_for_common()

def prepare_dummy_input(self, height, width):
return QwenImageTransformerTests().prepare_dummy_input(height=height, width=width)
class TestQwenImageTransformerTraining(QwenImageTransformerTesterConfig, TrainingTesterMixin):
"""Training tests for QwenImage Transformer."""

def test_torch_compile_recompilation_and_graph_break(self):
super().test_torch_compile_recompilation_and_graph_break()
def test_gradient_checkpointing_is_applied(self):
expected_set = {"QwenImageTransformer2DModel"}
super().test_gradient_checkpointing_is_applied(expected_set=expected_set)


class TestQwenImageTransformerAttention(QwenImageTransformerTesterConfig, AttentionTesterMixin):
"""Attention processor tests for QwenImage Transformer."""


class TestQwenImageTransformerContextParallel(QwenImageTransformerTesterConfig, ContextParallelTesterMixin):
"""Context Parallel inference tests for QwenImage Transformer."""


class TestQwenImageTransformerLoRA(QwenImageTransformerTesterConfig, LoraTesterMixin):
"""LoRA adapter tests for QwenImage Transformer."""


class TestQwenImageTransformerLoRAHotSwap(QwenImageTransformerTesterConfig, LoraHotSwappingForModelTesterMixin):
"""LoRA hot-swapping tests for QwenImage Transformer."""

@property
def different_shapes_for_compilation(self):
return [(4, 4), (4, 8), (8, 8)]

def get_dummy_inputs(self, height: int = 4, width: int = 4) -> dict[str, torch.Tensor]:
"""Override to support dynamic height/width for LoRA hotswap tests."""
batch_size = 1
num_latent_channels = embedding_dim = 16
sequence_length = 8
vae_scale_factor = 4

hidden_states = randn_tensor(
(batch_size, height * width, num_latent_channels), generator=self.generator, device=torch_device
)
encoder_hidden_states = randn_tensor(
(batch_size, sequence_length, embedding_dim), generator=self.generator, device=torch_device
)
encoder_hidden_states_mask = torch.ones((batch_size, sequence_length)).to(torch_device, torch.long)
timestep = torch.tensor([1.0]).to(torch_device).expand(batch_size)
orig_height = height * 2 * vae_scale_factor
orig_width = width * 2 * vae_scale_factor
img_shapes = [(1, orig_height // vae_scale_factor // 2, orig_width // vae_scale_factor // 2)] * batch_size

return {
"hidden_states": hidden_states,
"encoder_hidden_states": encoder_hidden_states,
"encoder_hidden_states_mask": encoder_hidden_states_mask,
"timestep": timestep,
"img_shapes": img_shapes,
}


class TestQwenImageTransformerCompile(QwenImageTransformerTesterConfig, TorchCompileTesterMixin):
@property
def different_shapes_for_compilation(self):
return [(4, 4), (4, 8), (8, 8)]

def get_dummy_inputs(self, height: int = 4, width: int = 4) -> dict[str, torch.Tensor]:
"""Override to support dynamic height/width for compilation tests."""
batch_size = 1
num_latent_channels = embedding_dim = 16
sequence_length = 8 # Must be divisible by 2 for context parallel tests
vae_scale_factor = 4

hidden_states = randn_tensor(
(batch_size, height * width, num_latent_channels), generator=self.generator, device=torch_device
)
encoder_hidden_states = randn_tensor(
(batch_size, sequence_length, embedding_dim), generator=self.generator, device=torch_device
)
encoder_hidden_states_mask = torch.ones((batch_size, sequence_length)).to(torch_device, torch.long)
timestep = torch.tensor([1.0]).to(torch_device).expand(batch_size)
orig_height = height * 2 * vae_scale_factor
orig_width = width * 2 * vae_scale_factor
img_shapes = [(1, orig_height // vae_scale_factor // 2, orig_width // vae_scale_factor // 2)] * batch_size

return {
"hidden_states": hidden_states,
"encoder_hidden_states": encoder_hidden_states,
"encoder_hidden_states_mask": encoder_hidden_states_mask,
"timestep": timestep,
"img_shapes": img_shapes,
}

def test_torch_compile_with_and_without_mask(self):
"""Test that torch.compile works with both None mask and padding mask."""
init_dict, inputs = self.prepare_init_args_and_inputs_for_common()
init_dict = self.get_init_dict()
inputs = self.get_dummy_inputs()
model = self.model_class(**init_dict).to(torch_device)
model.eval()
model.compile(mode="default", fullgraph=True)
@@ -300,13 +409,13 @@ class QwenImageTransformerCompileTests(TorchCompileTesterMixin, unittest.TestCas
):
output_no_mask_2 = model(**inputs_no_mask)

self.assertEqual(output_no_mask.sample.shape[1], inputs["hidden_states"].shape[1])
self.assertEqual(output_no_mask_2.sample.shape[1], inputs["hidden_states"].shape[1])
assert output_no_mask.sample.shape[1] == inputs["hidden_states"].shape[1]
assert output_no_mask_2.sample.shape[1] == inputs["hidden_states"].shape[1]

# Test 2: Run with all-ones mask (should behave like None)
inputs_all_ones = inputs.copy()
# Keep the all-ones mask
self.assertTrue(inputs_all_ones["encoder_hidden_states_mask"].all().item())
assert inputs_all_ones["encoder_hidden_states_mask"].all().item()

# First run to allow compilation
with torch.no_grad():
@@ -320,8 +429,8 @@ class QwenImageTransformerCompileTests(TorchCompileTesterMixin, unittest.TestCas
):
output_all_ones_2 = model(**inputs_all_ones)

self.assertEqual(output_all_ones.sample.shape[1], inputs["hidden_states"].shape[1])
self.assertEqual(output_all_ones_2.sample.shape[1], inputs["hidden_states"].shape[1])
assert output_all_ones.sample.shape[1] == inputs["hidden_states"].shape[1]
assert output_all_ones_2.sample.shape[1] == inputs["hidden_states"].shape[1]

# Test 3: Run with actual padding mask (has zeros)
inputs_with_padding = inputs.copy()
@@ -342,8 +451,16 @@ class QwenImageTransformerCompileTests(TorchCompileTesterMixin, unittest.TestCas
):
output_with_padding_2 = model(**inputs_with_padding)

self.assertEqual(output_with_padding.sample.shape[1], inputs["hidden_states"].shape[1])
self.assertEqual(output_with_padding_2.sample.shape[1], inputs["hidden_states"].shape[1])
assert output_with_padding.sample.shape[1] == inputs["hidden_states"].shape[1]
assert output_with_padding_2.sample.shape[1] == inputs["hidden_states"].shape[1]

# Verify that outputs are different (mask should affect results)
self.assertFalse(torch.allclose(output_no_mask.sample, output_with_padding.sample, atol=1e-3))
assert not torch.allclose(output_no_mask.sample, output_with_padding.sample, atol=1e-3)


class TestQwenImageTransformerBitsAndBytes(QwenImageTransformerTesterConfig, BitsAndBytesTesterMixin):
"""BitsAndBytes quantization tests for QwenImage Transformer."""


class TestQwenImageTransformerTorchAo(QwenImageTransformerTesterConfig, TorchAoTesterMixin):
"""TorchAO quantization tests for QwenImage Transformer."""