Mirror of https://github.com/huggingface/diffusers.git (synced 2026-02-01 00:15:00 +08:00)

Compare commits: main...modular-wo (29 commits)
Commits in this comparison (SHA1 only; the author and date columns were empty in the mirror):
20c35da75c, 6a549f5f55, 412e51c856, 23d06423ab, aba551c868, 1f9576a2ca, d75fbc43c7, b7127ce7a7,
7e9d2b954e, 94525200fd, f056af1fbb, 8d45ff5bf6, fb15752d55, 1f2dbc9dd2, 002c3e8239, de03d7f100,
25c968a38f, aea0d046f6, 1c90ce33f2, 507953f415, f0555af1c6, 2a81f2ec54, d20f413f78, ff09bf1a63,
34a743e2dc, 43ab14845d, fbfe5c8d6b, b29873dee7, 7b499de6d0
.github/workflows/pr_modular_tests.yml (vendored, 20 changes)

@@ -75,9 +75,27 @@ jobs:
        if: ${{ failure() }}
        run: |
          echo "Repo consistency check failed. Please ensure the right dependency versions are installed with 'pip install -e .[quality]' and run 'make fix-copies'" >> $GITHUB_STEP_SUMMARY
+  check_auto_docs:
+    runs-on: ubuntu-22.04
+    steps:
+      - uses: actions/checkout@v6
+      - name: Set up Python
+        uses: actions/setup-python@v6
+        with:
+          python-version: "3.10"
+      - name: Install dependencies
+        run: |
+          pip install --upgrade pip
+          pip install .[quality]
+      - name: Check auto docs
+        run: make modular-autodoctrings
+      - name: Check if failure
+        if: ${{ failure() }}
+        run: |
+          echo "Auto docstring checks failed. Please run `python utils/modular_auto_docstring.py --fix_and_overwrite`." >> $GITHUB_STEP_SUMMARY
 
   run_fast_tests:
-    needs: [check_code_quality, check_repository_consistency]
+    needs: [check_code_quality, check_repository_consistency, check_auto_docs]
     name: Fast PyTorch Modular Pipeline CPU tests
 
     runs-on:
Makefile (4 changes)

@@ -70,6 +70,10 @@ fix-copies:
 	python utils/check_copies.py --fix_and_overwrite
 	python utils/check_dummies.py --fix_and_overwrite
+
+# Auto docstrings in modular blocks
+modular-autodoctrings:
+	python utils/modular_auto_docstring.py
 
 # Run tests for the library
 
 test:
src/diffusers/modular_pipelines/modular_pipeline.py

@@ -39,8 +39,11 @@ from .modular_pipeline_utils import (
     InputParam,
     InsertableDict,
     OutputParam,
+    combine_inputs,
+    combine_outputs,
     format_components,
     format_configs,
+    format_workflow,
     make_doc_string,
 )
 
@@ -242,6 +245,7 @@ class ModularPipelineBlocks(ConfigMixin, PushToHubMixin):
 
     config_name = "modular_config.json"
     model_name = None
+    _workflow_map = None
 
     @classmethod
     def _get_signature_keys(cls, obj):
@@ -297,6 +301,35 @@ class ModularPipelineBlocks(ConfigMixin, PushToHubMixin):
     def outputs(self) -> List[OutputParam]:
         return self._get_outputs()
 
+    # currently only ConditionalPipelineBlocks and SequentialPipelineBlocks support `get_execution_blocks`
+    def get_execution_blocks(self, **kwargs):
+        """
+        Get the block(s) that would execute given the inputs. Must be implemented by subclasses that support
+        conditional block selection.
+
+        Args:
+            **kwargs: Input names and values. Only trigger inputs affect block selection.
+        """
+        raise NotImplementedError(f"`get_execution_blocks` is not implemented for {self.__class__.__name__}")
+
+    # currently only SequentialPipelineBlocks supports workflows
+    @property
+    def workflow_names(self):
+        """
+        Returns a list of available workflow names. Must be implemented by subclasses that define `_workflow_map`.
+        """
+        raise NotImplementedError(f"`workflow_names` is not implemented for {self.__class__.__name__}")
+
+    def get_workflow(self, workflow_name: str):
+        """
+        Get the execution blocks for a specific workflow. Must be implemented by subclasses that define
+        `_workflow_map`.
+
+        Args:
+            workflow_name: Name of the workflow to retrieve.
+        """
+        raise NotImplementedError(f"`get_workflow` is not implemented for {self.__class__.__name__}")
+
     @classmethod
     def from_pretrained(
         cls,
@@ -434,72 +467,6 @@ class ModularPipelineBlocks(ConfigMixin, PushToHubMixin):
             if current_value is not param:  # Using identity comparison to check if object was modified
                 state.set(param_name, param, input_param.kwargs_type)
 
-    @staticmethod
-    def combine_inputs(*named_input_lists: List[Tuple[str, List[InputParam]]]) -> List[InputParam]:
-        """
-        Combines multiple lists of InputParam objects from different blocks. For duplicate inputs, updates only if
-        current default value is None and new default value is not None. Warns if multiple non-None default values
-        exist for the same input.
-
-        Args:
-            named_input_lists: List of tuples containing (block_name, input_param_list) pairs
-
-        Returns:
-            List[InputParam]: Combined list of unique InputParam objects
-        """
-        combined_dict = {}  # name -> InputParam
-        value_sources = {}  # name -> block_name
-
-        for block_name, inputs in named_input_lists:
-            for input_param in inputs:
-                if input_param.name is None and input_param.kwargs_type is not None:
-                    input_name = "*_" + input_param.kwargs_type
-                else:
-                    input_name = input_param.name
-                if input_name in combined_dict:
-                    current_param = combined_dict[input_name]
-                    if (
-                        current_param.default is not None
-                        and input_param.default is not None
-                        and current_param.default != input_param.default
-                    ):
-                        warnings.warn(
-                            f"Multiple different default values found for input '{input_name}': "
-                            f"{current_param.default} (from block '{value_sources[input_name]}') and "
-                            f"{input_param.default} (from block '{block_name}'). Using {current_param.default}."
-                        )
-                    if current_param.default is None and input_param.default is not None:
-                        combined_dict[input_name] = input_param
-                        value_sources[input_name] = block_name
-                else:
-                    combined_dict[input_name] = input_param
-                    value_sources[input_name] = block_name
-
-        return list(combined_dict.values())
-
-    @staticmethod
-    def combine_outputs(*named_output_lists: List[Tuple[str, List[OutputParam]]]) -> List[OutputParam]:
-        """
-        Combines multiple lists of OutputParam objects from different blocks. For duplicate outputs, keeps the first
-        occurrence of each output name.
-
-        Args:
-            named_output_lists: List of tuples containing (block_name, output_param_list) pairs
-
-        Returns:
-            List[OutputParam]: Combined list of unique OutputParam objects
-        """
-        combined_dict = {}  # name -> OutputParam
-
-        for block_name, outputs in named_output_lists:
-            for output_param in outputs:
-                if (output_param.name not in combined_dict) or (
-                    combined_dict[output_param.name].kwargs_type is None and output_param.kwargs_type is not None
-                ):
-                    combined_dict[output_param.name] = output_param
-
-        return list(combined_dict.values())
-
     @property
     def input_names(self) -> List[str]:
         return [input_param.name for input_param in self.inputs if input_param.name is not None]
@@ -531,7 +498,8 @@ class ModularPipelineBlocks(ConfigMixin, PushToHubMixin):
 class ConditionalPipelineBlocks(ModularPipelineBlocks):
     """
     A Pipeline Blocks that conditionally selects a block to run based on the inputs. Subclasses must implement the
-    `select_block` method to define the logic for selecting the block.
+    `select_block` method to define the logic for selecting the block. Currently, we only support selection logic based
+    on the presence or absence of inputs (i.e., whether they are `None` or not).
 
     This class inherits from [`ModularPipelineBlocks`]. Check the superclass documentation for the generic methods the
     library implements for all the pipeline blocks (such as loading or saving etc.)
@@ -539,15 +507,20 @@ class ConditionalPipelineBlocks(ModularPipelineBlocks):
     > [!WARNING] > This is an experimental feature and is likely to change in the future.
 
     Attributes:
-        block_classes: List of block classes to be used
-        block_names: List of prefixes for each block
-        block_trigger_inputs: List of input names that select_block() uses to determine which block to run
+        block_classes: List of block classes to be used. Must have the same length as `block_names`.
+        block_names: List of names for each block. Must have the same length as `block_classes`.
+        block_trigger_inputs: List of input names that `select_block()` uses to determine which block to run.
+            For `ConditionalPipelineBlocks`, this does not need to correspond to `block_names` and `block_classes`. For
+            `AutoPipelineBlocks`, this must have the same length as `block_names` and `block_classes`, where each
+            element specifies the trigger input for the corresponding block.
+        default_block_name: Name of the default block to run when no trigger inputs match.
+            If None, this block can be skipped entirely when no trigger inputs are provided.
     """
 
     block_classes = []
     block_names = []
     block_trigger_inputs = []
-    default_block_name = None  # name of the default block if no trigger inputs are provided, if None, this block can be skipped if no trigger inputs are provided
+    default_block_name = None
 
     def __init__(self):
         sub_blocks = InsertableDict()
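To make the contract above concrete, here is a minimal sketch of a custom `ConditionalPipelineBlocks` subclass. The block classes and input names are hypothetical stand-ins, not part of this diff:

```python
# Sketch only: EditBlock, RefineBlock, and the input names are assumptions.
class MyConditionalBlocks(ConditionalPipelineBlocks):
    block_classes = [EditBlock, RefineBlock]
    block_names = ["edit", "refine"]
    # select_block() may consult any of these; no 1:1 mapping is required here
    block_trigger_inputs = ["edit_image", "refine_strength"]
    default_block_name = "edit"  # used when select_block() returns None

    def select_block(self, edit_image=None, refine_strength=None):
        # Per the docstring, selection may only depend on the presence/absence
        # (None-ness) of inputs, not on their values.
        if refine_strength is not None:
            return "refine"
        return None  # falls back to default_block_name ("edit")
```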
@@ -611,7 +584,7 @@ class ConditionalPipelineBlocks(ModularPipelineBlocks):
     @property
     def inputs(self) -> List[Tuple[str, Any]]:
         named_inputs = [(name, block.inputs) for name, block in self.sub_blocks.items()]
-        combined_inputs = self.combine_inputs(*named_inputs)
+        combined_inputs = combine_inputs(*named_inputs)
         # mark Required inputs only if that input is required by all the blocks
         for input_param in combined_inputs:
             if input_param.name in self.required_inputs:
@@ -623,15 +596,16 @@ class ConditionalPipelineBlocks(ModularPipelineBlocks):
     @property
     def intermediate_outputs(self) -> List[str]:
         named_outputs = [(name, block.intermediate_outputs) for name, block in self.sub_blocks.items()]
-        combined_outputs = self.combine_outputs(*named_outputs)
+        combined_outputs = combine_outputs(*named_outputs)
         return combined_outputs
 
     @property
     def outputs(self) -> List[str]:
         named_outputs = [(name, block.outputs) for name, block in self.sub_blocks.items()]
-        combined_outputs = self.combine_outputs(*named_outputs)
+        combined_outputs = combine_outputs(*named_outputs)
         return combined_outputs
 
+    # used for `__repr__`
     def _get_trigger_inputs(self) -> set:
         """
         Returns a set of all unique trigger input values found in this block and nested blocks.
@@ -660,11 +634,6 @@ class ConditionalPipelineBlocks(ModularPipelineBlocks):
 
         return all_triggers
 
-    @property
-    def trigger_inputs(self):
-        """All trigger inputs including from nested blocks."""
-        return self._get_trigger_inputs()
-
     def select_block(self, **kwargs) -> Optional[str]:
         """
         Select the block to run based on the trigger inputs. Subclasses must implement this method to define the logic
@@ -704,6 +673,39 @@ class ConditionalPipelineBlocks(ModularPipelineBlocks):
             logger.error(error_msg)
             raise
 
+    def get_execution_blocks(self, **kwargs) -> Optional["ModularPipelineBlocks"]:
+        """
+        Get the block(s) that would execute given the inputs.
+
+        Recursively resolves nested ConditionalPipelineBlocks until reaching either:
+        - A leaf block (no sub_blocks) → returns single `ModularPipelineBlocks`
+        - A `SequentialPipelineBlocks` → delegates to its `get_execution_blocks()` which returns
+          a `SequentialPipelineBlocks` containing the resolved execution blocks
+
+        Args:
+            **kwargs: Input names and values. Only trigger inputs affect block selection.
+
+        Returns:
+            - `ModularPipelineBlocks`: A leaf block or resolved `SequentialPipelineBlocks`
+            - `None`: If this block would be skipped (no trigger matched and no default)
+        """
+        trigger_kwargs = {name: kwargs.get(name) for name in self.block_trigger_inputs if name is not None}
+        block_name = self.select_block(**trigger_kwargs)
+
+        if block_name is None:
+            block_name = self.default_block_name
+
+        if block_name is None:
+            return None
+
+        block = self.sub_blocks[block_name]
+
+        # Recursively resolve until we hit a leaf block or a SequentialPipelineBlocks
+        if block.sub_blocks:
+            return block.get_execution_blocks(**kwargs)
+
+        return block
+
     def __repr__(self):
         class_name = self.__class__.__name__
         base_class = self.__class__.__bases__[0].__name__
@@ -711,11 +713,11 @@ class ConditionalPipelineBlocks(ModularPipelineBlocks):
             f"{class_name}(\n Class: {base_class}\n" if base_class and base_class != "object" else f"{class_name}(\n"
         )
 
-        if self.trigger_inputs:
+        if self._get_trigger_inputs():
             header += "\n"
             header += " " + "=" * 100 + "\n"
             header += " This pipeline contains blocks that are selected at runtime based on inputs.\n"
-            header += f" Trigger Inputs: {sorted(self.trigger_inputs)}\n"
+            header += f" Trigger Inputs: {sorted(self._get_trigger_inputs())}\n"
             header += " " + "=" * 100 + "\n\n"
 
         # Format description with proper indentation
@@ -782,24 +784,56 @@ class ConditionalPipelineBlocks(ModularPipelineBlocks):
 
 class AutoPipelineBlocks(ConditionalPipelineBlocks):
     """
     A Pipeline Blocks that automatically selects a block to run based on the presence of trigger inputs.
 
+    This is a specialized version of `ConditionalPipelineBlocks` where:
+    - Each block has one corresponding trigger input (1:1 mapping)
+    - Block selection is automatic: the first block whose trigger input is present gets selected
+    - `block_trigger_inputs` must have the same length as `block_names` and `block_classes`
+    - Use `None` in `block_trigger_inputs` to specify the default block, i.e., the block that will run if no trigger
+      inputs are present
+
+    Attributes:
+        block_classes:
+            List of block classes to be used. Must have the same length as `block_names` and
+            `block_trigger_inputs`.
+        block_names:
+            List of names for each block. Must have the same length as `block_classes` and `block_trigger_inputs`.
+        block_trigger_inputs:
+            List of input names where each element specifies the trigger input for the corresponding block. Use
+            `None` to mark the default block.
+
+    Example:
+        ```python
+        class MyAutoBlock(AutoPipelineBlocks):
+            block_classes = [InpaintEncoderBlock, ImageEncoderBlock, TextEncoderBlock]
+            block_names = ["inpaint", "img2img", "text2img"]
+            block_trigger_inputs = ["mask_image", "image", None]  # text2img is the default
+        ```
+
+        With this definition:
+        - As long as `mask_image` is provided, the "inpaint" block runs (regardless of whether `image` is provided)
+        - If `mask_image` is not provided but `image` is, the "img2img" block runs
+        - Otherwise, the "text2img" block runs (default, trigger is `None`)
     """
 
     def __init__(self):
         super().__init__()
 
+        if self.default_block_name is not None:
+            raise ValueError(
+                f"In {self.__class__.__name__}, do not set `default_block_name` for AutoPipelineBlocks. "
+                f"Use `None` in `block_trigger_inputs` to specify the default block."
+            )
+
         if not (len(self.block_classes) == len(self.block_names) == len(self.block_trigger_inputs)):
             raise ValueError(
                 f"In {self.__class__.__name__}, the number of block_classes, block_names, and block_trigger_inputs must be the same."
             )
 
-    @property
-    def default_block_name(self) -> Optional[str]:
-        """Derive default_block_name from block_trigger_inputs (None entry)."""
         if None in self.block_trigger_inputs:
             idx = self.block_trigger_inputs.index(None)
-            return self.block_names[idx]
-        return None
+            self.default_block_name = self.block_names[idx]
 
     def select_block(self, **kwargs) -> Optional[str]:
         """Select block based on which trigger input is present (not None)."""
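Given the `MyAutoBlock` definition from the docstring example above, resolution through the new `get_execution_blocks` would behave roughly like this (a sketch; `mask` and `image` are placeholder runtime values):

```python
auto_block = MyAutoBlock()

# "inpaint" wins as soon as mask_image is present, even if image is also passed
auto_block.get_execution_blocks(mask_image=mask, image=image)  # -> "inpaint" leaf block

# no mask_image, but image is present -> "img2img"
auto_block.get_execution_blocks(image=image)                   # -> "img2img" leaf block

# no trigger input present -> default block (the one whose trigger is None)
auto_block.get_execution_blocks(prompt="a cat")                # -> "text2img" leaf block
```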
@@ -853,6 +887,29 @@ class SequentialPipelineBlocks(ModularPipelineBlocks):
             expected_configs.append(config)
         return expected_configs
 
+    @property
+    def workflow_names(self):
+        if self._workflow_map is None:
+            raise NotImplementedError(
+                f"workflows is not supported because _workflow_map is not set for {self.__class__.__name__}"
+            )
+
+        return list(self._workflow_map.keys())
+
+    def get_workflow(self, workflow_name: str):
+        if self._workflow_map is None:
+            raise NotImplementedError(
+                f"workflows is not supported because _workflow_map is not set for {self.__class__.__name__}"
+            )
+
+        if workflow_name not in self._workflow_map:
+            raise ValueError(f"Workflow {workflow_name} not found in {self.__class__.__name__}")
+
+        trigger_inputs = self._workflow_map[workflow_name]
+        workflow_blocks = self.get_execution_blocks(**trigger_inputs)
+
+        return workflow_blocks
+
     @classmethod
     def from_blocks_dict(
         cls, blocks_dict: Dict[str, Any], description: Optional[str] = None
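A sketch of how the new `_workflow_map` hooks might be used on a `SequentialPipelineBlocks` subclass; the step classes and workflow names here are illustrative assumptions, not defined anywhere in this PR:

```python
# Sketch only: the step classes are hypothetical placeholders.
class MyPipelineBlocks(SequentialPipelineBlocks):
    block_classes = [TextEncoderStep, VaeImageEncoderStep, DenoiseStep, DecodeStep]
    block_names = ["text_encoder", "image_encoder", "denoise", "decode"]
    # workflow name -> trigger inputs that get_workflow() forwards to get_execution_blocks()
    _workflow_map = {
        "text2image": {},
        "img2img": {"image": True},
    }

blocks = MyPipelineBlocks()
blocks.workflow_names                     # ["text2image", "img2img"]
img2img = blocks.get_workflow("img2img")  # SequentialPipelineBlocks for the img2img path
blocks.get_workflow("inpaint")            # raises ValueError: workflow not found
```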
@@ -948,7 +1005,7 @@ class SequentialPipelineBlocks(ModularPipelineBlocks):
             # filter out them here so they do not end up as intermediate_outputs
             if name not in inp_names:
                 named_outputs.append((name, block.intermediate_outputs))
-        combined_outputs = self.combine_outputs(*named_outputs)
+        combined_outputs = combine_outputs(*named_outputs)
         return combined_outputs
 
     # YiYi TODO: I think we can remove the outputs property
@@ -972,6 +1029,7 @@ class SequentialPipelineBlocks(ModularPipelineBlocks):
             raise
         return pipeline, state
 
+    # used for `__repr__`
     def _get_trigger_inputs(self):
         """
         Returns a set of all unique trigger input values found in the blocks.
@@ -995,89 +1053,50 @@ class SequentialPipelineBlocks(ModularPipelineBlocks):
 
         return fn_recursive_get_trigger(self.sub_blocks)
 
-    @property
-    def trigger_inputs(self):
-        return self._get_trigger_inputs()
-
-    def _traverse_trigger_blocks(self, active_inputs):
+    def get_execution_blocks(self, **kwargs) -> "SequentialPipelineBlocks":
         """
-        Traverse blocks and select which ones would run given the active inputs.
+        Get the blocks that would execute given the specified inputs.
 
         Args:
-            active_inputs: Dict of input names to values that are "present"
+            **kwargs: Input names and values. Only trigger inputs affect block selection.
 
         Returns:
-            OrderedDict of block_name -> block that would execute
+            SequentialPipelineBlocks containing only the blocks that would execute
         """
+        # Copy kwargs so we can add outputs as we traverse
+        active_inputs = dict(kwargs)
 
         def fn_recursive_traverse(block, block_name, active_inputs):
             result_blocks = OrderedDict()
 
             # ConditionalPipelineBlocks (includes AutoPipelineBlocks)
             if isinstance(block, ConditionalPipelineBlocks):
-                trigger_kwargs = {name: active_inputs.get(name) for name in block.block_trigger_inputs}
-                selected_block_name = block.select_block(**trigger_kwargs)
-
-                if selected_block_name is None:
-                    selected_block_name = block.default_block_name
-
-                if selected_block_name is None:
+                block = block.get_execution_blocks(**active_inputs)
+                if block is None:
                     return result_blocks
 
-                selected_block = block.sub_blocks[selected_block_name]
-
-                if selected_block.sub_blocks:
-                    result_blocks.update(fn_recursive_traverse(selected_block, block_name, active_inputs))
-                else:
-                    result_blocks[block_name] = selected_block
-                    if hasattr(selected_block, "outputs"):
-                        for out in selected_block.outputs:
-                            active_inputs[out.name] = True
-
-                return result_blocks
-
-            # SequentialPipelineBlocks or LoopSequentialPipelineBlocks
-            if block.sub_blocks:
+            # Has sub_blocks (SequentialPipelineBlocks/ConditionalPipelineBlocks)
+            if block.sub_blocks and not isinstance(block, LoopSequentialPipelineBlocks):
                 for sub_block_name, sub_block in block.sub_blocks.items():
-                    blocks_to_update = fn_recursive_traverse(sub_block, sub_block_name, active_inputs)
-                    blocks_to_update = {f"{block_name}.{k}": v for k, v in blocks_to_update.items()}
-                    result_blocks.update(blocks_to_update)
+                    nested_blocks = fn_recursive_traverse(sub_block, sub_block_name, active_inputs)
+                    nested_blocks = {f"{block_name}.{k}": v for k, v in nested_blocks.items()}
+                    result_blocks.update(nested_blocks)
             else:
+                # Leaf block: single ModularPipelineBlocks or LoopSequentialPipelineBlocks
                 result_blocks[block_name] = block
-                if hasattr(block, "outputs"):
-                    for out in block.outputs:
+                # Add outputs to active_inputs so subsequent blocks can use them as triggers
+                if hasattr(block, "intermediate_outputs"):
+                    for out in block.intermediate_outputs:
                         active_inputs[out.name] = True
 
             return result_blocks
 
         all_blocks = OrderedDict()
         for block_name, block in self.sub_blocks.items():
-            blocks_to_update = fn_recursive_traverse(block, block_name, active_inputs)
-            all_blocks.update(blocks_to_update)
-        return all_blocks
-
-    def get_execution_blocks(self, **kwargs):
-        """
-        Get the blocks that would execute given the specified inputs.
-
-        Args:
-            **kwargs: Input names and values. Only trigger inputs affect block selection.
-                Pass any inputs that would be non-None at runtime.
-
-        Returns:
-            SequentialPipelineBlocks containing only the blocks that would execute
-
-        Example:
-            # Get blocks for inpainting workflow
-            blocks = pipeline.get_execution_blocks(prompt="a cat", mask=mask, image=image)
-
-            # Get blocks for text2image workflow
-            blocks = pipeline.get_execution_blocks(prompt="a cat")
-        """
-        # Filter out None values
-        active_inputs = {k: v for k, v in kwargs.items() if v is not None}
-
-        blocks_triggered = self._traverse_trigger_blocks(active_inputs)
-        return SequentialPipelineBlocks.from_blocks_dict(blocks_triggered)
+            nested_blocks = fn_recursive_traverse(block, block_name, active_inputs)
+            all_blocks.update(nested_blocks)
+
+        return SequentialPipelineBlocks.from_blocks_dict(all_blocks)
 
     def __repr__(self):
         class_name = self.__class__.__name__
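With the rewritten traversal, callers resolve conditional branches in a single call; a sketch using the hypothetical `MyPipelineBlocks` from the earlier note (`init_image` is a placeholder value):

```python
t2i_blocks = MyPipelineBlocks()

# Only trigger inputs affect selection; pass whatever would be non-None at runtime
resolved = t2i_blocks.get_execution_blocks(prompt="a cat", image=init_image)
print(resolved)  # SequentialPipelineBlocks containing only the img2img path

# Note the behavioral change in this hunk: leaf outputs are now propagated via
# `intermediate_outputs` (not `outputs`), so a block's products can act as
# triggers for blocks later in the sequence during traversal.
```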
@@ -1086,18 +1105,23 @@ class SequentialPipelineBlocks(ModularPipelineBlocks):
             f"{class_name}(\n Class: {base_class}\n" if base_class and base_class != "object" else f"{class_name}(\n"
         )
 
-        if self.trigger_inputs:
+        if self._workflow_map is None and self._get_trigger_inputs():
             header += "\n"
             header += " " + "=" * 100 + "\n"
             header += " This pipeline contains blocks that are selected at runtime based on inputs.\n"
-            header += f" Trigger Inputs: {[inp for inp in self.trigger_inputs if inp is not None]}\n"
+            header += f" Trigger Inputs: {[inp for inp in self._get_trigger_inputs() if inp is not None]}\n"
             # Get first trigger input as example
-            example_input = next(t for t in self.trigger_inputs if t is not None)
+            example_input = next(t for t in self._get_trigger_inputs() if t is not None)
             header += f" Use `get_execution_blocks()` to see selected blocks (e.g. `get_execution_blocks({example_input}=...)`).\n"
             header += " " + "=" * 100 + "\n\n"
 
+        description = self.description
+        if self._workflow_map is not None:
+            workflow_str = format_workflow(self._workflow_map)
+            description = f"{self.description}\n\n{workflow_str}"
+
         # Format description with proper indentation
-        desc_lines = self.description.split("\n")
+        desc_lines = description.split("\n")
         desc = []
         # First line with "Description:" label
         desc.append(f" Description: {desc_lines[0]}")
@@ -1145,10 +1169,15 @@ class SequentialPipelineBlocks(ModularPipelineBlocks):
 
     @property
     def doc(self):
+        description = self.description
+        if self._workflow_map is not None:
+            workflow_str = format_workflow(self._workflow_map)
+            description = f"{self.description}\n\n{workflow_str}"
+
         return make_doc_string(
             self.inputs,
             self.outputs,
-            self.description,
+            description=description,
             class_name=self.__class__.__name__,
             expected_components=self.expected_components,
             expected_configs=self.expected_configs,
@@ -1281,7 +1310,7 @@ class LoopSequentialPipelineBlocks(ModularPipelineBlocks):
     @property
     def intermediate_outputs(self) -> List[str]:
         named_outputs = [(name, block.intermediate_outputs) for name, block in self.sub_blocks.items()]
-        combined_outputs = self.combine_outputs(*named_outputs)
+        combined_outputs = combine_outputs(*named_outputs)
         for output in self.loop_intermediate_outputs:
             if output.name not in {output.name for output in combined_outputs}:
                 combined_outputs.append(output)
src/diffusers/modular_pipelines/modular_pipeline_utils.py

@@ -14,10 +14,12 @@
 
 import inspect
 import re
+import warnings
 from collections import OrderedDict
 from dataclasses import dataclass, field, fields
-from typing import Any, Dict, List, Literal, Optional, Type, Union
+from typing import Any, Dict, List, Literal, Optional, Tuple, Type, Union
 
+import PIL.Image
 import torch
 
 from ..configuration_utils import ConfigMixin, FrozenDict
@@ -323,11 +325,192 @@ class ConfigSpec:
     description: Optional[str] = None
 
 
-# YiYi Notes: both inputs and intermediate_inputs are InputParam objects
-# however some fields are not relevant for intermediate_inputs
-# e.g. unlike inputs, required only used in docstring for intermediate_inputs, we do not check if a required intermediate inputs is passed
-# default is not used for intermediate_inputs, we only use default from inputs, so it is ignored if it is set for intermediate_inputs
-# -> should we use different class for inputs and intermediate_inputs?
+# ======================================================
+# InputParam and OutputParam templates
+# ======================================================
+
+INPUT_PARAM_TEMPLATES = {
+    "prompt": {
+        "type_hint": str,
+        "required": True,
+        "description": "The prompt or prompts to guide image generation.",
+    },
+    "negative_prompt": {
+        "type_hint": str,
+        "description": "The prompt or prompts not to guide the image generation.",
+    },
+    "max_sequence_length": {
+        "type_hint": int,
+        "default": 512,
+        "description": "Maximum sequence length for prompt encoding.",
+    },
+    "height": {
+        "type_hint": int,
+        "description": "The height in pixels of the generated image.",
+    },
+    "width": {
+        "type_hint": int,
+        "description": "The width in pixels of the generated image.",
+    },
+    "num_inference_steps": {
+        "type_hint": int,
+        "default": 50,
+        "description": "The number of denoising steps.",
+    },
+    "num_images_per_prompt": {
+        "type_hint": int,
+        "default": 1,
+        "description": "The number of images to generate per prompt.",
+    },
+    "generator": {
+        "type_hint": torch.Generator,
+        "description": "Torch generator for deterministic generation.",
+    },
+    "sigmas": {
+        "type_hint": List[float],
+        "description": "Custom sigmas for the denoising process.",
+    },
+    "strength": {
+        "type_hint": float,
+        "default": 0.9,
+        "description": "Strength for img2img/inpainting.",
+    },
+    "image": {
+        "type_hint": Union[PIL.Image.Image, List[PIL.Image.Image]],
+        "required": True,
+        "description": "Reference image(s) for denoising. Can be a single image or list of images.",
+    },
+    "latents": {
+        "type_hint": torch.Tensor,
+        "description": "Pre-generated noisy latents for image generation.",
+    },
+    "timesteps": {
+        "type_hint": torch.Tensor,
+        "description": "Timesteps for the denoising process.",
+    },
+    "output_type": {
+        "type_hint": str,
+        "default": "pil",
+        "description": "Output format: 'pil', 'np', 'pt'.",
+    },
+    "attention_kwargs": {
+        "type_hint": Dict[str, Any],
+        "description": "Additional kwargs for attention processors.",
+    },
+    "denoiser_input_fields": {
+        "name": None,
+        "kwargs_type": "denoiser_input_fields",
+        "description": "conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.",
+    },
+    # inpainting
+    "mask_image": {
+        "type_hint": PIL.Image.Image,
+        "required": True,
+        "description": "Mask image for inpainting.",
+    },
+    "padding_mask_crop": {
+        "type_hint": int,
+        "description": "Padding for mask cropping in inpainting.",
+    },
+    # controlnet
+    "control_image": {
+        "type_hint": PIL.Image.Image,
+        "required": True,
+        "description": "Control image for ControlNet conditioning.",
+    },
+    "control_guidance_start": {
+        "type_hint": float,
+        "default": 0.0,
+        "description": "When to start applying ControlNet.",
+    },
+    "control_guidance_end": {
+        "type_hint": float,
+        "default": 1.0,
+        "description": "When to stop applying ControlNet.",
+    },
+    "controlnet_conditioning_scale": {
+        "type_hint": float,
+        "default": 1.0,
+        "description": "Scale for ControlNet conditioning.",
+    },
+    "layers": {
+        "type_hint": int,
+        "default": 4,
+        "description": "Number of layers to extract from the image",
+    },
+    # common intermediate inputs
+    "prompt_embeds": {
+        "type_hint": torch.Tensor,
+        "required": True,
+        "description": "text embeddings used to guide the image generation. Can be generated from text_encoder step.",
+    },
+    "prompt_embeds_mask": {
+        "type_hint": torch.Tensor,
+        "required": True,
+        "description": "mask for the text embeddings. Can be generated from text_encoder step.",
+    },
+    "negative_prompt_embeds": {
+        "type_hint": torch.Tensor,
+        "description": "negative text embeddings used to guide the image generation. Can be generated from text_encoder step.",
+    },
+    "negative_prompt_embeds_mask": {
+        "type_hint": torch.Tensor,
+        "description": "mask for the negative text embeddings. Can be generated from text_encoder step.",
+    },
+    "image_latents": {
+        "type_hint": torch.Tensor,
+        "required": True,
+        "description": "image latents used to guide the image generation. Can be generated from vae_encoder step.",
+    },
+    "batch_size": {
+        "type_hint": int,
+        "default": 1,
+        "description": "Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can be generated in input step.",
+    },
+    "dtype": {
+        "type_hint": torch.dtype,
+        "default": torch.float32,
+        "description": "The dtype of the model inputs, can be generated in input step.",
+    },
+}
+
+OUTPUT_PARAM_TEMPLATES = {
+    "images": {
+        "type_hint": List[PIL.Image.Image],
+        "description": "Generated images.",
+    },
+    "latents": {
+        "type_hint": torch.Tensor,
+        "description": "Denoised latents.",
+    },
+    # intermediate outputs
+    "prompt_embeds": {
+        "type_hint": torch.Tensor,
+        "kwargs_type": "denoiser_input_fields",
+        "description": "The prompt embeddings.",
+    },
+    "prompt_embeds_mask": {
+        "type_hint": torch.Tensor,
+        "kwargs_type": "denoiser_input_fields",
+        "description": "The encoder attention mask.",
+    },
+    "negative_prompt_embeds": {
+        "type_hint": torch.Tensor,
+        "kwargs_type": "denoiser_input_fields",
+        "description": "The negative prompt embeddings.",
+    },
+    "negative_prompt_embeds_mask": {
+        "type_hint": torch.Tensor,
+        "kwargs_type": "denoiser_input_fields",
+        "description": "The negative prompt embeddings mask.",
+    },
+    "image_latents": {
+        "type_hint": torch.Tensor,
+        "description": "The latent representation of the input image.",
+    },
+}
+
+
 @dataclass
 class InputParam:
     """Specification for an input parameter."""
@@ -337,11 +520,31 @@ class InputParam:
     default: Any = None
     required: bool = False
     description: str = ""
-    kwargs_type: str = None  # YiYi Notes: remove this feature (maybe)
+    kwargs_type: str = None
 
     def __repr__(self):
         return f"<{self.name}: {'required' if self.required else 'optional'}, default={self.default}>"
 
+    @classmethod
+    def template(cls, template_name: str, note: str = None, **overrides) -> "InputParam":
+        """Get template for name if exists, otherwise raise ValueError."""
+        if template_name not in INPUT_PARAM_TEMPLATES:
+            raise ValueError(f"InputParam template for {template_name} not found")
+
+        template_kwargs = INPUT_PARAM_TEMPLATES[template_name].copy()
+
+        # Determine the actual param name:
+        # 1. From overrides if provided
+        # 2. From template if present
+        # 3. Fall back to template_name
+        name = overrides.pop("name", template_kwargs.pop("name", template_name))
+
+        if note and "description" in template_kwargs:
+            template_kwargs["description"] = f"{template_kwargs['description']} ({note})"
+
+        template_kwargs.update(overrides)
+        return cls(name=name, **template_kwargs)
+
 
 @dataclass
 class OutputParam:
@@ -350,13 +553,33 @@ class OutputParam:
     name: str
     type_hint: Any = None
     description: str = ""
-    kwargs_type: str = None  # YiYi notes: remove this feature (maybe)
+    kwargs_type: str = None
 
     def __repr__(self):
         return (
             f"<{self.name}: {self.type_hint.__name__ if hasattr(self.type_hint, '__name__') else str(self.type_hint)}>"
         )
 
+    @classmethod
+    def template(cls, template_name: str, note: str = None, **overrides) -> "OutputParam":
+        """Get template for name if exists, otherwise raise ValueError."""
+        if template_name not in OUTPUT_PARAM_TEMPLATES:
+            raise ValueError(f"OutputParam template for {template_name} not found")
+
+        template_kwargs = OUTPUT_PARAM_TEMPLATES[template_name].copy()
+
+        # Determine the actual param name:
+        # 1. From overrides if provided
+        # 2. From template if present
+        # 3. Fall back to template_name
+        name = overrides.pop("name", template_kwargs.pop("name", template_name))
+
+        if note and "description" in template_kwargs:
+            template_kwargs["description"] = f"{template_kwargs['description']} ({note})"
+
+        template_kwargs.update(overrides)
+        return cls(name=name, **template_kwargs)
+
 
 def format_inputs_short(inputs):
     """
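For illustration, here is how the new `template` classmethod resolves names, notes, and overrides; the import path is an assumption based on the file shown above:

```python
from diffusers.modular_pipelines.modular_pipeline_utils import InputParam

# Plain template lookup: name, type_hint, and description come from the shared table
height = InputParam.template("height")

# `note` is appended to the templated description; other kwargs override fields
steps = InputParam.template("num_inference_steps", note="model default differs", default=30)
print(steps)  # <num_inference_steps: optional, default=30>

# Unknown template names fail loudly
InputParam.template("not_a_template")  # raises ValueError
```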
@@ -509,10 +732,12 @@ def format_params(params, header="Args", indent_level=4, max_line_length=115):
             desc = re.sub(r"\[(.*?)\]\((https?://[^\s\)]+)\)", r"[\1](\2)", param.description)
             wrapped_desc = wrap_text(desc, desc_indent, max_line_length)
             param_str += f"\n{desc_indent}{wrapped_desc}"
+        else:
+            param_str += f"\n{desc_indent}TODO: Add description."
 
         formatted_params.append(param_str)
 
-    return "\n\n".join(formatted_params)
+    return "\n".join(formatted_params)
 
 
 def format_input_params(input_params, indent_level=4, max_line_length=115):
@@ -582,7 +807,7 @@ def format_components(components, indent_level=4, max_line_length=115, add_empty_lines=True):
         loading_field_values = []
         for field_name in component.loading_fields():
             field_value = getattr(component, field_name)
-            if field_value is not None:
+            if field_value:
                 loading_field_values.append(f"{field_name}={field_value}")
 
         # Add loading field information if available
@@ -636,6 +861,30 @@ def format_configs(configs, indent_level=4, max_line_length=115, add_empty_lines=True):
     return "\n".join(formatted_configs)
 
 
+def format_workflow(workflow_map):
+    """Format a workflow map into a readable string representation.
+
+    Args:
+        workflow_map: Dictionary mapping workflow names to trigger inputs
+
+    Returns:
+        A formatted string representing all workflows
+    """
+    if workflow_map is None:
+        return ""
+
+    lines = ["Supported workflows:"]
+    for workflow_name, trigger_inputs in workflow_map.items():
+        required_inputs = [k for k, v in trigger_inputs.items() if v]
+        if required_inputs:
+            inputs_str = ", ".join(f"`{t}`" for t in required_inputs)
+            lines.append(f"  - `{workflow_name}`: requires {inputs_str}")
+        else:
+            lines.append(f"  - `{workflow_name}`: default (no additional inputs required)")
+
+    return "\n".join(lines)
+
+
 def make_doc_string(
     inputs,
     outputs,
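For a quick sense of the output shape, `format_workflow` applied to a small, hypothetical map:

```python
workflow_map = {
    "text2image": {},
    "img2img": {"image": True},
    "inpainting": {"image": True, "mask_image": True},
}
print(format_workflow(workflow_map))
# Supported workflows:
#   - `text2image`: default (no additional inputs required)
#   - `img2img`: requires `image`
#   - `inpainting`: requires `image`, `mask_image`
```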
@@ -669,17 +918,17 @@ def make_doc_string(
     # Add description
     if description:
         desc_lines = description.strip().split("\n")
-        aligned_desc = "\n".join("  " + line for line in desc_lines)
+        aligned_desc = "\n".join("  " + line.rstrip() for line in desc_lines)
         output += aligned_desc + "\n\n"
 
     # Add components section if provided
     if expected_components and len(expected_components) > 0:
-        components_str = format_components(expected_components, indent_level=2)
+        components_str = format_components(expected_components, indent_level=2, add_empty_lines=False)
         output += components_str + "\n\n"
 
     # Add configs section if provided
     if expected_configs and len(expected_configs) > 0:
-        configs_str = format_configs(expected_configs, indent_level=2)
+        configs_str = format_configs(expected_configs, indent_level=2, add_empty_lines=False)
         output += configs_str + "\n\n"
 
     # Add inputs section
@@ -690,3 +939,69 @@ def make_doc_string(
     output += format_output_params(outputs, indent_level=2)
 
     return output
+
+
+def combine_inputs(*named_input_lists: List[Tuple[str, List[InputParam]]]) -> List[InputParam]:
+    """
+    Combines multiple lists of InputParam objects from different blocks. For duplicate inputs, updates only if current
+    default value is None and new default value is not None. Warns if multiple non-None default values exist for the
+    same input.
+
+    Args:
+        named_input_lists: List of tuples containing (block_name, input_param_list) pairs
+
+    Returns:
+        List[InputParam]: Combined list of unique InputParam objects
+    """
+    combined_dict = {}  # name -> InputParam
+    value_sources = {}  # name -> block_name
+
+    for block_name, inputs in named_input_lists:
+        for input_param in inputs:
+            if input_param.name is None and input_param.kwargs_type is not None:
+                input_name = "*_" + input_param.kwargs_type
+            else:
+                input_name = input_param.name
+            if input_name in combined_dict:
+                current_param = combined_dict[input_name]
+                if (
+                    current_param.default is not None
+                    and input_param.default is not None
+                    and current_param.default != input_param.default
+                ):
+                    warnings.warn(
+                        f"Multiple different default values found for input '{input_name}': "
+                        f"{current_param.default} (from block '{value_sources[input_name]}') and "
+                        f"{input_param.default} (from block '{block_name}'). Using {current_param.default}."
+                    )
+                if current_param.default is None and input_param.default is not None:
+                    combined_dict[input_name] = input_param
+                    value_sources[input_name] = block_name
+            else:
+                combined_dict[input_name] = input_param
+                value_sources[input_name] = block_name
+
+    return list(combined_dict.values())
+
+
+def combine_outputs(*named_output_lists: List[Tuple[str, List[OutputParam]]]) -> List[OutputParam]:
+    """
+    Combines multiple lists of OutputParam objects from different blocks. For duplicate outputs, keeps the first
+    occurrence of each output name.
+
+    Args:
+        named_output_lists: List of tuples containing (block_name, output_param_list) pairs
+
+    Returns:
+        List[OutputParam]: Combined list of unique OutputParam objects
+    """
+    combined_dict = {}  # name -> OutputParam
+
+    for block_name, outputs in named_output_lists:
+        for output_param in outputs:
+            if (output_param.name not in combined_dict) or (
+                combined_dict[output_param.name].kwargs_type is None and output_param.kwargs_type is not None
+            ):
+                combined_dict[output_param.name] = output_param
+
+    return list(combined_dict.values())
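A small worked example of the duplicate-resolution rules in `combine_inputs` (the block names and defaults are illustrative):

```python
block_a = [InputParam("height"), InputParam("strength", default=0.6)]
block_b = [InputParam("height", default=1024), InputParam("strength", default=0.9)]

merged = combine_inputs(("block_a", block_a), ("block_b", block_b))
# height:   block_a had default=None, so block_b's 1024 wins
# strength: two different non-None defaults -> a warning is emitted and the
#           first-seen default (0.6 from block_a) is kept
```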
src/diffusers/modular_pipelines/qwenimage/…

@@ -118,7 +118,40 @@ def get_timesteps(scheduler, num_inference_steps, strength):
 # ====================
 
 
+# auto_docstring
 class QwenImagePrepareLatentsStep(ModularPipelineBlocks):
+    """
+    Prepare initial random noise for the generation process
+
+    Components:
+        pachifier (`QwenImagePachifier`)
+
+    Inputs:
+        latents (`Tensor`, *optional*):
+            Pre-generated noisy latents for image generation.
+        height (`int`, *optional*):
+            The height in pixels of the generated image.
+        width (`int`, *optional*):
+            The width in pixels of the generated image.
+        num_images_per_prompt (`int`, *optional*, defaults to 1):
+            The number of images to generate per prompt.
+        generator (`Generator`, *optional*):
+            Torch generator for deterministic generation.
+        batch_size (`int`, *optional*, defaults to 1):
+            Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can
+            be generated in input step.
+        dtype (`dtype`, *optional*, defaults to torch.float32):
+            The dtype of the model inputs, can be generated in input step.
+
+    Outputs:
+        height (`int`):
+            if not set, updated to default value
+        width (`int`):
+            if not set, updated to default value
+        latents (`Tensor`):
+            The initial latents to use for the denoising process
+    """
+
     model_name = "qwenimage"
 
     @property
@@ -134,28 +167,20 @@ class QwenImagePrepareLatentsStep(ModularPipelineBlocks):
     @property
     def inputs(self) -> List[InputParam]:
         return [
-            InputParam("latents"),
-            InputParam(name="height"),
-            InputParam(name="width"),
-            InputParam(name="num_images_per_prompt", default=1),
-            InputParam(name="generator"),
-            InputParam(
-                name="batch_size",
-                required=True,
-                type_hint=int,
-                description="Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can be generated in input step.",
-            ),
-            InputParam(
-                name="dtype",
-                required=True,
-                type_hint=torch.dtype,
-                description="The dtype of the model inputs, can be generated in input step.",
-            ),
+            InputParam.template("latents"),
+            InputParam.template("height"),
+            InputParam.template("width"),
+            InputParam.template("num_images_per_prompt"),
+            InputParam.template("generator"),
+            InputParam.template("batch_size"),
+            InputParam.template("dtype"),
         ]
 
     @property
     def intermediate_outputs(self) -> List[OutputParam]:
         return [
+            OutputParam(name="height", type_hint=int, description="if not set, updated to default value"),
+            OutputParam(name="width", type_hint=int, description="if not set, updated to default value"),
             OutputParam(
                 name="latents",
                 type_hint=torch.Tensor,
@@ -209,7 +234,42 @@ class QwenImagePrepareLatentsStep(ModularPipelineBlocks):
         return components, state


+# auto_docstring
 class QwenImageLayeredPrepareLatentsStep(ModularPipelineBlocks):
+    """
+    Prepare initial random noise (B, layers+1, C, H, W) for the generation process
+
+    Components:
+        pachifier (`QwenImageLayeredPachifier`)
+
+    Inputs:
+        latents (`Tensor`, *optional*):
+            Pre-generated noisy latents for image generation.
+        height (`int`, *optional*):
+            The height in pixels of the generated image.
+        width (`int`, *optional*):
+            The width in pixels of the generated image.
+        layers (`int`, *optional*, defaults to 4):
+            Number of layers to extract from the image
+        num_images_per_prompt (`int`, *optional*, defaults to 1):
+            The number of images to generate per prompt.
+        generator (`Generator`, *optional*):
+            Torch generator for deterministic generation.
+        batch_size (`int`, *optional*, defaults to 1):
+            Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt.
+            Can be generated in input step.
+        dtype (`dtype`, *optional*, defaults to torch.float32):
+            The dtype of the model inputs, can be generated in input step.
+
+    Outputs:
+        height (`int`):
+            if not set, updated to default value
+        width (`int`):
+            if not set, updated to default value
+        latents (`Tensor`):
+            The initial latents to use for the denoising process
+    """

     model_name = "qwenimage-layered"

     @property
@@ -225,29 +285,21 @@ class QwenImageLayeredPrepareLatentsStep(ModularPipelineBlocks):
     @property
     def inputs(self) -> List[InputParam]:
         return [
-            InputParam("latents"),
-            InputParam(name="height"),
-            InputParam(name="width"),
-            InputParam(name="layers", default=4),
-            InputParam(name="num_images_per_prompt", default=1),
-            InputParam(name="generator"),
-            InputParam(
-                name="batch_size",
-                required=True,
-                type_hint=int,
-                description="Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can be generated in input step.",
-            ),
-            InputParam(
-                name="dtype",
-                required=True,
-                type_hint=torch.dtype,
-                description="The dtype of the model inputs, can be generated in input step.",
-            ),
+            InputParam.template("latents"),
+            InputParam.template("height"),
+            InputParam.template("width"),
+            InputParam.template("layers"),
+            InputParam.template("num_images_per_prompt"),
+            InputParam.template("generator"),
+            InputParam.template("batch_size"),
+            InputParam.template("dtype"),
         ]

     @property
     def intermediate_outputs(self) -> List[OutputParam]:
         return [
+            OutputParam(name="height", type_hint=int, description="if not set, updated to default value"),
+            OutputParam(name="width", type_hint=int, description="if not set, updated to default value"),
             OutputParam(
                 name="latents",
                 type_hint=torch.Tensor,
@@ -301,7 +353,31 @@ class QwenImageLayeredPrepareLatentsStep(ModularPipelineBlocks):
         return components, state


+# auto_docstring
 class QwenImagePrepareLatentsWithStrengthStep(ModularPipelineBlocks):
+    """
+    Step that adds noise to image latents for image-to-image/inpainting. Should be run after set_timesteps and
+    prepare_latents. Both noise and image latents should already be patchified.
+
+    Components:
+        scheduler (`FlowMatchEulerDiscreteScheduler`)
+
+    Inputs:
+        latents (`Tensor`):
+            The initial random noise, can be generated in prepare latents step.
+        image_latents (`Tensor`):
+            image latents used to guide the image generation. Can be generated from vae_encoder step. (Can be
+            generated from vae encoder and updated in input step.)
+        timesteps (`Tensor`):
+            The timesteps to use for the denoising process. Can be generated in set_timesteps step.
+
+    Outputs:
+        initial_noise (`Tensor`):
+            The initial random noise used for inpainting denoising.
+        latents (`Tensor`):
+            The scaled noisy latents to use for inpainting/image-to-image denoising.
+    """

     model_name = "qwenimage"

     @property
@@ -323,12 +399,7 @@ class QwenImagePrepareLatentsWithStrengthStep(ModularPipelineBlocks):
                 type_hint=torch.Tensor,
                 description="The initial random noised, can be generated in prepare latent step.",
             ),
-            InputParam(
-                name="image_latents",
-                required=True,
-                type_hint=torch.Tensor,
-                description="The image latents to use for the denoising process. Can be generated in vae encoder and packed in input step.",
-            ),
+            InputParam.template("image_latents", note="Can be generated from vae encoder and updated in input step."),
             InputParam(
                 name="timesteps",
                 required=True,
@@ -345,6 +416,11 @@ class QwenImagePrepareLatentsWithStrengthStep(ModularPipelineBlocks):
                 type_hint=torch.Tensor,
                 description="The initial random noised used for inpainting denoising.",
             ),
+            OutputParam(
+                name="latents",
+                type_hint=torch.Tensor,
+                description="The scaled noisy latents to use for inpainting/image-to-image denoising.",
+            ),
         ]

     @staticmethod
@@ -382,7 +458,29 @@ class QwenImagePrepareLatentsWithStrengthStep(ModularPipelineBlocks):
         return components, state
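For orientation, "adds noise to image latents" in the flow-match setting is a sigma-weighted blend at the first kept timestep: sigma near 1 keeps mostly noise (high strength), sigma near 0 keeps mostly the image. A runnable sketch under that assumption, not the scheduler's exact code:

import torch

def add_noise_flow_match(image_latents: torch.Tensor, noise: torch.Tensor, sigma: float) -> torch.Tensor:
    # assumes FlowMatchEulerDiscreteScheduler-style interpolation between image and noise
    return sigma * noise + (1.0 - sigma) * image_latents

image_latents = torch.randn(1, 1024, 64)  # already patchified: (batch, sequence, channels)
noise = torch.randn_like(image_latents)
latents = add_noise_flow_match(image_latents, noise, sigma=0.9)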


+# auto_docstring
 class QwenImageCreateMaskLatentsStep(ModularPipelineBlocks):
+    """
+    Step that creates mask latents from preprocessed mask_image by interpolating to latent space.
+
+    Components:
+        pachifier (`QwenImagePachifier`)
+
+    Inputs:
+        processed_mask_image (`Tensor`):
+            The processed mask to use for the inpainting process.
+        height (`int`):
+            The height in pixels of the generated image.
+        width (`int`):
+            The width in pixels of the generated image.
+        dtype (`dtype`, *optional*, defaults to torch.float32):
+            The dtype of the model inputs, can be generated in input step.
+
+    Outputs:
+        mask (`Tensor`):
+            The mask to use for the inpainting process.
+    """

     model_name = "qwenimage"

     @property
@@ -404,9 +502,9 @@ class QwenImageCreateMaskLatentsStep(ModularPipelineBlocks):
                 type_hint=torch.Tensor,
                 description="The processed mask to use for the inpainting process.",
             ),
-            InputParam(name="height", required=True),
-            InputParam(name="width", required=True),
-            InputParam(name="dtype", required=True),
+            InputParam.template("height", required=True),
+            InputParam.template("width", required=True),
+            InputParam.template("dtype"),
         ]
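"Interpolating to latent space" here means the preprocessed pixel-space mask is resized to the latent grid before being packed next to the latents. A sketch with an assumed vae_scale_factor of 8:

import torch
import torch.nn.functional as F

def mask_to_latent_grid(processed_mask_image: torch.Tensor, height: int, width: int, vae_scale_factor: int = 8):
    # nearest-neighbor keeps the mask binary after resizing
    latent_h, latent_w = height // vae_scale_factor, width // vae_scale_factor
    return F.interpolate(processed_mask_image, size=(latent_h, latent_w), mode="nearest")

mask = mask_to_latent_grid(torch.ones(1, 1, 1024, 1024), height=1024, width=1024)  # -> (1, 1, 128, 128)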

     @property
@@ -450,7 +548,27 @@ class QwenImageCreateMaskLatentsStep(ModularPipelineBlocks):
 # ====================


+# auto_docstring
 class QwenImageSetTimestepsStep(ModularPipelineBlocks):
+    """
+    Step that sets the scheduler's timesteps for text-to-image generation. Should be run after prepare latents step.
+
+    Components:
+        scheduler (`FlowMatchEulerDiscreteScheduler`)
+
+    Inputs:
+        num_inference_steps (`int`, *optional*, defaults to 50):
+            The number of denoising steps.
+        sigmas (`List`, *optional*):
+            Custom sigmas for the denoising process.
+        latents (`Tensor`):
+            The initial random noised latents for the denoising process. Can be generated in prepare latents step.
+
+    Outputs:
+        timesteps (`Tensor`):
+            The timesteps to use for the denoising process
+    """

     model_name = "qwenimage"

     @property
@@ -466,13 +584,13 @@ class QwenImageSetTimestepsStep(ModularPipelineBlocks):
     @property
     def inputs(self) -> List[InputParam]:
         return [
-            InputParam(name="num_inference_steps", default=50),
-            InputParam(name="sigmas"),
+            InputParam.template("num_inference_steps"),
+            InputParam.template("sigmas"),
             InputParam(
                 name="latents",
                 required=True,
                 type_hint=torch.Tensor,
-                description="The latents to use for the denoising process, used to calculate the image sequence length.",
+                description="The initial random noised latents for the denoising process. Can be generated in prepare latents step.",
             ),
         ]
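The reworded `latents` description keeps the original rationale: the latents' sequence length feeds the dynamic-shift parameter (often called `mu`) handed to the scheduler. A sketch of that dependency, with illustrative interpolation constants:

import torch

def calculate_shift_sketch(image_seq_len, base_seq_len=256, max_seq_len=4096, base_shift=0.5, max_shift=1.15):
    # linear interpolation of the shift as a function of sequence length (constants are assumptions)
    m = (max_shift - base_shift) / (max_seq_len - base_seq_len)
    return image_seq_len * m + (base_shift - base_seq_len * m)

latents = torch.randn(1, 1024, 64)             # (batch, sequence, channels), patchified
mu = calculate_shift_sketch(latents.shape[1])  # then roughly: scheduler.set_timesteps(..., mu=mu)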

@@ -516,7 +634,27 @@ class QwenImageSetTimestepsStep(ModularPipelineBlocks):
         return components, state


+# auto_docstring
 class QwenImageLayeredSetTimestepsStep(ModularPipelineBlocks):
+    """
+    Set timesteps step for QwenImage Layered with custom mu calculation based on image_latents.
+
+    Components:
+        scheduler (`FlowMatchEulerDiscreteScheduler`)
+
+    Inputs:
+        num_inference_steps (`int`, *optional*, defaults to 50):
+            The number of denoising steps.
+        sigmas (`List`, *optional*):
+            Custom sigmas for the denoising process.
+        image_latents (`Tensor`):
+            image latents used to guide the image generation. Can be generated from vae_encoder step.
+
+    Outputs:
+        timesteps (`Tensor`):
+            The timesteps to use for the denoising process.
+    """

     model_name = "qwenimage-layered"

     @property
@@ -532,15 +670,17 @@ class QwenImageLayeredSetTimestepsStep(ModularPipelineBlocks):
     @property
     def inputs(self) -> List[InputParam]:
         return [
-            InputParam("num_inference_steps", default=50, type_hint=int),
-            InputParam("sigmas", type_hint=List[float]),
-            InputParam("image_latents", required=True, type_hint=torch.Tensor),
+            InputParam.template("num_inference_steps"),
+            InputParam.template("sigmas"),
+            InputParam.template("image_latents"),
         ]

     @property
     def intermediate_outputs(self) -> List[OutputParam]:
         return [
-            OutputParam(name="timesteps", type_hint=torch.Tensor),
+            OutputParam(
+                name="timesteps", type_hint=torch.Tensor, description="The timesteps to use for the denoising process."
+            ),
         ]

     @torch.no_grad()
@@ -574,7 +714,32 @@ class QwenImageLayeredSetTimestepsStep(ModularPipelineBlocks):
         return components, state


+# auto_docstring
 class QwenImageSetTimestepsWithStrengthStep(ModularPipelineBlocks):
+    """
+    Step that sets the scheduler's timesteps for image-to-image generation and inpainting. Should be run after the
+    prepare latents step.
+
+    Components:
+        scheduler (`FlowMatchEulerDiscreteScheduler`)
+
+    Inputs:
+        num_inference_steps (`int`, *optional*, defaults to 50):
+            The number of denoising steps.
+        sigmas (`List`, *optional*):
+            Custom sigmas for the denoising process.
+        latents (`Tensor`):
+            The latents to use for the denoising process. Can be generated in prepare latents step.
+        strength (`float`, *optional*, defaults to 0.9):
+            Strength for img2img/inpainting.
+
+    Outputs:
+        timesteps (`Tensor`):
+            The timesteps to use for the denoising process.
+        num_inference_steps (`int`):
+            The number of denoising steps to perform at inference time. Updated based on strength.
+    """

     model_name = "qwenimage"

     @property
@@ -590,15 +755,15 @@ class QwenImageSetTimestepsWithStrengthStep(ModularPipelineBlocks):
     @property
     def inputs(self) -> List[InputParam]:
         return [
-            InputParam(name="num_inference_steps", default=50),
-            InputParam(name="sigmas"),
+            InputParam.template("num_inference_steps"),
+            InputParam.template("sigmas"),
             InputParam(
-                name="latents",
+                "latents",
                 required=True,
                 type_hint=torch.Tensor,
-                description="The latents to use for the denoising process, used to calculate the image sequence length.",
+                description="The latents to use for the denoising process. Can be generated in prepare latents step.",
             ),
-            InputParam(name="strength", default=0.9),
+            InputParam.template("strength", default=0.9),
         ]
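The `num_inference_steps` output is "updated based on strength" by dropping the early part of the schedule, following the classic get_timesteps pattern named in the hunk header near the top of this file (illustrative standalone version):

def truncated_schedule(timesteps, num_inference_steps, strength):
    init_timestep = min(num_inference_steps * strength, num_inference_steps)
    t_start = int(max(num_inference_steps - init_timestep, 0))
    kept = timesteps[t_start:]
    return kept, len(kept)

# strength=0.9 over 50 steps skips the 5 noisiest steps and denoises for 45
timesteps, num_inference_steps = truncated_schedule(list(range(50)), 50, 0.9)
assert num_inference_steps == 45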

     @property
@@ -607,7 +772,12 @@ class QwenImageSetTimestepsWithStrengthStep(ModularPipelineBlocks):
             OutputParam(
                 name="timesteps",
                 type_hint=torch.Tensor,
-                description="The timesteps to use for the denoising process. Can be generated in set_timesteps step.",
+                description="The timesteps to use for the denoising process.",
+            ),
+            OutputParam(
+                name="num_inference_steps",
+                type_hint=int,
+                description="The number of denoising steps to perform at inference time. Updated based on strength.",
             ),
         ]

@@ -654,7 +824,29 @@ class QwenImageSetTimestepsWithStrengthStep(ModularPipelineBlocks):
 ## RoPE inputs for denoiser


+# auto_docstring
 class QwenImageRoPEInputsStep(ModularPipelineBlocks):
+    """
+    Step that prepares the RoPE inputs for the denoising process. Should be placed after the prepare_latents step.
+
+    Inputs:
+        batch_size (`int`, *optional*, defaults to 1):
+            Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt.
+            Can be generated in input step.
+        height (`int`):
+            The height in pixels of the generated image.
+        width (`int`):
+            The width in pixels of the generated image.
+        prompt_embeds_mask (`Tensor`):
+            mask for the text embeddings. Can be generated from text_encoder step.
+        negative_prompt_embeds_mask (`Tensor`, *optional*):
+            mask for the negative text embeddings. Can be generated from text_encoder step.
+
+    Outputs:
+        img_shapes (`List`):
+            The shapes of the image latents, used for RoPE calculation
+    """

     model_name = "qwenimage"

     @property
@@ -666,11 +858,11 @@ class QwenImageRoPEInputsStep(ModularPipelineBlocks):
     @property
     def inputs(self) -> List[InputParam]:
         return [
-            InputParam(name="batch_size", required=True),
-            InputParam(name="height", required=True),
-            InputParam(name="width", required=True),
-            InputParam(name="prompt_embeds_mask"),
-            InputParam(name="negative_prompt_embeds_mask"),
+            InputParam.template("batch_size"),
+            InputParam.template("height", required=True),
+            InputParam.template("width", required=True),
+            InputParam.template("prompt_embeds_mask"),
+            InputParam.template("negative_prompt_embeds_mask"),
         ]
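`img_shapes` is one (frames, latent_height, latent_width) triple per prompt; the latent grid divides the pixel size by the VAE scale factor and the 2x2 patch size. The combined factor of 16 below is an assumption for illustration:

def rope_img_shapes(batch_size: int, height: int, width: int, scale: int = 16):
    # scale = vae_scale_factor (assumed 8) * patch size (assumed 2)
    return [(1, height // scale, width // scale)] * batch_size

img_shapes = rope_img_shapes(batch_size=2, height=1024, width=1024)  # [(1, 64, 64), (1, 64, 64)]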

     @property
@@ -702,7 +894,34 @@ class QwenImageRoPEInputsStep(ModularPipelineBlocks):
         return components, state


+# auto_docstring
 class QwenImageEditRoPEInputsStep(ModularPipelineBlocks):
+    """
+    Step that prepares the RoPE inputs for the denoising process. This is used in QwenImage Edit. Should be placed
+    after the prepare_latents step.
+
+    Inputs:
+        batch_size (`int`, *optional*, defaults to 1):
+            Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt.
+            Can be generated in input step.
+        image_height (`int`):
+            The height of the reference image. Can be generated in input step.
+        image_width (`int`):
+            The width of the reference image. Can be generated in input step.
+        height (`int`):
+            The height in pixels of the generated image.
+        width (`int`):
+            The width in pixels of the generated image.
+        prompt_embeds_mask (`Tensor`):
+            mask for the text embeddings. Can be generated from text_encoder step.
+        negative_prompt_embeds_mask (`Tensor`, *optional*):
+            mask for the negative text embeddings. Can be generated from text_encoder step.
+
+    Outputs:
+        img_shapes (`List`):
+            The shapes of the image latents, used for RoPE calculation
+    """

     model_name = "qwenimage"

     @property
@@ -712,13 +931,23 @@ class QwenImageEditRoPEInputsStep(ModularPipelineBlocks):
     @property
     def inputs(self) -> List[InputParam]:
         return [
-            InputParam(name="batch_size", required=True),
-            InputParam(name="image_height", required=True),
-            InputParam(name="image_width", required=True),
-            InputParam(name="height", required=True),
-            InputParam(name="width", required=True),
-            InputParam(name="prompt_embeds_mask"),
-            InputParam(name="negative_prompt_embeds_mask"),
+            InputParam.template("batch_size"),
+            InputParam(
+                name="image_height",
+                required=True,
+                type_hint=int,
+                description="The height of the reference image. Can be generated in input step.",
+            ),
+            InputParam(
+                name="image_width",
+                required=True,
+                type_hint=int,
+                description="The width of the reference image. Can be generated in input step.",
+            ),
+            InputParam.template("height", required=True),
+            InputParam.template("width", required=True),
+            InputParam.template("prompt_embeds_mask"),
+            InputParam.template("negative_prompt_embeds_mask"),
         ]

     @property
@@ -756,7 +985,39 @@ class QwenImageEditRoPEInputsStep(ModularPipelineBlocks):
         return components, state


+# auto_docstring
 class QwenImageEditPlusRoPEInputsStep(ModularPipelineBlocks):
+    """
+    Step that prepares the RoPE inputs for the denoising process. This is used in QwenImage Edit Plus. Unlike Edit,
+    Edit Plus handles lists of image_height/image_width for multiple reference images. Should be placed after the
+    prepare_latents step.
+
+    Inputs:
+        batch_size (`int`, *optional*, defaults to 1):
+            Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt.
+            Can be generated in input step.
+        image_height (`List`):
+            The heights of the reference images. Can be generated in input step.
+        image_width (`List`):
+            The widths of the reference images. Can be generated in input step.
+        height (`int`):
+            The height in pixels of the generated image.
+        width (`int`):
+            The width in pixels of the generated image.
+        prompt_embeds_mask (`Tensor`):
+            mask for the text embeddings. Can be generated from text_encoder step.
+        negative_prompt_embeds_mask (`Tensor`, *optional*):
+            mask for the negative text embeddings. Can be generated from text_encoder step.
+
+    Outputs:
+        img_shapes (`List`):
+            The shapes of the image latents, used for RoPE calculation
+        txt_seq_lens (`List`):
+            The sequence lengths of the prompt embeds, used for RoPE calculation
+        negative_txt_seq_lens (`List`):
+            The sequence lengths of the negative prompt embeds, used for RoPE calculation
+    """

     model_name = "qwenimage-edit-plus"

     @property
@@ -770,13 +1031,23 @@ class QwenImageEditPlusRoPEInputsStep(ModularPipelineBlocks):
     @property
     def inputs(self) -> List[InputParam]:
         return [
-            InputParam(name="batch_size", required=True),
-            InputParam(name="image_height", required=True, type_hint=List[int]),
-            InputParam(name="image_width", required=True, type_hint=List[int]),
-            InputParam(name="height", required=True),
-            InputParam(name="width", required=True),
-            InputParam(name="prompt_embeds_mask"),
-            InputParam(name="negative_prompt_embeds_mask"),
+            InputParam.template("batch_size"),
+            InputParam(
+                name="image_height",
+                required=True,
+                type_hint=List[int],
+                description="The heights of the reference images. Can be generated in input step.",
+            ),
+            InputParam(
+                name="image_width",
+                required=True,
+                type_hint=List[int],
+                description="The widths of the reference images. Can be generated in input step.",
+            ),
+            InputParam.template("height", required=True),
+            InputParam.template("width", required=True),
+            InputParam.template("prompt_embeds_mask"),
+            InputParam.template("negative_prompt_embeds_mask"),
         ]

     @property
@@ -832,7 +1103,37 @@ class QwenImageEditPlusRoPEInputsStep(ModularPipelineBlocks):
         return components, state


+# auto_docstring
 class QwenImageLayeredRoPEInputsStep(ModularPipelineBlocks):
+    """
+    Step that prepares the RoPE inputs for the denoising process. Should be placed after the prepare_latents step.
+
+    Inputs:
+        batch_size (`int`, *optional*, defaults to 1):
+            Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt.
+            Can be generated in input step.
+        layers (`int`, *optional*, defaults to 4):
+            Number of layers to extract from the image
+        height (`int`):
+            The height in pixels of the generated image.
+        width (`int`):
+            The width in pixels of the generated image.
+        prompt_embeds_mask (`Tensor`):
+            mask for the text embeddings. Can be generated from text_encoder step.
+        negative_prompt_embeds_mask (`Tensor`, *optional*):
+            mask for the negative text embeddings. Can be generated from text_encoder step.
+
+    Outputs:
+        img_shapes (`List`):
+            The shapes of the image latents, used for RoPE calculation
+        txt_seq_lens (`List`):
+            The sequence lengths of the prompt embeds, used for RoPE calculation
+        negative_txt_seq_lens (`List`):
+            The sequence lengths of the negative prompt embeds, used for RoPE calculation
+        additional_t_cond (`Tensor`):
+            The additional t cond, used for RoPE calculation
+    """

     model_name = "qwenimage-layered"

     @property
@@ -844,12 +1145,12 @@ class QwenImageLayeredRoPEInputsStep(ModularPipelineBlocks):
     @property
     def inputs(self) -> List[InputParam]:
         return [
-            InputParam(name="batch_size", required=True),
-            InputParam(name="layers", required=True),
-            InputParam(name="height", required=True),
-            InputParam(name="width", required=True),
-            InputParam(name="prompt_embeds_mask"),
-            InputParam(name="negative_prompt_embeds_mask"),
+            InputParam.template("batch_size"),
+            InputParam.template("layers"),
+            InputParam.template("height", required=True),
+            InputParam.template("width", required=True),
+            InputParam.template("prompt_embeds_mask"),
+            InputParam.template("negative_prompt_embeds_mask"),
         ]

     @property
@@ -914,7 +1215,34 @@ class QwenImageLayeredRoPEInputsStep(ModularPipelineBlocks):


 ## ControlNet inputs for denoiser


+# auto_docstring
 class QwenImageControlNetBeforeDenoiserStep(ModularPipelineBlocks):
+    """
+    Step that prepares inputs for controlnet. Insert before the Denoise Step, after the set_timesteps step.
+
+    Components:
+        controlnet (`QwenImageControlNetModel`)
+
+    Inputs:
+        control_guidance_start (`float`, *optional*, defaults to 0.0):
+            When to start applying ControlNet.
+        control_guidance_end (`float`, *optional*, defaults to 1.0):
+            When to stop applying ControlNet.
+        controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0):
+            Scale for ControlNet conditioning.
+        control_image_latents (`Tensor`):
+            The control image latents to use for the denoising process. Can be generated in controlnet vae encoder
+            step.
+        timesteps (`Tensor`):
+            The timesteps to use for the denoising process. Can be generated in set_timesteps step.
+
+    Outputs:
+        controlnet_keep (`List`):
+            The controlnet keep values
+    """

     model_name = "qwenimage"

     @property
@@ -930,12 +1258,17 @@ class QwenImageControlNetBeforeDenoiserStep(ModularPipelineBlocks):
     @property
     def inputs(self) -> List[InputParam]:
         return [
-            InputParam("control_guidance_start", default=0.0),
-            InputParam("control_guidance_end", default=1.0),
-            InputParam("controlnet_conditioning_scale", default=1.0),
-            InputParam("control_image_latents", required=True),
+            InputParam.template("control_guidance_start"),
+            InputParam.template("control_guidance_end"),
+            InputParam.template("controlnet_conditioning_scale"),
             InputParam(
-                "timesteps",
+                name="control_image_latents",
+                required=True,
+                type_hint=torch.Tensor,
+                description="The control image latents to use for the denoising process. Can be generated in controlnet vae encoder step.",
+            ),
+            InputParam(
+                name="timesteps",
                 required=True,
                 type_hint=torch.Tensor,
                 description="The timesteps to use for the denoising process. Can be generated in set_timesteps step.",
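The `controlnet_keep` output gates ControlNet per step from the start/end window, in the style of the conventional diffusers rule (shown here as an illustrative standalone function, not this block's exact code):

def compute_controlnet_keep(num_timesteps: int, start: float, end: float):
    keeps = []
    for i in range(num_timesteps):
        # 0.0 outside the [start, end] fraction of the schedule, 1.0 inside
        outside = (i / num_timesteps) < start or ((i + 1) / num_timesteps) > end
        keeps.append(0.0 if outside else 1.0)
    return keeps

controlnet_keep = compute_controlnet_keep(10, start=0.0, end=0.5)  # 1.0 for the first half only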

@@ -12,10 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-from typing import List, Union
+from typing import Any, Dict, List

-import numpy as np
-import PIL
 import torch

 from ...configuration_utils import FrozenDict
@@ -31,7 +29,30 @@ logger = logging.get_logger(__name__)


 # after denoising loop (unpack latents)


+# auto_docstring
 class QwenImageAfterDenoiseStep(ModularPipelineBlocks):
+    """
+    Step that unpacks the latents from a 3D tensor (batch_size, sequence_length, channels) into a 5D tensor
+    (batch_size, channels, 1, height, width)
+
+    Components:
+        pachifier (`QwenImagePachifier`)
+
+    Inputs:
+        height (`int`):
+            The height in pixels of the generated image.
+        width (`int`):
+            The width in pixels of the generated image.
+        latents (`Tensor`):
+            The latents to decode, can be generated in the denoise step.
+
+    Outputs:
+        latents (`Tensor`):
+            The denoised latents unpacked to B, C, 1, H, W
+    """

     model_name = "qwenimage"

     @property
@@ -49,13 +70,21 @@ class QwenImageAfterDenoiseStep(ModularPipelineBlocks):
     @property
     def inputs(self) -> List[InputParam]:
         return [
-            InputParam(name="height", required=True),
-            InputParam(name="width", required=True),
+            InputParam.template("height", required=True),
+            InputParam.template("width", required=True),
             InputParam(
                 name="latents",
                 required=True,
                 type_hint=torch.Tensor,
-                description="The latents to decode, can be generated in the denoise step",
+                description="The latents to decode, can be generated in the denoise step.",
+            ),
+        ]
+
+    @property
+    def intermediate_outputs(self) -> List[OutputParam]:
+        return [
+            OutputParam(
+                name="latents", type_hint=torch.Tensor, description="The denoised latents unpacked to B, C, 1, H, W"
             ),
         ]
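The 3D-to-5D unpack advertised in the docstring reverses 2x2 latent patchification. A runnable sketch assuming 16 latent channels and a VAE scale factor of 8, so each sequence token covers a 2x2 latent patch (dimension choices are illustrative, not the pachifier's exact code):

import torch

def unpack_latents(latents: torch.Tensor, height: int, width: int, vae_scale_factor: int = 8, channels: int = 16):
    b = latents.shape[0]
    h, w = height // vae_scale_factor // 2, width // vae_scale_factor // 2
    latents = latents.view(b, h, w, channels, 2, 2)        # (B, seq, C*4) -> patch grid
    latents = latents.permute(0, 3, 1, 4, 2, 5)            # channels first, interleave the 2x2 patches
    return latents.reshape(b, channels, 1, h * 2, w * 2)   # (B, C, 1, H/8, W/8)

out = unpack_latents(torch.randn(1, 64 * 64, 64), height=1024, width=1024)  # -> (1, 16, 1, 128, 128)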

@@ -72,7 +101,29 @@ class QwenImageAfterDenoiseStep(ModularPipelineBlocks):
         return components, state


+# auto_docstring
 class QwenImageLayeredAfterDenoiseStep(ModularPipelineBlocks):
+    """
+    Unpack latents from (B, seq, C*4) to (B, C, layers+1, H, W) after denoising.
+
+    Components:
+        pachifier (`QwenImageLayeredPachifier`)
+
+    Inputs:
+        latents (`Tensor`):
+            The denoised latents to decode, can be generated in the denoise step.
+        height (`int`):
+            The height in pixels of the generated image.
+        width (`int`):
+            The width in pixels of the generated image.
+        layers (`int`, *optional*, defaults to 4):
+            Number of layers to extract from the image
+
+    Outputs:
+        latents (`Tensor`):
+            Denoised latents. (unpacked to B, C, layers+1, H, W)
+    """

     model_name = "qwenimage-layered"

     @property
@@ -88,10 +139,21 @@ class QwenImageLayeredAfterDenoiseStep(ModularPipelineBlocks):
     @property
     def inputs(self) -> List[InputParam]:
         return [
-            InputParam("latents", required=True, type_hint=torch.Tensor),
-            InputParam("height", required=True, type_hint=int),
-            InputParam("width", required=True, type_hint=int),
-            InputParam("layers", required=True, type_hint=int),
+            InputParam(
+                name="latents",
+                required=True,
+                type_hint=torch.Tensor,
+                description="The denoised latents to decode, can be generated in the denoise step.",
+            ),
+            InputParam.template("height", required=True),
+            InputParam.template("width", required=True),
+            InputParam.template("layers"),
+        ]
+
+    @property
+    def intermediate_outputs(self) -> List[OutputParam]:
+        return [
+            OutputParam.template("latents", note="unpacked to B, C, layers+1, H, W"),
         ]

     @torch.no_grad()
@@ -112,7 +174,26 @@ class QwenImageLayeredAfterDenoiseStep(ModularPipelineBlocks):


 # decode step


+# auto_docstring
 class QwenImageDecoderStep(ModularPipelineBlocks):
+    """
+    Step that decodes the latents to images
+
+    Components:
+        vae (`AutoencoderKLQwenImage`)
+
+    Inputs:
+        latents (`Tensor`):
+            The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise
+            step.
+
+    Outputs:
+        images (`List`):
+            Generated images. (tensor output of the vae decoder.)
+    """

     model_name = "qwenimage"

     @property
@@ -134,19 +215,13 @@ class QwenImageDecoderStep(ModularPipelineBlocks):
name="latents",
|
name="latents",
|
||||||
required=True,
|
required=True,
|
||||||
type_hint=torch.Tensor,
|
type_hint=torch.Tensor,
|
||||||
description="The latents to decode, can be generated in the denoise step",
|
description="The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise step.",
|
||||||
),
|
),
|
||||||
]
|
]
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def intermediate_outputs(self) -> List[str]:
|
def intermediate_outputs(self) -> List[OutputParam]:
|
||||||
return [
|
return [OutputParam.template("images", note="tensor output of the vae decoder.")]
|
||||||
OutputParam(
|
|
||||||
"images",
|
|
||||||
type_hint=Union[List[PIL.Image.Image], List[torch.Tensor], List[np.array]],
|
|
||||||
description="The generated images, can be a PIL.Image.Image, torch.Tensor or a numpy array",
|
|
||||||
)
|
|
||||||
]
|
|
||||||

     @torch.no_grad()
     def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState:
@@ -176,7 +251,26 @@ class QwenImageDecoderStep(ModularPipelineBlocks):
         return components, state


+# auto_docstring
 class QwenImageLayeredDecoderStep(ModularPipelineBlocks):
+    """
+    Decode unpacked latents (B, C, layers+1, H, W) into layer images.
+
+    Components:
+        vae (`AutoencoderKLQwenImage`)
+        image_processor (`VaeImageProcessor`)
+
+    Inputs:
+        latents (`Tensor`):
+            The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise
+            step.
+        output_type (`str`, *optional*, defaults to pil):
+            Output format: 'pil', 'np', 'pt'.
+
+    Outputs:
+        images (`List`):
+            Generated images.
+    """

     model_name = "qwenimage-layered"

     @property
@@ -198,14 +292,19 @@ class QwenImageLayeredDecoderStep(ModularPipelineBlocks):
     @property
     def inputs(self) -> List[InputParam]:
         return [
-            InputParam("latents", required=True, type_hint=torch.Tensor),
-            InputParam("output_type", default="pil", type_hint=str),
+            InputParam(
+                name="latents",
+                required=True,
+                type_hint=torch.Tensor,
+                description="The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise step.",
+            ),
+            InputParam.template("output_type"),
         ]

     @property
     def intermediate_outputs(self) -> List[OutputParam]:
         return [
-            OutputParam(name="images", type_hint=List[List[PIL.Image.Image]]),
+            OutputParam.template("images"),
         ]

     @torch.no_grad()
@@ -251,7 +350,27 @@ class QwenImageLayeredDecoderStep(ModularPipelineBlocks):


 # postprocess the decoded images


+# auto_docstring
 class QwenImageProcessImagesOutputStep(ModularPipelineBlocks):
+    """
+    Postprocess the generated images.
+
+    Components:
+        image_processor (`VaeImageProcessor`)
+
+    Inputs:
+        images (`Tensor`):
+            the generated image tensor from decoders step
+        output_type (`str`, *optional*, defaults to pil):
+            Output format: 'pil', 'np', 'pt'.
+
+    Outputs:
+        images (`List`):
+            Generated images.
+    """

     model_name = "qwenimage"

     @property
@@ -272,15 +391,19 @@ class QwenImageProcessImagesOutputStep(ModularPipelineBlocks):
     @property
     def inputs(self) -> List[InputParam]:
         return [
-            InputParam("images", required=True, description="the generated image from decoders step"),
             InputParam(
-                name="output_type",
-                default="pil",
-                type_hint=str,
-                description="The type of the output images, can be 'pil', 'np', 'pt'",
+                name="images",
+                required=True,
+                type_hint=torch.Tensor,
+                description="the generated image tensor from decoders step",
             ),
+            InputParam.template("output_type"),
         ]
+
+    @property
+    def intermediate_outputs(self) -> List[OutputParam]:
+        return [OutputParam.template("images")]

     @staticmethod
     def check_inputs(output_type):
         if output_type not in ["pil", "np", "pt"]:
@@ -301,7 +424,28 @@ class QwenImageProcessImagesOutputStep(ModularPipelineBlocks):
         return components, state


+# auto_docstring
 class QwenImageInpaintProcessImagesOutputStep(ModularPipelineBlocks):
+    """
+    Postprocess the generated images and optionally apply the mask overlay to the original image.
+
+    Components:
+        image_mask_processor (`InpaintProcessor`)
+
+    Inputs:
+        images (`Tensor`):
+            the generated image tensor from decoders step
+        output_type (`str`, *optional*, defaults to pil):
+            Output format: 'pil', 'np', 'pt'.
+        mask_overlay_kwargs (`Dict`, *optional*):
+            The kwargs for the postprocess step to apply the mask overlay. Generated in
+            InpaintProcessImagesInputStep.
+
+    Outputs:
+        images (`List`):
+            Generated images.
+    """

     model_name = "qwenimage"

     @property
@@ -322,16 +466,24 @@ class QwenImageInpaintProcessImagesOutputStep(ModularPipelineBlocks):
     @property
     def inputs(self) -> List[InputParam]:
         return [
-            InputParam("images", required=True, description="the generated image from decoders step"),
             InputParam(
-                name="output_type",
-                default="pil",
-                type_hint=str,
-                description="The type of the output images, can be 'pil', 'np', 'pt'",
+                name="images",
+                required=True,
+                type_hint=torch.Tensor,
+                description="the generated image tensor from decoders step",
+            ),
+            InputParam.template("output_type"),
+            InputParam(
+                name="mask_overlay_kwargs",
+                type_hint=Dict[str, Any],
+                description="The kwargs for the postprocess step to apply the mask overlay. Generated in InpaintProcessImagesInputStep.",
             ),
-            InputParam("mask_overlay_kwargs"),
         ]
+
+    @property
+    def intermediate_outputs(self) -> List[OutputParam]:
+        return [OutputParam.template("images")]

     @staticmethod
     def check_inputs(output_type, mask_overlay_kwargs):
         if output_type not in ["pil", "np", "pt"]:
@@ -50,7 +50,7 @@ class QwenImageLoopBeforeDenoiser(ModularPipelineBlocks):
     def inputs(self) -> List[InputParam]:
         return [
             InputParam(
-                "latents",
+                name="latents",
                 required=True,
                 type_hint=torch.Tensor,
                 description="The initial latents to use for the denoising process. Can be generated in prepare_latent step.",
@@ -80,17 +80,12 @@ class QwenImageEditLoopBeforeDenoiser(ModularPipelineBlocks):
     def inputs(self) -> List[InputParam]:
         return [
             InputParam(
-                "latents",
+                name="latents",
                 required=True,
                 type_hint=torch.Tensor,
                 description="The initial latents to use for the denoising process. Can be generated in prepare_latent step.",
             ),
-            InputParam(
-                "image_latents",
-                required=True,
-                type_hint=torch.Tensor,
-                description="The initial image latents to use for the denoising process. Can be encoded in vae_encoder step and packed in prepare_image_latents step.",
-            ),
+            InputParam.template("image_latents"),
         ]

     @torch.no_grad()
@@ -134,29 +129,12 @@ class QwenImageLoopBeforeDenoiserControlNet(ModularPipelineBlocks):
                 type_hint=torch.Tensor,
                 description="The control image to use for the denoising process. Can be generated in prepare_controlnet_inputs step.",
             ),
+            InputParam.template("controlnet_conditioning_scale", note="updated in prepare_controlnet_inputs step."),
             InputParam(
-                "controlnet_conditioning_scale",
-                type_hint=float,
-                description="The controlnet conditioning scale value to use for the denoising process. Can be generated in prepare_controlnet_inputs step.",
-            ),
-            InputParam(
-                "controlnet_keep",
+                name="controlnet_keep",
                 required=True,
                 type_hint=List[float],
-                description="The controlnet keep values to use for the denoising process. Can be generated in prepare_controlnet_inputs step.",
-            ),
-            InputParam(
-                "num_inference_steps",
-                required=True,
-                type_hint=int,
-                description="The number of inference steps to use for the denoising process. Can be generated in set_timesteps step.",
-            ),
-            InputParam(
-                kwargs_type="denoiser_input_fields",
-                description=(
-                    "All conditional model inputs for the denoiser. "
-                    "It should contain prompt_embeds/negative_prompt_embeds."
-                ),
+                description="The controlnet keep values. Can be generated in prepare_controlnet_inputs step.",
             ),
         ]

@@ -217,28 +195,13 @@ class QwenImageLoopDenoiser(ModularPipelineBlocks):
     @property
     def inputs(self) -> List[InputParam]:
         return [
-            InputParam("attention_kwargs"),
-            InputParam(
-                "latents",
-                required=True,
-                type_hint=torch.Tensor,
-                description="The latents to use for the denoising process. Can be generated in prepare_latents step.",
-            ),
-            InputParam(
-                "num_inference_steps",
-                required=True,
-                type_hint=int,
-                description="The number of inference steps to use for the denoising process. Can be generated in set_timesteps step.",
-            ),
-            InputParam(
-                kwargs_type="denoiser_input_fields",
-                description="conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.",
-            ),
+            InputParam.template("attention_kwargs"),
+            InputParam.template("denoiser_input_fields"),
             InputParam(
                 "img_shapes",
                 required=True,
                 type_hint=List[Tuple[int, int]],
-                description="The shape of the image latents for RoPE calculation. Can be generated in prepare_additional_inputs step.",
+                description="The shape of the image latents for RoPE calculation. can be generated in prepare_additional_inputs step.",
             ),
         ]

@@ -317,23 +280,8 @@ class QwenImageEditLoopDenoiser(ModularPipelineBlocks):
     @property
     def inputs(self) -> List[InputParam]:
         return [
-            InputParam("attention_kwargs"),
-            InputParam(
-                "latents",
-                required=True,
-                type_hint=torch.Tensor,
-                description="The latents to use for the denoising process. Can be generated in prepare_latents step.",
-            ),
-            InputParam(
-                "num_inference_steps",
-                required=True,
-                type_hint=int,
-                description="The number of inference steps to use for the denoising process. Can be generated in set_timesteps step.",
-            ),
-            InputParam(
-                kwargs_type="denoiser_input_fields",
-                description="conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.",
-            ),
+            InputParam.template("attention_kwargs"),
+            InputParam.template("denoiser_input_fields"),
             InputParam(
                 "img_shapes",
                 required=True,
@@ -415,7 +363,7 @@ class QwenImageLoopAfterDenoiser(ModularPipelineBlocks):
     @property
     def intermediate_outputs(self) -> List[OutputParam]:
         return [
-            OutputParam("latents", type_hint=torch.Tensor, description="The denoised latents."),
+            OutputParam.template("latents"),
         ]

     @torch.no_grad()
@@ -456,24 +404,19 @@ class QwenImageLoopAfterDenoiserInpaint(ModularPipelineBlocks):
                 type_hint=torch.Tensor,
                 description="The mask to use for the inpainting process. Can be generated in inpaint prepare latents step.",
             ),
-            InputParam(
-                "image_latents",
-                required=True,
-                type_hint=torch.Tensor,
-                description="The image latents to use for the inpainting process. Can be generated in inpaint prepare latents step.",
-            ),
+            InputParam.template("image_latents"),
             InputParam(
                 "initial_noise",
                 required=True,
                 type_hint=torch.Tensor,
                 description="The initial noise to use for the inpainting process. Can be generated in inpaint prepare latents step.",
             ),
-            InputParam(
-                "timesteps",
-                required=True,
-                type_hint=torch.Tensor,
-                description="The timesteps to use for the denoising process. Can be generated in set_timesteps step.",
-            ),
+        ]
+
+    @property
+    def intermediate_outputs(self) -> List[OutputParam]:
+        return [
+            OutputParam.template("latents"),
         ]

     @torch.no_grad()
@@ -515,17 +458,12 @@ class QwenImageDenoiseLoopWrapper(LoopSequentialPipelineBlocks):
     def loop_inputs(self) -> List[InputParam]:
         return [
             InputParam(
-                "timesteps",
+                name="timesteps",
                 required=True,
                 type_hint=torch.Tensor,
                 description="The timesteps to use for the denoising process. Can be generated in set_timesteps step.",
             ),
-            InputParam(
-                "num_inference_steps",
-                required=True,
-                type_hint=int,
-                description="The number of inference steps to use for the denoising process. Can be generated in set_timesteps step.",
-            ),
+            InputParam.template("num_inference_steps", required=True),
         ]

     @torch.no_grad()
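`loop_inputs` describes the contract of the shared loop: the wrapper iterates the timesteps and calls each sub-block in order at every step. Illustrative structure only, not the exact LoopSequentialPipelineBlocks code:

def run_denoise_loop(timesteps, num_inference_steps, sub_blocks, components, state):
    for i, t in enumerate(timesteps[:num_inference_steps]):
        for block in sub_blocks:  # before-denoiser -> denoiser -> after-denoiser, in declared order
            components, state = block(components, state, i=i, t=t)
    return components, state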
@@ -557,7 +495,42 @@ class QwenImageDenoiseLoopWrapper(LoopSequentialPipelineBlocks):


 # Qwen Image (text2image, image2image)


+# auto_docstring
 class QwenImageDenoiseStep(QwenImageDenoiseLoopWrapper):
+    """
+    Denoise step that iteratively denoises the latents.
+    Its loop logic is defined in the `QwenImageDenoiseLoopWrapper.__call__` method. At each iteration, it runs the
+    blocks defined in `sub_blocks` sequentially:
+        - `QwenImageLoopBeforeDenoiser`
+        - `QwenImageLoopDenoiser`
+        - `QwenImageLoopAfterDenoiser`
+    This block supports text2image and image2image tasks for QwenImage.
+
+    Components:
+        guider (`ClassifierFreeGuidance`)
+        transformer (`QwenImageTransformer2DModel`)
+        scheduler (`FlowMatchEulerDiscreteScheduler`)
+
+    Inputs:
+        timesteps (`Tensor`):
+            The timesteps to use for the denoising process. Can be generated in set_timesteps step.
+        num_inference_steps (`int`):
+            The number of denoising steps.
+        latents (`Tensor`):
+            The initial latents to use for the denoising process. Can be generated in prepare_latent step.
+        attention_kwargs (`Dict`, *optional*):
+            Additional kwargs for attention processors.
+        **denoiser_input_fields (`None`, *optional*):
+            conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
+        img_shapes (`List`):
+            The shape of the image latents for RoPE calculation. Can be generated in prepare_additional_inputs step.
+
+    Outputs:
+        latents (`Tensor`):
+            Denoised latents.
+    """

     model_name = "qwenimage"

     block_classes = [
@@ -570,8 +543,8 @@ class QwenImageDenoiseStep(QwenImageDenoiseLoopWrapper):
     @property
     def description(self) -> str:
         return (
-            "Denoise step that iteratively denoise the latents. \n"
-            "Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method \n"
+            "Denoise step that iteratively denoise the latents.\n"
+            "Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method\n"
             "At each iteration, it runs blocks defined in `sub_blocks` sequencially:\n"
             " - `QwenImageLoopBeforeDenoiser`\n"
             " - `QwenImageLoopDenoiser`\n"
@@ -581,7 +554,47 @@ class QwenImageDenoiseStep(QwenImageDenoiseLoopWrapper):


 # Qwen Image (inpainting)
+# auto_docstring
 class QwenImageInpaintDenoiseStep(QwenImageDenoiseLoopWrapper):
|
"""
|
||||||
|
Denoise step that iteratively denoise the latents.
|
||||||
|
Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method At each iteration, it runs blocks
|
||||||
|
defined in `sub_blocks` sequencially:
|
||||||
|
- `QwenImageLoopBeforeDenoiser`
|
||||||
|
- `QwenImageLoopDenoiser`
|
||||||
|
- `QwenImageLoopAfterDenoiser`
|
||||||
|
- `QwenImageLoopAfterDenoiserInpaint`
|
||||||
|
This block supports inpainting tasks for QwenImage.
|
||||||
|
|
||||||
|
Components:
|
||||||
|
guider (`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`) scheduler
|
||||||
|
(`FlowMatchEulerDiscreteScheduler`)
|
||||||
|
|
||||||
|
Inputs:
|
||||||
|
timesteps (`Tensor`):
|
||||||
|
The timesteps to use for the denoising process. Can be generated in set_timesteps step.
|
||||||
|
num_inference_steps (`int`):
|
||||||
|
The number of denoising steps.
|
||||||
|
latents (`Tensor`):
|
||||||
|
The initial latents to use for the denoising process. Can be generated in prepare_latent step.
|
||||||
|
attention_kwargs (`Dict`, *optional*):
|
||||||
|
Additional kwargs for attention processors.
|
||||||
|
**denoiser_input_fields (`None`, *optional*):
|
||||||
|
conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
|
||||||
|
img_shapes (`List`):
|
||||||
|
The shape of the image latents for RoPE calculation. can be generated in prepare_additional_inputs step.
|
||||||
|
mask (`Tensor`):
|
||||||
|
The mask to use for the inpainting process. Can be generated in inpaint prepare latents step.
|
||||||
|
image_latents (`Tensor`):
|
||||||
|
image latents used to guide the image generation. Can be generated from vae_encoder step.
|
||||||
|
initial_noise (`Tensor`):
|
||||||
|
The initial noise to use for the inpainting process. Can be generated in inpaint prepare latents step.
|
||||||
|
|
||||||
|
Outputs:
|
||||||
|
latents (`Tensor`):
|
||||||
|
Denoised latents.
|
||||||
|
"""
|
||||||
|
|
||||||
model_name = "qwenimage"
|
model_name = "qwenimage"
|
||||||
block_classes = [
|
block_classes = [
|
||||||
QwenImageLoopBeforeDenoiser,
|
QwenImageLoopBeforeDenoiser,
|
||||||
@@ -606,7 +619,47 @@ class QwenImageInpaintDenoiseStep(QwenImageDenoiseLoopWrapper):


# Qwen Image (text2image, image2image) with controlnet
+ # auto_docstring
class QwenImageControlNetDenoiseStep(QwenImageDenoiseLoopWrapper):
+ """
+ Denoise step that iteratively denoise the latents.
+ Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method At each iteration, it runs blocks
+ defined in `sub_blocks` sequencially:
+ - `QwenImageLoopBeforeDenoiser`
+ - `QwenImageLoopBeforeDenoiserControlNet`
+ - `QwenImageLoopDenoiser`
+ - `QwenImageLoopAfterDenoiser`
+ This block supports text2img/img2img tasks with controlnet for QwenImage.
+
+ Components:
+ guider (`ClassifierFreeGuidance`) controlnet (`QwenImageControlNetModel`) transformer
+ (`QwenImageTransformer2DModel`) scheduler (`FlowMatchEulerDiscreteScheduler`)
+
+ Inputs:
+ timesteps (`Tensor`):
+ The timesteps to use for the denoising process. Can be generated in set_timesteps step.
+ num_inference_steps (`int`):
+ The number of denoising steps.
+ latents (`Tensor`):
+ The initial latents to use for the denoising process. Can be generated in prepare_latent step.
+ control_image_latents (`Tensor`):
+ The control image to use for the denoising process. Can be generated in prepare_controlnet_inputs step.
+ controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0):
+ Scale for ControlNet conditioning. (updated in prepare_controlnet_inputs step.)
+ controlnet_keep (`List`):
+ The controlnet keep values. Can be generated in prepare_controlnet_inputs step.
+ attention_kwargs (`Dict`, *optional*):
+ Additional kwargs for attention processors.
+ **denoiser_input_fields (`None`, *optional*):
+ conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
+ img_shapes (`List`):
+ The shape of the image latents for RoPE calculation. can be generated in prepare_additional_inputs step.
+
+ Outputs:
+ latents (`Tensor`):
+ Denoised latents.
+ """

model_name = "qwenimage"
block_classes = [
QwenImageLoopBeforeDenoiser,
@@ -631,7 +684,54 @@ class QwenImageControlNetDenoiseStep(QwenImageDenoiseLoopWrapper):


# Qwen Image (inpainting) with controlnet
+ # auto_docstring
class QwenImageInpaintControlNetDenoiseStep(QwenImageDenoiseLoopWrapper):
+ """
+ Denoise step that iteratively denoise the latents.
+ Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method At each iteration, it runs blocks
+ defined in `sub_blocks` sequencially:
+ - `QwenImageLoopBeforeDenoiser`
+ - `QwenImageLoopBeforeDenoiserControlNet`
+ - `QwenImageLoopDenoiser`
+ - `QwenImageLoopAfterDenoiser`
+ - `QwenImageLoopAfterDenoiserInpaint`
+ This block supports inpainting tasks with controlnet for QwenImage.
+
+ Components:
+ guider (`ClassifierFreeGuidance`) controlnet (`QwenImageControlNetModel`) transformer
+ (`QwenImageTransformer2DModel`) scheduler (`FlowMatchEulerDiscreteScheduler`)
+
+ Inputs:
+ timesteps (`Tensor`):
+ The timesteps to use for the denoising process. Can be generated in set_timesteps step.
+ num_inference_steps (`int`):
+ The number of denoising steps.
+ latents (`Tensor`):
+ The initial latents to use for the denoising process. Can be generated in prepare_latent step.
+ control_image_latents (`Tensor`):
+ The control image to use for the denoising process. Can be generated in prepare_controlnet_inputs step.
+ controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0):
+ Scale for ControlNet conditioning. (updated in prepare_controlnet_inputs step.)
+ controlnet_keep (`List`):
+ The controlnet keep values. Can be generated in prepare_controlnet_inputs step.
+ attention_kwargs (`Dict`, *optional*):
+ Additional kwargs for attention processors.
+ **denoiser_input_fields (`None`, *optional*):
+ conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
+ img_shapes (`List`):
+ The shape of the image latents for RoPE calculation. can be generated in prepare_additional_inputs step.
+ mask (`Tensor`):
+ The mask to use for the inpainting process. Can be generated in inpaint prepare latents step.
+ image_latents (`Tensor`):
+ image latents used to guide the image generation. Can be generated from vae_encoder step.
+ initial_noise (`Tensor`):
+ The initial noise to use for the inpainting process. Can be generated in inpaint prepare latents step.
+
+ Outputs:
+ latents (`Tensor`):
+ Denoised latents.
+ """

model_name = "qwenimage"
block_classes = [
QwenImageLoopBeforeDenoiser,
@@ -664,7 +764,42 @@ class QwenImageInpaintControlNetDenoiseStep(QwenImageDenoiseLoopWrapper):


# Qwen Image Edit (image2image)
+ # auto_docstring
class QwenImageEditDenoiseStep(QwenImageDenoiseLoopWrapper):
+ """
+ Denoise step that iteratively denoise the latents.
+ Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method At each iteration, it runs blocks
+ defined in `sub_blocks` sequencially:
+ - `QwenImageEditLoopBeforeDenoiser`
+ - `QwenImageEditLoopDenoiser`
+ - `QwenImageLoopAfterDenoiser`
+ This block supports QwenImage Edit.
+
+ Components:
+ guider (`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`) scheduler
+ (`FlowMatchEulerDiscreteScheduler`)
+
+ Inputs:
+ timesteps (`Tensor`):
+ The timesteps to use for the denoising process. Can be generated in set_timesteps step.
+ num_inference_steps (`int`):
+ The number of denoising steps.
+ latents (`Tensor`):
+ The initial latents to use for the denoising process. Can be generated in prepare_latent step.
+ image_latents (`Tensor`):
+ image latents used to guide the image generation. Can be generated from vae_encoder step.
+ attention_kwargs (`Dict`, *optional*):
+ Additional kwargs for attention processors.
+ **denoiser_input_fields (`None`, *optional*):
+ conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
+ img_shapes (`List`):
+ The shape of the image latents for RoPE calculation. Can be generated in prepare_additional_inputs step.
+
+ Outputs:
+ latents (`Tensor`):
+ Denoised latents.
+ """

model_name = "qwenimage-edit"
block_classes = [
QwenImageEditLoopBeforeDenoiser,
@@ -687,7 +822,47 @@ class QwenImageEditDenoiseStep(QwenImageDenoiseLoopWrapper):


# Qwen Image Edit (inpainting)
+ # auto_docstring
class QwenImageEditInpaintDenoiseStep(QwenImageDenoiseLoopWrapper):
+ """
+ Denoise step that iteratively denoise the latents.
+ Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method At each iteration, it runs blocks
+ defined in `sub_blocks` sequencially:
+ - `QwenImageEditLoopBeforeDenoiser`
+ - `QwenImageEditLoopDenoiser`
+ - `QwenImageLoopAfterDenoiser`
+ - `QwenImageLoopAfterDenoiserInpaint`
+ This block supports inpainting tasks for QwenImage Edit.
+
+ Components:
+ guider (`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`) scheduler
+ (`FlowMatchEulerDiscreteScheduler`)
+
+ Inputs:
+ timesteps (`Tensor`):
+ The timesteps to use for the denoising process. Can be generated in set_timesteps step.
+ num_inference_steps (`int`):
+ The number of denoising steps.
+ latents (`Tensor`):
+ The initial latents to use for the denoising process. Can be generated in prepare_latent step.
+ image_latents (`Tensor`):
+ image latents used to guide the image generation. Can be generated from vae_encoder step.
+ attention_kwargs (`Dict`, *optional*):
+ Additional kwargs for attention processors.
+ **denoiser_input_fields (`None`, *optional*):
+ conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
+ img_shapes (`List`):
+ The shape of the image latents for RoPE calculation. Can be generated in prepare_additional_inputs step.
+ mask (`Tensor`):
+ The mask to use for the inpainting process. Can be generated in inpaint prepare latents step.
+ initial_noise (`Tensor`):
+ The initial noise to use for the inpainting process. Can be generated in inpaint prepare latents step.
+
+ Outputs:
+ latents (`Tensor`):
+ Denoised latents.
+ """

model_name = "qwenimage-edit"
block_classes = [
QwenImageEditLoopBeforeDenoiser,
@@ -712,7 +887,42 @@ class QwenImageEditInpaintDenoiseStep(QwenImageDenoiseLoopWrapper):


# Qwen Image Layered (image2image)
+ # auto_docstring
class QwenImageLayeredDenoiseStep(QwenImageDenoiseLoopWrapper):
+ """
+ Denoise step that iteratively denoise the latents.
+ Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method At each iteration, it runs blocks
+ defined in `sub_blocks` sequencially:
+ - `QwenImageEditLoopBeforeDenoiser`
+ - `QwenImageEditLoopDenoiser`
+ - `QwenImageLoopAfterDenoiser`
+ This block supports QwenImage Layered.
+
+ Components:
+ guider (`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`) scheduler
+ (`FlowMatchEulerDiscreteScheduler`)
+
+ Inputs:
+ timesteps (`Tensor`):
+ The timesteps to use for the denoising process. Can be generated in set_timesteps step.
+ num_inference_steps (`int`):
+ The number of denoising steps.
+ latents (`Tensor`):
+ The initial latents to use for the denoising process. Can be generated in prepare_latent step.
+ image_latents (`Tensor`):
+ image latents used to guide the image generation. Can be generated from vae_encoder step.
+ attention_kwargs (`Dict`, *optional*):
+ Additional kwargs for attention processors.
+ **denoiser_input_fields (`None`, *optional*):
+ conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
+ img_shapes (`List`):
+ The shape of the image latents for RoPE calculation. Can be generated in prepare_additional_inputs step.
+
+ Outputs:
+ latents (`Tensor`):
+ Denoised latents.
+ """

model_name = "qwenimage-layered"
block_classes = [
QwenImageEditLoopBeforeDenoiser,
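Every block in this file now carries a `# auto_docstring` marker, and its class docstring reprints the block's declared components, inputs, and outputs in a fixed layout. A minimal sketch of how a generator could assemble the `Inputs:` section from declared params, reusing the illustrative `Param` dataclass from the sketch above (the repository's actual generator may format differently):

from typing import List

def format_inputs_section(input_params: List[Param]) -> str:
    # Render each declared param the way the generated docstrings above read:
    # name, backticked type, optionality/default qualifiers, then description.
    lines = ["Inputs:"]
    for p in input_params:
        type_name = getattr(p.type_hint, "__name__", str(p.type_hint))
        qualifier = "" if p.required else ", *optional*"
        if not p.required and p.default is not None:
            qualifier += f", defaults to {p.default}"
        lines.append(f"    {p.name} (`{type_name}`{qualifier}):")
        lines.append(f"        {p.description}")
    return "\n".join(lines)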
File diff suppressed because it is too large
@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

- from typing import List, Tuple
+ from typing import List, Optional, Tuple

import torch

@@ -109,7 +109,44 @@ def calculate_dimension_from_latents(latents: torch.Tensor, vae_scale_factor: in
return height, width


+ # auto_docstring
class QwenImageTextInputsStep(ModularPipelineBlocks):
+ """
+ Text input processing step that standardizes text embeddings for the pipeline.
+ This step:
+ 1. Determines `batch_size` and `dtype` based on `prompt_embeds`
+ 2. Ensures all text embeddings have consistent batch sizes (batch_size * num_images_per_prompt)
+
+ This block should be placed after all encoder steps to process the text embeddings before they are used in
+ subsequent pipeline steps.
+
+ Inputs:
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
+ The number of images to generate per prompt.
+ prompt_embeds (`Tensor`):
+ text embeddings used to guide the image generation. Can be generated from text_encoder step.
+ prompt_embeds_mask (`Tensor`):
+ mask for the text embeddings. Can be generated from text_encoder step.
+ negative_prompt_embeds (`Tensor`, *optional*):
+ negative text embeddings used to guide the image generation. Can be generated from text_encoder step.
+ negative_prompt_embeds_mask (`Tensor`, *optional*):
+ mask for the negative text embeddings. Can be generated from text_encoder step.
+
+ Outputs:
+ batch_size (`int`):
+ The batch size of the prompt embeddings
+ dtype (`dtype`):
+ The data type of the prompt embeddings
+ prompt_embeds (`Tensor`):
+ The prompt embeddings. (batch-expanded)
+ prompt_embeds_mask (`Tensor`):
+ The encoder attention mask. (batch-expanded)
+ negative_prompt_embeds (`Tensor`):
+ The negative prompt embeddings. (batch-expanded)
+ negative_prompt_embeds_mask (`Tensor`):
+ The negative prompt embeddings mask. (batch-expanded)
+ """

model_name = "qwenimage"

@property
@@ -129,26 +166,22 @@ class QwenImageTextInputsStep(ModularPipelineBlocks):
@property
def inputs(self) -> List[InputParam]:
return [
- InputParam(name="num_images_per_prompt", default=1),
- InputParam(name="prompt_embeds", required=True, kwargs_type="denoiser_input_fields"),
- InputParam(name="prompt_embeds_mask", required=True, kwargs_type="denoiser_input_fields"),
- InputParam(name="negative_prompt_embeds", kwargs_type="denoiser_input_fields"),
- InputParam(name="negative_prompt_embeds_mask", kwargs_type="denoiser_input_fields"),
+ InputParam.template("num_images_per_prompt"),
+ InputParam.template("prompt_embeds"),
+ InputParam.template("prompt_embeds_mask"),
+ InputParam.template("negative_prompt_embeds"),
+ InputParam.template("negative_prompt_embeds_mask"),
]

@property
- def intermediate_outputs(self) -> List[str]:
+ def intermediate_outputs(self) -> List[OutputParam]:
return [
- OutputParam(
- "batch_size",
- type_hint=int,
- description="Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt",
- ),
- OutputParam(
- "dtype",
- type_hint=torch.dtype,
- description="Data type of model tensor inputs (determined by `prompt_embeds`)",
- ),
+ OutputParam(name="batch_size", type_hint=int, description="The batch size of the prompt embeddings"),
+ OutputParam(name="dtype", type_hint=torch.dtype, description="The data type of the prompt embeddings"),
+ OutputParam.template("prompt_embeds", note="batch-expanded"),
+ OutputParam.template("prompt_embeds_mask", note="batch-expanded"),
+ OutputParam.template("negative_prompt_embeds", note="batch-expanded"),
+ OutputParam.template("negative_prompt_embeds_mask", note="batch-expanded"),
]

@staticmethod
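The `note="batch-expanded"` arguments above line up with the "(batch-expanded)" suffixes in the generated docstring, so the kwarg plausibly appends a qualifier to the canonical description. A hedged sketch of that behaviour (the `_OUTPUT_TEMPLATES` registry and `OutParam` type are assumptions, not the real API):

import copy
from dataclasses import dataclass

@dataclass
class OutParam:
    name: str
    description: str = ""

# Assumed registry of canonical output definitions.
_OUTPUT_TEMPLATES = {
    "prompt_embeds": OutParam("prompt_embeds", "The prompt embeddings."),
}

def output_template(name: str, note: str = "") -> OutParam:
    # Copy the canonical definition, then tack the note onto the description,
    # so a generated doc reads "The prompt embeddings. (batch-expanded)".
    param = copy.deepcopy(_OUTPUT_TEMPLATES[name])
    if note:
        param.description = f"{param.description} ({note})"
    return param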
@@ -221,20 +254,76 @@ class QwenImageTextInputsStep(ModularPipelineBlocks):
return components, state


+ # auto_docstring
class QwenImageAdditionalInputsStep(ModularPipelineBlocks):
- """Input step for QwenImage: update height/width, expand batch, patchify."""
+ """
+ Input processing step that:
+ 1. For image latent inputs: Updates height/width if None, patchifies, and expands batch size
+ 2. For additional batch inputs: Expands batch dimensions to match final batch size
+
+ Configured inputs:
+ - Image latent inputs: ['image_latents']
+
+ This block should be placed after the encoder steps and the text input step.
+
+ Components:
+ pachifier (`QwenImagePachifier`)
+
+ Inputs:
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
+ The number of images to generate per prompt.
+ batch_size (`int`, *optional*, defaults to 1):
+ Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can
+ be generated in input step.
+ height (`int`, *optional*):
+ The height in pixels of the generated image.
+ width (`int`, *optional*):
+ The width in pixels of the generated image.
+ image_latents (`Tensor`):
+ image latents used to guide the image generation. Can be generated from vae_encoder step.
+
+ Outputs:
+ image_height (`int`):
+ The image height calculated from the image latents dimension
+ image_width (`int`):
+ The image width calculated from the image latents dimension
+ height (`int`):
+ if not provided, updated to image height
+ width (`int`):
+ if not provided, updated to image width
+ image_latents (`Tensor`):
+ image latents used to guide the image generation. Can be generated from vae_encoder step. (patchified and
+ batch-expanded)
+ """

model_name = "qwenimage"

def __init__(
self,
- image_latent_inputs: List[str] = ["image_latents"],
- additional_batch_inputs: List[str] = [],
+ image_latent_inputs: Optional[List[InputParam]] = None,
+ additional_batch_inputs: Optional[List[InputParam]] = None,
):
+ # by default, process `image_latents`
+ if image_latent_inputs is None:
+ image_latent_inputs = [InputParam.template("image_latents")]
+ if additional_batch_inputs is None:
+ additional_batch_inputs = []
+
if not isinstance(image_latent_inputs, list):
- image_latent_inputs = [image_latent_inputs]
+ raise ValueError(f"image_latent_inputs must be a list, but got {type(image_latent_inputs)}")
+ else:
+ for input_param in image_latent_inputs:
+ if not isinstance(input_param, InputParam):
+ raise ValueError(f"image_latent_inputs must be a list of InputParam, but got {type(input_param)}")

if not isinstance(additional_batch_inputs, list):
- additional_batch_inputs = [additional_batch_inputs]
+ raise ValueError(f"additional_batch_inputs must be a list, but got {type(additional_batch_inputs)}")
+ else:
+ for input_param in additional_batch_inputs:
+ if not isinstance(input_param, InputParam):
+ raise ValueError(
+ f"additional_batch_inputs must be a list of InputParam, but got {type(input_param)}"
+ )

self._image_latent_inputs = image_latent_inputs
self._additional_batch_inputs = additional_batch_inputs
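Under the new `__init__` signature, callers configure the block with `InputParam` objects instead of bare strings, and anything else is rejected rather than silently wrapped. Roughly (a usage sketch; the import paths are indicative guesses based on the modules this diff touches):

from diffusers.modular_pipelines.modular_pipeline_utils import InputParam
from diffusers.modular_pipelines.qwenimage.inputs import QwenImageAdditionalInputsStep

# Default behaviour: process `image_latents`.
step = QwenImageAdditionalInputsStep()

# Explicit configuration now takes InputParam objects:
step = QwenImageAdditionalInputsStep(
    image_latent_inputs=[InputParam.template("image_latents")],
)

# The old string form is rejected by the element check:
# QwenImageAdditionalInputsStep(image_latent_inputs=["image_latents"])
#   -> ValueError: image_latent_inputs must be a list of InputParam, ...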
@@ -252,9 +341,9 @@ class QwenImageAdditionalInputsStep(ModularPipelineBlocks):
if self._image_latent_inputs or self._additional_batch_inputs:
inputs_info = "\n\nConfigured inputs:"
if self._image_latent_inputs:
- inputs_info += f"\n - Image latent inputs: {self._image_latent_inputs}"
+ inputs_info += f"\n - Image latent inputs: {[p.name for p in self._image_latent_inputs]}"
if self._additional_batch_inputs:
- inputs_info += f"\n - Additional batch inputs: {self._additional_batch_inputs}"
+ inputs_info += f"\n - Additional batch inputs: {[p.name for p in self._additional_batch_inputs]}"

placement_section = "\n\nThis block should be placed after the encoder steps and the text input step."

@@ -269,23 +358,19 @@ class QwenImageAdditionalInputsStep(ModularPipelineBlocks):
@property
def inputs(self) -> List[InputParam]:
inputs = [
- InputParam(name="num_images_per_prompt", default=1),
- InputParam(name="batch_size", required=True),
- InputParam(name="height"),
- InputParam(name="width"),
+ InputParam.template("num_images_per_prompt"),
+ InputParam.template("batch_size"),
+ InputParam.template("height"),
+ InputParam.template("width"),
]
- for image_latent_input_name in self._image_latent_inputs:
- inputs.append(InputParam(name=image_latent_input_name))
-
- for input_name in self._additional_batch_inputs:
- inputs.append(InputParam(name=input_name))
+ # default is `image_latents`
+ inputs += self._image_latent_inputs + self._additional_batch_inputs

return inputs

@property
def intermediate_outputs(self) -> List[OutputParam]:
- return [
+ outputs = [
OutputParam(
name="image_height",
type_hint=int,
@@ -298,11 +383,43 @@ class QwenImageAdditionalInputsStep(ModularPipelineBlocks):
),
]

+ # `height`/`width` are not new outputs, but they will be updated if any image latent inputs are provided
+ if len(self._image_latent_inputs) > 0:
+ outputs.append(
+ OutputParam(name="height", type_hint=int, description="if not provided, updated to image height")
+ )
+ outputs.append(
+ OutputParam(name="width", type_hint=int, description="if not provided, updated to image width")
+ )
+
+ # image latent inputs are modified in place (patchified and batch-expanded)
+ for input_param in self._image_latent_inputs:
+ outputs.append(
+ OutputParam(
+ name=input_param.name,
+ type_hint=input_param.type_hint,
+ description=input_param.description + " (patchified and batch-expanded)",
+ )
+ )
+
+ # additional batch inputs (batch-expanded only)
+ for input_param in self._additional_batch_inputs:
+ outputs.append(
+ OutputParam(
+ name=input_param.name,
+ type_hint=input_param.type_hint,
+ description=input_param.description + " (batch-expanded)",
+ )
+ )
+
+ return outputs

def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState:
block_state = self.get_block_state(state)

# Process image latent inputs
- for image_latent_input_name in self._image_latent_inputs:
+ for input_param in self._image_latent_inputs:
+ image_latent_input_name = input_param.name
image_latent_tensor = getattr(block_state, image_latent_input_name)
if image_latent_tensor is None:
continue
@@ -331,7 +448,8 @@ class QwenImageAdditionalInputsStep(ModularPipelineBlocks):
setattr(block_state, image_latent_input_name, image_latent_tensor)

# Process additional batch inputs (only batch expansion)
- for input_name in self._additional_batch_inputs:
+ for input_param in self._additional_batch_inputs:
+ input_name = input_param.name
input_tensor = getattr(block_state, input_name)
if input_tensor is None:
continue
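Throughout these blocks, "batch-expanded" means a tensor's leading dimension is grown to the final model batch, batch_size * num_images_per_prompt. The arithmetic in isolation (whether the real code uses `repeat_interleave` or `repeat` is an implementation detail):

import torch

batch_size, num_images_per_prompt = 2, 3
image_latents = torch.randn(batch_size, 16, 64, 64)

# Each of the 2 prompts is repeated 3 times -> final batch of 6.
expanded = image_latents.repeat_interleave(num_images_per_prompt, dim=0)
assert expanded.shape[0] == batch_size * num_images_per_prompt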
@@ -349,20 +467,76 @@ class QwenImageAdditionalInputsStep(ModularPipelineBlocks):
return components, state


+ # auto_docstring
class QwenImageEditPlusAdditionalInputsStep(ModularPipelineBlocks):
- """Input step for QwenImage Edit Plus: handles list of latents with different sizes."""
+ """
+ Input processing step for Edit Plus that:
+ 1. For image latent inputs (list): Collects heights/widths, patchifies each, concatenates, expands batch
+ 2. For additional batch inputs: Expands batch dimensions to match final batch size
+ Height/width defaults to last image in the list.
+
+ Configured inputs:
+ - Image latent inputs: ['image_latents']
+
+ This block should be placed after the encoder steps and the text input step.
+
+ Components:
+ pachifier (`QwenImagePachifier`)
+
+ Inputs:
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
+ The number of images to generate per prompt.
+ batch_size (`int`, *optional*, defaults to 1):
+ Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can
+ be generated in input step.
+ height (`int`, *optional*):
+ The height in pixels of the generated image.
+ width (`int`, *optional*):
+ The width in pixels of the generated image.
+ image_latents (`Tensor`):
+ image latents used to guide the image generation. Can be generated from vae_encoder step.
+
+ Outputs:
+ image_height (`List`):
+ The image heights calculated from the image latents dimension
+ image_width (`List`):
+ The image widths calculated from the image latents dimension
+ height (`int`):
+ if not provided, updated to image height
+ width (`int`):
+ if not provided, updated to image width
+ image_latents (`Tensor`):
+ image latents used to guide the image generation. Can be generated from vae_encoder step. (patchified,
+ concatenated, and batch-expanded)
+ """

model_name = "qwenimage-edit-plus"

def __init__(
self,
- image_latent_inputs: List[str] = ["image_latents"],
- additional_batch_inputs: List[str] = [],
+ image_latent_inputs: Optional[List[InputParam]] = None,
+ additional_batch_inputs: Optional[List[InputParam]] = None,
):
+ if image_latent_inputs is None:
+ image_latent_inputs = [InputParam.template("image_latents")]
+ if additional_batch_inputs is None:
+ additional_batch_inputs = []
+
if not isinstance(image_latent_inputs, list):
- image_latent_inputs = [image_latent_inputs]
+ raise ValueError(f"image_latent_inputs must be a list, but got {type(image_latent_inputs)}")
+ else:
+ for input_param in image_latent_inputs:
+ if not isinstance(input_param, InputParam):
+ raise ValueError(f"image_latent_inputs must be a list of InputParam, but got {type(input_param)}")

if not isinstance(additional_batch_inputs, list):
- additional_batch_inputs = [additional_batch_inputs]
+ raise ValueError(f"additional_batch_inputs must be a list, but got {type(additional_batch_inputs)}")
+ else:
+ for input_param in additional_batch_inputs:
+ if not isinstance(input_param, InputParam):
+ raise ValueError(
+ f"additional_batch_inputs must be a list of InputParam, but got {type(input_param)}"
+ )

self._image_latent_inputs = image_latent_inputs
self._additional_batch_inputs = additional_batch_inputs
@@ -381,9 +555,9 @@ class QwenImageEditPlusAdditionalInputsStep(ModularPipelineBlocks):
if self._image_latent_inputs or self._additional_batch_inputs:
inputs_info = "\n\nConfigured inputs:"
if self._image_latent_inputs:
- inputs_info += f"\n - Image latent inputs: {self._image_latent_inputs}"
+ inputs_info += f"\n - Image latent inputs: {[p.name for p in self._image_latent_inputs]}"
if self._additional_batch_inputs:
- inputs_info += f"\n - Additional batch inputs: {self._additional_batch_inputs}"
+ inputs_info += f"\n - Additional batch inputs: {[p.name for p in self._additional_batch_inputs]}"

placement_section = "\n\nThis block should be placed after the encoder steps and the text input step."

@@ -398,23 +572,20 @@ class QwenImageEditPlusAdditionalInputsStep(ModularPipelineBlocks):
@property
def inputs(self) -> List[InputParam]:
inputs = [
- InputParam(name="num_images_per_prompt", default=1),
- InputParam(name="batch_size", required=True),
- InputParam(name="height"),
- InputParam(name="width"),
+ InputParam.template("num_images_per_prompt"),
+ InputParam.template("batch_size"),
+ InputParam.template("height"),
+ InputParam.template("width"),
]

- for image_latent_input_name in self._image_latent_inputs:
- inputs.append(InputParam(name=image_latent_input_name))
-
- for input_name in self._additional_batch_inputs:
- inputs.append(InputParam(name=input_name))
+ # default is `image_latents`
+ inputs += self._image_latent_inputs + self._additional_batch_inputs

return inputs

@property
def intermediate_outputs(self) -> List[OutputParam]:
- return [
+ outputs = [
OutputParam(
name="image_height",
type_hint=List[int],
@@ -427,11 +598,43 @@ class QwenImageEditPlusAdditionalInputsStep(ModularPipelineBlocks):
),
]

+ # `height`/`width` are updated if any image latent inputs are provided
+ if len(self._image_latent_inputs) > 0:
+ outputs.append(
+ OutputParam(name="height", type_hint=int, description="if not provided, updated to image height")
+ )
+ outputs.append(
+ OutputParam(name="width", type_hint=int, description="if not provided, updated to image width")
+ )
+
+ # image latent inputs are modified in place (patchified, concatenated, and batch-expanded)
+ for input_param in self._image_latent_inputs:
+ outputs.append(
+ OutputParam(
+ name=input_param.name,
+ type_hint=input_param.type_hint,
+ description=input_param.description + " (patchified, concatenated, and batch-expanded)",
+ )
+ )
+
+ # additional batch inputs (batch-expanded only)
+ for input_param in self._additional_batch_inputs:
+ outputs.append(
+ OutputParam(
+ name=input_param.name,
+ type_hint=input_param.type_hint,
+ description=input_param.description + " (batch-expanded)",
+ )
+ )
+
+ return outputs

def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState:
block_state = self.get_block_state(state)

# Process image latent inputs
- for image_latent_input_name in self._image_latent_inputs:
+ for input_param in self._image_latent_inputs:
+ image_latent_input_name = input_param.name
image_latent_tensor = getattr(block_state, image_latent_input_name)
if image_latent_tensor is None:
continue
@@ -476,7 +679,8 @@ class QwenImageEditPlusAdditionalInputsStep(ModularPipelineBlocks):
setattr(block_state, image_latent_input_name, packed_image_latent_tensors)

# Process additional batch inputs (only batch expansion)
- for input_name in self._additional_batch_inputs:
+ for input_param in self._additional_batch_inputs:
+ input_name = input_param.name
input_tensor = getattr(block_state, input_name)
if input_tensor is None:
continue
@@ -494,22 +698,75 @@ class QwenImageEditPlusAdditionalInputsStep(ModularPipelineBlocks):
return components, state


- # YiYi TODO: support define config default component from the ModularPipeline level.
- # it is same as QwenImageAdditionalInputsStep, but with layered pachifier.
+ # same as QwenImageAdditionalInputsStep, but with layered pachifier.


+ # auto_docstring
class QwenImageLayeredAdditionalInputsStep(ModularPipelineBlocks):
- """Input step for QwenImage Layered: update height/width, expand batch, patchify with layered pachifier."""
+ """
+ Input processing step for Layered that:
+ 1. For image latent inputs: Updates height/width if None, patchifies with layered pachifier, and expands batch
+ size
+ 2. For additional batch inputs: Expands batch dimensions to match final batch size
+
+ Configured inputs:
+ - Image latent inputs: ['image_latents']
+
+ This block should be placed after the encoder steps and the text input step.
+
+ Components:
+ pachifier (`QwenImageLayeredPachifier`)
+
+ Inputs:
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
+ The number of images to generate per prompt.
+ batch_size (`int`, *optional*, defaults to 1):
+ Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can
+ be generated in input step.
+ image_latents (`Tensor`):
+ image latents used to guide the image generation. Can be generated from vae_encoder step.
+
+ Outputs:
+ image_height (`int`):
+ The image height calculated from the image latents dimension
+ image_width (`int`):
+ The image width calculated from the image latents dimension
+ height (`int`):
+ if not provided, updated to image height
+ width (`int`):
+ if not provided, updated to image width
+ image_latents (`Tensor`):
+ image latents used to guide the image generation. Can be generated from vae_encoder step. (patchified
+ with layered pachifier and batch-expanded)
+ """

model_name = "qwenimage-layered"

def __init__(
self,
- image_latent_inputs: List[str] = ["image_latents"],
- additional_batch_inputs: List[str] = [],
+ image_latent_inputs: Optional[List[InputParam]] = None,
+ additional_batch_inputs: Optional[List[InputParam]] = None,
):
+ if image_latent_inputs is None:
+ image_latent_inputs = [InputParam.template("image_latents")]
+ if additional_batch_inputs is None:
+ additional_batch_inputs = []
+
if not isinstance(image_latent_inputs, list):
- image_latent_inputs = [image_latent_inputs]
+ raise ValueError(f"image_latent_inputs must be a list, but got {type(image_latent_inputs)}")
+ else:
+ for input_param in image_latent_inputs:
+ if not isinstance(input_param, InputParam):
+ raise ValueError(f"image_latent_inputs must be a list of InputParam, but got {type(input_param)}")

if not isinstance(additional_batch_inputs, list):
- additional_batch_inputs = [additional_batch_inputs]
+ raise ValueError(f"additional_batch_inputs must be a list, but got {type(additional_batch_inputs)}")
+ else:
+ for input_param in additional_batch_inputs:
+ if not isinstance(input_param, InputParam):
+ raise ValueError(
+ f"additional_batch_inputs must be a list of InputParam, but got {type(input_param)}"
+ )

self._image_latent_inputs = image_latent_inputs
self._additional_batch_inputs = additional_batch_inputs
@@ -527,9 +784,9 @@ class QwenImageLayeredAdditionalInputsStep(ModularPipelineBlocks):
if self._image_latent_inputs or self._additional_batch_inputs:
inputs_info = "\n\nConfigured inputs:"
if self._image_latent_inputs:
- inputs_info += f"\n - Image latent inputs: {self._image_latent_inputs}"
+ inputs_info += f"\n - Image latent inputs: {[p.name for p in self._image_latent_inputs]}"
if self._additional_batch_inputs:
- inputs_info += f"\n - Additional batch inputs: {self._additional_batch_inputs}"
+ inputs_info += f"\n - Additional batch inputs: {[p.name for p in self._additional_batch_inputs]}"

placement_section = "\n\nThis block should be placed after the encoder steps and the text input step."

@@ -544,21 +801,18 @@ class QwenImageLayeredAdditionalInputsStep(ModularPipelineBlocks):
@property
def inputs(self) -> List[InputParam]:
inputs = [
- InputParam(name="num_images_per_prompt", default=1),
- InputParam(name="batch_size", required=True),
+ InputParam.template("num_images_per_prompt"),
+ InputParam.template("batch_size"),
]
+ # default is `image_latents`

- for image_latent_input_name in self._image_latent_inputs:
- inputs.append(InputParam(name=image_latent_input_name))
-
- for input_name in self._additional_batch_inputs:
- inputs.append(InputParam(name=input_name))
+ inputs += self._image_latent_inputs + self._additional_batch_inputs

return inputs

@property
def intermediate_outputs(self) -> List[OutputParam]:
- return [
+ outputs = [
OutputParam(
name="image_height",
type_hint=int,
@@ -569,15 +823,44 @@ class QwenImageLayeredAdditionalInputsStep(ModularPipelineBlocks):
type_hint=int,
description="The image width calculated from the image latents dimension",
),
- OutputParam(name="height", type_hint=int, description="The height of the image output"),
- OutputParam(name="width", type_hint=int, description="The width of the image output"),
]

+ if len(self._image_latent_inputs) > 0:
+ outputs.append(
+ OutputParam(name="height", type_hint=int, description="if not provided, updated to image height")
+ )
+ outputs.append(
+ OutputParam(name="width", type_hint=int, description="if not provided, updated to image width")
+ )
+
+ # Add outputs for image latent inputs (patchified with layered pachifier and batch-expanded)
+ for input_param in self._image_latent_inputs:
+ outputs.append(
+ OutputParam(
+ name=input_param.name,
+ type_hint=input_param.type_hint,
+ description=input_param.description + " (patchified with layered pachifier and batch-expanded)",
+ )
+ )
+
+ # Add outputs for additional batch inputs (batch-expanded only)
+ for input_param in self._additional_batch_inputs:
+ outputs.append(
+ OutputParam(
+ name=input_param.name,
+ type_hint=input_param.type_hint,
+ description=input_param.description + " (batch-expanded)",
+ )
+ )
+
+ return outputs

def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState:
block_state = self.get_block_state(state)

# Process image latent inputs
- for image_latent_input_name in self._image_latent_inputs:
+ for input_param in self._image_latent_inputs:
+ image_latent_input_name = input_param.name
image_latent_tensor = getattr(block_state, image_latent_input_name)
if image_latent_tensor is None:
continue
@@ -608,7 +891,8 @@ class QwenImageLayeredAdditionalInputsStep(ModularPipelineBlocks):
setattr(block_state, image_latent_input_name, image_latent_tensor)

# Process additional batch inputs (only batch expansion)
- for input_name in self._additional_batch_inputs:
+ for input_param in self._additional_batch_inputs:
+ input_name = input_param.name
input_tensor = getattr(block_state, input_name)
if input_tensor is None:
continue
@@ -626,7 +910,34 @@ class QwenImageLayeredAdditionalInputsStep(ModularPipelineBlocks):
return components, state


+ # auto_docstring
class QwenImageControlNetInputsStep(ModularPipelineBlocks):
+ """
+ prepare the `control_image_latents` for controlnet. Insert after all the other inputs steps.
+
+ Inputs:
+ control_image_latents (`Tensor`):
+ The control image latents to use for the denoising process. Can be generated in controlnet vae encoder
+ step.
+ batch_size (`int`, *optional*, defaults to 1):
+ Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can
+ be generated in input step.
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
+ The number of images to generate per prompt.
+ height (`int`, *optional*):
+ The height in pixels of the generated image.
+ width (`int`, *optional*):
+ The width in pixels of the generated image.
+
+ Outputs:
+ control_image_latents (`Tensor`):
+ The control image latents (patchified and batch-expanded).
+ height (`int`):
+ if not provided, updated to control image height
+ width (`int`):
+ if not provided, updated to control image width
+ """

model_name = "qwenimage"

@property
@@ -636,11 +947,28 @@ class QwenImageControlNetInputsStep(ModularPipelineBlocks):
@property
def inputs(self) -> List[InputParam]:
return [
- InputParam(name="control_image_latents", required=True),
- InputParam(name="batch_size", required=True),
- InputParam(name="num_images_per_prompt", default=1),
- InputParam(name="height"),
- InputParam(name="width"),
+ InputParam(
+ name="control_image_latents",
+ required=True,
+ type_hint=torch.Tensor,
+ description="The control image latents to use for the denoising process. Can be generated in controlnet vae encoder step.",
+ ),
+ InputParam.template("batch_size"),
+ InputParam.template("num_images_per_prompt"),
+ InputParam.template("height"),
+ InputParam.template("width"),
+ ]
+
+ @property
+ def intermediate_outputs(self) -> List[OutputParam]:
+ return [
+ OutputParam(
+ name="control_image_latents",
+ type_hint=torch.Tensor,
+ description="The control image latents (patchified and batch-expanded).",
+ ),
+ OutputParam(name="height", type_hint=int, description="if not provided, updated to control image height"),
+ OutputParam(name="width", type_hint=int, description="if not provided, updated to control image width"),
]

@torch.no_grad()
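The height/width outputs above are recovered from the control image latents' spatial dimensions; an earlier hunk header in this file names the helper, `calculate_dimension_from_latents`. A sketch of the likely arithmetic, assuming the helper scales the latent grid back up by the VAE scale factor (the real body may differ):

import torch

def calculate_dimension_from_latents(latents: torch.Tensor, vae_scale_factor: int):
    # Spatial latent dims sit in the last two axes; scale them back to pixels.
    height = latents.shape[-2] * vae_scale_factor
    width = latents.shape[-1] * vae_scale_factor
    return height, width

h, w = calculate_dimension_from_latents(torch.zeros(1, 16, 64, 64), 8)
assert (h, w) == (512, 512)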
@@ -12,14 +12,11 @@
# See the License for the specific language governing permissions and
# limitations under the License.

- from typing import List
-
- import PIL.Image
import torch

from ...utils import logging
from ..modular_pipeline import AutoPipelineBlocks, ConditionalPipelineBlocks, SequentialPipelineBlocks
- from ..modular_pipeline_utils import InsertableDict, OutputParam
+ from ..modular_pipeline_utils import InputParam, InsertableDict, OutputParam
from .before_denoise import (
QwenImageControlNetBeforeDenoiserStep,
QwenImageCreateMaskLatentsStep,
@@ -59,11 +56,91 @@ logger = logging.get_logger(__name__)


# ====================
- # 1. VAE ENCODER
+ # 1. TEXT ENCODER
# ====================


+ # auto_docstring
+ class QwenImageAutoTextEncoderStep(AutoPipelineBlocks):
+ """
+ Text encoder step that encodes the text prompt into a text embedding. This is an auto pipeline block.
+
+ Components:
+ text_encoder (`Qwen2_5_VLForConditionalGeneration`): The text encoder to use tokenizer (`Qwen2Tokenizer`):
+ The tokenizer to use guider (`ClassifierFreeGuidance`)
+
+ Inputs:
+ prompt (`str`, *optional*):
+ The prompt or prompts to guide image generation.
+ negative_prompt (`str`, *optional*):
+ The prompt or prompts not to guide the image generation.
+ max_sequence_length (`int`, *optional*, defaults to 1024):
+ Maximum sequence length for prompt encoding.
+
+ Outputs:
+ prompt_embeds (`Tensor`):
+ The prompt embeddings.
+ prompt_embeds_mask (`Tensor`):
+ The encoder attention mask.
+ negative_prompt_embeds (`Tensor`):
+ The negative prompt embeddings.
+ negative_prompt_embeds_mask (`Tensor`):
+ The negative prompt embeddings mask.
+ """
+
+ model_name = "qwenimage"
+ block_classes = [QwenImageTextEncoderStep()]
+ block_names = ["text_encoder"]
+ block_trigger_inputs = ["prompt"]
+
+ @property
+ def description(self) -> str:
+ return "Text encoder step that encodes the text prompt into a text embedding. This is an auto pipeline block."
+ " - `QwenImageTextEncoderStep` (text_encoder) is used when `prompt` is provided."
+ " - if `prompt` is not provided, step will be skipped."


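The new `QwenImageAutoTextEncoderStep` relies on the `AutoPipelineBlocks` trigger mechanism: sub-blocks are paired position-wise with `block_trigger_inputs`, and a sub-block only runs when its trigger input is present; with no trigger, the whole step is skipped, as its description strings spell out. A stripped-down sketch of that dispatch (not the actual `AutoPipelineBlocks` source):

from typing import Any, Dict, List, Optional

class AutoBlocksSketch:
    # Parallel lists, as in the class above: sub-block i fires when
    # trigger input i is provided by the caller.
    block_classes: List[type] = []
    block_names: List[str] = []
    block_trigger_inputs: List[str] = []

    def select_block(self, provided: Dict[str, Any]) -> Optional[type]:
        for block_cls, trigger in zip(self.block_classes, self.block_trigger_inputs):
            if provided.get(trigger) is not None:
                return block_cls
        return None  # no trigger present -> the step is skipped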
+ # ====================
+ # 2. VAE ENCODER
+ # ====================


+ # auto_docstring
class QwenImageInpaintVaeEncoderStep(SequentialPipelineBlocks):
+ """
+ This step is used for processing image and mask inputs for inpainting tasks. It:
+ - Resizes the image to the target size, based on `height` and `width`.
+ - Processes and updates `image` and `mask_image`.
+ - Creates `image_latents`.
+
+ Components:
+ image_mask_processor (`InpaintProcessor`) vae (`AutoencoderKLQwenImage`)
+
+ Inputs:
+ mask_image (`Image`):
+ Mask image for inpainting.
+ image (`Union[Image, List]`):
+ Reference image(s) for denoising. Can be a single image or list of images.
+ height (`int`, *optional*):
+ The height in pixels of the generated image.
+ width (`int`, *optional*):
+ The width in pixels of the generated image.
+ padding_mask_crop (`int`, *optional*):
+ Padding for mask cropping in inpainting.
+ generator (`Generator`, *optional*):
+ Torch generator for deterministic generation.
+
+ Outputs:
+ processed_image (`Tensor`):
+ The processed image
+ processed_mask_image (`Tensor`):
+ The processed mask image
+ mask_overlay_kwargs (`Dict`):
+ The kwargs for the postprocess step to apply the mask overlay
+ image_latents (`Tensor`):
+ The latent representation of the input image.
+ """

model_name = "qwenimage"
block_classes = [QwenImageInpaintProcessImagesInputStep(), QwenImageVaeEncoderStep()]
block_names = ["preprocess", "encode"]
@@ -78,7 +155,31 @@ class QwenImageInpaintVaeEncoderStep(SequentialPipelineBlocks):
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||


# auto_docstring
class QwenImageImg2ImgVaeEncoderStep(SequentialPipelineBlocks):
    """
    Vae encoder step that preprocesses and encodes the image inputs into their latent representations.

    Components:
        image_processor (`VaeImageProcessor`)
        vae (`AutoencoderKLQwenImage`)

    Inputs:
        image (`Union[Image, List]`):
            Reference image(s) for denoising. Can be a single image or list of images.
        height (`int`, *optional*):
            The height in pixels of the generated image.
        width (`int`, *optional*):
            The width in pixels of the generated image.
        generator (`Generator`, *optional*):
            Torch generator for deterministic generation.

    Outputs:
        processed_image (`Tensor`):
            The processed image.
        image_latents (`Tensor`):
            The latent representation of the input image.
    """

    model_name = "qwenimage"

    block_classes = [QwenImageProcessImagesInputStep(), QwenImageVaeEncoderStep()]

@@ -89,7 +190,6 @@ class QwenImageImg2ImgVaeEncoderStep(SequentialPipelineBlocks):
        return "Vae encoder step that preprocesses and encodes the image inputs into their latent representations."
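

# A schematic sketch (my illustration, not the library implementation) of how
# a SequentialPipelineBlocks-style step threads state through its sub-blocks:
# "preprocess" writes `processed_image` into the shared state, and "encode"
# reads it back to produce `image_latents`.
def _run_sequential_sketch(sub_blocks, state):
    for block in sub_blocks:
        state.update(block(state))  # each sub-block returns a dict of new outputs
    return state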


class QwenImageAutoVaeEncoderStep(AutoPipelineBlocks):
    block_classes = [QwenImageInpaintVaeEncoderStep, QwenImageImg2ImgVaeEncoderStep]
    block_names = ["inpaint", "img2img"]

@@ -107,7 +207,33 @@ class QwenImageAutoVaeEncoderStep(AutoPipelineBlocks):


# optional controlnet vae encoder
# auto_docstring
class QwenImageOptionalControlNetVaeEncoderStep(AutoPipelineBlocks):
    """
    Vae encoder step that encodes the image inputs into their latent representations.
    This is an auto pipeline block.
    - `QwenImageControlNetVaeEncoderStep` (controlnet) is used when `control_image` is provided.
    - if `control_image` is not provided, step will be skipped.

    Components:
        vae (`AutoencoderKLQwenImage`)
        controlnet (`QwenImageControlNetModel`)
        control_image_processor (`VaeImageProcessor`)

    Inputs:
        control_image (`Image`, *optional*):
            Control image for ControlNet conditioning.
        height (`int`, *optional*):
            The height in pixels of the generated image.
        width (`int`, *optional*):
            The width in pixels of the generated image.
        generator (`Generator`, *optional*):
            Torch generator for deterministic generation.

    Outputs:
        control_image_latents (`Tensor`):
            The latents representing the control image.
    """

    block_classes = [QwenImageControlNetVaeEncoderStep]
    block_names = ["controlnet"]
    block_trigger_inputs = ["control_image"]

@@ -123,14 +249,65 @@ class QwenImageOptionalControlNetVaeEncoderStep(AutoPipelineBlocks):


# ====================
# 3. DENOISE (input -> prepare_latents -> set_timesteps -> prepare_rope_inputs -> denoise -> after_denoise)
# ====================


# assemble input steps
# auto_docstring
class QwenImageImg2ImgInputStep(SequentialPipelineBlocks):
    """
    Input step that prepares the inputs for the img2img denoising step. It:
    - make sure the text embeddings have consistent batch size as well as the additional inputs.
    - update height/width based on `image_latents`, patchify `image_latents`.

    Components:
        pachifier (`QwenImagePachifier`)

    Inputs:
        num_images_per_prompt (`int`, *optional*, defaults to 1):
            The number of images to generate per prompt.
        prompt_embeds (`Tensor`):
            text embeddings used to guide the image generation. Can be generated from text_encoder step.
        prompt_embeds_mask (`Tensor`):
            mask for the text embeddings. Can be generated from text_encoder step.
        negative_prompt_embeds (`Tensor`, *optional*):
            negative text embeddings used to guide the image generation. Can be generated from text_encoder step.
        negative_prompt_embeds_mask (`Tensor`, *optional*):
            mask for the negative text embeddings. Can be generated from text_encoder step.
        height (`int`, *optional*):
            The height in pixels of the generated image.
        width (`int`, *optional*):
            The width in pixels of the generated image.
        image_latents (`Tensor`):
            image latents used to guide the image generation. Can be generated from vae_encoder step.

    Outputs:
        batch_size (`int`):
            The batch size of the prompt embeddings.
        dtype (`dtype`):
            The data type of the prompt embeddings.
        prompt_embeds (`Tensor`):
            The prompt embeddings. (batch-expanded)
        prompt_embeds_mask (`Tensor`):
            The encoder attention mask. (batch-expanded)
        negative_prompt_embeds (`Tensor`):
            The negative prompt embeddings. (batch-expanded)
        negative_prompt_embeds_mask (`Tensor`):
            The negative prompt embeddings mask. (batch-expanded)
        image_height (`int`):
            The image height calculated from the image latents dimension.
        image_width (`int`):
            The image width calculated from the image latents dimension.
        height (`int`):
            if not provided, updated to image height.
        width (`int`):
            if not provided, updated to image width.
        image_latents (`Tensor`):
            image latents used to guide the image generation. Can be generated from vae_encoder step. (patchified
            and batch-expanded)
    """

    model_name = "qwenimage"
    block_classes = [QwenImageTextInputsStep(), QwenImageAdditionalInputsStep()]
    block_names = ["text_inputs", "additional_inputs"]

    @property
@@ -140,12 +317,69 @@ class QwenImageImg2ImgInputStep(SequentialPipelineBlocks):
            " - update height/width based on `image_latents`, patchify `image_latents`."


# auto_docstring
class QwenImageInpaintInputStep(SequentialPipelineBlocks):
    """
    Input step that prepares the inputs for the inpainting denoising step. It:
    - make sure the text embeddings have consistent batch size as well as the additional inputs.
    - update height/width based on `image_latents`, patchify `image_latents`.

    Components:
        pachifier (`QwenImagePachifier`)

    Inputs:
        num_images_per_prompt (`int`, *optional*, defaults to 1):
            The number of images to generate per prompt.
        prompt_embeds (`Tensor`):
            text embeddings used to guide the image generation. Can be generated from text_encoder step.
        prompt_embeds_mask (`Tensor`):
            mask for the text embeddings. Can be generated from text_encoder step.
        negative_prompt_embeds (`Tensor`, *optional*):
            negative text embeddings used to guide the image generation. Can be generated from text_encoder step.
        negative_prompt_embeds_mask (`Tensor`, *optional*):
            mask for the negative text embeddings. Can be generated from text_encoder step.
        height (`int`, *optional*):
            The height in pixels of the generated image.
        width (`int`, *optional*):
            The width in pixels of the generated image.
        image_latents (`Tensor`, *optional*):
            image latents used to guide the image generation. Can be generated from vae_encoder step.
        processed_mask_image (`Tensor`, *optional*):
            The processed mask image.

    Outputs:
        batch_size (`int`):
            The batch size of the prompt embeddings.
        dtype (`dtype`):
            The data type of the prompt embeddings.
        prompt_embeds (`Tensor`):
            The prompt embeddings. (batch-expanded)
        prompt_embeds_mask (`Tensor`):
            The encoder attention mask. (batch-expanded)
        negative_prompt_embeds (`Tensor`):
            The negative prompt embeddings. (batch-expanded)
        negative_prompt_embeds_mask (`Tensor`):
            The negative prompt embeddings mask. (batch-expanded)
        image_height (`int`):
            The image height calculated from the image latents dimension.
        image_width (`int`):
            The image width calculated from the image latents dimension.
        height (`int`):
            if not provided, updated to image height.
        width (`int`):
            if not provided, updated to image width.
        image_latents (`Tensor`):
            image latents used to guide the image generation. Can be generated from vae_encoder step. (patchified
            and batch-expanded)
        processed_mask_image (`Tensor`):
            The processed mask image (batch-expanded)
    """

    model_name = "qwenimage"
    block_classes = [
        QwenImageTextInputsStep(),
        QwenImageAdditionalInputsStep(
            additional_batch_inputs=[
                InputParam(name="processed_mask_image", type_hint=torch.Tensor, description="The processed mask image")
            ]
        ),
    ]
    block_names = ["text_inputs", "additional_inputs"]

@@ -158,7 +392,42 @@ class QwenImageInpaintInputStep(SequentialPipelineBlocks):
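

# The `additional_batch_inputs` hook above is how the shared input block is
# specialized per task: the inpaint variant registers one extra tensor to be
# batch-expanded. The constructor call below is taken verbatim from this file;
# only the standalone usage framing is mine:
_extra_inputs = [
    InputParam(name="processed_mask_image", type_hint=torch.Tensor, description="The processed mask image")
]
_inpaint_additional_inputs = QwenImageAdditionalInputsStep(additional_batch_inputs=_extra_inputs)
# `_inpaint_additional_inputs` now batch-expands `processed_mask_image` alongside the default inputs.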


# assemble prepare latents steps
# auto_docstring
class QwenImageInpaintPrepareLatentsStep(SequentialPipelineBlocks):
    """
    This step prepares the latents/image_latents and mask inputs for the inpainting denoising step. It:
    - Adds noise to the image latents to create the latents input for the denoiser.
    - Creates the patchified latents `mask` based on the processed mask image.

    Components:
        scheduler (`FlowMatchEulerDiscreteScheduler`)
        pachifier (`QwenImagePachifier`)

    Inputs:
        latents (`Tensor`):
            The initial random noise, can be generated in prepare latent step.
        image_latents (`Tensor`):
            image latents used to guide the image generation. Can be generated from vae_encoder step. (Can be
            generated from vae encoder and updated in input step.)
        timesteps (`Tensor`):
            The timesteps to use for the denoising process. Can be generated in set_timesteps step.
        processed_mask_image (`Tensor`):
            The processed mask to use for the inpainting process.
        height (`int`):
            The height in pixels of the generated image.
        width (`int`):
            The width in pixels of the generated image.
        dtype (`dtype`, *optional*, defaults to torch.float32):
            The dtype of the model inputs, can be generated in input step.

    Outputs:
        initial_noise (`Tensor`):
            The initial random noise used for inpainting denoising.
        latents (`Tensor`):
            The scaled noisy latents to use for inpainting/image-to-image denoising.
        mask (`Tensor`):
            The mask to use for the inpainting process.
    """

    model_name = "qwenimage"
    block_classes = [QwenImagePrepareLatentsWithStrengthStep(), QwenImageCreateMaskLatentsStep()]
    block_names = ["add_noise_to_latents", "create_mask_latents"]

@@ -176,7 +445,49 @@ class QwenImageInpaintPrepareLatentsStep(SequentialPipelineBlocks):
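

# "Adds noise to the image latents" for a flow-matching scheduler is a linear
# interpolation toward noise at the first kept sigma. A hedged sketch of the
# idea (my illustration of the `scale_noise`-style scaling, not the
# scheduler's API):
def _add_noise_sketch(image_latents, noise, sigma):
    # sigma in [0, 1]; strength=1.0 gives sigma=1.0, which ignores the image entirely.
    return (1.0 - sigma) * image_latents + sigma * noise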


# Qwen Image (text2image)
# auto_docstring
class QwenImageCoreDenoiseStep(SequentialPipelineBlocks):
    """
    Step that denoises noise into an image for the text2image task. It includes the denoise loop, as well as
    preparing the inputs (timesteps, latents, rope inputs etc.).

    Components:
        pachifier (`QwenImagePachifier`)
        scheduler (`FlowMatchEulerDiscreteScheduler`)
        guider (`ClassifierFreeGuidance`)
        transformer (`QwenImageTransformer2DModel`)

    Inputs:
        num_images_per_prompt (`int`, *optional*, defaults to 1):
            The number of images to generate per prompt.
        prompt_embeds (`Tensor`):
            text embeddings used to guide the image generation. Can be generated from text_encoder step.
        prompt_embeds_mask (`Tensor`):
            mask for the text embeddings. Can be generated from text_encoder step.
        negative_prompt_embeds (`Tensor`, *optional*):
            negative text embeddings used to guide the image generation. Can be generated from text_encoder step.
        negative_prompt_embeds_mask (`Tensor`, *optional*):
            mask for the negative text embeddings. Can be generated from text_encoder step.
        latents (`Tensor`, *optional*):
            Pre-generated noisy latents for image generation.
        height (`int`, *optional*):
            The height in pixels of the generated image.
        width (`int`, *optional*):
            The width in pixels of the generated image.
        generator (`Generator`, *optional*):
            Torch generator for deterministic generation.
        num_inference_steps (`int`, *optional*, defaults to 50):
            The number of denoising steps.
        sigmas (`List`, *optional*):
            Custom sigmas for the denoising process.
        attention_kwargs (`Dict`, *optional*):
            Additional kwargs for attention processors.
        **denoiser_input_fields (`None`, *optional*):
            conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.

    Outputs:
        latents (`Tensor`):
            Denoised latents.
    """

    model_name = "qwenimage"
    block_classes = [
        QwenImageTextInputsStep(),
@@ -199,9 +510,63 @@ class QwenImageCoreDenoiseStep(SequentialPipelineBlocks):
    def description(self):
        return "Step that denoises noise into an image for the text2image task. It includes the denoise loop, as well as preparing the inputs (timesteps, latents, rope inputs etc.)."

    @property
    def outputs(self):
        return [
            OutputParam.template("latents"),
        ]
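

# The denoise loop these core steps assemble is, at heart, a flow-matching
# Euler integration. A hedged, schematic sketch (the real loop also handles
# guidance, rope inputs, and optional controlnet residuals):
def _denoise_loop_sketch(latents, sigmas, velocity_model):
    # `velocity_model(x, sigma)` stands in for the guided transformer call.
    for sigma, sigma_next in zip(sigmas[:-1], sigmas[1:]):
        velocity = velocity_model(latents, sigma)
        latents = latents + (sigma_next - sigma) * velocity  # Euler step toward sigma=0
    return latents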


# Qwen Image (inpainting)
# auto_docstring
class QwenImageInpaintCoreDenoiseStep(SequentialPipelineBlocks):
    """
    Before denoise step that prepares the inputs (timesteps, latents, rope inputs etc.) for the denoise step for the
    inpaint task.

    Components:
        pachifier (`QwenImagePachifier`)
        scheduler (`FlowMatchEulerDiscreteScheduler`)
        guider (`ClassifierFreeGuidance`)
        transformer (`QwenImageTransformer2DModel`)

    Inputs:
        num_images_per_prompt (`int`, *optional*, defaults to 1):
            The number of images to generate per prompt.
        prompt_embeds (`Tensor`):
            text embeddings used to guide the image generation. Can be generated from text_encoder step.
        prompt_embeds_mask (`Tensor`):
            mask for the text embeddings. Can be generated from text_encoder step.
        negative_prompt_embeds (`Tensor`, *optional*):
            negative text embeddings used to guide the image generation. Can be generated from text_encoder step.
        negative_prompt_embeds_mask (`Tensor`, *optional*):
            mask for the negative text embeddings. Can be generated from text_encoder step.
        height (`int`, *optional*):
            The height in pixels of the generated image.
        width (`int`, *optional*):
            The width in pixels of the generated image.
        image_latents (`Tensor`, *optional*):
            image latents used to guide the image generation. Can be generated from vae_encoder step.
        processed_mask_image (`Tensor`, *optional*):
            The processed mask image.
        latents (`Tensor`, *optional*):
            Pre-generated noisy latents for image generation.
        generator (`Generator`, *optional*):
            Torch generator for deterministic generation.
        num_inference_steps (`int`, *optional*, defaults to 50):
            The number of denoising steps.
        sigmas (`List`, *optional*):
            Custom sigmas for the denoising process.
        strength (`float`, *optional*, defaults to 0.9):
            Strength for img2img/inpainting.
        attention_kwargs (`Dict`, *optional*):
            Additional kwargs for attention processors.
        **denoiser_input_fields (`None`, *optional*):
            conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.

    Outputs:
        latents (`Tensor`):
            Denoised latents.
    """

    model_name = "qwenimage"
    block_classes = [
        QwenImageInpaintInputStep(),
@@ -226,9 +591,61 @@ class QwenImageInpaintCoreDenoiseStep(SequentialPipelineBlocks):
    def description(self):
        return "Before denoise step that prepares the inputs (timesteps, latents, rope inputs etc.) for the denoise step for the inpaint task."

    @property
    def outputs(self):
        return [
            OutputParam.template("latents"),
        ]


# Qwen Image (image2image)
# auto_docstring
class QwenImageImg2ImgCoreDenoiseStep(SequentialPipelineBlocks):
    """
    Before denoise step that prepares the inputs (timesteps, latents, rope inputs etc.) for the denoise step for the
    img2img task.

    Components:
        pachifier (`QwenImagePachifier`)
        scheduler (`FlowMatchEulerDiscreteScheduler`)
        guider (`ClassifierFreeGuidance`)
        transformer (`QwenImageTransformer2DModel`)

    Inputs:
        num_images_per_prompt (`int`, *optional*, defaults to 1):
            The number of images to generate per prompt.
        prompt_embeds (`Tensor`):
            text embeddings used to guide the image generation. Can be generated from text_encoder step.
        prompt_embeds_mask (`Tensor`):
            mask for the text embeddings. Can be generated from text_encoder step.
        negative_prompt_embeds (`Tensor`, *optional*):
            negative text embeddings used to guide the image generation. Can be generated from text_encoder step.
        negative_prompt_embeds_mask (`Tensor`, *optional*):
            mask for the negative text embeddings. Can be generated from text_encoder step.
        height (`int`, *optional*):
            The height in pixels of the generated image.
        width (`int`, *optional*):
            The width in pixels of the generated image.
        image_latents (`Tensor`):
            image latents used to guide the image generation. Can be generated from vae_encoder step.
        latents (`Tensor`, *optional*):
            Pre-generated noisy latents for image generation.
        generator (`Generator`, *optional*):
            Torch generator for deterministic generation.
        num_inference_steps (`int`, *optional*, defaults to 50):
            The number of denoising steps.
        sigmas (`List`, *optional*):
            Custom sigmas for the denoising process.
        strength (`float`, *optional*, defaults to 0.9):
            Strength for img2img/inpainting.
        attention_kwargs (`Dict`, *optional*):
            Additional kwargs for attention processors.
        **denoiser_input_fields (`None`, *optional*):
            conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.

    Outputs:
        latents (`Tensor`):
            Denoised latents.
    """

    model_name = "qwenimage"
    block_classes = [
        QwenImageImg2ImgInputStep(),
@@ -253,9 +670,66 @@ class QwenImageImg2ImgCoreDenoiseStep(SequentialPipelineBlocks):
    def description(self):
        return "Before denoise step that prepares the inputs (timesteps, latents, rope inputs etc.) for the denoise step for the img2img task."

    @property
    def outputs(self):
        return [
            OutputParam.template("latents"),
        ]


# Qwen Image (text2image) with controlnet
# auto_docstring
class QwenImageControlNetCoreDenoiseStep(SequentialPipelineBlocks):
    """
    Step that denoises noise into an image for the text2image task. It includes the denoise loop, as well as
    preparing the inputs (timesteps, latents, rope inputs etc.).

    Components:
        pachifier (`QwenImagePachifier`)
        scheduler (`FlowMatchEulerDiscreteScheduler`)
        controlnet (`QwenImageControlNetModel`)
        guider (`ClassifierFreeGuidance`)
        transformer (`QwenImageTransformer2DModel`)

    Inputs:
        num_images_per_prompt (`int`, *optional*, defaults to 1):
            The number of images to generate per prompt.
        prompt_embeds (`Tensor`):
            text embeddings used to guide the image generation. Can be generated from text_encoder step.
        prompt_embeds_mask (`Tensor`):
            mask for the text embeddings. Can be generated from text_encoder step.
        negative_prompt_embeds (`Tensor`, *optional*):
            negative text embeddings used to guide the image generation. Can be generated from text_encoder step.
        negative_prompt_embeds_mask (`Tensor`, *optional*):
            mask for the negative text embeddings. Can be generated from text_encoder step.
        control_image_latents (`Tensor`):
            The control image latents to use for the denoising process. Can be generated in controlnet vae encoder
            step.
        height (`int`, *optional*):
            The height in pixels of the generated image.
        width (`int`, *optional*):
            The width in pixels of the generated image.
        latents (`Tensor`, *optional*):
            Pre-generated noisy latents for image generation.
        generator (`Generator`, *optional*):
            Torch generator for deterministic generation.
        num_inference_steps (`int`, *optional*, defaults to 50):
            The number of denoising steps.
        sigmas (`List`, *optional*):
            Custom sigmas for the denoising process.
        control_guidance_start (`float`, *optional*, defaults to 0.0):
            When to start applying ControlNet.
        control_guidance_end (`float`, *optional*, defaults to 1.0):
            When to stop applying ControlNet.
        controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0):
            Scale for ControlNet conditioning.
        attention_kwargs (`Dict`, *optional*):
            Additional kwargs for attention processors.
        **denoiser_input_fields (`None`, *optional*):
            conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.

    Outputs:
        latents (`Tensor`):
            Denoised latents.
    """

    model_name = "qwenimage"
    block_classes = [
        QwenImageTextInputsStep(),
@@ -282,9 +756,72 @@ class QwenImageControlNetCoreDenoiseStep(SequentialPipelineBlocks):
    def description(self):
        return "Step that denoises noise into an image for the text2image task. It includes the denoise loop, as well as preparing the inputs (timesteps, latents, rope inputs etc.)."

    @property
    def outputs(self):
        return [
            OutputParam.template("latents"),
        ]
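

# ControlNet conditioning in these steps follows the usual recipe: the
# controlnet consumes `control_image_latents` and emits residuals added to the
# transformer's hidden states, scaled by `controlnet_conditioning_scale` and
# gated by the `control_guidance_start`/`control_guidance_end` window. A
# hedged sketch of the gating only (my illustration):
def _controlnet_scale_sketch(step_index, num_steps, start=0.0, end=1.0, scale=1.0):
    # Effective conditioning scale for one denoising step.
    progress = step_index / max(num_steps - 1, 1)
    return scale if start <= progress <= end else 0.0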


# Qwen Image (inpainting) with controlnet
# auto_docstring
class QwenImageControlNetInpaintCoreDenoiseStep(SequentialPipelineBlocks):
    """
    Before denoise step that prepares the inputs (timesteps, latents, rope inputs etc.) for the denoise step for the
    inpaint task.

    Components:
        pachifier (`QwenImagePachifier`)
        scheduler (`FlowMatchEulerDiscreteScheduler`)
        controlnet (`QwenImageControlNetModel`)
        guider (`ClassifierFreeGuidance`)
        transformer (`QwenImageTransformer2DModel`)

    Inputs:
        num_images_per_prompt (`int`, *optional*, defaults to 1):
            The number of images to generate per prompt.
        prompt_embeds (`Tensor`):
            text embeddings used to guide the image generation. Can be generated from text_encoder step.
        prompt_embeds_mask (`Tensor`):
            mask for the text embeddings. Can be generated from text_encoder step.
        negative_prompt_embeds (`Tensor`, *optional*):
            negative text embeddings used to guide the image generation. Can be generated from text_encoder step.
        negative_prompt_embeds_mask (`Tensor`, *optional*):
            mask for the negative text embeddings. Can be generated from text_encoder step.
        height (`int`, *optional*):
            The height in pixels of the generated image.
        width (`int`, *optional*):
            The width in pixels of the generated image.
        image_latents (`Tensor`, *optional*):
            image latents used to guide the image generation. Can be generated from vae_encoder step.
        processed_mask_image (`Tensor`, *optional*):
            The processed mask image.
        control_image_latents (`Tensor`):
            The control image latents to use for the denoising process. Can be generated in controlnet vae encoder
            step.
        latents (`Tensor`, *optional*):
            Pre-generated noisy latents for image generation.
        generator (`Generator`, *optional*):
            Torch generator for deterministic generation.
        num_inference_steps (`int`, *optional*, defaults to 50):
            The number of denoising steps.
        sigmas (`List`, *optional*):
            Custom sigmas for the denoising process.
        strength (`float`, *optional*, defaults to 0.9):
            Strength for img2img/inpainting.
        control_guidance_start (`float`, *optional*, defaults to 0.0):
            When to start applying ControlNet.
        control_guidance_end (`float`, *optional*, defaults to 1.0):
            When to stop applying ControlNet.
        controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0):
            Scale for ControlNet conditioning.
        attention_kwargs (`Dict`, *optional*):
            Additional kwargs for attention processors.
        **denoiser_input_fields (`None`, *optional*):
            conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.

    Outputs:
        latents (`Tensor`):
            Denoised latents.
    """

    model_name = "qwenimage"
    block_classes = [
        QwenImageInpaintInputStep(),
@@ -313,9 +850,70 @@ class QwenImageControlNetInpaintCoreDenoiseStep(SequentialPipelineBlocks):
    def description(self):
        return "Before denoise step that prepares the inputs (timesteps, latents, rope inputs etc.) for the denoise step for the inpaint task."

    @property
    def outputs(self):
        return [
            OutputParam.template("latents"),
        ]


# Qwen Image (image2image) with controlnet
# auto_docstring
class QwenImageControlNetImg2ImgCoreDenoiseStep(SequentialPipelineBlocks):
    """
    Before denoise step that prepares the inputs (timesteps, latents, rope inputs etc.) for the denoise step for the
    img2img task.

    Components:
        pachifier (`QwenImagePachifier`)
        scheduler (`FlowMatchEulerDiscreteScheduler`)
        controlnet (`QwenImageControlNetModel`)
        guider (`ClassifierFreeGuidance`)
        transformer (`QwenImageTransformer2DModel`)

    Inputs:
        num_images_per_prompt (`int`, *optional*, defaults to 1):
            The number of images to generate per prompt.
        prompt_embeds (`Tensor`):
            text embeddings used to guide the image generation. Can be generated from text_encoder step.
        prompt_embeds_mask (`Tensor`):
            mask for the text embeddings. Can be generated from text_encoder step.
        negative_prompt_embeds (`Tensor`, *optional*):
            negative text embeddings used to guide the image generation. Can be generated from text_encoder step.
        negative_prompt_embeds_mask (`Tensor`, *optional*):
            mask for the negative text embeddings. Can be generated from text_encoder step.
        height (`int`, *optional*):
            The height in pixels of the generated image.
        width (`int`, *optional*):
            The width in pixels of the generated image.
        image_latents (`Tensor`):
            image latents used to guide the image generation. Can be generated from vae_encoder step.
        control_image_latents (`Tensor`):
            The control image latents to use for the denoising process. Can be generated in controlnet vae encoder
            step.
        latents (`Tensor`, *optional*):
            Pre-generated noisy latents for image generation.
        generator (`Generator`, *optional*):
            Torch generator for deterministic generation.
        num_inference_steps (`int`, *optional*, defaults to 50):
            The number of denoising steps.
        sigmas (`List`, *optional*):
            Custom sigmas for the denoising process.
        strength (`float`, *optional*, defaults to 0.9):
            Strength for img2img/inpainting.
        control_guidance_start (`float`, *optional*, defaults to 0.0):
            When to start applying ControlNet.
        control_guidance_end (`float`, *optional*, defaults to 1.0):
            When to stop applying ControlNet.
        controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0):
            Scale for ControlNet conditioning.
        attention_kwargs (`Dict`, *optional*):
            Additional kwargs for attention processors.
        **denoiser_input_fields (`None`, *optional*):
            conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.

    Outputs:
        latents (`Tensor`):
            Denoised latents.
    """

    model_name = "qwenimage"
    block_classes = [
        QwenImageImg2ImgInputStep(),
@@ -344,6 +942,12 @@ class QwenImageControlNetImg2ImgCoreDenoiseStep(SequentialPipelineBlocks):
    def description(self):
        return "Before denoise step that prepares the inputs (timesteps, latents, rope inputs etc.) for the denoise step for the img2img task."

    @property
    def outputs(self):
        return [
            OutputParam.template("latents"),
        ]


# Auto denoise step for QwenImage
class QwenImageAutoCoreDenoiseStep(ConditionalPipelineBlocks):
@@ -402,19 +1006,36 @@ class QwenImageAutoCoreDenoiseStep(ConditionalPipelineBlocks):
    @property
    def outputs(self):
        return [
            OutputParam.template("latents"),
        ]


# ====================
# 4. DECODE
# ====================


# standard decode step works for most tasks except for inpaint
# auto_docstring
class QwenImageDecodeStep(SequentialPipelineBlocks):
    """
    Decode step that decodes the latents to images and postprocesses the generated image.

    Components:
        vae (`AutoencoderKLQwenImage`)
        image_processor (`VaeImageProcessor`)

    Inputs:
        latents (`Tensor`):
            The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise
            step.
        output_type (`str`, *optional*, defaults to pil):
            Output format: 'pil', 'np', 'pt'.

    Outputs:
        images (`List`):
            Generated images. (tensor output of the vae decoder.)
    """

    model_name = "qwenimage"
    block_classes = [QwenImageDecoderStep(), QwenImageProcessImagesOutputStep()]
    block_names = ["decode", "postprocess"]

@@ -425,7 +1046,30 @@ class QwenImageDecodeStep(SequentialPipelineBlocks):


# Inpaint decode step
# auto_docstring
class QwenImageInpaintDecodeStep(SequentialPipelineBlocks):
    """
    Decode step that decodes the latents to images and postprocesses the generated image, optionally applying the
    mask overlay to the original image.

    Components:
        vae (`AutoencoderKLQwenImage`)
        image_mask_processor (`InpaintProcessor`)

    Inputs:
        latents (`Tensor`):
            The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise
            step.
        output_type (`str`, *optional*, defaults to pil):
            Output format: 'pil', 'np', 'pt'.
        mask_overlay_kwargs (`Dict`, *optional*):
            The kwargs for the postprocess step to apply the mask overlay. Generated in
            InpaintProcessImagesInputStep.

    Outputs:
        images (`List`):
            Generated images. (tensor output of the vae decoder.)
    """

    model_name = "qwenimage"
    block_classes = [QwenImageDecoderStep(), QwenImageInpaintProcessImagesOutputStep()]
    block_names = ["decode", "postprocess"]

@@ -452,11 +1096,11 @@ class QwenImageAutoDecodeStep(AutoPipelineBlocks):


# ====================
# 5. AUTO BLOCKS & PRESETS
# ====================
AUTO_BLOCKS = InsertableDict(
    [
        ("text_encoder", QwenImageAutoTextEncoderStep()),
        ("vae_encoder", QwenImageAutoVaeEncoderStep()),
        ("controlnet_vae_encoder", QwenImageOptionalControlNetVaeEncoderStep()),
        ("denoise", QwenImageAutoCoreDenoiseStep()),
@@ -465,24 +1109,119 @@ AUTO_BLOCKS = InsertableDict(
)
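

# `InsertableDict` is an ordered mapping with positional insertion, so presets
# like AUTO_BLOCKS can be tweaked without rebuilding the whole dict. A hedged
# sketch; the `insert(key, value, index)` signature and `MyCustomStep` are
# assumptions, not verified against this branch:
#
#     CUSTOM_BLOCKS = InsertableDict(list(AUTO_BLOCKS.items()))
#     CUSTOM_BLOCKS.pop("controlnet_vae_encoder")          # drop an unused stage
#     CUSTOM_BLOCKS.insert("my_step", MyCustomStep(), 1)   # hypothetical positional insert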


# auto_docstring
class QwenImageAutoBlocks(SequentialPipelineBlocks):
    """
    Auto Modular pipeline for text-to-image, image-to-image, inpainting, and controlnet tasks using QwenImage.

    Supported workflows:
    - `text2image`: requires `prompt`
    - `image2image`: requires `prompt`, `image`
    - `inpainting`: requires `prompt`, `mask_image`, `image`
    - `controlnet_text2image`: requires `prompt`, `control_image`
    - `controlnet_image2image`: requires `prompt`, `image`, `control_image`
    - `controlnet_inpainting`: requires `prompt`, `mask_image`, `image`, `control_image`

    Components:
        text_encoder (`Qwen2_5_VLForConditionalGeneration`): The text encoder to use
        tokenizer (`Qwen2Tokenizer`): The tokenizer to use
        guider (`ClassifierFreeGuidance`)
        image_mask_processor (`InpaintProcessor`)
        vae (`AutoencoderKLQwenImage`)
        image_processor (`VaeImageProcessor`)
        controlnet (`QwenImageControlNetModel`)
        control_image_processor (`VaeImageProcessor`)
        pachifier (`QwenImagePachifier`)
        scheduler (`FlowMatchEulerDiscreteScheduler`)
        transformer (`QwenImageTransformer2DModel`)

    Inputs:
        prompt (`str`, *optional*):
            The prompt or prompts to guide image generation.
        negative_prompt (`str`, *optional*):
            The prompt or prompts not to guide the image generation.
        max_sequence_length (`int`, *optional*, defaults to 1024):
            Maximum sequence length for prompt encoding.
        mask_image (`Image`, *optional*):
            Mask image for inpainting.
        image (`Union[Image, List]`, *optional*):
            Reference image(s) for denoising. Can be a single image or list of images.
        height (`int`, *optional*):
            The height in pixels of the generated image.
        width (`int`, *optional*):
            The width in pixels of the generated image.
        padding_mask_crop (`int`, *optional*):
            Padding for mask cropping in inpainting.
        generator (`Generator`, *optional*):
            Torch generator for deterministic generation.
        control_image (`Image`, *optional*):
            Control image for ControlNet conditioning.
        num_images_per_prompt (`int`, *optional*, defaults to 1):
            The number of images to generate per prompt.
        prompt_embeds (`Tensor`):
            text embeddings used to guide the image generation. Can be generated from text_encoder step.
        prompt_embeds_mask (`Tensor`):
            mask for the text embeddings. Can be generated from text_encoder step.
        negative_prompt_embeds (`Tensor`, *optional*):
            negative text embeddings used to guide the image generation. Can be generated from text_encoder step.
        negative_prompt_embeds_mask (`Tensor`, *optional*):
            mask for the negative text embeddings. Can be generated from text_encoder step.
        latents (`Tensor`):
            Pre-generated noisy latents for image generation.
        num_inference_steps (`int`):
            The number of denoising steps.
        sigmas (`List`, *optional*):
            Custom sigmas for the denoising process.
        attention_kwargs (`Dict`, *optional*):
            Additional kwargs for attention processors.
        **denoiser_input_fields (`None`, *optional*):
            conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
        image_latents (`Tensor`, *optional*):
            image latents used to guide the image generation. Can be generated from vae_encoder step.
        processed_mask_image (`Tensor`, *optional*):
            The processed mask image.
        strength (`float`, *optional*, defaults to 0.9):
            Strength for img2img/inpainting.
        control_image_latents (`Tensor`, *optional*):
            The control image latents to use for the denoising process. Can be generated in controlnet vae encoder
            step.
        control_guidance_start (`float`, *optional*, defaults to 0.0):
            When to start applying ControlNet.
        control_guidance_end (`float`, *optional*, defaults to 1.0):
            When to stop applying ControlNet.
        controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0):
            Scale for ControlNet conditioning.
        output_type (`str`, *optional*, defaults to pil):
            Output format: 'pil', 'np', 'pt'.
        mask_overlay_kwargs (`Dict`, *optional*):
            The kwargs for the postprocess step to apply the mask overlay. Generated in
            InpaintProcessImagesInputStep.

    Outputs:
        images (`List`):
            Generated images.
    """

    model_name = "qwenimage"

    block_classes = AUTO_BLOCKS.values()
    block_names = AUTO_BLOCKS.keys()

    # Workflow map defines the trigger conditions for each workflow.
    # How to define:
    # - Only include required inputs and trigger inputs (inputs that determine which blocks run)
    # - `True` means the workflow triggers when the input is not None (most common case)
    # - Use specific values (e.g., `{"strength": 0.5}`) if your `select_block` logic depends on the value
    _workflow_map = {
        "text2image": {"prompt": True},
        "image2image": {"prompt": True, "image": True},
        "inpainting": {"prompt": True, "mask_image": True, "image": True},
        "controlnet_text2image": {"prompt": True, "control_image": True},
        "controlnet_image2image": {"prompt": True, "image": True, "control_image": True},
        "controlnet_inpainting": {"prompt": True, "mask_image": True, "image": True, "control_image": True},
    }
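
    # A hedged sketch (my illustration, not the library code) of how a workflow
    # map like the one above can be resolved against the call's keyword
    # arguments: every trigger in a workflow must be satisfied, and the most
    # specific match (most triggers) wins.
    @staticmethod
    def _resolve_workflow_sketch(workflow_map, call_kwargs):
        matches = [
            name
            for name, triggers in workflow_map.items()
            if all(
                (call_kwargs.get(k) is not None) if v is True else call_kwargs.get(k) == v
                for k, v in triggers.items()
            )
        ]
        return max(matches, key=lambda name: len(workflow_map[name]), default=None)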

    @property
    def description(self):
        return "Auto Modular pipeline for text-to-image, image-to-image, inpainting, and controlnet tasks using QwenImage."

    @property
    def outputs(self):
        return [
            OutputParam.template("images"),
        ]
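

# A hedged end-to-end usage sketch for `QwenImageAutoBlocks`. The
# `init_pipeline`/`load_components` helpers, the `output="images"` call
# convention, and the repo id are assumed from the modular-pipelines API and
# are illustrative, not verified against this branch:
#
#     import torch
#
#     blocks = QwenImageAutoBlocks()
#     pipe = blocks.init_pipeline("Qwen/Qwen-Image")
#     pipe.load_components(torch_dtype=torch.bfloat16)
#     images = pipe(prompt="a photo of a cat", output="images")
#     # Passing `image` (plus `mask_image` and/or `control_image`) instead
#     # selects the img2img / inpainting / controlnet workflows via `_workflow_map`.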


@@ -12,14 +12,13 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Optional

import torch

from ...utils import logging
from ..modular_pipeline import AutoPipelineBlocks, ConditionalPipelineBlocks, SequentialPipelineBlocks
from ..modular_pipeline_utils import InputParam, InsertableDict, OutputParam
from .before_denoise import (
    QwenImageCreateMaskLatentsStep,
    QwenImageEditRoPEInputsStep,
@@ -59,8 +58,35 @@ logger = logging.get_logger(__name__)
# ====================


# auto_docstring
class QwenImageEditVLEncoderStep(SequentialPipelineBlocks):
    """
    QwenImage-Edit VL encoder step that encodes the image and text prompts together.

    Components:
        image_resize_processor (`VaeImageProcessor`)
        text_encoder (`Qwen2_5_VLForConditionalGeneration`)
        processor (`Qwen2VLProcessor`)
        guider (`ClassifierFreeGuidance`)

    Inputs:
        image (`Union[Image, List]`):
            Reference image(s) for denoising. Can be a single image or list of images.
        prompt (`str`):
            The prompt or prompts to guide image generation.
        negative_prompt (`str`, *optional*):
            The prompt or prompts not to guide the image generation.

    Outputs:
        resized_image (`List`):
            The resized images.
        prompt_embeds (`Tensor`):
            The prompt embeddings.
        prompt_embeds_mask (`Tensor`):
            The encoder attention mask.
        negative_prompt_embeds (`Tensor`):
            The negative prompt embeddings.
        negative_prompt_embeds_mask (`Tensor`):
            The negative prompt embeddings mask.
    """

    model_name = "qwenimage-edit"
    block_classes = [
@@ -80,7 +106,30 @@ class QwenImageEditVLEncoderStep(SequentialPipelineBlocks):


# Edit VAE encoder
# auto_docstring
class QwenImageEditVaeEncoderStep(SequentialPipelineBlocks):
    """
    Vae encoder step that encodes the image inputs into their latent representations.

    Components:
        image_resize_processor (`VaeImageProcessor`)
        image_processor (`VaeImageProcessor`)
        vae (`AutoencoderKLQwenImage`)

    Inputs:
        image (`Union[Image, List]`):
            Reference image(s) for denoising. Can be a single image or list of images.
        generator (`Generator`, *optional*):
            Torch generator for deterministic generation.

    Outputs:
        resized_image (`List`):
            The resized images.
        processed_image (`Tensor`):
            The processed image.
        image_latents (`Tensor`):
            The latent representation of the input image.
    """

    model_name = "qwenimage-edit"
    block_classes = [
        QwenImageEditResizeStep(),
@@ -95,12 +144,46 @@ class QwenImageEditVaeEncoderStep(SequentialPipelineBlocks):


# Edit Inpaint VAE encoder
# auto_docstring
class QwenImageEditInpaintVaeEncoderStep(SequentialPipelineBlocks):
    """
    This step is used for processing image and mask inputs for QwenImage-Edit inpaint tasks. It:
    - resize the image to a target area (1024 * 1024) while maintaining the aspect ratio.
    - process the resized image and mask image.
    - create image latents.

    Components:
        image_resize_processor (`VaeImageProcessor`)
        image_mask_processor (`InpaintProcessor`)
        vae (`AutoencoderKLQwenImage`)

    Inputs:
        image (`Union[Image, List]`):
            Reference image(s) for denoising. Can be a single image or list of images.
        mask_image (`Image`):
            Mask image for inpainting.
        padding_mask_crop (`int`, *optional*):
            Padding for mask cropping in inpainting.
        generator (`Generator`, *optional*):
            Torch generator for deterministic generation.

    Outputs:
        resized_image (`List`):
            The resized images.
        processed_image (`Tensor`):
            The processed image.
        processed_mask_image (`Tensor`):
            The processed mask image.
        mask_overlay_kwargs (`Dict`):
            The kwargs for the postprocess step to apply the mask overlay.
        image_latents (`Tensor`):
            The latent representation of the input image.
    """

    model_name = "qwenimage-edit"
    block_classes = [
        QwenImageEditResizeStep(),
        QwenImageEditInpaintProcessImagesInputStep(),
        QwenImageVaeEncoderStep(),
    ]
    block_names = ["resize", "preprocess", "encode"]

@@ -137,11 +220,64 @@ class QwenImageEditAutoVaeEncoderStep(AutoPipelineBlocks):
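

# "Resize to a target area while maintaining the aspect ratio" reduces to
# scaling both sides by sqrt(target_area / current_area). A hedged sketch of
# the computation (the real resize step may also round to model-friendly
# multiples):
import math


def _target_size_sketch(width, height, target_area=1024 * 1024):
    scale = math.sqrt(target_area / (width * height))
    return round(width * scale), round(height * scale)


# _target_size_sketch(2000, 500) -> (2048, 512), whose area is ~1024 * 1024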
|
|
||||||
|
|
||||||
# assemble input steps
|
# assemble input steps
|
||||||
|
# auto_docstring
|
||||||
class QwenImageEditInputStep(SequentialPipelineBlocks):
|
class QwenImageEditInputStep(SequentialPipelineBlocks):
|
||||||
|
"""
|
||||||
|
Input step that prepares the inputs for the edit denoising step. It:
|
||||||
|
- make sure the text embeddings have consistent batch size as well as the additional inputs.
|
||||||
|
- update height/width based `image_latents`, patchify `image_latents`.
|
||||||
|
|
||||||
|
Components:
|
||||||
|
pachifier (`QwenImagePachifier`)
|
||||||
|
|
||||||
|
Inputs:
|
||||||
|
num_images_per_prompt (`int`, *optional*, defaults to 1):
|
||||||
|
The number of images to generate per prompt.
|
||||||
|
prompt_embeds (`Tensor`):
|
||||||
|
text embeddings used to guide the image generation. Can be generated from text_encoder step.
|
||||||
|
prompt_embeds_mask (`Tensor`):
|
||||||
|
mask for the text embeddings. Can be generated from text_encoder step.
|
||||||
|
negative_prompt_embeds (`Tensor`, *optional*):
|
||||||
|
negative text embeddings used to guide the image generation. Can be generated from text_encoder step.
|
||||||
|
negative_prompt_embeds_mask (`Tensor`, *optional*):
|
||||||
|
mask for the negative text embeddings. Can be generated from text_encoder step.
|
||||||
|
height (`int`, *optional*):
|
||||||
|
The height in pixels of the generated image.
|
||||||
|
width (`int`, *optional*):
|
||||||
|
The width in pixels of the generated image.
|
||||||
|
image_latents (`Tensor`):
|
||||||
|
image latents used to guide the image generation. Can be generated from vae_encoder step.
|
||||||
|
|
||||||
|
Outputs:
|
||||||
|
batch_size (`int`):
|
||||||
|
The batch size of the prompt embeddings
|
||||||
|
dtype (`dtype`):
|
||||||
|
The data type of the prompt embeddings
|
||||||
|
prompt_embeds (`Tensor`):
|
||||||
|
The prompt embeddings. (batch-expanded)
|
||||||
|
prompt_embeds_mask (`Tensor`):
|
||||||
|
The encoder attention mask. (batch-expanded)
|
||||||
|
negative_prompt_embeds (`Tensor`):
|
||||||
|
The negative prompt embeddings. (batch-expanded)
|
||||||
|
negative_prompt_embeds_mask (`Tensor`):
|
||||||
|
The negative prompt embeddings mask. (batch-expanded)
|
||||||
|
image_height (`int`):
|
||||||
|
The image height calculated from the image latents dimension
|
||||||
|
image_width (`int`):
|
||||||
|
The image width calculated from the image latents dimension
|
||||||
|
height (`int`):
|
||||||
|
if not provided, updated to image height
|
||||||
|
width (`int`):
|
||||||
|
if not provided, updated to image width
|
||||||
|
image_latents (`Tensor`):
|
||||||
|
image latents used to guide the image generation. Can be generated from vae_encoder step. (patchified and
|
||||||
|
batch-expanded)
|
||||||
|
"""
|
||||||
|
|
||||||
model_name = "qwenimage-edit"
|
model_name = "qwenimage-edit"
|
||||||
block_classes = [
|
block_classes = [
|
||||||
QwenImageTextInputsStep(),
|
QwenImageTextInputsStep(),
|
||||||
QwenImageAdditionalInputsStep(image_latent_inputs=["image_latents"]),
|
QwenImageAdditionalInputsStep(),
|
||||||
]
|
]
|
||||||
block_names = ["text_inputs", "additional_inputs"]
|
block_names = ["text_inputs", "additional_inputs"]
|
||||||
|
|
||||||
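The "(batch-expanded)" qualifier used throughout these generated docstrings refers to repeating tensors along the batch dimension so every model input agrees on batch size. A minimal sketch of that convention (illustrative only; the block's actual implementation may differ):

    import torch

    def expand_for_batch(prompt_embeds: torch.Tensor, num_images_per_prompt: int) -> torch.Tensor:
        # Repeat each prompt's embedding num_images_per_prompt times along dim 0,
        # so downstream blocks see batch_size * num_images_per_prompt rows.
        return prompt_embeds.repeat_interleave(num_images_per_prompt, dim=0)
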
@@ -154,12 +290,71 @@ class QwenImageEditInputStep(SequentialPipelineBlocks):
     )


+# auto_docstring
 class QwenImageEditInpaintInputStep(SequentialPipelineBlocks):
+    """
+    Input step that prepares the inputs for the edit inpaint denoising step. It:
+        - make sure the text embeddings have consistent batch size as well as the additional inputs.
+        - update height/width based `image_latents`, patchify `image_latents`.
+
+    Components:
+        pachifier (`QwenImagePachifier`)
+
+    Inputs:
+        num_images_per_prompt (`int`, *optional*, defaults to 1):
+            The number of images to generate per prompt.
+        prompt_embeds (`Tensor`):
+            text embeddings used to guide the image generation. Can be generated from text_encoder step.
+        prompt_embeds_mask (`Tensor`):
+            mask for the text embeddings. Can be generated from text_encoder step.
+        negative_prompt_embeds (`Tensor`, *optional*):
+            negative text embeddings used to guide the image generation. Can be generated from text_encoder step.
+        negative_prompt_embeds_mask (`Tensor`, *optional*):
+            mask for the negative text embeddings. Can be generated from text_encoder step.
+        height (`int`, *optional*):
+            The height in pixels of the generated image.
+        width (`int`, *optional*):
+            The width in pixels of the generated image.
+        image_latents (`Tensor`):
+            image latents used to guide the image generation. Can be generated from vae_encoder step.
+        processed_mask_image (`Tensor`, *optional*):
+            The processed mask image
+
+    Outputs:
+        batch_size (`int`):
+            The batch size of the prompt embeddings
+        dtype (`dtype`):
+            The data type of the prompt embeddings
+        prompt_embeds (`Tensor`):
+            The prompt embeddings. (batch-expanded)
+        prompt_embeds_mask (`Tensor`):
+            The encoder attention mask. (batch-expanded)
+        negative_prompt_embeds (`Tensor`):
+            The negative prompt embeddings. (batch-expanded)
+        negative_prompt_embeds_mask (`Tensor`):
+            The negative prompt embeddings mask. (batch-expanded)
+        image_height (`int`):
+            The image height calculated from the image latents dimension
+        image_width (`int`):
+            The image width calculated from the image latents dimension
+        height (`int`):
+            if not provided, updated to image height
+        width (`int`):
+            if not provided, updated to image width
+        image_latents (`Tensor`):
+            image latents used to guide the image generation. Can be generated from vae_encoder step. (patchified and
+            batch-expanded)
+        processed_mask_image (`Tensor`):
+            The processed mask image (batch-expanded)
+    """
+
     model_name = "qwenimage-edit"
     block_classes = [
         QwenImageTextInputsStep(),
         QwenImageAdditionalInputsStep(
-            image_latent_inputs=["image_latents"], additional_batch_inputs=["processed_mask_image"]
+            additional_batch_inputs=[
+                InputParam(name="processed_mask_image", type_hint=torch.Tensor, description="The processed mask image")
+            ]
         ),
     ]
     block_names = ["text_inputs", "additional_inputs"]
@@ -174,7 +369,42 @@ class QwenImageEditInpaintInputStep(SequentialPipelineBlocks):


 # assemble prepare latents steps
+# auto_docstring
 class QwenImageEditInpaintPrepareLatentsStep(SequentialPipelineBlocks):
+    """
+    This step prepares the latents/image_latents and mask inputs for the edit inpainting denoising step. It:
+        - Add noise to the image latents to create the latents input for the denoiser.
+        - Create the patchified latents `mask` based on the processed mask image.
+
+    Components:
+        scheduler (`FlowMatchEulerDiscreteScheduler`) pachifier (`QwenImagePachifier`)
+
+    Inputs:
+        latents (`Tensor`):
+            The initial random noised, can be generated in prepare latent step.
+        image_latents (`Tensor`):
+            image latents used to guide the image generation. Can be generated from vae_encoder step. (Can be
+            generated from vae encoder and updated in input step.)
+        timesteps (`Tensor`):
+            The timesteps to use for the denoising process. Can be generated in set_timesteps step.
+        processed_mask_image (`Tensor`):
+            The processed mask to use for the inpainting process.
+        height (`int`):
+            The height in pixels of the generated image.
+        width (`int`):
+            The width in pixels of the generated image.
+        dtype (`dtype`, *optional*, defaults to torch.float32):
+            The dtype of the model inputs, can be generated in input step.
+
+    Outputs:
+        initial_noise (`Tensor`):
+            The initial random noised used for inpainting denoising.
+        latents (`Tensor`):
+            The scaled noisy latents to use for inpainting/image-to-image denoising.
+        mask (`Tensor`):
+            The mask to use for the inpainting process.
+    """
+
     model_name = "qwenimage-edit"
     block_classes = [QwenImagePrepareLatentsWithStrengthStep(), QwenImageCreateMaskLatentsStep()]
     block_names = ["add_noise_to_latents", "create_mask_latents"]
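The "add noise" half of this step corresponds to the usual flow-matching image-to-image start. A rough sketch using the scheduler named in the Components list (the block's exact call pattern is not shown in this diff, so treat this as an approximation):

    import torch
    from diffusers import FlowMatchEulerDiscreteScheduler

    scheduler = FlowMatchEulerDiscreteScheduler()
    scheduler.set_timesteps(50)

    image_latents = torch.randn(1, 16, 64, 64)   # stand-in for VAE-encoded image latents
    noise = torch.randn_like(image_latents)      # corresponds to the `initial_noise` output
    # Blend clean image latents toward noise at the starting timestep; with
    # strength < 1 the denoise loop then starts partway along the trajectory.
    latents = scheduler.scale_noise(image_latents, scheduler.timesteps[:1], noise)
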
@@ -189,7 +419,50 @@ class QwenImageEditInpaintPrepareLatentsStep(SequentialPipelineBlocks):


 # Qwen Image Edit (image2image) core denoise step
+# auto_docstring
 class QwenImageEditCoreDenoiseStep(SequentialPipelineBlocks):
+    """
+    Core denoising workflow for QwenImage-Edit edit (img2img) task.
+
+    Components:
+        pachifier (`QwenImagePachifier`) scheduler (`FlowMatchEulerDiscreteScheduler`) guider
+        (`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`)
+
+    Inputs:
+        num_images_per_prompt (`int`, *optional*, defaults to 1):
+            The number of images to generate per prompt.
+        prompt_embeds (`Tensor`):
+            text embeddings used to guide the image generation. Can be generated from text_encoder step.
+        prompt_embeds_mask (`Tensor`):
+            mask for the text embeddings. Can be generated from text_encoder step.
+        negative_prompt_embeds (`Tensor`, *optional*):
+            negative text embeddings used to guide the image generation. Can be generated from text_encoder step.
+        negative_prompt_embeds_mask (`Tensor`, *optional*):
+            mask for the negative text embeddings. Can be generated from text_encoder step.
+        height (`int`, *optional*):
+            The height in pixels of the generated image.
+        width (`int`, *optional*):
+            The width in pixels of the generated image.
+        image_latents (`Tensor`):
+            image latents used to guide the image generation. Can be generated from vae_encoder step.
+        latents (`Tensor`, *optional*):
+            Pre-generated noisy latents for image generation.
+        generator (`Generator`, *optional*):
+            Torch generator for deterministic generation.
+        num_inference_steps (`int`, *optional*, defaults to 50):
+            The number of denoising steps.
+        sigmas (`List`, *optional*):
+            Custom sigmas for the denoising process.
+        attention_kwargs (`Dict`, *optional*):
+            Additional kwargs for attention processors.
+        **denoiser_input_fields (`None`, *optional*):
+            conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
+
+    Outputs:
+        latents (`Tensor`):
+            Denoised latents.
+    """
+
     model_name = "qwenimage-edit"
     block_classes = [
         QwenImageEditInputStep(),
@@ -212,9 +485,62 @@ class QwenImageEditCoreDenoiseStep(SequentialPipelineBlocks):
     def description(self):
         return "Core denoising workflow for QwenImage-Edit edit (img2img) task."

+    @property
+    def outputs(self):
+        return [
+            OutputParam.template("latents"),
+        ]
+

 # Qwen Image Edit (inpainting) core denoise step
+# auto_docstring
 class QwenImageEditInpaintCoreDenoiseStep(SequentialPipelineBlocks):
+    """
+    Core denoising workflow for QwenImage-Edit edit inpaint task.
+
+    Components:
+        pachifier (`QwenImagePachifier`) scheduler (`FlowMatchEulerDiscreteScheduler`) guider
+        (`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`)
+
+    Inputs:
+        num_images_per_prompt (`int`, *optional*, defaults to 1):
+            The number of images to generate per prompt.
+        prompt_embeds (`Tensor`):
+            text embeddings used to guide the image generation. Can be generated from text_encoder step.
+        prompt_embeds_mask (`Tensor`):
+            mask for the text embeddings. Can be generated from text_encoder step.
+        negative_prompt_embeds (`Tensor`, *optional*):
+            negative text embeddings used to guide the image generation. Can be generated from text_encoder step.
+        negative_prompt_embeds_mask (`Tensor`, *optional*):
+            mask for the negative text embeddings. Can be generated from text_encoder step.
+        height (`int`, *optional*):
+            The height in pixels of the generated image.
+        width (`int`, *optional*):
+            The width in pixels of the generated image.
+        image_latents (`Tensor`):
+            image latents used to guide the image generation. Can be generated from vae_encoder step.
+        processed_mask_image (`Tensor`, *optional*):
+            The processed mask image
+        latents (`Tensor`, *optional*):
+            Pre-generated noisy latents for image generation.
+        generator (`Generator`, *optional*):
+            Torch generator for deterministic generation.
+        num_inference_steps (`int`, *optional*, defaults to 50):
+            The number of denoising steps.
+        sigmas (`List`, *optional*):
+            Custom sigmas for the denoising process.
+        strength (`float`, *optional*, defaults to 0.9):
+            Strength for img2img/inpainting.
+        attention_kwargs (`Dict`, *optional*):
+            Additional kwargs for attention processors.
+        **denoiser_input_fields (`None`, *optional*):
+            conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
+
+    Outputs:
+        latents (`Tensor`):
+            Denoised latents.
+    """
+
     model_name = "qwenimage-edit"
     block_classes = [
         QwenImageEditInpaintInputStep(),
@@ -239,6 +565,12 @@ class QwenImageEditInpaintCoreDenoiseStep(SequentialPipelineBlocks):
     def description(self):
         return "Core denoising workflow for QwenImage-Edit edit inpaint task."

+    @property
+    def outputs(self):
+        return [
+            OutputParam.template("latents"),
+        ]
+

 # Auto core denoise step for QwenImage Edit
 class QwenImageEditAutoCoreDenoiseStep(ConditionalPipelineBlocks):
@@ -267,6 +599,12 @@ class QwenImageEditAutoCoreDenoiseStep(ConditionalPipelineBlocks):
             "Supports edit (img2img) and edit inpainting tasks for QwenImage-Edit."
         )

+    @property
+    def outputs(self):
+        return [
+            OutputParam.template("latents"),
+        ]
+

 # ====================
 # 4. DECODE
@@ -274,7 +612,26 @@ class QwenImageEditAutoCoreDenoiseStep(ConditionalPipelineBlocks):


 # Decode step (standard)
+# auto_docstring
 class QwenImageEditDecodeStep(SequentialPipelineBlocks):
+    """
+    Decode step that decodes the latents to images and postprocess the generated image.
+
+    Components:
+        vae (`AutoencoderKLQwenImage`) image_processor (`VaeImageProcessor`)
+
+    Inputs:
+        latents (`Tensor`):
+            The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise
+            step.
+        output_type (`str`, *optional*, defaults to pil):
+            Output format: 'pil', 'np', 'pt'.
+
+    Outputs:
+        images (`List`):
+            Generated images. (tensor output of the vae decoder.)
+    """
+
     model_name = "qwenimage-edit"
     block_classes = [QwenImageDecoderStep(), QwenImageProcessImagesOutputStep()]
     block_names = ["decode", "postprocess"]
@@ -285,7 +642,30 @@ class QwenImageEditDecodeStep(SequentialPipelineBlocks):


 # Inpaint decode step
+# auto_docstring
 class QwenImageEditInpaintDecodeStep(SequentialPipelineBlocks):
+    """
+    Decode step that decodes the latents to images and postprocess the generated image, optionally apply the mask
+    overlay to the original image.
+
+    Components:
+        vae (`AutoencoderKLQwenImage`) image_mask_processor (`InpaintProcessor`)
+
+    Inputs:
+        latents (`Tensor`):
+            The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise
+            step.
+        output_type (`str`, *optional*, defaults to pil):
+            Output format: 'pil', 'np', 'pt'.
+        mask_overlay_kwargs (`Dict`, *optional*):
+            The kwargs for the postprocess step to apply the mask overlay. generated in
+            InpaintProcessImagesInputStep.
+
+    Outputs:
+        images (`List`):
+            Generated images. (tensor output of the vae decoder.)
+    """
+
     model_name = "qwenimage-edit"
     block_classes = [QwenImageDecoderStep(), QwenImageInpaintProcessImagesOutputStep()]
     block_names = ["decode", "postprocess"]
@@ -313,9 +693,7 @@ class QwenImageEditAutoDecodeStep(AutoPipelineBlocks):
     @property
     def outputs(self):
         return [
-            OutputParam(
-                name="latents", type_hint=torch.Tensor, description="The latents generated by the denoising step"
-            ),
+            OutputParam.template("latents"),
         ]

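Judging from the removed lines here and in the other outputs properties in this diff, `OutputParam.template(...)` is shorthand for the verbose declaration it replaces, looking up a canonical definition by name:

    # verbose form (removed)
    OutputParam(name="latents", type_hint=torch.Tensor, description="The latents generated by the denoising step")
    # template form (added)
    OutputParam.template("latents")
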
@@ -333,7 +711,66 @@ EDIT_AUTO_BLOCKS = InsertableDict(
 )


+# auto_docstring
 class QwenImageEditAutoBlocks(SequentialPipelineBlocks):
+    """
+    Auto Modular pipeline for edit (img2img) and edit inpaint tasks using QwenImage-Edit.
+        - for edit (img2img) generation, you need to provide `image`
+        - for edit inpainting, you need to provide `mask_image` and `image`, optionally you can provide
+          `padding_mask_crop`
+
+    Components:
+        image_resize_processor (`VaeImageProcessor`) text_encoder (`Qwen2_5_VLForConditionalGeneration`) processor
+        (`Qwen2VLProcessor`) guider (`ClassifierFreeGuidance`) image_mask_processor (`InpaintProcessor`) vae
+        (`AutoencoderKLQwenImage`) image_processor (`VaeImageProcessor`) pachifier (`QwenImagePachifier`) scheduler
+        (`FlowMatchEulerDiscreteScheduler`) transformer (`QwenImageTransformer2DModel`)
+
+    Inputs:
+        image (`Union[Image, List]`):
+            Reference image(s) for denoising. Can be a single image or list of images.
+        prompt (`str`):
+            The prompt or prompts to guide image generation.
+        negative_prompt (`str`, *optional*):
+            The prompt or prompts not to guide the image generation.
+        mask_image (`Image`, *optional*):
+            Mask image for inpainting.
+        padding_mask_crop (`int`, *optional*):
+            Padding for mask cropping in inpainting.
+        generator (`Generator`, *optional*):
+            Torch generator for deterministic generation.
+        num_images_per_prompt (`int`, *optional*, defaults to 1):
+            The number of images to generate per prompt.
+        height (`int`):
+            The height in pixels of the generated image.
+        width (`int`):
+            The width in pixels of the generated image.
+        image_latents (`Tensor`):
+            image latents used to guide the image generation. Can be generated from vae_encoder step.
+        processed_mask_image (`Tensor`, *optional*):
+            The processed mask image
+        latents (`Tensor`):
+            Pre-generated noisy latents for image generation.
+        num_inference_steps (`int`):
+            The number of denoising steps.
+        sigmas (`List`, *optional*):
+            Custom sigmas for the denoising process.
+        strength (`float`, *optional*, defaults to 0.9):
+            Strength for img2img/inpainting.
+        attention_kwargs (`Dict`, *optional*):
+            Additional kwargs for attention processors.
+        **denoiser_input_fields (`None`, *optional*):
+            conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
+        output_type (`str`, *optional*, defaults to pil):
+            Output format: 'pil', 'np', 'pt'.
+        mask_overlay_kwargs (`Dict`, *optional*):
+            The kwargs for the postprocess step to apply the mask overlay. generated in
+            InpaintProcessImagesInputStep.
+
+    Outputs:
+        images (`List`):
+            Generated images.
+    """
+
     model_name = "qwenimage-edit"
     block_classes = EDIT_AUTO_BLOCKS.values()
     block_names = EDIT_AUTO_BLOCKS.keys()
@@ -349,5 +786,5 @@ class QwenImageEditAutoBlocks(SequentialPipelineBlocks):
     @property
     def outputs(self):
         return [
-            OutputParam(name="images", type_hint=List[List[PIL.Image.Image]], description="The generated images"),
+            OutputParam.template("images"),
         ]

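For orientation, a hypothetical end-to-end sketch of driving these auto blocks; the repo id, loading calls, and URL below are assumptions for illustration, not part of this diff:

    import torch
    from diffusers.modular_pipelines import QwenImageEditAutoBlocks  # import path assumed
    from diffusers.utils import load_image

    blocks = QwenImageEditAutoBlocks()
    pipe = blocks.init_pipeline("Qwen/Qwen-Image-Edit")  # hypothetical repo id
    pipe.load_default_components(torch_dtype=torch.bfloat16)
    pipe.to("cuda")

    image = load_image("https://example.com/input.png")  # placeholder URL
    # edit (img2img): image + prompt; also pass mask_image for edit inpainting
    images = pipe(image=image, prompt="turn the cat into a tiger", output="images")
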
@@ -12,11 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-from typing import List
-
-import PIL.Image
-import torch
-
 from ...utils import logging
 from ..modular_pipeline import SequentialPipelineBlocks
 from ..modular_pipeline_utils import InsertableDict, OutputParam
@@ -53,12 +48,41 @@ logger = logging.get_logger(__name__)
 # ====================


+# auto_docstring
 class QwenImageEditPlusVLEncoderStep(SequentialPipelineBlocks):
-    """VL encoder that takes both image and text prompts. Uses 384x384 target area."""
+    """
+    QwenImage-Edit Plus VL encoder step that encodes the image and text prompts together.
+
+    Components:
+        image_resize_processor (`VaeImageProcessor`) text_encoder (`Qwen2_5_VLForConditionalGeneration`) processor
+        (`Qwen2VLProcessor`) guider (`ClassifierFreeGuidance`)
+
+    Inputs:
+        image (`Union[Image, List]`):
+            Reference image(s) for denoising. Can be a single image or list of images.
+        prompt (`str`):
+            The prompt or prompts to guide image generation.
+        negative_prompt (`str`, *optional*):
+            The prompt or prompts not to guide the image generation.
+
+    Outputs:
+        resized_image (`List`):
+            Images resized to 1024x1024 target area for VAE encoding
+        resized_cond_image (`List`):
+            Images resized to 384x384 target area for VL text encoding
+        prompt_embeds (`Tensor`):
+            The prompt embeddings.
+        prompt_embeds_mask (`Tensor`):
+            The encoder attention mask.
+        negative_prompt_embeds (`Tensor`):
+            The negative prompt embeddings.
+        negative_prompt_embeds_mask (`Tensor`):
+            The negative prompt embeddings mask.
+    """
+
     model_name = "qwenimage-edit-plus"
     block_classes = [
-        QwenImageEditPlusResizeStep(target_area=384 * 384, output_name="resized_cond_image"),
+        QwenImageEditPlusResizeStep(),
         QwenImageEditPlusTextEncoderStep(),
     ]
     block_names = ["resize", "encode"]
@@ -73,12 +97,36 @@ class QwenImageEditPlusVLEncoderStep(SequentialPipelineBlocks):
 # ====================


+# auto_docstring
 class QwenImageEditPlusVaeEncoderStep(SequentialPipelineBlocks):
-    """VAE encoder that handles multiple images with different sizes. Uses 1024x1024 target area."""
+    """
+    VAE encoder step that encodes image inputs into latent representations.
+    Each image is resized independently based on its own aspect ratio to 1024x1024 target area.
+
+    Components:
+        image_resize_processor (`VaeImageProcessor`) image_processor (`VaeImageProcessor`) vae
+        (`AutoencoderKLQwenImage`)
+
+    Inputs:
+        image (`Union[Image, List]`):
+            Reference image(s) for denoising. Can be a single image or list of images.
+        generator (`Generator`, *optional*):
+            Torch generator for deterministic generation.
+
+    Outputs:
+        resized_image (`List`):
+            Images resized to 1024x1024 target area for VAE encoding
+        resized_cond_image (`List`):
+            Images resized to 384x384 target area for VL text encoding
+        processed_image (`Tensor`):
+            The processed image
+        image_latents (`Tensor`):
+            The latent representation of the input image.
+    """
+
     model_name = "qwenimage-edit-plus"
     block_classes = [
-        QwenImageEditPlusResizeStep(target_area=1024 * 1024, output_name="resized_image"),
+        QwenImageEditPlusResizeStep(),
         QwenImageEditPlusProcessImagesInputStep(),
         QwenImageVaeEncoderStep(),
     ]
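The two target areas above (384x384 for the VL branch, 1024x1024 for the VAE branch) are areas, not fixed sizes: each image keeps its own aspect ratio. A sketch of that kind of resize rule (the step's exact rounding and size multiple are assumptions here):

    import math

    def resize_to_target_area(width: int, height: int, target_area: int = 1024 * 1024, multiple: int = 32):
        # Scale so that width * height is approximately target_area while preserving
        # aspect ratio, then snap both sides to a multiple for latent/patch alignment.
        scale = math.sqrt(target_area / (width * height))
        new_w = max(multiple, round(width * scale / multiple) * multiple)
        new_h = max(multiple, round(height * scale / multiple) * multiple)
        return new_w, new_h
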
@@ -98,11 +146,66 @@ class QwenImageEditPlusVaeEncoderStep(SequentialPipelineBlocks):


 # assemble input steps
+# auto_docstring
 class QwenImageEditPlusInputStep(SequentialPipelineBlocks):
+    """
+    Input step that prepares the inputs for the Edit Plus denoising step. It:
+        - Standardizes text embeddings batch size.
+        - Processes list of image latents: patchifies, concatenates along dim=1, expands batch.
+        - Outputs lists of image_height/image_width for RoPE calculation.
+        - Defaults height/width from last image in the list.
+
+    Components:
+        pachifier (`QwenImagePachifier`)
+
+    Inputs:
+        num_images_per_prompt (`int`, *optional*, defaults to 1):
+            The number of images to generate per prompt.
+        prompt_embeds (`Tensor`):
+            text embeddings used to guide the image generation. Can be generated from text_encoder step.
+        prompt_embeds_mask (`Tensor`):
+            mask for the text embeddings. Can be generated from text_encoder step.
+        negative_prompt_embeds (`Tensor`, *optional*):
+            negative text embeddings used to guide the image generation. Can be generated from text_encoder step.
+        negative_prompt_embeds_mask (`Tensor`, *optional*):
+            mask for the negative text embeddings. Can be generated from text_encoder step.
+        height (`int`, *optional*):
+            The height in pixels of the generated image.
+        width (`int`, *optional*):
+            The width in pixels of the generated image.
+        image_latents (`Tensor`):
+            image latents used to guide the image generation. Can be generated from vae_encoder step.
+
+    Outputs:
+        batch_size (`int`):
+            The batch size of the prompt embeddings
+        dtype (`dtype`):
+            The data type of the prompt embeddings
+        prompt_embeds (`Tensor`):
+            The prompt embeddings. (batch-expanded)
+        prompt_embeds_mask (`Tensor`):
+            The encoder attention mask. (batch-expanded)
+        negative_prompt_embeds (`Tensor`):
+            The negative prompt embeddings. (batch-expanded)
+        negative_prompt_embeds_mask (`Tensor`):
+            The negative prompt embeddings mask. (batch-expanded)
+        image_height (`List`):
+            The image heights calculated from the image latents dimension
+        image_width (`List`):
+            The image widths calculated from the image latents dimension
+        height (`int`):
+            if not provided, updated to image height
+        width (`int`):
+            if not provided, updated to image width
+        image_latents (`Tensor`):
+            image latents used to guide the image generation. Can be generated from vae_encoder step. (patchified,
+            concatenated, and batch-expanded)
+    """
+
     model_name = "qwenimage-edit-plus"
     block_classes = [
         QwenImageTextInputsStep(),
-        QwenImageEditPlusAdditionalInputsStep(image_latent_inputs=["image_latents"]),
+        QwenImageEditPlusAdditionalInputsStep(),
     ]
     block_names = ["text_inputs", "additional_inputs"]

@@ -118,7 +221,50 @@ class QwenImageEditPlusInputStep(SequentialPipelineBlocks):


 # Qwen Image Edit Plus (image2image) core denoise step
+# auto_docstring
 class QwenImageEditPlusCoreDenoiseStep(SequentialPipelineBlocks):
+    """
+    Core denoising workflow for QwenImage-Edit Plus edit (img2img) task.
+
+    Components:
+        pachifier (`QwenImagePachifier`) scheduler (`FlowMatchEulerDiscreteScheduler`) guider
+        (`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`)
+
+    Inputs:
+        num_images_per_prompt (`int`, *optional*, defaults to 1):
+            The number of images to generate per prompt.
+        prompt_embeds (`Tensor`):
+            text embeddings used to guide the image generation. Can be generated from text_encoder step.
+        prompt_embeds_mask (`Tensor`):
+            mask for the text embeddings. Can be generated from text_encoder step.
+        negative_prompt_embeds (`Tensor`, *optional*):
+            negative text embeddings used to guide the image generation. Can be generated from text_encoder step.
+        negative_prompt_embeds_mask (`Tensor`, *optional*):
+            mask for the negative text embeddings. Can be generated from text_encoder step.
+        height (`int`, *optional*):
+            The height in pixels of the generated image.
+        width (`int`, *optional*):
+            The width in pixels of the generated image.
+        image_latents (`Tensor`):
+            image latents used to guide the image generation. Can be generated from vae_encoder step.
+        latents (`Tensor`, *optional*):
+            Pre-generated noisy latents for image generation.
+        generator (`Generator`, *optional*):
+            Torch generator for deterministic generation.
+        num_inference_steps (`int`, *optional*, defaults to 50):
+            The number of denoising steps.
+        sigmas (`List`, *optional*):
+            Custom sigmas for the denoising process.
+        attention_kwargs (`Dict`, *optional*):
+            Additional kwargs for attention processors.
+        **denoiser_input_fields (`None`, *optional*):
+            conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
+
+    Outputs:
+        latents (`Tensor`):
+            Denoised latents.
+    """
+
     model_name = "qwenimage-edit-plus"
     block_classes = [
         QwenImageEditPlusInputStep(),
@@ -144,9 +290,7 @@ class QwenImageEditPlusCoreDenoiseStep(SequentialPipelineBlocks):
     @property
     def outputs(self):
         return [
-            OutputParam(
-                name="latents", type_hint=torch.Tensor, description="The latents generated by the denoising step"
-            ),
+            OutputParam.template("latents"),
         ]

@@ -155,7 +299,26 @@ class QwenImageEditPlusCoreDenoiseStep(SequentialPipelineBlocks):
 # ====================


+# auto_docstring
 class QwenImageEditPlusDecodeStep(SequentialPipelineBlocks):
+    """
+    Decode step that decodes the latents to images and postprocesses the generated image.
+
+    Components:
+        vae (`AutoencoderKLQwenImage`) image_processor (`VaeImageProcessor`)
+
+    Inputs:
+        latents (`Tensor`):
+            The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise
+            step.
+        output_type (`str`, *optional*, defaults to pil):
+            Output format: 'pil', 'np', 'pt'.
+
+    Outputs:
+        images (`List`):
+            Generated images. (tensor output of the vae decoder.)
+    """
+
     model_name = "qwenimage-edit-plus"
     block_classes = [QwenImageDecoderStep(), QwenImageProcessImagesOutputStep()]
     block_names = ["decode", "postprocess"]
@@ -179,7 +342,53 @@ EDIT_PLUS_AUTO_BLOCKS = InsertableDict(
 )


+# auto_docstring
 class QwenImageEditPlusAutoBlocks(SequentialPipelineBlocks):
+    """
+    Auto Modular pipeline for edit (img2img) tasks using QwenImage-Edit Plus.
+        - `image` is required input (can be single image or list of images).
+        - Each image is resized independently based on its own aspect ratio.
+        - VL encoder uses 384x384 target area, VAE encoder uses 1024x1024 target area.
+
+    Components:
+        image_resize_processor (`VaeImageProcessor`) text_encoder (`Qwen2_5_VLForConditionalGeneration`) processor
+        (`Qwen2VLProcessor`) guider (`ClassifierFreeGuidance`) image_processor (`VaeImageProcessor`) vae
+        (`AutoencoderKLQwenImage`) pachifier (`QwenImagePachifier`) scheduler (`FlowMatchEulerDiscreteScheduler`)
+        transformer (`QwenImageTransformer2DModel`)
+
+    Inputs:
+        image (`Union[Image, List]`):
+            Reference image(s) for denoising. Can be a single image or list of images.
+        prompt (`str`):
+            The prompt or prompts to guide image generation.
+        negative_prompt (`str`, *optional*):
+            The prompt or prompts not to guide the image generation.
+        generator (`Generator`, *optional*):
+            Torch generator for deterministic generation.
+        num_images_per_prompt (`int`, *optional*, defaults to 1):
+            The number of images to generate per prompt.
+        height (`int`, *optional*):
+            The height in pixels of the generated image.
+        width (`int`, *optional*):
+            The width in pixels of the generated image.
+        latents (`Tensor`, *optional*):
+            Pre-generated noisy latents for image generation.
+        num_inference_steps (`int`, *optional*, defaults to 50):
+            The number of denoising steps.
+        sigmas (`List`, *optional*):
+            Custom sigmas for the denoising process.
+        attention_kwargs (`Dict`, *optional*):
+            Additional kwargs for attention processors.
+        **denoiser_input_fields (`None`, *optional*):
+            conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
+        output_type (`str`, *optional*, defaults to pil):
+            Output format: 'pil', 'np', 'pt'.
+
+    Outputs:
+        images (`List`):
+            Generated images.
+    """
+
     model_name = "qwenimage-edit-plus"
     block_classes = EDIT_PLUS_AUTO_BLOCKS.values()
     block_names = EDIT_PLUS_AUTO_BLOCKS.keys()
@@ -196,5 +405,5 @@ class QwenImageEditPlusAutoBlocks(SequentialPipelineBlocks):
     @property
     def outputs(self):
         return [
-            OutputParam(name="images", type_hint=List[List[PIL.Image.Image]], description="The generated images"),
+            OutputParam.template("images"),
         ]

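Since Edit Plus accepts a list of reference images (each resized independently, with latents concatenated along dim=1 per the input-step docstring), a call might look like this sketch, assuming a pipeline built from QwenImageEditPlusAutoBlocks the same way as the earlier edit sketch:

    images = [load_image("left.png"), load_image("right.png")]  # placeholder paths
    out = pipe(image=images, prompt="blend the two scenes", output="images")
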
@@ -12,12 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-from typing import List
-
-import PIL.Image
-import torch
-
 from ...utils import logging
 from ..modular_pipeline import SequentialPipelineBlocks
 from ..modular_pipeline_utils import InsertableDict, OutputParam
@@ -55,8 +49,44 @@ logger = logging.get_logger(__name__)
 # ====================


+# auto_docstring
 class QwenImageLayeredTextEncoderStep(SequentialPipelineBlocks):
-    """Text encoder that takes text prompt, will generate a prompt based on image if not provided."""
+    """
+    QwenImage-Layered Text encoder step that encode the text prompt, will generate a prompt based on image if not
+    provided.
+
+    Components:
+        image_resize_processor (`VaeImageProcessor`) text_encoder (`Qwen2_5_VLForConditionalGeneration`) processor
+        (`Qwen2VLProcessor`) tokenizer (`Qwen2Tokenizer`): The tokenizer to use guider (`ClassifierFreeGuidance`)
+
+    Inputs:
+        image (`Union[Image, List]`):
+            Reference image(s) for denoising. Can be a single image or list of images.
+        resolution (`int`, *optional*, defaults to 640):
+            The target area to resize the image to, can be 1024 or 640
+        prompt (`str`, *optional*):
+            The prompt or prompts to guide image generation.
+        use_en_prompt (`bool`, *optional*, defaults to False):
+            Whether to use English prompt template
+        negative_prompt (`str`, *optional*):
+            The prompt or prompts not to guide the image generation.
+        max_sequence_length (`int`, *optional*, defaults to 1024):
+            Maximum sequence length for prompt encoding.
+
+    Outputs:
+        resized_image (`List`):
+            The resized images
+        prompt (`str`):
+            The prompt or prompts to guide image generation. If not provided, updated using image caption
+        prompt_embeds (`Tensor`):
+            The prompt embeddings.
+        prompt_embeds_mask (`Tensor`):
+            The encoder attention mask.
+        negative_prompt_embeds (`Tensor`):
+            The negative prompt embeddings.
+        negative_prompt_embeds_mask (`Tensor`):
+            The negative prompt embeddings mask.
+    """
+
     model_name = "qwenimage-layered"
     block_classes = [
@@ -77,7 +107,32 @@ class QwenImageLayeredTextEncoderStep(SequentialPipelineBlocks):


 # Edit VAE encoder
+# auto_docstring
 class QwenImageLayeredVaeEncoderStep(SequentialPipelineBlocks):
+    """
+    Vae encoder step that encode the image inputs into their latent representations.
+
+    Components:
+        image_resize_processor (`VaeImageProcessor`) image_processor (`VaeImageProcessor`) vae
+        (`AutoencoderKLQwenImage`)
+
+    Inputs:
+        image (`Union[Image, List]`):
+            Reference image(s) for denoising. Can be a single image or list of images.
+        resolution (`int`, *optional*, defaults to 640):
+            The target area to resize the image to, can be 1024 or 640
+        generator (`Generator`, *optional*):
+            Torch generator for deterministic generation.
+
+    Outputs:
+        resized_image (`List`):
+            The resized images
+        processed_image (`Tensor`):
+            The processed image
+        image_latents (`Tensor`):
+            The latent representation of the input image.
+    """
+
     model_name = "qwenimage-layered"
     block_classes = [
         QwenImageLayeredResizeStep(),
@@ -98,11 +153,60 @@ class QwenImageLayeredVaeEncoderStep(SequentialPipelineBlocks):


 # assemble input steps
+# auto_docstring
 class QwenImageLayeredInputStep(SequentialPipelineBlocks):
+    """
+    Input step that prepares the inputs for the layered denoising step. It:
+        - make sure the text embeddings have consistent batch size as well as the additional inputs.
+        - update height/width based `image_latents`, patchify `image_latents`.
+
+    Components:
+        pachifier (`QwenImageLayeredPachifier`)
+
+    Inputs:
+        num_images_per_prompt (`int`, *optional*, defaults to 1):
+            The number of images to generate per prompt.
+        prompt_embeds (`Tensor`):
+            text embeddings used to guide the image generation. Can be generated from text_encoder step.
+        prompt_embeds_mask (`Tensor`):
+            mask for the text embeddings. Can be generated from text_encoder step.
+        negative_prompt_embeds (`Tensor`, *optional*):
+            negative text embeddings used to guide the image generation. Can be generated from text_encoder step.
+        negative_prompt_embeds_mask (`Tensor`, *optional*):
+            mask for the negative text embeddings. Can be generated from text_encoder step.
+        image_latents (`Tensor`):
+            image latents used to guide the image generation. Can be generated from vae_encoder step.
+
+    Outputs:
+        batch_size (`int`):
+            The batch size of the prompt embeddings
+        dtype (`dtype`):
+            The data type of the prompt embeddings
+        prompt_embeds (`Tensor`):
+            The prompt embeddings. (batch-expanded)
+        prompt_embeds_mask (`Tensor`):
+            The encoder attention mask. (batch-expanded)
+        negative_prompt_embeds (`Tensor`):
+            The negative prompt embeddings. (batch-expanded)
+        negative_prompt_embeds_mask (`Tensor`):
+            The negative prompt embeddings mask. (batch-expanded)
+        image_height (`int`):
+            The image height calculated from the image latents dimension
+        image_width (`int`):
+            The image width calculated from the image latents dimension
+        height (`int`):
+            if not provided, updated to image height
+        width (`int`):
+            if not provided, updated to image width
+        image_latents (`Tensor`):
+            image latents used to guide the image generation. Can be generated from vae_encoder step. (patchified
+            with layered pachifier and batch-expanded)
+    """
+
     model_name = "qwenimage-layered"
     block_classes = [
         QwenImageTextInputsStep(),
-        QwenImageLayeredAdditionalInputsStep(image_latent_inputs=["image_latents"]),
+        QwenImageLayeredAdditionalInputsStep(),
     ]
     block_names = ["text_inputs", "additional_inputs"]

@@ -116,7 +220,48 @@ class QwenImageLayeredInputStep(SequentialPipelineBlocks):


 # Qwen Image Layered (image2image) core denoise step
+# auto_docstring
 class QwenImageLayeredCoreDenoiseStep(SequentialPipelineBlocks):
+    """
+    Core denoising workflow for QwenImage-Layered img2img task.
+
+    Components:
+        pachifier (`QwenImageLayeredPachifier`) scheduler (`FlowMatchEulerDiscreteScheduler`) guider
+        (`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`)
+
+    Inputs:
+        num_images_per_prompt (`int`, *optional*, defaults to 1):
+            The number of images to generate per prompt.
+        prompt_embeds (`Tensor`):
+            text embeddings used to guide the image generation. Can be generated from text_encoder step.
+        prompt_embeds_mask (`Tensor`):
+            mask for the text embeddings. Can be generated from text_encoder step.
+        negative_prompt_embeds (`Tensor`, *optional*):
+            negative text embeddings used to guide the image generation. Can be generated from text_encoder step.
+        negative_prompt_embeds_mask (`Tensor`, *optional*):
+            mask for the negative text embeddings. Can be generated from text_encoder step.
+        image_latents (`Tensor`):
+            image latents used to guide the image generation. Can be generated from vae_encoder step.
+        latents (`Tensor`, *optional*):
+            Pre-generated noisy latents for image generation.
+        layers (`int`, *optional*, defaults to 4):
+            Number of layers to extract from the image
+        generator (`Generator`, *optional*):
+            Torch generator for deterministic generation.
+        num_inference_steps (`int`, *optional*, defaults to 50):
+            The number of denoising steps.
+        sigmas (`List`, *optional*):
+            Custom sigmas for the denoising process.
+        attention_kwargs (`Dict`, *optional*):
+            Additional kwargs for attention processors.
+        **denoiser_input_fields (`None`, *optional*):
+            conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
+
+    Outputs:
+        latents (`Tensor`):
+            Denoised latents.
+    """
+
     model_name = "qwenimage-layered"
     block_classes = [
         QwenImageLayeredInputStep(),
@@ -142,9 +287,7 @@ class QwenImageLayeredCoreDenoiseStep(SequentialPipelineBlocks):
     @property
     def outputs(self):
         return [
-            OutputParam(
-                name="latents", type_hint=torch.Tensor, description="The latents generated by the denoising step"
-            ),
+            OutputParam.template("latents"),
         ]

@@ -162,7 +305,54 @@ LAYERED_AUTO_BLOCKS = InsertableDict(
 )


+# auto_docstring
 class QwenImageLayeredAutoBlocks(SequentialPipelineBlocks):
+    """
+    Auto Modular pipeline for layered denoising tasks using QwenImage-Layered.
+
+    Components:
+        image_resize_processor (`VaeImageProcessor`) text_encoder (`Qwen2_5_VLForConditionalGeneration`) processor
+        (`Qwen2VLProcessor`) tokenizer (`Qwen2Tokenizer`): The tokenizer to use guider (`ClassifierFreeGuidance`)
+        image_processor (`VaeImageProcessor`) vae (`AutoencoderKLQwenImage`) pachifier (`QwenImageLayeredPachifier`)
+        scheduler (`FlowMatchEulerDiscreteScheduler`) transformer (`QwenImageTransformer2DModel`)
+
+    Inputs:
+        image (`Union[Image, List]`):
+            Reference image(s) for denoising. Can be a single image or list of images.
+        resolution (`int`, *optional*, defaults to 640):
+            The target area to resize the image to, can be 1024 or 640
+        prompt (`str`, *optional*):
+            The prompt or prompts to guide image generation.
+        use_en_prompt (`bool`, *optional*, defaults to False):
+            Whether to use English prompt template
+        negative_prompt (`str`, *optional*):
+            The prompt or prompts not to guide the image generation.
+        max_sequence_length (`int`, *optional*, defaults to 1024):
+            Maximum sequence length for prompt encoding.
+        generator (`Generator`, *optional*):
+            Torch generator for deterministic generation.
+        num_images_per_prompt (`int`, *optional*, defaults to 1):
+            The number of images to generate per prompt.
+        latents (`Tensor`, *optional*):
+            Pre-generated noisy latents for image generation.
+        layers (`int`, *optional*, defaults to 4):
+            Number of layers to extract from the image
+        num_inference_steps (`int`, *optional*, defaults to 50):
+            The number of denoising steps.
+        sigmas (`List`, *optional*):
+            Custom sigmas for the denoising process.
+        attention_kwargs (`Dict`, *optional*):
+            Additional kwargs for attention processors.
+        **denoiser_input_fields (`None`, *optional*):
+            conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
+        output_type (`str`, *optional*, defaults to pil):
+            Output format: 'pil', 'np', 'pt'.
+
+    Outputs:
+        images (`List`):
+            Generated images.
+    """
+
     model_name = "qwenimage-layered"
     block_classes = LAYERED_AUTO_BLOCKS.values()
     block_names = LAYERED_AUTO_BLOCKS.keys()
@@ -174,5 +364,5 @@ class QwenImageLayeredAutoBlocks(SequentialPipelineBlocks):
     @property
     def outputs(self):
         return [
-            OutputParam(name="images", type_hint=List[List[PIL.Image.Image]], description="The generated images"),
+            OutputParam.template("images"),
         ]

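Relative to the edit pipelines, the layered auto blocks add a `layers` input (default 4) and can caption the image themselves when `prompt` is omitted; with a pipeline built and loaded as in the earlier sketches, a call might look like:

    out = pipe(image=image, resolution=640, layers=4, output="images")  # prompt optional
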
@@ -131,7 +131,7 @@ class ZImageLoopDenoiser(ModularPipelineBlocks):
         ),
         InputParam(
             kwargs_type="denoiser_input_fields",
-            description="conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.",
+            description="The conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.",
         ),
     ]
     guider_input_names = []
@@ -84,7 +84,6 @@ EXAMPLE_DOC_STRING = """
         >>> from diffusers import ControlNetModel, StableDiffusionXLControlNetImg2ImgPipeline, AutoencoderKL
         >>> from diffusers.utils import load_image

-
         >>> depth_estimator = DPTForDepthEstimation.from_pretrained("Intel/dpt-hybrid-midas").to("cuda")
         >>> feature_extractor = DPTImageProcessor.from_pretrained("Intel/dpt-hybrid-midas")
         >>> controlnet = ControlNetModel.from_pretrained(
@@ -53,7 +53,6 @@ EXAMPLE_DOC_STRING = """
         >>> from transformers import AutoTokenizer, LlamaForCausalLM
         >>> from diffusers import HiDreamImagePipeline

-
         >>> tokenizer_4 = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3.1-8B-Instruct")
         >>> text_encoder_4 = LlamaForCausalLM.from_pretrained(
         ...     "meta-llama/Meta-Llama-3.1-8B-Instruct",
@@ -85,7 +85,6 @@ EXAMPLE_DOC_STRING = """
         >>> from diffusers import ControlNetModel, StableDiffusionXLControlNetPAGImg2ImgPipeline, AutoencoderKL
         >>> from diffusers.utils import load_image

-
         >>> depth_estimator = DPTForDepthEstimation.from_pretrained("Intel/dpt-hybrid-midas").to("cuda")
         >>> feature_extractor = DPTFeatureExtractor.from_pretrained("Intel/dpt-hybrid-midas")
         >>> controlnet = ControlNetModel.from_pretrained(
@@ -459,7 +459,6 @@ class StableDiffusionLatentUpscalePipeline(DiffusionPipeline, StableDiffusionMix
         >>> from diffusers import StableDiffusionLatentUpscalePipeline, StableDiffusionPipeline
         >>> import torch

-
         >>> pipeline = StableDiffusionPipeline.from_pretrained(
         ...     "CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16
         ... )
300
utils/modular_auto_docstring.py
Normal file
300
utils/modular_auto_docstring.py
Normal file
@@ -0,0 +1,300 @@
|
|||||||
# coding=utf-8
# Copyright 2025 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Auto Docstring Generator for Modular Pipeline Blocks

This script scans Python files for classes that have a `# auto_docstring` comment above them
and inserts/updates the docstring from the class's `doc` property.

Run from the root of the repo:
    python utils/modular_auto_docstring.py [path] [--fix_and_overwrite]

Examples:
    # Check for auto_docstring markers (will error if found without proper docstring)
    python utils/modular_auto_docstring.py

    # Check specific directory
    python utils/modular_auto_docstring.py src/diffusers/modular_pipelines/

    # Fix and overwrite the docstrings
    python utils/modular_auto_docstring.py --fix_and_overwrite

Usage in code:
    # auto_docstring
    class QwenImageAutoVaeEncoderStep(AutoPipelineBlocks):
        # docstring will be automatically inserted here

        @property
        def doc(self):
            return "Your docstring content..."
"""
import argparse
import ast
import glob
import importlib
import os
import re
import sys


# All paths are set assuming you run this script from the root of the repo
DIFFUSERS_PATH = "src/diffusers"
REPO_PATH = "."

# Pattern to match the auto_docstring comment
AUTO_DOCSTRING_PATTERN = re.compile(r"^\s*#\s*auto_docstring\s*$")


def setup_diffusers_import():
    """Set up the import path to use the local diffusers module."""
    src_path = os.path.join(REPO_PATH, "src")
    if src_path not in sys.path:
        sys.path.insert(0, src_path)


def get_module_from_filepath(filepath: str) -> str:
    """Convert a filepath to a module name."""
    filepath = os.path.normpath(filepath)

    if filepath.startswith("src" + os.sep):
        filepath = filepath[4:]

    if filepath.endswith(".py"):
        filepath = filepath[:-3]

    module_name = filepath.replace(os.sep, ".")
    return module_name
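
# For example (path shown purely for illustration):
#   get_module_from_filepath("src/diffusers/modular_pipelines/modular_pipeline.py")
#   -> "diffusers.modular_pipelines.modular_pipeline"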


def load_module(filepath: str):
    """Load a module from filepath."""
    setup_diffusers_import()
    module_name = get_module_from_filepath(filepath)

    try:
        module = importlib.import_module(module_name)
        return module
    except Exception as e:
        print(f"Warning: Could not import module {module_name}: {e}")
        return None


def get_doc_from_class(module, class_name: str) -> str:
    """Get the doc property from an instantiated class."""
    if module is None:
        return None

    cls = getattr(module, class_name, None)
    if cls is None:
        return None

    try:
        instance = cls()
        if hasattr(instance, "doc"):
            return instance.doc
    except Exception as e:
        print(f"Warning: Could not instantiate {class_name}: {e}")

    return None


def find_auto_docstring_classes(filepath: str) -> list:
    """
    Find all classes in a file that have a # auto_docstring comment above them.

    Returns a list of (class_name, class_line_number, has_existing_docstring, docstring_end_line) tuples.
    """
    with open(filepath, "r", encoding="utf-8", newline="\n") as f:
        lines = f.readlines()

    # Parse AST to find class locations and their docstrings
    content = "".join(lines)
    try:
        tree = ast.parse(content)
    except SyntaxError as e:
        print(f"Syntax error in {filepath}: {e}")
        return []

    # Build a map of class_name -> (class_line, has_docstring, docstring_end_line)
    class_info = {}
    for node in ast.walk(tree):
        if isinstance(node, ast.ClassDef):
            has_docstring = False
            docstring_end_line = node.lineno  # default to class line

            if node.body and isinstance(node.body[0], ast.Expr):
                first_stmt = node.body[0]
                if isinstance(first_stmt.value, ast.Constant) and isinstance(first_stmt.value.value, str):
                    has_docstring = True
                    docstring_end_line = first_stmt.end_lineno or first_stmt.lineno

            class_info[node.name] = (node.lineno, has_docstring, docstring_end_line)

    # Now scan for # auto_docstring comments
    classes_to_update = []

    for i, line in enumerate(lines):
        if AUTO_DOCSTRING_PATTERN.match(line):
            # Found the marker, look for the class definition on the next non-empty, non-comment line
            j = i + 1
            while j < len(lines):
                next_line = lines[j].strip()
                if next_line and not next_line.startswith("#"):
                    break
                j += 1

            if j < len(lines) and lines[j].strip().startswith("class "):
                # Extract the class name
                match = re.match(r"class\s+(\w+)", lines[j].strip())
                if match:
                    class_name = match.group(1)
                    if class_name in class_info:
                        class_line, has_docstring, docstring_end_line = class_info[class_name]
                        classes_to_update.append((class_name, class_line, has_docstring, docstring_end_line))

    return classes_to_update


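# For reference, each tuple returned by find_auto_docstring_classes above has the form
# (class_name, class_line, has_existing_docstring, docstring_end_line), e.g. with
# illustrative values only:
#   ("QwenImageAutoVaeEncoderStep", 120, True, 135)

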
def strip_class_name_line(doc: str, class_name: str) -> str:
    """Remove the 'class ClassName' line from the doc if present."""
    lines = doc.strip().split("\n")
    if lines and lines[0].strip() == f"class {class_name}":
        # Remove the class line and any blank line following it
        lines = lines[1:]
        while lines and not lines[0].strip():
            lines = lines[1:]
    return "\n".join(lines)


def format_docstring(doc: str, indent: str = "    ") -> str:
    """Format a doc string as a properly indented docstring."""
    lines = doc.strip().split("\n")

    if len(lines) == 1:
        return f'{indent}"""{lines[0]}"""\n'
    else:
        result = [f'{indent}"""\n']
        for line in lines:
            if line.strip():
                result.append(f"{indent}{line}\n")
            else:
                result.append("\n")
        result.append(f'{indent}"""\n')
        return "".join(result)


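# For illustration (the doc text is made up):
#   format_docstring("Encode the prompt.\n\nExpects: prompt (str)")
# returns a 4-space-indented docstring:
#     """
#     Encode the prompt.
#
#     Expects: prompt (str)
#     """
# (blank input lines are kept as empty lines, without indentation).

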
def process_file(filepath: str, overwrite: bool = False) -> list:
    """
    Process a file and find/insert docstrings for # auto_docstring marked classes.

    Returns list of classes that need updating.
    """
    classes_to_update = find_auto_docstring_classes(filepath)

    if not classes_to_update:
        return []

    if not overwrite:
        # Just return the list of classes that need updating
        return [(filepath, cls_name, line) for cls_name, line, _, _ in classes_to_update]

    # Load the module to get doc properties
    module = load_module(filepath)

    with open(filepath, "r", encoding="utf-8", newline="\n") as f:
        lines = f.readlines()

    # Process in reverse order to maintain line numbers
    updated = False
    for class_name, class_line, has_docstring, docstring_end_line in reversed(classes_to_update):
        doc = get_doc_from_class(module, class_name)

        if doc is None:
            print(f"Warning: Could not get doc for {class_name} in {filepath}")
            continue

        # Remove the "class ClassName" line since it's redundant in a docstring
        doc = strip_class_name_line(doc, class_name)

        # Format the new docstring with 4-space indent
        new_docstring = format_docstring(doc, "    ")

        if has_docstring:
            # Replace existing docstring (line after class definition to docstring_end_line)
            # class_line is 1-indexed, we want to replace from class_line+1 to docstring_end_line
            lines = lines[:class_line] + [new_docstring] + lines[docstring_end_line:]
        else:
            # Insert new docstring right after class definition line
            # class_line is 1-indexed, so lines[class_line-1] is the class line
            # Insert at position class_line (which is right after the class line)
            lines = lines[:class_line] + [new_docstring] + lines[class_line:]

        updated = True
        print(f"Updated docstring for {class_name} in {filepath}")

    if updated:
        with open(filepath, "w", encoding="utf-8", newline="\n") as f:
            f.writelines(lines)

    return [(filepath, cls_name, line) for cls_name, line, _, _ in classes_to_update]


def check_auto_docstrings(path: str = None, overwrite: bool = False):
    """
    Check all files for # auto_docstring markers and optionally fix them.
    """
    if path is None:
        path = DIFFUSERS_PATH

    if os.path.isfile(path):
        all_files = [path]
    else:
        all_files = glob.glob(os.path.join(path, "**/*.py"), recursive=True)

    all_markers = []

    for filepath in all_files:
        markers = process_file(filepath, overwrite)
        all_markers.extend(markers)

    if not overwrite and len(all_markers) > 0:
        message = "\n".join([f"- {f}: {cls} at line {line}" for f, cls, line in all_markers])
        raise ValueError(
            f"Found the following # auto_docstring markers that need docstrings:\n{message}\n\n"
            f"Run `python utils/modular_auto_docstring.py --fix_and_overwrite` to fix them."
        )

    if overwrite and len(all_markers) > 0:
        print(f"\nUpdated {len(all_markers)} docstring(s).")
    elif len(all_markers) == 0:
        print("No # auto_docstring markers found.")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Check and fix # auto_docstring markers in modular pipeline blocks",
    )
    parser.add_argument("path", nargs="?", default=None, help="File or directory to process (default: src/diffusers)")
    parser.add_argument(
        "--fix_and_overwrite",
        action="store_true",
        help="Whether to fix the docstrings by inserting them from the doc property.",
    )

    args = parser.parse_args()

    check_auto_docstrings(args.path, args.fix_and_overwrite)
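
For illustration, a minimal sketch of the script's effect when run with --fix_and_overwrite (the block class, its doc text, and the file name below are made-up examples, not taken from this diff):

# before, in some_block.py: a marked class with no docstring
# auto_docstring
class MyExampleBlock(ModularPipelineBlocks):
    @property
    def doc(self):
        return "Run the example step.\n\nInputs: image (PIL.Image)"

# after `python utils/modular_auto_docstring.py some_block.py --fix_and_overwrite`,
# the formatted docstring is inserted directly under the class line:
# auto_docstring
class MyExampleBlock(ModularPipelineBlocks):
    """
    Run the example step.

    Inputs: image (PIL.Image)
    """
    @property
    def doc(self):
        return "Run the example step.\n\nInputs: image (PIL.Image)"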