Mirror of https://github.com/huggingface/diffusers.git (synced 2025-12-23 21:04:56 +08:00)

Compare commits: fix-torcha...modular-qw (3 commits: 19e2ce1b2d, a1af845169, 3a1ba1a0e2)
@@ -231,7 +231,7 @@ class BlockState:

 class ModularPipelineBlocks(ConfigMixin, PushToHubMixin):
     """
-    Base class for all Pipeline Blocks: PipelineBlock, AutoPipelineBlocks, SequentialPipelineBlocks,
+    Base class for all Pipeline Blocks: ConditionalPipelineBlocks, AutoPipelineBlocks, SequentialPipelineBlocks,
     LoopSequentialPipelineBlocks

     [`ModularPipelineBlocks`] provides method to load and save the definition of pipeline blocks.
@@ -527,9 +527,10 @@ class ModularPipelineBlocks(ConfigMixin, PushToHubMixin):
         )


-class AutoPipelineBlocks(ModularPipelineBlocks):
+class ConditionalPipelineBlocks(ModularPipelineBlocks):
     """
-    A Pipeline Blocks that automatically selects a block to run based on the inputs.
+    A Pipeline Blocks that conditionally selects a block to run based on the inputs.
+    Subclasses must implement the `select_block` method to define the logic for selecting the block.

     This class inherits from [`ModularPipelineBlocks`]. Check the superclass documentation for the generic methods the
     library implements for all the pipeline blocks (such as loading or saving etc.)
@@ -539,12 +540,13 @@ class AutoPipelineBlocks(ModularPipelineBlocks):
     Attributes:
         block_classes: List of block classes to be used
         block_names: List of prefixes for each block
-        block_trigger_inputs: List of input names that trigger specific blocks, with None for default
+        block_trigger_inputs: List of input names that select_block() uses to determine which block to run
     """

     block_classes = []
     block_names = []
     block_trigger_inputs = []
+    default_block_name = None  # name of the block to run when select_block returns None; if None itself, the whole block is skipped in that case

     def __init__(self):
         sub_blocks = InsertableDict()
@@ -554,26 +556,15 @@ class AutoPipelineBlocks(ModularPipelineBlocks):
             else:
                 sub_blocks[block_name] = block
         self.sub_blocks = sub_blocks
-        if not (len(self.block_classes) == len(self.block_names) == len(self.block_trigger_inputs)):
+        if not (len(self.block_classes) == len(self.block_names)):
             raise ValueError(
-                f"In {self.__class__.__name__}, the number of block_classes, block_names, and block_trigger_inputs must be the same."
+                f"In {self.__class__.__name__}, the number of block_classes and block_names must be the same."
             )
-        default_blocks = [t for t in self.block_trigger_inputs if t is None]
-        # can only have 1 or 0 default block, and it has to be put last
-        # the order of blocks matters here because the first block with a matching trigger will be dispatched
-        # e.g. blocks = [inpaint, img2img] and block_trigger_inputs = ["mask", "image"]
-        # as long as mask is provided, it is inpaint; if only image is provided, it is img2img
-        if len(default_blocks) > 1 or (len(default_blocks) == 1 and self.block_trigger_inputs[-1] is not None):
+        if self.default_block_name is not None and self.default_block_name not in self.block_names:
             raise ValueError(
-                f"In {self.__class__.__name__}, exactly one None must be specified as the last element "
-                "in block_trigger_inputs."
+                f"In {self.__class__.__name__}, default_block_name '{self.default_block_name}' must be one of block_names: {self.block_names}"
             )

-        # Map trigger inputs to block objects
-        self.trigger_to_block_map = dict(zip(self.block_trigger_inputs, self.sub_blocks.values()))
-        self.trigger_to_block_name_map = dict(zip(self.block_trigger_inputs, self.sub_blocks.keys()))
-        self.block_to_trigger_map = dict(zip(self.sub_blocks.keys(), self.block_trigger_inputs))

     @property
     def model_name(self):
         return next(iter(self.sub_blocks.values())).model_name
@@ -602,8 +593,11 @@ class AutoPipelineBlocks(ModularPipelineBlocks):

     @property
     def required_inputs(self) -> List[str]:
-        if None not in self.block_trigger_inputs:
+        # no default block means this conditional block can be skipped entirely
+        if self.default_block_name is None:
             return []

         first_block = next(iter(self.sub_blocks.values()))
         required_by_all = set(getattr(first_block, "required_inputs", set()))

@@ -614,7 +608,7 @@ class AutoPipelineBlocks(ModularPipelineBlocks):

         return list(required_by_all)

-    # YiYi TODO: add test for this
+
     @property
     def inputs(self) -> List[Tuple[str, Any]]:
         named_inputs = [(name, block.inputs) for name, block in self.sub_blocks.items()]
@@ -639,36 +633,9 @@ class AutoPipelineBlocks(ModularPipelineBlocks):
         combined_outputs = self.combine_outputs(*named_outputs)
         return combined_outputs

-    @torch.no_grad()
-    def __call__(self, pipeline, state: PipelineState) -> PipelineState:
-        # Find default block first (if any)
-        block = self.trigger_to_block_map.get(None)
-        for input_name in self.block_trigger_inputs:
-            if input_name is not None and state.get(input_name) is not None:
-                block = self.trigger_to_block_map[input_name]
-                break
-
-        if block is None:
-            logger.info(f"skipping auto block: {self.__class__.__name__}")
-            return pipeline, state
-
-        try:
-            logger.info(f"Running block: {block.__class__.__name__}, trigger: {input_name}")
-            return block(pipeline, state)
-        except Exception as e:
-            error_msg = (
-                f"\nError in block: {block.__class__.__name__}\n"
-                f"Error details: {str(e)}\n"
-                f"Traceback:\n{traceback.format_exc()}"
-            )
-            logger.error(error_msg)
-            raise
-
-    def _get_trigger_inputs(self):
+    def _get_trigger_inputs(self) -> set:
         """
-        Returns a set of all unique trigger input values found in the blocks. Returns: Set[str] containing all unique
-        block_trigger_inputs values
+        Returns a set of all unique trigger input values found in this block and nested blocks.
         """

         def fn_recursive_get_trigger(blocks):
@@ -676,9 +643,8 @@ class AutoPipelineBlocks(ModularPipelineBlocks):

             if blocks is not None:
                 for name, block in blocks.items():
-                    # Check if current block has trigger inputs (i.e. auto block)
+                    # Check if current block has block_trigger_inputs
                     if hasattr(block, "block_trigger_inputs") and block.block_trigger_inputs is not None:
-                        # Add all non-None values from the trigger inputs list
                         trigger_values.update(t for t in block.block_trigger_inputs if t is not None)

                     # If block has sub_blocks, recursively check them
@@ -688,15 +654,58 @@ class AutoPipelineBlocks(ModularPipelineBlocks):

             return trigger_values

-        trigger_inputs = set(self.block_trigger_inputs)
-        trigger_inputs.update(fn_recursive_get_trigger(self.sub_blocks))
+        # Start with this block's block_trigger_inputs
+        all_triggers = set(t for t in self.block_trigger_inputs if t is not None)
+        # Add nested triggers
+        all_triggers.update(fn_recursive_get_trigger(self.sub_blocks))

-        return trigger_inputs
+        return all_triggers

     @property
     def trigger_inputs(self):
+        """All trigger inputs including from nested blocks."""
         return self._get_trigger_inputs()

+    def select_block(self, **kwargs) -> Optional[str]:
+        """
+        Select the block to run based on the trigger inputs.
+        Subclasses must implement this method to define the logic for selecting the block.
+
+        Args:
+            **kwargs: Trigger input names and their values from the state.
+
+        Returns:
+            Optional[str]: The name of the block to run, or None to use default/skip.
+        """
+        raise NotImplementedError(f"Subclass {self.__class__.__name__} must implement the `select_block` method.")
+
+    @torch.no_grad()
+    def __call__(self, pipeline, state: PipelineState) -> PipelineState:
+        trigger_kwargs = {name: state.get(name) for name in self.block_trigger_inputs if name is not None}
+        block_name = self.select_block(**trigger_kwargs)
+
+        if block_name is None:
+            block_name = self.default_block_name
+
+        if block_name is None:
+            logger.info(f"skipping conditional block: {self.__class__.__name__}")
+            return pipeline, state
+
+        block = self.sub_blocks[block_name]
+
+        try:
+            logger.info(f"Running block: {block.__class__.__name__}")
+            return block(pipeline, state)
+        except Exception as e:
+            error_msg = (
+                f"\nError in block: {block.__class__.__name__}\n"
+                f"Error details: {str(e)}\n"
+                f"Traceback:\n{traceback.format_exc()}"
+            )
+            logger.error(error_msg)
+            raise
+
     def __repr__(self):
         class_name = self.__class__.__name__
         base_class = self.__class__.__bases__[0].__name__
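Note: a minimal sketch (not part of this diff) of how a subclass might implement the new `select_block` hook; the two block classes are hypothetical placeholders:

    class MyMaskOrImageBlocks(ConditionalPipelineBlocks):
        block_classes = [MyInpaintBlock, MyImg2ImgBlock]
        block_names = ["inpaint", "img2img"]
        block_trigger_inputs = ["mask", "image"]
        default_block_name = None  # skip this block entirely when nothing matches

        def select_block(self, mask=None, image=None) -> Optional[str]:
            # custom dispatch logic: inpaint wins when both trigger inputs are present
            if mask is not None:
                return "inpaint"
            if image is not None:
                return "img2img"
            return None  # falls back to default_block_name (here: skip)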
@@ -708,7 +717,7 @@ class AutoPipelineBlocks(ModularPipelineBlocks):
         header += "\n"
         header += " " + "=" * 100 + "\n"
         header += " This pipeline contains blocks that are selected at runtime based on inputs.\n"
-        header += f" Trigger Inputs: {[inp for inp in self.trigger_inputs if inp is not None]}\n"
+        header += f" Trigger Inputs: {sorted(self.trigger_inputs)}\n"
         header += " " + "=" * 100 + "\n\n"

         # Format description with proper indentation
@@ -729,31 +738,20 @@ class AutoPipelineBlocks(ModularPipelineBlocks):
         expected_configs = getattr(self, "expected_configs", [])
         configs_str = format_configs(expected_configs, indent_level=2, add_empty_lines=False)

-        # Blocks section - moved to the end with simplified format
+        # Blocks section
         blocks_str = " Sub-Blocks:\n"
         for i, (name, block) in enumerate(self.sub_blocks.items()):
-            # Get trigger input for this block
-            trigger = None
-            if hasattr(self, "block_to_trigger_map"):
-                trigger = self.block_to_trigger_map.get(name)
-                # Format the trigger info
-                if trigger is None:
-                    trigger_str = "[default]"
-                elif isinstance(trigger, (list, tuple)):
-                    trigger_str = f"[trigger: {', '.join(str(t) for t in trigger)}]"
-                else:
-                    trigger_str = f"[trigger: {trigger}]"
-                # For AutoPipelineBlocks, add bullet points
-                blocks_str += f" • {name} {trigger_str} ({block.__class__.__name__})\n"
+            if name == self.default_block_name:
+                additional_str = " [default]"
             else:
-                # For SequentialPipelineBlocks, show execution order
-                blocks_str += f" [{i}] {name} ({block.__class__.__name__})\n"
+                additional_str = ""
+            blocks_str += f" • {name}{additional_str} ({block.__class__.__name__})\n"

             # Add block description
-            desc_lines = block.description.split("\n")
-            indented_desc = desc_lines[0]
-            if len(desc_lines) > 1:
-                indented_desc += "\n" + "\n".join(" " + line for line in desc_lines[1:])
+            block_desc_lines = block.description.split("\n")
+            indented_desc = block_desc_lines[0]
+            if len(block_desc_lines) > 1:
+                indented_desc += "\n" + "\n".join(" " + line for line in block_desc_lines[1:])
             blocks_str += f" Description: {indented_desc}\n\n"

         # Build the representation with conditional sections
@@ -784,6 +782,35 @@ class AutoPipelineBlocks(ModularPipelineBlocks):
         )


+class AutoPipelineBlocks(ConditionalPipelineBlocks):
+    """
+    A Pipeline Blocks that automatically selects a block to run based on the presence of trigger inputs.
+    """
+
+    def __init__(self):
+        super().__init__()
+        if not (len(self.block_classes) == len(self.block_names) == len(self.block_trigger_inputs)):
+            raise ValueError(
+                f"In {self.__class__.__name__}, the number of block_classes, block_names, and block_trigger_inputs must be the same."
+            )
+
+    @property
+    def default_block_name(self) -> Optional[str]:
+        """Derive default_block_name from block_trigger_inputs (None entry)."""
+        if None in self.block_trigger_inputs:
+            idx = self.block_trigger_inputs.index(None)
+            return self.block_names[idx]
+        return None
+
+    def select_block(self, **kwargs) -> Optional[str]:
+        """Select block based on which trigger input is present (not None)."""
+        for trigger_input, block_name in zip(self.block_trigger_inputs, self.block_names):
+            if trigger_input is not None and kwargs.get(trigger_input) is not None:
+                return block_name
+        return None
+
+
 class SequentialPipelineBlocks(ModularPipelineBlocks):
     """
     A Pipeline Blocks that combines multiple pipeline block classes into one. When called, it will call each block in
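Note: a sketch (hypothetical block classes and values) of how the derived behavior plays out for an AutoPipelineBlocks subclass under the new base class:

    class MyAutoBlocks(AutoPipelineBlocks):
        block_classes = [MyInpaintBlock, MyImg2ImgBlock, MyText2ImgBlock]
        block_names = ["inpaint", "img2img", "text2img"]
        block_trigger_inputs = ["mask", "image", None]

    blocks = MyAutoBlocks()
    blocks.default_block_name             # "text2img" (the block paired with the None trigger)
    blocks.select_block(mask=m, image=i)  # "inpaint": first matching trigger wins
    blocks.select_block(image=i)          # "img2img"
    blocks.select_block()                 # None -> __call__ falls back to default_block_name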
@@ -885,7 +912,8 @@ class SequentialPipelineBlocks(ModularPipelineBlocks):

             # Only add outputs if the block cannot be skipped
             should_add_outputs = True
-            if hasattr(block, "block_trigger_inputs") and None not in block.block_trigger_inputs:
+            if isinstance(block, ConditionalPipelineBlocks) and block.default_block_name is None:
+                # ConditionalPipelineBlocks without default can be skipped
                 should_add_outputs = False

             if should_add_outputs:
@@ -948,8 +976,7 @@ class SequentialPipelineBlocks(ModularPipelineBlocks):

     def _get_trigger_inputs(self):
         """
-        Returns a set of all unique trigger input values found in the blocks. Returns: Set[str] containing all unique
-        block_trigger_inputs values
+        Returns a set of all unique trigger input values found in the blocks.
         """

         def fn_recursive_get_trigger(blocks):
@@ -957,9 +984,8 @@ class SequentialPipelineBlocks(ModularPipelineBlocks):

             if blocks is not None:
                 for name, block in blocks.items():
-                    # Check if current block has trigger inputs (i.e. auto block)
+                    # Check if current block has block_trigger_inputs (ConditionalPipelineBlocks)
                     if hasattr(block, "block_trigger_inputs") and block.block_trigger_inputs is not None:
-                        # Add all non-None values from the trigger inputs list
                         trigger_values.update(t for t in block.block_trigger_inputs if t is not None)

                     # If block has sub_blocks, recursively check them
@@ -975,82 +1001,85 @@ class SequentialPipelineBlocks(ModularPipelineBlocks):
     def trigger_inputs(self):
         return self._get_trigger_inputs()

-    def _traverse_trigger_blocks(self, trigger_inputs):
-        # Convert trigger_inputs to a set for easier manipulation
-        active_triggers = set(trigger_inputs)
+    def _traverse_trigger_blocks(self, active_inputs):
+        """
+        Traverse blocks and select which ones would run given the active inputs.
+
+        Args:
+            active_inputs: Dict of input names to values that are "present"
+
+        Returns:
+            OrderedDict of block_name -> block that would execute
+        """

-        def fn_recursive_traverse(block, block_name, active_triggers):
+        def fn_recursive_traverse(block, block_name, active_inputs):
             result_blocks = OrderedDict()

-            # sequential (includes loopsequential) or PipelineBlock
-            if not hasattr(block, "block_trigger_inputs"):
-                if block.sub_blocks:
-                    # sequential or LoopSequentialPipelineBlocks (keep traversing)
-                    for sub_block_name, sub_block in block.sub_blocks.items():
-                        blocks_to_update = fn_recursive_traverse(sub_block, sub_block_name, active_triggers)
-                        blocks_to_update = {f"{block_name}.{k}": v for k, v in blocks_to_update.items()}
-                        result_blocks.update(blocks_to_update)
+            # ConditionalPipelineBlocks (includes AutoPipelineBlocks)
+            if isinstance(block, ConditionalPipelineBlocks):
+                trigger_kwargs = {name: active_inputs.get(name) for name in block.block_trigger_inputs}
+                selected_block_name = block.select_block(**trigger_kwargs)
+
+                if selected_block_name is None:
+                    selected_block_name = block.default_block_name
+
+                if selected_block_name is None:
+                    return result_blocks
+
+                selected_block = block.sub_blocks[selected_block_name]
+
+                if selected_block.sub_blocks:
+                    result_blocks.update(fn_recursive_traverse(selected_block, block_name, active_inputs))
                 else:
                     # PipelineBlock
-                    result_blocks[block_name] = block
-                    # Add this block's output names to active triggers if defined
-                    if hasattr(block, "outputs"):
-                        active_triggers.update(out.name for out in block.outputs)
+                    result_blocks[block_name] = selected_block
+                    if hasattr(selected_block, "outputs"):
+                        for out in selected_block.outputs:
+                            active_inputs[out.name] = True

                 return result_blocks

-            # auto
+            # SequentialPipelineBlocks or LoopSequentialPipelineBlocks
             if block.sub_blocks:
                 for sub_block_name, sub_block in block.sub_blocks.items():
-                    blocks_to_update = fn_recursive_traverse(sub_block, sub_block_name, active_triggers)
+                    blocks_to_update = fn_recursive_traverse(sub_block, sub_block_name, active_inputs)
                     blocks_to_update = {f"{block_name}.{k}": v for k, v in blocks_to_update.items()}
                     result_blocks.update(blocks_to_update)
             else:
-                # Find first block_trigger_input that matches any value in our active_triggers
-                this_block = None
-                for trigger_input in block.block_trigger_inputs:
-                    if trigger_input is not None and trigger_input in active_triggers:
-                        this_block = block.trigger_to_block_map[trigger_input]
-                        break
-
-                # If no matches found, try to get the default (None) block
-                if this_block is None and None in block.block_trigger_inputs:
-                    this_block = block.trigger_to_block_map[None]
-
-                if this_block is not None:
-                    # sequential/auto (keep traversing)
-                    if this_block.sub_blocks:
-                        result_blocks.update(fn_recursive_traverse(this_block, block_name, active_triggers))
-                    else:
-                        # PipelineBlock
-                        result_blocks[block_name] = this_block
-                        # Add this block's output names to active triggers if defined
-                        # YiYi TODO: do we need outputs here? can it just be intermediate_outputs? can we get rid of outputs attribute?
-                        if hasattr(this_block, "outputs"):
-                            active_triggers.update(out.name for out in this_block.outputs)
+                # PipelineBlock
+                result_blocks[block_name] = block
+                if hasattr(block, "outputs"):
+                    for out in block.outputs:
+                        active_inputs[out.name] = True

             return result_blocks

         all_blocks = OrderedDict()
         for block_name, block in self.sub_blocks.items():
-            blocks_to_update = fn_recursive_traverse(block, block_name, active_triggers)
+            blocks_to_update = fn_recursive_traverse(block, block_name, active_inputs)
             all_blocks.update(blocks_to_update)
         return all_blocks

-    def get_execution_blocks(self, *trigger_inputs):
-        trigger_inputs_all = self.trigger_inputs
-
-        if trigger_inputs is not None:
-            if not isinstance(trigger_inputs, (list, tuple, set)):
-                trigger_inputs = [trigger_inputs]
-            invalid_inputs = [x for x in trigger_inputs if x not in trigger_inputs_all]
-            if invalid_inputs:
-                logger.warning(
-                    f"The following trigger inputs will be ignored as they are not supported: {invalid_inputs}"
-                )
-                trigger_inputs = [x for x in trigger_inputs if x in trigger_inputs_all]
-
-        if trigger_inputs is None:
-            if None in trigger_inputs_all:
-                trigger_inputs = [None]
-            else:
-                trigger_inputs = [trigger_inputs_all[0]]
-        blocks_triggered = self._traverse_trigger_blocks(trigger_inputs)
+    def get_execution_blocks(self, **kwargs):
+        """
+        Get the blocks that would execute given the specified inputs.
+
+        Args:
+            **kwargs: Input names and values. Only trigger inputs affect block selection.
+                Pass any inputs that would be non-None at runtime.
+
+        Returns:
+            SequentialPipelineBlocks containing only the blocks that would execute
+
+        Example:
+            # Get blocks for inpainting workflow
+            blocks = pipeline.get_execution_blocks(prompt="a cat", mask=mask, image=image)
+
+            # Get blocks for text2image workflow
+            blocks = pipeline.get_execution_blocks(prompt="a cat")
+        """
+        # Filter out None values
+        active_inputs = {k: v for k, v in kwargs.items() if v is not None}
+
+        blocks_triggered = self._traverse_trigger_blocks(active_inputs)
         return SequentialPipelineBlocks.from_blocks_dict(blocks_triggered)

     def __repr__(self):
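Note: the calling convention changes from trigger-input *names* to input name/value pairs; a sketch of the migration, with hypothetical pipeline and values:

    # before: positional trigger input names
    blocks = auto_blocks.get_execution_blocks("mask")

    # after: keyword inputs, as they would be passed at runtime; None values are ignored
    blocks = auto_blocks.get_execution_blocks(prompt="a cat", mask=mask_image)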
@@ -1067,7 +1096,7 @@ class SequentialPipelineBlocks(ModularPipelineBlocks):
         header += f" Trigger Inputs: {[inp for inp in self.trigger_inputs if inp is not None]}\n"
         # Get first trigger input as example
         example_input = next(t for t in self.trigger_inputs if t is not None)
-        header += f" Use `get_execution_blocks()` with input names to see selected blocks (e.g. `get_execution_blocks('{example_input}')`).\n"
+        header += f" Use `get_execution_blocks()` to see selected blocks (e.g. `get_execution_blocks({example_input}=...)`).\n"
         header += " " + "=" * 100 + "\n\n"

         # Format description with proper indentation
@@ -1091,22 +1120,9 @@ class SequentialPipelineBlocks(ModularPipelineBlocks):
         # Blocks section - moved to the end with simplified format
         blocks_str = " Sub-Blocks:\n"
         for i, (name, block) in enumerate(self.sub_blocks.items()):
-            # Get trigger input for this block
-            trigger = None
-            if hasattr(self, "block_to_trigger_map"):
-                trigger = self.block_to_trigger_map.get(name)
-                # Format the trigger info
-                if trigger is None:
-                    trigger_str = "[default]"
-                elif isinstance(trigger, (list, tuple)):
-                    trigger_str = f"[trigger: {', '.join(str(t) for t in trigger)}]"
-                else:
-                    trigger_str = f"[trigger: {trigger}]"
-                # For AutoPipelineBlocks, add bullet points
-                blocks_str += f" • {name} {trigger_str} ({block.__class__.__name__})\n"
-            else:
-                # For SequentialPipelineBlocks, show execution order
-                blocks_str += f" [{i}] {name} ({block.__class__.__name__})\n"
+            # show execution order
+            blocks_str += f" [{i}] {name} ({block.__class__.__name__})\n"

             # Add block description
             desc_lines = block.description.split("\n")
@@ -1230,15 +1246,9 @@ class LoopSequentialPipelineBlocks(ModularPipelineBlocks):
             if inp.name not in outputs and inp not in inputs:
                 inputs.append(inp)

-            # Only add outputs if the block cannot be skipped
-            should_add_outputs = True
-            if hasattr(block, "block_trigger_inputs") and None not in block.block_trigger_inputs:
-                should_add_outputs = False
-
-            if should_add_outputs:
-                # Add this block's outputs
-                block_intermediate_outputs = [out.name for out in block.intermediate_outputs]
-                outputs.update(block_intermediate_outputs)
+            # Add this block's outputs
+            block_intermediate_outputs = [out.name for out in block.intermediate_outputs]
+            outputs.update(block_intermediate_outputs)

         for input_param in inputs:
             if input_param.name in self.required_inputs:
@@ -1295,6 +1305,14 @@ class LoopSequentialPipelineBlocks(ModularPipelineBlocks):
             sub_blocks[block_name] = block
         self.sub_blocks = sub_blocks

+        # Validate that sub_blocks are only leaf blocks
+        for block_name, block in self.sub_blocks.items():
+            if block.sub_blocks:
+                raise ValueError(
+                    f"In {self.__class__.__name__}, sub_blocks must be leaf blocks (no sub_blocks). "
+                    f"Block '{block_name}' ({block.__class__.__name__}) has sub_blocks."
+                )
+
     @classmethod
     def from_blocks_dict(cls, blocks_dict: Dict[str, Any]) -> "LoopSequentialPipelineBlocks":
         """
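Note: a sketch of what the new validation rejects, with hypothetical classes; nesting a composite block inside a loop now fails fast at construction time instead of at runtime:

    class MyLoop(LoopSequentialPipelineBlocks):
        block_classes = [MySequentialBlocks]  # a composite block that has sub_blocks
        block_names = ["inner"]

    MyLoop()  # raises ValueError: sub_blocks must be leaf blocks (no sub_blocks)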
@@ -21,21 +21,16 @@ except OptionalDependencyNotAvailable:

     _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
 else:
     _import_structure["encoders"] = ["QwenImageTextEncoderStep"]
-    _import_structure["modular_blocks"] = [
-        "ALL_BLOCKS",
+    _import_structure["modular_blocks_qwenimage"] = [
         "AUTO_BLOCKS",
         "CONTROLNET_BLOCKS",
-        "EDIT_AUTO_BLOCKS",
-        "EDIT_BLOCKS",
-        "EDIT_INPAINT_BLOCKS",
-        "EDIT_PLUS_AUTO_BLOCKS",
-        "EDIT_PLUS_BLOCKS",
         "IMAGE2IMAGE_BLOCKS",
         "INPAINT_BLOCKS",
         "TEXT2IMAGE_BLOCKS",
         "QwenImageAutoBlocks",
     ]
+    _import_structure["modular_blocks_qwenimage_edit"] = [
+        "EDIT_AUTO_BLOCKS",
+        "QwenImageEditAutoBlocks",
+    ]
+    _import_structure["modular_blocks_qwenimage_edit_plus"] = [
+        "EDIT_PLUS_AUTO_BLOCKS",
+        "QwenImageEditPlusAutoBlocks",
+    ]
     _import_structure["modular_pipeline"] = [
@@ -51,23 +46,16 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
     except OptionalDependencyNotAvailable:
         from ...utils.dummy_torch_and_transformers_objects import *  # noqa F403
     else:
         from .encoders import (
             QwenImageTextEncoderStep,
         )
-        from .modular_blocks import (
-            ALL_BLOCKS,
+        from .modular_blocks_qwenimage import (
             AUTO_BLOCKS,
             CONTROLNET_BLOCKS,
-            EDIT_AUTO_BLOCKS,
-            EDIT_BLOCKS,
-            EDIT_INPAINT_BLOCKS,
-            EDIT_PLUS_AUTO_BLOCKS,
-            EDIT_PLUS_BLOCKS,
             IMAGE2IMAGE_BLOCKS,
             INPAINT_BLOCKS,
             TEXT2IMAGE_BLOCKS,
             QwenImageAutoBlocks,
         )
+        from .modular_blocks_qwenimage_edit import (
+            EDIT_AUTO_BLOCKS,
+            QwenImageEditAutoBlocks,
+        )
+        from .modular_blocks_qwenimage_edit_plus import (
+            EDIT_PLUS_AUTO_BLOCKS,
+            QwenImageEditPlusAutoBlocks,
+        )
         from .modular_pipeline import (
@@ -86,4 +74,4 @@ else:
     )

 for name, value in _dummy_objects.items():
-    setattr(sys.modules[__name__], name, value)
+    setattr(sys.modules[__name__], name, value)
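Note: consumers importing through this package `__init__` should be unaffected by the file split; a sketch, assuming this is the `qwenimage` modular pipelines package in diffusers (the import path is an assumption, not stated in the diff):

    from diffusers.modular_pipelines.qwenimage import (
        QwenImageAutoBlocks,          # still re-exported from modular_blocks_qwenimage
        QwenImageEditAutoBlocks,      # now from modular_blocks_qwenimage_edit
        QwenImageEditPlusAutoBlocks,  # now from modular_blocks_qwenimage_edit_plus
    )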
@@ -639,19 +639,65 @@ class QwenImageEditRoPEInputsStep(ModularPipelineBlocks):
         return components, state


-class QwenImageEditPlusRoPEInputsStep(QwenImageEditRoPEInputsStep):
+class QwenImageEditPlusRoPEInputsStep(ModularPipelineBlocks):
+    """RoPE inputs step for Edit Plus that handles lists of image heights/widths."""
+
     model_name = "qwenimage-edit-plus"

+    @property
+    def description(self) -> str:
+        return (
+            "Step that prepares the RoPE inputs for denoising process. This is used in QwenImage Edit Plus.\n"
+            "Unlike Edit, Edit Plus handles lists of image_height/image_width for multiple reference images.\n"
+            "Should be placed after prepare_latents step."
+        )
+
+    @property
+    def inputs(self) -> List[InputParam]:
+        return [
+            InputParam(name="batch_size", required=True),
+            InputParam(name="image_height", required=True, type_hint=List[int]),
+            InputParam(name="image_width", required=True, type_hint=List[int]),
+            InputParam(name="height", required=True),
+            InputParam(name="width", required=True),
+            InputParam(name="prompt_embeds_mask"),
+            InputParam(name="negative_prompt_embeds_mask"),
+        ]
+
+    @property
+    def intermediate_outputs(self) -> List[OutputParam]:
+        return [
+            OutputParam(
+                name="img_shapes",
+                type_hint=List[List[Tuple[int, int, int]]],
+                description="The shapes of the image latents, used for RoPE calculation",
+            ),
+            OutputParam(
+                name="txt_seq_lens",
+                kwargs_type="denoiser_input_fields",
+                type_hint=List[int],
+                description="The sequence lengths of the prompt embeds, used for RoPE calculation",
+            ),
+            OutputParam(
+                name="negative_txt_seq_lens",
+                kwargs_type="denoiser_input_fields",
+                type_hint=List[int],
+                description="The sequence lengths of the negative prompt embeds, used for RoPE calculation",
+            ),
+        ]
+
+    def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState:
+        block_state = self.get_block_state(state)
+
+        vae_scale_factor = components.vae_scale_factor
+
+        # Edit Plus: image_height and image_width are lists
         block_state.img_shapes = [
             [
                 (1, block_state.height // vae_scale_factor // 2, block_state.width // vae_scale_factor // 2),
                 *[
-                    (1, vae_height // vae_scale_factor // 2, vae_width // vae_scale_factor // 2)
-                    for vae_height, vae_width in zip(block_state.image_height, block_state.image_width)
+                    (1, img_height // vae_scale_factor // 2, img_width // vae_scale_factor // 2)
+                    for img_height, img_width in zip(block_state.image_height, block_state.image_width)
                 ],
             ]
         ] * block_state.batch_size
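Note: a worked example of the img_shapes computation above, assuming vae_scale_factor=8 (an assumption for illustration), a 1024x1024 target, one 512x768 reference image, and batch_size=2:

    # target grid:    1024 // 8 // 2 = 64                      -> (1, 64, 64)
    # reference grid: 512 // 8 // 2 = 32, 768 // 8 // 2 = 48   -> (1, 32, 48)
    img_shapes = [
        [(1, 64, 64), (1, 32, 48)],
        [(1, 64, 64), (1, 32, 48)],
    ]  # one inner list per batch element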
@@ -244,18 +244,19 @@ def encode_vae_image(
 class QwenImageEditResizeDynamicStep(ModularPipelineBlocks):
     model_name = "qwenimage"

-    def __init__(self, input_name: str = "image", output_name: str = "resized_image"):
-        """Create a configurable step for resizing images to the target area (1024 * 1024) while maintaining the aspect ratio.
-
-        This block resizes an input image tensor and exposes the resized result under configurable input and output
-        names. Use this when you need to wire the resize step to different image fields (e.g., "image",
-        "control_image")
-
+    def __init__(
+        self,
+        input_name: str = "image",
+        output_name: str = "resized_image",
+        target_area: int = 1024 * 1024,
+    ):
+        """Create a configurable step for resizing images to the target area while maintaining the aspect ratio.
+
         Args:
             input_name (str, optional): Name of the image field to read from the
                 pipeline state. Defaults to "image".
             output_name (str, optional): Name of the resized image field to write
                 back to the pipeline state. Defaults to "resized_image".
+            target_area (int, optional): Target area in pixels. Defaults to 1024*1024.
         """
         if not isinstance(input_name, str) or not isinstance(output_name, str):
             raise ValueError(
@@ -263,11 +264,12 @@ class QwenImageEditResizeDynamicStep(ModularPipelineBlocks):
             )
         self._image_input_name = input_name
         self._resized_image_output_name = output_name
+        self._target_area = target_area
         super().__init__()

     @property
     def description(self) -> str:
-        return f"Image Resize step that resize the {self._image_input_name} to the target area (1024 * 1024) while maintaining the aspect ratio."
+        return f"Image Resize step that resize the {self._image_input_name} to the target area {self._target_area} while maintaining the aspect ratio."

     @property
     def expected_components(self) -> List[ComponentSpec]:
@@ -320,48 +322,67 @@ class QwenImageEditResizeDynamicStep(ModularPipelineBlocks):
         self.set_block_state(state, block_state)
         return components, state


-class QwenImageEditPlusResizeDynamicStep(QwenImageEditResizeDynamicStep):
-    model_name = "qwenimage"
+class QwenImageEditPlusResizeDynamicStep(ModularPipelineBlocks):
+    """Resize each image independently based on its own aspect ratio. For QwenImage Edit Plus."""
+
+    model_name = "qwenimage-edit-plus"

     def __init__(
         self,
         input_name: str = "image",
         output_name: str = "resized_image",
-        vae_image_output_name: str = "vae_image",
+        target_area: int = 1024 * 1024,
     ):
-        """Create a configurable step for resizing images to the target area (384 * 384) while maintaining the aspect ratio.
-
-        This block resizes an input image or a list of input images and exposes the resized result under configurable
-        input and output names. Use this when you need to wire the resize step to different image fields (e.g.,
-        "image", "control_image")
+        """Create a step for resizing images to a target area.
+
+        Each image is resized independently based on its own aspect ratio.
+        This is suitable for Edit Plus where multiple reference images can have different dimensions.

         Args:
-            input_name (str, optional): Name of the image field to read from the
-                pipeline state. Defaults to "image".
-            output_name (str, optional): Name of the resized image field to write
-                back to the pipeline state. Defaults to "resized_image".
-            vae_image_output_name (str, optional): Name of the image field
-                to write back to the pipeline state. This is used by the VAE encoder step later on. QwenImage Edit Plus
-                processes the input image(s) differently for the VL and the VAE.
+            input_name (str, optional): Name of the image field to read. Defaults to "image".
+            output_name (str, optional): Name of the resized image field to write. Defaults to "resized_image".
+            target_area (int, optional): Target area in pixels. Defaults to 1024*1024.
         """
         if not isinstance(input_name, str) or not isinstance(output_name, str):
             raise ValueError(
                 f"input_name and output_name must be strings but are {type(input_name)} and {type(output_name)}"
             )
-        self.condition_image_size = 384 * 384
         self._image_input_name = input_name
         self._resized_image_output_name = output_name
-        self._vae_image_output_name = vae_image_output_name
+        self._target_area = target_area
         super().__init__()

+    @property
+    def description(self) -> str:
+        return (
+            f"Image Resize step that resizes {self._image_input_name} to target area {self._target_area}.\n"
+            "Each image is resized independently based on its own aspect ratio."
+        )
+
+    @property
+    def expected_components(self) -> List[ComponentSpec]:
+        return [
+            ComponentSpec(
+                "image_resize_processor",
+                VaeImageProcessor,
+                config=FrozenDict({"vae_scale_factor": 16}),
+                default_creation_method="from_config",
+            ),
+        ]
+
+    @property
+    def inputs(self) -> List[InputParam]:
+        return [
+            InputParam(
+                name=self._image_input_name, required=True, type_hint=torch.Tensor, description="The image(s) to resize"
+            ),
+        ]
+
     @property
     def intermediate_outputs(self) -> List[OutputParam]:
-        return super().intermediate_outputs + [
+        return [
             OutputParam(
-                name=self._vae_image_output_name,
-                type_hint=List[PIL.Image.Image],
-                description="The images to be processed which will be further used by the VAE encoder.",
+                name=self._resized_image_output_name, type_hint=List[PIL.Image.Image], description="The resized images"
             ),
         ]
@@ -374,26 +395,21 @@ class QwenImageEditPlusResizeDynamicStep(QwenImageEditResizeDynamicStep):
         if not is_valid_image_imagelist(images):
             raise ValueError(f"Images must be image or list of images but are {type(images)}")

-        if (
-            not isinstance(images, torch.Tensor)
-            and isinstance(images, PIL.Image.Image)
-            and not isinstance(images, list)
-        ):
+        if is_valid_image(images):
             images = [images]

-        # TODO (sayakpaul): revisit this when the inputs are `torch.Tensor`s
-        condition_images = []
-        vae_images = []
-        for img in images:
-            image_width, image_height = img.size
-            condition_width, condition_height, _ = calculate_dimensions(
-                self.condition_image_size, image_width / image_height
+        # Resize each image independently based on its own aspect ratio
+        resized_images = []
+        for image in images:
+            image_width, image_height = image.size
+            calculated_width, calculated_height, _ = calculate_dimensions(
+                self._target_area, image_width / image_height
             )
-            condition_images.append(components.image_resize_processor.resize(img, condition_height, condition_width))
-            vae_images.append(img)
+            resized_images.append(
+                components.image_resize_processor.resize(image, height=calculated_height, width=calculated_width)
+            )

-        setattr(block_state, self._resized_image_output_name, condition_images)
-        setattr(block_state, self._vae_image_output_name, vae_images)
+        setattr(block_state, self._resized_image_output_name, resized_images)
         self.set_block_state(state, block_state)
         return components, state
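Note: a quick sketch of what the independent-resize behavior means in practice, assuming calculate_dimensions(target_area, aspect_ratio) returns (width, height, ratio):

    step = QwenImageEditPlusResizeDynamicStep(target_area=1024 * 1024)
    # a 2000x1000 input keeps its 2:1 ratio and lands near 1448x724 (modulo rounding
    # inside calculate_dimensions); a square 500x500 input lands near 1024x1024,
    # so a mixed-size list of reference images stays mixed-size after resizing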
@@ -647,8 +663,30 @@ class QwenImageEditTextEncoderStep(ModularPipelineBlocks):
         return components, state


-class QwenImageEditPlusTextEncoderStep(QwenImageEditTextEncoderStep):
-    model_name = "qwenimage"
+class QwenImageEditPlusTextEncoderStep(ModularPipelineBlocks):
+    """Text encoder for QwenImage Edit Plus that handles multiple reference images."""
+
+    model_name = "qwenimage-edit-plus"
+
+    @property
+    def description(self) -> str:
+        return (
+            "Text Encoder step for QwenImage Edit Plus that processes prompt and multiple images together "
+            "to generate text embeddings for guiding image generation."
+        )
+
+    @property
+    def expected_components(self) -> List[ComponentSpec]:
+        return [
+            ComponentSpec("text_encoder", Qwen2_5_VLForConditionalGeneration),
+            ComponentSpec("processor", Qwen2VLProcessor),
+            ComponentSpec(
+                "guider",
+                ClassifierFreeGuidance,
+                config=FrozenDict({"guidance_scale": 4.0}),
+                default_creation_method="from_config",
+            ),
+        ]

     @property
     def expected_configs(self) -> List[ConfigSpec]:
@@ -664,6 +702,60 @@ class QwenImageEditPlusTextEncoderStep(QwenImageEditTextEncoderStep):
             ConfigSpec(name="prompt_template_encode_start_idx", default=64),
         ]

+    @property
+    def inputs(self) -> List[InputParam]:
+        return [
+            InputParam(name="prompt", required=True, type_hint=str, description="The prompt to encode"),
+            InputParam(name="negative_prompt", type_hint=str, description="The negative prompt to encode"),
+            InputParam(
+                name="resized_cond_image",
+                required=True,
+                type_hint=torch.Tensor,
+                description="The image(s) to encode, can be a single image or list of images, should be resized to 384x384 using resize step",
+            ),
+        ]
+
+    @property
+    def intermediate_outputs(self) -> List[OutputParam]:
+        return [
+            OutputParam(
+                name="prompt_embeds",
+                kwargs_type="denoiser_input_fields",
+                type_hint=torch.Tensor,
+                description="The prompt embeddings",
+            ),
+            OutputParam(
+                name="prompt_embeds_mask",
+                kwargs_type="denoiser_input_fields",
+                type_hint=torch.Tensor,
+                description="The encoder attention mask",
+            ),
+            OutputParam(
+                name="negative_prompt_embeds",
+                kwargs_type="denoiser_input_fields",
+                type_hint=torch.Tensor,
+                description="The negative prompt embeddings",
+            ),
+            OutputParam(
+                name="negative_prompt_embeds_mask",
+                kwargs_type="denoiser_input_fields",
+                type_hint=torch.Tensor,
+                description="The negative prompt embeddings mask",
+            ),
+        ]
+
+    @staticmethod
+    def check_inputs(prompt, negative_prompt):
+        if not isinstance(prompt, str) and not isinstance(prompt, list):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if (
+            negative_prompt is not None
+            and not isinstance(negative_prompt, str)
+            and not isinstance(negative_prompt, list)
+        ):
+            raise ValueError(f"`negative_prompt` has to be of type `str` or `list` but is {type(negative_prompt)}")
+
     @torch.no_grad()
     def __call__(self, components: QwenImageModularPipeline, state: PipelineState):
         block_state = self.get_block_state(state)
@@ -676,7 +768,7 @@ class QwenImageEditPlusTextEncoderStep(QwenImageEditTextEncoderStep):
             components.text_encoder,
             components.processor,
             prompt=block_state.prompt,
-            image=block_state.resized_image,
+            image=block_state.resized_cond_image,
             prompt_template_encode=components.config.prompt_template_encode,
             img_template_encode=components.config.img_template_encode,
             prompt_template_encode_start_idx=components.config.prompt_template_encode_start_idx,
@@ -692,7 +784,7 @@ class QwenImageEditPlusTextEncoderStep(QwenImageEditTextEncoderStep):
             components.text_encoder,
             components.processor,
             prompt=negative_prompt,
-            image=block_state.resized_image,
+            image=block_state.resized_cond_image,
             prompt_template_encode=components.config.prompt_template_encode,
             img_template_encode=components.config.img_template_encode,
             prompt_template_encode_start_idx=components.config.prompt_template_encode_start_idx,
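Note: per the new `resized_cond_image` input, Edit Plus now separates the conditioning images for the VL text encoder from the images destined for the VAE; a sketch of the assumed wiring, using the resize step's configurable names:

    # small conditioning images for the text encoder (the 384x384 area mentioned above)
    cond_resize = QwenImageEditPlusResizeDynamicStep(
        output_name="resized_cond_image", target_area=384 * 384
    )
    # full-size images for the VAE encoder path
    vae_resize = QwenImageEditPlusResizeDynamicStep(
        output_name="resized_image", target_area=1024 * 1024
    )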
@@ -846,60 +938,60 @@ class QwenImageProcessImagesInputStep(ModularPipelineBlocks):
         self.set_block_state(state, block_state)
         return components, state


-class QwenImageEditPlusProcessImagesInputStep(QwenImageProcessImagesInputStep):
+class QwenImageEditPlusProcessImagesInputStep(ModularPipelineBlocks):
     model_name = "qwenimage-edit-plus"

-    def __init__(self):
-        self.vae_image_size = 1024 * 1024
-        super().__init__()
-
     @property
     def description(self) -> str:
-        return "Image Preprocess step for QwenImage Edit Plus. Unlike QwenImage Edit, QwenImage Edit Plus doesn't use the same resized image for further preprocessing."
+        return "Image Preprocess step. Images can be resized first using QwenImageEditResizeDynamicStep."

+    @property
+    def expected_components(self) -> List[ComponentSpec]:
+        return [
+            ComponentSpec(
+                "image_processor",
+                VaeImageProcessor,
+                config=FrozenDict({"vae_scale_factor": 16}),
+                default_creation_method="from_config",
+            ),
+        ]
+
     @property
     def inputs(self) -> List[InputParam]:
-        return [InputParam("vae_image"), InputParam("image"), InputParam("height"), InputParam("width")]
+        return [InputParam("resized_image")]

     @property
     def intermediate_outputs(self) -> List[OutputParam]:
         return [OutputParam(name="processed_image")]

     @torch.no_grad()
     def __call__(self, components: QwenImageModularPipeline, state: PipelineState):
         block_state = self.get_block_state(state)

-        if block_state.vae_image is None and block_state.image is None:
-            raise ValueError("`vae_image` and `image` cannot be None at the same time")
-
-        vae_image_sizes = None
-        if block_state.vae_image is None:
-            image = block_state.image
-            self.check_inputs(
-                height=block_state.height, width=block_state.width, vae_scale_factor=components.vae_scale_factor
-            )
-            height = block_state.height or components.default_height
-            width = block_state.width or components.default_width
-            block_state.processed_image = components.image_processor.preprocess(
-                image=image, height=height, width=width
-            )
-        else:
-            # QwenImage Edit Plus can allow multiple input images with varied resolutions
-            processed_images = []
-            vae_image_sizes = []
-            for img in block_state.vae_image:
-                width, height = img.size
-                vae_width, vae_height, _ = calculate_dimensions(self.vae_image_size, width / height)
-                vae_image_sizes.append((vae_width, vae_height))
-                processed_images.append(
-                    components.image_processor.preprocess(image=img, height=vae_height, width=vae_width)
-                )
+        image = block_state.resized_image
+
+        is_image_list = isinstance(image, list)
+        if not is_image_list:
+            image = [image]

-            block_state.processed_image = processed_images
+        processed_images = []
+        for img in image:
+            img_width, img_height = img.size
+            processed_images.append(components.image_processor.preprocess(image=img, height=img_height, width=img_width))

-            block_state.vae_image_sizes = vae_image_sizes
+        if is_image_list:
+            block_state.processed_image = processed_images
+        else:
+            block_state.processed_image = processed_images[0]

         self.set_block_state(state, block_state)
         return components, state
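Note: the step now mirrors its input shape; a sketch of the assumed behavior on the two input forms it accepts (img_a/img_b are hypothetical PIL images):

    # list in -> list out: each reference image is preprocessed at its own size
    block_state.resized_image = [img_a, img_b]   # processed_image -> [tensor_a, tensor_b]

    # single image in -> single tensor out
    block_state.resized_image = img_a            # processed_image -> tensor_a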

 class QwenImageVaeEncoderDynamicStep(ModularPipelineBlocks):
+    """VAE encoder that handles both single images and lists of images with varied resolutions."""
+
     model_name = "qwenimage"

     def __init__(
@@ -909,21 +1001,12 @@ class QwenImageVaeEncoderDynamicStep(ModularPipelineBlocks):
     ):
         """Initialize a VAE encoder step for converting images to latent representations.

-        Both the input and output names are configurable so this block can be configured to process different image
-        inputs (e.g., "processed_image" -> "image_latents", "processed_control_image" -> "control_image_latents").
+        Handles both single images and lists of images. When input is a list, outputs a list of latents.
+        When input is a single tensor, outputs a single latent tensor.

         Args:
-            input_name (str, optional): Name of the input image tensor. Defaults to "processed_image".
-                Examples: "processed_image" or "processed_control_image"
-            output_name (str, optional): Name of the output latent tensor. Defaults to "image_latents".
-                Examples: "image_latents" or "control_image_latents"
-
-        Examples:
-            # Basic usage with default settings (includes image processor)
-            QwenImageVaeEncoderDynamicStep()
-
-            # Custom input/output names for control image
-            QwenImageVaeEncoderDynamicStep(
-                input_name="processed_control_image", output_name="control_image_latents"
-            )
+            input_name (str, optional): Name of the input image tensor or list. Defaults to "processed_image".
+            output_name (str, optional): Name of the output latent tensor or list. Defaults to "image_latents".
         """
         self._image_input_name = input_name
         self._image_latents_output_name = output_name
@@ -931,17 +1014,18 @@ class QwenImageVaeEncoderDynamicStep(ModularPipelineBlocks):

     @property
     def description(self) -> str:
-        return f"Dynamic VAE Encoder step that converts {self._image_input_name} into latent representations {self._image_latents_output_name}.\n"
+        return (
+            f"VAE Encoder step that converts {self._image_input_name} into latent representations {self._image_latents_output_name}.\n"
+            "Handles both single images and lists of images with varied resolutions."
+        )

     @property
     def expected_components(self) -> List[ComponentSpec]:
-        components = [ComponentSpec("vae", AutoencoderKLQwenImage)]
-        return components
+        return [ComponentSpec("vae", AutoencoderKLQwenImage)]

     @property
     def inputs(self) -> List[InputParam]:
-        inputs = [InputParam(self._image_input_name, required=True), InputParam("generator")]
-        return inputs
+        return [InputParam(self._image_input_name, required=True), InputParam("generator")]

     @property
     def intermediate_outputs(self) -> List[OutputParam]:
@@ -949,7 +1033,7 @@ class QwenImageVaeEncoderDynamicStep(ModularPipelineBlocks):
             OutputParam(
                 self._image_latents_output_name,
                 type_hint=torch.Tensor,
-                description="The latents representing the reference image",
+                description="The latents representing the reference image(s). Single tensor or list depending on input.",
             )
         ]
@@ -961,47 +1045,11 @@ class QwenImageVaeEncoderDynamicStep(ModularPipelineBlocks):
         dtype = components.vae.dtype

         image = getattr(block_state, self._image_input_name)
+        is_image_list = isinstance(image, list)
+        if not is_image_list:
+            image = [image]

-        # Encode image into latents
-        image_latents = encode_vae_image(
-            image=image,
-            vae=components.vae,
-            generator=block_state.generator,
-            device=device,
-            dtype=dtype,
-            latent_channels=components.num_channels_latents,
-        )
-        setattr(block_state, self._image_latents_output_name, image_latents)
-
-        self.set_block_state(state, block_state)
-
-        return components, state
-
-
-class QwenImageEditPlusVaeEncoderDynamicStep(QwenImageVaeEncoderDynamicStep):
-    model_name = "qwenimage-edit-plus"
-
-    @property
-    def intermediate_outputs(self) -> List[OutputParam]:
-        # Each reference image latent can have varied resolutions hence we return this as a list.
-        return [
-            OutputParam(
-                self._image_latents_output_name,
-                type_hint=List[torch.Tensor],
-                description="The latents representing the reference image(s).",
-            )
-        ]
-
-    @torch.no_grad()
-    def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState:
-        block_state = self.get_block_state(state)
-
-        device = components._execution_device
-        dtype = components.vae.dtype
-
-        image = getattr(block_state, self._image_input_name)
-
-        # Encode image into latents
+        # Encode image into latents (handle both single image and list of images)
         image_latents = []
         for img in image:
             image_latents.append(
@@ -1014,9 +1062,12 @@ class QwenImageEditPlusVaeEncoderDynamicStep(QwenImageVaeEncoderDynamicStep):
                     latent_channels=components.num_channels_latents,
                 )
             )
+        if not is_image_list:
+            image_latents = image_latents[0]

         setattr(block_state, self._image_latents_output_name, image_latents)

         self.set_block_state(state, block_state)

         return components, state
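Note: a sketch of the unified behavior after folding the Edit Plus subclass into the base step:

    step = QwenImageVaeEncoderDynamicStep()
    # single processed image tensor in -> single latent tensor out
    # list of processed images in -> list of latents out, one per reference image,
    # so Edit Plus reference images with varied resolutions no longer need a subclass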
@@ -222,36 +222,15 @@ class QwenImageTextInputsStep(ModularPipelineBlocks):


 class QwenImageInputsDynamicStep(ModularPipelineBlocks):
+    """Input step for QwenImage: update height/width, expand batch, patchify."""
+
     model_name = "qwenimage"

-    def __init__(self, image_latent_inputs: List[str] = ["image_latents"], additional_batch_inputs: List[str] = []):
-        """Initialize a configurable step that standardizes the inputs for the denoising step.
-
-        This step handles multiple common tasks to prepare inputs for the denoising step:
-        1. For encoded image latents: updates height/width if None, patchifies, and expands batch size
-        2. For additional_batch_inputs: only expands batch dimensions to match final batch size
-
-        This is a dynamic block that allows you to configure which inputs to process.
-
-        Args:
-            image_latent_inputs (List[str], optional): Names of image latent tensors to process.
-                These will be used to determine height/width, patchified, and batch-expanded. Can be a single string or
-                list of strings. Defaults to ["image_latents"]. Examples: ["image_latents"], ["control_image_latents"]
-            additional_batch_inputs (List[str], optional):
-                Names of additional conditional input tensors to expand batch size. These tensors will only have their
-                batch dimensions adjusted to match the final batch size. Can be a single string or list of strings.
-                Defaults to []. Examples: ["processed_mask_image"]
-
-        Examples:
-            # Configure to process image_latents (default behavior)
-            QwenImageInputsDynamicStep()
-
-            # Configure to process multiple image latent inputs
-            QwenImageInputsDynamicStep(image_latent_inputs=["image_latents", "control_image_latents"])
-
-            # Configure to process image latents and additional batch inputs
-            QwenImageInputsDynamicStep(
-                image_latent_inputs=["image_latents"], additional_batch_inputs=["processed_mask_image"]
-            )
-        """
+    def __init__(
+        self,
+        image_latent_inputs: List[str] = ["image_latents"],
+        additional_batch_inputs: List[str] = [],
+    ):
         if not isinstance(image_latent_inputs, list):
             image_latent_inputs = [image_latent_inputs]
         if not isinstance(additional_batch_inputs, list):
@@ -263,14 +242,12 @@ class QwenImageInputsDynamicStep(ModularPipelineBlocks):

     @property
     def description(self) -> str:
-        # Functionality section
         summary_section = (
             "Input processing step that:\n"
-            " 1. For image latent inputs: Updates height/width if None, patchifies latents, and expands batch size\n"
+            " 1. For image latent inputs: Updates height/width if None, patchifies, and expands batch size\n"
             " 2. For additional batch inputs: Expands batch dimensions to match final batch size"
         )

-        # Inputs info
         inputs_info = ""
         if self._image_latent_inputs or self._additional_batch_inputs:
             inputs_info = "\n\nConfigured inputs:"
@@ -279,11 +256,16 @@ class QwenImageInputsDynamicStep(ModularPipelineBlocks):
         if self._additional_batch_inputs:
             inputs_info += f"\n - Additional batch inputs: {self._additional_batch_inputs}"

-        # Placement guidance
         placement_section = "\n\nThis block should be placed after the encoder steps and the text input step."

         return summary_section + inputs_info + placement_section

+    @property
+    def expected_components(self) -> List[ComponentSpec]:
+        return [
+            ComponentSpec("pachifier", QwenImagePachifier, default_creation_method="from_config"),
+        ]
+
     @property
     def inputs(self) -> List[InputParam]:
         inputs = [
@@ -293,11 +275,9 @@ class QwenImageInputsDynamicStep(ModularPipelineBlocks):
             InputParam(name="width"),
         ]

-        # Add image latent inputs
         for image_latent_input_name in self._image_latent_inputs:
             inputs.append(InputParam(name=image_latent_input_name))

-        # Add additional batch inputs
         for input_name in self._additional_batch_inputs:
             inputs.append(InputParam(name=input_name))
@@ -310,22 +290,16 @@ class QwenImageInputsDynamicStep(ModularPipelineBlocks):
             OutputParam(name="image_width", type_hint=int, description="The width of the image latents"),
         ]

-    @property
-    def expected_components(self) -> List[ComponentSpec]:
-        return [
-            ComponentSpec("pachifier", QwenImagePachifier, default_creation_method="from_config"),
-        ]
-
     def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState:
         block_state = self.get_block_state(state)

-        # Process image latent inputs (height/width calculation, patchify, and batch expansion)
+        # Process image latent inputs
         for image_latent_input_name in self._image_latent_inputs:
             image_latent_tensor = getattr(block_state, image_latent_input_name)
             if image_latent_tensor is None:
                 continue

-            # 1. Calculate height/width from latents
+            # 1. Calculate height/width from latents and update if not provided
             height, width = calculate_dimension_from_latents(image_latent_tensor, components.vae_scale_factor)
             block_state.height = block_state.height or height
             block_state.width = block_state.width or width
@@ -335,7 +309,7 @@ class QwenImageInputsDynamicStep(ModularPipelineBlocks):
             if not hasattr(block_state, "image_width"):
                 block_state.image_width = width

-            # 2. Patchify the image latent tensor
+            # 2. Patchify
             image_latent_tensor = components.pachifier.pack_latents(image_latent_tensor)

             # 3. Expand batch size
@@ -354,7 +328,6 @@ class QwenImageInputsDynamicStep(ModularPipelineBlocks):
            if input_tensor is None:
                continue

            # Only expand batch size
            input_tensor = repeat_tensor_to_batch_size(
                input_name=input_name,
                input_tensor=input_tensor,

@@ -368,63 +341,130 @@ class QwenImageInputsDynamicStep(ModularPipelineBlocks):
        return components, state
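
For orientation, a hedged configuration sketch of this dynamic step (the input names are illustrative; the step operates on whichever state entries it is configured with, exactly as the img2img/inpaint input steps later in this compare use it):

# Hedged sketch: configure the dynamic step the way the inpaint input step does.
step = QwenImageInputsDynamicStep(
    image_latent_inputs=["image_latents"],             # height/width update + patchify + batch expansion
    additional_batch_inputs=["processed_mask_image"],  # batch expansion only
)
print([inp.name for inp in step.inputs])  # base inputs plus the two configured names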


-class QwenImageEditPlusInputsDynamicStep(QwenImageInputsDynamicStep):
+class QwenImageEditPlusInputsDynamicStep(ModularPipelineBlocks):
    """Input step for QwenImage Edit Plus: handles a list of latents with different sizes."""

    model_name = "qwenimage-edit-plus"

    def __init__(
        self,
        image_latent_inputs: List[str] = ["image_latents"],
        additional_batch_inputs: List[str] = [],
    ):
        if not isinstance(image_latent_inputs, list):
            image_latent_inputs = [image_latent_inputs]
        if not isinstance(additional_batch_inputs, list):
            additional_batch_inputs = [additional_batch_inputs]

        self._image_latent_inputs = image_latent_inputs
        self._additional_batch_inputs = additional_batch_inputs
        super().__init__()

    @property
    def description(self) -> str:
        summary_section = (
            "Input processing step for Edit Plus that:\n"
            " 1. For image latent inputs (list): Collects heights/widths, patchifies each, concatenates, expands batch\n"
            " 2. For additional batch inputs: Expands batch dimensions to match final batch size\n"
            " Height/width defaults to the last image in the list."
        )

        inputs_info = ""
        if self._image_latent_inputs or self._additional_batch_inputs:
            inputs_info = "\n\nConfigured inputs:"
            if self._image_latent_inputs:
                inputs_info += f"\n - Image latent inputs: {self._image_latent_inputs}"
            if self._additional_batch_inputs:
                inputs_info += f"\n - Additional batch inputs: {self._additional_batch_inputs}"

        placement_section = "\n\nThis block should be placed after the encoder steps and the text input step."

        return summary_section + inputs_info + placement_section

    @property
    def expected_components(self) -> List[ComponentSpec]:
        return [
            ComponentSpec("pachifier", QwenImagePachifier, default_creation_method="from_config"),
        ]

    @property
    def inputs(self) -> List[InputParam]:
        inputs = [
            InputParam(name="num_images_per_prompt", default=1),
            InputParam(name="batch_size", required=True),
            InputParam(name="height"),
            InputParam(name="width"),
        ]

        for image_latent_input_name in self._image_latent_inputs:
            inputs.append(InputParam(name=image_latent_input_name))

        for input_name in self._additional_batch_inputs:
            inputs.append(InputParam(name=input_name))

        return inputs

    @property
    def intermediate_outputs(self) -> List[OutputParam]:
        return [
-            OutputParam(name="image_height", type_hint=List[int], description="The height of the image latents"),
-            OutputParam(name="image_width", type_hint=List[int], description="The width of the image latents"),
+            OutputParam(name="image_height", type_hint=List[int], description="The heights of the image latents"),
+            OutputParam(name="image_width", type_hint=List[int], description="The widths of the image latents"),
        ]

    def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState:
        block_state = self.get_block_state(state)

-        # Process image latent inputs (height/width calculation, patchify, and batch expansion)
+        # Process image latent inputs
        for image_latent_input_name in self._image_latent_inputs:
            image_latent_tensor = getattr(block_state, image_latent_input_name)
            if image_latent_tensor is None:
                continue

            # Each image latent can have a different size in QwenImage Edit Plus.
            is_list = isinstance(image_latent_tensor, list)
            if not is_list:
                image_latent_tensor = [image_latent_tensor]

            image_heights = []
            image_widths = []
            packed_image_latent_tensors = []

-            for img_latent_tensor in image_latent_tensor:
+            for i, img_latent_tensor in enumerate(image_latent_tensor):
                # 1. Calculate height/width from latents
                height, width = calculate_dimension_from_latents(img_latent_tensor, components.vae_scale_factor)
                image_heights.append(height)
                image_widths.append(width)

-                # 2. Patchify the image latent tensor
+                # 2. Patchify
                img_latent_tensor = components.pachifier.pack_latents(img_latent_tensor)

                # 3. Expand batch size
                img_latent_tensor = repeat_tensor_to_batch_size(
-                    input_name=image_latent_input_name,
+                    input_name=f"{image_latent_input_name}[{i}]",
                    input_tensor=img_latent_tensor,
                    num_images_per_prompt=block_state.num_images_per_prompt,
                    batch_size=block_state.batch_size,
                )
                packed_image_latent_tensors.append(img_latent_tensor)

            # Concatenate all packed latents along dim=1
            packed_image_latent_tensors = torch.cat(packed_image_latent_tensors, dim=1)

            # Output lists of heights/widths
            block_state.image_height = image_heights
            block_state.image_width = image_widths
-            setattr(block_state, image_latent_input_name, packed_image_latent_tensors)

            # Default height/width from last image
            block_state.height = block_state.height or image_heights[-1]
            block_state.width = block_state.width or image_widths[-1]

+            setattr(block_state, image_latent_input_name, packed_image_latent_tensors)

        # Process additional batch inputs (only batch expansion)
        for input_name in self._additional_batch_inputs:
            input_tensor = getattr(block_state, input_name)
            if input_tensor is None:
                continue

            # Only expand batch size
            input_tensor = repeat_tensor_to_batch_size(
                input_name=input_name,
                input_tensor=input_tensor,

@@ -436,8 +476,6 @@ class QwenImageEditPlusInputsDynamicStep(QwenImageInputsDynamicStep):

        self.set_block_state(state, block_state)
        return components, state
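
The pack/concat arithmetic above can be checked in isolation. A standalone sketch, assuming the usual 2x2 patchify layout for QwenImage latents (illustrative, not the exact `QwenImagePachifier` implementation):

import torch

def pack_latents(x):
    # (B, C, H, W) -> (B, H//2 * W//2, C * 4): fold each 2x2 spatial patch into channels
    b, c, h, w = x.shape
    x = x.view(b, c, h // 2, 2, w // 2, 2)
    return x.permute(0, 2, 4, 1, 3, 5).reshape(b, (h // 2) * (w // 2), c * 4)

lat_a = torch.randn(1, 16, 64, 64)  # e.g. a 512x512 image at vae_scale_factor=8
lat_b = torch.randn(1, 16, 48, 80)  # a differently sized image
packed = torch.cat([pack_latents(lat_a), pack_latents(lat_b)], dim=1)
print(packed.shape)  # torch.Size([1, 1984, 64]) -> 1024 + 960 tokens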


class QwenImageControlNetInputsStep(ModularPipelineBlocks):
    model_name = "qwenimage"


File diff suppressed because it is too large
@@ -0,0 +1,465 @@

# Copyright 2025 Qwen-Image Team and The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from ...utils import logging
from ..modular_pipeline import AutoPipelineBlocks, ConditionalPipelineBlocks, SequentialPipelineBlocks
from ..modular_pipeline_utils import InsertableDict
from .before_denoise import (
    QwenImageControlNetBeforeDenoiserStep,
    QwenImageCreateMaskLatentsStep,
    QwenImagePrepareLatentsStep,
    QwenImagePrepareLatentsWithStrengthStep,
    QwenImageRoPEInputsStep,
    QwenImageSetTimestepsStep,
    QwenImageSetTimestepsWithStrengthStep,
)
from .decoders import (
    QwenImageAfterDenoiseStep,
    QwenImageDecoderStep,
    QwenImageInpaintProcessImagesOutputStep,
    QwenImageProcessImagesOutputStep,
)
from .denoise import (
    QwenImageControlNetDenoiseStep,
    QwenImageDenoiseStep,
    QwenImageInpaintControlNetDenoiseStep,
    QwenImageInpaintDenoiseStep,
    QwenImageLoopBeforeDenoiserControlNet,
)
from .encoders import (
    QwenImageControlNetVaeEncoderStep,
    QwenImageInpaintProcessImagesInputStep,
    QwenImageProcessImagesInputStep,
    QwenImageTextEncoderStep,
    QwenImageVaeEncoderDynamicStep,
)
from .inputs import (
    QwenImageControlNetInputsStep,
    QwenImageInputsDynamicStep,
    QwenImageTextInputsStep,
)


logger = logging.get_logger(__name__)


# 1. VAE ENCODER

# inpaint vae encoder
class QwenImageInpaintVaeEncoderStep(SequentialPipelineBlocks):
    model_name = "qwenimage"
    block_classes = [QwenImageInpaintProcessImagesInputStep(), QwenImageVaeEncoderDynamicStep()]
    block_names = ["preprocess", "encode"]

    @property
    def description(self) -> str:
        return (
            "This step is used for processing image and mask inputs for inpainting tasks. It:\n"
            " - Resizes the image to the target size, based on `height` and `width`.\n"
            " - Processes and updates `image` and `mask_image`.\n"
            " - Creates `image_latents`."
        )


# img2img vae encoder
class QwenImageImg2ImgVaeEncoderStep(SequentialPipelineBlocks):
    model_name = "qwenimage"

    block_classes = [QwenImageProcessImagesInputStep(), QwenImageVaeEncoderDynamicStep()]
    block_names = ["preprocess", "encode"]

    @property
    def description(self) -> str:
        return "Vae encoder step that preprocesses and encodes the image inputs into their latent representations."


# auto vae encoder
class QwenImageAutoVaeEncoderStep(AutoPipelineBlocks):
    block_classes = [QwenImageInpaintVaeEncoderStep, QwenImageImg2ImgVaeEncoderStep]
    block_names = ["inpaint", "img2img"]
    block_trigger_inputs = ["mask_image", "image"]

    @property
    def description(self):
        return (
            "Vae encoder step that encodes the image inputs into their latent representations.\n"
            + "This is an auto pipeline block.\n"
            + " - `QwenImageInpaintVaeEncoderStep` (inpaint) is used when `mask_image` is provided.\n"
            + " - `QwenImageImg2ImgVaeEncoderStep` (img2img) is used when `image` is provided.\n"
            + " - if neither `mask_image` nor `image` is provided, the step is skipped."
        )
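
The trigger semantics these auto blocks rely on (the first block whose trigger input is present is dispatched; a None trigger marks the default; no match means the step is skipped) can be sketched standalone:

# Minimal sketch of the assumed AutoPipelineBlocks dispatch rule.
def dispatch(block_names, block_trigger_inputs, provided):
    for name, trigger in zip(block_names, block_trigger_inputs):
        if trigger is None or provided.get(trigger) is not None:
            return name
    return None  # no trigger matched -> the step is skipped

assert dispatch(["inpaint", "img2img"], ["mask_image", "image"], {"mask_image": "m", "image": "i"}) == "inpaint"
assert dispatch(["inpaint", "img2img"], ["mask_image", "image"], {"image": "i"}) == "img2img"
assert dispatch(["inpaint", "img2img"], ["mask_image", "image"], {}) is None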


# optional controlnet vae encoder
class QwenImageOptionalControlNetVaeEncoderStep(AutoPipelineBlocks):
    block_classes = [QwenImageControlNetVaeEncoderStep]
    block_names = ["controlnet"]
    block_trigger_inputs = ["control_image"]

    @property
    def description(self):
        return (
            "Vae encoder step that encodes the control image inputs into their latent representations.\n"
            + "This is an auto pipeline block.\n"
            + " - `QwenImageControlNetVaeEncoderStep` (controlnet) is used when `control_image` is provided.\n"
            + " - if `control_image` is not provided, the step is skipped."
        )


# 2. DENOISE
# input -> prepare_latents -> set_timesteps -> prepare_rope_inputs -> denoise -> after_denoise

# img2img input
class QwenImageImg2ImgInputStep(SequentialPipelineBlocks):
    model_name = "qwenimage"
    block_classes = [QwenImageTextInputsStep(), QwenImageInputsDynamicStep(image_latent_inputs=["image_latents"])]
    block_names = ["text_inputs", "additional_inputs"]

    @property
    def description(self):
        return (
            "Input step that prepares the inputs for the img2img denoising step. It:\n"
            " - makes sure the text embeddings and the additional inputs (`image_latents`) have a consistent batch size.\n"
            " - updates height/width based on `image_latents`, patchifies `image_latents`."
        )


# inpaint input
class QwenImageInpaintInputStep(SequentialPipelineBlocks):
    model_name = "qwenimage"
    block_classes = [QwenImageTextInputsStep(), QwenImageInputsDynamicStep(image_latent_inputs=["image_latents"], additional_batch_inputs=["processed_mask_image"])]
    block_names = ["text_inputs", "additional_inputs"]

    @property
    def description(self):
        return (
            "Input step that prepares the inputs for the inpainting denoising step. It:\n"
            " - makes sure the text embeddings and the additional inputs (`image_latents` and `processed_mask_image`) have a consistent batch size.\n"
            " - updates height/width based on `image_latents`, patchifies `image_latents`."
        )


# inpaint prepare latents
class QwenImageInpaintPrepareLatentsStep(SequentialPipelineBlocks):
    model_name = "qwenimage"
    block_classes = [QwenImagePrepareLatentsWithStrengthStep(), QwenImageCreateMaskLatentsStep()]
    block_names = ["add_noise_to_latents", "create_mask_latents"]

    @property
    def description(self) -> str:
        return (
            "This step prepares the latents/image_latents and mask inputs for the inpainting denoising step. It:\n"
            " - Adds noise to the image latents to create the latents input for the denoiser.\n"
            " - Creates the patchified latents `mask` based on the processed mask image.\n"
        )


# CoreDenoiseStep:
# (input + prepare_latents + set_timesteps + prepare_rope_inputs + denoise + after_denoise)

# 1. text2image
class QwenImageCoreDenoiseStep(SequentialPipelineBlocks):
    model_name = "qwenimage"
    block_classes = [
        QwenImageTextInputsStep(),
        QwenImagePrepareLatentsStep(),
        QwenImageSetTimestepsStep(),
        QwenImageRoPEInputsStep(),
        QwenImageDenoiseStep(),
        QwenImageAfterDenoiseStep(),
    ]
    block_names = [
        "input",
        "prepare_latents",
        "set_timesteps",
        "prepare_rope_inputs",
        "denoise",
        "after_denoise",
    ]

    @property
    def description(self):
        return "Step that denoises the latents for the text2image task. It includes the denoise loop as well as the preparation of its inputs (timesteps, latents, RoPE inputs, etc.)."


# 2. inpaint
class QwenImageInpaintCoreDenoiseStep(SequentialPipelineBlocks):
    model_name = "qwenimage"
    block_classes = [
        QwenImageInpaintInputStep(),
        QwenImagePrepareLatentsStep(),
        QwenImageSetTimestepsWithStrengthStep(),
        QwenImageInpaintPrepareLatentsStep(),
        QwenImageRoPEInputsStep(),
        QwenImageInpaintDenoiseStep(),
        QwenImageAfterDenoiseStep(),
    ]
    block_names = [
        "input",
        "prepare_latents",
        "set_timesteps",
        "prepare_inpaint_latents",
        "prepare_rope_inputs",
        "denoise",
        "after_denoise",
    ]

    @property
    def description(self):
        return "Step that denoises the latents for the inpaint task. It includes the denoise loop as well as the preparation of its inputs (timesteps, latents, RoPE inputs, etc.)."


# 3. img2img
class QwenImageImg2ImgCoreDenoiseStep(SequentialPipelineBlocks):
    model_name = "qwenimage"
    block_classes = [
        QwenImageImg2ImgInputStep(),
        QwenImagePrepareLatentsStep(),
        QwenImageSetTimestepsWithStrengthStep(),
        QwenImagePrepareLatentsWithStrengthStep(),
        QwenImageRoPEInputsStep(),
        QwenImageDenoiseStep(),
        QwenImageAfterDenoiseStep(),
    ]
    block_names = [
        "input",
        "prepare_latents",
        "set_timesteps",
        "prepare_img2img_latents",
        "prepare_rope_inputs",
        "denoise",
        "after_denoise",
    ]

    @property
    def description(self):
        return "Step that denoises the latents for the img2img task. It includes the denoise loop as well as the preparation of its inputs (timesteps, latents, RoPE inputs, etc.)."



# 4. text2image + controlnet
class QwenImageControlNetCoreDenoiseStep(SequentialPipelineBlocks):
    model_name = "qwenimage"
    block_classes = [
        QwenImageTextInputsStep(),
        QwenImageControlNetInputsStep(),
        QwenImagePrepareLatentsStep(),
        QwenImageSetTimestepsStep(),
        QwenImageRoPEInputsStep(),
        QwenImageControlNetBeforeDenoiserStep(),
        QwenImageControlNetDenoiseStep(),
        QwenImageAfterDenoiseStep(),
    ]
    block_names = [
        "input",
        "controlnet_input",
        "prepare_latents",
        "set_timesteps",
        "prepare_rope_inputs",
        "controlnet_before_denoise",
        "controlnet_denoise",
        "after_denoise",
    ]

    @property
    def description(self):
        return "Step that denoises the latents for the text2image task with controlnet. It includes the denoise loop as well as the preparation of its inputs (timesteps, latents, RoPE inputs, etc.)."


# 5. inpaint + controlnet
class QwenImageControlNetInpaintCoreDenoiseStep(SequentialPipelineBlocks):
    model_name = "qwenimage"
    block_classes = [
        QwenImageInpaintInputStep(),
        QwenImageControlNetInputsStep(),
        QwenImagePrepareLatentsStep(),
        QwenImageSetTimestepsWithStrengthStep(),
        QwenImageInpaintPrepareLatentsStep(),
        QwenImageRoPEInputsStep(),
        QwenImageControlNetBeforeDenoiserStep(),
        QwenImageInpaintControlNetDenoiseStep(),
        QwenImageAfterDenoiseStep(),
    ]
    block_names = [
        "input",
        "controlnet_input",
        "prepare_latents",
        "set_timesteps",
        "prepare_inpaint_latents",
        "prepare_rope_inputs",
        "controlnet_before_denoise",
        "controlnet_denoise",
        "after_denoise",
    ]

    @property
    def description(self):
        return "Step that denoises the latents for the inpaint task with controlnet. It includes the denoise loop as well as the preparation of its inputs (timesteps, latents, RoPE inputs, etc.)."


# 6. img2img + controlnet
class QwenImageControlNetImg2ImgCoreDenoiseStep(SequentialPipelineBlocks):
    model_name = "qwenimage"
    block_classes = [
        QwenImageImg2ImgInputStep(),
        QwenImageControlNetInputsStep(),
        QwenImagePrepareLatentsStep(),
        QwenImageSetTimestepsWithStrengthStep(),
        QwenImagePrepareLatentsWithStrengthStep(),
        QwenImageRoPEInputsStep(),
        QwenImageControlNetBeforeDenoiserStep(),
        QwenImageControlNetDenoiseStep(),
        QwenImageAfterDenoiseStep(),
    ]
    block_names = [
        "input",
        "controlnet_input",
        "prepare_latents",
        "set_timesteps",
        "prepare_img2img_latents",
        "prepare_rope_inputs",
        "controlnet_before_denoise",
        "controlnet_denoise",
        "after_denoise",
    ]

    @property
    def description(self):
        return "Step that denoises the latents for the img2img task with controlnet. It includes the denoise loop as well as the preparation of its inputs (timesteps, latents, RoPE inputs, etc.)."


# auto denoise
# auto denoise step: selects the core denoise workflow for all tasks, with or without controlnet
class QwenImageAutoCoreDenoiseStep(ConditionalPipelineBlocks):
    block_classes = [
        QwenImageCoreDenoiseStep,
        QwenImageInpaintCoreDenoiseStep,
        QwenImageImg2ImgCoreDenoiseStep,
        QwenImageControlNetCoreDenoiseStep,
        QwenImageControlNetInpaintCoreDenoiseStep,
        QwenImageControlNetImg2ImgCoreDenoiseStep,
    ]
    block_names = [
        "text2image",
        "inpaint",
        "img2img",
        "controlnet_text2image",
        "controlnet_inpaint",
        "controlnet_img2img",
    ]
    block_trigger_inputs = ["control_image_latents", "processed_mask_image", "image_latents"]
    default_block_name = "text2image"

    def select_block(self, control_image_latents=None, processed_mask_image=None, image_latents=None):
        if control_image_latents is not None:
            if processed_mask_image is not None:
                return "controlnet_inpaint"
            elif image_latents is not None:
                return "controlnet_img2img"
            else:
                return "controlnet_text2image"
        else:
            if processed_mask_image is not None:
                return "inpaint"
            elif image_latents is not None:
                return "img2img"
            else:
                return "text2image"

    @property
    def description(self):
        return (
            "Core step that performs the denoising process. \n"
            + " - `QwenImageCoreDenoiseStep` (text2image) for text2image tasks.\n"
            + " - `QwenImageInpaintCoreDenoiseStep` (inpaint) for inpaint tasks.\n"
            + " - `QwenImageImg2ImgCoreDenoiseStep` (img2img) for img2img tasks.\n"
            + " - `QwenImageControlNetCoreDenoiseStep` (controlnet_text2image) for text2image tasks with controlnet.\n"
            + " - `QwenImageControlNetInpaintCoreDenoiseStep` (controlnet_inpaint) for inpaint tasks with controlnet.\n"
            + " - `QwenImageControlNetImg2ImgCoreDenoiseStep` (controlnet_img2img) for img2img tasks with controlnet.\n"
            + "This step supports text-to-image, image-to-image, inpainting, and controlnet tasks for QwenImage:\n"
            + " - for image-to-image generation, you need to provide `image_latents`\n"
            + " - for inpainting, you need to provide `processed_mask_image` and `image_latents`\n"
            + " - to run the controlnet workflow, you need to provide `control_image_latents`\n"
            + " - for text-to-image generation, all you need to provide is prompt embeddings"
        )
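
The routing is easy to sanity-check; a small sketch with truthy placeholders standing in for the actual latent tensors (note that instantiating the step does build all of its sub-blocks):

step = QwenImageAutoCoreDenoiseStep()
assert step.select_block(control_image_latents=1, processed_mask_image=1) == "controlnet_inpaint"
assert step.select_block(control_image_latents=1, image_latents=1) == "controlnet_img2img"
assert step.select_block(control_image_latents=1) == "controlnet_text2image"
assert step.select_block(processed_mask_image=1) == "inpaint"
assert step.select_block(image_latents=1) == "img2img"
assert step.select_block() == "text2image"  # default_block_name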


# 4. DECODE

## 1.1 text2image

#### decode
#### (standard decode step works for most tasks except for inpaint)

class QwenImageDecodeStep(SequentialPipelineBlocks):
    model_name = "qwenimage"
    block_classes = [QwenImageDecoderStep(), QwenImageProcessImagesOutputStep()]
    block_names = ["decode", "postprocess"]

    @property
    def description(self):
        return "Decode step that decodes the latents to images and postprocesses the generated image."



#### inpaint decode

class QwenImageInpaintDecodeStep(SequentialPipelineBlocks):
    model_name = "qwenimage"
    block_classes = [QwenImageDecoderStep(), QwenImageInpaintProcessImagesOutputStep()]
    block_names = ["decode", "postprocess"]

    @property
    def description(self):
        return "Decode step that decodes the latents to images and postprocesses the generated image, optionally applying the mask overlay to the original image."


# auto decode step for inpaint and text2image tasks
class QwenImageAutoDecodeStep(AutoPipelineBlocks):
    block_classes = [QwenImageInpaintDecodeStep, QwenImageDecodeStep]
    block_names = ["inpaint_decode", "decode"]
    block_trigger_inputs = ["mask", None]

    @property
    def description(self):
        return (
            "Decode step that decodes the latents into images.\n"
            "This is an auto pipeline block that works for inpaint/text2image/img2img tasks, for both QwenImage and QwenImage-Edit.\n"
            + " - `QwenImageInpaintDecodeStep` (inpaint) is used when `mask` is provided.\n"
            + " - `QwenImageDecodeStep` (text2image/img2img) is used when `mask` is not provided.\n"
        )



## 1.10 QwenImage/auto block & presets
AUTO_BLOCKS = InsertableDict(
    [
        ("text_encoder", QwenImageTextEncoderStep()),
        ("vae_encoder", QwenImageAutoVaeEncoderStep()),
        ("controlnet_vae_encoder", QwenImageOptionalControlNetVaeEncoderStep()),
        ("denoise", QwenImageAutoCoreDenoiseStep()),
        ("decode", QwenImageAutoDecodeStep()),
    ]
)
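
Presets are ordered mappings, so custom workflows can be assembled from subsets. A hedged sketch, assuming `InsertableDict` keeps standard dict semantics and that the modular `SequentialPipelineBlocks.from_blocks_dict` helper is available:

# Hypothetical: build a text2image-only workflow without the controlnet encoder.
blocks_dict = AUTO_BLOCKS.copy()
blocks_dict.pop("controlnet_vae_encoder")
t2i_blocks = SequentialPipelineBlocks.from_blocks_dict(blocks_dict)
print(t2i_blocks.sub_blocks.keys())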


class QwenImageAutoBlocks(SequentialPipelineBlocks):
    model_name = "qwenimage"

    block_classes = AUTO_BLOCKS.values()
    block_names = AUTO_BLOCKS.keys()

    @property
    def description(self):
        return (
            "Auto Modular pipeline for text-to-image, image-to-image, inpainting, and controlnet tasks using QwenImage.\n"
            + "- for image-to-image generation, you need to provide `image`\n"
            + "- for inpainting, you need to provide `mask_image` and `image`, optionally you can provide `padding_mask_crop`\n"
            + "- to run the controlnet workflow, you need to provide `control_image`\n"
            + "- for text-to-image generation, all you need to provide is `prompt`"
        )
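
A minimal end-to-end sketch; the repo id, loading calls (`init_pipeline`, `load_components`), and the `output="images"` convention follow the general modular-pipeline API and are assumptions here, not guarantees for this checkpoint:

import torch

blocks = QwenImageAutoBlocks()
pipe = blocks.init_pipeline("Qwen/Qwen-Image")  # assumed modular repo id
pipe.load_components(torch_dtype=torch.bfloat16)
pipe.to("cuda")

# text-to-image: only `prompt` is required
images = pipe(prompt="a cat wearing a tiny hat", num_inference_steps=30, output="images")
images[0].save("cat.png")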

@@ -0,0 +1,329 @@

# Copyright 2025 Qwen-Image Team and The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Optional

from ...utils import logging
from ..modular_pipeline import AutoPipelineBlocks, ConditionalPipelineBlocks, SequentialPipelineBlocks
from ..modular_pipeline_utils import InsertableDict
from .before_denoise import (
    QwenImageCreateMaskLatentsStep,
    QwenImageEditRoPEInputsStep,
    QwenImagePrepareLatentsStep,
    QwenImagePrepareLatentsWithStrengthStep,
    QwenImageSetTimestepsStep,
    QwenImageSetTimestepsWithStrengthStep,
)
from .decoders import (
    QwenImageAfterDenoiseStep,
    QwenImageDecoderStep,
    QwenImageInpaintProcessImagesOutputStep,
    QwenImageProcessImagesOutputStep,
)
from .denoise import (
    QwenImageEditDenoiseStep,
    QwenImageEditInpaintDenoiseStep,
)
from .encoders import (
    QwenImageEditResizeDynamicStep,
    QwenImageEditTextEncoderStep,
    QwenImageInpaintProcessImagesInputStep,
    QwenImageProcessImagesInputStep,
    QwenImageVaeEncoderDynamicStep,
)
from .inputs import (
    QwenImageInputsDynamicStep,
    QwenImageTextInputsStep,
)


logger = logging.get_logger(__name__)


# ====================
# 1. TEXT ENCODER
# ====================

class QwenImageEditVLEncoderStep(SequentialPipelineBlocks):
    """VL encoder that takes both image and text prompts."""
    model_name = "qwenimage-edit"
    block_classes = [
        QwenImageEditResizeDynamicStep(),
        QwenImageEditTextEncoderStep(),
    ]
    block_names = ["resize", "encode"]

    @property
    def description(self) -> str:
        return "QwenImage-Edit VL encoder step that encodes the image and text prompts together."


# ====================
# 2. VAE ENCODER
# ====================

# Edit VAE encoder
class QwenImageEditVaeEncoderStep(SequentialPipelineBlocks):
    model_name = "qwenimage-edit"
    block_classes = [
        QwenImageEditResizeDynamicStep(),
        QwenImageProcessImagesInputStep(),
        QwenImageVaeEncoderDynamicStep(),
    ]
    block_names = ["resize", "preprocess", "encode"]

    @property
    def description(self) -> str:
        return "Vae encoder step that encodes the image inputs into their latent representations."


# Edit Inpaint VAE encoder
class QwenImageEditInpaintVaeEncoderStep(SequentialPipelineBlocks):
    model_name = "qwenimage-edit"
    block_classes = [
        QwenImageEditResizeDynamicStep(),
        QwenImageInpaintProcessImagesInputStep(),
        QwenImageVaeEncoderDynamicStep(input_name="processed_image", output_name="image_latents"),
    ]
    block_names = ["resize", "preprocess", "encode"]

    @property
    def description(self) -> str:
        return (
            "This step is used for processing image and mask inputs for QwenImage-Edit inpaint tasks. It:\n"
            " - resizes the image to the target area (1024 * 1024) while maintaining the aspect ratio.\n"
            " - processes the resized image and mask image.\n"
            " - creates image latents."
        )


# Auto VAE encoder
class QwenImageEditAutoVaeEncoderStep(AutoPipelineBlocks):
    block_classes = [QwenImageEditInpaintVaeEncoderStep, QwenImageEditVaeEncoderStep]
    block_names = ["edit_inpaint", "edit"]
    block_trigger_inputs = ["mask_image", "image"]

    @property
    def description(self):
        return (
            "Vae encoder step that encodes the image inputs into their latent representations.\n"
            "This is an auto pipeline block.\n"
            " - `QwenImageEditInpaintVaeEncoderStep` (edit_inpaint) is used when `mask_image` is provided.\n"
            " - `QwenImageEditVaeEncoderStep` (edit) is used when `image` is provided.\n"
            " - if neither `mask_image` nor `image` is provided, the step is skipped."
        )


# ====================
# 3. DENOISE - input -> prepare_latents -> set_timesteps -> prepare_rope_inputs -> denoise -> after_denoise
# ====================

# Edit input step
class QwenImageEditInputStep(SequentialPipelineBlocks):
    model_name = "qwenimage-edit"
    block_classes = [
        QwenImageTextInputsStep(),
        QwenImageInputsDynamicStep(image_latent_inputs=["image_latents"]),
    ]
    block_names = ["text_inputs", "additional_inputs"]

    @property
    def description(self):
        return (
            "Input step that prepares the inputs for the edit denoising step. It:\n"
            " - makes sure the text embeddings and the additional inputs have a consistent batch size.\n"
            " - updates height/width based on `image_latents`, patchifies `image_latents`."
        )


# Edit Inpaint input step
class QwenImageEditInpaintInputStep(SequentialPipelineBlocks):
    model_name = "qwenimage-edit"
    block_classes = [
        QwenImageTextInputsStep(),
        QwenImageInputsDynamicStep(image_latent_inputs=["image_latents"], additional_batch_inputs=["processed_mask_image"]),
    ]
    block_names = ["text_inputs", "additional_inputs"]

    @property
    def description(self):
        return (
            "Input step that prepares the inputs for the edit inpaint denoising step. It:\n"
            " - makes sure the text embeddings and the additional inputs have a consistent batch size.\n"
            " - updates height/width based on `image_latents`, patchifies `image_latents`."
        )


# Edit Inpaint prepare latents step
class QwenImageEditInpaintPrepareLatentsStep(SequentialPipelineBlocks):
    model_name = "qwenimage-edit"
    block_classes = [QwenImagePrepareLatentsWithStrengthStep(), QwenImageCreateMaskLatentsStep()]
    block_names = ["add_noise_to_latents", "create_mask_latents"]

    @property
    def description(self) -> str:
        return (
            "This step prepares the latents/image_latents and mask inputs for the edit inpainting denoising step. It:\n"
            " - Adds noise to the image latents to create the latents input for the denoiser.\n"
            " - Creates the patchified latents `mask` based on the processed mask image.\n"
        )


# 1. Edit (img2img) core denoise
class QwenImageEditCoreDenoiseStep(SequentialPipelineBlocks):
    model_name = "qwenimage-edit"
    block_classes = [
        QwenImageEditInputStep(),
        QwenImagePrepareLatentsStep(),
        QwenImageSetTimestepsStep(),
        QwenImageEditRoPEInputsStep(),
        QwenImageEditDenoiseStep(),
        QwenImageAfterDenoiseStep(),
    ]
    block_names = [
        "input",
        "prepare_latents",
        "set_timesteps",
        "prepare_rope_inputs",
        "denoise",
        "after_denoise",
    ]

    @property
    def description(self):
        return "Core denoising workflow for the QwenImage-Edit edit (img2img) task."


# 2. Edit Inpaint core denoise
class QwenImageEditInpaintCoreDenoiseStep(SequentialPipelineBlocks):
    model_name = "qwenimage-edit"
    block_classes = [
        QwenImageEditInpaintInputStep(),
        QwenImagePrepareLatentsStep(),
        QwenImageSetTimestepsWithStrengthStep(),
        QwenImageEditInpaintPrepareLatentsStep(),
        QwenImageEditRoPEInputsStep(),
        QwenImageEditInpaintDenoiseStep(),
        QwenImageAfterDenoiseStep(),
    ]
    block_names = [
        "input",
        "prepare_latents",
        "set_timesteps",
        "prepare_inpaint_latents",
        "prepare_rope_inputs",
        "denoise",
        "after_denoise",
    ]

    @property
    def description(self):
        return "Core denoising workflow for the QwenImage-Edit edit inpaint task."


# Auto core denoise step
class QwenImageEditAutoCoreDenoiseStep(ConditionalPipelineBlocks):
    block_classes = [
        QwenImageEditInpaintCoreDenoiseStep,
        QwenImageEditCoreDenoiseStep,
    ]
    block_names = ["edit_inpaint", "edit"]
    block_trigger_inputs = ["processed_mask_image", "image_latents"]
    default_block_name = "edit"

    def select_block(self, processed_mask_image=None, image_latents=None) -> Optional[str]:
        if processed_mask_image is not None:
            return "edit_inpaint"
        elif image_latents is not None:
            return "edit"
        return None

    @property
    def description(self):
        return (
            "Auto core denoising step that selects the appropriate workflow based on inputs.\n"
            " - `QwenImageEditInpaintCoreDenoiseStep` when `processed_mask_image` is provided\n"
            " - `QwenImageEditCoreDenoiseStep` when `image_latents` is provided\n"
            "Supports edit (img2img) and edit inpainting tasks for QwenImage-Edit."
        )
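
Resolution sketch (truthy placeholders stand in for real tensors; a None return falls back to the default/skip handling of `ConditionalPipelineBlocks`):

step = QwenImageEditAutoCoreDenoiseStep()
assert step.select_block(processed_mask_image=1, image_latents=1) == "edit_inpaint"
assert step.select_block(image_latents=1) == "edit"
assert step.select_block() is None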


# ====================
# 4. DECODE
# ====================

# Decode step (standard)
class QwenImageEditDecodeStep(SequentialPipelineBlocks):
    model_name = "qwenimage-edit"
    block_classes = [QwenImageDecoderStep(), QwenImageProcessImagesOutputStep()]
    block_names = ["decode", "postprocess"]

    @property
    def description(self):
        return "Decode step that decodes the latents to images and postprocesses the generated image."


# Inpaint decode step
class QwenImageEditInpaintDecodeStep(SequentialPipelineBlocks):
    model_name = "qwenimage-edit"
    block_classes = [QwenImageDecoderStep(), QwenImageInpaintProcessImagesOutputStep()]
    block_names = ["decode", "postprocess"]

    @property
    def description(self):
        return "Decode step that decodes the latents to images and postprocesses the generated image, optionally applying the mask overlay to the original image."


# Auto decode step
class QwenImageEditAutoDecodeStep(AutoPipelineBlocks):
    block_classes = [QwenImageEditInpaintDecodeStep, QwenImageEditDecodeStep]
    block_names = ["inpaint_decode", "decode"]
    block_trigger_inputs = ["mask", None]

    @property
    def description(self):
        return (
            "Decode step that decodes the latents into images.\n"
            "This is an auto pipeline block.\n"
            " - `QwenImageEditInpaintDecodeStep` (inpaint) is used when `mask` is provided.\n"
            " - `QwenImageEditDecodeStep` (edit) is used when `mask` is not provided.\n"
        )


# ====================
# 5. AUTO BLOCKS & PRESETS
# ====================

EDIT_AUTO_BLOCKS = InsertableDict(
    [
        ("text_encoder", QwenImageEditVLEncoderStep()),
        ("vae_encoder", QwenImageEditAutoVaeEncoderStep()),
        ("denoise", QwenImageEditAutoCoreDenoiseStep()),
        ("decode", QwenImageEditAutoDecodeStep()),
    ]
)


class QwenImageEditAutoBlocks(SequentialPipelineBlocks):
    model_name = "qwenimage-edit"
    block_classes = EDIT_AUTO_BLOCKS.values()
    block_names = EDIT_AUTO_BLOCKS.keys()

    @property
    def description(self):
        return (
            "Auto Modular pipeline for edit (img2img) and edit inpaint tasks using QwenImage-Edit.\n"
            "- for edit (img2img) generation, you need to provide `image`\n"
            "- for edit inpainting, you need to provide `mask_image` and `image`, optionally you can provide `padding_mask_crop`\n"
        )
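
Usage mirrors `QwenImageAutoBlocks`; a hedged sketch with the same assumed loading API (the repo id and input URL are placeholders):

import torch
from diffusers.utils import load_image

blocks = QwenImageEditAutoBlocks()
pipe = blocks.init_pipeline("Qwen/Qwen-Image-Edit")  # assumed modular repo id
pipe.load_components(torch_dtype=torch.bfloat16)
pipe.to("cuda")

image = load_image("https://example.com/input.png")  # placeholder input
out = pipe(image=image, prompt="turn the sky into a sunset", output="images")
out[0].save("edited.png")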

@@ -0,0 +1,175 @@

# Copyright 2025 Qwen-Image Team and The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from ...utils import logging
from ..modular_pipeline import AutoPipelineBlocks, SequentialPipelineBlocks
from ..modular_pipeline_utils import InsertableDict
from .before_denoise import (
    QwenImageEditPlusRoPEInputsStep,
    QwenImagePrepareLatentsStep,
    QwenImageSetTimestepsStep,
)
from .decoders import (
    QwenImageAfterDenoiseStep,
    QwenImageDecoderStep,
    QwenImageProcessImagesOutputStep,
)
from .denoise import (
    QwenImageEditDenoiseStep,
)
from .encoders import (
    QwenImageEditPlusProcessImagesInputStep,
    QwenImageEditPlusResizeDynamicStep,
    QwenImageEditPlusTextEncoderStep,
    QwenImageVaeEncoderDynamicStep,
)
from .inputs import (
    QwenImageEditPlusInputsDynamicStep,
    QwenImageTextInputsStep,
)


logger = logging.get_logger(__name__)


# ====================
# 1. TEXT ENCODER
# ====================

class QwenImageEditPlusVLEncoderStep(SequentialPipelineBlocks):
    """VL encoder that takes both image and text prompts. Uses a 384x384 target area."""
    model_name = "qwenimage-edit-plus"
    block_classes = [
        QwenImageEditPlusResizeDynamicStep(target_area=384 * 384, output_name="resized_cond_image"),
        QwenImageEditPlusTextEncoderStep(),
    ]
    block_names = ["resize", "encode"]

    @property
    def description(self) -> str:
        return "QwenImage-Edit Plus VL encoder step that encodes the image and text prompts together."


# ====================
# 2. VAE ENCODER
# ====================

class QwenImageEditPlusVaeEncoderStep(SequentialPipelineBlocks):
    """VAE encoder that handles multiple images with different sizes. Uses a 1024x1024 target area."""
    model_name = "qwenimage-edit-plus"
    block_classes = [
        QwenImageEditPlusResizeDynamicStep(target_area=1024 * 1024, output_name="resized_image"),
        QwenImageEditPlusProcessImagesInputStep(),
        QwenImageVaeEncoderDynamicStep(),
    ]
    block_names = ["resize", "preprocess", "encode"]

    @property
    def description(self) -> str:
        return (
            "VAE encoder step that encodes image inputs into latent representations.\n"
            "Each image is resized independently based on its own aspect ratio to a 1024x1024 target area."
        )


# ====================
# 3. DENOISE - input -> prepare_latents -> set_timesteps -> prepare_rope_inputs -> denoise -> after_denoise
# ====================

# Edit Plus input step
class QwenImageEditPlusInputStep(SequentialPipelineBlocks):
    model_name = "qwenimage-edit-plus"
    block_classes = [
        QwenImageTextInputsStep(),
        QwenImageEditPlusInputsDynamicStep(image_latent_inputs=["image_latents"]),
    ]
    block_names = ["text_inputs", "additional_inputs"]

    @property
    def description(self):
        return (
            "Input step that prepares the inputs for the Edit Plus denoising step. It:\n"
            " - Standardizes text embeddings batch size.\n"
            " - Processes the list of image latents: patchifies, concatenates along dim=1, expands batch.\n"
            " - Outputs lists of image_height/image_width for RoPE calculation.\n"
            " - Defaults height/width from the last image in the list."
        )


# Edit Plus core denoise
class QwenImageEditPlusCoreDenoiseStep(SequentialPipelineBlocks):
    model_name = "qwenimage-edit-plus"
    block_classes = [
        QwenImageEditPlusInputStep(),
        QwenImagePrepareLatentsStep(),
        QwenImageSetTimestepsStep(),
        QwenImageEditPlusRoPEInputsStep(),
        QwenImageEditDenoiseStep(),
        QwenImageAfterDenoiseStep(),
    ]
    block_names = [
        "input",
        "prepare_latents",
        "set_timesteps",
        "prepare_rope_inputs",
        "denoise",
        "after_denoise",
    ]

    @property
    def description(self):
        return "Core denoising workflow for the QwenImage-Edit Plus edit (img2img) task."


# ====================
# 4. DECODE
# ====================

class QwenImageEditPlusDecodeStep(SequentialPipelineBlocks):
    model_name = "qwenimage-edit-plus"
    block_classes = [QwenImageDecoderStep(), QwenImageProcessImagesOutputStep()]
    block_names = ["decode", "postprocess"]

    @property
    def description(self):
        return "Decode step that decodes the latents to images and postprocesses the generated image."


# ====================
# 5. AUTO BLOCKS & PRESETS
# ====================

EDIT_PLUS_AUTO_BLOCKS = InsertableDict(
    [
        ("text_encoder", QwenImageEditPlusVLEncoderStep()),
        ("vae_encoder", QwenImageEditPlusVaeEncoderStep()),
        ("denoise", QwenImageEditPlusCoreDenoiseStep()),
        ("decode", QwenImageEditPlusDecodeStep()),
    ]
)


class QwenImageEditPlusAutoBlocks(SequentialPipelineBlocks):
    model_name = "qwenimage-edit-plus"
    block_classes = EDIT_PLUS_AUTO_BLOCKS.values()
    block_names = EDIT_PLUS_AUTO_BLOCKS.keys()

    @property
    def description(self):
        return (
            "Auto Modular pipeline for edit (img2img) tasks using QwenImage-Edit Plus.\n"
            "- `image` is a required input (can be a single image or a list of images).\n"
            "- Each image is resized independently based on its own aspect ratio.\n"
            "- VL encoder uses a 384x384 target area, VAE encoder uses a 1024x1024 target area."
        )
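
A hedged multi-image sketch (same assumed loading API as above; the repo id and file names are placeholders):

import torch
from diffusers.utils import load_image

blocks = QwenImageEditPlusAutoBlocks()
pipe = blocks.init_pipeline("Qwen/Qwen-Image-Edit-2509")  # assumed repo id
pipe.load_components(torch_dtype=torch.bfloat16)
pipe.to("cuda")

imgs = [load_image("person.png"), load_image("product.png")]  # each resized independently
out = pipe(image=imgs, prompt="the person holds the product", output="images")
out[0].save("composed.png")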