Compare commits

..

2 Commits

Author     SHA1        Message                          Date
sayakpaul  f70010ca5d  up                               2025-12-18 11:37:01 +05:30
sayakpaul  2f0b35fd84  start qwenimage layer testsing.  2025-12-18 09:58:17 +05:30
30 changed files with 2598 additions and 3470 deletions

View File

@@ -70,12 +70,6 @@ output.save("output.png")
- all
- __call__
## Cosmos2_5_PredictBasePipeline
[[autodoc]] Cosmos2_5_PredictBasePipeline
- all
- __call__
## CosmosPipelineOutput
[[autodoc]] pipelines.cosmos.pipeline_output.CosmosPipelineOutput

View File

@@ -1,55 +1,11 @@
"""
# Cosmos 2 Predict
Download checkpoint
```bash
hf download nvidia/Cosmos-Predict2-2B-Text2Image
```
convert checkpoint
```bash
transformer_ckpt_path=~/.cache/huggingface/hub/models--nvidia--Cosmos-Predict2-2B-Text2Image/snapshots/acdb5fde992a73ef0355f287977d002cbfd127e0/model.pt
python scripts/convert_cosmos_to_diffusers.py \
--transformer_ckpt_path $transformer_ckpt_path \
--transformer_type Cosmos-2.0-Diffusion-2B-Text2Image \
--text_encoder_path google-t5/t5-11b \
--tokenizer_path google-t5/t5-11b \
--vae_type wan2.1 \
--output_path converted/cosmos-p2-t2i-2b \
--save_pipeline
```
# Cosmos 2.5 Predict
Download checkpoint
```bash
hf download nvidia/Cosmos-Predict2.5-2B
```
Convert checkpoint
```bash
transformer_ckpt_path=~/.cache/huggingface/hub/models--nvidia--Cosmos-Predict2.5-2B/snapshots/865baf084d4c9e850eac59a021277d5a9b9e8b63/base/pre-trained/d20b7120-df3e-4911-919d-db6e08bad31c_ema_bf16.pt
python scripts/convert_cosmos_to_diffusers.py \
--transformer_type Cosmos-2.5-Predict-Base-2B \
--transformer_ckpt_path $transformer_ckpt_path \
--vae_type wan2.1 \
--output_path converted/cosmos-p2.5-base-2b \
--save_pipeline
```
"""
import argparse
import pathlib
import sys
from typing import Any, Dict
import torch
from accelerate import init_empty_weights
from huggingface_hub import snapshot_download
from transformers import AutoTokenizer, Qwen2_5_VLForConditionalGeneration, T5EncoderModel, T5TokenizerFast
from transformers import T5EncoderModel, T5TokenizerFast
from diffusers import (
AutoencoderKLCosmos,
@@ -61,9 +17,7 @@ from diffusers import (
CosmosVideoToWorldPipeline,
EDMEulerScheduler,
FlowMatchEulerDiscreteScheduler,
UniPCMultistepScheduler,
)
from diffusers.pipelines.cosmos.pipeline_cosmos2_5_predict import Cosmos2_5_PredictBasePipeline
def remove_keys_(key: str, state_dict: Dict[str, Any]):
@@ -279,25 +233,6 @@ TRANSFORMER_CONFIGS = {
"concat_padding_mask": True,
"extra_pos_embed_type": None,
},
"Cosmos-2.5-Predict-Base-2B": {
"in_channels": 16 + 1,
"out_channels": 16,
"num_attention_heads": 16,
"attention_head_dim": 128,
"num_layers": 28,
"mlp_ratio": 4.0,
"text_embed_dim": 1024,
"adaln_lora_dim": 256,
"max_size": (128, 240, 240),
"patch_size": (1, 2, 2),
"rope_scale": (1.0, 3.0, 3.0),
"concat_padding_mask": True,
# NOTE: source config has pos_emb_learnable: 'True' - but params are missing
"extra_pos_embed_type": None,
"use_crossattn_projection": True,
"crossattn_proj_in_channels": 100352,
"encoder_hidden_states_channels": 1024,
},
}
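Each `TRANSFORMER_CONFIGS` entry is a set of `CosmosTransformer3DModel` constructor kwargs. A sketch of how such an entry can be materialized as an empty model before remapped weights are loaded; the meta-device context comes from `accelerate`, which this script already imports:
```python
# Sketch (assumption: this mirrors how the converter builds the empty model):
# instantiate the transformer from a config entry without allocating weights.
from accelerate import init_empty_weights
from diffusers import CosmosTransformer3DModel

config = TRANSFORMER_CONFIGS["Cosmos-2.0-Diffusion-2B-Text2Image"]
with init_empty_weights():
    transformer = CosmosTransformer3DModel(**config)
```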
VAE_KEYS_RENAME_DICT = {
@@ -399,9 +334,6 @@ def convert_transformer(transformer_type: str, ckpt_path: str, weights_only: boo
elif "Cosmos-2.0" in transformer_type:
TRANSFORMER_KEYS_RENAME_DICT = TRANSFORMER_KEYS_RENAME_DICT_COSMOS_2_0
TRANSFORMER_SPECIAL_KEYS_REMAP = TRANSFORMER_SPECIAL_KEYS_REMAP_COSMOS_2_0
elif "Cosmos-2.5" in transformer_type:
TRANSFORMER_KEYS_RENAME_DICT = TRANSFORMER_KEYS_RENAME_DICT_COSMOS_2_0
TRANSFORMER_SPECIAL_KEYS_REMAP = TRANSFORMER_SPECIAL_KEYS_REMAP_COSMOS_2_0
else:
assert False
@@ -415,7 +347,6 @@ def convert_transformer(transformer_type: str, ckpt_path: str, weights_only: boo
new_key = new_key.removeprefix(PREFIX_KEY)
for replace_key, rename_key in TRANSFORMER_KEYS_RENAME_DICT.items():
new_key = new_key.replace(replace_key, rename_key)
print(key, "->", new_key, flush=True)
update_state_dict_(original_state_dict, key, new_key)
for key in list(original_state_dict.keys()):
@@ -424,21 +355,6 @@ def convert_transformer(transformer_type: str, ckpt_path: str, weights_only: boo
continue
handler_fn_inplace(key, original_state_dict)
expected_keys = set(transformer.state_dict().keys())
mapped_keys = set(original_state_dict.keys())
missing_keys = expected_keys - mapped_keys
unexpected_keys = mapped_keys - expected_keys
if missing_keys:
print(f"ERROR: missing keys ({len(missing_keys)} from state_dict:", flush=True, file=sys.stderr)
for k in missing_keys:
print(k)
sys.exit(1)
if unexpected_keys:
print(f"ERROR: unexpected keys ({len(unexpected_keys)}) from state_dict:", flush=True, file=sys.stderr)
for k in unexpected_keys:
print(k)
sys.exit(2)
transformer.load_state_dict(original_state_dict, strict=True, assign=True)
return transformer
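The rename pass in `convert_transformer` is a plain substring replacement applied key by key. A toy illustration; the prefix and dictionary entries here are hypothetical, not the real `TRANSFORMER_KEYS_RENAME_DICT` contents:
```python
# Toy rename pass; "net." as the prefix and the dict entries are hypothetical.
PREFIX_KEY = "net."
rename_dict = {"blocks.": "transformer_blocks.", "t_embedder.": "time_embed."}

key = "net.blocks.0.attn.to_q.weight"
new_key = key.removeprefix(PREFIX_KEY)
for replace_key, rename_key in rename_dict.items():
    new_key = new_key.replace(replace_key, rename_key)
print(new_key)  # transformer_blocks.0.attn.to_q.weight
```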
@@ -528,34 +444,6 @@ def save_pipeline_cosmos_2_0(args, transformer, vae):
pipe.save_pretrained(args.output_path, safe_serialization=True, max_shard_size="5GB")
def save_pipeline_cosmos2_5(args, transformer, vae):
text_encoder_path = args.text_encoder_path or "nvidia/Cosmos-Reason1-7B"
tokenizer_path = args.tokenizer_path or "Qwen/Qwen2.5-VL-7B-Instruct"
text_encoder = Qwen2_5_VLForConditionalGeneration.from_pretrained(
text_encoder_path, torch_dtype="auto", device_map="cpu"
)
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
scheduler = UniPCMultistepScheduler(
use_karras_sigmas=True,
use_flow_sigmas=True,
prediction_type="flow_prediction",
sigma_max=200.0,
sigma_min=0.01,
)
pipe = Cosmos2_5_PredictBasePipeline(
text_encoder=text_encoder,
tokenizer=tokenizer,
transformer=transformer,
vae=vae,
scheduler=scheduler,
safety_checker=lambda *args, **kwargs: None,
)
pipe.save_pretrained(args.output_path, safe_serialization=True, max_shard_size="5GB")
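A hedged sketch of reloading what this function writes; the class import path appears at the top of this script, and the local path matches the `--output_path` in the module docstring:
```python
# Hedged reload sketch for the pipeline saved above.
import torch
from diffusers.pipelines.cosmos.pipeline_cosmos2_5_predict import Cosmos2_5_PredictBasePipeline

pipe = Cosmos2_5_PredictBasePipeline.from_pretrained(
    "converted/cosmos-p2.5-base-2b", torch_dtype=torch.bfloat16
)
```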
def get_args():
parser = argparse.ArgumentParser()
parser.add_argument("--transformer_type", type=str, default=None, choices=list(TRANSFORMER_CONFIGS.keys()))
@@ -563,10 +451,10 @@ def get_args():
"--transformer_ckpt_path", type=str, default=None, help="Path to original transformer checkpoint"
)
parser.add_argument(
"--vae_type", type=str, default="wan2.1", choices=["wan2.1", *list(VAE_CONFIGS.keys())], help="Type of VAE"
"--vae_type", type=str, default=None, choices=["none", *list(VAE_CONFIGS.keys())], help="Type of VAE"
)
parser.add_argument("--text_encoder_path", type=str, default=None)
parser.add_argument("--tokenizer_path", type=str, default=None)
parser.add_argument("--text_encoder_path", type=str, default="google-t5/t5-11b")
parser.add_argument("--tokenizer_path", type=str, default="google-t5/t5-11b")
parser.add_argument("--save_pipeline", action="store_true")
parser.add_argument("--output_path", type=str, required=True, help="Path where converted model should be saved")
parser.add_argument("--dtype", default="bf16", help="Torch dtype to save the transformer in.")
@@ -589,6 +477,8 @@ if __name__ == "__main__":
if args.save_pipeline:
assert args.transformer_ckpt_path is not None
assert args.vae_type is not None
assert args.text_encoder_path is not None
assert args.tokenizer_path is not None
if args.transformer_ckpt_path is not None:
weights_only = "Cosmos-1.0" in args.transformer_type
@@ -600,26 +490,17 @@ if __name__ == "__main__":
if args.vae_type is not None:
if "Cosmos-1.0" in args.transformer_type:
vae = convert_vae(args.vae_type)
elif "Cosmos-2.0" in args.transformer_type or "Cosmos-2.5" in args.transformer_type:
else:
vae = AutoencoderKLWan.from_pretrained(
"Wan-AI/Wan2.1-T2V-1.3B-Diffusers", subfolder="vae", torch_dtype=torch.float32
)
else:
raise AssertionError(f"{args.transformer_type} not supported")
if not args.save_pipeline:
vae.save_pretrained(args.output_path, safe_serialization=True, max_shard_size="5GB")
if args.save_pipeline:
if "Cosmos-1.0" in args.transformer_type:
assert args.text_encoder_path is not None
assert args.tokenizer_path is not None
save_pipeline_cosmos_1_0(args, transformer, vae)
elif "Cosmos-2.0" in args.transformer_type:
assert args.text_encoder_path is not None
assert args.tokenizer_path is not None
save_pipeline_cosmos_2_0(args, transformer, vae)
elif "Cosmos-2.5" in args.transformer_type:
save_pipeline_cosmos2_5(args, transformer, vae)
else:
raise AssertionError(f"{args.transformer_type} not supported")
assert False

View File

@@ -463,7 +463,6 @@ else:
"CogView4ControlPipeline",
"CogView4Pipeline",
"ConsisIDPipeline",
"Cosmos2_5_PredictBasePipeline",
"Cosmos2TextToImagePipeline",
"Cosmos2VideoToWorldPipeline",
"CosmosTextToWorldPipeline",
@@ -1176,7 +1175,6 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
CogView4ControlPipeline,
CogView4Pipeline,
ConsisIDPipeline,
Cosmos2_5_PredictBasePipeline,
Cosmos2TextToImagePipeline,
Cosmos2VideoToWorldPipeline,
CosmosTextToWorldPipeline,

View File

@@ -439,9 +439,6 @@ class CosmosTransformer3DModel(ModelMixin, ConfigMixin, FromOriginalModelMixin):
rope_scale: Tuple[float, float, float] = (2.0, 1.0, 1.0),
concat_padding_mask: bool = True,
extra_pos_embed_type: Optional[str] = "learnable",
use_crossattn_projection: bool = False,
crossattn_proj_in_channels: int = 1024,
encoder_hidden_states_channels: int = 1024,
) -> None:
super().__init__()
hidden_size = num_attention_heads * attention_head_dim
@@ -488,12 +485,6 @@ class CosmosTransformer3DModel(ModelMixin, ConfigMixin, FromOriginalModelMixin):
hidden_size, patch_size[0] * patch_size[1] * patch_size[2] * out_channels, bias=False
)
if self.config.use_crossattn_projection:
self.crossattn_proj = nn.Sequential(
nn.Linear(crossattn_proj_in_channels, encoder_hidden_states_channels, bias=True),
nn.GELU(),
)
self.gradient_checkpointing = False
def forward(
@@ -533,7 +524,6 @@ class CosmosTransformer3DModel(ModelMixin, ConfigMixin, FromOriginalModelMixin):
post_patch_num_frames = num_frames // p_t
post_patch_height = height // p_h
post_patch_width = width // p_w
hidden_states = self.patch_embed(hidden_states)
hidden_states = hidden_states.flatten(1, 3) # [B, T, H, W, C] -> [B, THW, C]
@@ -556,9 +546,6 @@ class CosmosTransformer3DModel(ModelMixin, ConfigMixin, FromOriginalModelMixin):
else:
assert False
if self.config.use_crossattn_projection:
encoder_hidden_states = self.crossattn_proj(encoder_hidden_states)
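The projection used above is a single Linear followed by GELU that maps the stacked VLM hidden states down to the transformer's text width. A standalone shape check, with 100352 and 1024 taken from the Cosmos-2.5 config earlier in this diff:
```python
# Standalone shape check of the crossattn projection; channel sizes come from
# crossattn_proj_in_channels / encoder_hidden_states_channels in the 2.5 config.
import torch
import torch.nn as nn

crossattn_proj = nn.Sequential(nn.Linear(100352, 1024, bias=True), nn.GELU())
encoder_hidden_states = torch.randn(2, 512, 100352)
print(crossattn_proj(encoder_hidden_states).shape)  # torch.Size([2, 512, 1024])
```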
# 5. Transformer blocks
for block in self.transformer_blocks:
if torch.is_grad_enabled() and self.gradient_checkpointing:

View File

@@ -360,7 +360,7 @@ class FluxKontextCoreDenoiseStep(SequentialPipelineBlocks):
AUTO_BLOCKS = InsertableDict(
[
("text_encoder", FluxTextEncoderStep()),
("vae_encoder", FluxAutoVaeEncoderStep()),
("image_encoder", FluxAutoVaeEncoderStep()),
("denoise", FluxCoreDenoiseStep()),
("decode", FluxDecodeStep()),
]
@@ -369,7 +369,7 @@ AUTO_BLOCKS = InsertableDict(
AUTO_BLOCKS_KONTEXT = InsertableDict(
[
("text_encoder", FluxTextEncoderStep()),
("vae_encoder", FluxKontextAutoVaeEncoderStep()),
("image_encoder", FluxKontextAutoVaeEncoderStep()),
("denoise", FluxKontextCoreDenoiseStep()),
("decode", FluxDecodeStep()),
]
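A hedged sketch of assembling such a preset into sequential blocks; `from_blocks_dict` appears elsewhere in this diff, and the import path is an assumption:
```python
# Hedged sketch: turn the AUTO_BLOCKS preset above into sequential blocks.
from diffusers.modular_pipelines import SequentialPipelineBlocks

flux_blocks = SequentialPipelineBlocks.from_blocks_dict(AUTO_BLOCKS)
print(list(flux_blocks.sub_blocks.keys()))
# ['text_encoder', 'image_encoder', 'denoise', 'decode']
```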

File diff suppressed because it is too large

View File

@@ -231,7 +231,7 @@ class BlockState:
class ModularPipelineBlocks(ConfigMixin, PushToHubMixin):
"""
Base class for all Pipeline Blocks: ConditionalPipelineBlocks, AutoPipelineBlocks, SequentialPipelineBlocks,
Base class for all Pipeline Blocks: PipelineBlock, AutoPipelineBlocks, SequentialPipelineBlocks,
LoopSequentialPipelineBlocks
[`ModularPipelineBlocks`] provides method to load and save the definition of pipeline blocks.
@@ -501,19 +501,15 @@ class ModularPipelineBlocks(ConfigMixin, PushToHubMixin):
@property
def input_names(self) -> List[str]:
return [input_param.name for input_param in self.inputs if input_param.name is not None]
return [input_param.name for input_param in self.inputs]
@property
def intermediate_output_names(self) -> List[str]:
return [output_param.name for output_param in self.intermediate_outputs if output_param.name is not None]
return [output_param.name for output_param in self.intermediate_outputs]
@property
def output_names(self) -> List[str]:
return [output_param.name for output_param in self.outputs if output_param.name is not None]
@property
def component_names(self) -> List[str]:
return [component.name for component in self.expected_components]
return [output_param.name for output_param in self.outputs]
@property
def doc(self):
@@ -527,10 +523,9 @@ class ModularPipelineBlocks(ConfigMixin, PushToHubMixin):
)
class ConditionalPipelineBlocks(ModularPipelineBlocks):
class AutoPipelineBlocks(ModularPipelineBlocks):
"""
A Pipeline Blocks that conditionally selects a block to run based on the inputs.
Subclasses must implement the `select_block` method to define the logic for selecting the block.
A Pipeline Blocks that automatically selects a block to run based on the inputs.
This class inherits from [`ModularPipelineBlocks`]. Check the superclass documentation for the generic methods the
library implements for all the pipeline blocks (such as loading or saving etc.)
@@ -540,13 +535,12 @@ class ConditionalPipelineBlocks(ModularPipelineBlocks):
Attributes:
block_classes: List of block classes to be used
block_names: List of prefixes for each block
block_trigger_inputs: List of input names that select_block() uses to determine which block to run
block_trigger_inputs: List of input names that trigger specific blocks, with None for default
"""
block_classes = []
block_names = []
block_trigger_inputs = []
default_block_name = None # name of the default block if no trigger inputs are provided, if None, this block can be skipped if no trigger inputs are provided
def __init__(self):
sub_blocks = InsertableDict()
@@ -556,15 +550,26 @@ class ConditionalPipelineBlocks(ModularPipelineBlocks):
else:
sub_blocks[block_name] = block
self.sub_blocks = sub_blocks
if not (len(self.block_classes) == len(self.block_names)):
if not (len(self.block_classes) == len(self.block_names) == len(self.block_trigger_inputs)):
raise ValueError(
f"In {self.__class__.__name__}, the number of block_classes and block_names must be the same."
f"In {self.__class__.__name__}, the number of block_classes, block_names, and block_trigger_inputs must be the same."
)
if self.default_block_name is not None and self.default_block_name not in self.block_names:
default_blocks = [t for t in self.block_trigger_inputs if t is None]
# can only have 1 or 0 default blocks, and the default has to be placed last
# the order of blocks matters here because the first block with matching trigger will be dispatched
# e.g. blocks = [inpaint, img2img] and block_trigger_inputs = ["mask", "image"]
# as long as mask is provided, it is inpaint; if only image is provided, it is img2img
if len(default_blocks) > 1 or (len(default_blocks) == 1 and self.block_trigger_inputs[-1] is not None):
raise ValueError(
f"In {self.__class__.__name__}, default_block_name '{self.default_block_name}' must be one of block_names: {self.block_names}"
f"In {self.__class__.__name__}, exactly one None must be specified as the last element "
"in block_trigger_inputs."
)
# Map trigger inputs to block objects
self.trigger_to_block_map = dict(zip(self.block_trigger_inputs, self.sub_blocks.values()))
self.trigger_to_block_name_map = dict(zip(self.block_trigger_inputs, self.sub_blocks.keys()))
self.block_to_trigger_map = dict(zip(self.sub_blocks.keys(), self.block_trigger_inputs))
@property
def model_name(self):
return next(iter(self.sub_blocks.values())).model_name
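The ordering rule from the comments above can be sketched independently of the library: the first trigger input present wins, and `None` marks the default. A minimal standalone model of that dispatch:
```python
# Standalone sketch of the trigger dispatch (not the library class); names
# echo the inpaint/img2img example in the comments above.
block_trigger_inputs = ["mask", "image", None]
trigger_to_block_map = {"mask": "inpaint", "image": "img2img", None: "text2img"}

def select_block(state: dict) -> str:
    for trigger in block_trigger_inputs:
        if trigger is not None and state.get(trigger) is not None:
            return trigger_to_block_map[trigger]
    return trigger_to_block_map[None]

print(select_block({"mask": 1, "image": 1}))  # inpaint: mask is matched first
print(select_block({"image": 1}))             # img2img
print(select_block({}))                       # text2img (the None default)
```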
@@ -593,11 +598,8 @@ class ConditionalPipelineBlocks(ModularPipelineBlocks):
@property
def required_inputs(self) -> List[str]:
# no default block means this conditional block can be skipped entirely
if self.default_block_name is None:
if None not in self.block_trigger_inputs:
return []
first_block = next(iter(self.sub_blocks.values()))
required_by_all = set(getattr(first_block, "required_inputs", set()))
@@ -608,7 +610,7 @@ class ConditionalPipelineBlocks(ModularPipelineBlocks):
return list(required_by_all)
# YiYi TODO: add test for this
@property
def inputs(self) -> List[Tuple[str, Any]]:
named_inputs = [(name, block.inputs) for name, block in self.sub_blocks.items()]
@@ -633,69 +635,22 @@ class ConditionalPipelineBlocks(ModularPipelineBlocks):
combined_outputs = self.combine_outputs(*named_outputs)
return combined_outputs
def _get_trigger_inputs(self) -> set:
"""
Returns a set of all unique trigger input values found in this block and nested blocks.
"""
def fn_recursive_get_trigger(blocks):
trigger_values = set()
if blocks is not None:
for name, block in blocks.items():
# Check if current block has block_trigger_inputs
if hasattr(block, "block_trigger_inputs") and block.block_trigger_inputs is not None:
trigger_values.update(t for t in block.block_trigger_inputs if t is not None)
# If block has sub_blocks, recursively check them
if block.sub_blocks:
nested_triggers = fn_recursive_get_trigger(block.sub_blocks)
trigger_values.update(nested_triggers)
return trigger_values
# Start with this block's block_trigger_inputs
all_triggers = set(t for t in self.block_trigger_inputs if t is not None)
# Add nested triggers
all_triggers.update(fn_recursive_get_trigger(self.sub_blocks))
return all_triggers
@property
def trigger_inputs(self):
"""All trigger inputs including from nested blocks."""
return self._get_trigger_inputs()
def select_block(self, **kwargs) -> Optional[str]:
"""
Select the block to run based on the trigger inputs.
Subclasses must implement this method to define the logic for selecting the block.
Args:
**kwargs: Trigger input names and their values from the state.
Returns:
Optional[str]: The name of the block to run, or None to use default/skip.
"""
raise NotImplementedError(f"Subclass {self.__class__.__name__} must implement the `select_block` method.")
@torch.no_grad()
def __call__(self, pipeline, state: PipelineState) -> PipelineState:
trigger_kwargs = {name: state.get(name) for name in self.block_trigger_inputs if name is not None}
block_name = self.select_block(**trigger_kwargs)
# Find default block first (if any)
if block_name is None:
block_name = self.default_block_name
block = self.trigger_to_block_map.get(None)
for input_name in self.block_trigger_inputs:
if input_name is not None and state.get(input_name) is not None:
block = self.trigger_to_block_map[input_name]
break
if block_name is None:
logger.info(f"skipping conditional block: {self.__class__.__name__}")
if block is None:
logger.info(f"skipping auto block: {self.__class__.__name__}")
return pipeline, state
block = self.sub_blocks[block_name]
try:
logger.info(f"Running block: {block.__class__.__name__}")
logger.info(f"Running block: {block.__class__.__name__}, trigger: {input_name}")
return block(pipeline, state)
except Exception as e:
error_msg = (
@@ -706,6 +661,38 @@ class ConditionalPipelineBlocks(ModularPipelineBlocks):
logger.error(error_msg)
raise
def _get_trigger_inputs(self):
"""
Returns a set of all unique trigger input values found in the blocks. Returns: Set[str] containing all unique
block_trigger_inputs values
"""
def fn_recursive_get_trigger(blocks):
trigger_values = set()
if blocks is not None:
for name, block in blocks.items():
# Check if current block has trigger inputs (i.e. auto block)
if hasattr(block, "block_trigger_inputs") and block.block_trigger_inputs is not None:
# Add all non-None values from the trigger inputs list
trigger_values.update(t for t in block.block_trigger_inputs if t is not None)
# If block has sub_blocks, recursively check them
if block.sub_blocks:
nested_triggers = fn_recursive_get_trigger(block.sub_blocks)
trigger_values.update(nested_triggers)
return trigger_values
trigger_inputs = set(self.block_trigger_inputs)
trigger_inputs.update(fn_recursive_get_trigger(self.sub_blocks))
return trigger_inputs
@property
def trigger_inputs(self):
return self._get_trigger_inputs()
def __repr__(self):
class_name = self.__class__.__name__
base_class = self.__class__.__bases__[0].__name__
@@ -717,7 +704,7 @@ class ConditionalPipelineBlocks(ModularPipelineBlocks):
header += "\n"
header += " " + "=" * 100 + "\n"
header += " This pipeline contains blocks that are selected at runtime based on inputs.\n"
header += f" Trigger Inputs: {sorted(self.trigger_inputs)}\n"
header += f" Trigger Inputs: {[inp for inp in self.trigger_inputs if inp is not None]}\n"
header += " " + "=" * 100 + "\n\n"
# Format description with proper indentation
@@ -738,20 +725,31 @@ class ConditionalPipelineBlocks(ModularPipelineBlocks):
expected_configs = getattr(self, "expected_configs", [])
configs_str = format_configs(expected_configs, indent_level=2, add_empty_lines=False)
# Blocks section
# Blocks section - moved to the end with simplified format
blocks_str = " Sub-Blocks:\n"
for i, (name, block) in enumerate(self.sub_blocks.items()):
if name == self.default_block_name:
addtional_str = " [default]"
# Get trigger input for this block
trigger = None
if hasattr(self, "block_to_trigger_map"):
trigger = self.block_to_trigger_map.get(name)
# Format the trigger info
if trigger is None:
trigger_str = "[default]"
elif isinstance(trigger, (list, tuple)):
trigger_str = f"[trigger: {', '.join(str(t) for t in trigger)}]"
else:
trigger_str = f"[trigger: {trigger}]"
# For AutoPipelineBlocks, add bullet points
blocks_str += f"{name} {trigger_str} ({block.__class__.__name__})\n"
else:
addtional_str = ""
blocks_str += f" {name}{addtional_str} ({block.__class__.__name__})\n"
# For SequentialPipelineBlocks, show execution order
blocks_str += f" [{i}] {name} ({block.__class__.__name__})\n"
# Add block description
block_desc_lines = block.description.split("\n")
indented_desc = block_desc_lines[0]
if len(block_desc_lines) > 1:
indented_desc += "\n" + "\n".join(" " + line for line in block_desc_lines[1:])
desc_lines = block.description.split("\n")
indented_desc = desc_lines[0]
if len(desc_lines) > 1:
indented_desc += "\n" + "\n".join(" " + line for line in desc_lines[1:])
blocks_str += f" Description: {indented_desc}\n\n"
# Build the representation with conditional sections
@@ -782,35 +780,6 @@ class ConditionalPipelineBlocks(ModularPipelineBlocks):
)
class AutoPipelineBlocks(ConditionalPipelineBlocks):
"""
A Pipeline Blocks that automatically selects a block to run based on the presence of trigger inputs.
"""
def __init__(self):
super().__init__()
if not (len(self.block_classes) == len(self.block_names) == len(self.block_trigger_inputs)):
raise ValueError(
f"In {self.__class__.__name__}, the number of block_classes, block_names, and block_trigger_inputs must be the same."
)
@property
def default_block_name(self) -> Optional[str]:
"""Derive default_block_name from block_trigger_inputs (None entry)."""
if None in self.block_trigger_inputs:
idx = self.block_trigger_inputs.index(None)
return self.block_names[idx]
return None
def select_block(self, **kwargs) -> Optional[str]:
"""Select block based on which trigger input is present (not None)."""
for trigger_input, block_name in zip(self.block_trigger_inputs, self.block_names):
if trigger_input is not None and kwargs.get(trigger_input) is not None:
return block_name
return None
class SequentialPipelineBlocks(ModularPipelineBlocks):
"""
A Pipeline Blocks that combines multiple pipeline block classes into one. When called, it will call each block in
@@ -912,8 +881,7 @@ class SequentialPipelineBlocks(ModularPipelineBlocks):
# Only add outputs if the block cannot be skipped
should_add_outputs = True
if isinstance(block, ConditionalPipelineBlocks) and block.default_block_name is None:
# ConditionalPipelineBlocks without default can be skipped
if hasattr(block, "block_trigger_inputs") and None not in block.block_trigger_inputs:
should_add_outputs = False
if should_add_outputs:
@@ -976,7 +944,8 @@ class SequentialPipelineBlocks(ModularPipelineBlocks):
def _get_trigger_inputs(self):
"""
Returns a set of all unique trigger input values found in the blocks.
Returns a set of all unique trigger input values found in the blocks. Returns: Set[str] containing all unique
block_trigger_inputs values
"""
def fn_recursive_get_trigger(blocks):
@@ -984,8 +953,9 @@ class SequentialPipelineBlocks(ModularPipelineBlocks):
if blocks is not None:
for name, block in blocks.items():
# Check if current block has block_trigger_inputs (ConditionalPipelineBlocks)
# Check if current block has trigger inputs (i.e. auto block)
if hasattr(block, "block_trigger_inputs") and block.block_trigger_inputs is not None:
# Add all non-None values from the trigger inputs list
trigger_values.update(t for t in block.block_trigger_inputs if t is not None)
# If block has sub_blocks, recursively check them
@@ -1001,85 +971,82 @@ class SequentialPipelineBlocks(ModularPipelineBlocks):
def trigger_inputs(self):
return self._get_trigger_inputs()
def _traverse_trigger_blocks(self, active_inputs):
"""
Traverse blocks and select which ones would run given the active inputs.
def _traverse_trigger_blocks(self, trigger_inputs):
# Convert trigger_inputs to a set for easier manipulation
active_triggers = set(trigger_inputs)
Args:
active_inputs: Dict of input names to values that are "present"
Returns:
OrderedDict of block_name -> block that would execute
"""
def fn_recursive_traverse(block, block_name, active_inputs):
def fn_recursive_traverse(block, block_name, active_triggers):
result_blocks = OrderedDict()
# ConditionalPipelineBlocks (includes AutoPipelineBlocks)
if isinstance(block, ConditionalPipelineBlocks):
trigger_kwargs = {name: active_inputs.get(name) for name in block.block_trigger_inputs}
selected_block_name = block.select_block(**trigger_kwargs)
if selected_block_name is None:
selected_block_name = block.default_block_name
if selected_block_name is None:
return result_blocks
selected_block = block.sub_blocks[selected_block_name]
if selected_block.sub_blocks:
result_blocks.update(fn_recursive_traverse(selected_block, block_name, active_inputs))
# sequential(include loopsequential) or PipelineBlock
if not hasattr(block, "block_trigger_inputs"):
if block.sub_blocks:
# sequential or LoopSequentialPipelineBlocks (keep traversing)
for sub_block_name, sub_block in block.sub_blocks.items():
blocks_to_update = fn_recursive_traverse(sub_block, sub_block_name, active_triggers)
blocks_to_update = {f"{block_name}.{k}": v for k, v in blocks_to_update.items()}
result_blocks.update(blocks_to_update)
else:
result_blocks[block_name] = selected_block
if hasattr(selected_block, "outputs"):
for out in selected_block.outputs:
active_inputs[out.name] = True
# PipelineBlock
result_blocks[block_name] = block
# Add this block's output names to active triggers if defined
if hasattr(block, "outputs"):
active_triggers.update(out.name for out in block.outputs)
return result_blocks
# SequentialPipelineBlocks or LoopSequentialPipelineBlocks
if block.sub_blocks:
for sub_block_name, sub_block in block.sub_blocks.items():
blocks_to_update = fn_recursive_traverse(sub_block, sub_block_name, active_inputs)
blocks_to_update = {f"{block_name}.{k}": v for k, v in blocks_to_update.items()}
result_blocks.update(blocks_to_update)
# auto
else:
result_blocks[block_name] = block
if hasattr(block, "outputs"):
for out in block.outputs:
active_inputs[out.name] = True
# Find first block_trigger_input that matches any value in our active_triggers
this_block = None
for trigger_input in block.block_trigger_inputs:
if trigger_input is not None and trigger_input in active_triggers:
this_block = block.trigger_to_block_map[trigger_input]
break
# If no matches found, try to get the default (None) block
if this_block is None and None in block.block_trigger_inputs:
this_block = block.trigger_to_block_map[None]
if this_block is not None:
# sequential/auto (keep traversing)
if this_block.sub_blocks:
result_blocks.update(fn_recursive_traverse(this_block, block_name, active_triggers))
else:
# PipelineBlock
result_blocks[block_name] = this_block
# Add this block's output names to active triggers if defined
# YiYi TODO: do we need outputs here? can it just be intermediate_outputs? can we get rid of outputs attribute?
if hasattr(this_block, "outputs"):
active_triggers.update(out.name for out in this_block.outputs)
return result_blocks
all_blocks = OrderedDict()
for block_name, block in self.sub_blocks.items():
blocks_to_update = fn_recursive_traverse(block, block_name, active_inputs)
blocks_to_update = fn_recursive_traverse(block, block_name, active_triggers)
all_blocks.update(blocks_to_update)
return all_blocks
def get_execution_blocks(self, **kwargs):
"""
Get the blocks that would execute given the specified inputs.
def get_execution_blocks(self, *trigger_inputs):
trigger_inputs_all = self.trigger_inputs
Args:
**kwargs: Input names and values. Only trigger inputs affect block selection.
Pass any inputs that would be non-None at runtime.
if trigger_inputs is not None:
if not isinstance(trigger_inputs, (list, tuple, set)):
trigger_inputs = [trigger_inputs]
invalid_inputs = [x for x in trigger_inputs if x not in trigger_inputs_all]
if invalid_inputs:
logger.warning(
f"The following trigger inputs will be ignored as they are not supported: {invalid_inputs}"
)
trigger_inputs = [x for x in trigger_inputs if x in trigger_inputs_all]
Returns:
SequentialPipelineBlocks containing only the blocks that would execute
Example:
# Get blocks for inpainting workflow
blocks = pipeline.get_execution_blocks(prompt="a cat", mask=mask, image=image)
# Get blocks for text2image workflow
blocks = pipeline.get_execution_blocks(prompt="a cat")
"""
# Filter out None values
active_inputs = {k: v for k, v in kwargs.items() if v is not None}
blocks_triggered = self._traverse_trigger_blocks(active_inputs)
if trigger_inputs is None:
if None in trigger_inputs_all:
trigger_inputs = [None]
else:
trigger_inputs = [trigger_inputs_all[0]]
blocks_triggered = self._traverse_trigger_blocks(trigger_inputs)
return SequentialPipelineBlocks.from_blocks_dict(blocks_triggered)
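A hedged usage sketch of this signature. `QwenImageAutoBlocks` is exported elsewhere in this diff and is assumed importable from `diffusers.modular_pipelines`; the trigger name is illustrative (unsupported names are warned about and ignored by the code above):
```python
# Hedged usage sketch; the import path and trigger name are assumptions.
from diffusers.modular_pipelines import QwenImageAutoBlocks

blocks = QwenImageAutoBlocks()
print(blocks.trigger_inputs)                    # trigger names, plus None for defaults
img2img = blocks.get_execution_blocks("image")  # path selected when `image` is provided
default = blocks.get_execution_blocks()         # default path
```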
def __repr__(self):
@@ -1096,7 +1063,7 @@ class SequentialPipelineBlocks(ModularPipelineBlocks):
header += f" Trigger Inputs: {[inp for inp in self.trigger_inputs if inp is not None]}\n"
# Get first trigger input as example
example_input = next(t for t in self.trigger_inputs if t is not None)
header += f" Use `get_execution_blocks()` to see selected blocks (e.g. `get_execution_blocks({example_input}=...)`).\n"
header += f" Use `get_execution_blocks()` with input names to see selected blocks (e.g. `get_execution_blocks('{example_input}')`).\n"
header += " " + "=" * 100 + "\n\n"
# Format description with proper indentation
@@ -1120,9 +1087,22 @@ class SequentialPipelineBlocks(ModularPipelineBlocks):
# Blocks section - moved to the end with simplified format
blocks_str = " Sub-Blocks:\n"
for i, (name, block) in enumerate(self.sub_blocks.items()):
# show execution order
blocks_str += f" [{i}] {name} ({block.__class__.__name__})\n"
# Get trigger input for this block
trigger = None
if hasattr(self, "block_to_trigger_map"):
trigger = self.block_to_trigger_map.get(name)
# Format the trigger info
if trigger is None:
trigger_str = "[default]"
elif isinstance(trigger, (list, tuple)):
trigger_str = f"[trigger: {', '.join(str(t) for t in trigger)}]"
else:
trigger_str = f"[trigger: {trigger}]"
# For AutoPipelineBlocks, add bullet points
blocks_str += f"{name} {trigger_str} ({block.__class__.__name__})\n"
else:
# For SequentialPipelineBlocks, show execution order
blocks_str += f" [{i}] {name} ({block.__class__.__name__})\n"
# Add block description
desc_lines = block.description.split("\n")
@@ -1246,9 +1226,15 @@ class LoopSequentialPipelineBlocks(ModularPipelineBlocks):
if inp.name not in outputs and inp not in inputs:
inputs.append(inp)
# Add this block's outputs
block_intermediate_outputs = [out.name for out in block.intermediate_outputs]
outputs.update(block_intermediate_outputs)
# Only add outputs if the block cannot be skipped
should_add_outputs = True
if hasattr(block, "block_trigger_inputs") and None not in block.block_trigger_inputs:
should_add_outputs = False
if should_add_outputs:
# Add this block's outputs
block_intermediate_outputs = [out.name for out in block.intermediate_outputs]
outputs.update(block_intermediate_outputs)
for input_param in inputs:
if input_param.name in self.required_inputs:
@@ -1305,14 +1291,6 @@ class LoopSequentialPipelineBlocks(ModularPipelineBlocks):
sub_blocks[block_name] = block
self.sub_blocks = sub_blocks
# Validate that sub_blocks are only leaf blocks
for block_name, block in self.sub_blocks.items():
if block.sub_blocks:
raise ValueError(
f"In {self.__class__.__name__}, sub_blocks must be leaf blocks (no sub_blocks). "
f"Block '{block_name}' ({block.__class__.__name__}) has sub_blocks."
)
@classmethod
def from_blocks_dict(cls, blocks_dict: Dict[str, Any]) -> "LoopSequentialPipelineBlocks":
"""
@@ -1547,8 +1525,10 @@ class ModularPipeline(ConfigMixin, PushToHubMixin):
if blocks is None:
if modular_config_dict is not None:
blocks_class_name = modular_config_dict.get("_blocks_class_name")
else:
elif config_dict is not None:
blocks_class_name = self.get_default_blocks_name(config_dict)
else:
blocks_class_name = None
if blocks_class_name is not None:
diffusers_module = importlib.import_module("diffusers")
blocks_class = getattr(diffusers_module, blocks_class_name)
@@ -1645,10 +1625,7 @@ class ModularPipeline(ConfigMixin, PushToHubMixin):
return None, config_dict
except EnvironmentError as e:
raise EnvironmentError(
f"Failed to load config from '{pretrained_model_name_or_path}'. "
f"Could not find or load 'modular_model_index.json' or 'model_index.json'."
) from e
logger.debug(f" model_index.json not found in the repo: {e}")
return None, None
@@ -2573,11 +2550,7 @@ class ModularPipeline(ConfigMixin, PushToHubMixin):
kwargs_type = expected_input_param.kwargs_type
if name in passed_kwargs:
state.set(name, passed_kwargs.pop(name), kwargs_type)
elif kwargs_type is not None and kwargs_type in passed_kwargs:
kwargs_dict = passed_kwargs.pop(kwargs_type)
for k, v in kwargs_dict.items():
state.set(k, v, kwargs_type)
elif name is not None and name not in state.values:
elif name not in state.values:
state.set(name, default, kwargs_type)
# Warn about unexpected inputs

View File

@@ -21,16 +21,21 @@ except OptionalDependencyNotAvailable:
_dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
else:
_import_structure["modular_blocks_qwenimage"] = [
_import_structure["encoders"] = ["QwenImageTextEncoderStep"]
_import_structure["modular_blocks"] = [
"ALL_BLOCKS",
"AUTO_BLOCKS",
"QwenImageAutoBlocks",
]
_import_structure["modular_blocks_qwenimage_edit"] = [
"CONTROLNET_BLOCKS",
"EDIT_AUTO_BLOCKS",
"QwenImageEditAutoBlocks",
]
_import_structure["modular_blocks_qwenimage_edit_plus"] = [
"EDIT_BLOCKS",
"EDIT_INPAINT_BLOCKS",
"EDIT_PLUS_AUTO_BLOCKS",
"EDIT_PLUS_BLOCKS",
"IMAGE2IMAGE_BLOCKS",
"INPAINT_BLOCKS",
"TEXT2IMAGE_BLOCKS",
"QwenImageAutoBlocks",
"QwenImageEditAutoBlocks",
"QwenImageEditPlusAutoBlocks",
]
_import_structure["modular_pipeline"] = [
@@ -46,16 +51,23 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
except OptionalDependencyNotAvailable:
from ...utils.dummy_torch_and_transformers_objects import * # noqa F403
else:
from .modular_blocks_qwenimage import (
from .encoders import (
QwenImageTextEncoderStep,
)
from .modular_blocks import (
ALL_BLOCKS,
AUTO_BLOCKS,
QwenImageAutoBlocks,
)
from .modular_blocks_qwenimage_edit import (
CONTROLNET_BLOCKS,
EDIT_AUTO_BLOCKS,
QwenImageEditAutoBlocks,
)
from .modular_blocks_qwenimage_edit_plus import (
EDIT_BLOCKS,
EDIT_INPAINT_BLOCKS,
EDIT_PLUS_AUTO_BLOCKS,
EDIT_PLUS_BLOCKS,
IMAGE2IMAGE_BLOCKS,
INPAINT_BLOCKS,
TEXT2IMAGE_BLOCKS,
QwenImageAutoBlocks,
QwenImageEditAutoBlocks,
QwenImageEditPlusAutoBlocks,
)
from .modular_pipeline import (
@@ -74,4 +86,4 @@ else:
)
for name, value in _dummy_objects.items():
setattr(sys.modules[__name__], name, value)

View File

@@ -639,65 +639,19 @@ class QwenImageEditRoPEInputsStep(ModularPipelineBlocks):
return components, state
class QwenImageEditPlusRoPEInputsStep(ModularPipelineBlocks):
"""RoPE inputs step for Edit Plus that handles lists of image heights/widths."""
class QwenImageEditPlusRoPEInputsStep(QwenImageEditRoPEInputsStep):
model_name = "qwenimage-edit-plus"
@property
def description(self) -> str:
return (
"Step that prepares the RoPE inputs for denoising process. This is used in QwenImage Edit Plus.\n"
"Unlike Edit, Edit Plus handles lists of image_height/image_width for multiple reference images.\n"
"Should be placed after prepare_latents step."
)
@property
def inputs(self) -> List[InputParam]:
return [
InputParam(name="batch_size", required=True),
InputParam(name="image_height", required=True, type_hint=List[int]),
InputParam(name="image_width", required=True, type_hint=List[int]),
InputParam(name="height", required=True),
InputParam(name="width", required=True),
InputParam(name="prompt_embeds_mask"),
InputParam(name="negative_prompt_embeds_mask"),
]
@property
def intermediate_outputs(self) -> List[OutputParam]:
return [
OutputParam(
name="img_shapes",
type_hint=List[List[Tuple[int, int, int]]],
description="The shapes of the image latents, used for RoPE calculation",
),
OutputParam(
name="txt_seq_lens",
kwargs_type="denoiser_input_fields",
type_hint=List[int],
description="The sequence lengths of the prompt embeds, used for RoPE calculation",
),
OutputParam(
name="negative_txt_seq_lens",
kwargs_type="denoiser_input_fields",
type_hint=List[int],
description="The sequence lengths of the negative prompt embeds, used for RoPE calculation",
),
]
def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState:
block_state = self.get_block_state(state)
vae_scale_factor = components.vae_scale_factor
# Edit Plus: image_height and image_width are lists
block_state.img_shapes = [
[
(1, block_state.height // vae_scale_factor // 2, block_state.width // vae_scale_factor // 2),
*[
(1, img_height // vae_scale_factor // 2, img_width // vae_scale_factor // 2)
for img_height, img_width in zip(block_state.image_height, block_state.image_width)
(1, vae_height // vae_scale_factor // 2, vae_width // vae_scale_factor // 2)
for vae_height, vae_width in zip(block_state.image_height, block_state.image_width)
],
]
] * block_state.batch_size
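The `// vae_scale_factor // 2` arithmetic converts pixel sizes into the packed-latent grid: one VAE downsample, then the 2x2 patchification. A worked example with illustrative numbers:
```python
# Worked img_shapes arithmetic; vae_scale_factor=8 is illustrative, not read
# from the pipeline.
height, width, vae_scale_factor = 1024, 768, 8
print((1, height // vae_scale_factor // 2, width // vae_scale_factor // 2))
# (1, 64, 48): one frame, a 64x48 grid of 2x2 latent patches
```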

View File

@@ -30,47 +30,6 @@ from .modular_pipeline import QwenImageModularPipeline, QwenImagePachifier
logger = logging.get_logger(__name__)
class QwenImageAfterDenoiseStep(ModularPipelineBlocks):
model_name = "qwenimage"
@property
def description(self) -> str:
return "Step that unpack the latents from 3D tensor (batch_size, sequence_length, channels) into 5D tensor (batch_size, channels, 1, height, width)"
@property
def expected_components(self) -> List[ComponentSpec]:
components = [
ComponentSpec("pachifier", QwenImagePachifier, default_creation_method="from_config"),
]
return components
@property
def inputs(self) -> List[InputParam]:
return [
InputParam(name="height", required=True),
InputParam(name="width", required=True),
InputParam(
name="latents",
required=True,
type_hint=torch.Tensor,
description="The latents to decode, can be generated in the denoise step",
),
]
@torch.no_grad()
def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState:
block_state = self.get_block_state(state)
vae_scale_factor = components.vae_scale_factor
block_state.latents = components.pachifier.unpack_latents(
block_state.latents, block_state.height, block_state.width, vae_scale_factor=vae_scale_factor
)
self.set_block_state(state, block_state)
return components, state
class QwenImageDecoderStep(ModularPipelineBlocks):
model_name = "qwenimage"
@@ -82,6 +41,7 @@ class QwenImageDecoderStep(ModularPipelineBlocks):
def expected_components(self) -> List[ComponentSpec]:
components = [
ComponentSpec("vae", AutoencoderKLQwenImage),
ComponentSpec("pachifier", QwenImagePachifier, default_creation_method="from_config"),
]
return components
@@ -89,6 +49,8 @@ class QwenImageDecoderStep(ModularPipelineBlocks):
@property
def inputs(self) -> List[InputParam]:
return [
InputParam(name="height", required=True),
InputParam(name="width", required=True),
InputParam(
name="latents",
required=True,
@@ -112,12 +74,10 @@ class QwenImageDecoderStep(ModularPipelineBlocks):
block_state = self.get_block_state(state)
# YiYi Notes: remove support for output_type = "latents", we can just skip the decode/encode step in modular
if block_state.latents.ndim == 4:
block_state.latents = block_state.latents.unsqueeze(dim=1)
elif block_state.latents.ndim != 5:
raise ValueError(
f"expect latents to be a 4D or 5D tensor but got: {block_state.latents.shape}. Please make sure the latents are unpacked before decode step."
)
vae_scale_factor = components.vae_scale_factor
block_state.latents = components.pachifier.unpack_latents(
block_state.latents, block_state.height, block_state.width, vae_scale_factor=vae_scale_factor
)
block_state.latents = block_state.latents.to(components.vae.dtype)
latents_mean = (

View File

@@ -244,19 +244,18 @@ def encode_vae_image(
class QwenImageEditResizeDynamicStep(ModularPipelineBlocks):
model_name = "qwenimage"
def __init__(
self,
input_name: str = "image",
output_name: str = "resized_image",
target_area: int = 1024 * 1024,
):
"""Create a configurable step for resizing images to the target area while maintaining the aspect ratio.
def __init__(self, input_name: str = "image", output_name: str = "resized_image"):
"""Create a configurable step for resizing images to the target area (1024 * 1024) while maintaining the aspect ratio.
This block resizes an input image tensor and exposes the resized result under configurable input and output
names. Use this when you need to wire the resize step to different image fields (e.g., "image",
"control_image")
Args:
input_name (str, optional): Name of the image field to read from the
pipeline state. Defaults to "image".
output_name (str, optional): Name of the resized image field to write
back to the pipeline state. Defaults to "resized_image".
target_area (int, optional): Target area in pixels. Defaults to 1024*1024.
"""
if not isinstance(input_name, str) or not isinstance(output_name, str):
raise ValueError(
@@ -264,12 +263,11 @@ class QwenImageEditResizeDynamicStep(ModularPipelineBlocks):
)
self._image_input_name = input_name
self._resized_image_output_name = output_name
self._target_area = target_area
super().__init__()
@property
def description(self) -> str:
return f"Image Resize step that resize the {self._image_input_name} to the target area {self._target_area} while maintaining the aspect ratio."
return f"Image Resize step that resize the {self._image_input_name} to the target area (1024 * 1024) while maintaining the aspect ratio."
@property
def expected_components(self) -> List[ComponentSpec]:
@@ -322,67 +320,48 @@ class QwenImageEditResizeDynamicStep(ModularPipelineBlocks):
self.set_block_state(state, block_state)
return components, state
class QwenImageEditPlusResizeDynamicStep(ModularPipelineBlocks):
"""Resize each image independently based on its own aspect ratio. For QwenImage Edit Plus."""
model_name = "qwenimage-edit-plus"
class QwenImageEditPlusResizeDynamicStep(QwenImageEditResizeDynamicStep):
model_name = "qwenimage"
def __init__(
self,
input_name: str = "image",
output_name: str = "resized_image",
target_area: int = 1024 * 1024,
vae_image_output_name: str = "vae_image",
):
"""Create a step for resizing images to a target area.
"""Create a configurable step for resizing images to the target area (384 * 384) while maintaining the aspect ratio.
Each image is resized independently based on its own aspect ratio.
This is suitable for Edit Plus where multiple reference images can have different dimensions.
This block resizes an input image or a list of input images and exposes the resized result under configurable
input and output names. Use this when you need to wire the resize step to different image fields (e.g.,
"image", "control_image")
Args:
input_name (str, optional): Name of the image field to read. Defaults to "image".
output_name (str, optional): Name of the resized image field to write. Defaults to "resized_image".
target_area (int, optional): Target area in pixels. Defaults to 1024*1024.
input_name (str, optional): Name of the image field to read from the
pipeline state. Defaults to "image".
output_name (str, optional): Name of the resized image field to write
back to the pipeline state. Defaults to "resized_image".
vae_image_output_name (str, optional): Name of the image field
to write back to the pipeline state. This is used by the VAE encoder step later on. QwenImage Edit Plus
processes the input image(s) differently for the VL and the VAE.
"""
if not isinstance(input_name, str) or not isinstance(output_name, str):
raise ValueError(
f"input_name and output_name must be strings but are {type(input_name)} and {type(output_name)}"
)
self.condition_image_size = 384 * 384
self._image_input_name = input_name
self._resized_image_output_name = output_name
self._target_area = target_area
self._vae_image_output_name = vae_image_output_name
super().__init__()
@property
def description(self) -> str:
return (
f"Image Resize step that resizes {self._image_input_name} to target area {self._target_area}.\n"
"Each image is resized independently based on its own aspect ratio."
)
@property
def expected_components(self) -> List[ComponentSpec]:
return [
ComponentSpec(
"image_resize_processor",
VaeImageProcessor,
config=FrozenDict({"vae_scale_factor": 16}),
default_creation_method="from_config",
),
]
@property
def inputs(self) -> List[InputParam]:
return [
InputParam(
name=self._image_input_name, required=True, type_hint=torch.Tensor, description="The image(s) to resize"
),
]
@property
def intermediate_outputs(self) -> List[OutputParam]:
return [
return super().intermediate_outputs + [
OutputParam(
name=self._resized_image_output_name, type_hint=List[PIL.Image.Image], description="The resized images"
name=self._vae_image_output_name,
type_hint=List[PIL.Image.Image],
description="The images to be processed which will be further used by the VAE encoder.",
),
]
@@ -395,21 +374,26 @@ class QwenImageEditPlusResizeDynamicStep(ModularPipelineBlocks):
if not is_valid_image_imagelist(images):
raise ValueError(f"Images must be image or list of images but are {type(images)}")
if is_valid_image(images):
if (
not isinstance(images, torch.Tensor)
and isinstance(images, PIL.Image.Image)
and not isinstance(images, list)
):
images = [images]
# Resize each image independently based on its own aspect ratio
resized_images = []
for image in images:
image_width, image_height = image.size
calculated_width, calculated_height, _ = calculate_dimensions(
self._target_area, image_width / image_height
)
resized_images.append(
components.image_resize_processor.resize(image, height=calculated_height, width=calculated_width)
# TODO (sayakpaul): revisit this when the inputs are `torch.Tensor`s
condition_images = []
vae_images = []
for img in images:
image_width, image_height = img.size
condition_width, condition_height, _ = calculate_dimensions(
self.condition_image_size, image_width / image_height
)
condition_images.append(components.image_resize_processor.resize(img, condition_height, condition_width))
vae_images.append(img)
setattr(block_state, self._resized_image_output_name, resized_images)
setattr(block_state, self._resized_image_output_name, condition_images)
setattr(block_state, self._vae_image_output_name, vae_images)
self.set_block_state(state, block_state)
return components, state
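`calculate_dimensions` picks a width/height pair whose product approximates the target area at the image's own aspect ratio. A minimal sketch of that idea, not the library's exact rounding:
```python
# Minimal area-targeting sketch; the real calculate_dimensions may round
# differently (e.g. snapping to multiples of the VAE stride).
import math

def calc_dims(target_area: int, aspect_ratio: float) -> tuple:
    width = math.sqrt(target_area * aspect_ratio)
    height = width / aspect_ratio
    return round(width), round(height)

print(calc_dims(384 * 384, 16 / 9))  # (512, 288): 512*288 == 384*384
print(calc_dims(1024 * 1024, 1.0))   # (1024, 1024)
```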
@@ -663,30 +647,8 @@ class QwenImageEditTextEncoderStep(ModularPipelineBlocks):
return components, state
class QwenImageEditPlusTextEncoderStep(ModularPipelineBlocks):
"""Text encoder for QwenImage Edit Plus that handles multiple reference images."""
model_name = "qwenimage-edit-plus"
@property
def description(self) -> str:
return (
"Text Encoder step for QwenImage Edit Plus that processes prompt and multiple images together "
"to generate text embeddings for guiding image generation."
)
@property
def expected_components(self) -> List[ComponentSpec]:
return [
ComponentSpec("text_encoder", Qwen2_5_VLForConditionalGeneration),
ComponentSpec("processor", Qwen2VLProcessor),
ComponentSpec(
"guider",
ClassifierFreeGuidance,
config=FrozenDict({"guidance_scale": 4.0}),
default_creation_method="from_config",
),
]
class QwenImageEditPlusTextEncoderStep(QwenImageEditTextEncoderStep):
model_name = "qwenimage"
@property
def expected_configs(self) -> List[ConfigSpec]:
@@ -702,60 +664,6 @@ class QwenImageEditPlusTextEncoderStep(ModularPipelineBlocks):
ConfigSpec(name="prompt_template_encode_start_idx", default=64),
]
@property
def inputs(self) -> List[InputParam]:
return [
InputParam(name="prompt", required=True, type_hint=str, description="The prompt to encode"),
InputParam(name="negative_prompt", type_hint=str, description="The negative prompt to encode"),
InputParam(
name="resized_cond_image",
required=True,
type_hint=torch.Tensor,
description="The image(s) to encode, can be a single image or list of images, should be resized to 384x384 using resize step",
),
]
@property
def intermediate_outputs(self) -> List[OutputParam]:
return [
OutputParam(
name="prompt_embeds",
kwargs_type="denoiser_input_fields",
type_hint=torch.Tensor,
description="The prompt embeddings",
),
OutputParam(
name="prompt_embeds_mask",
kwargs_type="denoiser_input_fields",
type_hint=torch.Tensor,
description="The encoder attention mask",
),
OutputParam(
name="negative_prompt_embeds",
kwargs_type="denoiser_input_fields",
type_hint=torch.Tensor,
description="The negative prompt embeddings",
),
OutputParam(
name="negative_prompt_embeds_mask",
kwargs_type="denoiser_input_fields",
type_hint=torch.Tensor,
description="The negative prompt embeddings mask",
),
]
@staticmethod
def check_inputs(prompt, negative_prompt):
if not isinstance(prompt, str) and not isinstance(prompt, list):
raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
if (
negative_prompt is not None
and not isinstance(negative_prompt, str)
and not isinstance(negative_prompt, list)
):
raise ValueError(f"`negative_prompt` has to be of type `str` or `list` but is {type(negative_prompt)}")
@torch.no_grad()
def __call__(self, components: QwenImageModularPipeline, state: PipelineState):
block_state = self.get_block_state(state)
@@ -768,7 +676,7 @@ class QwenImageEditPlusTextEncoderStep(ModularPipelineBlocks):
components.text_encoder,
components.processor,
prompt=block_state.prompt,
image=block_state.resized_cond_image,
image=block_state.resized_image,
prompt_template_encode=components.config.prompt_template_encode,
img_template_encode=components.config.img_template_encode,
prompt_template_encode_start_idx=components.config.prompt_template_encode_start_idx,
@@ -784,7 +692,7 @@ class QwenImageEditPlusTextEncoderStep(ModularPipelineBlocks):
components.text_encoder,
components.processor,
prompt=negative_prompt,
image=block_state.resized_cond_image,
image=block_state.resized_image,
prompt_template_encode=components.config.prompt_template_encode,
img_template_encode=components.config.img_template_encode,
prompt_template_encode_start_idx=components.config.prompt_template_encode_start_idx,
@@ -938,60 +846,60 @@ class QwenImageProcessImagesInputStep(ModularPipelineBlocks):
self.set_block_state(state, block_state)
return components, state
class QwenImageEditPlusProcessImagesInputStep(ModularPipelineBlocks):
class QwenImageEditPlusProcessImagesInputStep(QwenImageProcessImagesInputStep):
model_name = "qwenimage-edit-plus"
def __init__(self):
self.vae_image_size = 1024 * 1024
super().__init__()
@property
def description(self) -> str:
return "Image Preprocess step. Images can be resized first using QwenImageEditResizeDynamicStep."
@property
def expected_components(self) -> List[ComponentSpec]:
return [
ComponentSpec(
"image_processor",
VaeImageProcessor,
config=FrozenDict({"vae_scale_factor": 16}),
default_creation_method="from_config",
),
]
return "Image Preprocess step for QwenImage Edit Plus. Unlike QwenImage Edit, QwenImage Edit Plus doesn't use the same resized image for further preprocessing."
@property
def inputs(self) -> List[InputParam]:
return [InputParam("resized_image")]
@property
def intermediate_outputs(self) -> List[OutputParam]:
return [OutputParam(name="processed_image")]
return [InputParam("vae_image"), InputParam("image"), InputParam("height"), InputParam("width")]
@torch.no_grad()
def __call__(self, components: QwenImageModularPipeline, state: PipelineState):
block_state = self.get_block_state(state)
if block_state.vae_image is None and block_state.image is None:
raise ValueError("`vae_image` and `image` cannot be None at the same time")
image = block_state.resized_image
is_image_list = isinstance(image, list)
if not is_image_list:
image = [image]
processed_images = []
for img in image:
img_width, img_height = img.size
processed_images.append(components.image_processor.preprocess(image=img, height=img_height, width=img_width))
block_state.processed_image = processed_images
if is_image_list:
block_state.processed_image = processed_images
vae_image_sizes = None
if block_state.vae_image is None:
image = block_state.image
self.check_inputs(
height=block_state.height, width=block_state.width, vae_scale_factor=components.vae_scale_factor
)
height = block_state.height or components.default_height
width = block_state.width or components.default_width
block_state.processed_image = components.image_processor.preprocess(
image=image, height=height, width=width
)
else:
block_state.processed_image = processed_images[0]
# QwenImage Edit Plus can allow multiple input images with varied resolutions
processed_images = []
vae_image_sizes = []
for img in block_state.vae_image:
width, height = img.size
vae_width, vae_height, _ = calculate_dimensions(self.vae_image_size, width / height)
vae_image_sizes.append((vae_width, vae_height))
processed_images.append(
components.image_processor.preprocess(image=img, height=vae_height, width=vae_width)
)
block_state.processed_image = processed_images
block_state.vae_image_sizes = vae_image_sizes
self.set_block_state(state, block_state)
return components, state
class QwenImageVaeEncoderDynamicStep(ModularPipelineBlocks):
"""VAE encoder that handles both single images and lists of images with varied resolutions."""
class QwenImageVaeEncoderDynamicStep(ModularPipelineBlocks):
model_name = "qwenimage"
def __init__(
@@ -1001,12 +909,21 @@ class QwenImageVaeEncoderDynamicStep(ModularPipelineBlocks):
):
"""Initialize a VAE encoder step for converting images to latent representations.
Handles both single images and lists of images. When input is a list, outputs a list of latents.
When input is a single tensor, outputs a single latent tensor.
Both the input and output names are configurable so this block can be configured to process to different image
inputs (e.g., "processed_image" -> "image_latents", "processed_control_image" -> "control_image_latents").
Args:
input_name (str, optional): Name of the input image tensor or list. Defaults to "processed_image".
output_name (str, optional): Name of the output latent tensor or list. Defaults to "image_latents".
input_name (str, optional): Name of the input image tensor. Defaults to "processed_image".
Examples: "processed_image" or "processed_control_image"
output_name (str, optional): Name of the output latent tensor. Defaults to "image_latents".
Examples: "image_latents" or "control_image_latents"
Examples:
    # Basic usage with default settings (includes image processor)
    QwenImageVaeEncoderDynamicStep()

    # Custom input/output names for control image
    QwenImageVaeEncoderDynamicStep(
        input_name="processed_control_image", output_name="control_image_latents"
    )
"""
self._image_input_name = input_name
self._image_latents_output_name = output_name
@@ -1014,18 +931,17 @@ class QwenImageVaeEncoderDynamicStep(ModularPipelineBlocks):
@property
def description(self) -> str:
return (
f"VAE Encoder step that converts {self._image_input_name} into latent representations {self._image_latents_output_name}.\n"
"Handles both single images and lists of images with varied resolutions."
)
return f"Dynamic VAE Encoder step that converts {self._image_input_name} into latent representations {self._image_latents_output_name}.\n"
@property
def expected_components(self) -> List[ComponentSpec]:
return [ComponentSpec("vae", AutoencoderKLQwenImage)]
components = [ComponentSpec("vae", AutoencoderKLQwenImage)]
return components
@property
def inputs(self) -> List[InputParam]:
return [InputParam(self._image_input_name, required=True), InputParam("generator")]
inputs = [InputParam(self._image_input_name, required=True), InputParam("generator")]
return inputs
@property
def intermediate_outputs(self) -> List[OutputParam]:
@@ -1033,7 +949,7 @@ class QwenImageVaeEncoderDynamicStep(ModularPipelineBlocks):
OutputParam(
self._image_latents_output_name,
type_hint=torch.Tensor,
description="The latents representing the reference image(s). Single tensor or list depending on input.",
description="The latents representing the reference image",
)
]
@@ -1045,11 +961,47 @@ class QwenImageVaeEncoderDynamicStep(ModularPipelineBlocks):
dtype = components.vae.dtype
image = getattr(block_state, self._image_input_name)
        # Encode image into latents
image_latents = encode_vae_image(
image=image,
vae=components.vae,
generator=block_state.generator,
device=device,
dtype=dtype,
latent_channels=components.num_channels_latents,
)
setattr(block_state, self._image_latents_output_name, image_latents)
self.set_block_state(state, block_state)
return components, state
class QwenImageEditPlusVaeEncoderDynamicStep(QwenImageVaeEncoderDynamicStep):
model_name = "qwenimage-edit-plus"
@property
def intermediate_outputs(self) -> List[OutputParam]:
        # Each reference image latent can have a different resolution, hence we return a list.
return [
OutputParam(
self._image_latents_output_name,
type_hint=List[torch.Tensor],
description="The latents representing the reference image(s).",
)
]
@torch.no_grad()
def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState:
block_state = self.get_block_state(state)
device = components._execution_device
dtype = components.vae.dtype
image = getattr(block_state, self._image_input_name)
# Encode image into latents
image_latents = []
for img in image:
image_latents.append(
@@ -1062,12 +1014,9 @@ class QwenImageVaeEncoderDynamicStep(ModularPipelineBlocks):
latent_channels=components.num_channels_latents,
)
)
setattr(block_state, self._image_latents_output_name, image_latents)
self.set_block_state(state, block_state)
return components, state
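# Illustrative only: why the Edit Plus encoder keeps latents as a list. Two
# reference images with different resolutions produce latents whose spatial
# shapes differ, so they cannot be stacked into a single batch tensor.
# (Shapes below assume 16 latent channels and a spatial scale factor of 8.)
import torch

_latents_a = torch.randn(1, 16, 1, 64, 64)  # e.g. a 512x512 reference image
_latents_b = torch.randn(1, 16, 1, 48, 80)  # e.g. a 384x640 reference image
# torch.cat([_latents_a, _latents_b], dim=0) would fail: spatial shapes differ,
# hence one list entry per image, each patchified separately downstream.
_image_latents = [_latents_a, _latents_b]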

View File

@@ -222,15 +222,36 @@ class QwenImageTextInputsStep(ModularPipelineBlocks):
class QwenImageInputsDynamicStep(ModularPipelineBlocks):
"""Input step for QwenImage: update height/width, expand batch, patchify."""
model_name = "qwenimage"
    def __init__(self, image_latent_inputs: List[str] = ["image_latents"], additional_batch_inputs: List[str] = []):
"""Initialize a configurable step that standardizes the inputs for the denoising step. It:\n"
This step handles multiple common tasks to prepare inputs for the denoising step:
1. For encoded image latents, use it update height/width if None, patchifies, and expands batch size
2. For additional_batch_inputs: Only expands batch dimensions to match final batch size
This is a dynamic block that allows you to configure which inputs to process.
Args:
image_latent_inputs (List[str], optional): Names of image latent tensors to process.
These will be used to determine height/width, patchified, and batch-expanded. Can be a single string or
list of strings. Defaults to ["image_latents"]. Examples: ["image_latents"], ["control_image_latents"]
additional_batch_inputs (List[str], optional):
Names of additional conditional input tensors to expand batch size. These tensors will only have their
batch dimensions adjusted to match the final batch size. Can be a single string or list of strings.
Defaults to []. Examples: ["processed_mask_image"]
        Examples:
            # Configure to process image_latents (default behavior)
            QwenImageInputsDynamicStep()

            # Configure to process multiple image latent inputs
            QwenImageInputsDynamicStep(image_latent_inputs=["image_latents", "control_image_latents"])

            # Configure to process image latents and additional batch inputs
            QwenImageInputsDynamicStep(
                image_latent_inputs=["image_latents"], additional_batch_inputs=["processed_mask_image"]
            )
"""
if not isinstance(image_latent_inputs, list):
image_latent_inputs = [image_latent_inputs]
if not isinstance(additional_batch_inputs, list):
@@ -242,12 +263,14 @@ class QwenImageInputsDynamicStep(ModularPipelineBlocks):
@property
def description(self) -> str:
# Functionality section
summary_section = (
"Input processing step that:\n"
" 1. For image latent inputs: Updates height/width if None, patchifies, and expands batch size\n"
" 1. For image latent inputs: Updates height/width if None, patchifies latents, and expands batch size\n"
" 2. For additional batch inputs: Expands batch dimensions to match final batch size"
)
# Inputs info
inputs_info = ""
if self._image_latent_inputs or self._additional_batch_inputs:
inputs_info = "\n\nConfigured inputs:"
@@ -256,16 +279,11 @@ class QwenImageInputsDynamicStep(ModularPipelineBlocks):
if self._additional_batch_inputs:
inputs_info += f"\n - Additional batch inputs: {self._additional_batch_inputs}"
# Placement guidance
placement_section = "\n\nThis block should be placed after the encoder steps and the text input step."
return summary_section + inputs_info + placement_section
@property
def inputs(self) -> List[InputParam]:
inputs = [
@@ -275,9 +293,11 @@ class QwenImageInputsDynamicStep(ModularPipelineBlocks):
InputParam(name="width"),
]
# Add image latent inputs
for image_latent_input_name in self._image_latent_inputs:
inputs.append(InputParam(name=image_latent_input_name))
# Add additional batch inputs
for input_name in self._additional_batch_inputs:
inputs.append(InputParam(name=input_name))
@@ -290,16 +310,22 @@ class QwenImageInputsDynamicStep(ModularPipelineBlocks):
OutputParam(name="image_width", type_hint=int, description="The width of the image latents"),
]
@property
def expected_components(self) -> List[ComponentSpec]:
return [
ComponentSpec("pachifier", QwenImagePachifier, default_creation_method="from_config"),
]
def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState:
block_state = self.get_block_state(state)
        # Process image latent inputs (height/width calculation, patchify, and batch expansion)
for image_latent_input_name in self._image_latent_inputs:
image_latent_tensor = getattr(block_state, image_latent_input_name)
if image_latent_tensor is None:
continue
            # 1. Calculate height/width from latents
height, width = calculate_dimension_from_latents(image_latent_tensor, components.vae_scale_factor)
block_state.height = block_state.height or height
block_state.width = block_state.width or width
@@ -309,7 +335,7 @@ class QwenImageInputsDynamicStep(ModularPipelineBlocks):
if not hasattr(block_state, "image_width"):
block_state.image_width = width
            # 2. Patchify the image latent tensor
image_latent_tensor = components.pachifier.pack_latents(image_latent_tensor)
# 3. Expand batch size
@@ -328,6 +354,7 @@ class QwenImageInputsDynamicStep(ModularPipelineBlocks):
if input_tensor is None:
continue
# Only expand batch size
input_tensor = repeat_tensor_to_batch_size(
input_name=input_name,
input_tensor=input_tensor,
@@ -341,130 +368,63 @@ class QwenImageInputsDynamicStep(ModularPipelineBlocks):
return components, state
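# A minimal sketch of what the pachifier's `pack_latents` does, assuming 2x2
# spatial patches: (B, C, 1, H, W) latents become a (B, H/2 * W/2, C*4) token
# sequence. This mirrors the packing used by the QwenImage pipelines; treat it
# as an illustration, not the component's exact implementation.
import torch

def _pack_latents_sketch(latents: torch.Tensor) -> torch.Tensor:
    b, c, _, h, w = latents.shape
    latents = latents.view(b, c, h // 2, 2, w // 2, 2)
    latents = latents.permute(0, 2, 4, 1, 3, 5)  # (B, H/2, W/2, C, 2, 2)
    return latents.reshape(b, (h // 2) * (w // 2), c * 4)

_packed = _pack_latents_sketch(torch.randn(1, 16, 1, 64, 64))  # -> (1, 1024, 64)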
class QwenImageEditPlusInputsDynamicStep(QwenImageInputsDynamicStep):
model_name = "qwenimage-edit-plus"
    def __init__(
        self,
        image_latent_inputs: List[str] = ["image_latents"],
        additional_batch_inputs: List[str] = [],
    ):
        super().__init__(
            image_latent_inputs=image_latent_inputs, additional_batch_inputs=additional_batch_inputs
        )
@property
def description(self) -> str:
summary_section = (
"Input processing step for Edit Plus that:\n"
" 1. For image latent inputs (list): Collects heights/widths, patchifies each, concatenates, expands batch\n"
" 2. For additional batch inputs: Expands batch dimensions to match final batch size\n"
" Height/width defaults to last image in the list."
)
inputs_info = ""
if self._image_latent_inputs or self._additional_batch_inputs:
inputs_info = "\n\nConfigured inputs:"
if self._image_latent_inputs:
inputs_info += f"\n - Image latent inputs: {self._image_latent_inputs}"
if self._additional_batch_inputs:
inputs_info += f"\n - Additional batch inputs: {self._additional_batch_inputs}"
placement_section = "\n\nThis block should be placed after the encoder steps and the text input step."
return summary_section + inputs_info + placement_section
@property
def expected_components(self) -> List[ComponentSpec]:
return [
ComponentSpec("pachifier", QwenImagePachifier, default_creation_method="from_config"),
]
@property
def inputs(self) -> List[InputParam]:
inputs = [
InputParam(name="num_images_per_prompt", default=1),
InputParam(name="batch_size", required=True),
InputParam(name="height"),
InputParam(name="width"),
]
for image_latent_input_name in self._image_latent_inputs:
inputs.append(InputParam(name=image_latent_input_name))
for input_name in self._additional_batch_inputs:
inputs.append(InputParam(name=input_name))
return inputs
@property
def intermediate_outputs(self) -> List[OutputParam]:
return [
OutputParam(name="image_height", type_hint=List[int], description="The heights of the image latents"),
OutputParam(name="image_width", type_hint=List[int], description="The widths of the image latents"),
OutputParam(name="image_height", type_hint=List[int], description="The height of the image latents"),
OutputParam(name="image_width", type_hint=List[int], description="The width of the image latents"),
]
def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState:
block_state = self.get_block_state(state)
        # Process image latent inputs (height/width calculation, patchify, and batch expansion)
for image_latent_input_name in self._image_latent_inputs:
image_latent_tensor = getattr(block_state, image_latent_input_name)
if image_latent_tensor is None:
continue
is_list = isinstance(image_latent_tensor, list)
if not is_list:
image_latent_tensor = [image_latent_tensor]
            # Each image latent can have a different size in QwenImage Edit Plus.
image_heights = []
image_widths = []
packed_image_latent_tensors = []
            for img_latent_tensor in image_latent_tensor:
# 1. Calculate height/width from latents
height, width = calculate_dimension_from_latents(img_latent_tensor, components.vae_scale_factor)
image_heights.append(height)
image_widths.append(width)
                # 2. Patchify the image latent tensor
img_latent_tensor = components.pachifier.pack_latents(img_latent_tensor)
# 3. Expand batch size
img_latent_tensor = repeat_tensor_to_batch_size(
input_name=f"{image_latent_input_name}[{i}]",
input_name=image_latent_input_name,
input_tensor=img_latent_tensor,
num_images_per_prompt=block_state.num_images_per_prompt,
batch_size=block_state.batch_size,
)
packed_image_latent_tensors.append(img_latent_tensor)
# Concatenate all packed latents along dim=1
packed_image_latent_tensors = torch.cat(packed_image_latent_tensors, dim=1)
# Output lists of heights/widths
block_state.image_height = image_heights
block_state.image_width = image_widths
# Default height/width from last image
block_state.height = block_state.height or image_heights[-1]
block_state.width = block_state.width or image_widths[-1]
setattr(block_state, image_latent_input_name, packed_image_latent_tensors)
# Process additional batch inputs (only batch expansion)
for input_name in self._additional_batch_inputs:
input_tensor = getattr(block_state, input_name)
if input_tensor is None:
continue
# Only expand batch size
input_tensor = repeat_tensor_to_batch_size(
input_name=input_name,
input_tensor=input_tensor,
@@ -476,6 +436,8 @@ class QwenImageEditPlusInputsDynamicStep(ModularPipelineBlocks):
self.set_block_state(state, block_state)
return components, state
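# Sketch of the batch-expansion helper used by both input steps (signature
# inferred from the call sites above; the real implementation may differ):
# conditional inputs are repeated so their batch dimension matches
# batch_size * num_images_per_prompt.
import torch

def _repeat_tensor_to_batch_size_sketch(input_name, input_tensor, num_images_per_prompt, batch_size):
    final_batch_size = batch_size * num_images_per_prompt
    if input_tensor.shape[0] == final_batch_size:
        return input_tensor
    if final_batch_size % input_tensor.shape[0] != 0:
        raise ValueError(
            f"`{input_name}` batch size {input_tensor.shape[0]} cannot be expanded to {final_batch_size}."
        )
    return input_tensor.repeat_interleave(final_batch_size // input_tensor.shape[0], dim=0)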
class QwenImageControlNetInputsStep(ModularPipelineBlocks):
model_name = "qwenimage"

File diff suppressed because it is too large

View File

@@ -1,465 +0,0 @@
# Copyright 2025 Qwen-Image Team and The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from ...utils import logging
from ..modular_pipeline import AutoPipelineBlocks, ConditionalPipelineBlocks, SequentialPipelineBlocks
from ..modular_pipeline_utils import InsertableDict
from .before_denoise import (
QwenImageControlNetBeforeDenoiserStep,
QwenImageCreateMaskLatentsStep,
QwenImagePrepareLatentsStep,
QwenImagePrepareLatentsWithStrengthStep,
QwenImageRoPEInputsStep,
QwenImageSetTimestepsStep,
QwenImageSetTimestepsWithStrengthStep,
)
from .decoders import (
QwenImageAfterDenoiseStep,
QwenImageDecoderStep,
QwenImageInpaintProcessImagesOutputStep,
QwenImageProcessImagesOutputStep,
)
from .denoise import (
QwenImageControlNetDenoiseStep,
QwenImageDenoiseStep,
QwenImageInpaintControlNetDenoiseStep,
QwenImageInpaintDenoiseStep,
QwenImageLoopBeforeDenoiserControlNet,
)
from .encoders import (
QwenImageControlNetVaeEncoderStep,
QwenImageInpaintProcessImagesInputStep,
QwenImageProcessImagesInputStep,
QwenImageTextEncoderStep,
QwenImageVaeEncoderDynamicStep,
)
from .inputs import (
QwenImageControlNetInputsStep,
QwenImageInputsDynamicStep,
QwenImageTextInputsStep,
)
logger = logging.get_logger(__name__)
# 1. VAE ENCODER
# inpaint vae encoder
class QwenImageInpaintVaeEncoderStep(SequentialPipelineBlocks):
model_name = "qwenimage"
block_classes = [QwenImageInpaintProcessImagesInputStep(), QwenImageVaeEncoderDynamicStep()]
block_names = ["preprocess", "encode"]
@property
def description(self) -> str:
return (
"This step is used for processing image and mask inputs for inpainting tasks. It:\n"
" - Resizes the image to the target size, based on `height` and `width`.\n"
" - Processes and updates `image` and `mask_image`.\n"
" - Creates `image_latents`."
)
# img2img vae encoder
class QwenImageImg2ImgVaeEncoderStep(SequentialPipelineBlocks):
model_name = "qwenimage"
block_classes = [QwenImageProcessImagesInputStep(), QwenImageVaeEncoderDynamicStep()]
block_names = ["preprocess", "encode"]
@property
def description(self) -> str:
return "Vae encoder step that preprocess andencode the image inputs into their latent representations."
# auto vae encoder
class QwenImageAutoVaeEncoderStep(AutoPipelineBlocks):
block_classes = [QwenImageInpaintVaeEncoderStep, QwenImageImg2ImgVaeEncoderStep]
block_names = ["inpaint", "img2img"]
block_trigger_inputs = ["mask_image", "image"]
@property
def description(self):
return (
"Vae encoder step that encode the image inputs into their latent representations.\n"
+ "This is an auto pipeline block.\n"
+ " - `QwenImageInpaintVaeEncoderStep` (inpaint) is used when `mask_image` is provided.\n"
+ " - `QwenImageImg2ImgVaeEncoderStep` (img2img) is used when `image` is provided.\n"
+ " - if `mask_image` or `image` is not provided, step will be skipped."
)
# optional controlnet vae encoder
class QwenImageOptionalControlNetVaeEncoderStep(AutoPipelineBlocks):
block_classes = [QwenImageControlNetVaeEncoderStep]
block_names = ["controlnet"]
block_trigger_inputs = ["control_image"]
@property
def description(self):
return (
"Vae encoder step that encode the image inputs into their latent representations.\n"
+ "This is an auto pipeline block.\n"
+ " - `QwenImageControlNetVaeEncoderStep` (controlnet) is used when `control_image` is provided.\n"
+ " - if `control_image` is not provided, step will be skipped."
)
# 2. DENOISE
# input -> prepare_latents -> set_timesteps -> prepare_rope_inputs -> denoise -> after_denoise
# img2img input
class QwenImageImg2ImgInputStep(SequentialPipelineBlocks):
model_name = "qwenimage"
block_classes = [QwenImageTextInputsStep(), QwenImageInputsDynamicStep(image_latent_inputs=["image_latents"])]
block_names = ["text_inputs", "additional_inputs"]
@property
def description(self):
return "Input step that prepares the inputs for the img2img denoising step. It:\n"
" - make sure the text embeddings have consistent batch size as well as the additional inputs (`image_latents`).\n"
" - update height/width based `image_latents`, patchify `image_latents`."
# inpaint input
class QwenImageInpaintInputStep(SequentialPipelineBlocks):
model_name = "qwenimage"
block_classes = [QwenImageTextInputsStep(), QwenImageInputsDynamicStep(image_latent_inputs=["image_latents"], additional_batch_inputs=["processed_mask_image"])]
block_names = ["text_inputs", "additional_inputs"]
@property
def description(self):
return "Input step that prepares the inputs for the inpainting denoising step. It:\n"
" - make sure the text embeddings have consistent batch size as well as the additional inputs (`image_latents` and `processed_mask_image`).\n"
" - update height/width based `image_latents`, patchify `image_latents`."
# inpaint prepare latents
class QwenImageInpaintPrepareLatentsStep(SequentialPipelineBlocks):
model_name = "qwenimage"
block_classes = [QwenImagePrepareLatentsWithStrengthStep(), QwenImageCreateMaskLatentsStep()]
block_names = ["add_noise_to_latents", "create_mask_latents"]
@property
def description(self) -> str:
return (
"This step prepares the latents/image_latents and mask inputs for the inpainting denoising step. It:\n"
" - Add noise to the image latents to create the latents input for the denoiser.\n"
" - Create the pachified latents `mask` based on the processedmask image.\n"
)
# CoreDenoiseStep:
# (input + prepare_latents + set_timesteps + prepare_rope_inputs + denoise + after_denoise)
# 1. text2image
class QwenImageCoreDenoiseStep(SequentialPipelineBlocks):
model_name = "qwenimage"
block_classes = [
QwenImageTextInputsStep(),
QwenImagePrepareLatentsStep(),
QwenImageSetTimestepsStep(),
QwenImageRoPEInputsStep(),
QwenImageDenoiseStep(),
QwenImageAfterDenoiseStep(),
]
block_names = [
"input",
"prepare_latents",
"set_timesteps",
"prepare_rope_inputs",
"denoise",
"after_denoise",
]
@property
def description(self):
return "step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the inputs (timesteps, latents, rope inputs etc.)."
# 2.inpaint
class QwenImageInpaintCoreDenoiseStep(SequentialPipelineBlocks):
model_name = "qwenimage"
block_classes = [
QwenImageInpaintInputStep(),
QwenImagePrepareLatentsStep(),
QwenImageSetTimestepsWithStrengthStep(),
QwenImageInpaintPrepareLatentsStep(),
QwenImageRoPEInputsStep(),
QwenImageInpaintDenoiseStep(),
QwenImageAfterDenoiseStep(),
]
block_names = [
"input",
"prepare_latents",
"set_timesteps",
"prepare_inpaint_latents",
"prepare_rope_inputs",
"denoise",
"after_denoise",
]
@property
def description(self):
return "Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for inpaint task."
# 3. img2img
class QwenImageImg2ImgCoreDenoiseStep(SequentialPipelineBlocks):
model_name = "qwenimage"
block_classes = [
QwenImageImg2ImgInputStep(),
QwenImagePrepareLatentsStep(),
QwenImageSetTimestepsWithStrengthStep(),
QwenImagePrepareLatentsWithStrengthStep(),
QwenImageRoPEInputsStep(),
QwenImageDenoiseStep(),
QwenImageAfterDenoiseStep(),
]
block_names = [
"input",
"prepare_latents",
"set_timesteps",
"prepare_img2img_latents",
"prepare_rope_inputs",
"denoise",
"after_denoise",
]
@property
def description(self):
return "Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for img2img task."
# 4. text2image + controlnet
class QwenImageControlNetCoreDenoiseStep(SequentialPipelineBlocks):
model_name = "qwenimage"
block_classes = [
QwenImageTextInputsStep(),
QwenImageControlNetInputsStep(),
QwenImagePrepareLatentsStep(),
QwenImageSetTimestepsStep(),
QwenImageRoPEInputsStep(),
QwenImageControlNetBeforeDenoiserStep(),
QwenImageControlNetDenoiseStep(),
QwenImageAfterDenoiseStep(),
]
block_names = [
"input",
"controlnet_input",
"prepare_latents",
"set_timesteps",
"prepare_rope_inputs",
"controlnet_before_denoise",
"controlnet_denoise",
"after_denoise",
]
@property
def description(self):
return "step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the inputs (timesteps, latents, rope inputs etc.)."
# 5. inpaint + controlnet
class QwenImageControlNetInpaintCoreDenoiseStep(SequentialPipelineBlocks):
model_name = "qwenimage"
block_classes = [
QwenImageInpaintInputStep(),
QwenImageControlNetInputsStep(),
QwenImagePrepareLatentsStep(),
QwenImageSetTimestepsWithStrengthStep(),
QwenImageInpaintPrepareLatentsStep(),
QwenImageRoPEInputsStep(),
QwenImageControlNetBeforeDenoiserStep(),
QwenImageInpaintControlNetDenoiseStep(),
QwenImageAfterDenoiseStep(),
]
block_names = [
"input",
"controlnet_input",
"prepare_latents",
"set_timesteps",
"prepare_inpaint_latents",
"prepare_rope_inputs",
"controlnet_before_denoise",
"controlnet_denoise",
"after_denoise",
]
@property
def description(self):
return "Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for inpaint task."
# 6. img2img + controlnet
class QwenImageControlNetImg2ImgCoreDenoiseStep(SequentialPipelineBlocks):
model_name = "qwenimage"
block_classes = [
QwenImageImg2ImgInputStep(),
QwenImageControlNetInputsStep(),
QwenImagePrepareLatentsStep(),
QwenImageSetTimestepsWithStrengthStep(),
QwenImagePrepareLatentsWithStrengthStep(),
QwenImageRoPEInputsStep(),
QwenImageControlNetBeforeDenoiserStep(),
QwenImageControlNetDenoiseStep(),
QwenImageAfterDenoiseStep(),
]
block_names = [
"input",
"controlnet_input",
"prepare_latents",
"set_timesteps",
"prepare_img2img_latents",
"prepare_rope_inputs",
"controlnet_before_denoise",
"controlnet_denoise",
"after_denoise",
]
@property
def description(self):
return "Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for img2img task."
# auto denoise
# auto denoise step for controlnet tasks: works for all tasks with controlnet
class QwenImageAutoCoreDenoiseStep(ConditionalPipelineBlocks):
block_classes = [
QwenImageCoreDenoiseStep,
QwenImageInpaintCoreDenoiseStep,
QwenImageImg2ImgCoreDenoiseStep,
QwenImageControlNetCoreDenoiseStep,
QwenImageControlNetInpaintCoreDenoiseStep,
QwenImageControlNetImg2ImgCoreDenoiseStep,
]
block_names = [
"text2image",
"inpaint",
"img2img",
"controlnet_text2image",
"controlnet_inpaint",
"controlnet_img2img"]
block_trigger_inputs = ["control_image_latents", "processed_mask_image", "image_latents"]
default_block_name = "text2image"
def select_block(self, control_image_latents=None, processed_mask_image=None, image_latents=None):
if control_image_latents is not None:
if processed_mask_image is not None:
return "controlnet_inpaint"
elif image_latents is not None:
return "controlnet_img2img"
else:
return "controlnet_text2image"
else:
if processed_mask_image is not None:
return "inpaint"
elif image_latents is not None:
return "img2img"
else:
return "text2image"
@property
def description(self):
return (
"Core step that performs the denoising process. \n"
+ " - `QwenImageCoreDenoiseStep` (text2image) for text2image tasks.\n"
+ " - `QwenImageInpaintCoreDenoiseStep` (inpaint) for inpaint tasks.\n"
+ " - `QwenImageImg2ImgCoreDenoiseStep` (img2img) for img2img tasks.\n"
+ " - `QwenImageControlNetCoreDenoiseStep` (controlnet_text2image) for text2image tasks with controlnet.\n"
+ " - `QwenImageControlNetInpaintCoreDenoiseStep` (controlnet_inpaint) for inpaint tasks with controlnet.\n"
+ " - `QwenImageControlNetImg2ImgCoreDenoiseStep` (controlnet_img2img) for img2img tasks with controlnet.\n"
+ "This step support text-to-image, image-to-image, inpainting, and controlnet tasks for QwenImage:\n"
+ " - for image-to-image generation, you need to provide `image_latents`\n"
+ " - for inpainting, you need to provide `processed_mask_image` and `image_latents`\n"
+ " - to run the controlnet workflow, you need to provide `control_image_latents`\n"
+ " - for text-to-image generation, all you need to provide is prompt embeddings"
)
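# Illustration of the routing above: `select_block` keys purely off which
# optional inputs are present in the state (tensors below are placeholders).
import torch

_latents = torch.randn(1, 4096, 64)  # placeholder packed image latents
_mask = torch.randn(1, 4096, 64)     # placeholder packed mask latents
_ctrl = torch.randn(1, 4096, 64)     # placeholder controlnet image latents

_step = QwenImageAutoCoreDenoiseStep()
assert _step.select_block() == "text2image"
assert _step.select_block(image_latents=_latents) == "img2img"
assert _step.select_block(processed_mask_image=_mask, image_latents=_latents) == "inpaint"
assert _step.select_block(control_image_latents=_ctrl, image_latents=_latents) == "controlnet_img2img"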
# 4. DECODE
## 1.1 text2image
#### decode
#### (standard decode step works for most tasks except for inpaint)
class QwenImageDecodeStep(SequentialPipelineBlocks):
model_name = "qwenimage"
block_classes = [QwenImageDecoderStep(), QwenImageProcessImagesOutputStep()]
block_names = ["decode", "postprocess"]
@property
def description(self):
return "Decode step that decodes the latents to images and postprocess the generated image."
#### inpaint decode
class QwenImageInpaintDecodeStep(SequentialPipelineBlocks):
model_name = "qwenimage"
block_classes = [QwenImageDecoderStep(), QwenImageInpaintProcessImagesOutputStep()]
block_names = ["decode", "postprocess"]
@property
def description(self):
return "Decode step that decodes the latents to images and postprocess the generated image, optional apply the mask overally to the original image."
# auto decode step for inpaint and text2image tasks
class QwenImageAutoDecodeStep(AutoPipelineBlocks):
block_classes = [QwenImageInpaintDecodeStep, QwenImageDecodeStep]
block_names = ["inpaint_decode", "decode"]
block_trigger_inputs = ["mask", None]
@property
def description(self):
return (
"Decode step that decode the latents into images. \n"
" This is an auto pipeline block that works for inpaint/text2image/img2img tasks, for both QwenImage and QwenImage-Edit.\n"
+ " - `QwenImageInpaintDecodeStep` (inpaint) is used when `mask` is provided.\n"
+ " - `QwenImageDecodeStep` (text2image/img2img) is used when `mask` is not provided.\n"
)
## 1.10 QwenImage/auto block & presets
AUTO_BLOCKS = InsertableDict(
[
("text_encoder", QwenImageTextEncoderStep()),
("vae_encoder", QwenImageAutoVaeEncoderStep()),
("controlnet_vae_encoder", QwenImageOptionalControlNetVaeEncoderStep()),
("denoise", QwenImageAutoCoreDenoiseStep()),
("decode", QwenImageAutoDecodeStep()),
]
)
class QwenImageAutoBlocks(SequentialPipelineBlocks):
model_name = "qwenimage"
block_classes = AUTO_BLOCKS.values()
block_names = AUTO_BLOCKS.keys()
@property
def description(self):
return (
"Auto Modular pipeline for text-to-image, image-to-image, inpainting, and controlnet tasks using QwenImage.\n"
+ "- for image-to-image generation, you need to provide `image`\n"
+ "- for inpainting, you need to provide `mask_image` and `image`, optionally you can provide `padding_mask_crop` \n"
+ "- to run the controlnet workflow, you need to provide `control_image`\n"
+ "- for text-to-image generation, all you need to provide is `prompt`"
)
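# Usage sketch (the repo id, `output` kwarg, and device handling are
# assumptions; modular blocks assemble into a pipeline via `init_pipeline`):
import torch

blocks = QwenImageAutoBlocks()
pipe = blocks.init_pipeline("Qwen/Qwen-Image")  # hypothetical modular repo id
pipe.load_components(torch_dtype=torch.bfloat16)  # device placement omitted for brevity
# prompt only -> text2image; add `image` for img2img, `mask_image` for inpaint,
# `control_image` for controlnet.
images = pipe(prompt="a cat wearing a tiny hat", output="images")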

View File

@@ -1,329 +0,0 @@
# Copyright 2025 Qwen-Image Team and The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Optional
from ...utils import logging
from ..modular_pipeline import AutoPipelineBlocks, ConditionalPipelineBlocks, SequentialPipelineBlocks
from ..modular_pipeline_utils import InsertableDict
from .before_denoise import (
QwenImageCreateMaskLatentsStep,
QwenImageEditRoPEInputsStep,
QwenImagePrepareLatentsStep,
QwenImagePrepareLatentsWithStrengthStep,
QwenImageSetTimestepsStep,
QwenImageSetTimestepsWithStrengthStep,
)
from .decoders import (
QwenImageAfterDenoiseStep,
QwenImageDecoderStep,
QwenImageInpaintProcessImagesOutputStep,
QwenImageProcessImagesOutputStep,
)
from .denoise import (
QwenImageEditDenoiseStep,
QwenImageEditInpaintDenoiseStep,
)
from .encoders import (
QwenImageEditResizeDynamicStep,
QwenImageEditTextEncoderStep,
QwenImageInpaintProcessImagesInputStep,
QwenImageProcessImagesInputStep,
QwenImageVaeEncoderDynamicStep,
)
from .inputs import (
QwenImageInputsDynamicStep,
QwenImageTextInputsStep,
)
logger = logging.get_logger(__name__)
# ====================
# 1. TEXT ENCODER
# ====================
class QwenImageEditVLEncoderStep(SequentialPipelineBlocks):
"""VL encoder that takes both image and text prompts."""
model_name = "qwenimage-edit"
block_classes = [
QwenImageEditResizeDynamicStep(),
QwenImageEditTextEncoderStep(),
]
block_names = ["resize", "encode"]
@property
def description(self) -> str:
return "QwenImage-Edit VL encoder step that encode the image and text prompts together."
# ====================
# 2. VAE ENCODER
# ====================
# Edit VAE encoder
class QwenImageEditVaeEncoderStep(SequentialPipelineBlocks):
model_name = "qwenimage-edit"
block_classes = [
QwenImageEditResizeDynamicStep(),
QwenImageProcessImagesInputStep(),
QwenImageVaeEncoderDynamicStep(),
]
block_names = ["resize", "preprocess", "encode"]
@property
def description(self) -> str:
return "Vae encoder step that encode the image inputs into their latent representations."
# Edit Inpaint VAE encoder
class QwenImageEditInpaintVaeEncoderStep(SequentialPipelineBlocks):
model_name = "qwenimage-edit"
block_classes = [
QwenImageEditResizeDynamicStep(),
QwenImageInpaintProcessImagesInputStep(),
QwenImageVaeEncoderDynamicStep(input_name="processed_image", output_name="image_latents"),
]
block_names = ["resize", "preprocess", "encode"]
@property
def description(self) -> str:
return (
"This step is used for processing image and mask inputs for QwenImage-Edit inpaint tasks. It:\n"
" - resize the image for target area (1024 * 1024) while maintaining the aspect ratio.\n"
" - process the resized image and mask image.\n"
" - create image latents."
)
# Auto VAE encoder
class QwenImageEditAutoVaeEncoderStep(AutoPipelineBlocks):
block_classes = [QwenImageEditInpaintVaeEncoderStep, QwenImageEditVaeEncoderStep]
block_names = ["edit_inpaint", "edit"]
block_trigger_inputs = ["mask_image", "image"]
@property
def description(self):
return (
"Vae encoder step that encode the image inputs into their latent representations.\n"
"This is an auto pipeline block.\n"
" - `QwenImageEditInpaintVaeEncoderStep` (edit_inpaint) is used when `mask_image` is provided.\n"
" - `QwenImageEditVaeEncoderStep` (edit) is used when `image` is provided.\n"
" - if `mask_image` or `image` is not provided, step will be skipped."
)
# ====================
# 3. DENOISE - input -> prepare_latents -> set_timesteps -> prepare_rope_inputs -> denoise -> after_denoise
# ====================
# Edit input step
class QwenImageEditInputStep(SequentialPipelineBlocks):
model_name = "qwenimage-edit"
block_classes = [
QwenImageTextInputsStep(),
QwenImageInputsDynamicStep(image_latent_inputs=["image_latents"]),
]
block_names = ["text_inputs", "additional_inputs"]
@property
def description(self):
return (
"Input step that prepares the inputs for the edit denoising step. It:\n"
" - make sure the text embeddings have consistent batch size as well as the additional inputs.\n"
" - update height/width based `image_latents`, patchify `image_latents`."
)
# Edit Inpaint input step
class QwenImageEditInpaintInputStep(SequentialPipelineBlocks):
model_name = "qwenimage-edit"
block_classes = [
QwenImageTextInputsStep(),
QwenImageInputsDynamicStep(image_latent_inputs=["image_latents"], additional_batch_inputs=["processed_mask_image"]),
]
block_names = ["text_inputs", "additional_inputs"]
@property
def description(self):
return (
"Input step that prepares the inputs for the edit inpaint denoising step. It:\n"
" - make sure the text embeddings have consistent batch size as well as the additional inputs.\n"
" - update height/width based `image_latents`, patchify `image_latents`."
)
# Edit Inpaint prepare latents step
class QwenImageEditInpaintPrepareLatentsStep(SequentialPipelineBlocks):
model_name = "qwenimage-edit"
block_classes = [QwenImagePrepareLatentsWithStrengthStep(), QwenImageCreateMaskLatentsStep()]
block_names = ["add_noise_to_latents", "create_mask_latents"]
@property
def description(self) -> str:
return (
"This step prepares the latents/image_latents and mask inputs for the edit inpainting denoising step. It:\n"
" - Add noise to the image latents to create the latents input for the denoiser.\n"
" - Create the patchified latents `mask` based on the processed mask image.\n"
)
# 1. Edit (img2img) core denoise
class QwenImageEditCoreDenoiseStep(SequentialPipelineBlocks):
model_name = "qwenimage-edit"
block_classes = [
QwenImageEditInputStep(),
QwenImagePrepareLatentsStep(),
QwenImageSetTimestepsStep(),
QwenImageEditRoPEInputsStep(),
QwenImageEditDenoiseStep(),
QwenImageAfterDenoiseStep(),
]
block_names = [
"input",
"prepare_latents",
"set_timesteps",
"prepare_rope_inputs",
"denoise",
"after_denoise",
]
@property
def description(self):
return "Core denoising workflow for QwenImage-Edit edit (img2img) task."
# 2. Edit Inpaint core denoise
class QwenImageEditInpaintCoreDenoiseStep(SequentialPipelineBlocks):
model_name = "qwenimage-edit"
block_classes = [
QwenImageEditInpaintInputStep(),
QwenImagePrepareLatentsStep(),
QwenImageSetTimestepsWithStrengthStep(),
QwenImageEditInpaintPrepareLatentsStep(),
QwenImageEditRoPEInputsStep(),
QwenImageEditInpaintDenoiseStep(),
QwenImageAfterDenoiseStep(),
]
block_names = [
"input",
"prepare_latents",
"set_timesteps",
"prepare_inpaint_latents",
"prepare_rope_inputs",
"denoise",
"after_denoise",
]
@property
def description(self):
return "Core denoising workflow for QwenImage-Edit edit inpaint task."
# Auto core denoise step
class QwenImageEditAutoCoreDenoiseStep(ConditionalPipelineBlocks):
block_classes = [
QwenImageEditInpaintCoreDenoiseStep,
QwenImageEditCoreDenoiseStep,
]
block_names = ["edit_inpaint", "edit"]
block_trigger_inputs = ["processed_mask_image", "image_latents"]
default_block_name = "edit"
def select_block(self, processed_mask_image=None, image_latents=None) -> Optional[str]:
if processed_mask_image is not None:
return "edit_inpaint"
elif image_latents is not None:
return "edit"
return None
@property
def description(self):
return (
"Auto core denoising step that selects the appropriate workflow based on inputs.\n"
" - `QwenImageEditInpaintCoreDenoiseStep` when `processed_mask_image` is provided\n"
" - `QwenImageEditCoreDenoiseStep` when `image_latents` is provided\n"
"Supports edit (img2img) and edit inpainting tasks for QwenImage-Edit."
)
# ====================
# 4. DECODE
# ====================
# Decode step (standard)
class QwenImageEditDecodeStep(SequentialPipelineBlocks):
model_name = "qwenimage-edit"
block_classes = [QwenImageDecoderStep(), QwenImageProcessImagesOutputStep()]
block_names = ["decode", "postprocess"]
@property
def description(self):
return "Decode step that decodes the latents to images and postprocess the generated image."
# Inpaint decode step
class QwenImageEditInpaintDecodeStep(SequentialPipelineBlocks):
model_name = "qwenimage-edit"
block_classes = [QwenImageDecoderStep(), QwenImageInpaintProcessImagesOutputStep()]
block_names = ["decode", "postprocess"]
@property
def description(self):
return "Decode step that decodes the latents to images and postprocess the generated image, optionally apply the mask overlay to the original image."
# Auto decode step
class QwenImageEditAutoDecodeStep(AutoPipelineBlocks):
block_classes = [QwenImageEditInpaintDecodeStep, QwenImageEditDecodeStep]
block_names = ["inpaint_decode", "decode"]
block_trigger_inputs = ["mask", None]
@property
def description(self):
return (
"Decode step that decode the latents into images.\n"
"This is an auto pipeline block.\n"
" - `QwenImageEditInpaintDecodeStep` (inpaint) is used when `mask` is provided.\n"
" - `QwenImageEditDecodeStep` (edit) is used when `mask` is not provided.\n"
)
# ====================
# 5. AUTO BLOCKS & PRESETS
# ====================
EDIT_AUTO_BLOCKS = InsertableDict(
[
("text_encoder", QwenImageEditVLEncoderStep()),
("vae_encoder", QwenImageEditAutoVaeEncoderStep()),
("denoise", QwenImageEditAutoCoreDenoiseStep()),
("decode", QwenImageEditAutoDecodeStep()),
]
)
class QwenImageEditAutoBlocks(SequentialPipelineBlocks):
model_name = "qwenimage-edit"
block_classes = EDIT_AUTO_BLOCKS.values()
block_names = EDIT_AUTO_BLOCKS.keys()
@property
def description(self):
return (
"Auto Modular pipeline for edit (img2img) and edit inpaint tasks using QwenImage-Edit.\n"
"- for edit (img2img) generation, you need to provide `image`\n"
"- for edit inpainting, you need to provide `mask_image` and `image`, optionally you can provide `padding_mask_crop`\n"
)
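# Usage sketch for the edit preset (repo id, URL, and call signature are
# assumptions): passing `mask_image` routes to the edit-inpaint sub-blocks;
# `image` alone routes to edit (img2img).
import torch
from diffusers.utils import load_image

blocks = QwenImageEditAutoBlocks()
pipe = blocks.init_pipeline("Qwen/Qwen-Image-Edit")  # hypothetical modular repo id
pipe.load_components(torch_dtype=torch.bfloat16)
src = load_image("https://example.com/room.png")  # placeholder URL
images = pipe(image=src, prompt="replace the sofa with a green armchair", output="images")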

View File

@@ -1,175 +0,0 @@
# Copyright 2025 Qwen-Image Team and The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from ...utils import logging
from ..modular_pipeline import AutoPipelineBlocks, SequentialPipelineBlocks
from ..modular_pipeline_utils import InsertableDict
from .before_denoise import (
QwenImageEditPlusRoPEInputsStep,
QwenImagePrepareLatentsStep,
QwenImageSetTimestepsStep,
)
from .decoders import (
QwenImageAfterDenoiseStep,
QwenImageDecoderStep,
QwenImageProcessImagesOutputStep,
)
from .denoise import (
QwenImageEditDenoiseStep,
)
from .encoders import (
QwenImageEditPlusResizeDynamicStep,
QwenImageEditPlusTextEncoderStep,
QwenImageEditPlusProcessImagesInputStep,
QwenImageVaeEncoderDynamicStep,
)
from .inputs import (
QwenImageEditPlusInputsDynamicStep,
QwenImageTextInputsStep,
)
logger = logging.get_logger(__name__)
# ====================
# 1. TEXT ENCODER
# ====================
class QwenImageEditPlusVLEncoderStep(SequentialPipelineBlocks):
"""VL encoder that takes both image and text prompts. Uses 384x384 target area."""
model_name = "qwenimage-edit-plus"
block_classes = [
QwenImageEditPlusResizeDynamicStep(target_area=384 * 384, output_name="resized_cond_image"),
QwenImageEditPlusTextEncoderStep(),
]
block_names = ["resize", "encode"]
@property
def description(self) -> str:
return "QwenImage-Edit Plus VL encoder step that encodes the image and text prompts together."
# ====================
# 2. VAE ENCODER
# ====================
class QwenImageEditPlusVaeEncoderStep(SequentialPipelineBlocks):
"""VAE encoder that handles multiple images with different sizes. Uses 1024x1024 target area."""
model_name = "qwenimage-edit-plus"
block_classes = [
QwenImageEditPlusResizeDynamicStep(target_area=1024 * 1024, output_name="resized_image"),
QwenImageEditPlusProcessImagesInputStep(),
QwenImageVaeEncoderDynamicStep(),
]
block_names = ["resize", "preprocess", "encode"]
@property
def description(self) -> str:
return (
"VAE encoder step that encodes image inputs into latent representations.\n"
"Each image is resized independently based on its own aspect ratio to 1024x1024 target area."
)
# ====================
# 3. DENOISE - input -> prepare_latents -> set_timesteps -> prepare_rope_inputs -> denoise -> after_denoise
# ====================
# Edit Plus input step
class QwenImageEditPlusInputStep(SequentialPipelineBlocks):
model_name = "qwenimage-edit-plus"
block_classes = [
QwenImageTextInputsStep(),
QwenImageEditPlusInputsDynamicStep(image_latent_inputs=["image_latents"]),
]
block_names = ["text_inputs", "additional_inputs"]
@property
def description(self):
return (
"Input step that prepares the inputs for the Edit Plus denoising step. It:\n"
" - Standardizes text embeddings batch size.\n"
" - Processes list of image latents: patchifies, concatenates along dim=1, expands batch.\n"
" - Outputs lists of image_height/image_width for RoPE calculation.\n"
" - Defaults height/width from last image in the list."
)
# Edit Plus core denoise
class QwenImageEditPlusCoreDenoiseStep(SequentialPipelineBlocks):
model_name = "qwenimage-edit-plus"
block_classes = [
QwenImageEditPlusInputStep(),
QwenImagePrepareLatentsStep(),
QwenImageSetTimestepsStep(),
QwenImageEditPlusRoPEInputsStep(),
QwenImageEditDenoiseStep(),
QwenImageAfterDenoiseStep(),
]
block_names = [
"input",
"prepare_latents",
"set_timesteps",
"prepare_rope_inputs",
"denoise",
"after_denoise",
]
@property
def description(self):
return "Core denoising workflow for QwenImage-Edit Plus edit (img2img) task."
# ====================
# 4. DECODE
# ====================
class QwenImageEditPlusDecodeStep(SequentialPipelineBlocks):
model_name = "qwenimage-edit-plus"
block_classes = [QwenImageDecoderStep(), QwenImageProcessImagesOutputStep()]
block_names = ["decode", "postprocess"]
@property
def description(self):
return "Decode step that decodes the latents to images and postprocesses the generated image."
# ====================
# 5. AUTO BLOCKS & PRESETS
# ====================
EDIT_PLUS_AUTO_BLOCKS = InsertableDict(
[
("text_encoder", QwenImageEditPlusVLEncoderStep()),
("vae_encoder", QwenImageEditPlusVaeEncoderStep()),
("denoise", QwenImageEditPlusCoreDenoiseStep()),
("decode", QwenImageEditPlusDecodeStep()),
]
)
class QwenImageEditPlusAutoBlocks(SequentialPipelineBlocks):
model_name = "qwenimage-edit-plus"
block_classes = EDIT_PLUS_AUTO_BLOCKS.values()
block_names = EDIT_PLUS_AUTO_BLOCKS.keys()
@property
def description(self):
return (
"Auto Modular pipeline for edit (img2img) tasks using QwenImage-Edit Plus.\n"
"- `image` is required input (can be single image or list of images).\n"
"- Each image is resized independently based on its own aspect ratio.\n"
"- VL encoder uses 384x384 target area, VAE encoder uses 1024x1024 target area."
)
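# Usage sketch for Edit Plus with multiple reference images of different sizes
# (repo id, URLs, and exact call signature are assumptions):
import torch
from diffusers.utils import load_image

blocks = QwenImageEditPlusAutoBlocks()
pipe = blocks.init_pipeline("Qwen/Qwen-Image-Edit-2509")  # hypothetical modular repo id
pipe.load_components(torch_dtype=torch.bfloat16)
img_a = load_image("https://example.com/cat.png")   # placeholder URLs; any sizes work,
img_b = load_image("https://example.com/sofa.png")  # each image is resized by its own aspect ratio
images = pipe(image=[img_a, img_b], prompt="put the cat on the sofa", output="images")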

View File

@@ -0,0 +1,95 @@
# Copyright 2025 Qwen-Image Team and The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# mellon nodes
QwenImage_NODE_TYPES_PARAMS_MAP = {
"controlnet": {
"inputs": [
"control_image",
"controlnet_conditioning_scale",
"control_guidance_start",
"control_guidance_end",
"height",
"width",
],
"model_inputs": [
"controlnet",
"vae",
],
"outputs": [
"controlnet_out",
],
"block_names": ["controlnet_vae_encoder"],
},
"denoise": {
"inputs": [
"embeddings",
"width",
"height",
"seed",
"num_inference_steps",
"guidance_scale",
"image_latents",
"strength",
"controlnet",
],
"model_inputs": [
"unet",
"guider",
"scheduler",
],
"outputs": [
"latents",
"latents_preview",
],
"block_names": ["denoise"],
},
"vae_encoder": {
"inputs": [
"image",
"width",
"height",
],
"model_inputs": [
"vae",
],
"outputs": [
"image_latents",
],
},
"text_encoder": {
"inputs": [
"prompt",
"negative_prompt",
],
"model_inputs": [
"text_encoders",
],
"outputs": [
"embeddings",
],
},
"decoder": {
"inputs": [
"latents",
],
"model_inputs": [
"vae",
],
"outputs": [
"images",
],
},
}
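# Illustrative only: how a node-graph UI could consume this map to wire sockets
# (the traversal below is an assumption, not Mellon's actual loader).
for _node_type, _params in QwenImage_NODE_TYPES_PARAMS_MAP.items():
    print(_node_type)
    print("  inputs:", _params["inputs"])              # sockets exposed to the graph editor
    print("  model_inputs:", _params["model_inputs"])  # components the node expects to be loaded
    print("  outputs:", _params["outputs"])            # values the node publishes downstream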

View File

@@ -0,0 +1,99 @@
# Copyright 2025 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
SDXL_NODE_TYPES_PARAMS_MAP = {
"controlnet": {
"inputs": [
"control_image",
"controlnet_conditioning_scale",
"control_guidance_start",
"control_guidance_end",
"height",
"width",
],
"model_inputs": [
"controlnet",
],
"outputs": [
"controlnet_out",
],
"block_names": [None],
},
"denoise": {
"inputs": [
"embeddings",
"width",
"height",
"seed",
"num_inference_steps",
"guidance_scale",
"image_latents",
"strength",
# custom adapters coming in as inputs
"controlnet",
# ip_adapter is optional and custom; include if available
"ip_adapter",
],
"model_inputs": [
"unet",
"guider",
"scheduler",
],
"outputs": [
"latents",
"latents_preview",
],
"block_names": ["denoise"],
},
"vae_encoder": {
"inputs": [
"image",
"width",
"height",
],
"model_inputs": [
"vae",
],
"outputs": [
"image_latents",
],
"block_names": ["vae_encoder"],
},
"text_encoder": {
"inputs": [
"prompt",
"negative_prompt",
],
"model_inputs": [
"text_encoders",
],
"outputs": [
"embeddings",
],
"block_names": ["text_encoder"],
},
"decoder": {
"inputs": [
"latents",
],
"model_inputs": [
"vae",
],
"outputs": [
"images",
],
"block_names": ["decode"],
},
}

View File

@@ -129,10 +129,6 @@ class ZImageLoopDenoiser(ModularPipelineBlocks):
type_hint=int,
description="The number of inference steps to use for the denoising process. Can be generated in set_timesteps step.",
),
InputParam(
kwargs_type="denoiser_input_fields",
description="conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.",
),
]
guider_input_names = []
uncond_guider_input_names = []

View File

@@ -119,7 +119,7 @@ class ZImageAutoDenoiseStep(AutoPipelineBlocks):
class ZImageAutoVaeImageEncoderStep(AutoPipelineBlocks):
block_classes = [ZImageVaeImageEncoderStep]
block_names = ["vae_encoder"]
block_names = ["vae_image_encoder"]
block_trigger_inputs = ["image"]
@property
@@ -137,7 +137,7 @@ class ZImageAutoBlocks(SequentialPipelineBlocks):
ZImageAutoDenoiseStep,
ZImageVaeDecoderStep,
]
block_names = ["text_encoder", "vae_encoder", "denoise", "decode"]
block_names = ["text_encoder", "vae_image_encoder", "denoise", "decode"]
@property
def description(self) -> str:
@@ -162,7 +162,7 @@ TEXT2IMAGE_BLOCKS = InsertableDict(
IMAGE2IMAGE_BLOCKS = InsertableDict(
[
("text_encoder", ZImageTextEncoderStep),
("vae_encoder", ZImageVaeImageEncoderStep),
("vae_image_encoder", ZImageVaeImageEncoderStep),
("input", ZImageTextInputStep),
("additional_inputs", ZImageAdditionalInputsStep(image_latent_inputs=["image_latents"])),
("prepare_latents", ZImagePrepareLatentsStep),
@@ -178,7 +178,7 @@ IMAGE2IMAGE_BLOCKS = InsertableDict(
AUTO_BLOCKS = InsertableDict(
[
("text_encoder", ZImageTextEncoderStep),
("vae_encoder", ZImageAutoVaeImageEncoderStep),
("vae_image_encoder", ZImageAutoVaeImageEncoderStep),
("denoise", ZImageAutoDenoiseStep),
("decode", ZImageVaeDecoderStep),
]

View File

@@ -165,7 +165,6 @@ else:
_import_structure["cogview4"] = ["CogView4Pipeline", "CogView4ControlPipeline"]
_import_structure["consisid"] = ["ConsisIDPipeline"]
_import_structure["cosmos"] = [
"Cosmos2_5_PredictBasePipeline",
"Cosmos2TextToImagePipeline",
"CosmosTextToWorldPipeline",
"CosmosVideoToWorldPipeline",
@@ -623,7 +622,6 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
StableDiffusionXLControlNetXSPipeline,
)
from .cosmos import (
Cosmos2_5_PredictBasePipeline,
Cosmos2TextToImagePipeline,
Cosmos2VideoToWorldPipeline,
CosmosTextToWorldPipeline,

View File

@@ -22,9 +22,6 @@ except OptionalDependencyNotAvailable:
_dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
else:
_import_structure["pipeline_cosmos2_5_predict"] = [
"Cosmos2_5_PredictBasePipeline",
]
_import_structure["pipeline_cosmos2_text2image"] = ["Cosmos2TextToImagePipeline"]
_import_structure["pipeline_cosmos2_video2world"] = ["Cosmos2VideoToWorldPipeline"]
_import_structure["pipeline_cosmos_text2world"] = ["CosmosTextToWorldPipeline"]
@@ -38,9 +35,6 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
except OptionalDependencyNotAvailable:
from ...utils.dummy_torch_and_transformers_objects import *
else:
from .pipeline_cosmos2_5_predict import (
Cosmos2_5_PredictBasePipeline,
)
from .pipeline_cosmos2_text2image import Cosmos2TextToImagePipeline
from .pipeline_cosmos2_video2world import Cosmos2VideoToWorldPipeline
from .pipeline_cosmos_text2world import CosmosTextToWorldPipeline

View File

@@ -1,847 +0,0 @@
# Copyright 2025 The NVIDIA Team and The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Callable, Dict, List, Optional, Union
import numpy as np
import torch
import torchvision
import torchvision.transforms
import torchvision.transforms.functional
from transformers import AutoTokenizer, Qwen2_5_VLForConditionalGeneration
from ...callbacks import MultiPipelineCallbacks, PipelineCallback
from ...image_processor import PipelineImageInput
from ...models import AutoencoderKLWan, CosmosTransformer3DModel
from ...schedulers import UniPCMultistepScheduler
from ...utils import is_cosmos_guardrail_available, is_torch_xla_available, logging, replace_example_docstring
from ...utils.torch_utils import randn_tensor
from ...video_processor import VideoProcessor
from ..pipeline_utils import DiffusionPipeline
from .pipeline_output import CosmosPipelineOutput
if is_cosmos_guardrail_available():
from cosmos_guardrail import CosmosSafetyChecker
else:
class CosmosSafetyChecker:
def __init__(self, *args, **kwargs):
raise ImportError(
"`cosmos_guardrail` is not installed. Please install it to use the safety checker for Cosmos: `pip install cosmos_guardrail`."
)
if is_torch_xla_available():
import torch_xla.core.xla_model as xm
XLA_AVAILABLE = True
else:
XLA_AVAILABLE = False
logger = logging.get_logger(__name__) # pylint: disable=invalid-name
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
def retrieve_latents(
encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
):
if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
return encoder_output.latent_dist.sample(generator)
elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax":
return encoder_output.latent_dist.mode()
elif hasattr(encoder_output, "latents"):
return encoder_output.latents
else:
raise AttributeError("Could not access latents of provided encoder_output")
EXAMPLE_DOC_STRING = """
Examples:
```python
>>> import torch
>>> from diffusers import Cosmos2_5_PredictBasePipeline
>>> from diffusers.utils import export_to_video, load_image, load_video
>>> model_id = "nvidia/Cosmos-Predict2.5-2B"
>>> pipe = Cosmos2_5_PredictBasePipeline.from_pretrained(
        ...     model_id, revision="diffusers/base/pre-trained", torch_dtype=torch.bfloat16
... )
>>> pipe = pipe.to("cuda")
>>> # Common negative prompt reused across modes.
>>> negative_prompt = (
... "The video captures a series of frames showing ugly scenes, static with no motion, motion blur, "
... "over-saturation, shaky footage, low resolution, grainy texture, pixelated images, poorly lit areas, "
... "underexposed and overexposed scenes, poor color balance, washed out colors, choppy sequences, jerky "
... "movements, low frame rate, artifacting, color banding, unnatural transitions, outdated special effects, "
... "fake elements, unconvincing visuals, poorly edited content, jump cuts, visual noise, and flickering. "
... "Overall, the video is of poor quality."
... )
>>> # Text2World: generate a 93-frame world video from text only.
>>> prompt = (
... "As the red light shifts to green, the red bus at the intersection begins to move forward, its headlights "
... "cutting through the falling snow. The snowy tire tracks deepen as the vehicle inches ahead, casting fresh "
... "lines onto the slushy road. Around it, streetlights glow warmer, illuminating the drifting flakes and wet "
... "reflections on the asphalt. Other cars behind start to edge forward, their beams joining the scene. "
... "The stillness of the urban street transitions into motion as the quiet snowfall is punctuated by the slow "
... "advance of traffic through the frosty city corridor."
... )
>>> video = pipe(
... image=None,
... video=None,
... prompt=prompt,
... negative_prompt=negative_prompt,
... num_frames=93,
... generator=torch.Generator().manual_seed(1),
... ).frames[0]
>>> export_to_video(video, "text2world.mp4", fps=16)
>>> # Image2World: condition on a single image and generate a 93-frame world video.
>>> prompt = (
... "A high-definition video captures the precision of robotic welding in an industrial setting. "
... "The first frame showcases a robotic arm, equipped with a welding torch, positioned over a large metal structure. "
... "The welding process is in full swing, with bright sparks and intense light illuminating the scene, creating a vivid "
... "display of blue and white hues. A significant amount of smoke billows around the welding area, partially obscuring "
... "the view but emphasizing the heat and activity. The background reveals parts of the workshop environment, including a "
... "ventilation system and various pieces of machinery, indicating a busy and functional industrial workspace. As the video "
... "progresses, the robotic arm maintains its steady position, continuing the welding process and moving to its left. "
... "The welding torch consistently emits sparks and light, and the smoke continues to rise, diffusing slightly as it moves upward. "
... "The metal surface beneath the torch shows ongoing signs of heating and melting. The scene retains its industrial ambiance, with "
... "the welding sparks and smoke dominating the visual field, underscoring the ongoing nature of the welding operation."
... )
>>> image = load_image(
... "https://media.githubusercontent.com/media/nvidia-cosmos/cosmos-predict2.5/refs/heads/main/assets/base/robot_welding.jpg"
... )
>>> video = pipe(
... image=image,
... video=None,
... prompt=prompt,
... negative_prompt=negative_prompt,
... num_frames=93,
... generator=torch.Generator().manual_seed(1),
... ).frames[0]
>>> # export_to_video(video, "image2world.mp4", fps=16)
>>> # Video2World: condition on an input clip and predict a 93-frame world video.
>>> prompt = (
... "The video opens with an aerial view of a large-scale sand mining construction operation, showcasing extensive piles "
... "of brown sand meticulously arranged in parallel rows. A central water channel, fed by a water pipe, flows through the "
... "middle of these sand heaps, creating ripples and movement as it cascades down. The surrounding area features dense green "
... "vegetation on the left, contrasting with the sandy terrain, while a body of water is visible in the background on the right. "
... "As the video progresses, a piece of heavy machinery, likely a bulldozer, enters the frame from the right, moving slowly along "
... "the edge of the sand piles. This machinery's presence indicates ongoing construction work in the operation. The final frame "
... "captures the same scene, with the water continuing its flow and the bulldozer still in motion, maintaining the dynamic yet "
... "steady pace of the construction activity."
... )
>>> input_video = load_video(
... "https://github.com/nvidia-cosmos/cosmos-predict2.5/raw/refs/heads/main/assets/base/sand_mining.mp4"
... )
>>> video = pipe(
... image=None,
... video=input_video,
... prompt=prompt,
... negative_prompt=negative_prompt,
... num_frames=93,
... generator=torch.Generator().manual_seed(1),
... ).frames[0]
>>> export_to_video(video, "video2world.mp4", fps=16)
>>> # To produce an image instead of a world (video) clip, set num_frames=1 and
>>> # save the first frame: pipe(..., num_frames=1).frames[0][0].
```
"""
class Cosmos2_5_PredictBasePipeline(DiffusionPipeline):
r"""
Pipeline for the [Cosmos Predict2.5](https://github.com/nvidia-cosmos/cosmos-predict2.5) base model.
This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
implemented for all pipelines (downloading, saving, running on a particular device, etc.).
Args:
text_encoder ([`Qwen2_5_VLForConditionalGeneration`]):
Frozen text-encoder. Cosmos Predict2.5 uses the [Qwen2.5
VL](https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct) encoder.
tokenizer (`AutoTokenizer`):
Tokenizer associated with the Qwen2.5 VL encoder.
transformer ([`CosmosTransformer3DModel`]):
Conditional Transformer to denoise the encoded video latents.
scheduler ([`UniPCMultistepScheduler`]):
A scheduler to be used in combination with `transformer` to denoise the encoded video latents.
vae ([`AutoencoderKLWan`]):
Variational Auto-Encoder (VAE) Model to encode and decode videos to and from latent representations.
"""
model_cpu_offload_seq = "text_encoder->transformer->vae"
_callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"]
# We mark safety_checker as optional here to get around some test failures, but it is not really optional
_optional_components = ["safety_checker"]
_exclude_from_cpu_offload = ["safety_checker"]
def __init__(
self,
text_encoder: Qwen2_5_VLForConditionalGeneration,
tokenizer: AutoTokenizer,
transformer: CosmosTransformer3DModel,
vae: AutoencoderKLWan,
scheduler: UniPCMultistepScheduler,
safety_checker: CosmosSafetyChecker = None,
):
super().__init__()
if safety_checker is None:
safety_checker = CosmosSafetyChecker()
self.register_modules(
vae=vae,
text_encoder=text_encoder,
tokenizer=tokenizer,
transformer=transformer,
scheduler=scheduler,
safety_checker=safety_checker,
)
self.vae_scale_factor_temporal = 2 ** sum(self.vae.temperal_downsample) if getattr(self, "vae", None) else 4
self.vae_scale_factor_spatial = 2 ** len(self.vae.temperal_downsample) if getattr(self, "vae", None) else 8
self.video_processor = VideoProcessor(vae_scale_factor=self.vae_scale_factor_spatial)
latents_mean = (
torch.tensor(self.vae.config.latents_mean).view(1, self.vae.config.z_dim, 1, 1, 1).float()
if getattr(self.vae.config, "latents_mean", None) is not None
else None
)
latents_std = (
torch.tensor(self.vae.config.latents_std).view(1, self.vae.config.z_dim, 1, 1, 1).float()
if getattr(self.vae.config, "latents_std", None) is not None
else None
)
self.latents_mean = latents_mean
self.latents_std = latents_std
if self.latents_mean is None or self.latents_std is None:
raise ValueError("VAE configuration must define both `latents_mean` and `latents_std`.")
def _get_prompt_embeds(
self,
prompt: Union[str, List[str]] = None,
max_sequence_length: int = 512,
device: Optional[torch.device] = None,
dtype: Optional[torch.dtype] = None,
):
device = device or self._execution_device
dtype = dtype or self.text_encoder.dtype
prompt = [prompt] if isinstance(prompt, str) else prompt
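# Each prompt is wrapped in a system/user chat conversation and tokenized with the Qwen2.5-VL
# chat template, padded or truncated to `max_sequence_length`.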
input_ids_batch = []
for sample_idx in range(len(prompt)):
conversations = [
{
"role": "system",
"content": [
{
"type": "text",
"text": "You are a helpful assistant who will provide prompts to an image generator.",
}
],
},
{
"role": "user",
"content": [
{
"type": "text",
"text": prompt[sample_idx],
}
],
},
]
input_ids = self.tokenizer.apply_chat_template(
conversations,
tokenize=True,
add_generation_prompt=False,
add_vision_id=False,
max_length=max_sequence_length,
truncation=True,
padding="max_length",
)
input_ids = torch.LongTensor(input_ids)
input_ids_batch.append(input_ids)
input_ids_batch = torch.stack(input_ids_batch, dim=0)
outputs = self.text_encoder(
input_ids_batch.to(device),
output_hidden_states=True,
)
hidden_states = outputs.hidden_states
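# Standardize every transformer layer's hidden states (zero mean, unit variance over the feature
# dimension, skipping the embedding layer) and concatenate them along the feature dimension to
# form the prompt embedding.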
normalized_hidden_states = []
for layer_idx in range(1, len(hidden_states)):
normalized_state = (hidden_states[layer_idx] - hidden_states[layer_idx].mean(dim=-1, keepdim=True)) / (
hidden_states[layer_idx].std(dim=-1, keepdim=True) + 1e-8
)
normalized_hidden_states.append(normalized_state)
prompt_embeds = torch.cat(normalized_hidden_states, dim=-1)
prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
return prompt_embeds
# Modified from diffusers.pipelines.cosmos.pipeline_cosmos_text2world.CosmosTextToWorldPipeline.encode_prompt
def encode_prompt(
self,
prompt: Union[str, List[str]],
negative_prompt: Optional[Union[str, List[str]]] = None,
do_classifier_free_guidance: bool = True,
num_videos_per_prompt: int = 1,
prompt_embeds: Optional[torch.Tensor] = None,
negative_prompt_embeds: Optional[torch.Tensor] = None,
max_sequence_length: int = 512,
device: Optional[torch.device] = None,
dtype: Optional[torch.dtype] = None,
):
r"""
Encodes the prompt into text encoder hidden states.
Args:
prompt (`str` or `List[str]`, *optional*):
prompt to be encoded
negative_prompt (`str` or `List[str]`, *optional*):
The prompt or prompts not to guide the image generation. If not defined, one has to pass
`negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
less than `1`).
do_classifier_free_guidance (`bool`, *optional*, defaults to `True`):
Whether to use classifier free guidance or not.
num_videos_per_prompt (`int`, *optional*, defaults to 1):
Number of videos that should be generated per prompt.
prompt_embeds (`torch.Tensor`, *optional*):
Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
provided, text embeddings will be generated from `prompt` input argument.
negative_prompt_embeds (`torch.Tensor`, *optional*):
Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
argument.
device: (`torch.device`, *optional*):
torch device
dtype: (`torch.dtype`, *optional*):
torch dtype
"""
device = device or self._execution_device
prompt = [prompt] if isinstance(prompt, str) else prompt
if prompt is not None:
batch_size = len(prompt)
else:
batch_size = prompt_embeds.shape[0]
if prompt_embeds is None:
prompt_embeds = self._get_prompt_embeds(
prompt=prompt, max_sequence_length=max_sequence_length, device=device, dtype=dtype
)
# duplicate text embeddings for each generation per prompt, using mps friendly method
_, seq_len, _ = prompt_embeds.shape
prompt_embeds = prompt_embeds.repeat(1, num_videos_per_prompt, 1)
prompt_embeds = prompt_embeds.view(batch_size * num_videos_per_prompt, seq_len, -1)
if do_classifier_free_guidance and negative_prompt_embeds is None:
negative_prompt = negative_prompt or ""
negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt
if prompt is not None and type(prompt) is not type(negative_prompt):
raise TypeError(
f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
f" {type(prompt)}."
)
elif batch_size != len(negative_prompt):
raise ValueError(
f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
" the batch size of `prompt`."
)
negative_prompt_embeds = self._get_prompt_embeds(
prompt=negative_prompt, max_sequence_length=max_sequence_length, device=device, dtype=dtype
)
# duplicate text embeddings for each generation per prompt, using mps friendly method
_, seq_len, _ = negative_prompt_embeds.shape
negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_videos_per_prompt, 1)
negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_videos_per_prompt, seq_len, -1)
return prompt_embeds, negative_prompt_embeds
# Modified from diffusers.pipelines.cosmos.pipeline_cosmos2_video2world.Cosmos2VideoToWorldPipeline.prepare_latents and
# diffusers.pipelines.cosmos.pipeline_cosmos2_text2image.Cosmos2TextToImagePipeline.prepare_latents
def prepare_latents(
self,
video: Optional[torch.Tensor],
batch_size: int,
num_channels_latents: int = 16,
height: int = 704,
width: int = 1280,
num_frames_in: int = 93,
num_frames_out: int = 93,
do_classifier_free_guidance: bool = True,
dtype: Optional[torch.dtype] = None,
device: Optional[torch.device] = None,
generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
latents: Optional[torch.Tensor] = None,
) -> torch.Tensor:
if isinstance(generator, list) and len(generator) != batch_size:
raise ValueError(
f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
f" size of {batch_size}. Make sure the batch size matches the length of the generators."
)
B = batch_size
C = num_channels_latents
T = (num_frames_out - 1) // self.vae_scale_factor_temporal + 1
H = height // self.vae_scale_factor_spatial
W = width // self.vae_scale_factor_spatial
shape = (B, C, T, H, W)
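# Text2World: no conditioning frames, so start from pure noise and return zeroed conditioning
# latents, mask, and indicator.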
if num_frames_in == 0:
if latents is None:
latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
cond_mask = torch.zeros((B, 1, T, H, W), dtype=latents.dtype, device=latents.device)
cond_indicator = torch.zeros((B, 1, T, 1, 1), dtype=latents.dtype, device=latents.device)
cond_latents = torch.zeros_like(latents)
return (
latents,
cond_latents,
cond_mask,
cond_indicator,
)
else:
if video is None:
raise ValueError("`video` must be provided when `num_frames_in` is greater than 0.")
needs_preprocessing = not (isinstance(video, torch.Tensor) and video.ndim == 5 and video.shape[1] == 3)
if needs_preprocessing:
video = self.video_processor.preprocess_video(video, height, width)
video = video.to(device=device, dtype=self.vae.dtype)
if isinstance(generator, list):
cond_latents = [
retrieve_latents(self.vae.encode(video[i].unsqueeze(0)), generator=generator[i])
for i in range(batch_size)
]
else:
cond_latents = [retrieve_latents(self.vae.encode(vid.unsqueeze(0)), generator) for vid in video]
cond_latents = torch.cat(cond_latents, dim=0).to(dtype)
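# Normalize the encoded conditioning latents with the VAE's per-channel statistics so they match
# the scale of the sampled noise latents.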
latents_mean = self.latents_mean.to(device=device, dtype=dtype)
latents_std = self.latents_std.to(device=device, dtype=dtype)
cond_latents = (cond_latents - latents_mean) / latents_std
if latents is None:
latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
else:
latents = latents.to(device=device, dtype=dtype)
padding_shape = (B, 1, T, H, W)
ones_padding = latents.new_ones(padding_shape)
zeros_padding = latents.new_zeros(padding_shape)
num_cond_latent_frames = (num_frames_in - 1) // self.vae_scale_factor_temporal + 1
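# Mark the latent frames that correspond to the conditioning input: `cond_indicator` flags them
# per frame, and `cond_mask` carries the same flag at full latent resolution.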
cond_indicator = latents.new_zeros(1, 1, latents.size(2), 1, 1)
cond_indicator[:, :, 0:num_cond_latent_frames] = 1.0
cond_mask = cond_indicator * ones_padding + (1 - cond_indicator) * zeros_padding
return (
latents,
cond_latents,
cond_mask,
cond_indicator,
)
# Copied from diffusers.pipelines.cosmos.pipeline_cosmos_text2world.CosmosTextToWorldPipeline.check_inputs
def check_inputs(
self,
prompt,
height,
width,
prompt_embeds=None,
callback_on_step_end_tensor_inputs=None,
):
if height % 16 != 0 or width % 16 != 0:
raise ValueError(f"`height` and `width` have to be divisible by 16 but are {height} and {width}.")
if callback_on_step_end_tensor_inputs is not None and not all(
k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
):
raise ValueError(
f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
)
if prompt is not None and prompt_embeds is not None:
raise ValueError(
f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
" only forward one of the two."
)
elif prompt is None and prompt_embeds is None:
raise ValueError(
"Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
)
elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
@property
def guidance_scale(self):
return self._guidance_scale
@property
def do_classifier_free_guidance(self):
return self._guidance_scale > 1.0
@property
def num_timesteps(self):
return self._num_timesteps
@property
def current_timestep(self):
return self._current_timestep
@property
def interrupt(self):
return self._interrupt
@torch.no_grad()
@replace_example_docstring(EXAMPLE_DOC_STRING)
def __call__(
self,
image: PipelineImageInput | None = None,
video: List[PipelineImageInput] | None = None,
prompt: Union[str, List[str]] | None = None,
negative_prompt: Optional[Union[str, List[str]]] = None,
height: int = 704,
width: int = 1280,
num_frames: int = 93,
num_inference_steps: int = 36,
guidance_scale: float = 7.0,
num_videos_per_prompt: Optional[int] = 1,
generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
latents: Optional[torch.Tensor] = None,
prompt_embeds: Optional[torch.Tensor] = None,
negative_prompt_embeds: Optional[torch.Tensor] = None,
output_type: Optional[str] = "pil",
return_dict: bool = True,
callback_on_step_end: Optional[
Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
] = None,
callback_on_step_end_tensor_inputs: List[str] = ["latents"],
max_sequence_length: int = 512,
conditional_frame_timestep: float = 0.1,
):
r"""
The call function to the pipeline for generation. Supports three modes:
- **Text2World**: `image=None`, `video=None`, `prompt` provided. Generates a world clip.
- **Image2World**: `image` provided, `video=None`, `prompt` provided. Conditions on a single frame.
- **Video2World**: `video` provided, `image=None`, `prompt` provided. Conditions on an input clip.
Set `num_frames=93` (default) to produce a world video, or `num_frames=1` to produce a single image frame
(turning each of the modes above into its "*2Image" counterpart).
Outputs follow `output_type` (e.g., `"pil"` returns a list of `num_frames` PIL images per prompt).
Args:
image (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, *optional*):
Optional single image for Image2World conditioning. Must be `None` when `video` is provided.
video (`List[PIL.Image.Image]`, `np.ndarray`, `torch.Tensor`, *optional*):
Optional input video for Video2World conditioning. Must be `None` when `image` is provided.
prompt (`str` or `List[str]`, *optional*):
The prompt or prompts to guide generation. Required unless `prompt_embeds` is supplied.
height (`int`, defaults to `704`):
The height in pixels of the generated image.
width (`int`, defaults to `1280`):
The width in pixels of the generated image.
num_frames (`int`, defaults to `93`):
Number of output frames. Use `93` for world (video) generation; set to `1` to return a single frame.
num_inference_steps (`int`, defaults to `36`):
The number of denoising steps. More denoising steps usually lead to a higher quality image at the
expense of slower inference.
guidance_scale (`float`, defaults to `7.0`):
Guidance scale as defined in [Classifier-Free Diffusion
Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
`guidance_scale > 1`.
num_videos_per_prompt (`int`, *optional*, defaults to 1):
The number of videos to generate per prompt.
generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
generation deterministic.
latents (`torch.Tensor`, *optional*):
Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for video
generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
tensor is generated by sampling using the supplied random `generator`.
prompt_embeds (`torch.Tensor`, *optional*):
Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
provided, text embeddings will be generated from `prompt` input argument.
negative_prompt_embeds (`torch.Tensor`, *optional*):
Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
argument.
output_type (`str`, *optional*, defaults to `"pil"`):
The output format of the generated video. Choose between `PIL.Image` or `np.array`.
return_dict (`bool`, *optional*, defaults to `True`):
Whether or not to return a [`CosmosPipelineOutput`] instead of a plain tuple.
callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*):
A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of
each denoising step during inference with the following arguments: `callback_on_step_end(self:
DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a
list of all tensors as specified by `callback_on_step_end_tensor_inputs`.
callback_on_step_end_tensor_inputs (`List`, *optional*):
The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
`._callback_tensor_inputs` attribute of your pipeline class.
max_sequence_length (`int`, defaults to `512`):
The maximum number of tokens in the prompt. If the prompt exceeds this length, it will be truncated. If
the prompt is shorter than this length, it will be padded.
Examples:
Returns:
[`~CosmosPipelineOutput`] or `tuple`:
If `return_dict` is `True`, [`CosmosPipelineOutput`] is returned, otherwise a `tuple` is returned where
the first element is a list with the generated frames.
"""
if self.safety_checker is None:
raise ValueError(
f"You have disabled the safety checker for {self.__class__}. This is in violation of the "
"[NVIDIA Open Model License Agreement](https://www.nvidia.com/en-us/agreements/enterprise-software/nvidia-open-model-license). "
f"Please ensure that you are compliant with the license agreement."
)
if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
# Check inputs. Raise error if not correct
self.check_inputs(prompt, height, width, prompt_embeds, callback_on_step_end_tensor_inputs)
self._guidance_scale = guidance_scale
self._current_timestep = None
self._interrupt = False
device = self._execution_device
if self.safety_checker is not None:
self.safety_checker.to(device)
if prompt is not None:
prompt_list = [prompt] if isinstance(prompt, str) else prompt
for p in prompt_list:
if not self.safety_checker.check_text_safety(p):
raise ValueError(
f"Cosmos Guardrail detected unsafe text in the prompt: {p}. Please ensure that the "
f"prompt abides by the NVIDIA Open Model License Agreement."
)
# Define call parameters
if prompt is not None and isinstance(prompt, str):
batch_size = 1
elif prompt is not None and isinstance(prompt, list):
batch_size = len(prompt)
else:
batch_size = prompt_embeds.shape[0]
# Encode input prompt
(
prompt_embeds,
negative_prompt_embeds,
) = self.encode_prompt(
prompt=prompt,
negative_prompt=negative_prompt,
do_classifier_free_guidance=self.do_classifier_free_guidance,
num_videos_per_prompt=num_videos_per_prompt,
prompt_embeds=prompt_embeds,
negative_prompt_embeds=negative_prompt_embeds,
device=device,
max_sequence_length=max_sequence_length,
)
vae_dtype = self.vae.dtype
transformer_dtype = self.transformer.dtype
num_frames_in = None
if image is not None:
if batch_size != 1:
raise ValueError(f"batch_size must be 1 for image input (given {batch_size})")
image = torchvision.transforms.functional.to_tensor(image).unsqueeze(0)
video = torch.cat([image, torch.zeros_like(image).repeat(num_frames - 1, 1, 1, 1)], dim=0)
video = video.unsqueeze(0)
num_frames_in = 1
elif video is None:
video = torch.zeros(batch_size, num_frames, 3, height, width, dtype=torch.uint8)
num_frames_in = 0
else:
num_frames_in = len(video)
if batch_size != 1:
raise ValueError(f"batch_size must be 1 for video input (given {batch_size})")
assert video is not None
video = self.video_processor.preprocess_video(video, height, width)
# pad with last frame (for video2world)
num_frames_out = num_frames
if video.shape[2] < num_frames_out:
n_pad_frames = num_frames_out - num_frames_in
last_frame = video[0, :, -1:, :, :] # [C, T==1, H, W]
pad_frames = last_frame.repeat(1, 1, n_pad_frames, 1, 1) # [B, C, T, H, W]
video = torch.cat((video, pad_frames), dim=2)
assert num_frames_in <= num_frames_out, f"expected ({num_frames_in=}) <= ({num_frames_out=})"
video = video.to(device=device, dtype=vae_dtype)
num_channels_latents = self.transformer.config.in_channels - 1
latents, cond_latent, cond_mask, cond_indicator = self.prepare_latents(
video=video,
batch_size=batch_size * num_videos_per_prompt,
num_channels_latents=num_channels_latents,
height=height,
width=width,
num_frames_in=num_frames_in,
num_frames_out=num_frames,
do_classifier_free_guidance=self.do_classifier_free_guidance,
dtype=torch.float32,
device=device,
generator=generator,
latents=latents,
)
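# Conditioning latent frames are assigned a fixed, near-zero timestep so the transformer treats
# them as (nearly) clean frames; `padding_mask` marks that no spatial padding is present.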
cond_timestep = torch.ones_like(cond_indicator) * conditional_frame_timestep
cond_mask = cond_mask.to(transformer_dtype)
padding_mask = latents.new_zeros(1, 1, height, width, dtype=transformer_dtype)
# Denoising loop
self.scheduler.set_timesteps(num_inference_steps, device=device)
timesteps = self.scheduler.timesteps
self._num_timesteps = len(timesteps)
num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
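# Ground-truth velocity for the conditioning frames (latents -> cond_latent under flow matching);
# it replaces the model's prediction on those frames inside the loop so they stay pinned to the input.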
gt_velocity = (latents - cond_latent) * cond_mask
with self.progress_bar(total=num_inference_steps) as progress_bar:
for i, t in enumerate(timesteps):
if self.interrupt:
continue
self._current_timestep = t.cpu().item()
# NOTE: assumes sigma(t) \in [0, 1]
sigma_t = (
torch.tensor(self.scheduler.sigmas[i].item())
.unsqueeze(0)
.to(device=device, dtype=transformer_dtype)
)
in_latents = cond_mask * cond_latent + (1 - cond_mask) * latents
in_latents = in_latents.to(transformer_dtype)
in_timestep = cond_indicator * cond_timestep + (1 - cond_indicator) * sigma_t
noise_pred = self.transformer(
hidden_states=in_latents,
condition_mask=cond_mask,
timestep=in_timestep,
encoder_hidden_states=prompt_embeds,
padding_mask=padding_mask,
return_dict=False,
)[0]
# NOTE: replace velocity (noise_pred) with gt_velocity for conditioning inputs only
noise_pred = gt_velocity + noise_pred * (1 - cond_mask)
if self.do_classifier_free_guidance:
noise_pred_neg = self.transformer(
hidden_states=in_latents,
condition_mask=cond_mask,
timestep=in_timestep,
encoder_hidden_states=negative_prompt_embeds,
padding_mask=padding_mask,
return_dict=False,
)[0]
# NOTE: replace velocity (noise_pred_neg) with gt_velocity for conditioning inputs only
noise_pred_neg = gt_velocity + noise_pred_neg * (1 - cond_mask)
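# Guide the prediction away from the negative-prompt prediction (classifier-free guidance).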
noise_pred = noise_pred + self.guidance_scale * (noise_pred - noise_pred_neg)
latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]
if callback_on_step_end is not None:
callback_kwargs = {}
for k in callback_on_step_end_tensor_inputs:
callback_kwargs[k] = locals()[k]
callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
latents = callback_outputs.pop("latents", latents)
prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
# call the callback, if provided
if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
progress_bar.update()
if XLA_AVAILABLE:
xm.mark_step()
self._current_timestep = None
if not output_type == "latent":
latents_mean = self.latents_mean.to(latents.device, latents.dtype)
latents_std = self.latents_std.to(latents.device, latents.dtype)
latents = latents * latents_std + latents_mean
video = self.vae.decode(latents.to(self.vae.dtype), return_dict=False)[0]
video = self._match_num_frames(video, num_frames)
assert self.safety_checker is not None
self.safety_checker.to(device)
video = self.video_processor.postprocess_video(video, output_type="np")
video = (video * 255).astype(np.uint8)
video_batch = []
for vid in video:
vid = self.safety_checker.check_video_safety(vid)
video_batch.append(vid)
video = np.stack(video_batch).astype(np.float32) / 255.0 * 2 - 1
video = torch.from_numpy(video).permute(0, 4, 1, 2, 3)
video = self.video_processor.postprocess_video(video, output_type=output_type)
else:
video = latents
# Offload all models
self.maybe_free_model_hooks()
if not return_dict:
return (video,)
return CosmosPipelineOutput(frames=video)
def _match_num_frames(self, video: torch.Tensor, target_num_frames: int) -> torch.Tensor:
if target_num_frames <= 0 or video.shape[2] == target_num_frames:
return video
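# The decoded frame count may not match the requested `num_frames`: expand each decoded frame by
# the temporal compression factor, then pad with the last frame or trim to the exact target length.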
frames_per_latent = max(self.vae_scale_factor_temporal, 1)
video = torch.repeat_interleave(video, repeats=frames_per_latent, dim=2)
current_frames = video.shape[2]
if current_frames < target_num_frames:
pad = video[:, :, -1:, :, :].repeat(1, 1, target_num_frames - current_frames, 1, 1)
video = torch.cat([video, pad], dim=2)
elif current_frames > target_num_frames:
video = video[:, :, :target_num_frames]
return video

View File

@@ -882,24 +882,21 @@ the image\n<|vision_start|><|image_pad|><|vision_end|><|im_end|>\n<|im_start|>as
latents = latents / latents_std + latents_mean
b, c, f, h, w = latents.shape
latents = latents[:, :, 1:] # remove the first frame as it is the original input
latents = latents.permute(0, 2, 1, 3, 4).view(-1, c, 1, h, w)
image = self.vae.decode(latents, return_dict=False)[0] # (b f) c 1 h w
img = self.vae.decode(latents, return_dict=False)[0] # (b f) c 1 h w
img = img.squeeze(2)
image = image.squeeze(2)
image = self.image_processor.postprocess(image, output_type=output_type)
images = []
img = self.image_processor.postprocess(img, output_type=output_type)
image = []
for bidx in range(b):
images.append(image[bidx * f : (bidx + 1) * f])
image.append(img[bidx * f : (bidx + 1) * f])
# Offload all models
self.maybe_free_model_hooks()
if not return_dict:
return (images,)
return (image,)
return QwenImagePipelineOutput(images=images)
return QwenImagePipelineOutput(images=image)

View File

@@ -217,8 +217,6 @@ class UniPCMultistepScheduler(SchedulerMixin, ConfigMixin):
rescale_betas_zero_snr: bool = False,
use_dynamic_shifting: bool = False,
time_shift_type: Literal["exponential"] = "exponential",
sigma_min: Optional[float] = None,
sigma_max: Optional[float] = None,
) -> None:
if self.config.use_beta_sigmas and not is_scipy_available():
raise ImportError("Make sure to install scipy if you want to use beta sigmas.")
@@ -352,12 +350,7 @@ class UniPCMultistepScheduler(SchedulerMixin, ConfigMixin):
log_sigmas = np.log(sigmas)
sigmas = np.flip(sigmas).copy()
sigmas = self._convert_to_karras(in_sigmas=sigmas, num_inference_steps=num_inference_steps)
if self.config.use_flow_sigmas:
sigmas = sigmas / (sigmas + 1)
timesteps = (sigmas * self.config.num_train_timesteps).copy()
else:
timesteps = np.array([self._sigma_to_t(sigma, log_sigmas) for sigma in sigmas]).round()
timesteps = np.array([self._sigma_to_t(sigma, log_sigmas) for sigma in sigmas]).round()
if self.config.final_sigmas_type == "sigma_min":
sigma_last = sigmas[-1]
elif self.config.final_sigmas_type == "zero":

View File

@@ -767,21 +767,6 @@ class ConsisIDPipeline(metaclass=DummyObject):
requires_backends(cls, ["torch", "transformers"])
class Cosmos2_5_PredictBasePipeline(metaclass=DummyObject):
_backends = ["torch", "transformers"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch", "transformers"])
@classmethod
def from_config(cls, *args, **kwargs):
requires_backends(cls, ["torch", "transformers"])
@classmethod
def from_pretrained(cls, *args, **kwargs):
requires_backends(cls, ["torch", "transformers"])
class Cosmos2TextToImagePipeline(metaclass=DummyObject):
_backends = ["torch", "transformers"]

View File

@@ -27,7 +27,7 @@ class DummyCosmosSafetyChecker(ModelMixin, ConfigMixin):
def __init__(self) -> None:
super().__init__()
self.register_buffer("_device_tracker", torch.zeros(1, dtype=torch.float32), persistent=False)
self._dtype = torch.float32
def check_text_safety(self, prompt: str) -> bool:
return True
@@ -35,14 +35,13 @@ class DummyCosmosSafetyChecker(ModelMixin, ConfigMixin):
def check_video_safety(self, frames: np.ndarray) -> np.ndarray:
return frames
def to(self, device: Union[str, torch.device] = None, dtype: torch.dtype = None):
module = super().to(device=device, dtype=dtype)
return module
def to(self, device: Union[str, torch.device] = None, dtype: torch.dtype = None) -> None:
self._dtype = dtype
@property
def device(self) -> torch.device:
return self._device_tracker.device
return None
@property
def dtype(self) -> torch.dtype:
return self._device_tracker.dtype
return self._dtype

View File

@@ -1,337 +0,0 @@
# Copyright 2025 The HuggingFace Team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import inspect
import json
import os
import tempfile
import unittest
import numpy as np
import torch
from transformers import Qwen2_5_VLConfig, Qwen2_5_VLForConditionalGeneration, Qwen2Tokenizer
from diffusers import (
AutoencoderKLWan,
Cosmos2_5_PredictBasePipeline,
CosmosTransformer3DModel,
UniPCMultistepScheduler,
)
from ...testing_utils import enable_full_determinism, torch_device
from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS
from ..test_pipelines_common import PipelineTesterMixin, to_np
from .cosmos_guardrail import DummyCosmosSafetyChecker
enable_full_determinism()
class Cosmos2_5_PredictBaseWrapper(Cosmos2_5_PredictBasePipeline):
@staticmethod
def from_pretrained(*args, **kwargs):
if "safety_checker" not in kwargs or kwargs["safety_checker"] is None:
safety_checker = DummyCosmosSafetyChecker()
device_map = kwargs.get("device_map", "cpu")
torch_dtype = kwargs.get("torch_dtype")
if device_map is not None or torch_dtype is not None:
safety_checker = safety_checker.to(device_map, dtype=torch_dtype)
kwargs["safety_checker"] = safety_checker
return Cosmos2_5_PredictBasePipeline.from_pretrained(*args, **kwargs)
class Cosmos2_5_PredictPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
pipeline_class = Cosmos2_5_PredictBaseWrapper
params = TEXT_TO_IMAGE_PARAMS - {"cross_attention_kwargs"}
batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
image_params = TEXT_TO_IMAGE_IMAGE_PARAMS
image_latents_params = TEXT_TO_IMAGE_IMAGE_PARAMS
required_optional_params = frozenset(
[
"num_inference_steps",
"generator",
"latents",
"return_dict",
"callback_on_step_end",
"callback_on_step_end_tensor_inputs",
]
)
supports_dduf = False
test_xformers_attention = False
test_layerwise_casting = True
test_group_offloading = True
def get_dummy_components(self):
torch.manual_seed(0)
transformer = CosmosTransformer3DModel(
in_channels=16 + 1,
out_channels=16,
num_attention_heads=2,
attention_head_dim=16,
num_layers=2,
mlp_ratio=2,
text_embed_dim=32,
adaln_lora_dim=4,
max_size=(4, 32, 32),
patch_size=(1, 2, 2),
rope_scale=(2.0, 1.0, 1.0),
concat_padding_mask=True,
extra_pos_embed_type="learnable",
)
torch.manual_seed(0)
vae = AutoencoderKLWan(
base_dim=3,
z_dim=16,
dim_mult=[1, 1, 1, 1],
num_res_blocks=1,
temperal_downsample=[False, True, True],
)
torch.manual_seed(0)
scheduler = UniPCMultistepScheduler()
torch.manual_seed(0)
config = Qwen2_5_VLConfig(
text_config={
"hidden_size": 16,
"intermediate_size": 16,
"num_hidden_layers": 2,
"num_attention_heads": 2,
"num_key_value_heads": 2,
"rope_scaling": {
"mrope_section": [1, 1, 2],
"rope_type": "default",
"type": "default",
},
"rope_theta": 1000000.0,
},
vision_config={
"depth": 2,
"hidden_size": 16,
"intermediate_size": 16,
"num_heads": 2,
"out_hidden_size": 16,
},
hidden_size=16,
vocab_size=152064,
vision_end_token_id=151653,
vision_start_token_id=151652,
vision_token_id=151654,
)
text_encoder = Qwen2_5_VLForConditionalGeneration(config)
tokenizer = Qwen2Tokenizer.from_pretrained("hf-internal-testing/tiny-random-Qwen2VLForConditionalGeneration")
components = {
"transformer": transformer,
"vae": vae,
"scheduler": scheduler,
"text_encoder": text_encoder,
"tokenizer": tokenizer,
"safety_checker": DummyCosmosSafetyChecker(),
}
return components
def get_dummy_inputs(self, device, seed=0):
if str(device).startswith("mps"):
generator = torch.manual_seed(seed)
else:
generator = torch.Generator(device=device).manual_seed(seed)
inputs = {
"prompt": "dance monkey",
"negative_prompt": "bad quality",
"generator": generator,
"num_inference_steps": 2,
"guidance_scale": 3.0,
"height": 32,
"width": 32,
"num_frames": 3,
"max_sequence_length": 16,
"output_type": "pt",
}
return inputs
def test_components_function(self):
init_components = self.get_dummy_components()
init_components = {k: v for k, v in init_components.items() if not isinstance(v, (str, int, float))}
pipe = self.pipeline_class(**init_components)
self.assertTrue(hasattr(pipe, "components"))
self.assertTrue(set(pipe.components.keys()) == set(init_components.keys()))
def test_inference(self):
device = "cpu"
components = self.get_dummy_components()
pipe = self.pipeline_class(**components)
pipe.to(device)
pipe.set_progress_bar_config(disable=None)
inputs = self.get_dummy_inputs(device)
video = pipe(**inputs).frames
generated_video = video[0]
self.assertEqual(generated_video.shape, (3, 3, 32, 32))
self.assertTrue(torch.isfinite(generated_video).all())
def test_callback_inputs(self):
sig = inspect.signature(self.pipeline_class.__call__)
has_callback_tensor_inputs = "callback_on_step_end_tensor_inputs" in sig.parameters
has_callback_step_end = "callback_on_step_end" in sig.parameters
if not (has_callback_tensor_inputs and has_callback_step_end):
return
components = self.get_dummy_components()
pipe = self.pipeline_class(**components)
pipe = pipe.to(torch_device)
pipe.set_progress_bar_config(disable=None)
self.assertTrue(
hasattr(pipe, "_callback_tensor_inputs"),
f" {self.pipeline_class} should have `_callback_tensor_inputs` that defines a list of tensor variables its callback function can use as inputs",
)
def callback_inputs_subset(pipe, i, t, callback_kwargs):
for tensor_name in callback_kwargs.keys():
assert tensor_name in pipe._callback_tensor_inputs
return callback_kwargs
def callback_inputs_all(pipe, i, t, callback_kwargs):
for tensor_name in pipe._callback_tensor_inputs:
assert tensor_name in callback_kwargs
for tensor_name in callback_kwargs.keys():
assert tensor_name in pipe._callback_tensor_inputs
return callback_kwargs
inputs = self.get_dummy_inputs(torch_device)
inputs["callback_on_step_end"] = callback_inputs_subset
inputs["callback_on_step_end_tensor_inputs"] = ["latents"]
_ = pipe(**inputs)[0]
inputs["callback_on_step_end"] = callback_inputs_all
inputs["callback_on_step_end_tensor_inputs"] = pipe._callback_tensor_inputs
_ = pipe(**inputs)[0]
def callback_inputs_change_tensor(pipe, i, t, callback_kwargs):
is_last = i == (pipe.num_timesteps - 1)
if is_last:
callback_kwargs["latents"] = torch.zeros_like(callback_kwargs["latents"])
return callback_kwargs
inputs["callback_on_step_end"] = callback_inputs_change_tensor
inputs["callback_on_step_end_tensor_inputs"] = pipe._callback_tensor_inputs
output = pipe(**inputs)[0]
assert output.abs().sum() < 1e10
def test_inference_batch_single_identical(self):
self._test_inference_batch_single_identical(batch_size=2, expected_max_diff=1e-2)
def test_attention_slicing_forward_pass(
self, test_max_difference=True, test_mean_pixel_difference=True, expected_max_diff=1e-3
):
if not getattr(self, "test_attention_slicing", True):
return
components = self.get_dummy_components()
pipe = self.pipeline_class(**components)
for component in pipe.components.values():
if hasattr(component, "set_default_attn_processor"):
component.set_default_attn_processor()
pipe.to(torch_device)
pipe.set_progress_bar_config(disable=None)
generator_device = "cpu"
inputs = self.get_dummy_inputs(generator_device)
output_without_slicing = pipe(**inputs)[0]
pipe.enable_attention_slicing(slice_size=1)
inputs = self.get_dummy_inputs(generator_device)
output_with_slicing1 = pipe(**inputs)[0]
pipe.enable_attention_slicing(slice_size=2)
inputs = self.get_dummy_inputs(generator_device)
output_with_slicing2 = pipe(**inputs)[0]
if test_max_difference:
max_diff1 = np.abs(to_np(output_with_slicing1) - to_np(output_without_slicing)).max()
max_diff2 = np.abs(to_np(output_with_slicing2) - to_np(output_without_slicing)).max()
self.assertLess(
max(max_diff1, max_diff2),
expected_max_diff,
"Attention slicing should not affect the inference results",
)
def test_save_load_optional_components(self, expected_max_difference=1e-4):
self.pipeline_class._optional_components.remove("safety_checker")
super().test_save_load_optional_components(expected_max_difference=expected_max_difference)
self.pipeline_class._optional_components.append("safety_checker")
def test_serialization_with_variants(self):
components = self.get_dummy_components()
pipe = self.pipeline_class(**components)
model_components = [
component_name
for component_name, component in pipe.components.items()
if isinstance(component, torch.nn.Module)
]
model_components.remove("safety_checker")
variant = "fp16"
with tempfile.TemporaryDirectory() as tmpdir:
pipe.save_pretrained(tmpdir, variant=variant, safe_serialization=False)
with open(f"{tmpdir}/model_index.json", "r") as f:
config = json.load(f)
for subfolder in os.listdir(tmpdir):
if not os.path.isfile(subfolder) and subfolder in model_components:
folder_path = os.path.join(tmpdir, subfolder)
is_folder = os.path.isdir(folder_path) and subfolder in config
assert is_folder and any(p.split(".")[1].startswith(variant) for p in os.listdir(folder_path))
def test_torch_dtype_dict(self):
components = self.get_dummy_components()
if not components:
self.skipTest("No dummy components defined.")
pipe = self.pipeline_class(**components)
specified_key = next(iter(components.keys()))
with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as tmpdirname:
pipe.save_pretrained(tmpdirname, safe_serialization=False)
torch_dtype_dict = {specified_key: torch.bfloat16, "default": torch.float16}
loaded_pipe = self.pipeline_class.from_pretrained(
tmpdirname, safety_checker=DummyCosmosSafetyChecker(), torch_dtype=torch_dtype_dict
)
for name, component in loaded_pipe.components.items():
if name == "safety_checker":
continue
if isinstance(component, torch.nn.Module) and hasattr(component, "dtype"):
expected_dtype = torch_dtype_dict.get(name, torch_dtype_dict.get("default", torch.float32))
self.assertEqual(
component.dtype,
expected_dtype,
f"Component '{name}' has dtype {component.dtype} but expected {expected_dtype}",
)
@unittest.skip(
"The pipeline should not be runnable without a safety checker. The test creates a pipeline without passing in "
"a safety checker, which makes the pipeline default to the actual Cosmos Guardrail. The Cosmos Guardrail is "
"too large and slow to run on CI."
)
def test_encode_prompt_works_in_isolation(self):
pass

View File

@@ -0,0 +1,223 @@
# Copyright 2025 The HuggingFace Team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import diffusers
import numpy as np
import torch
from PIL import Image
from transformers import Qwen2_5_VLConfig, Qwen2_5_VLForConditionalGeneration, Qwen2Tokenizer, Qwen2VLProcessor
from diffusers import (
AutoencoderKLQwenImage,
FlowMatchEulerDiscreteScheduler,
QwenImageLayeredPipeline,
QwenImageTransformer2DModel,
)
from ...testing_utils import enable_full_determinism, torch_device
from ..pipeline_params import TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS
from ..test_pipelines_common import PipelineTesterMixin, to_np
enable_full_determinism()
class QwenImageLayeredPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
pipeline_class = QwenImageLayeredPipeline
params = TEXT_TO_IMAGE_PARAMS - {"height", "width", "cross_attention_kwargs"}
batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS
image_params = frozenset(["image"])
image_latents_params = frozenset(["latents"])
required_optional_params = frozenset(
[
"num_inference_steps",
"generator",
"latents",
"return_dict",
"callback_on_step_end",
"callback_on_step_end_tensor_inputs",
]
)
supports_dduf = False
test_xformers_attention = False
test_layerwise_casting = True
test_group_offloading = True
def get_dummy_components(self):
tiny_ckpt_id = "hf-internal-testing/tiny-random-Qwen2VLForConditionalGeneration"
torch.manual_seed(0)
transformer = QwenImageTransformer2DModel(
patch_size=2,
in_channels=16,
out_channels=4,
num_layers=2,
attention_head_dim=16,
num_attention_heads=3,
joint_attention_dim=16,
guidance_embeds=False,
axes_dims_rope=(8, 4, 4),
)
torch.manual_seed(0)
z_dim = 4
vae = AutoencoderKLQwenImage(
base_dim=z_dim * 6,
z_dim=z_dim,
dim_mult=[1, 2, 4],
num_res_blocks=1,
temperal_downsample=[False, True],
latents_mean=[0.0] * z_dim,
latents_std=[1.0] * z_dim,
)
torch.manual_seed(0)
scheduler = FlowMatchEulerDiscreteScheduler()
torch.manual_seed(0)
config = Qwen2_5_VLConfig(
text_config={
"hidden_size": 16,
"intermediate_size": 16,
"num_hidden_layers": 2,
"num_attention_heads": 2,
"num_key_value_heads": 2,
"rope_scaling": {
"mrope_section": [1, 1, 2],
"rope_type": "default",
"type": "default",
},
"rope_theta": 1000000.0,
},
vision_config={
"depth": 2,
"hidden_size": 16,
"intermediate_size": 16,
"num_heads": 2,
"out_hidden_size": 16,
},
hidden_size=16,
vocab_size=152064,
vision_end_token_id=151653,
vision_start_token_id=151652,
vision_token_id=151654,
)
text_encoder = Qwen2_5_VLForConditionalGeneration(config)
tokenizer = Qwen2Tokenizer.from_pretrained(tiny_ckpt_id)
processor = Qwen2VLProcessor.from_pretrained(tiny_ckpt_id)
components = {
"transformer": transformer,
"vae": vae,
"scheduler": scheduler,
"text_encoder": text_encoder,
"tokenizer": tokenizer,
"processor": processor,
}
return components
def get_dummy_inputs(self, device, seed=0):
if str(device).startswith("mps"):
generator = torch.manual_seed(seed)
else:
generator = torch.Generator(device=device).manual_seed(seed)
inputs = {
"prompt": "dance monkey",
"image": Image.new("RGB", (32, 32)),
"negative_prompt": "bad quality",
"generator": generator,
"true_cfg_scale": 1.0,
"layers": 2,
"num_inference_steps": 2,
"max_sequence_length": 16,
"resolution": 640,
"output_type": "pt",
}
return inputs
def test_inference(self):
device = "cpu"
components = self.get_dummy_components()
pipe = self.pipeline_class(**components)
pipe.to(device)
pipe.set_progress_bar_config(disable=None)
inputs = self.get_dummy_inputs(device)
images = pipe(**inputs).images
self.assertEqual(len(images), 1)
generated_layers = images[0]
self.assertEqual(generated_layers.shape, (inputs["layers"], 3, 640, 640))
# fmt: off
expected_slice_layer_0 = torch.tensor([0.5752, 0.6324, 0.4913, 0.4421, 0.4917, 0.4923, 0.4790, 0.4299, 0.4029, 0.3506, 0.3302, 0.3352, 0.3579, 0.4422, 0.5086, 0.5961])
expected_slice_layer_1 = torch.tensor([0.5103, 0.6606, 0.5652, 0.6512, 0.5900, 0.5814, 0.5873, 0.5083, 0.5058, 0.4131, 0.4321, 0.5300, 0.3507, 0.4826, 0.4745, 0.5426])
# fmt: on
layer_0_slice = torch.cat([generated_layers[0].flatten()[:8], generated_layers[0].flatten()[-8:]])
layer_1_slice = torch.cat([generated_layers[1].flatten()[:8], generated_layers[1].flatten()[-8:]])
self.assertTrue(torch.allclose(layer_0_slice, expected_slice_layer_0, atol=1e-3))
self.assertTrue(torch.allclose(layer_1_slice, expected_slice_layer_1, atol=1e-3))
def test_inference_batch_single_identical(self, batch_size=3, expected_max_diff=1e-1):
components = self.get_dummy_components()
pipe = self.pipeline_class(**components)
for component in pipe.components.values():
if hasattr(component, "set_default_attn_processor"):
component.set_default_attn_processor()
pipe.to(torch_device)
pipe.set_progress_bar_config(disable=None)
inputs = self.get_dummy_inputs(torch_device)
inputs["generator"] = self.get_generator(0)
logger = diffusers.logging.get_logger(pipe.__module__)
logger.setLevel(level=diffusers.logging.FATAL)
batched_inputs = {}
batched_inputs.update(inputs)
for name in self.batch_params:
if name not in inputs:
continue
value = inputs[name]
if name == "prompt":
len_prompt = len(value)
batched_inputs[name] = [value[: len_prompt // i] for i in range(1, batch_size + 1)]
batched_inputs[name][-1] = 100 * "very long"
else:
batched_inputs[name] = batch_size * [value]
if "generator" in inputs:
batched_inputs["generator"] = [self.get_generator(i) for i in range(batch_size)]
if "batch_size" in inputs:
batched_inputs["batch_size"] = batch_size
batched_inputs["num_inference_steps"] = inputs["num_inference_steps"]
output = pipe(**inputs).images
output_batch = pipe(**batched_inputs).images
self.assertEqual(len(output_batch), batch_size)
max_diff = np.abs(to_np(output_batch[0][0]) - to_np(output[0][0])).max()
self.assertLess(max_diff, expected_max_diff)

View File

@@ -399,32 +399,3 @@ class UniPCMultistepScheduler1DTest(UniPCMultistepSchedulerTest):
def test_exponential_sigmas(self):
self.check_over_configs(use_exponential_sigmas=True)
def test_flow_and_karras_sigmas(self):
self.check_over_configs(use_flow_sigmas=True, use_karras_sigmas=True)
def test_flow_and_karras_sigmas_values(self):
num_train_timesteps = 1000
num_inference_steps = 5
scheduler = UniPCMultistepScheduler(
sigma_min=0.01,
sigma_max=200.0,
use_flow_sigmas=True,
use_karras_sigmas=True,
num_train_timesteps=num_train_timesteps,
)
scheduler.set_timesteps(num_inference_steps=num_inference_steps)
expected_sigmas = [
0.9950248599052429,
0.9787454605102539,
0.8774884343147278,
0.3604971766471863,
0.009900986216962337,
0.0, # 0 appended as default
]
expected_sigmas = torch.tensor(expected_sigmas)
expected_timesteps = (expected_sigmas * num_train_timesteps).to(torch.int64)
expected_timesteps = expected_timesteps[0:-1]
self.assertTrue(torch.allclose(scheduler.sigmas, expected_sigmas))
self.assertTrue(torch.all(expected_timesteps == scheduler.timesteps))