Apply style fixes

Merge branch 'main' into modular-doc-improv
fix
2026-01-20 10:35:36 +08:00 · 2026-01-19 09:23:31 +00:00 · 2026-01-18 23:20:36 -10:00 · 2026-01-19 09:56:14 +01:00 · 2026-01-19 09:54:46 +01:00 · 2026-01-19 09:54:40 +01:00
31 changed files with 4084 additions and 2777 deletions
--- a/docs/source/en/_toctree.yml
+++ b/docs/source/en/_toctree.yml
@@ -496,8 +496,6 @@
        title: Bria 3.2
      - local: api/pipelines/bria_fibo
        title: Bria Fibo
-      - local: api/pipelines/bria_fibo_edit
-        title: Bria Fibo Edit
      - local: api/pipelines/chroma
        title: Chroma
      - local: api/pipelines/cogview3
--- a/docs/source/en/api/pipelines/bria_fibo_edit.md
+++ b/docs/source/en/api/pipelines/bria_fibo_edit.md
@@ -1,33 +0,0 @@
-<!--Copyright 2025 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-->
-
-# Bria Fibo Edit
-
-Fibo Edit is an 8B parameter image-to-image model that introduces a new paradigm of structured control, operating on JSON inputs paired with source images to enable deterministic and repeatable editing workflows.
-Featuring native masking for granular precision, it moves beyond simple prompt-based diffusion to offer explicit, interpretable control optimized for production environments.
-Its lightweight architecture is designed for deep customization, empowering researchers to build specialized "Edit" models for domain-specific tasks while delivering top-tier aesthetic quality
-
-## Usage
-_As the model is gated, before using it with diffusers you first need to go to the [Bria Fibo Hugging Face page](https://huggingface.co/briaai/Fibo-Edit), fill in the form and accept the gate. Once you are in, you need to login so that your system knows you’ve accepted the gate._
-
-Use the command below to log in:
-
-```bash
-hf auth login
-```
-
-
-## BriaFiboEditPipeline
-
-[[autodoc]] BriaFiboEditPipeline
-	- all
-	- __call__
--- a/src/diffusers/init.py
+++ b/src/diffusers/init.py
@@ -413,9 +413,6 @@ else:
    _import_structure["modular_pipelines"].extend(
        [
            "Flux2AutoBlocks",
-            "Flux2KleinAutoBlocks",
-            "Flux2KleinBaseAutoBlocks",
-            "Flux2KleinModularPipeline",
            "Flux2ModularPipeline",
            "FluxAutoBlocks",
            "FluxKontextAutoBlocks",
@@ -460,7 +457,6 @@ else:
            "AuraFlowPipeline",
            "BlipDiffusionControlNetPipeline",
            "BlipDiffusionPipeline",
-            "BriaFiboEditPipeline",
            "BriaFiboPipeline",
            "BriaPipeline",
            "ChromaImg2ImgPipeline",
@@ -1149,9 +1145,6 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
    else:
        from .modular_pipelines import (
            Flux2AutoBlocks,
-            Flux2KleinAutoBlocks,
-            Flux2KleinBaseAutoBlocks,
-            Flux2KleinModularPipeline,
            Flux2ModularPipeline,
            FluxAutoBlocks,
            FluxKontextAutoBlocks,
@@ -1192,7 +1185,6 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
            AudioLDM2UNet2DConditionModel,
            AudioLDMPipeline,
            AuraFlowPipeline,
-            BriaFiboEditPipeline,
            BriaFiboPipeline,
            BriaPipeline,
            ChromaImg2ImgPipeline,
--- a/src/diffusers/modular_pipelines/init.py
+++ b/src/diffusers/modular_pipelines/init.py
@@ -54,10 +54,7 @@ else:
    ]
    _import_structure["flux2"] = [
        "Flux2AutoBlocks",
-        "Flux2KleinAutoBlocks",
-        "Flux2KleinBaseAutoBlocks",
        "Flux2ModularPipeline",
-        "Flux2KleinModularPipeline",
    ]
    _import_structure["qwenimage"] = [
        "QwenImageAutoBlocks",
@@ -84,13 +81,7 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
    else:
        from .components_manager import ComponentsManager
        from .flux import FluxAutoBlocks, FluxKontextAutoBlocks, FluxKontextModularPipeline, FluxModularPipeline
-        from .flux2 import (
-            Flux2AutoBlocks,
-            Flux2KleinAutoBlocks,
-            Flux2KleinBaseAutoBlocks,
-            Flux2KleinModularPipeline,
-            Flux2ModularPipeline,
-        )
+        from .flux2 import Flux2AutoBlocks, Flux2ModularPipeline
        from .modular_pipeline import (
            AutoPipelineBlocks,
            BlockState,
--- a/src/diffusers/modular_pipelines/flux2/init.py
+++ b/src/diffusers/modular_pipelines/flux2/init.py
@@ -43,7 +43,7 @@ else:
        "Flux2ProcessImagesInputStep",
        "Flux2TextInputStep",
    ]
-    _import_structure["modular_blocks_flux2"] = [
+    _import_structure["modular_blocks"] = [
        "ALL_BLOCKS",
        "AUTO_BLOCKS",
        "REMOTE_AUTO_BLOCKS",
@@ -54,8 +54,7 @@ else:
        "Flux2BeforeDenoiseStep",
        "Flux2VaeEncoderSequentialStep",
    ]
-    _import_structure["modular_blocks_flux2_klein"] = ["Flux2KleinAutoBlocks", "Flux2KleinBaseAutoBlocks"]
-    _import_structure["modular_pipeline"] = ["Flux2ModularPipeline", "Flux2KleinModularPipeline"]
+    _import_structure["modular_pipeline"] = ["Flux2ModularPipeline"]

 if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
    try:
@@ -86,7 +85,7 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
            Flux2ProcessImagesInputStep,
            Flux2TextInputStep,
        )
-        from .modular_blocks_flux2 import (
+        from .modular_blocks import (
            ALL_BLOCKS,
            AUTO_BLOCKS,
            IMAGE_CONDITIONED_BLOCKS,
@@ -97,11 +96,7 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
            Flux2BeforeDenoiseStep,
            Flux2VaeEncoderSequentialStep,
        )
-        from .modular_blocks_flux2_klein import (
-            Flux2KleinAutoBlocks,
-            Flux2KleinBaseAutoBlocks,
-        )
-        from .modular_pipeline import Flux2KleinModularPipeline, Flux2ModularPipeline
+        from .modular_pipeline import Flux2ModularPipeline
 else:
    import sys

--- a/src/diffusers/modular_pipelines/flux2/before_denoise.py
+++ b/src/diffusers/modular_pipelines/flux2/before_denoise.py
@@ -353,7 +353,7 @@ class Flux2RoPEInputsStep(ModularPipelineBlocks):
    def inputs(self) -> List[InputParam]:
        return [
            InputParam(name="prompt_embeds", required=True),
-            InputParam(name="negative_prompt_embeds", required=False),
+            InputParam(name="latent_ids"),
        ]

    @property
@@ -366,10 +366,10 @@ class Flux2RoPEInputsStep(ModularPipelineBlocks):
                description="4D position IDs (T, H, W, L) for text tokens, used for RoPE calculation.",
            ),
            OutputParam(
-                name="negative_txt_ids",
+                name="latent_ids",
                kwargs_type="denoiser_input_fields",
                type_hint=torch.Tensor,
-                description="4D position IDs (T, H, W, L) for negative text tokens, used for RoPE calculation.",
+                description="4D position IDs (T, H, W, L) for image latents, used for RoPE calculation.",
            ),
        ]

@@ -399,11 +399,6 @@ class Flux2RoPEInputsStep(ModularPipelineBlocks):
        block_state.txt_ids = self._prepare_text_ids(prompt_embeds)
        block_state.txt_ids = block_state.txt_ids.to(device)

-        block_state.negative_txt_ids = None
-        if block_state.negative_prompt_embeds is not None:
-            block_state.negative_txt_ids = self._prepare_text_ids(block_state.negative_prompt_embeds)
-            block_state.negative_txt_ids = block_state.negative_txt_ids.to(device)
-
        self.set_block_state(state, block_state)
        return components, state

--- a/src/diffusers/modular_pipelines/flux2/denoise.py
+++ b/src/diffusers/modular_pipelines/flux2/denoise.py
@@ -12,13 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import inspect
 from typing import Any, List, Tuple

 import torch

-from ...configuration_utils import FrozenDict
-from ...guiders import ClassifierFreeGuidance
 from ...models import Flux2Transformer2DModel
 from ...schedulers import FlowMatchEulerDiscreteScheduler
 from ...utils import is_torch_xla_available, logging
@@ -28,8 +25,8 @@ from ..modular_pipeline import (
    ModularPipelineBlocks,
    PipelineState,
 )
-from ..modular_pipeline_utils import ComponentSpec, ConfigSpec, InputParam, OutputParam
-from .modular_pipeline import Flux2KleinModularPipeline, Flux2ModularPipeline
+from ..modular_pipeline_utils import ComponentSpec, InputParam, OutputParam
+from .modular_pipeline import Flux2ModularPipeline


 if is_torch_xla_available():
@@ -137,241 +134,6 @@ class Flux2LoopDenoiser(ModularPipelineBlocks):
        return components, block_state


-# same as Flux2LoopDenoiser but guidance=None
-class Flux2KleinLoopDenoiser(ModularPipelineBlocks):
-    model_name = "flux2-klein"
-
-    @property
-    def expected_components(self) -> List[ComponentSpec]:
-        return [ComponentSpec("transformer", Flux2Transformer2DModel)]
-
-    @property
-    def description(self) -> str:
-        return (
-            "Step within the denoising loop that denoises the latents for Flux2. "
-            "This block should be used to compose the `sub_blocks` attribute of a `LoopSequentialPipelineBlocks` "
-            "object (e.g. `Flux2DenoiseLoopWrapper`)"
-        )
-
-    @property
-    def inputs(self) -> List[Tuple[str, Any]]:
-        return [
-            InputParam("joint_attention_kwargs"),
-            InputParam(
-                "latents",
-                required=True,
-                type_hint=torch.Tensor,
-                description="The latents to denoise. Shape: (B, seq_len, C)",
-            ),
-            InputParam(
-                "image_latents",
-                type_hint=torch.Tensor,
-                description="Packed image latents for conditioning. Shape: (B, img_seq_len, C)",
-            ),
-            InputParam(
-                "image_latent_ids",
-                type_hint=torch.Tensor,
-                description="Position IDs for image latents. Shape: (B, img_seq_len, 4)",
-            ),
-            InputParam(
-                "prompt_embeds",
-                required=True,
-                type_hint=torch.Tensor,
-                description="Text embeddings from Mistral3",
-            ),
-            InputParam(
-                "txt_ids",
-                required=True,
-                type_hint=torch.Tensor,
-                description="4D position IDs for text tokens (T, H, W, L)",
-            ),
-            InputParam(
-                "latent_ids",
-                required=True,
-                type_hint=torch.Tensor,
-                description="4D position IDs for latent tokens (T, H, W, L)",
-            ),
-        ]
-
-    @torch.no_grad()
-    def __call__(
-        self, components: Flux2KleinModularPipeline, block_state: BlockState, i: int, t: torch.Tensor
-    ) -> PipelineState:
-        latents = block_state.latents
-        latent_model_input = latents.to(components.transformer.dtype)
-        img_ids = block_state.latent_ids
-
-        image_latents = getattr(block_state, "image_latents", None)
-        if image_latents is not None:
-            latent_model_input = torch.cat([latents, image_latents], dim=1).to(components.transformer.dtype)
-            image_latent_ids = block_state.image_latent_ids
-            img_ids = torch.cat([img_ids, image_latent_ids], dim=1)
-
-        timestep = t.expand(latents.shape[0]).to(latents.dtype)
-
-        noise_pred = components.transformer(
-            hidden_states=latent_model_input,
-            timestep=timestep / 1000,
-            guidance=None,
-            encoder_hidden_states=block_state.prompt_embeds,
-            txt_ids=block_state.txt_ids,
-            img_ids=img_ids,
-            joint_attention_kwargs=block_state.joint_attention_kwargs,
-            return_dict=False,
-        )[0]
-
-        noise_pred = noise_pred[:, : latents.size(1)]
-        block_state.noise_pred = noise_pred
-
-        return components, block_state
-
-
-# support CFG for Flux2-Klein base model
-class Flux2KleinBaseLoopDenoiser(ModularPipelineBlocks):
-    model_name = "flux2-klein"
-
-    @property
-    def expected_components(self) -> List[ComponentSpec]:
-        return [
-            ComponentSpec("transformer", Flux2Transformer2DModel),
-            ComponentSpec(
-                "guider",
-                ClassifierFreeGuidance,
-                config=FrozenDict({"guidance_scale": 4.0}),
-                default_creation_method="from_config",
-            ),
-        ]
-
-    @property
-    def expected_configs(self) -> List[ConfigSpec]:
-        return [
-            ConfigSpec(name="is_distilled", default=False),
-        ]
-
-    @property
-    def description(self) -> str:
-        return (
-            "Step within the denoising loop that denoises the latents for Flux2. "
-            "This block should be used to compose the `sub_blocks` attribute of a `LoopSequentialPipelineBlocks` "
-            "object (e.g. `Flux2DenoiseLoopWrapper`)"
-        )
-
-    @property
-    def inputs(self) -> List[Tuple[str, Any]]:
-        return [
-            InputParam("joint_attention_kwargs"),
-            InputParam(
-                "latents",
-                required=True,
-                type_hint=torch.Tensor,
-                description="The latents to denoise. Shape: (B, seq_len, C)",
-            ),
-            InputParam(
-                "image_latents",
-                type_hint=torch.Tensor,
-                description="Packed image latents for conditioning. Shape: (B, img_seq_len, C)",
-            ),
-            InputParam(
-                "image_latent_ids",
-                type_hint=torch.Tensor,
-                description="Position IDs for image latents. Shape: (B, img_seq_len, 4)",
-            ),
-            InputParam(
-                "prompt_embeds",
-                required=True,
-                type_hint=torch.Tensor,
-                description="Text embeddings from Mistral3",
-            ),
-            InputParam(
-                "negative_prompt_embeds",
-                required=False,
-                type_hint=torch.Tensor,
-                description="Negative text embeddings from Qwen3",
-            ),
-            InputParam(
-                "txt_ids",
-                required=True,
-                type_hint=torch.Tensor,
-                description="4D position IDs for text tokens (T, H, W, L)",
-            ),
-            InputParam(
-                "negative_txt_ids",
-                required=False,
-                type_hint=torch.Tensor,
-                description="4D position IDs for negative text tokens (T, H, W, L)",
-            ),
-            InputParam(
-                "latent_ids",
-                required=True,
-                type_hint=torch.Tensor,
-                description="4D position IDs for latent tokens (T, H, W, L)",
-            ),
-            InputParam(
-                kwargs_type="denoiser_input_fields",
-                description="conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.",
-            ),
-        ]
-
-    @torch.no_grad()
-    def __call__(
-        self, components: Flux2KleinModularPipeline, block_state: BlockState, i: int, t: torch.Tensor
-    ) -> PipelineState:
-        latents = block_state.latents
-        latent_model_input = latents.to(components.transformer.dtype)
-        img_ids = block_state.latent_ids
-
-        image_latents = getattr(block_state, "image_latents", None)
-        if image_latents is not None:
-            latent_model_input = torch.cat([latents, image_latents], dim=1).to(components.transformer.dtype)
-            image_latent_ids = block_state.image_latent_ids
-            img_ids = torch.cat([img_ids, image_latent_ids], dim=1)
-
-        timestep = t.expand(latents.shape[0]).to(latents.dtype)
-
-        guider_inputs = {
-            "encoder_hidden_states": (
-                getattr(block_state, "prompt_embeds", None),
-                getattr(block_state, "negative_prompt_embeds", None),
-            ),
-            "txt_ids": (
-                getattr(block_state, "txt_ids", None),
-                getattr(block_state, "negative_txt_ids", None),
-            ),
-        }
-
-        transformer_args = set(inspect.signature(components.transformer.forward).parameters.keys())
-        additional_cond_kwargs = {}
-        for field_name, field_value in block_state.denoiser_input_fields.items():
-            if field_name in transformer_args and field_name not in guider_inputs:
-                additional_cond_kwargs[field_name] = field_value
-        block_state.additional_cond_kwargs.update(additional_cond_kwargs)
-
-        components.guider.set_state(step=i, num_inference_steps=block_state.num_inference_steps, timestep=t)
-        guider_state = components.guider.prepare_inputs(guider_inputs)
-
-        for guider_state_batch in guider_state:
-            components.guider.prepare_models(components.transformer)
-            cond_kwargs = {input_name: getattr(guider_state_batch, input_name) for input_name in guider_inputs.keys()}
-            cond_kwargs.update(additional_cond_kwargs)
-
-            noise_pred = components.transformer(
-                hidden_states=latent_model_input,
-                timestep=timestep / 1000,
-                guidance=None,
-                img_ids=img_ids,
-                joint_attention_kwargs=block_state.joint_attention_kwargs,
-                return_dict=False,
-                **cond_kwargs,
-            )[0]
-            guider_state_batch.noise_pred = noise_pred[:, : latents.size(1)]
-            components.guider.cleanup_models(components.transformer)
-
-        # perform guidance
-        block_state.noise_pred = components.guider(guider_state)[0]
-
-        return components, block_state
-
-
 class Flux2LoopAfterDenoiser(ModularPipelineBlocks):
    model_name = "flux2"

@@ -458,8 +220,6 @@ class Flux2DenoiseLoopWrapper(LoopSequentialPipelineBlocks):
            len(block_state.timesteps) - block_state.num_inference_steps * components.scheduler.order, 0
        )

-        block_state.additional_cond_kwargs = {}
-
        with self.progress_bar(total=block_state.num_inference_steps) as progress_bar:
            for i, t in enumerate(block_state.timesteps):
                components, block_state = self.loop_step(components, block_state, i=i, t=t)
@@ -490,35 +250,3 @@ class Flux2DenoiseStep(Flux2DenoiseLoopWrapper):
            " - `Flux2LoopAfterDenoiser`\n"
            "This block supports both text-to-image and image-conditioned generation."
        )
-
-
-class Flux2KleinDenoiseStep(Flux2DenoiseLoopWrapper):
-    block_classes = [Flux2KleinLoopDenoiser, Flux2LoopAfterDenoiser]
-    block_names = ["denoiser", "after_denoiser"]
-
-    @property
-    def description(self) -> str:
-        return (
-            "Denoise step that iteratively denoises the latents for Flux2. \n"
-            "Its loop logic is defined in `Flux2DenoiseLoopWrapper.__call__` method \n"
-            "At each iteration, it runs blocks defined in `sub_blocks` sequentially:\n"
-            " - `Flux2KleinLoopDenoiser`\n"
-            " - `Flux2LoopAfterDenoiser`\n"
-            "This block supports both text-to-image and image-conditioned generation."
-        )
-
-
-class Flux2KleinBaseDenoiseStep(Flux2DenoiseLoopWrapper):
-    block_classes = [Flux2KleinBaseLoopDenoiser, Flux2LoopAfterDenoiser]
-    block_names = ["denoiser", "after_denoiser"]
-
-    @property
-    def description(self) -> str:
-        return (
-            "Denoise step that iteratively denoises the latents for Flux2. \n"
-            "Its loop logic is defined in `Flux2DenoiseLoopWrapper.__call__` method \n"
-            "At each iteration, it runs blocks defined in `sub_blocks` sequentially:\n"
-            " - `Flux2KleinBaseLoopDenoiser`\n"
-            " - `Flux2LoopAfterDenoiser`\n"
-            "This block supports both text-to-image and image-conditioned generation."
-        )
--- a/src/diffusers/modular_pipelines/flux2/encoders.py
+++ b/src/diffusers/modular_pipelines/flux2/encoders.py
@@ -15,13 +15,13 @@
 from typing import List, Optional, Tuple, Union

 import torch
-from transformers import AutoProcessor, Mistral3ForConditionalGeneration, Qwen2TokenizerFast, Qwen3ForCausalLM
+from transformers import AutoProcessor, Mistral3ForConditionalGeneration

 from ...models import AutoencoderKLFlux2
 from ...utils import logging
 from ..modular_pipeline import ModularPipelineBlocks, PipelineState
-from ..modular_pipeline_utils import ComponentSpec, ConfigSpec, InputParam, OutputParam
-from .modular_pipeline import Flux2KleinModularPipeline, Flux2ModularPipeline
+from ..modular_pipeline_utils import ComponentSpec, InputParam, OutputParam
+from .modular_pipeline import Flux2ModularPipeline


 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
@@ -79,8 +79,10 @@ class Flux2TextEncoderStep(ModularPipelineBlocks):
    def inputs(self) -> List[InputParam]:
        return [
            InputParam("prompt"),
+            InputParam("prompt_embeds", type_hint=torch.Tensor, required=False),
            InputParam("max_sequence_length", type_hint=int, default=512, required=False),
            InputParam("text_encoder_out_layers", type_hint=Tuple[int], default=(10, 20, 30), required=False),
+            InputParam("joint_attention_kwargs"),
        ]

    @property
@@ -97,7 +99,14 @@ class Flux2TextEncoderStep(ModularPipelineBlocks):
    @staticmethod
    def check_inputs(block_state):
        prompt = block_state.prompt
-        if prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+        prompt_embeds = getattr(block_state, "prompt_embeds", None)
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. "
+                "Please make sure to only forward one of the two."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")

    @staticmethod
@@ -156,6 +165,10 @@ class Flux2TextEncoderStep(ModularPipelineBlocks):

        block_state.device = components._execution_device

+        if block_state.prompt_embeds is not None:
+            self.set_block_state(state, block_state)
+            return components, state
+
        prompt = block_state.prompt
        if prompt is None:
            prompt = ""
@@ -192,6 +205,7 @@ class Flux2RemoteTextEncoderStep(ModularPipelineBlocks):
    def inputs(self) -> List[InputParam]:
        return [
            InputParam("prompt"),
+            InputParam("prompt_embeds", type_hint=torch.Tensor, required=False),
        ]

    @property
@@ -208,8 +222,15 @@ class Flux2RemoteTextEncoderStep(ModularPipelineBlocks):
    @staticmethod
    def check_inputs(block_state):
        prompt = block_state.prompt
-        if prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
-            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(block_state.prompt)}")
+        prompt_embeds = getattr(block_state, "prompt_embeds", None)
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. "
+                "Please make sure to only forward one of the two."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")

    @torch.no_grad()
    def __call__(self, components: Flux2ModularPipeline, state: PipelineState) -> PipelineState:
@@ -223,6 +244,10 @@ class Flux2RemoteTextEncoderStep(ModularPipelineBlocks):

        block_state.device = components._execution_device

+        if block_state.prompt_embeds is not None:
+            self.set_block_state(state, block_state)
+            return components, state
+
        prompt = block_state.prompt
        if prompt is None:
            prompt = ""
@@ -245,153 +270,6 @@ class Flux2RemoteTextEncoderStep(ModularPipelineBlocks):
        return components, state


-class Flux2KleinTextEncoderStep(ModularPipelineBlocks):
-    model_name = "flux2-klein"
-
-    @property
-    def description(self) -> str:
-        return "Text Encoder step that generates text embeddings using Qwen3 to guide the image generation"
-
-    @property
-    def expected_components(self) -> List[ComponentSpec]:
-        return [
-            ComponentSpec("text_encoder", Qwen3ForCausalLM),
-            ComponentSpec("tokenizer", Qwen2TokenizerFast),
-        ]
-
-    @property
-    def expected_configs(self) -> List[ConfigSpec]:
-        return [
-            ConfigSpec(name="is_distilled", default=False),
-        ]
-
-    @property
-    def inputs(self) -> List[InputParam]:
-        return [
-            InputParam("prompt"),
-            InputParam("max_sequence_length", type_hint=int, default=512, required=False),
-            InputParam("text_encoder_out_layers", type_hint=Tuple[int], default=(9, 18, 27), required=False),
-        ]
-
-    @property
-    def intermediate_outputs(self) -> List[OutputParam]:
-        return [
-            OutputParam(
-                "prompt_embeds",
-                kwargs_type="denoiser_input_fields",
-                type_hint=torch.Tensor,
-                description="Text embeddings from qwen3 used to guide the image generation",
-            ),
-            OutputParam(
-                "negative_prompt_embeds",
-                type_hint=torch.Tensor,
-                description="Negative text embeddings from qwen3 used to guide the image generation",
-            ),
-        ]
-
-    @staticmethod
-    def check_inputs(block_state):
-        prompt = block_state.prompt
-
-        if prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
-            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
-
-    @staticmethod
-    # Copied from diffusers.pipelines.flux2.pipeline_flux2_klein.Flux2KleinPipeline._get_qwen3_prompt_embeds
-    def _get_qwen3_prompt_embeds(
-        text_encoder: Qwen3ForCausalLM,
-        tokenizer: Qwen2TokenizerFast,
-        prompt: Union[str, List[str]],
-        dtype: Optional[torch.dtype] = None,
-        device: Optional[torch.device] = None,
-        max_sequence_length: int = 512,
-        hidden_states_layers: List[int] = (9, 18, 27),
-    ):
-        dtype = text_encoder.dtype if dtype is None else dtype
-        device = text_encoder.device if device is None else device
-
-        prompt = [prompt] if isinstance(prompt, str) else prompt
-
-        all_input_ids = []
-        all_attention_masks = []
-
-        for single_prompt in prompt:
-            messages = [{"role": "user", "content": single_prompt}]
-            text = tokenizer.apply_chat_template(
-                messages,
-                tokenize=False,
-                add_generation_prompt=True,
-                enable_thinking=False,
-            )
-            inputs = tokenizer(
-                text,
-                return_tensors="pt",
-                padding="max_length",
-                truncation=True,
-                max_length=max_sequence_length,
-            )
-
-            all_input_ids.append(inputs["input_ids"])
-            all_attention_masks.append(inputs["attention_mask"])
-
-        input_ids = torch.cat(all_input_ids, dim=0).to(device)
-        attention_mask = torch.cat(all_attention_masks, dim=0).to(device)
-
-        # Forward pass through the model
-        output = text_encoder(
-            input_ids=input_ids,
-            attention_mask=attention_mask,
-            output_hidden_states=True,
-            use_cache=False,
-        )
-
-        # Only use outputs from intermediate layers and stack them
-        out = torch.stack([output.hidden_states[k] for k in hidden_states_layers], dim=1)
-        out = out.to(dtype=dtype, device=device)
-
-        batch_size, num_channels, seq_len, hidden_dim = out.shape
-        prompt_embeds = out.permute(0, 2, 1, 3).reshape(batch_size, seq_len, num_channels * hidden_dim)
-
-        return prompt_embeds
-
-    @torch.no_grad()
-    def __call__(self, components: Flux2KleinModularPipeline, state: PipelineState) -> PipelineState:
-        block_state = self.get_block_state(state)
-        self.check_inputs(block_state)
-
-        device = components._execution_device
-
-        prompt = block_state.prompt
-        if prompt is None:
-            prompt = ""
-        prompt = [prompt] if isinstance(prompt, str) else prompt
-
-        block_state.prompt_embeds = self._get_qwen3_prompt_embeds(
-            text_encoder=components.text_encoder,
-            tokenizer=components.tokenizer,
-            prompt=prompt,
-            device=device,
-            max_sequence_length=block_state.max_sequence_length,
-            hidden_states_layers=block_state.text_encoder_out_layers,
-        )
-
-        if components.requires_unconditional_embeds:
-            negative_prompt = ""
-            block_state.negative_prompt_embeds = self._get_qwen3_prompt_embeds(
-                text_encoder=components.text_encoder,
-                tokenizer=components.tokenizer,
-                prompt=negative_prompt,
-                device=device,
-                max_sequence_length=block_state.max_sequence_length,
-                hidden_states_layers=block_state.text_encoder_out_layers,
-            )
-        else:
-            block_state.negative_prompt_embeds = None
-
-        self.set_block_state(state, block_state)
-        return components, state
-
-
 class Flux2VaeEncoderStep(ModularPipelineBlocks):
    model_name = "flux2"

--- a/src/diffusers/modular_pipelines/flux2/inputs.py
+++ b/src/diffusers/modular_pipelines/flux2/inputs.py
@@ -47,14 +47,7 @@ class Flux2TextInputStep(ModularPipelineBlocks):
                required=True,
                kwargs_type="denoiser_input_fields",
                type_hint=torch.Tensor,
-                description="Pre-generated text embeddings. Can be generated from text_encoder step.",
-            ),
-            InputParam(
-                "negative_prompt_embeds",
-                required=False,
-                kwargs_type="denoiser_input_fields",
-                type_hint=torch.Tensor,
-                description="Pre-generated negative text embeddings. Can be generated from text_encoder step.",
+                description="Pre-generated text embeddings from Mistral3. Can be generated from text_encoder step.",
            ),
        ]

@@ -77,12 +70,6 @@ class Flux2TextInputStep(ModularPipelineBlocks):
                kwargs_type="denoiser_input_fields",
                description="Text embeddings used to guide the image generation",
            ),
-            OutputParam(
-                "negative_prompt_embeds",
-                type_hint=torch.Tensor,
-                kwargs_type="denoiser_input_fields",
-                description="Negative text embeddings used to guide the image generation",
-            ),
        ]

    @torch.no_grad()
@@ -98,15 +85,6 @@ class Flux2TextInputStep(ModularPipelineBlocks):
            block_state.batch_size * block_state.num_images_per_prompt, seq_len, -1
        )

-        if block_state.negative_prompt_embeds is not None:
-            _, seq_len, _ = block_state.negative_prompt_embeds.shape
-            block_state.negative_prompt_embeds = block_state.negative_prompt_embeds.repeat(
-                1, block_state.num_images_per_prompt, 1
-            )
-            block_state.negative_prompt_embeds = block_state.negative_prompt_embeds.view(
-                block_state.batch_size * block_state.num_images_per_prompt, seq_len, -1
-            )
-
        self.set_block_state(state, block_state)
        return components, state

--- a/src/diffusers/modular_pipelines/flux2/modular_blocks_flux2.py
+++ b/src/diffusers/modular_pipelines/flux2/modular_blocks_flux2.py
--- a/src/diffusers/modular_pipelines/flux2/modular_blocks_flux2_klein.py
+++ b/src/diffusers/modular_pipelines/flux2/modular_blocks_flux2_klein.py
@@ -1,171 +0,0 @@
-# Copyright 2025 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from ...utils import logging
-from ..modular_pipeline import AutoPipelineBlocks, SequentialPipelineBlocks
-from ..modular_pipeline_utils import InsertableDict
-from .before_denoise import (
-    Flux2PrepareImageLatentsStep,
-    Flux2PrepareLatentsStep,
-    Flux2RoPEInputsStep,
-    Flux2SetTimestepsStep,
-)
-from .decoders import Flux2DecodeStep
-from .denoise import Flux2KleinBaseDenoiseStep, Flux2KleinDenoiseStep
-from .encoders import (
-    Flux2KleinTextEncoderStep,
-    Flux2VaeEncoderStep,
-)
-from .inputs import (
-    Flux2ProcessImagesInputStep,
-    Flux2TextInputStep,
-)
-
-
-logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
-
-
-Flux2KleinVaeEncoderBlocks = InsertableDict(
-    [
-        ("preprocess", Flux2ProcessImagesInputStep()),
-        ("encode", Flux2VaeEncoderStep()),
-    ]
-)
-
-
-class Flux2KleinVaeEncoderSequentialStep(SequentialPipelineBlocks):
-    model_name = "flux2"
-
-    block_classes = Flux2KleinVaeEncoderBlocks.values()
-    block_names = Flux2KleinVaeEncoderBlocks.keys()
-
-    @property
-    def description(self) -> str:
-        return "VAE encoder step that preprocesses and encodes the image inputs into their latent representations."
-
-
-class Flux2KleinAutoVaeEncoderStep(AutoPipelineBlocks):
-    block_classes = [Flux2KleinVaeEncoderSequentialStep]
-    block_names = ["img_conditioning"]
-    block_trigger_inputs = ["image"]
-
-    @property
-    def description(self):
-        return (
-            "VAE encoder step that encodes the image inputs into their latent representations.\n"
-            "This is an auto pipeline block that works for image conditioning tasks.\n"
-            " - `Flux2KleinVaeEncoderSequentialStep` is used when `image` is provided.\n"
-            " - If `image` is not provided, step will be skipped."
-        )
-
-
-Flux2KleinCoreDenoiseBlocks = InsertableDict(
-    [
-        ("input", Flux2TextInputStep()),
-        ("prepare_image_latents", Flux2PrepareImageLatentsStep()),
-        ("prepare_latents", Flux2PrepareLatentsStep()),
-        ("set_timesteps", Flux2SetTimestepsStep()),
-        ("prepare_rope_inputs", Flux2RoPEInputsStep()),
-        ("denoise", Flux2KleinDenoiseStep()),
-    ]
-)
-
-
-class Flux2KleinCoreDenoiseStep(SequentialPipelineBlocks):
-    model_name = "flux2-klein"
-
-    block_classes = Flux2KleinCoreDenoiseBlocks.values()
-    block_names = Flux2KleinCoreDenoiseBlocks.keys()
-
-    @property
-    def description(self):
-        return "Core denoise step that performs the denoising process for Flux2-Klein (distilled model)."
-        return (
-            "Core denoise step that performs the denoising process for Flux2-Klein.\n"
-            " - `Flux2KleinTextInputStep` (input) standardizes the text inputs for the denoising step.\n"
-            " - `Flux2PrepareImageLatentsStep` (prepare_image_latents) prepares the image latents  and image_latent_ids for the denoising step.\n"
-            " - `Flux2PrepareLatentsStep` (prepare_latents) prepares the initial latents (latents) and latent_ids for the denoising step.\n"
-            " - `Flux2SetTimestepsStep` (set_timesteps) sets the timesteps for the denoising step.\n"
-            " - `Flux2RoPEInputsStep` (prepare_rope_inputs) prepares the RoPE inputs (txt_ids) for the denoising step.\n"
-            " - `Flux2KleinDenoiseStep` (denoise) iteratively denoises the latents.\n"
-        )
-
-
-Flux2KleinBaseCoreDenoiseBlocks = InsertableDict(
-    [
-        ("input", Flux2TextInputStep()),
-        ("prepare_latents", Flux2PrepareLatentsStep()),
-        ("prepare_image_latents", Flux2PrepareImageLatentsStep()),
-        ("set_timesteps", Flux2SetTimestepsStep()),
-        ("prepare_rope_inputs", Flux2RoPEInputsStep()),
-        ("denoise", Flux2KleinBaseDenoiseStep()),
-    ]
-)
-
-
-class Flux2KleinBaseCoreDenoiseStep(SequentialPipelineBlocks):
-    model_name = "flux2-klein"
-    block_classes = Flux2KleinBaseCoreDenoiseBlocks.values()
-    block_names = Flux2KleinBaseCoreDenoiseBlocks.keys()
-
-    @property
-    def description(self):
-        return "Core denoise step that performs the denoising process for Flux2-Klein (base model)."
-        return (
-            "Core denoise step that performs the denoising process for Flux2-Klein (base model).\n"
-            " - `Flux2KleinTextInputStep` (input) standardizes the text inputs for the denoising step.\n"
-            " - `Flux2PrepareImageLatentsStep` (prepare_image_latents) prepares the image latents and image_latent_ids for the denoising step.\n"
-            " - `Flux2PrepareLatentsStep` (prepare_latents) prepares the initial latents (latents) and latent_ids for the denoising step.\n"
-            " - `Flux2SetTimestepsStep` (set_timesteps) sets the timesteps for the denoising step.\n"
-            " - `Flux2RoPEInputsStep` (prepare_rope_inputs) prepares the RoPE inputs (txt_ids) for the denoising step.\n"
-            " - `Flux2KleinBaseDenoiseStep` (denoise) iteratively denoises the latents using Classifier-Free Guidance.\n"
-        )
-
-
-class Flux2KleinAutoBlocks(SequentialPipelineBlocks):
-    model_name = "flux2-klein"
-    block_classes = [
-        Flux2KleinTextEncoderStep(),
-        Flux2KleinAutoVaeEncoderStep(),
-        Flux2KleinCoreDenoiseStep(),
-        Flux2DecodeStep(),
-    ]
-    block_names = ["text_encoder", "vae_image_encoder", "denoise", "decode"]
-
-    @property
-    def description(self):
-        return (
-            "Auto blocks that perform the text-to-image and image-conditioned generation using Flux2-Klein.\n"
-            + " - for image-conditioned generation, you need to provide `image` (list of PIL images).\n"
-            + " - for text-to-image generation, all you need to provide is `prompt`.\n"
-        )
-
-
-class Flux2KleinBaseAutoBlocks(SequentialPipelineBlocks):
-    model_name = "flux2-klein"
-    block_classes = [
-        Flux2KleinTextEncoderStep(),
-        Flux2KleinAutoVaeEncoderStep(),
-        Flux2KleinBaseCoreDenoiseStep(),
-        Flux2DecodeStep(),
-    ]
-    block_names = ["text_encoder", "vae_image_encoder", "denoise", "decode"]
-
-    @property
-    def description(self):
-        return (
-            "Auto blocks that perform the text-to-image and image-conditioned generation using Flux2-Klein (base model).\n"
-            + " - for image-conditioned generation, you need to provide `image` (list of PIL images).\n"
-            + " - for text-to-image generation, all you need to provide is `prompt`.\n"
-        )
--- a/src/diffusers/modular_pipelines/flux2/modular_pipeline.py
+++ b/src/diffusers/modular_pipelines/flux2/modular_pipeline.py
@@ -13,8 +13,6 @@
 # limitations under the License.


-from typing import Any, Dict, Optional
-
 from ...loaders import Flux2LoraLoaderMixin
 from ...utils import logging
 from ..modular_pipeline import ModularPipeline
@@ -57,56 +55,3 @@ class Flux2ModularPipeline(ModularPipeline, Flux2LoraLoaderMixin):
        if getattr(self, "transformer", None):
            num_channels_latents = self.transformer.config.in_channels // 4
        return num_channels_latents
-
-
-class Flux2KleinModularPipeline(ModularPipeline, Flux2LoraLoaderMixin):
-    """
-    A ModularPipeline for Flux2-Klein.
-
-    > [!WARNING] > This is an experimental feature and is likely to change in the future.
-    """
-
-    default_blocks_name = "Flux2KleinBaseAutoBlocks"
-
-    def get_default_blocks_name(self, config_dict: Optional[Dict[str, Any]]) -> Optional[str]:
-        if config_dict is not None and "is_distilled" in config_dict and config_dict["is_distilled"]:
-            return "Flux2KleinAutoBlocks"
-        else:
-            return "Flux2KleinBaseAutoBlocks"
-
-    @property
-    def default_height(self):
-        return self.default_sample_size * self.vae_scale_factor
-
-    @property
-    def default_width(self):
-        return self.default_sample_size * self.vae_scale_factor
-
-    @property
-    def default_sample_size(self):
-        return 128
-
-    @property
-    def vae_scale_factor(self):
-        vae_scale_factor = 8
-        if getattr(self, "vae", None) is not None:
-            vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
-        return vae_scale_factor
-
-    @property
-    def num_channels_latents(self):
-        num_channels_latents = 32
-        if getattr(self, "transformer", None):
-            num_channels_latents = self.transformer.config.in_channels // 4
-        return num_channels_latents
-
-    @property
-    def requires_unconditional_embeds(self):
-        if hasattr(self.config, "is_distilled") and self.config.is_distilled:
-            return False
-
-        requires_unconditional_embeds = False
-        if hasattr(self, "guider") and self.guider is not None:
-            requires_unconditional_embeds = self.guider._enabled and self.guider.num_conditions > 1
-
-        return requires_unconditional_embeds
--- a/src/diffusers/modular_pipelines/modular_pipeline.py
+++ b/src/diffusers/modular_pipelines/modular_pipeline.py
@@ -59,7 +59,6 @@ MODULAR_PIPELINE_MAPPING = OrderedDict(
        ("flux", "FluxModularPipeline"),
        ("flux-kontext", "FluxKontextModularPipeline"),
        ("flux2", "Flux2ModularPipeline"),
-        ("flux2-klein", "Flux2KleinModularPipeline"),
        ("qwenimage", "QwenImageModularPipeline"),
        ("qwenimage-edit", "QwenImageEditModularPipeline"),
        ("qwenimage-edit-plus", "QwenImageEditPlusModularPipeline"),
--- a/src/diffusers/modular_pipelines/modular_pipeline_utils.py
+++ b/src/diffusers/modular_pipelines/modular_pipeline_utils.py
@@ -18,6 +18,7 @@ from collections import OrderedDict
 from dataclasses import dataclass, field, fields
 from typing import Any, Dict, List, Literal, Optional, Type, Union

+import PIL.Image
 import torch

 from ..configuration_utils import ConfigMixin, FrozenDict
@@ -323,11 +324,192 @@ class ConfigSpec:
    description: Optional[str] = None


-# YiYi Notes: both inputs and intermediate_inputs are InputParam objects
-# however some fields are not relevant for intermediate_inputs
-# e.g. unlike inputs, required only used in docstring for intermediate_inputs, we do not check if a required intermediate inputs is passed
-# default is not used for intermediate_inputs, we only use default from inputs, so it is ignored if it is set for intermediate_inputs
-# -> should we use different class for inputs and intermediate_inputs?
+# ======================================================
+# InputParam and OutputParam templates
+# ======================================================
+
+INPUT_PARAM_TEMPLATES = {
+    "prompt": {
+        "type_hint": str,
+        "required": True,
+        "description": "The prompt or prompts to guide image generation.",
+    },
+    "negative_prompt": {
+        "type_hint": str,
+        "description": "The prompt or prompts not to guide the image generation.",
+    },
+    "max_sequence_length": {
+        "type_hint": int,
+        "default": 512,
+        "description": "Maximum sequence length for prompt encoding.",
+    },
+    "height": {
+        "type_hint": int,
+        "description": "The height in pixels of the generated image.",
+    },
+    "width": {
+        "type_hint": int,
+        "description": "The width in pixels of the generated image.",
+    },
+    "num_inference_steps": {
+        "type_hint": int,
+        "default": 50,
+        "description": "The number of denoising steps.",
+    },
+    "num_images_per_prompt": {
+        "type_hint": int,
+        "default": 1,
+        "description": "The number of images to generate per prompt.",
+    },
+    "generator": {
+        "type_hint": torch.Generator,
+        "description": "Torch generator for deterministic generation.",
+    },
+    "sigmas": {
+        "type_hint": List[float],
+        "description": "Custom sigmas for the denoising process.",
+    },
+    "strength": {
+        "type_hint": float,
+        "default": 0.9,
+        "description": "Strength for img2img/inpainting.",
+    },
+    "image": {
+        "type_hint": Union[PIL.Image.Image, List[PIL.Image.Image]],
+        "required": True,
+        "description": "Reference image(s) for denoising. Can be a single image or list of images.",
+    },
+    "latents": {
+        "type_hint": torch.Tensor,
+        "description": "Pre-generated noisy latents for image generation.",
+    },
+    "timesteps": {
+        "type_hint": torch.Tensor,
+        "description": "Timesteps for the denoising process.",
+    },
+    "output_type": {
+        "type_hint": str,
+        "default": "pil",
+        "description": "Output format: 'pil', 'np', 'pt'.",
+    },
+    "attention_kwargs": {
+        "type_hint": Dict[str, Any],
+        "description": "Additional kwargs for attention processors.",
+    },
+    "denoiser_input_fields": {
+        "name": None,
+        "kwargs_type": "denoiser_input_fields",
+        "description": "conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.",
+    },
+    # inpainting
+    "mask_image": {
+        "type_hint": PIL.Image.Image,
+        "required": True,
+        "description": "Mask image for inpainting.",
+    },
+    "padding_mask_crop": {
+        "type_hint": int,
+        "description": "Padding for mask cropping in inpainting.",
+    },
+    # controlnet
+    "control_image": {
+        "type_hint": PIL.Image.Image,
+        "required": True,
+        "description": "Control image for ControlNet conditioning.",
+    },
+    "control_guidance_start": {
+        "type_hint": float,
+        "default": 0.0,
+        "description": "When to start applying ControlNet.",
+    },
+    "control_guidance_end": {
+        "type_hint": float,
+        "default": 1.0,
+        "description": "When to stop applying ControlNet.",
+    },
+    "controlnet_conditioning_scale": {
+        "type_hint": float,
+        "default": 1.0,
+        "description": "Scale for ControlNet conditioning.",
+    },
+    "layers": {
+        "type_hint": int,
+        "default": 4,
+        "description": "Number of layers to extract from the image",
+    },
+    # common intermediate inputs
+    "prompt_embeds": {
+        "type_hint": torch.Tensor,
+        "required": True,
+        "description": "text embeddings used to guide the image generation. Can be generated from text_encoder step.",
+    },
+    "prompt_embeds_mask": {
+        "type_hint": torch.Tensor,
+        "required": True,
+        "description": "mask for the text embeddings. Can be generated from text_encoder step.",
+    },
+    "negative_prompt_embeds": {
+        "type_hint": torch.Tensor,
+        "description": "negative text embeddings used to guide the image generation. Can be generated from text_encoder step.",
+    },
+    "negative_prompt_embeds_mask": {
+        "type_hint": torch.Tensor,
+        "description": "mask for the negative text embeddings. Can be generated from text_encoder step.",
+    },
+    "image_latents": {
+        "type_hint": torch.Tensor,
+        "required": True,
+        "description": "image latents used to guide the image generation. Can be generated from vae_encoder step.",
+    },
+    "batch_size": {
+        "type_hint": int,
+        "default": 1,
+        "description": "Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can be generated in input step.",
+    },
+    "dtype": {
+        "type_hint": torch.dtype,
+        "default": torch.float32,
+        "description": "The dtype of the model inputs, can be generated in input step.",
+    },
+}
+
+OUTPUT_PARAM_TEMPLATES = {
+    "images": {
+        "type_hint": List[PIL.Image.Image],
+        "description": "Generated images.",
+    },
+    "latents": {
+        "type_hint": torch.Tensor,
+        "description": "Denoised latents.",
+    },
+    # intermediate outputs
+    "prompt_embeds": {
+        "type_hint": torch.Tensor,
+        "kwargs_type": "denoiser_input_fields",
+        "description": "The prompt embeddings.",
+    },
+    "prompt_embeds_mask": {
+        "type_hint": torch.Tensor,
+        "kwargs_type": "denoiser_input_fields",
+        "description": "The encoder attention mask.",
+    },
+    "negative_prompt_embeds": {
+        "type_hint": torch.Tensor,
+        "kwargs_type": "denoiser_input_fields",
+        "description": "The negative prompt embeddings.",
+    },
+    "negative_prompt_embeds_mask": {
+        "type_hint": torch.Tensor,
+        "kwargs_type": "denoiser_input_fields",
+        "description": "The negative prompt embeddings mask.",
+    },
+    "image_latents": {
+        "type_hint": torch.Tensor,
+        "description": "The latent representation of the input image.",
+    },
+}
+
+
@dataclass
 class InputParam:
    """Specification for an input parameter."""
@@ -337,11 +519,31 @@ class InputParam:
    default: Any = None
    required: bool = False
    description: str = ""
-    kwargs_type: str = None  # YiYi Notes: remove this feature (maybe)
+    kwargs_type: str = None

    def __repr__(self):
        return f"<{self.name}: {'required' if self.required else 'optional'}, default={self.default}>"

+    @classmethod
+    def template(cls, template_name: str, note: str = None, **overrides) -> "InputParam":
+        """Get template for name if exists, otherwise raise ValueError."""
+        if template_name not in INPUT_PARAM_TEMPLATES:
+            raise ValueError(f"InputParam template for {template_name} not found")
+
+        template_kwargs = INPUT_PARAM_TEMPLATES[template_name].copy()
+
+        # Determine the actual param name:
+        # 1. From overrides if provided
+        # 2. From template if present
+        # 3. Fall back to template_name
+        name = overrides.pop("name", template_kwargs.pop("name", template_name))
+
+        if note and "description" in template_kwargs:
+            template_kwargs["description"] = f"{template_kwargs['description']} ({note})"
+
+        template_kwargs.update(overrides)
+        return cls(name=name, **template_kwargs)
+

@dataclass
 class OutputParam:
@@ -350,13 +552,33 @@ class OutputParam:
    name: str
    type_hint: Any = None
    description: str = ""
-    kwargs_type: str = None  # YiYi notes: remove this feature (maybe)
+    kwargs_type: str = None

    def __repr__(self):
        return (
            f"<{self.name}: {self.type_hint.__name__ if hasattr(self.type_hint, '__name__') else str(self.type_hint)}>"
        )

+    @classmethod
+    def template(cls, template_name: str, note: str = None, **overrides) -> "OutputParam":
+        """Get template for name if exists, otherwise raise ValueError."""
+        if template_name not in OUTPUT_PARAM_TEMPLATES:
+            raise ValueError(f"OutputParam template for {template_name} not found")
+
+        template_kwargs = OUTPUT_PARAM_TEMPLATES[template_name].copy()
+
+        # Determine the actual param name:
+        # 1. From overrides if provided
+        # 2. From template if present
+        # 3. Fall back to template_name
+        name = overrides.pop("name", template_kwargs.pop("name", template_name))
+
+        if note and "description" in template_kwargs:
+            template_kwargs["description"] = f"{template_kwargs['description']} ({note})"
+
+        template_kwargs.update(overrides)
+        return cls(name=name, **template_kwargs)
+

 def format_inputs_short(inputs):
    """
@@ -509,10 +731,12 @@ def format_params(params, header="Args", indent_level=4, max_line_length=115):
            desc = re.sub(r"\[(.*?)\]\((https?://[^\s\)]+)\)", r"[\1](\2)", param.description)
            wrapped_desc = wrap_text(desc, desc_indent, max_line_length)
            param_str += f"\n{desc_indent}{wrapped_desc}"
+        else:
+            param_str += f"\n{desc_indent}TODO: Add description."

        formatted_params.append(param_str)

-    return "\n\n".join(formatted_params)
+    return "\n".join(formatted_params)


 def format_input_params(input_params, indent_level=4, max_line_length=115):
@@ -582,7 +806,7 @@ def format_components(components, indent_level=4, max_line_length=115, add_empty
        loading_field_values = []
        for field_name in component.loading_fields():
            field_value = getattr(component, field_name)
-            if field_value is not None:
+            if field_value:
                loading_field_values.append(f"{field_name}={field_value}")

        # Add loading field information if available
@@ -669,17 +893,17 @@ def make_doc_string(
    # Add description
    if description:
        desc_lines = description.strip().split("\n")
-        aligned_desc = "\n".join("  " + line for line in desc_lines)
+        aligned_desc = "\n".join("  " + line.rstrip() for line in desc_lines)
        output += aligned_desc + "\n\n"

    # Add components section if provided
    if expected_components and len(expected_components) > 0:
-        components_str = format_components(expected_components, indent_level=2)
+        components_str = format_components(expected_components, indent_level=2, add_empty_lines=False)
        output += components_str + "\n\n"

    # Add configs section if provided
    if expected_configs and len(expected_configs) > 0:
-        configs_str = format_configs(expected_configs, indent_level=2)
+        configs_str = format_configs(expected_configs, indent_level=2, add_empty_lines=False)
        output += configs_str + "\n\n"

    # Add inputs section
--- a/src/diffusers/modular_pipelines/qwenimage/before_denoise.py
+++ b/src/diffusers/modular_pipelines/qwenimage/before_denoise.py
@@ -118,7 +118,40 @@ def get_timesteps(scheduler, num_inference_steps, strength):
 # ====================


+# auto_docstring
 class QwenImagePrepareLatentsStep(ModularPipelineBlocks):
+    """
+    Prepare initial random noise for the generation process
+
+      Components:
+          pachifier (`QwenImagePachifier`)
+
+      Inputs:
+          latents (`Tensor`, *optional*):
+              Pre-generated noisy latents for image generation.
+          height (`int`, *optional*):
+              The height in pixels of the generated image.
+          width (`int`, *optional*):
+              The width in pixels of the generated image.
+          num_images_per_prompt (`int`, *optional*, defaults to 1):
+              The number of images to generate per prompt.
+          generator (`Generator`, *optional*):
+              Torch generator for deterministic generation.
+          batch_size (`int`, *optional*, defaults to 1):
+              Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can
+              be generated in input step.
+          dtype (`dtype`, *optional*, defaults to torch.float32):
+              The dtype of the model inputs, can be generated in input step.
+
+      Outputs:
+          height (`int`):
+              if not set, updated to default value
+          width (`int`):
+              if not set, updated to default value
+          latents (`Tensor`):
+              The initial latents to use for the denoising process
+    """
+
    model_name = "qwenimage"

    @property
@@ -134,28 +167,20 @@ class QwenImagePrepareLatentsStep(ModularPipelineBlocks):
    @property
    def inputs(self) -> List[InputParam]:
        return [
-            InputParam("latents"),
-            InputParam(name="height"),
-            InputParam(name="width"),
-            InputParam(name="num_images_per_prompt", default=1),
-            InputParam(name="generator"),
-            InputParam(
-                name="batch_size",
-                required=True,
-                type_hint=int,
-                description="Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can be generated in input step.",
-            ),
-            InputParam(
-                name="dtype",
-                required=True,
-                type_hint=torch.dtype,
-                description="The dtype of the model inputs, can be generated in input step.",
-            ),
+            InputParam.template("latents"),
+            InputParam.template("height"),
+            InputParam.template("width"),
+            InputParam.template("num_images_per_prompt"),
+            InputParam.template("generator"),
+            InputParam.template("batch_size"),
+            InputParam.template("dtype"),
        ]

    @property
    def intermediate_outputs(self) -> List[OutputParam]:
        return [
+            OutputParam(name="height", type_hint=int, description="if not set, updated to default value"),
+            OutputParam(name="width", type_hint=int, description="if not set, updated to default value"),
            OutputParam(
                name="latents",
                type_hint=torch.Tensor,
@@ -209,7 +234,42 @@ class QwenImagePrepareLatentsStep(ModularPipelineBlocks):
        return components, state


+# auto_docstring
 class QwenImageLayeredPrepareLatentsStep(ModularPipelineBlocks):
+    """
+    Prepare initial random noise (B, layers+1, C, H, W) for the generation process
+
+      Components:
+          pachifier (`QwenImageLayeredPachifier`)
+
+      Inputs:
+          latents (`Tensor`, *optional*):
+              Pre-generated noisy latents for image generation.
+          height (`int`, *optional*):
+              The height in pixels of the generated image.
+          width (`int`, *optional*):
+              The width in pixels of the generated image.
+          layers (`int`, *optional*, defaults to 4):
+              Number of layers to extract from the image
+          num_images_per_prompt (`int`, *optional*, defaults to 1):
+              The number of images to generate per prompt.
+          generator (`Generator`, *optional*):
+              Torch generator for deterministic generation.
+          batch_size (`int`, *optional*, defaults to 1):
+              Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can
+              be generated in input step.
+          dtype (`dtype`, *optional*, defaults to torch.float32):
+              The dtype of the model inputs, can be generated in input step.
+
+      Outputs:
+          height (`int`):
+              if not set, updated to default value
+          width (`int`):
+              if not set, updated to default value
+          latents (`Tensor`):
+              The initial latents to use for the denoising process
+    """
+
    model_name = "qwenimage-layered"

    @property
@@ -225,29 +285,21 @@ class QwenImageLayeredPrepareLatentsStep(ModularPipelineBlocks):
    @property
    def inputs(self) -> List[InputParam]:
        return [
-            InputParam("latents"),
-            InputParam(name="height"),
-            InputParam(name="width"),
-            InputParam(name="layers", default=4),
-            InputParam(name="num_images_per_prompt", default=1),
-            InputParam(name="generator"),
-            InputParam(
-                name="batch_size",
-                required=True,
-                type_hint=int,
-                description="Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can be generated in input step.",
-            ),
-            InputParam(
-                name="dtype",
-                required=True,
-                type_hint=torch.dtype,
-                description="The dtype of the model inputs, can be generated in input step.",
-            ),
+            InputParam.template("latents"),
+            InputParam.template("height"),
+            InputParam.template("width"),
+            InputParam.template("layers"),
+            InputParam.template("num_images_per_prompt"),
+            InputParam.template("generator"),
+            InputParam.template("batch_size"),
+            InputParam.template("dtype"),
        ]

    @property
    def intermediate_outputs(self) -> List[OutputParam]:
        return [
+            OutputParam(name="height", type_hint=int, description="if not set, updated to default value"),
+            OutputParam(name="width", type_hint=int, description="if not set, updated to default value"),
            OutputParam(
                name="latents",
                type_hint=torch.Tensor,
@@ -301,7 +353,31 @@ class QwenImageLayeredPrepareLatentsStep(ModularPipelineBlocks):
        return components, state


+# auto_docstring
 class QwenImagePrepareLatentsWithStrengthStep(ModularPipelineBlocks):
+    """
+    Step that adds noise to image latents for image-to-image/inpainting. Should be run after set_timesteps,
+    prepare_latents. Both noise and image latents should alreadybe patchified.
+
+      Components:
+          scheduler (`FlowMatchEulerDiscreteScheduler`)
+
+      Inputs:
+          latents (`Tensor`):
+              The initial random noised, can be generated in prepare latent step.
+          image_latents (`Tensor`):
+              image latents used to guide the image generation. Can be generated from vae_encoder step. (Can be
+              generated from vae encoder and updated in input step.)
+          timesteps (`Tensor`):
+              The timesteps to use for the denoising process. Can be generated in set_timesteps step.
+
+      Outputs:
+          initial_noise (`Tensor`):
+              The initial random noised used for inpainting denoising.
+          latents (`Tensor`):
+              The scaled noisy latents to use for inpainting/image-to-image denoising.
+    """
+
    model_name = "qwenimage"

    @property
@@ -323,12 +399,7 @@ class QwenImagePrepareLatentsWithStrengthStep(ModularPipelineBlocks):
                type_hint=torch.Tensor,
                description="The initial random noised, can be generated in prepare latent step.",
            ),
-            InputParam(
-                name="image_latents",
-                required=True,
-                type_hint=torch.Tensor,
-                description="The image latents to use for the denoising process. Can be generated in vae encoder and packed in input step.",
-            ),
+            InputParam.template("image_latents", note="Can be generated from vae encoder and updated in input step."),
            InputParam(
                name="timesteps",
                required=True,
@@ -345,6 +416,11 @@ class QwenImagePrepareLatentsWithStrengthStep(ModularPipelineBlocks):
                type_hint=torch.Tensor,
                description="The initial random noised used for inpainting denoising.",
            ),
+            OutputParam(
+                name="latents",
+                type_hint=torch.Tensor,
+                description="The scaled noisy latents to use for inpainting/image-to-image denoising.",
+            ),
        ]

    @staticmethod
@@ -382,7 +458,29 @@ class QwenImagePrepareLatentsWithStrengthStep(ModularPipelineBlocks):
        return components, state


+# auto_docstring
 class QwenImageCreateMaskLatentsStep(ModularPipelineBlocks):
+    """
+    Step that creates mask latents from preprocessed mask_image by interpolating to latent space.
+
+      Components:
+          pachifier (`QwenImagePachifier`)
+
+      Inputs:
+          processed_mask_image (`Tensor`):
+              The processed mask to use for the inpainting process.
+          height (`int`):
+              The height in pixels of the generated image.
+          width (`int`):
+              The width in pixels of the generated image.
+          dtype (`dtype`, *optional*, defaults to torch.float32):
+              The dtype of the model inputs, can be generated in input step.
+
+      Outputs:
+          mask (`Tensor`):
+              The mask to use for the inpainting process.
+    """
+
    model_name = "qwenimage"

    @property
@@ -404,9 +502,9 @@ class QwenImageCreateMaskLatentsStep(ModularPipelineBlocks):
                type_hint=torch.Tensor,
                description="The processed mask to use for the inpainting process.",
            ),
-            InputParam(name="height", required=True),
-            InputParam(name="width", required=True),
-            InputParam(name="dtype", required=True),
+            InputParam.template("height", required=True),
+            InputParam.template("width", required=True),
+            InputParam.template("dtype"),
        ]

    @property
@@ -450,7 +548,28 @@ class QwenImageCreateMaskLatentsStep(ModularPipelineBlocks):
 # ====================


+# auto_docstring
 class QwenImageSetTimestepsStep(ModularPipelineBlocks):
+    """
+    Step that sets the the scheduler's timesteps for text-to-image generation. Should be run after prepare latents
+    step.
+
+      Components:
+          scheduler (`FlowMatchEulerDiscreteScheduler`)
+
+      Inputs:
+          num_inference_steps (`int`, *optional*, defaults to 50):
+              The number of denoising steps.
+          sigmas (`List`, *optional*):
+              Custom sigmas for the denoising process.
+          latents (`Tensor`):
+              The initial random noised latents for the denoising process. Can be generated in prepare latents step.
+
+      Outputs:
+          timesteps (`Tensor`):
+              The timesteps to use for the denoising process
+    """
+
    model_name = "qwenimage"

    @property
@@ -466,13 +585,13 @@ class QwenImageSetTimestepsStep(ModularPipelineBlocks):
    @property
    def inputs(self) -> List[InputParam]:
        return [
-            InputParam(name="num_inference_steps", default=50),
-            InputParam(name="sigmas"),
+            InputParam.template("num_inference_steps"),
+            InputParam.template("sigmas"),
            InputParam(
                name="latents",
                required=True,
                type_hint=torch.Tensor,
-                description="The latents to use for the denoising process, used to calculate the image sequence length.",
+                description="The initial random noised latents for the denoising process. Can be generated in prepare latents step.",
            ),
        ]

@@ -516,7 +635,27 @@ class QwenImageSetTimestepsStep(ModularPipelineBlocks):
        return components, state


+# auto_docstring
 class QwenImageLayeredSetTimestepsStep(ModularPipelineBlocks):
+    """
+    Set timesteps step for QwenImage Layered with custom mu calculation based on image_latents.
+
+      Components:
+          scheduler (`FlowMatchEulerDiscreteScheduler`)
+
+      Inputs:
+          num_inference_steps (`int`, *optional*, defaults to 50):
+              The number of denoising steps.
+          sigmas (`List`, *optional*):
+              Custom sigmas for the denoising process.
+          image_latents (`Tensor`):
+              image latents used to guide the image generation. Can be generated from vae_encoder step.
+
+      Outputs:
+          timesteps (`Tensor`):
+              The timesteps to use for the denoising process.
+    """
+
    model_name = "qwenimage-layered"

    @property
@@ -532,15 +671,17 @@ class QwenImageLayeredSetTimestepsStep(ModularPipelineBlocks):
    @property
    def inputs(self) -> List[InputParam]:
        return [
-            InputParam("num_inference_steps", default=50, type_hint=int),
-            InputParam("sigmas", type_hint=List[float]),
-            InputParam("image_latents", required=True, type_hint=torch.Tensor),
+            InputParam.template("num_inference_steps"),
+            InputParam.template("sigmas"),
+            InputParam.template("image_latents"),
        ]

    @property
    def intermediate_outputs(self) -> List[OutputParam]:
        return [
-            OutputParam(name="timesteps", type_hint=torch.Tensor),
+            OutputParam(
+                name="timesteps", type_hint=torch.Tensor, description="The timesteps to use for the denoising process."
+            ),
        ]

    @torch.no_grad()
@@ -574,7 +715,32 @@ class QwenImageLayeredSetTimestepsStep(ModularPipelineBlocks):
        return components, state


+# auto_docstring
 class QwenImageSetTimestepsWithStrengthStep(ModularPipelineBlocks):
+    """
+    Step that sets the the scheduler's timesteps for image-to-image generation, and inpainting. Should be run after
+    prepare latents step.
+
+      Components:
+          scheduler (`FlowMatchEulerDiscreteScheduler`)
+
+      Inputs:
+          num_inference_steps (`int`, *optional*, defaults to 50):
+              The number of denoising steps.
+          sigmas (`List`, *optional*):
+              Custom sigmas for the denoising process.
+          latents (`Tensor`):
+              The latents to use for the denoising process. Can be generated in prepare latents step.
+          strength (`float`, *optional*, defaults to 0.9):
+              Strength for img2img/inpainting.
+
+      Outputs:
+          timesteps (`Tensor`):
+              The timesteps to use for the denoising process.
+          num_inference_steps (`int`):
+              The number of denoising steps to perform at inference time. Updated based on strength.
+    """
+
    model_name = "qwenimage"

    @property
@@ -590,15 +756,15 @@ class QwenImageSetTimestepsWithStrengthStep(ModularPipelineBlocks):
    @property
    def inputs(self) -> List[InputParam]:
        return [
-            InputParam(name="num_inference_steps", default=50),
-            InputParam(name="sigmas"),
+            InputParam.template("num_inference_steps"),
+            InputParam.template("sigmas"),
            InputParam(
-                name="latents",
+                "latents",
                required=True,
                type_hint=torch.Tensor,
-                description="The latents to use for the denoising process, used to calculate the image sequence length.",
+                description="The latents to use for the denoising process. Can be generated in prepare latents step.",
            ),
-            InputParam(name="strength", default=0.9),
+            InputParam.template("strength", default=0.9),
        ]

    @property
@@ -607,7 +773,12 @@ class QwenImageSetTimestepsWithStrengthStep(ModularPipelineBlocks):
            OutputParam(
                name="timesteps",
                type_hint=torch.Tensor,
-                description="The timesteps to use for the denoising process. Can be generated in set_timesteps step.",
+                description="The timesteps to use for the denoising process.",
+            ),
+            OutputParam(
+                name="num_inference_steps",
+                type_hint=int,
+                description="The number of denoising steps to perform at inference time. Updated based on strength.",
            ),
        ]

@@ -654,7 +825,33 @@ class QwenImageSetTimestepsWithStrengthStep(ModularPipelineBlocks):
 ## RoPE inputs for denoiser


+# auto_docstring
 class QwenImageRoPEInputsStep(ModularPipelineBlocks):
+    """
+    Step that prepares the RoPE inputs for the denoising process. Should be place after prepare_latents step
+
+      Inputs:
+          batch_size (`int`, *optional*, defaults to 1):
+              Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can
+              be generated in input step.
+          height (`int`):
+              The height in pixels of the generated image.
+          width (`int`):
+              The width in pixels of the generated image.
+          prompt_embeds_mask (`Tensor`):
+              mask for the text embeddings. Can be generated from text_encoder step.
+          negative_prompt_embeds_mask (`Tensor`, *optional*):
+              mask for the negative text embeddings. Can be generated from text_encoder step.
+
+      Outputs:
+          img_shapes (`List`):
+              The shapes of the images latents, used for RoPE calculation
+          txt_seq_lens (`List`):
+              The sequence lengths of the prompt embeds, used for RoPE calculation
+          negative_txt_seq_lens (`List`):
+              The sequence lengths of the negative prompt embeds, used for RoPE calculation
+    """
+
    model_name = "qwenimage"

    @property
@@ -666,11 +863,11 @@ class QwenImageRoPEInputsStep(ModularPipelineBlocks):
    @property
    def inputs(self) -> List[InputParam]:
        return [
-            InputParam(name="batch_size", required=True),
-            InputParam(name="height", required=True),
-            InputParam(name="width", required=True),
-            InputParam(name="prompt_embeds_mask"),
-            InputParam(name="negative_prompt_embeds_mask"),
+            InputParam.template("batch_size"),
+            InputParam.template("height", required=True),
+            InputParam.template("width", required=True),
+            InputParam.template("prompt_embeds_mask"),
+            InputParam.template("negative_prompt_embeds_mask"),
        ]

    @property
@@ -702,7 +899,38 @@ class QwenImageRoPEInputsStep(ModularPipelineBlocks):
        return components, state


+# auto_docstring
 class QwenImageEditRoPEInputsStep(ModularPipelineBlocks):
+    """
+    Step that prepares the RoPE inputs for denoising process. This is used in QwenImage Edit. Should be placed after
+    prepare_latents step
+
+      Inputs:
+          batch_size (`int`, *optional*, defaults to 1):
+              Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can
+              be generated in input step.
+          image_height (`int`):
+              The height of the reference image. Can be generated in input step.
+          image_width (`int`):
+              The width of the reference image. Can be generated in input step.
+          height (`int`):
+              The height in pixels of the generated image.
+          width (`int`):
+              The width in pixels of the generated image.
+          prompt_embeds_mask (`Tensor`):
+              mask for the text embeddings. Can be generated from text_encoder step.
+          negative_prompt_embeds_mask (`Tensor`, *optional*):
+              mask for the negative text embeddings. Can be generated from text_encoder step.
+
+      Outputs:
+          img_shapes (`List`):
+              The shapes of the images latents, used for RoPE calculation
+          txt_seq_lens (`List`):
+              The sequence lengths of the prompt embeds, used for RoPE calculation
+          negative_txt_seq_lens (`List`):
+              The sequence lengths of the negative prompt embeds, used for RoPE calculation
+    """
+
    model_name = "qwenimage"

    @property
@@ -712,13 +940,23 @@ class QwenImageEditRoPEInputsStep(ModularPipelineBlocks):
    @property
    def inputs(self) -> List[InputParam]:
        return [
-            InputParam(name="batch_size", required=True),
-            InputParam(name="image_height", required=True),
-            InputParam(name="image_width", required=True),
-            InputParam(name="height", required=True),
-            InputParam(name="width", required=True),
-            InputParam(name="prompt_embeds_mask"),
-            InputParam(name="negative_prompt_embeds_mask"),
+            InputParam.template("batch_size"),
+            InputParam(
+                name="image_height",
+                required=True,
+                type_hint=int,
+                description="The height of the reference image. Can be generated in input step.",
+            ),
+            InputParam(
+                name="image_width",
+                required=True,
+                type_hint=int,
+                description="The width of the reference image. Can be generated in input step.",
+            ),
+            InputParam.template("height", required=True),
+            InputParam.template("width", required=True),
+            InputParam.template("prompt_embeds_mask"),
+            InputParam.template("negative_prompt_embeds_mask"),
        ]

    @property
@@ -756,7 +994,39 @@ class QwenImageEditRoPEInputsStep(ModularPipelineBlocks):
        return components, state


+# auto_docstring
 class QwenImageEditPlusRoPEInputsStep(ModularPipelineBlocks):
+    """
+    Step that prepares the RoPE inputs for denoising process. This is used in QwenImage Edit Plus.
+      Unlike Edit, Edit Plus handles lists of image_height/image_width for multiple reference images. Should be placed
+      after prepare_latents step.
+
+      Inputs:
+          batch_size (`int`, *optional*, defaults to 1):
+              Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can
+              be generated in input step.
+          image_height (`List`):
+              The heights of the reference images. Can be generated in input step.
+          image_width (`List`):
+              The widths of the reference images. Can be generated in input step.
+          height (`int`):
+              The height in pixels of the generated image.
+          width (`int`):
+              The width in pixels of the generated image.
+          prompt_embeds_mask (`Tensor`):
+              mask for the text embeddings. Can be generated from text_encoder step.
+          negative_prompt_embeds_mask (`Tensor`, *optional*):
+              mask for the negative text embeddings. Can be generated from text_encoder step.
+
+      Outputs:
+          img_shapes (`List`):
+              The shapes of the image latents, used for RoPE calculation
+          txt_seq_lens (`List`):
+              The sequence lengths of the prompt embeds, used for RoPE calculation
+          negative_txt_seq_lens (`List`):
+              The sequence lengths of the negative prompt embeds, used for RoPE calculation
+    """
+
    model_name = "qwenimage-edit-plus"

    @property
@@ -770,13 +1040,23 @@ class QwenImageEditPlusRoPEInputsStep(ModularPipelineBlocks):
    @property
    def inputs(self) -> List[InputParam]:
        return [
-            InputParam(name="batch_size", required=True),
-            InputParam(name="image_height", required=True, type_hint=List[int]),
-            InputParam(name="image_width", required=True, type_hint=List[int]),
-            InputParam(name="height", required=True),
-            InputParam(name="width", required=True),
-            InputParam(name="prompt_embeds_mask"),
-            InputParam(name="negative_prompt_embeds_mask"),
+            InputParam.template("batch_size"),
+            InputParam(
+                name="image_height",
+                required=True,
+                type_hint=List[int],
+                description="The heights of the reference images. Can be generated in input step.",
+            ),
+            InputParam(
+                name="image_width",
+                required=True,
+                type_hint=List[int],
+                description="The widths of the reference images. Can be generated in input step.",
+            ),
+            InputParam.template("height", required=True),
+            InputParam.template("width", required=True),
+            InputParam.template("prompt_embeds_mask"),
+            InputParam.template("negative_prompt_embeds_mask"),
        ]

    @property
@@ -832,7 +1112,37 @@ class QwenImageEditPlusRoPEInputsStep(ModularPipelineBlocks):
        return components, state


+# auto_docstring
 class QwenImageLayeredRoPEInputsStep(ModularPipelineBlocks):
+    """
+    Step that prepares the RoPE inputs for the denoising process. Should be place after prepare_latents step
+
+      Inputs:
+          batch_size (`int`, *optional*, defaults to 1):
+              Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can
+              be generated in input step.
+          layers (`int`, *optional*, defaults to 4):
+              Number of layers to extract from the image
+          height (`int`):
+              The height in pixels of the generated image.
+          width (`int`):
+              The width in pixels of the generated image.
+          prompt_embeds_mask (`Tensor`):
+              mask for the text embeddings. Can be generated from text_encoder step.
+          negative_prompt_embeds_mask (`Tensor`, *optional*):
+              mask for the negative text embeddings. Can be generated from text_encoder step.
+
+      Outputs:
+          img_shapes (`List`):
+              The shapes of the image latents, used for RoPE calculation
+          txt_seq_lens (`List`):
+              The sequence lengths of the prompt embeds, used for RoPE calculation
+          negative_txt_seq_lens (`List`):
+              The sequence lengths of the negative prompt embeds, used for RoPE calculation
+          additional_t_cond (`Tensor`):
+              The additional t cond, used for RoPE calculation
+    """
+
    model_name = "qwenimage-layered"

    @property
@@ -844,12 +1154,12 @@ class QwenImageLayeredRoPEInputsStep(ModularPipelineBlocks):
    @property
    def inputs(self) -> List[InputParam]:
        return [
-            InputParam(name="batch_size", required=True),
-            InputParam(name="layers", required=True),
-            InputParam(name="height", required=True),
-            InputParam(name="width", required=True),
-            InputParam(name="prompt_embeds_mask"),
-            InputParam(name="negative_prompt_embeds_mask"),
+            InputParam.template("batch_size"),
+            InputParam.template("layers"),
+            InputParam.template("height", required=True),
+            InputParam.template("width", required=True),
+            InputParam.template("prompt_embeds_mask"),
+            InputParam.template("negative_prompt_embeds_mask"),
        ]

    @property
@@ -914,7 +1224,34 @@ class QwenImageLayeredRoPEInputsStep(ModularPipelineBlocks):


 ## ControlNet inputs for denoiser
+
+
+# auto_docstring
 class QwenImageControlNetBeforeDenoiserStep(ModularPipelineBlocks):
+    """
+    step that prepare inputs for controlnet. Insert before the Denoise Step, after set_timesteps step.
+
+      Components:
+          controlnet (`QwenImageControlNetModel`)
+
+      Inputs:
+          control_guidance_start (`float`, *optional*, defaults to 0.0):
+              When to start applying ControlNet.
+          control_guidance_end (`float`, *optional*, defaults to 1.0):
+              When to stop applying ControlNet.
+          controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0):
+              Scale for ControlNet conditioning.
+          control_image_latents (`Tensor`):
+              The control image latents to use for the denoising process. Can be generated in controlnet vae encoder
+              step.
+          timesteps (`Tensor`):
+              The timesteps to use for the denoising process. Can be generated in set_timesteps step.
+
+      Outputs:
+          controlnet_keep (`List`):
+              The controlnet keep values
+    """
+
    model_name = "qwenimage"

    @property
@@ -930,12 +1267,17 @@ class QwenImageControlNetBeforeDenoiserStep(ModularPipelineBlocks):
    @property
    def inputs(self) -> List[InputParam]:
        return [
-            InputParam("control_guidance_start", default=0.0),
-            InputParam("control_guidance_end", default=1.0),
-            InputParam("controlnet_conditioning_scale", default=1.0),
-            InputParam("control_image_latents", required=True),
+            InputParam.template("control_guidance_start"),
+            InputParam.template("control_guidance_end"),
+            InputParam.template("controlnet_conditioning_scale"),
            InputParam(
-                "timesteps",
+                name="control_image_latents",
+                required=True,
+                type_hint=torch.Tensor,
+                description="The control image latents to use for the denoising process. Can be generated in controlnet vae encoder step.",
+            ),
+            InputParam(
+                name="timesteps",
                required=True,
                type_hint=torch.Tensor,
                description="The timesteps to use for the denoising process. Can be generated in set_timesteps step.",
--- a/src/diffusers/modular_pipelines/qwenimage/decoders.py
+++ b/src/diffusers/modular_pipelines/qwenimage/decoders.py
@@ -12,10 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-from typing import List, Union
+from typing import Any, Dict, List

-import numpy as np
-import PIL
 import torch

 from ...configuration_utils import FrozenDict
@@ -31,7 +29,30 @@ logger = logging.get_logger(__name__)


 # after denoising loop (unpack latents)
+
+
+# auto_docstring
 class QwenImageAfterDenoiseStep(ModularPipelineBlocks):
+    """
+    Step that unpack the latents from 3D tensor (batch_size, sequence_length, channels) into 5D tensor (batch_size,
+    channels, 1, height, width)
+
+      Components:
+          pachifier (`QwenImagePachifier`)
+
+      Inputs:
+          height (`int`):
+              The height in pixels of the generated image.
+          width (`int`):
+              The width in pixels of the generated image.
+          latents (`Tensor`):
+              The latents to decode, can be generated in the denoise step.
+
+      Outputs:
+          latents (`Tensor`):
+              The denoisedlatents unpacked to B, C, 1, H, W
+    """
+
    model_name = "qwenimage"

    @property
@@ -49,13 +70,21 @@ class QwenImageAfterDenoiseStep(ModularPipelineBlocks):
    @property
    def inputs(self) -> List[InputParam]:
        return [
-            InputParam(name="height", required=True),
-            InputParam(name="width", required=True),
+            InputParam.template("height", required=True),
+            InputParam.template("width", required=True),
            InputParam(
                name="latents",
                required=True,
                type_hint=torch.Tensor,
-                description="The latents to decode, can be generated in the denoise step",
+                description="The latents to decode, can be generated in the denoise step.",
+            ),
+        ]
+
+    @property
+    def intermediate_outputs(self) -> List[OutputParam]:
+        return [
+            OutputParam(
+                name="latents", type_hint=torch.Tensor, description="The denoisedlatents unpacked to B, C, 1, H, W"
            ),
        ]

@@ -72,7 +101,29 @@ class QwenImageAfterDenoiseStep(ModularPipelineBlocks):
        return components, state


+# auto_docstring
 class QwenImageLayeredAfterDenoiseStep(ModularPipelineBlocks):
+    """
+    Unpack latents from (B, seq, C*4) to (B, C, layers+1, H, W) after denoising.
+
+      Components:
+          pachifier (`QwenImageLayeredPachifier`)
+
+      Inputs:
+          latents (`Tensor`):
+              The denoised latents to decode, can be generated in the denoise step.
+          height (`int`):
+              The height in pixels of the generated image.
+          width (`int`):
+              The width in pixels of the generated image.
+          layers (`int`, *optional*, defaults to 4):
+              Number of layers to extract from the image
+
+      Outputs:
+          latents (`Tensor`):
+              Denoised latents. (unpacked to B, C, layers+1, H, W)
+    """
+
    model_name = "qwenimage-layered"

    @property
@@ -88,10 +139,21 @@ class QwenImageLayeredAfterDenoiseStep(ModularPipelineBlocks):
    @property
    def inputs(self) -> List[InputParam]:
        return [
-            InputParam("latents", required=True, type_hint=torch.Tensor),
-            InputParam("height", required=True, type_hint=int),
-            InputParam("width", required=True, type_hint=int),
-            InputParam("layers", required=True, type_hint=int),
+            InputParam(
+                name="latents",
+                required=True,
+                type_hint=torch.Tensor,
+                description="The denoised latents to decode, can be generated in the denoise step.",
+            ),
+            InputParam.template("height", required=True),
+            InputParam.template("width", required=True),
+            InputParam.template("layers"),
+        ]
+
+    @property
+    def intermediate_outputs(self) -> List[OutputParam]:
+        return [
+            OutputParam.template("latents", note="unpacked to B, C, layers+1, H, W"),
        ]

    @torch.no_grad()
@@ -112,7 +174,26 @@ class QwenImageLayeredAfterDenoiseStep(ModularPipelineBlocks):


 # decode step
+
+
+# auto_docstring
 class QwenImageDecoderStep(ModularPipelineBlocks):
+    """
+    Step that decodes the latents to images
+
+      Components:
+          vae (`AutoencoderKLQwenImage`)
+
+      Inputs:
+          latents (`Tensor`):
+              The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise
+              step.
+
+      Outputs:
+          images (`List`):
+              Generated images. (tensor output of the vae decoder.)
+    """
+
    model_name = "qwenimage"

    @property
@@ -134,19 +215,13 @@ class QwenImageDecoderStep(ModularPipelineBlocks):
                name="latents",
                required=True,
                type_hint=torch.Tensor,
-                description="The latents to decode, can be generated in the denoise step",
+                description="The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise step.",
            ),
        ]

    @property
-    def intermediate_outputs(self) -> List[str]:
-        return [
-            OutputParam(
-                "images",
-                type_hint=Union[List[PIL.Image.Image], List[torch.Tensor], List[np.array]],
-                description="The generated images, can be a PIL.Image.Image, torch.Tensor or a numpy array",
-            )
-        ]
+    def intermediate_outputs(self) -> List[OutputParam]:
+        return [OutputParam.template("images", note="tensor output of the vae decoder.")]

    @torch.no_grad()
    def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState:
@@ -176,7 +251,26 @@ class QwenImageDecoderStep(ModularPipelineBlocks):
        return components, state


+# auto_docstring
 class QwenImageLayeredDecoderStep(ModularPipelineBlocks):
+    """
+    Decode unpacked latents (B, C, layers+1, H, W) into layer images.
+
+      Components:
+          vae (`AutoencoderKLQwenImage`) image_processor (`VaeImageProcessor`)
+
+      Inputs:
+          latents (`Tensor`):
+              The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise
+              step.
+          output_type (`str`, *optional*, defaults to pil):
+              Output format: 'pil', 'np', 'pt'.
+
+      Outputs:
+          images (`List`):
+              Generated images.
+    """
+
    model_name = "qwenimage-layered"

    @property
@@ -198,14 +292,19 @@ class QwenImageLayeredDecoderStep(ModularPipelineBlocks):
    @property
    def inputs(self) -> List[InputParam]:
        return [
-            InputParam("latents", required=True, type_hint=torch.Tensor),
-            InputParam("output_type", default="pil", type_hint=str),
+            InputParam(
+                name="latents",
+                required=True,
+                type_hint=torch.Tensor,
+                description="The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise step.",
+            ),
+            InputParam.template("output_type"),
        ]

    @property
    def intermediate_outputs(self) -> List[OutputParam]:
        return [
-            OutputParam(name="images", type_hint=List[List[PIL.Image.Image]]),
+            OutputParam.template("images"),
        ]

    @torch.no_grad()
@@ -251,7 +350,27 @@ class QwenImageLayeredDecoderStep(ModularPipelineBlocks):


 # postprocess the decoded images
+
+
+# auto_docstring
 class QwenImageProcessImagesOutputStep(ModularPipelineBlocks):
+    """
+    postprocess the generated image
+
+      Components:
+          image_processor (`VaeImageProcessor`)
+
+      Inputs:
+          images (`Tensor`):
+              the generated image tensor from decoders step
+          output_type (`str`, *optional*, defaults to pil):
+              Output format: 'pil', 'np', 'pt'.
+
+      Outputs:
+          images (`List`):
+              Generated images.
+    """
+
    model_name = "qwenimage"

    @property
@@ -272,15 +391,19 @@ class QwenImageProcessImagesOutputStep(ModularPipelineBlocks):
    @property
    def inputs(self) -> List[InputParam]:
        return [
-            InputParam("images", required=True, description="the generated image from decoders step"),
            InputParam(
-                name="output_type",
-                default="pil",
-                type_hint=str,
-                description="The type of the output images, can be 'pil', 'np', 'pt'",
+                name="images",
+                required=True,
+                type_hint=torch.Tensor,
+                description="the generated image tensor from decoders step",
            ),
+            InputParam.template("output_type"),
        ]

+    @property
+    def intermediate_outputs(self) -> List[OutputParam]:
+        return [OutputParam.template("images")]
+
    @staticmethod
    def check_inputs(output_type):
        if output_type not in ["pil", "np", "pt"]:
@@ -301,7 +424,28 @@ class QwenImageProcessImagesOutputStep(ModularPipelineBlocks):
        return components, state


+# auto_docstring
 class QwenImageInpaintProcessImagesOutputStep(ModularPipelineBlocks):
+    """
+    postprocess the generated image, optional apply the mask overally to the original image..
+
+      Components:
+          image_mask_processor (`InpaintProcessor`)
+
+      Inputs:
+          images (`Tensor`):
+              the generated image tensor from decoders step
+          output_type (`str`, *optional*, defaults to pil):
+              Output format: 'pil', 'np', 'pt'.
+          mask_overlay_kwargs (`Dict`, *optional*):
+              The kwargs for the postprocess step to apply the mask overlay. generated in
+              InpaintProcessImagesInputStep.
+
+      Outputs:
+          images (`List`):
+              Generated images.
+    """
+
    model_name = "qwenimage"

    @property
@@ -322,16 +466,24 @@ class QwenImageInpaintProcessImagesOutputStep(ModularPipelineBlocks):
    @property
    def inputs(self) -> List[InputParam]:
        return [
-            InputParam("images", required=True, description="the generated image from decoders step"),
            InputParam(
-                name="output_type",
-                default="pil",
-                type_hint=str,
-                description="The type of the output images, can be 'pil', 'np', 'pt'",
+                name="images",
+                required=True,
+                type_hint=torch.Tensor,
+                description="the generated image tensor from decoders step",
+            ),
+            InputParam.template("output_type"),
+            InputParam(
+                name="mask_overlay_kwargs",
+                type_hint=Dict[str, Any],
+                description="The kwargs for the postprocess step to apply the mask overlay. generated in InpaintProcessImagesInputStep.",
            ),
-            InputParam("mask_overlay_kwargs"),
        ]

+    @property
+    def intermediate_outputs(self) -> List[OutputParam]:
+        return [OutputParam.template("images")]
+
    @staticmethod
    def check_inputs(output_type, mask_overlay_kwargs):
        if output_type not in ["pil", "np", "pt"]:
--- a/src/diffusers/modular_pipelines/qwenimage/denoise.py
+++ b/src/diffusers/modular_pipelines/qwenimage/denoise.py
@@ -50,7 +50,7 @@ class QwenImageLoopBeforeDenoiser(ModularPipelineBlocks):
    def inputs(self) -> List[InputParam]:
        return [
            InputParam(
-                "latents",
+                name="latents",
                required=True,
                type_hint=torch.Tensor,
                description="The initial latents to use for the denoising process. Can be generated in prepare_latent step.",
@@ -80,17 +80,12 @@ class QwenImageEditLoopBeforeDenoiser(ModularPipelineBlocks):
    def inputs(self) -> List[InputParam]:
        return [
            InputParam(
-                "latents",
+                name="latents",
                required=True,
                type_hint=torch.Tensor,
                description="The initial latents to use for the denoising process. Can be generated in prepare_latent step.",
            ),
-            InputParam(
-                "image_latents",
-                required=True,
-                type_hint=torch.Tensor,
-                description="The initial image latents to use for the denoising process. Can be encoded in vae_encoder step and packed in prepare_image_latents step.",
-            ),
+            InputParam.template("image_latents"),
        ]

    @torch.no_grad()
@@ -134,29 +129,12 @@ class QwenImageLoopBeforeDenoiserControlNet(ModularPipelineBlocks):
                type_hint=torch.Tensor,
                description="The control image to use for the denoising process. Can be generated in prepare_controlnet_inputs step.",
            ),
+            InputParam.template("controlnet_conditioning_scale", note="updated in prepare_controlnet_inputs step."),
            InputParam(
-                "controlnet_conditioning_scale",
-                type_hint=float,
-                description="The controlnet conditioning scale value to use for the denoising process. Can be generated in prepare_controlnet_inputs step.",
-            ),
-            InputParam(
-                "controlnet_keep",
+                name="controlnet_keep",
                required=True,
                type_hint=List[float],
-                description="The controlnet keep values to use for the denoising process. Can be generated in prepare_controlnet_inputs step.",
-            ),
-            InputParam(
-                "num_inference_steps",
-                required=True,
-                type_hint=int,
-                description="The number of inference steps to use for the denoising process. Can be generated in set_timesteps step.",
-            ),
-            InputParam(
-                kwargs_type="denoiser_input_fields",
-                description=(
-                    "All conditional model inputs for the denoiser. "
-                    "It should contain prompt_embeds/negative_prompt_embeds."
-                ),
+                description="The controlnet keep values. Can be generated in prepare_controlnet_inputs step.",
            ),
        ]

@@ -217,28 +195,13 @@ class QwenImageLoopDenoiser(ModularPipelineBlocks):
    @property
    def inputs(self) -> List[InputParam]:
        return [
-            InputParam("attention_kwargs"),
-            InputParam(
-                "latents",
-                required=True,
-                type_hint=torch.Tensor,
-                description="The latents to use for the denoising process. Can be generated in prepare_latents step.",
-            ),
-            InputParam(
-                "num_inference_steps",
-                required=True,
-                type_hint=int,
-                description="The number of inference steps to use for the denoising process. Can be generated in set_timesteps step.",
-            ),
-            InputParam(
-                kwargs_type="denoiser_input_fields",
-                description="conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.",
-            ),
+            InputParam.template("attention_kwargs"),
+            InputParam.template("denoiser_input_fields"),
            InputParam(
                "img_shapes",
                required=True,
                type_hint=List[Tuple[int, int]],
-                description="The shape of the image latents for RoPE calculation. Can be generated in prepare_additional_inputs step.",
+                description="The shape of the image latents for RoPE calculation. can be generated in prepare_additional_inputs step.",
            ),
        ]

@@ -317,23 +280,8 @@ class QwenImageEditLoopDenoiser(ModularPipelineBlocks):
    @property
    def inputs(self) -> List[InputParam]:
        return [
-            InputParam("attention_kwargs"),
-            InputParam(
-                "latents",
-                required=True,
-                type_hint=torch.Tensor,
-                description="The latents to use for the denoising process. Can be generated in prepare_latents step.",
-            ),
-            InputParam(
-                "num_inference_steps",
-                required=True,
-                type_hint=int,
-                description="The number of inference steps to use for the denoising process. Can be generated in set_timesteps step.",
-            ),
-            InputParam(
-                kwargs_type="denoiser_input_fields",
-                description="conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.",
-            ),
+            InputParam.template("attention_kwargs"),
+            InputParam.template("denoiser_input_fields"),
            InputParam(
                "img_shapes",
                required=True,
@@ -415,7 +363,7 @@ class QwenImageLoopAfterDenoiser(ModularPipelineBlocks):
    @property
    def intermediate_outputs(self) -> List[OutputParam]:
        return [
-            OutputParam("latents", type_hint=torch.Tensor, description="The denoised latents."),
+            OutputParam.template("latents"),
        ]

    @torch.no_grad()
@@ -456,24 +404,19 @@ class QwenImageLoopAfterDenoiserInpaint(ModularPipelineBlocks):
                type_hint=torch.Tensor,
                description="The mask to use for the inpainting process. Can be generated in inpaint prepare latents step.",
            ),
-            InputParam(
-                "image_latents",
-                required=True,
-                type_hint=torch.Tensor,
-                description="The image latents to use for the inpainting process. Can be generated in inpaint prepare latents step.",
-            ),
+            InputParam.template("image_latents"),
            InputParam(
                "initial_noise",
                required=True,
                type_hint=torch.Tensor,
                description="The initial noise to use for the inpainting process. Can be generated in inpaint prepare latents step.",
            ),
-            InputParam(
-                "timesteps",
-                required=True,
-                type_hint=torch.Tensor,
-                description="The timesteps to use for the denoising process. Can be generated in set_timesteps step.",
-            ),
+        ]
+
+    @property
+    def intermediate_outputs(self) -> List[OutputParam]:
+        return [
+            OutputParam.template("latents"),
        ]

    @torch.no_grad()
@@ -515,17 +458,12 @@ class QwenImageDenoiseLoopWrapper(LoopSequentialPipelineBlocks):
    def loop_inputs(self) -> List[InputParam]:
        return [
            InputParam(
-                "timesteps",
+                name="timesteps",
                required=True,
                type_hint=torch.Tensor,
                description="The timesteps to use for the denoising process. Can be generated in set_timesteps step.",
            ),
-            InputParam(
-                "num_inference_steps",
-                required=True,
-                type_hint=int,
-                description="The number of inference steps to use for the denoising process. Can be generated in set_timesteps step.",
-            ),
+            InputParam.template("num_inference_steps", required=True),
        ]

    @torch.no_grad()
@@ -557,7 +495,42 @@ class QwenImageDenoiseLoopWrapper(LoopSequentialPipelineBlocks):


 # Qwen Image (text2image, image2image)
+
+
+# auto_docstring
 class QwenImageDenoiseStep(QwenImageDenoiseLoopWrapper):
+    """
+    Denoise step that iteratively denoise the latents.
+      Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method At each iteration, it runs blocks
+      defined in `sub_blocks` sequencially:
+       - `QwenImageLoopBeforeDenoiser`
+       - `QwenImageLoopDenoiser`
+       - `QwenImageLoopAfterDenoiser`
+      This block supports text2image and image2image tasks for QwenImage.
+
+      Components:
+          guider (`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`) scheduler
+          (`FlowMatchEulerDiscreteScheduler`)
+
+      Inputs:
+          timesteps (`Tensor`):
+              The timesteps to use for the denoising process. Can be generated in set_timesteps step.
+          num_inference_steps (`int`):
+              The number of denoising steps.
+          latents (`Tensor`):
+              The initial latents to use for the denoising process. Can be generated in prepare_latent step.
+          attention_kwargs (`Dict`, *optional*):
+              Additional kwargs for attention processors.
+          **denoiser_input_fields (`None`, *optional*):
+              conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
+          img_shapes (`List`):
+              The shape of the image latents for RoPE calculation. can be generated in prepare_additional_inputs step.
+
+      Outputs:
+          latents (`Tensor`):
+              Denoised latents.
+    """
+
    model_name = "qwenimage"

    block_classes = [
@@ -570,8 +543,8 @@ class QwenImageDenoiseStep(QwenImageDenoiseLoopWrapper):
    @property
    def description(self) -> str:
        return (
-            "Denoise step that iteratively denoise the latents. \n"
-            "Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method \n"
+            "Denoise step that iteratively denoise the latents.\n"
+            "Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method\n"
            "At each iteration, it runs blocks defined in `sub_blocks` sequencially:\n"
            " - `QwenImageLoopBeforeDenoiser`\n"
            " - `QwenImageLoopDenoiser`\n"
@@ -581,7 +554,47 @@ class QwenImageDenoiseStep(QwenImageDenoiseLoopWrapper):


 # Qwen Image (inpainting)
+# auto_docstring
 class QwenImageInpaintDenoiseStep(QwenImageDenoiseLoopWrapper):
+    """
+    Denoise step that iteratively denoise the latents.
+      Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method At each iteration, it runs blocks
+      defined in `sub_blocks` sequencially:
+       - `QwenImageLoopBeforeDenoiser`
+       - `QwenImageLoopDenoiser`
+       - `QwenImageLoopAfterDenoiser`
+       - `QwenImageLoopAfterDenoiserInpaint`
+      This block supports inpainting tasks for QwenImage.
+
+      Components:
+          guider (`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`) scheduler
+          (`FlowMatchEulerDiscreteScheduler`)
+
+      Inputs:
+          timesteps (`Tensor`):
+              The timesteps to use for the denoising process. Can be generated in set_timesteps step.
+          num_inference_steps (`int`):
+              The number of denoising steps.
+          latents (`Tensor`):
+              The initial latents to use for the denoising process. Can be generated in prepare_latent step.
+          attention_kwargs (`Dict`, *optional*):
+              Additional kwargs for attention processors.
+          **denoiser_input_fields (`None`, *optional*):
+              conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
+          img_shapes (`List`):
+              The shape of the image latents for RoPE calculation. can be generated in prepare_additional_inputs step.
+          mask (`Tensor`):
+              The mask to use for the inpainting process. Can be generated in inpaint prepare latents step.
+          image_latents (`Tensor`):
+              image latents used to guide the image generation. Can be generated from vae_encoder step.
+          initial_noise (`Tensor`):
+              The initial noise to use for the inpainting process. Can be generated in inpaint prepare latents step.
+
+      Outputs:
+          latents (`Tensor`):
+              Denoised latents.
+    """
+
    model_name = "qwenimage"
    block_classes = [
        QwenImageLoopBeforeDenoiser,
@@ -606,7 +619,47 @@ class QwenImageInpaintDenoiseStep(QwenImageDenoiseLoopWrapper):


 # Qwen Image (text2image, image2image) with controlnet
+# auto_docstring
 class QwenImageControlNetDenoiseStep(QwenImageDenoiseLoopWrapper):
+    """
+    Denoise step that iteratively denoise the latents.
+      Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method At each iteration, it runs blocks
+      defined in `sub_blocks` sequencially:
+       - `QwenImageLoopBeforeDenoiser`
+       - `QwenImageLoopBeforeDenoiserControlNet`
+       - `QwenImageLoopDenoiser`
+       - `QwenImageLoopAfterDenoiser`
+      This block supports text2img/img2img tasks with controlnet for QwenImage.
+
+      Components:
+          guider (`ClassifierFreeGuidance`) controlnet (`QwenImageControlNetModel`) transformer
+          (`QwenImageTransformer2DModel`) scheduler (`FlowMatchEulerDiscreteScheduler`)
+
+      Inputs:
+          timesteps (`Tensor`):
+              The timesteps to use for the denoising process. Can be generated in set_timesteps step.
+          num_inference_steps (`int`):
+              The number of denoising steps.
+          latents (`Tensor`):
+              The initial latents to use for the denoising process. Can be generated in prepare_latent step.
+          control_image_latents (`Tensor`):
+              The control image to use for the denoising process. Can be generated in prepare_controlnet_inputs step.
+          controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0):
+              Scale for ControlNet conditioning. (updated in prepare_controlnet_inputs step.)
+          controlnet_keep (`List`):
+              The controlnet keep values. Can be generated in prepare_controlnet_inputs step.
+          attention_kwargs (`Dict`, *optional*):
+              Additional kwargs for attention processors.
+          **denoiser_input_fields (`None`, *optional*):
+              conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
+          img_shapes (`List`):
+              The shape of the image latents for RoPE calculation. can be generated in prepare_additional_inputs step.
+
+      Outputs:
+          latents (`Tensor`):
+              Denoised latents.
+    """
+
    model_name = "qwenimage"
    block_classes = [
        QwenImageLoopBeforeDenoiser,
@@ -631,7 +684,54 @@ class QwenImageControlNetDenoiseStep(QwenImageDenoiseLoopWrapper):


 # Qwen Image (inpainting) with controlnet
+# auto_docstring
 class QwenImageInpaintControlNetDenoiseStep(QwenImageDenoiseLoopWrapper):
+    """
+    Denoise step that iteratively denoise the latents.
+      Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method At each iteration, it runs blocks
+      defined in `sub_blocks` sequencially:
+       - `QwenImageLoopBeforeDenoiser`
+       - `QwenImageLoopBeforeDenoiserControlNet`
+       - `QwenImageLoopDenoiser`
+       - `QwenImageLoopAfterDenoiser`
+       - `QwenImageLoopAfterDenoiserInpaint`
+      This block supports inpainting tasks with controlnet for QwenImage.
+
+      Components:
+          guider (`ClassifierFreeGuidance`) controlnet (`QwenImageControlNetModel`) transformer
+          (`QwenImageTransformer2DModel`) scheduler (`FlowMatchEulerDiscreteScheduler`)
+
+      Inputs:
+          timesteps (`Tensor`):
+              The timesteps to use for the denoising process. Can be generated in set_timesteps step.
+          num_inference_steps (`int`):
+              The number of denoising steps.
+          latents (`Tensor`):
+              The initial latents to use for the denoising process. Can be generated in prepare_latent step.
+          control_image_latents (`Tensor`):
+              The control image to use for the denoising process. Can be generated in prepare_controlnet_inputs step.
+          controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0):
+              Scale for ControlNet conditioning. (updated in prepare_controlnet_inputs step.)
+          controlnet_keep (`List`):
+              The controlnet keep values. Can be generated in prepare_controlnet_inputs step.
+          attention_kwargs (`Dict`, *optional*):
+              Additional kwargs for attention processors.
+          **denoiser_input_fields (`None`, *optional*):
+              conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
+          img_shapes (`List`):
+              The shape of the image latents for RoPE calculation. can be generated in prepare_additional_inputs step.
+          mask (`Tensor`):
+              The mask to use for the inpainting process. Can be generated in inpaint prepare latents step.
+          image_latents (`Tensor`):
+              image latents used to guide the image generation. Can be generated from vae_encoder step.
+          initial_noise (`Tensor`):
+              The initial noise to use for the inpainting process. Can be generated in inpaint prepare latents step.
+
+      Outputs:
+          latents (`Tensor`):
+              Denoised latents.
+    """
+
    model_name = "qwenimage"
    block_classes = [
        QwenImageLoopBeforeDenoiser,
@@ -664,7 +764,42 @@ class QwenImageInpaintControlNetDenoiseStep(QwenImageDenoiseLoopWrapper):


 # Qwen Image Edit (image2image)
+# auto_docstring
 class QwenImageEditDenoiseStep(QwenImageDenoiseLoopWrapper):
+    """
+    Denoise step that iteratively denoise the latents.
+      Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method At each iteration, it runs blocks
+      defined in `sub_blocks` sequencially:
+       - `QwenImageEditLoopBeforeDenoiser`
+       - `QwenImageEditLoopDenoiser`
+       - `QwenImageLoopAfterDenoiser`
+      This block supports QwenImage Edit.
+
+      Components:
+          guider (`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`) scheduler
+          (`FlowMatchEulerDiscreteScheduler`)
+
+      Inputs:
+          timesteps (`Tensor`):
+              The timesteps to use for the denoising process. Can be generated in set_timesteps step.
+          num_inference_steps (`int`):
+              The number of denoising steps.
+          latents (`Tensor`):
+              The initial latents to use for the denoising process. Can be generated in prepare_latent step.
+          image_latents (`Tensor`):
+              image latents used to guide the image generation. Can be generated from vae_encoder step.
+          attention_kwargs (`Dict`, *optional*):
+              Additional kwargs for attention processors.
+          **denoiser_input_fields (`None`, *optional*):
+              conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
+          img_shapes (`List`):
+              The shape of the image latents for RoPE calculation. Can be generated in prepare_additional_inputs step.
+
+      Outputs:
+          latents (`Tensor`):
+              Denoised latents.
+    """
+
    model_name = "qwenimage-edit"
    block_classes = [
        QwenImageEditLoopBeforeDenoiser,
@@ -687,7 +822,47 @@ class QwenImageEditDenoiseStep(QwenImageDenoiseLoopWrapper):


 # Qwen Image Edit (inpainting)
+# auto_docstring
 class QwenImageEditInpaintDenoiseStep(QwenImageDenoiseLoopWrapper):
+    """
+    Denoise step that iteratively denoise the latents.
+      Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method At each iteration, it runs blocks
+      defined in `sub_blocks` sequencially:
+       - `QwenImageEditLoopBeforeDenoiser`
+       - `QwenImageEditLoopDenoiser`
+       - `QwenImageLoopAfterDenoiser`
+       - `QwenImageLoopAfterDenoiserInpaint`
+      This block supports inpainting tasks for QwenImage Edit.
+
+      Components:
+          guider (`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`) scheduler
+          (`FlowMatchEulerDiscreteScheduler`)
+
+      Inputs:
+          timesteps (`Tensor`):
+              The timesteps to use for the denoising process. Can be generated in set_timesteps step.
+          num_inference_steps (`int`):
+              The number of denoising steps.
+          latents (`Tensor`):
+              The initial latents to use for the denoising process. Can be generated in prepare_latent step.
+          image_latents (`Tensor`):
+              image latents used to guide the image generation. Can be generated from vae_encoder step.
+          attention_kwargs (`Dict`, *optional*):
+              Additional kwargs for attention processors.
+          **denoiser_input_fields (`None`, *optional*):
+              conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
+          img_shapes (`List`):
+              The shape of the image latents for RoPE calculation. Can be generated in prepare_additional_inputs step.
+          mask (`Tensor`):
+              The mask to use for the inpainting process. Can be generated in inpaint prepare latents step.
+          initial_noise (`Tensor`):
+              The initial noise to use for the inpainting process. Can be generated in inpaint prepare latents step.
+
+      Outputs:
+          latents (`Tensor`):
+              Denoised latents.
+    """
+
    model_name = "qwenimage-edit"
    block_classes = [
        QwenImageEditLoopBeforeDenoiser,
@@ -712,7 +887,42 @@ class QwenImageEditInpaintDenoiseStep(QwenImageDenoiseLoopWrapper):


 # Qwen Image Layered (image2image)
+# auto_docstring
 class QwenImageLayeredDenoiseStep(QwenImageDenoiseLoopWrapper):
+    """
+    Denoise step that iteratively denoise the latents.
+      Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method At each iteration, it runs blocks
+      defined in `sub_blocks` sequencially:
+       - `QwenImageEditLoopBeforeDenoiser`
+       - `QwenImageEditLoopDenoiser`
+       - `QwenImageLoopAfterDenoiser`
+      This block supports QwenImage Layered.
+
+      Components:
+          guider (`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`) scheduler
+          (`FlowMatchEulerDiscreteScheduler`)
+
+      Inputs:
+          timesteps (`Tensor`):
+              The timesteps to use for the denoising process. Can be generated in set_timesteps step.
+          num_inference_steps (`int`):
+              The number of denoising steps.
+          latents (`Tensor`):
+              The initial latents to use for the denoising process. Can be generated in prepare_latent step.
+          image_latents (`Tensor`):
+              image latents used to guide the image generation. Can be generated from vae_encoder step.
+          attention_kwargs (`Dict`, *optional*):
+              Additional kwargs for attention processors.
+          **denoiser_input_fields (`None`, *optional*):
+              conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
+          img_shapes (`List`):
+              The shape of the image latents for RoPE calculation. Can be generated in prepare_additional_inputs step.
+
+      Outputs:
+          latents (`Tensor`):
+              Denoised latents.
+    """
+
    model_name = "qwenimage-layered"
    block_classes = [
        QwenImageEditLoopBeforeDenoiser,
--- a/src/diffusers/modular_pipelines/qwenimage/encoders.py
+++ b/src/diffusers/modular_pipelines/qwenimage/encoders.py
--- a/src/diffusers/modular_pipelines/qwenimage/inputs.py
+++ b/src/diffusers/modular_pipelines/qwenimage/inputs.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-from typing import List, Tuple
+from typing import List, Optional, Tuple

 import torch

@@ -109,7 +109,44 @@ def calculate_dimension_from_latents(latents: torch.Tensor, vae_scale_factor: in
    return height, width


+# auto_docstring
 class QwenImageTextInputsStep(ModularPipelineBlocks):
+    """
+    Text input processing step that standardizes text embeddings for the pipeline.
+      This step:
+        1. Determines `batch_size` and `dtype` based on `prompt_embeds`
+        2. Ensures all text embeddings have consistent batch sizes (batch_size * num_images_per_prompt)
+
+      This block should be placed after all encoder steps to process the text embeddings before they are used in
+      subsequent pipeline steps.
+
+      Inputs:
+          num_images_per_prompt (`int`, *optional*, defaults to 1):
+              The number of images to generate per prompt.
+          prompt_embeds (`Tensor`):
+              text embeddings used to guide the image generation. Can be generated from text_encoder step.
+          prompt_embeds_mask (`Tensor`):
+              mask for the text embeddings. Can be generated from text_encoder step.
+          negative_prompt_embeds (`Tensor`, *optional*):
+              negative text embeddings used to guide the image generation. Can be generated from text_encoder step.
+          negative_prompt_embeds_mask (`Tensor`, *optional*):
+              mask for the negative text embeddings. Can be generated from text_encoder step.
+
+      Outputs:
+          batch_size (`int`):
+              The batch size of the prompt embeddings
+          dtype (`dtype`):
+              The data type of the prompt embeddings
+          prompt_embeds (`Tensor`):
+              The prompt embeddings. (batch-expanded)
+          prompt_embeds_mask (`Tensor`):
+              The encoder attention mask. (batch-expanded)
+          negative_prompt_embeds (`Tensor`):
+              The negative prompt embeddings. (batch-expanded)
+          negative_prompt_embeds_mask (`Tensor`):
+              The negative prompt embeddings mask. (batch-expanded)
+    """
+
    model_name = "qwenimage"

    @property
@@ -129,26 +166,22 @@ class QwenImageTextInputsStep(ModularPipelineBlocks):
    @property
    def inputs(self) -> List[InputParam]:
        return [
-            InputParam(name="num_images_per_prompt", default=1),
-            InputParam(name="prompt_embeds", required=True, kwargs_type="denoiser_input_fields"),
-            InputParam(name="prompt_embeds_mask", required=True, kwargs_type="denoiser_input_fields"),
-            InputParam(name="negative_prompt_embeds", kwargs_type="denoiser_input_fields"),
-            InputParam(name="negative_prompt_embeds_mask", kwargs_type="denoiser_input_fields"),
+            InputParam.template("num_images_per_prompt"),
+            InputParam.template("prompt_embeds"),
+            InputParam.template("prompt_embeds_mask"),
+            InputParam.template("negative_prompt_embeds"),
+            InputParam.template("negative_prompt_embeds_mask"),
        ]

    @property
-    def intermediate_outputs(self) -> List[str]:
+    def intermediate_outputs(self) -> List[OutputParam]:
        return [
-            OutputParam(
-                "batch_size",
-                type_hint=int,
-                description="Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt",
-            ),
-            OutputParam(
-                "dtype",
-                type_hint=torch.dtype,
-                description="Data type of model tensor inputs (determined by `prompt_embeds`)",
-            ),
+            OutputParam(name="batch_size", type_hint=int, description="The batch size of the prompt embeddings"),
+            OutputParam(name="dtype", type_hint=torch.dtype, description="The data type of the prompt embeddings"),
+            OutputParam.template("prompt_embeds", note="batch-expanded"),
+            OutputParam.template("prompt_embeds_mask", note="batch-expanded"),
+            OutputParam.template("negative_prompt_embeds", note="batch-expanded"),
+            OutputParam.template("negative_prompt_embeds_mask", note="batch-expanded"),
        ]

    @staticmethod
@@ -221,20 +254,76 @@ class QwenImageTextInputsStep(ModularPipelineBlocks):
        return components, state


+# auto_docstring
 class QwenImageAdditionalInputsStep(ModularPipelineBlocks):
-    """Input step for QwenImage: update height/width, expand batch, patchify."""
+    """
+    Input processing step that:
+        1. For image latent inputs: Updates height/width if None, patchifies, and expands batch size
+        2. For additional batch inputs: Expands batch dimensions to match final batch size
+
+      Configured inputs:
+        - Image latent inputs: ['image_latents']
+
+      This block should be placed after the encoder steps and the text input step.
+
+      Components:
+          pachifier (`QwenImagePachifier`)
+
+      Inputs:
+          num_images_per_prompt (`int`, *optional*, defaults to 1):
+              The number of images to generate per prompt.
+          batch_size (`int`, *optional*, defaults to 1):
+              Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can
+              be generated in input step.
+          height (`int`, *optional*):
+              The height in pixels of the generated image.
+          width (`int`, *optional*):
+              The width in pixels of the generated image.
+          image_latents (`Tensor`):
+              image latents used to guide the image generation. Can be generated from vae_encoder step.
+
+      Outputs:
+          image_height (`int`):
+              The image height calculated from the image latents dimension
+          image_width (`int`):
+              The image width calculated from the image latents dimension
+          height (`int`):
+              if not provided, updated to image height
+          width (`int`):
+              if not provided, updated to image width
+          image_latents (`Tensor`):
+              image latents used to guide the image generation. Can be generated from vae_encoder step. (patchified and
+              batch-expanded)
+    """

    model_name = "qwenimage"

    def __init__(
        self,
-        image_latent_inputs: List[str] = ["image_latents"],
-        additional_batch_inputs: List[str] = [],
+        image_latent_inputs: Optional[List[InputParam]] = None,
+        additional_batch_inputs: Optional[List[InputParam]] = None,
    ):
+        # by default, process `image_latents`
+        if image_latent_inputs is None:
+            image_latent_inputs = [InputParam.template("image_latents")]
+        if additional_batch_inputs is None:
+            additional_batch_inputs = []
+
        if not isinstance(image_latent_inputs, list):
-            image_latent_inputs = [image_latent_inputs]
+            raise ValueError(f"image_latent_inputs must be a list, but got {type(image_latent_inputs)}")
+        else:
+            for input_param in image_latent_inputs:
+                if not isinstance(input_param, InputParam):
+                    raise ValueError(f"image_latent_inputs must be a list of InputParam, but got {type(input_param)}")
+
        if not isinstance(additional_batch_inputs, list):
-            additional_batch_inputs = [additional_batch_inputs]
+            raise ValueError(f"additional_batch_inputs must be a list, but got {type(additional_batch_inputs)}")
+        else:
+            for input_param in additional_batch_inputs:
+                if not isinstance(input_param, InputParam):
+                    raise ValueError(
+                        f"additional_batch_inputs must be a list of InputParam, but got {type(input_param)}"
+                    )

        self._image_latent_inputs = image_latent_inputs
        self._additional_batch_inputs = additional_batch_inputs
@@ -252,9 +341,9 @@ class QwenImageAdditionalInputsStep(ModularPipelineBlocks):
        if self._image_latent_inputs or self._additional_batch_inputs:
            inputs_info = "\n\nConfigured inputs:"
            if self._image_latent_inputs:
-                inputs_info += f"\n  - Image latent inputs: {self._image_latent_inputs}"
+                inputs_info += f"\n  - Image latent inputs: {[p.name for p in self._image_latent_inputs]}"
            if self._additional_batch_inputs:
-                inputs_info += f"\n  - Additional batch inputs: {self._additional_batch_inputs}"
+                inputs_info += f"\n  - Additional batch inputs: {[p.name for p in self._additional_batch_inputs]}"

        placement_section = "\n\nThis block should be placed after the encoder steps and the text input step."

@@ -269,23 +358,19 @@ class QwenImageAdditionalInputsStep(ModularPipelineBlocks):
    @property
    def inputs(self) -> List[InputParam]:
        inputs = [
-            InputParam(name="num_images_per_prompt", default=1),
-            InputParam(name="batch_size", required=True),
-            InputParam(name="height"),
-            InputParam(name="width"),
+            InputParam.template("num_images_per_prompt"),
+            InputParam.template("batch_size"),
+            InputParam.template("height"),
+            InputParam.template("width"),
        ]
-
-        for image_latent_input_name in self._image_latent_inputs:
-            inputs.append(InputParam(name=image_latent_input_name))
-
-        for input_name in self._additional_batch_inputs:
-            inputs.append(InputParam(name=input_name))
+        # default is `image_latents`
+        inputs += self._image_latent_inputs + self._additional_batch_inputs

        return inputs

    @property
    def intermediate_outputs(self) -> List[OutputParam]:
-        return [
+        outputs = [
            OutputParam(
                name="image_height",
                type_hint=int,
@@ -298,11 +383,43 @@ class QwenImageAdditionalInputsStep(ModularPipelineBlocks):
            ),
        ]

+        # `height`/`width` are not new outputs, but they will be updated if any image latent inputs are provided
+        if len(self._image_latent_inputs) > 0:
+            outputs.append(
+                OutputParam(name="height", type_hint=int, description="if not provided, updated to image height")
+            )
+            outputs.append(
+                OutputParam(name="width", type_hint=int, description="if not provided, updated to image width")
+            )
+
+        # image latent inputs are modified in place (patchified and batch-expanded)
+        for input_param in self._image_latent_inputs:
+            outputs.append(
+                OutputParam(
+                    name=input_param.name,
+                    type_hint=input_param.type_hint,
+                    description=input_param.description + " (patchified and batch-expanded)",
+                )
+            )
+
+        # additional batch inputs (batch-expanded only)
+        for input_param in self._additional_batch_inputs:
+            outputs.append(
+                OutputParam(
+                    name=input_param.name,
+                    type_hint=input_param.type_hint,
+                    description=input_param.description + " (batch-expanded)",
+                )
+            )
+
+        return outputs
+
    def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState:
        block_state = self.get_block_state(state)

        # Process image latent inputs
-        for image_latent_input_name in self._image_latent_inputs:
+        for input_param in self._image_latent_inputs:
+            image_latent_input_name = input_param.name
            image_latent_tensor = getattr(block_state, image_latent_input_name)
            if image_latent_tensor is None:
                continue
@@ -331,7 +448,8 @@ class QwenImageAdditionalInputsStep(ModularPipelineBlocks):
            setattr(block_state, image_latent_input_name, image_latent_tensor)

        # Process additional batch inputs (only batch expansion)
-        for input_name in self._additional_batch_inputs:
+        for input_param in self._additional_batch_inputs:
+            input_name = input_param.name
            input_tensor = getattr(block_state, input_name)
            if input_tensor is None:
                continue
@@ -349,20 +467,76 @@ class QwenImageAdditionalInputsStep(ModularPipelineBlocks):
        return components, state


+# auto_docstring
 class QwenImageEditPlusAdditionalInputsStep(ModularPipelineBlocks):
-    """Input step for QwenImage Edit Plus: handles list of latents with different sizes."""
+    """
+    Input processing step for Edit Plus that:
+        1. For image latent inputs (list): Collects heights/widths, patchifies each, concatenates, expands batch
+        2. For additional batch inputs: Expands batch dimensions to match final batch size
+        Height/width defaults to last image in the list.
+
+      Configured inputs:
+        - Image latent inputs: ['image_latents']
+
+      This block should be placed after the encoder steps and the text input step.
+
+      Components:
+          pachifier (`QwenImagePachifier`)
+
+      Inputs:
+          num_images_per_prompt (`int`, *optional*, defaults to 1):
+              The number of images to generate per prompt.
+          batch_size (`int`, *optional*, defaults to 1):
+              Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can
+              be generated in input step.
+          height (`int`, *optional*):
+              The height in pixels of the generated image.
+          width (`int`, *optional*):
+              The width in pixels of the generated image.
+          image_latents (`Tensor`):
+              image latents used to guide the image generation. Can be generated from vae_encoder step.
+
+      Outputs:
+          image_height (`List`):
+              The image heights calculated from the image latents dimension
+          image_width (`List`):
+              The image widths calculated from the image latents dimension
+          height (`int`):
+              if not provided, updated to image height
+          width (`int`):
+              if not provided, updated to image width
+          image_latents (`Tensor`):
+              image latents used to guide the image generation. Can be generated from vae_encoder step. (patchified,
+              concatenated, and batch-expanded)
+    """

    model_name = "qwenimage-edit-plus"

    def __init__(
        self,
-        image_latent_inputs: List[str] = ["image_latents"],
-        additional_batch_inputs: List[str] = [],
+        image_latent_inputs: Optional[List[InputParam]] = None,
+        additional_batch_inputs: Optional[List[InputParam]] = None,
    ):
+        if image_latent_inputs is None:
+            image_latent_inputs = [InputParam.template("image_latents")]
+        if additional_batch_inputs is None:
+            additional_batch_inputs = []
+
        if not isinstance(image_latent_inputs, list):
-            image_latent_inputs = [image_latent_inputs]
+            raise ValueError(f"image_latent_inputs must be a list, but got {type(image_latent_inputs)}")
+        else:
+            for input_param in image_latent_inputs:
+                if not isinstance(input_param, InputParam):
+                    raise ValueError(f"image_latent_inputs must be a list of InputParam, but got {type(input_param)}")
+
        if not isinstance(additional_batch_inputs, list):
-            additional_batch_inputs = [additional_batch_inputs]
+            raise ValueError(f"additional_batch_inputs must be a list, but got {type(additional_batch_inputs)}")
+        else:
+            for input_param in additional_batch_inputs:
+                if not isinstance(input_param, InputParam):
+                    raise ValueError(
+                        f"additional_batch_inputs must be a list of InputParam, but got {type(input_param)}"
+                    )

        self._image_latent_inputs = image_latent_inputs
        self._additional_batch_inputs = additional_batch_inputs
@@ -381,9 +555,9 @@ class QwenImageEditPlusAdditionalInputsStep(ModularPipelineBlocks):
        if self._image_latent_inputs or self._additional_batch_inputs:
            inputs_info = "\n\nConfigured inputs:"
            if self._image_latent_inputs:
-                inputs_info += f"\n  - Image latent inputs: {self._image_latent_inputs}"
+                inputs_info += f"\n  - Image latent inputs: {[p.name for p in self._image_latent_inputs]}"
            if self._additional_batch_inputs:
-                inputs_info += f"\n  - Additional batch inputs: {self._additional_batch_inputs}"
+                inputs_info += f"\n  - Additional batch inputs: {[p.name for p in self._additional_batch_inputs]}"

        placement_section = "\n\nThis block should be placed after the encoder steps and the text input step."

@@ -398,23 +572,20 @@ class QwenImageEditPlusAdditionalInputsStep(ModularPipelineBlocks):
    @property
    def inputs(self) -> List[InputParam]:
        inputs = [
-            InputParam(name="num_images_per_prompt", default=1),
-            InputParam(name="batch_size", required=True),
-            InputParam(name="height"),
-            InputParam(name="width"),
+            InputParam.template("num_images_per_prompt"),
+            InputParam.template("batch_size"),
+            InputParam.template("height"),
+            InputParam.template("width"),
        ]

-        for image_latent_input_name in self._image_latent_inputs:
-            inputs.append(InputParam(name=image_latent_input_name))
-
-        for input_name in self._additional_batch_inputs:
-            inputs.append(InputParam(name=input_name))
+        # default is `image_latents`
+        inputs += self._image_latent_inputs + self._additional_batch_inputs

        return inputs

    @property
    def intermediate_outputs(self) -> List[OutputParam]:
-        return [
+        outputs = [
            OutputParam(
                name="image_height",
                type_hint=List[int],
@@ -427,11 +598,43 @@ class QwenImageEditPlusAdditionalInputsStep(ModularPipelineBlocks):
            ),
        ]

+        # `height`/`width` are updated if any image latent inputs are provided
+        if len(self._image_latent_inputs) > 0:
+            outputs.append(
+                OutputParam(name="height", type_hint=int, description="if not provided, updated to image height")
+            )
+            outputs.append(
+                OutputParam(name="width", type_hint=int, description="if not provided, updated to image width")
+            )
+
+        # image latent inputs are modified in place (patchified, concatenated, and batch-expanded)
+        for input_param in self._image_latent_inputs:
+            outputs.append(
+                OutputParam(
+                    name=input_param.name,
+                    type_hint=input_param.type_hint,
+                    description=input_param.description + " (patchified, concatenated, and batch-expanded)",
+                )
+            )
+
+        # additional batch inputs (batch-expanded only)
+        for input_param in self._additional_batch_inputs:
+            outputs.append(
+                OutputParam(
+                    name=input_param.name,
+                    type_hint=input_param.type_hint,
+                    description=input_param.description + " (batch-expanded)",
+                )
+            )
+
+        return outputs
+
    def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState:
        block_state = self.get_block_state(state)

        # Process image latent inputs
-        for image_latent_input_name in self._image_latent_inputs:
+        for input_param in self._image_latent_inputs:
+            image_latent_input_name = input_param.name
            image_latent_tensor = getattr(block_state, image_latent_input_name)
            if image_latent_tensor is None:
                continue
@@ -476,7 +679,8 @@ class QwenImageEditPlusAdditionalInputsStep(ModularPipelineBlocks):
            setattr(block_state, image_latent_input_name, packed_image_latent_tensors)

        # Process additional batch inputs (only batch expansion)
-        for input_name in self._additional_batch_inputs:
+        for input_param in self._additional_batch_inputs:
+            input_name = input_param.name
            input_tensor = getattr(block_state, input_name)
            if input_tensor is None:
                continue
@@ -494,22 +698,75 @@ class QwenImageEditPlusAdditionalInputsStep(ModularPipelineBlocks):
        return components, state


-# YiYi TODO: support define config default component from the ModularPipeline level.
-# it is same as QwenImageAdditionalInputsStep, but with layered pachifier.
+# same as QwenImageAdditionalInputsStep, but with layered pachifier.
+
+
+# auto_docstring
 class QwenImageLayeredAdditionalInputsStep(ModularPipelineBlocks):
-    """Input step for QwenImage Layered: update height/width, expand batch, patchify with layered pachifier."""
+    """
+    Input processing step for Layered that:
+        1. For image latent inputs: Updates height/width if None, patchifies with layered pachifier, and expands batch
+           size
+        2. For additional batch inputs: Expands batch dimensions to match final batch size
+
+      Configured inputs:
+        - Image latent inputs: ['image_latents']
+
+      This block should be placed after the encoder steps and the text input step.
+
+      Components:
+          pachifier (`QwenImageLayeredPachifier`)
+
+      Inputs:
+          num_images_per_prompt (`int`, *optional*, defaults to 1):
+              The number of images to generate per prompt.
+          batch_size (`int`, *optional*, defaults to 1):
+              Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can
+              be generated in input step.
+          image_latents (`Tensor`):
+              image latents used to guide the image generation. Can be generated from vae_encoder step.
+
+      Outputs:
+          image_height (`int`):
+              The image height calculated from the image latents dimension
+          image_width (`int`):
+              The image width calculated from the image latents dimension
+          height (`int`):
+              if not provided, updated to image height
+          width (`int`):
+              if not provided, updated to image width
+          image_latents (`Tensor`):
+              image latents used to guide the image generation. Can be generated from vae_encoder step. (patchified
+              with layered pachifier and batch-expanded)
+    """

    model_name = "qwenimage-layered"

    def __init__(
        self,
-        image_latent_inputs: List[str] = ["image_latents"],
-        additional_batch_inputs: List[str] = [],
+        image_latent_inputs: Optional[List[InputParam]] = None,
+        additional_batch_inputs: Optional[List[InputParam]] = None,
    ):
+        if image_latent_inputs is None:
+            image_latent_inputs = [InputParam.template("image_latents")]
+        if additional_batch_inputs is None:
+            additional_batch_inputs = []
+
        if not isinstance(image_latent_inputs, list):
-            image_latent_inputs = [image_latent_inputs]
+            raise ValueError(f"image_latent_inputs must be a list, but got {type(image_latent_inputs)}")
+        else:
+            for input_param in image_latent_inputs:
+                if not isinstance(input_param, InputParam):
+                    raise ValueError(f"image_latent_inputs must be a list of InputParam, but got {type(input_param)}")
+
        if not isinstance(additional_batch_inputs, list):
-            additional_batch_inputs = [additional_batch_inputs]
+            raise ValueError(f"additional_batch_inputs must be a list, but got {type(additional_batch_inputs)}")
+        else:
+            for input_param in additional_batch_inputs:
+                if not isinstance(input_param, InputParam):
+                    raise ValueError(
+                        f"additional_batch_inputs must be a list of InputParam, but got {type(input_param)}"
+                    )

        self._image_latent_inputs = image_latent_inputs
        self._additional_batch_inputs = additional_batch_inputs
@@ -527,9 +784,9 @@ class QwenImageLayeredAdditionalInputsStep(ModularPipelineBlocks):
        if self._image_latent_inputs or self._additional_batch_inputs:
            inputs_info = "\n\nConfigured inputs:"
            if self._image_latent_inputs:
-                inputs_info += f"\n  - Image latent inputs: {self._image_latent_inputs}"
+                inputs_info += f"\n  - Image latent inputs: {[p.name for p in self._image_latent_inputs]}"
            if self._additional_batch_inputs:
-                inputs_info += f"\n  - Additional batch inputs: {self._additional_batch_inputs}"
+                inputs_info += f"\n  - Additional batch inputs: {[p.name for p in self._additional_batch_inputs]}"

        placement_section = "\n\nThis block should be placed after the encoder steps and the text input step."

@@ -544,21 +801,18 @@ class QwenImageLayeredAdditionalInputsStep(ModularPipelineBlocks):
    @property
    def inputs(self) -> List[InputParam]:
        inputs = [
-            InputParam(name="num_images_per_prompt", default=1),
-            InputParam(name="batch_size", required=True),
+            InputParam.template("num_images_per_prompt"),
+            InputParam.template("batch_size"),
        ]
+        # default is `image_latents`

-        for image_latent_input_name in self._image_latent_inputs:
-            inputs.append(InputParam(name=image_latent_input_name))
-
-        for input_name in self._additional_batch_inputs:
-            inputs.append(InputParam(name=input_name))
+        inputs += self._image_latent_inputs + self._additional_batch_inputs

        return inputs

    @property
    def intermediate_outputs(self) -> List[OutputParam]:
-        return [
+        outputs = [
            OutputParam(
                name="image_height",
                type_hint=int,
@@ -569,15 +823,44 @@ class QwenImageLayeredAdditionalInputsStep(ModularPipelineBlocks):
                type_hint=int,
                description="The image width calculated from the image latents dimension",
            ),
-            OutputParam(name="height", type_hint=int, description="The height of the image output"),
-            OutputParam(name="width", type_hint=int, description="The width of the image output"),
        ]

+        if len(self._image_latent_inputs) > 0:
+            outputs.append(
+                OutputParam(name="height", type_hint=int, description="if not provided, updated to image height")
+            )
+            outputs.append(
+                OutputParam(name="width", type_hint=int, description="if not provided, updated to image width")
+            )
+
+        # Add outputs for image latent inputs (patchified with layered pachifier and batch-expanded)
+        for input_param in self._image_latent_inputs:
+            outputs.append(
+                OutputParam(
+                    name=input_param.name,
+                    type_hint=input_param.type_hint,
+                    description=input_param.description + " (patchified with layered pachifier and batch-expanded)",
+                )
+            )
+
+        # Add outputs for additional batch inputs (batch-expanded only)
+        for input_param in self._additional_batch_inputs:
+            outputs.append(
+                OutputParam(
+                    name=input_param.name,
+                    type_hint=input_param.type_hint,
+                    description=input_param.description + " (batch-expanded)",
+                )
+            )
+
+        return outputs
+
    def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState:
        block_state = self.get_block_state(state)

        # Process image latent inputs
-        for image_latent_input_name in self._image_latent_inputs:
+        for input_param in self._image_latent_inputs:
+            image_latent_input_name = input_param.name
            image_latent_tensor = getattr(block_state, image_latent_input_name)
            if image_latent_tensor is None:
                continue
@@ -608,7 +891,8 @@ class QwenImageLayeredAdditionalInputsStep(ModularPipelineBlocks):
            setattr(block_state, image_latent_input_name, image_latent_tensor)

        # Process additional batch inputs (only batch expansion)
-        for input_name in self._additional_batch_inputs:
+        for input_param in self._additional_batch_inputs:
+            input_name = input_param.name
            input_tensor = getattr(block_state, input_name)
            if input_tensor is None:
                continue
@@ -626,7 +910,34 @@ class QwenImageLayeredAdditionalInputsStep(ModularPipelineBlocks):
        return components, state


+# auto_docstring
 class QwenImageControlNetInputsStep(ModularPipelineBlocks):
+    """
+    prepare the `control_image_latents` for controlnet. Insert after all the other inputs steps.
+
+      Inputs:
+          control_image_latents (`Tensor`):
+              The control image latents to use for the denoising process. Can be generated in controlnet vae encoder
+              step.
+          batch_size (`int`, *optional*, defaults to 1):
+              Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can
+              be generated in input step.
+          num_images_per_prompt (`int`, *optional*, defaults to 1):
+              The number of images to generate per prompt.
+          height (`int`, *optional*):
+              The height in pixels of the generated image.
+          width (`int`, *optional*):
+              The width in pixels of the generated image.
+
+      Outputs:
+          control_image_latents (`Tensor`):
+              The control image latents (patchified and batch-expanded).
+          height (`int`):
+              if not provided, updated to control image height
+          width (`int`):
+              if not provided, updated to control image width
+    """
+
    model_name = "qwenimage"

    @property
@@ -636,11 +947,28 @@ class QwenImageControlNetInputsStep(ModularPipelineBlocks):
    @property
    def inputs(self) -> List[InputParam]:
        return [
-            InputParam(name="control_image_latents", required=True),
-            InputParam(name="batch_size", required=True),
-            InputParam(name="num_images_per_prompt", default=1),
-            InputParam(name="height"),
-            InputParam(name="width"),
+            InputParam(
+                name="control_image_latents",
+                required=True,
+                type_hint=torch.Tensor,
+                description="The control image latents to use for the denoising process. Can be generated in controlnet vae encoder step.",
+            ),
+            InputParam.template("batch_size"),
+            InputParam.template("num_images_per_prompt"),
+            InputParam.template("height"),
+            InputParam.template("width"),
+        ]
+
+    @property
+    def intermediate_outputs(self) -> List[OutputParam]:
+        return [
+            OutputParam(
+                name="control_image_latents",
+                type_hint=torch.Tensor,
+                description="The control image latents (patchified and batch-expanded).",
+            ),
+            OutputParam(name="height", type_hint=int, description="if not provided, updated to control image height"),
+            OutputParam(name="width", type_hint=int, description="if not provided, updated to control image width"),
        ]

    @torch.no_grad()
--- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py
+++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py
@@ -12,14 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-from typing import List
-
-import PIL.Image
 import torch

 from ...utils import logging
 from ..modular_pipeline import AutoPipelineBlocks, ConditionalPipelineBlocks, SequentialPipelineBlocks
-from ..modular_pipeline_utils import InsertableDict, OutputParam
+from ..modular_pipeline_utils import InputParam, InsertableDict, OutputParam
 from .before_denoise import (
    QwenImageControlNetBeforeDenoiserStep,
    QwenImageCreateMaskLatentsStep,
@@ -59,11 +56,91 @@ logger = logging.get_logger(__name__)


 # ====================
-# 1. VAE ENCODER
+# 1. TEXT ENCODER
 # ====================


+# auto_docstring
+class QwenImageAutoTextEncoderStep(AutoPipelineBlocks):
+    """
+    Text encoder step that encodes the text prompt into a text embedding. This is an auto pipeline block.
+
+      Components:
+          text_encoder (`Qwen2_5_VLForConditionalGeneration`): The text encoder to use tokenizer (`Qwen2Tokenizer`):
+          The tokenizer to use guider (`ClassifierFreeGuidance`)
+
+      Inputs:
+          prompt (`str`, *optional*):
+              The prompt or prompts to guide image generation.
+          negative_prompt (`str`, *optional*):
+              The prompt or prompts not to guide the image generation.
+          max_sequence_length (`int`, *optional*, defaults to 1024):
+              Maximum sequence length for prompt encoding.
+
+      Outputs:
+          prompt_embeds (`Tensor`):
+              The prompt embeddings.
+          prompt_embeds_mask (`Tensor`):
+              The encoder attention mask.
+          negative_prompt_embeds (`Tensor`):
+              The negative prompt embeddings.
+          negative_prompt_embeds_mask (`Tensor`):
+              The negative prompt embeddings mask.
+    """
+
+    model_name = "qwenimage"
+    block_classes = [QwenImageTextEncoderStep()]
+    block_names = ["text_encoder"]
+    block_trigger_inputs = ["prompt"]
+
+    @property
+    def description(self) -> str:
+        return "Text encoder step that encodes the text prompt into a text embedding. This is an auto pipeline block."
+        " - `QwenImageTextEncoderStep` (text_encoder) is used when `prompt` is provided."
+        " - if `prompt` is not provided, step will be skipped."
+
+
+# ====================
+# 2. VAE ENCODER
+# ====================
+
+
+# auto_docstring
 class QwenImageInpaintVaeEncoderStep(SequentialPipelineBlocks):
+    """
+    This step is used for processing image and mask inputs for inpainting tasks. It:
+       - Resizes the image to the target size, based on `height` and `width`.
+       - Processes and updates `image` and `mask_image`.
+       - Creates `image_latents`.
+
+      Components:
+          image_mask_processor (`InpaintProcessor`) vae (`AutoencoderKLQwenImage`)
+
+      Inputs:
+          mask_image (`Image`):
+              Mask image for inpainting.
+          image (`Union[Image, List]`):
+              Reference image(s) for denoising. Can be a single image or list of images.
+          height (`int`, *optional*):
+              The height in pixels of the generated image.
+          width (`int`, *optional*):
+              The width in pixels of the generated image.
+          padding_mask_crop (`int`, *optional*):
+              Padding for mask cropping in inpainting.
+          generator (`Generator`, *optional*):
+              Torch generator for deterministic generation.
+
+      Outputs:
+          processed_image (`Tensor`):
+              The processed image
+          processed_mask_image (`Tensor`):
+              The processed mask image
+          mask_overlay_kwargs (`Dict`):
+              The kwargs for the postprocess step to apply the mask overlay
+          image_latents (`Tensor`):
+              The latent representation of the input image.
+    """
+
    model_name = "qwenimage"
    block_classes = [QwenImageInpaintProcessImagesInputStep(), QwenImageVaeEncoderStep()]
    block_names = ["preprocess", "encode"]
@@ -78,7 +155,31 @@ class QwenImageInpaintVaeEncoderStep(SequentialPipelineBlocks):
        )


+# auto_docstring
 class QwenImageImg2ImgVaeEncoderStep(SequentialPipelineBlocks):
+    """
+    Vae encoder step that preprocess andencode the image inputs into their latent representations.
+
+      Components:
+          image_processor (`VaeImageProcessor`) vae (`AutoencoderKLQwenImage`)
+
+      Inputs:
+          image (`Union[Image, List]`):
+              Reference image(s) for denoising. Can be a single image or list of images.
+          height (`int`, *optional*):
+              The height in pixels of the generated image.
+          width (`int`, *optional*):
+              The width in pixels of the generated image.
+          generator (`Generator`, *optional*):
+              Torch generator for deterministic generation.
+
+      Outputs:
+          processed_image (`Tensor`):
+              The processed image
+          image_latents (`Tensor`):
+              The latent representation of the input image.
+    """
+
    model_name = "qwenimage"

    block_classes = [QwenImageProcessImagesInputStep(), QwenImageVaeEncoderStep()]
@@ -89,7 +190,6 @@ class QwenImageImg2ImgVaeEncoderStep(SequentialPipelineBlocks):
        return "Vae encoder step that preprocess andencode the image inputs into their latent representations."


-# Auto VAE encoder
 class QwenImageAutoVaeEncoderStep(AutoPipelineBlocks):
    block_classes = [QwenImageInpaintVaeEncoderStep, QwenImageImg2ImgVaeEncoderStep]
    block_names = ["inpaint", "img2img"]
@@ -107,7 +207,33 @@ class QwenImageAutoVaeEncoderStep(AutoPipelineBlocks):


 # optional controlnet vae encoder
+# auto_docstring
 class QwenImageOptionalControlNetVaeEncoderStep(AutoPipelineBlocks):
+    """
+    Vae encoder step that encode the image inputs into their latent representations.
+      This is an auto pipeline block.
+       - `QwenImageControlNetVaeEncoderStep` (controlnet) is used when `control_image` is provided.
+       - if `control_image` is not provided, step will be skipped.
+
+      Components:
+          vae (`AutoencoderKLQwenImage`) controlnet (`QwenImageControlNetModel`) control_image_processor
+          (`VaeImageProcessor`)
+
+      Inputs:
+          control_image (`Image`, *optional*):
+              Control image for ControlNet conditioning.
+          height (`int`, *optional*):
+              The height in pixels of the generated image.
+          width (`int`, *optional*):
+              The width in pixels of the generated image.
+          generator (`Generator`, *optional*):
+              Torch generator for deterministic generation.
+
+      Outputs:
+          control_image_latents (`Tensor`):
+              The latents representing the control image
+    """
+
    block_classes = [QwenImageControlNetVaeEncoderStep]
    block_names = ["controlnet"]
    block_trigger_inputs = ["control_image"]
@@ -123,14 +249,65 @@ class QwenImageOptionalControlNetVaeEncoderStep(AutoPipelineBlocks):


 # ====================
-# 2. DENOISE (input -> prepare_latents -> set_timesteps -> prepare_rope_inputs -> denoise -> after_denoise)
+# 3. DENOISE (input -> prepare_latents -> set_timesteps -> prepare_rope_inputs -> denoise -> after_denoise)
 # ====================


 # assemble input steps
+# auto_docstring
 class QwenImageImg2ImgInputStep(SequentialPipelineBlocks):
+    """
+    Input step that prepares the inputs for the img2img denoising step. It:
+
+      Components:
+          pachifier (`QwenImagePachifier`)
+
+      Inputs:
+          num_images_per_prompt (`int`, *optional*, defaults to 1):
+              The number of images to generate per prompt.
+          prompt_embeds (`Tensor`):
+              text embeddings used to guide the image generation. Can be generated from text_encoder step.
+          prompt_embeds_mask (`Tensor`):
+              mask for the text embeddings. Can be generated from text_encoder step.
+          negative_prompt_embeds (`Tensor`, *optional*):
+              negative text embeddings used to guide the image generation. Can be generated from text_encoder step.
+          negative_prompt_embeds_mask (`Tensor`, *optional*):
+              mask for the negative text embeddings. Can be generated from text_encoder step.
+          height (`int`, *optional*):
+              The height in pixels of the generated image.
+          width (`int`, *optional*):
+              The width in pixels of the generated image.
+          image_latents (`Tensor`):
+              image latents used to guide the image generation. Can be generated from vae_encoder step.
+
+      Outputs:
+          batch_size (`int`):
+              The batch size of the prompt embeddings
+          dtype (`dtype`):
+              The data type of the prompt embeddings
+          prompt_embeds (`Tensor`):
+              The prompt embeddings. (batch-expanded)
+          prompt_embeds_mask (`Tensor`):
+              The encoder attention mask. (batch-expanded)
+          negative_prompt_embeds (`Tensor`):
+              The negative prompt embeddings. (batch-expanded)
+          negative_prompt_embeds_mask (`Tensor`):
+              The negative prompt embeddings mask. (batch-expanded)
+          image_height (`int`):
+              The image height calculated from the image latents dimension
+          image_width (`int`):
+              The image width calculated from the image latents dimension
+          height (`int`):
+              if not provided, updated to image height
+          width (`int`):
+              if not provided, updated to image width
+          image_latents (`Tensor`):
+              image latents used to guide the image generation. Can be generated from vae_encoder step. (patchified and
+              batch-expanded)
+    """
+
    model_name = "qwenimage"
-    block_classes = [QwenImageTextInputsStep(), QwenImageAdditionalInputsStep(image_latent_inputs=["image_latents"])]
+    block_classes = [QwenImageTextInputsStep(), QwenImageAdditionalInputsStep()]
    block_names = ["text_inputs", "additional_inputs"]

    @property
@@ -140,12 +317,69 @@ class QwenImageImg2ImgInputStep(SequentialPipelineBlocks):
        " - update height/width based `image_latents`, patchify `image_latents`."


+# auto_docstring
 class QwenImageInpaintInputStep(SequentialPipelineBlocks):
+    """
+    Input step that prepares the inputs for the inpainting denoising step. It:
+
+      Components:
+          pachifier (`QwenImagePachifier`)
+
+      Inputs:
+          num_images_per_prompt (`int`, *optional*, defaults to 1):
+              The number of images to generate per prompt.
+          prompt_embeds (`Tensor`):
+              text embeddings used to guide the image generation. Can be generated from text_encoder step.
+          prompt_embeds_mask (`Tensor`):
+              mask for the text embeddings. Can be generated from text_encoder step.
+          negative_prompt_embeds (`Tensor`, *optional*):
+              negative text embeddings used to guide the image generation. Can be generated from text_encoder step.
+          negative_prompt_embeds_mask (`Tensor`, *optional*):
+              mask for the negative text embeddings. Can be generated from text_encoder step.
+          height (`int`, *optional*):
+              The height in pixels of the generated image.
+          width (`int`, *optional*):
+              The width in pixels of the generated image.
+          image_latents (`Tensor`, *optional*):
+              image latents used to guide the image generation. Can be generated from vae_encoder step.
+          processed_mask_image (`Tensor`, *optional*):
+              The processed mask image
+
+      Outputs:
+          batch_size (`int`):
+              The batch size of the prompt embeddings
+          dtype (`dtype`):
+              The data type of the prompt embeddings
+          prompt_embeds (`Tensor`):
+              The prompt embeddings. (batch-expanded)
+          prompt_embeds_mask (`Tensor`):
+              The encoder attention mask. (batch-expanded)
+          negative_prompt_embeds (`Tensor`):
+              The negative prompt embeddings. (batch-expanded)
+          negative_prompt_embeds_mask (`Tensor`):
+              The negative prompt embeddings mask. (batch-expanded)
+          image_height (`int`):
+              The image height calculated from the image latents dimension
+          image_width (`int`):
+              The image width calculated from the image latents dimension
+          height (`int`):
+              if not provided, updated to image height
+          width (`int`):
+              if not provided, updated to image width
+          image_latents (`Tensor`):
+              image latents used to guide the image generation. Can be generated from vae_encoder step. (patchified and
+              batch-expanded)
+          processed_mask_image (`Tensor`):
+              The processed mask image (batch-expanded)
+    """
+
    model_name = "qwenimage"
    block_classes = [
        QwenImageTextInputsStep(),
        QwenImageAdditionalInputsStep(
-            image_latent_inputs=["image_latents"], additional_batch_inputs=["processed_mask_image"]
+            additional_batch_inputs=[
+                InputParam(name="processed_mask_image", type_hint=torch.Tensor, description="The processed mask image")
+            ]
        ),
    ]
    block_names = ["text_inputs", "additional_inputs"]
@@ -158,7 +392,42 @@ class QwenImageInpaintInputStep(SequentialPipelineBlocks):


 # assemble prepare latents steps
+# auto_docstring
 class QwenImageInpaintPrepareLatentsStep(SequentialPipelineBlocks):
+    """
+    This step prepares the latents/image_latents and mask inputs for the inpainting denoising step. It:
+       - Add noise to the image latents to create the latents input for the denoiser.
+       - Create the pachified latents `mask` based on the processedmask image.
+
+      Components:
+          scheduler (`FlowMatchEulerDiscreteScheduler`) pachifier (`QwenImagePachifier`)
+
+      Inputs:
+          latents (`Tensor`):
+              The initial random noised, can be generated in prepare latent step.
+          image_latents (`Tensor`):
+              image latents used to guide the image generation. Can be generated from vae_encoder step. (Can be
+              generated from vae encoder and updated in input step.)
+          timesteps (`Tensor`):
+              The timesteps to use for the denoising process. Can be generated in set_timesteps step.
+          processed_mask_image (`Tensor`):
+              The processed mask to use for the inpainting process.
+          height (`int`):
+              The height in pixels of the generated image.
+          width (`int`):
+              The width in pixels of the generated image.
+          dtype (`dtype`, *optional*, defaults to torch.float32):
+              The dtype of the model inputs, can be generated in input step.
+
+      Outputs:
+          initial_noise (`Tensor`):
+              The initial random noised used for inpainting denoising.
+          latents (`Tensor`):
+              The scaled noisy latents to use for inpainting/image-to-image denoising.
+          mask (`Tensor`):
+              The mask to use for the inpainting process.
+    """
+
    model_name = "qwenimage"
    block_classes = [QwenImagePrepareLatentsWithStrengthStep(), QwenImageCreateMaskLatentsStep()]
    block_names = ["add_noise_to_latents", "create_mask_latents"]
@@ -176,7 +445,49 @@ class QwenImageInpaintPrepareLatentsStep(SequentialPipelineBlocks):


 # Qwen Image (text2image)
+# auto_docstring
 class QwenImageCoreDenoiseStep(SequentialPipelineBlocks):
+    """
+    step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the inputs
+    (timesteps, latents, rope inputs etc.).
+
+      Components:
+          pachifier (`QwenImagePachifier`) scheduler (`FlowMatchEulerDiscreteScheduler`) guider
+          (`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`)
+
+      Inputs:
+          num_images_per_prompt (`int`, *optional*, defaults to 1):
+              The number of images to generate per prompt.
+          prompt_embeds (`Tensor`):
+              text embeddings used to guide the image generation. Can be generated from text_encoder step.
+          prompt_embeds_mask (`Tensor`):
+              mask for the text embeddings. Can be generated from text_encoder step.
+          negative_prompt_embeds (`Tensor`, *optional*):
+              negative text embeddings used to guide the image generation. Can be generated from text_encoder step.
+          negative_prompt_embeds_mask (`Tensor`, *optional*):
+              mask for the negative text embeddings. Can be generated from text_encoder step.
+          latents (`Tensor`, *optional*):
+              Pre-generated noisy latents for image generation.
+          height (`int`, *optional*):
+              The height in pixels of the generated image.
+          width (`int`, *optional*):
+              The width in pixels of the generated image.
+          generator (`Generator`, *optional*):
+              Torch generator for deterministic generation.
+          num_inference_steps (`int`, *optional*, defaults to 50):
+              The number of denoising steps.
+          sigmas (`List`, *optional*):
+              Custom sigmas for the denoising process.
+          attention_kwargs (`Dict`, *optional*):
+              Additional kwargs for attention processors.
+          **denoiser_input_fields (`None`, *optional*):
+              conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
+
+      Outputs:
+          latents (`Tensor`):
+              Denoised latents.
+    """
+
    model_name = "qwenimage"
    block_classes = [
        QwenImageTextInputsStep(),
@@ -199,9 +510,63 @@ class QwenImageCoreDenoiseStep(SequentialPipelineBlocks):
    def description(self):
        return "step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the inputs (timesteps, latents, rope inputs etc.)."

+    @property
+    def outputs(self):
+        return [
+            OutputParam.template("latents"),
+        ]
+

 # Qwen Image (inpainting)
+# auto_docstring
 class QwenImageInpaintCoreDenoiseStep(SequentialPipelineBlocks):
+    """
+    Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for inpaint
+    task.
+
+      Components:
+          pachifier (`QwenImagePachifier`) scheduler (`FlowMatchEulerDiscreteScheduler`) guider
+          (`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`)
+
+      Inputs:
+          num_images_per_prompt (`int`, *optional*, defaults to 1):
+              The number of images to generate per prompt.
+          prompt_embeds (`Tensor`):
+              text embeddings used to guide the image generation. Can be generated from text_encoder step.
+          prompt_embeds_mask (`Tensor`):
+              mask for the text embeddings. Can be generated from text_encoder step.
+          negative_prompt_embeds (`Tensor`, *optional*):
+              negative text embeddings used to guide the image generation. Can be generated from text_encoder step.
+          negative_prompt_embeds_mask (`Tensor`, *optional*):
+              mask for the negative text embeddings. Can be generated from text_encoder step.
+          height (`int`, *optional*):
+              The height in pixels of the generated image.
+          width (`int`, *optional*):
+              The width in pixels of the generated image.
+          image_latents (`Tensor`, *optional*):
+              image latents used to guide the image generation. Can be generated from vae_encoder step.
+          processed_mask_image (`Tensor`, *optional*):
+              The processed mask image
+          latents (`Tensor`, *optional*):
+              Pre-generated noisy latents for image generation.
+          generator (`Generator`, *optional*):
+              Torch generator for deterministic generation.
+          num_inference_steps (`int`, *optional*, defaults to 50):
+              The number of denoising steps.
+          sigmas (`List`, *optional*):
+              Custom sigmas for the denoising process.
+          strength (`float`, *optional*, defaults to 0.9):
+              Strength for img2img/inpainting.
+          attention_kwargs (`Dict`, *optional*):
+              Additional kwargs for attention processors.
+          **denoiser_input_fields (`None`, *optional*):
+              conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
+
+      Outputs:
+          latents (`Tensor`):
+              Denoised latents.
+    """
+
    model_name = "qwenimage"
    block_classes = [
        QwenImageInpaintInputStep(),
@@ -226,9 +591,61 @@ class QwenImageInpaintCoreDenoiseStep(SequentialPipelineBlocks):
    def description(self):
        return "Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for inpaint task."

+    @property
+    def outputs(self):
+        return [
+            OutputParam.template("latents"),
+        ]
+

 # Qwen Image (image2image)
+# auto_docstring
 class QwenImageImg2ImgCoreDenoiseStep(SequentialPipelineBlocks):
+    """
+    Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for img2img
+    task.
+
+      Components:
+          pachifier (`QwenImagePachifier`) scheduler (`FlowMatchEulerDiscreteScheduler`) guider
+          (`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`)
+
+      Inputs:
+          num_images_per_prompt (`int`, *optional*, defaults to 1):
+              The number of images to generate per prompt.
+          prompt_embeds (`Tensor`):
+              text embeddings used to guide the image generation. Can be generated from text_encoder step.
+          prompt_embeds_mask (`Tensor`):
+              mask for the text embeddings. Can be generated from text_encoder step.
+          negative_prompt_embeds (`Tensor`, *optional*):
+              negative text embeddings used to guide the image generation. Can be generated from text_encoder step.
+          negative_prompt_embeds_mask (`Tensor`, *optional*):
+              mask for the negative text embeddings. Can be generated from text_encoder step.
+          height (`int`, *optional*):
+              The height in pixels of the generated image.
+          width (`int`, *optional*):
+              The width in pixels of the generated image.
+          image_latents (`Tensor`):
+              image latents used to guide the image generation. Can be generated from vae_encoder step.
+          latents (`Tensor`, *optional*):
+              Pre-generated noisy latents for image generation.
+          generator (`Generator`, *optional*):
+              Torch generator for deterministic generation.
+          num_inference_steps (`int`, *optional*, defaults to 50):
+              The number of denoising steps.
+          sigmas (`List`, *optional*):
+              Custom sigmas for the denoising process.
+          strength (`float`, *optional*, defaults to 0.9):
+              Strength for img2img/inpainting.
+          attention_kwargs (`Dict`, *optional*):
+              Additional kwargs for attention processors.
+          **denoiser_input_fields (`None`, *optional*):
+              conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
+
+      Outputs:
+          latents (`Tensor`):
+              Denoised latents.
+    """
+
    model_name = "qwenimage"
    block_classes = [
        QwenImageImg2ImgInputStep(),
@@ -253,9 +670,66 @@ class QwenImageImg2ImgCoreDenoiseStep(SequentialPipelineBlocks):
    def description(self):
        return "Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for img2img task."

+    @property
+    def outputs(self):
+        return [
+            OutputParam.template("latents"),
+        ]
+

 # Qwen Image (text2image) with controlnet
+# auto_docstring
 class QwenImageControlNetCoreDenoiseStep(SequentialPipelineBlocks):
+    """
+    step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the inputs
+    (timesteps, latents, rope inputs etc.).
+
+      Components:
+          pachifier (`QwenImagePachifier`) scheduler (`FlowMatchEulerDiscreteScheduler`) controlnet
+          (`QwenImageControlNetModel`) guider (`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`)
+
+      Inputs:
+          num_images_per_prompt (`int`, *optional*, defaults to 1):
+              The number of images to generate per prompt.
+          prompt_embeds (`Tensor`):
+              text embeddings used to guide the image generation. Can be generated from text_encoder step.
+          prompt_embeds_mask (`Tensor`):
+              mask for the text embeddings. Can be generated from text_encoder step.
+          negative_prompt_embeds (`Tensor`, *optional*):
+              negative text embeddings used to guide the image generation. Can be generated from text_encoder step.
+          negative_prompt_embeds_mask (`Tensor`, *optional*):
+              mask for the negative text embeddings. Can be generated from text_encoder step.
+          control_image_latents (`Tensor`):
+              The control image latents to use for the denoising process. Can be generated in controlnet vae encoder
+              step.
+          height (`int`, *optional*):
+              The height in pixels of the generated image.
+          width (`int`, *optional*):
+              The width in pixels of the generated image.
+          latents (`Tensor`, *optional*):
+              Pre-generated noisy latents for image generation.
+          generator (`Generator`, *optional*):
+              Torch generator for deterministic generation.
+          num_inference_steps (`int`, *optional*, defaults to 50):
+              The number of denoising steps.
+          sigmas (`List`, *optional*):
+              Custom sigmas for the denoising process.
+          control_guidance_start (`float`, *optional*, defaults to 0.0):
+              When to start applying ControlNet.
+          control_guidance_end (`float`, *optional*, defaults to 1.0):
+              When to stop applying ControlNet.
+          controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0):
+              Scale for ControlNet conditioning.
+          attention_kwargs (`Dict`, *optional*):
+              Additional kwargs for attention processors.
+          **denoiser_input_fields (`None`, *optional*):
+              conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
+
+      Outputs:
+          latents (`Tensor`):
+              Denoised latents.
+    """
+
    model_name = "qwenimage"
    block_classes = [
        QwenImageTextInputsStep(),
@@ -282,9 +756,72 @@ class QwenImageControlNetCoreDenoiseStep(SequentialPipelineBlocks):
    def description(self):
        return "step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the inputs (timesteps, latents, rope inputs etc.)."

+    @property
+    def outputs(self):
+        return [
+            OutputParam.template("latents"),
+        ]
+

 # Qwen Image (inpainting) with controlnet
+# auto_docstring
 class QwenImageControlNetInpaintCoreDenoiseStep(SequentialPipelineBlocks):
+    """
+    Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for inpaint
+    task.
+
+      Components:
+          pachifier (`QwenImagePachifier`) scheduler (`FlowMatchEulerDiscreteScheduler`) controlnet
+          (`QwenImageControlNetModel`) guider (`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`)
+
+      Inputs:
+          num_images_per_prompt (`int`, *optional*, defaults to 1):
+              The number of images to generate per prompt.
+          prompt_embeds (`Tensor`):
+              text embeddings used to guide the image generation. Can be generated from text_encoder step.
+          prompt_embeds_mask (`Tensor`):
+              mask for the text embeddings. Can be generated from text_encoder step.
+          negative_prompt_embeds (`Tensor`, *optional*):
+              negative text embeddings used to guide the image generation. Can be generated from text_encoder step.
+          negative_prompt_embeds_mask (`Tensor`, *optional*):
+              mask for the negative text embeddings. Can be generated from text_encoder step.
+          height (`int`, *optional*):
+              The height in pixels of the generated image.
+          width (`int`, *optional*):
+              The width in pixels of the generated image.
+          image_latents (`Tensor`, *optional*):
+              image latents used to guide the image generation. Can be generated from vae_encoder step.
+          processed_mask_image (`Tensor`, *optional*):
+              The processed mask image
+          control_image_latents (`Tensor`):
+              The control image latents to use for the denoising process. Can be generated in controlnet vae encoder
+              step.
+          latents (`Tensor`, *optional*):
+              Pre-generated noisy latents for image generation.
+          generator (`Generator`, *optional*):
+              Torch generator for deterministic generation.
+          num_inference_steps (`int`, *optional*, defaults to 50):
+              The number of denoising steps.
+          sigmas (`List`, *optional*):
+              Custom sigmas for the denoising process.
+          strength (`float`, *optional*, defaults to 0.9):
+              Strength for img2img/inpainting.
+          control_guidance_start (`float`, *optional*, defaults to 0.0):
+              When to start applying ControlNet.
+          control_guidance_end (`float`, *optional*, defaults to 1.0):
+              When to stop applying ControlNet.
+          controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0):
+              Scale for ControlNet conditioning.
+          attention_kwargs (`Dict`, *optional*):
+              Additional kwargs for attention processors.
+          **denoiser_input_fields (`None`, *optional*):
+              conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
+
+      Outputs:
+          latents (`Tensor`):
+              Denoised latents.
+    """
+
    model_name = "qwenimage"
    block_classes = [
        QwenImageInpaintInputStep(),
@@ -313,9 +850,70 @@ class QwenImageControlNetInpaintCoreDenoiseStep(SequentialPipelineBlocks):
    def description(self):
        return "Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for inpaint task."

+    @property
+    def outputs(self):
+        return [
+            OutputParam.template("latents"),
+        ]
+

 # Qwen Image (image2image) with controlnet
+# auto_docstring
 class QwenImageControlNetImg2ImgCoreDenoiseStep(SequentialPipelineBlocks):
+    """
+    Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for img2img
+    task.
+
+      Components:
+          pachifier (`QwenImagePachifier`) scheduler (`FlowMatchEulerDiscreteScheduler`) controlnet
+          (`QwenImageControlNetModel`) guider (`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`)
+
+      Inputs:
+          num_images_per_prompt (`int`, *optional*, defaults to 1):
+              The number of images to generate per prompt.
+          prompt_embeds (`Tensor`):
+              text embeddings used to guide the image generation. Can be generated from text_encoder step.
+          prompt_embeds_mask (`Tensor`):
+              mask for the text embeddings. Can be generated from text_encoder step.
+          negative_prompt_embeds (`Tensor`, *optional*):
+              negative text embeddings used to guide the image generation. Can be generated from text_encoder step.
+          negative_prompt_embeds_mask (`Tensor`, *optional*):
+              mask for the negative text embeddings. Can be generated from text_encoder step.
+          height (`int`, *optional*):
+              The height in pixels of the generated image.
+          width (`int`, *optional*):
+              The width in pixels of the generated image.
+          image_latents (`Tensor`):
+              image latents used to guide the image generation. Can be generated from vae_encoder step.
+          control_image_latents (`Tensor`):
+              The control image latents to use for the denoising process. Can be generated in controlnet vae encoder
+              step.
+          latents (`Tensor`, *optional*):
+              Pre-generated noisy latents for image generation.
+          generator (`Generator`, *optional*):
+              Torch generator for deterministic generation.
+          num_inference_steps (`int`, *optional*, defaults to 50):
+              The number of denoising steps.
+          sigmas (`List`, *optional*):
+              Custom sigmas for the denoising process.
+          strength (`float`, *optional*, defaults to 0.9):
+              Strength for img2img/inpainting.
+          control_guidance_start (`float`, *optional*, defaults to 0.0):
+              When to start applying ControlNet.
+          control_guidance_end (`float`, *optional*, defaults to 1.0):
+              When to stop applying ControlNet.
+          controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0):
+              Scale for ControlNet conditioning.
+          attention_kwargs (`Dict`, *optional*):
+              Additional kwargs for attention processors.
+          **denoiser_input_fields (`None`, *optional*):
+              conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
+
+      Outputs:
+          latents (`Tensor`):
+              Denoised latents.
+    """
+
    model_name = "qwenimage"
    block_classes = [
        QwenImageImg2ImgInputStep(),
@@ -344,6 +942,12 @@ class QwenImageControlNetImg2ImgCoreDenoiseStep(SequentialPipelineBlocks):
    def description(self):
        return "Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for img2img task."

+    @property
+    def outputs(self):
+        return [
+            OutputParam.template("latents"),
+        ]
+

 # Auto denoise step for QwenImage
 class QwenImageAutoCoreDenoiseStep(ConditionalPipelineBlocks):
@@ -402,19 +1006,36 @@ class QwenImageAutoCoreDenoiseStep(ConditionalPipelineBlocks):
    @property
    def outputs(self):
        return [
-            OutputParam(
-                name="latents", type_hint=torch.Tensor, description="The latents generated by the denoising step"
-            ),
+            OutputParam.template("latents"),
        ]


 # ====================
-# 3. DECODE
+# 4. DECODE
 # ====================


 # standard decode step works for most tasks except for inpaint
+# auto_docstring
 class QwenImageDecodeStep(SequentialPipelineBlocks):
+    """
+    Decode step that decodes the latents to images and postprocess the generated image.
+
+      Components:
+          vae (`AutoencoderKLQwenImage`) image_processor (`VaeImageProcessor`)
+
+      Inputs:
+          latents (`Tensor`):
+              The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise
+              step.
+          output_type (`str`, *optional*, defaults to pil):
+              Output format: 'pil', 'np', 'pt'.
+
+      Outputs:
+          images (`List`):
+              Generated images. (tensor output of the vae decoder.)
+    """
+
    model_name = "qwenimage"
    block_classes = [QwenImageDecoderStep(), QwenImageProcessImagesOutputStep()]
    block_names = ["decode", "postprocess"]
@@ -425,7 +1046,30 @@ class QwenImageDecodeStep(SequentialPipelineBlocks):


 # Inpaint decode step
+# auto_docstring
 class QwenImageInpaintDecodeStep(SequentialPipelineBlocks):
+    """
+    Decode step that decodes the latents to images and postprocess the generated image, optional apply the mask
+    overally to the original image.
+
+      Components:
+          vae (`AutoencoderKLQwenImage`) image_mask_processor (`InpaintProcessor`)
+
+      Inputs:
+          latents (`Tensor`):
+              The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise
+              step.
+          output_type (`str`, *optional*, defaults to pil):
+              Output format: 'pil', 'np', 'pt'.
+          mask_overlay_kwargs (`Dict`, *optional*):
+              The kwargs for the postprocess step to apply the mask overlay. generated in
+              InpaintProcessImagesInputStep.
+
+      Outputs:
+          images (`List`):
+              Generated images. (tensor output of the vae decoder.)
+    """
+
    model_name = "qwenimage"
    block_classes = [QwenImageDecoderStep(), QwenImageInpaintProcessImagesOutputStep()]
    block_names = ["decode", "postprocess"]
@@ -452,11 +1096,11 @@ class QwenImageAutoDecodeStep(AutoPipelineBlocks):


 # ====================
-# 4. AUTO BLOCKS & PRESETS
+# 5. AUTO BLOCKS & PRESETS
 # ====================
 AUTO_BLOCKS = InsertableDict(
    [
-        ("text_encoder", QwenImageTextEncoderStep()),
+        ("text_encoder", QwenImageAutoTextEncoderStep()),
        ("vae_encoder", QwenImageAutoVaeEncoderStep()),
        ("controlnet_vae_encoder", QwenImageOptionalControlNetVaeEncoderStep()),
        ("denoise", QwenImageAutoCoreDenoiseStep()),
@@ -465,7 +1109,89 @@ AUTO_BLOCKS = InsertableDict(
 )


+# auto_docstring
 class QwenImageAutoBlocks(SequentialPipelineBlocks):
+    """
+    Auto Modular pipeline for text-to-image, image-to-image, inpainting, and controlnet tasks using QwenImage.
+      - for image-to-image generation, you need to provide `image`
+      - for inpainting, you need to provide `mask_image` and `image`, optionally you can provide `padding_mask_crop`.
+      - to run the controlnet workflow, you need to provide `control_image`
+      - for text-to-image generation, all you need to provide is `prompt`
+
+      Components:
+          text_encoder (`Qwen2_5_VLForConditionalGeneration`): The text encoder to use tokenizer (`Qwen2Tokenizer`):
+          The tokenizer to use guider (`ClassifierFreeGuidance`) image_mask_processor (`InpaintProcessor`) vae
+          (`AutoencoderKLQwenImage`) image_processor (`VaeImageProcessor`) controlnet (`QwenImageControlNetModel`)
+          control_image_processor (`VaeImageProcessor`) pachifier (`QwenImagePachifier`) scheduler
+          (`FlowMatchEulerDiscreteScheduler`) transformer (`QwenImageTransformer2DModel`)
+
+      Inputs:
+          prompt (`str`, *optional*):
+              The prompt or prompts to guide image generation.
+          negative_prompt (`str`, *optional*):
+              The prompt or prompts not to guide the image generation.
+          max_sequence_length (`int`, *optional*, defaults to 1024):
+              Maximum sequence length for prompt encoding.
+          mask_image (`Image`, *optional*):
+              Mask image for inpainting.
+          image (`Union[Image, List]`, *optional*):
+              Reference image(s) for denoising. Can be a single image or list of images.
+          height (`int`, *optional*):
+              The height in pixels of the generated image.
+          width (`int`, *optional*):
+              The width in pixels of the generated image.
+          padding_mask_crop (`int`, *optional*):
+              Padding for mask cropping in inpainting.
+          generator (`Generator`, *optional*):
+              Torch generator for deterministic generation.
+          control_image (`Image`, *optional*):
+              Control image for ControlNet conditioning.
+          num_images_per_prompt (`int`, *optional*, defaults to 1):
+              The number of images to generate per prompt.
+          prompt_embeds (`Tensor`):
+              text embeddings used to guide the image generation. Can be generated from text_encoder step.
+          prompt_embeds_mask (`Tensor`):
+              mask for the text embeddings. Can be generated from text_encoder step.
+          negative_prompt_embeds (`Tensor`, *optional*):
+              negative text embeddings used to guide the image generation. Can be generated from text_encoder step.
+          negative_prompt_embeds_mask (`Tensor`, *optional*):
+              mask for the negative text embeddings. Can be generated from text_encoder step.
+          latents (`Tensor`):
+              Pre-generated noisy latents for image generation.
+          num_inference_steps (`int`):
+              The number of denoising steps.
+          sigmas (`List`, *optional*):
+              Custom sigmas for the denoising process.
+          attention_kwargs (`Dict`, *optional*):
+              Additional kwargs for attention processors.
+          **denoiser_input_fields (`None`, *optional*):
+              conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
+          image_latents (`Tensor`, *optional*):
+              image latents used to guide the image generation. Can be generated from vae_encoder step.
+          processed_mask_image (`Tensor`, *optional*):
+              The processed mask image
+          strength (`float`, *optional*, defaults to 0.9):
+              Strength for img2img/inpainting.
+          control_image_latents (`Tensor`, *optional*):
+              The control image latents to use for the denoising process. Can be generated in controlnet vae encoder
+              step.
+          control_guidance_start (`float`, *optional*, defaults to 0.0):
+              When to start applying ControlNet.
+          control_guidance_end (`float`, *optional*, defaults to 1.0):
+              When to stop applying ControlNet.
+          controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0):
+              Scale for ControlNet conditioning.
+          output_type (`str`, *optional*, defaults to pil):
+              Output format: 'pil', 'np', 'pt'.
+          mask_overlay_kwargs (`Dict`, *optional*):
+              The kwargs for the postprocess step to apply the mask overlay. generated in
+              InpaintProcessImagesInputStep.
+
+      Outputs:
+          images (`List`):
+              Generated images.
+    """
+
    model_name = "qwenimage"

    block_classes = AUTO_BLOCKS.values()
@@ -476,7 +1202,7 @@ class QwenImageAutoBlocks(SequentialPipelineBlocks):
        return (
            "Auto Modular pipeline for text-to-image, image-to-image, inpainting, and controlnet tasks using QwenImage.\n"
            + "- for image-to-image generation, you need to provide `image`\n"
-            + "- for inpainting, you need to provide `mask_image` and `image`, optionally you can provide `padding_mask_crop` \n"
+            + "- for inpainting, you need to provide `mask_image` and `image`, optionally you can provide `padding_mask_crop`.\n"
            + "- to run the controlnet workflow, you need to provide `control_image`\n"
            + "- for text-to-image generation, all you need to provide is `prompt`"
        )
@@ -484,5 +1210,5 @@ class QwenImageAutoBlocks(SequentialPipelineBlocks):
    @property
    def outputs(self):
        return [
-            OutputParam(name="images", type_hint=List[List[PIL.Image.Image]]),
+            OutputParam.template("images"),
        ]
--- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py
+++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py
@@ -12,14 +12,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-from typing import List, Optional
+from typing import Optional

-import PIL.Image
 import torch

 from ...utils import logging
 from ..modular_pipeline import AutoPipelineBlocks, ConditionalPipelineBlocks, SequentialPipelineBlocks
-from ..modular_pipeline_utils import InsertableDict, OutputParam
+from ..modular_pipeline_utils import InputParam, InsertableDict, OutputParam
 from .before_denoise import (
    QwenImageCreateMaskLatentsStep,
    QwenImageEditRoPEInputsStep,
@@ -59,8 +58,35 @@ logger = logging.get_logger(__name__)
 # ====================


+# auto_docstring
 class QwenImageEditVLEncoderStep(SequentialPipelineBlocks):
-    """VL encoder that takes both image and text prompts."""
+    """
+    QwenImage-Edit VL encoder step that encode the image and text prompts together.
+
+      Components:
+          image_resize_processor (`VaeImageProcessor`) text_encoder (`Qwen2_5_VLForConditionalGeneration`) processor
+          (`Qwen2VLProcessor`) guider (`ClassifierFreeGuidance`)
+
+      Inputs:
+          image (`Union[Image, List]`):
+              Reference image(s) for denoising. Can be a single image or list of images.
+          prompt (`str`):
+              The prompt or prompts to guide image generation.
+          negative_prompt (`str`, *optional*):
+              The prompt or prompts not to guide the image generation.
+
+      Outputs:
+          resized_image (`List`):
+              The resized images
+          prompt_embeds (`Tensor`):
+              The prompt embeddings.
+          prompt_embeds_mask (`Tensor`):
+              The encoder attention mask.
+          negative_prompt_embeds (`Tensor`):
+              The negative prompt embeddings.
+          negative_prompt_embeds_mask (`Tensor`):
+              The negative prompt embeddings mask.
+    """

    model_name = "qwenimage-edit"
    block_classes = [
@@ -80,7 +106,30 @@ class QwenImageEditVLEncoderStep(SequentialPipelineBlocks):


 # Edit VAE encoder
+# auto_docstring
 class QwenImageEditVaeEncoderStep(SequentialPipelineBlocks):
+    """
+    Vae encoder step that encode the image inputs into their latent representations.
+
+      Components:
+          image_resize_processor (`VaeImageProcessor`) image_processor (`VaeImageProcessor`) vae
+          (`AutoencoderKLQwenImage`)
+
+      Inputs:
+          image (`Union[Image, List]`):
+              Reference image(s) for denoising. Can be a single image or list of images.
+          generator (`Generator`, *optional*):
+              Torch generator for deterministic generation.
+
+      Outputs:
+          resized_image (`List`):
+              The resized images
+          processed_image (`Tensor`):
+              The processed image
+          image_latents (`Tensor`):
+              The latent representation of the input image.
+    """
+
    model_name = "qwenimage-edit"
    block_classes = [
        QwenImageEditResizeStep(),
@@ -95,12 +144,46 @@ class QwenImageEditVaeEncoderStep(SequentialPipelineBlocks):


 # Edit Inpaint VAE encoder
+# auto_docstring
 class QwenImageEditInpaintVaeEncoderStep(SequentialPipelineBlocks):
+    """
+    This step is used for processing image and mask inputs for QwenImage-Edit inpaint tasks. It:
+       - resize the image for target area (1024 * 1024) while maintaining the aspect ratio.
+       - process the resized image and mask image.
+       - create image latents.
+
+      Components:
+          image_resize_processor (`VaeImageProcessor`) image_mask_processor (`InpaintProcessor`) vae
+          (`AutoencoderKLQwenImage`)
+
+      Inputs:
+          image (`Union[Image, List]`):
+              Reference image(s) for denoising. Can be a single image or list of images.
+          mask_image (`Image`):
+              Mask image for inpainting.
+          padding_mask_crop (`int`, *optional*):
+              Padding for mask cropping in inpainting.
+          generator (`Generator`, *optional*):
+              Torch generator for deterministic generation.
+
+      Outputs:
+          resized_image (`List`):
+              The resized images
+          processed_image (`Tensor`):
+              The processed image
+          processed_mask_image (`Tensor`):
+              The processed mask image
+          mask_overlay_kwargs (`Dict`):
+              The kwargs for the postprocess step to apply the mask overlay
+          image_latents (`Tensor`):
+              The latent representation of the input image.
+    """
+
    model_name = "qwenimage-edit"
    block_classes = [
        QwenImageEditResizeStep(),
        QwenImageEditInpaintProcessImagesInputStep(),
-        QwenImageVaeEncoderStep(input_name="processed_image", output_name="image_latents"),
+        QwenImageVaeEncoderStep(),
    ]
    block_names = ["resize", "preprocess", "encode"]

@@ -137,11 +220,64 @@ class QwenImageEditAutoVaeEncoderStep(AutoPipelineBlocks):


 # assemble input steps
+# auto_docstring
 class QwenImageEditInputStep(SequentialPipelineBlocks):
+    """
+    Input step that prepares the inputs for the edit denoising step. It:
+       - make sure the text embeddings have consistent batch size as well as the additional inputs.
+       - update height/width based `image_latents`, patchify `image_latents`.
+
+      Components:
+          pachifier (`QwenImagePachifier`)
+
+      Inputs:
+          num_images_per_prompt (`int`, *optional*, defaults to 1):
+              The number of images to generate per prompt.
+          prompt_embeds (`Tensor`):
+              text embeddings used to guide the image generation. Can be generated from text_encoder step.
+          prompt_embeds_mask (`Tensor`):
+              mask for the text embeddings. Can be generated from text_encoder step.
+          negative_prompt_embeds (`Tensor`, *optional*):
+              negative text embeddings used to guide the image generation. Can be generated from text_encoder step.
+          negative_prompt_embeds_mask (`Tensor`, *optional*):
+              mask for the negative text embeddings. Can be generated from text_encoder step.
+          height (`int`, *optional*):
+              The height in pixels of the generated image.
+          width (`int`, *optional*):
+              The width in pixels of the generated image.
+          image_latents (`Tensor`):
+              image latents used to guide the image generation. Can be generated from vae_encoder step.
+
+      Outputs:
+          batch_size (`int`):
+              The batch size of the prompt embeddings
+          dtype (`dtype`):
+              The data type of the prompt embeddings
+          prompt_embeds (`Tensor`):
+              The prompt embeddings. (batch-expanded)
+          prompt_embeds_mask (`Tensor`):
+              The encoder attention mask. (batch-expanded)
+          negative_prompt_embeds (`Tensor`):
+              The negative prompt embeddings. (batch-expanded)
+          negative_prompt_embeds_mask (`Tensor`):
+              The negative prompt embeddings mask. (batch-expanded)
+          image_height (`int`):
+              The image height calculated from the image latents dimension
+          image_width (`int`):
+              The image width calculated from the image latents dimension
+          height (`int`):
+              if not provided, updated to image height
+          width (`int`):
+              if not provided, updated to image width
+          image_latents (`Tensor`):
+              image latents used to guide the image generation. Can be generated from vae_encoder step. (patchified and
+              batch-expanded)
+    """
+
    model_name = "qwenimage-edit"
    block_classes = [
        QwenImageTextInputsStep(),
-        QwenImageAdditionalInputsStep(image_latent_inputs=["image_latents"]),
+        QwenImageAdditionalInputsStep(),
    ]
    block_names = ["text_inputs", "additional_inputs"]

@@ -154,12 +290,71 @@ class QwenImageEditInputStep(SequentialPipelineBlocks):
        )


+# auto_docstring
 class QwenImageEditInpaintInputStep(SequentialPipelineBlocks):
+    """
+    Input step that prepares the inputs for the edit inpaint denoising step. It:
+       - make sure the text embeddings have consistent batch size as well as the additional inputs.
+       - update height/width based `image_latents`, patchify `image_latents`.
+
+      Components:
+          pachifier (`QwenImagePachifier`)
+
+      Inputs:
+          num_images_per_prompt (`int`, *optional*, defaults to 1):
+              The number of images to generate per prompt.
+          prompt_embeds (`Tensor`):
+              text embeddings used to guide the image generation. Can be generated from text_encoder step.
+          prompt_embeds_mask (`Tensor`):
+              mask for the text embeddings. Can be generated from text_encoder step.
+          negative_prompt_embeds (`Tensor`, *optional*):
+              negative text embeddings used to guide the image generation. Can be generated from text_encoder step.
+          negative_prompt_embeds_mask (`Tensor`, *optional*):
+              mask for the negative text embeddings. Can be generated from text_encoder step.
+          height (`int`, *optional*):
+              The height in pixels of the generated image.
+          width (`int`, *optional*):
+              The width in pixels of the generated image.
+          image_latents (`Tensor`):
+              image latents used to guide the image generation. Can be generated from vae_encoder step.
+          processed_mask_image (`Tensor`, *optional*):
+              The processed mask image
+
+      Outputs:
+          batch_size (`int`):
+              The batch size of the prompt embeddings
+          dtype (`dtype`):
+              The data type of the prompt embeddings
+          prompt_embeds (`Tensor`):
+              The prompt embeddings. (batch-expanded)
+          prompt_embeds_mask (`Tensor`):
+              The encoder attention mask. (batch-expanded)
+          negative_prompt_embeds (`Tensor`):
+              The negative prompt embeddings. (batch-expanded)
+          negative_prompt_embeds_mask (`Tensor`):
+              The negative prompt embeddings mask. (batch-expanded)
+          image_height (`int`):
+              The image height calculated from the image latents dimension
+          image_width (`int`):
+              The image width calculated from the image latents dimension
+          height (`int`):
+              if not provided, updated to image height
+          width (`int`):
+              if not provided, updated to image width
+          image_latents (`Tensor`):
+              image latents used to guide the image generation. Can be generated from vae_encoder step. (patchified and
+              batch-expanded)
+          processed_mask_image (`Tensor`):
+              The processed mask image (batch-expanded)
+    """
+
    model_name = "qwenimage-edit"
    block_classes = [
        QwenImageTextInputsStep(),
        QwenImageAdditionalInputsStep(
-            image_latent_inputs=["image_latents"], additional_batch_inputs=["processed_mask_image"]
+            additional_batch_inputs=[
+                InputParam(name="processed_mask_image", type_hint=torch.Tensor, description="The processed mask image")
+            ]
        ),
    ]
    block_names = ["text_inputs", "additional_inputs"]
@@ -174,7 +369,42 @@ class QwenImageEditInpaintInputStep(SequentialPipelineBlocks):


 # assemble prepare latents steps
+# auto_docstring
 class QwenImageEditInpaintPrepareLatentsStep(SequentialPipelineBlocks):
+    """
+    This step prepares the latents/image_latents and mask inputs for the edit inpainting denoising step. It:
+       - Add noise to the image latents to create the latents input for the denoiser.
+       - Create the patchified latents `mask` based on the processed mask image.
+
+      Components:
+          scheduler (`FlowMatchEulerDiscreteScheduler`) pachifier (`QwenImagePachifier`)
+
+      Inputs:
+          latents (`Tensor`):
+              The initial random noised, can be generated in prepare latent step.
+          image_latents (`Tensor`):
+              image latents used to guide the image generation. Can be generated from vae_encoder step. (Can be
+              generated from vae encoder and updated in input step.)
+          timesteps (`Tensor`):
+              The timesteps to use for the denoising process. Can be generated in set_timesteps step.
+          processed_mask_image (`Tensor`):
+              The processed mask to use for the inpainting process.
+          height (`int`):
+              The height in pixels of the generated image.
+          width (`int`):
+              The width in pixels of the generated image.
+          dtype (`dtype`, *optional*, defaults to torch.float32):
+              The dtype of the model inputs, can be generated in input step.
+
+      Outputs:
+          initial_noise (`Tensor`):
+              The initial random noised used for inpainting denoising.
+          latents (`Tensor`):
+              The scaled noisy latents to use for inpainting/image-to-image denoising.
+          mask (`Tensor`):
+              The mask to use for the inpainting process.
+    """
+
    model_name = "qwenimage-edit"
    block_classes = [QwenImagePrepareLatentsWithStrengthStep(), QwenImageCreateMaskLatentsStep()]
    block_names = ["add_noise_to_latents", "create_mask_latents"]
@@ -189,7 +419,50 @@ class QwenImageEditInpaintPrepareLatentsStep(SequentialPipelineBlocks):


 # Qwen Image Edit (image2image) core denoise step
+# auto_docstring
 class QwenImageEditCoreDenoiseStep(SequentialPipelineBlocks):
+    """
+    Core denoising workflow for QwenImage-Edit edit (img2img) task.
+
+      Components:
+          pachifier (`QwenImagePachifier`) scheduler (`FlowMatchEulerDiscreteScheduler`) guider
+          (`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`)
+
+      Inputs:
+          num_images_per_prompt (`int`, *optional*, defaults to 1):
+              The number of images to generate per prompt.
+          prompt_embeds (`Tensor`):
+              text embeddings used to guide the image generation. Can be generated from text_encoder step.
+          prompt_embeds_mask (`Tensor`):
+              mask for the text embeddings. Can be generated from text_encoder step.
+          negative_prompt_embeds (`Tensor`, *optional*):
+              negative text embeddings used to guide the image generation. Can be generated from text_encoder step.
+          negative_prompt_embeds_mask (`Tensor`, *optional*):
+              mask for the negative text embeddings. Can be generated from text_encoder step.
+          height (`int`, *optional*):
+              The height in pixels of the generated image.
+          width (`int`, *optional*):
+              The width in pixels of the generated image.
+          image_latents (`Tensor`):
+              image latents used to guide the image generation. Can be generated from vae_encoder step.
+          latents (`Tensor`, *optional*):
+              Pre-generated noisy latents for image generation.
+          generator (`Generator`, *optional*):
+              Torch generator for deterministic generation.
+          num_inference_steps (`int`, *optional*, defaults to 50):
+              The number of denoising steps.
+          sigmas (`List`, *optional*):
+              Custom sigmas for the denoising process.
+          attention_kwargs (`Dict`, *optional*):
+              Additional kwargs for attention processors.
+          **denoiser_input_fields (`None`, *optional*):
+              conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
+
+      Outputs:
+          latents (`Tensor`):
+              Denoised latents.
+    """
+
    model_name = "qwenimage-edit"
    block_classes = [
        QwenImageEditInputStep(),
@@ -212,9 +485,62 @@ class QwenImageEditCoreDenoiseStep(SequentialPipelineBlocks):
    def description(self):
        return "Core denoising workflow for QwenImage-Edit edit (img2img) task."

+    @property
+    def outputs(self):
+        return [
+            OutputParam.template("latents"),
+        ]
+

 # Qwen Image Edit (inpainting) core denoise step
+# auto_docstring
 class QwenImageEditInpaintCoreDenoiseStep(SequentialPipelineBlocks):
+    """
+    Core denoising workflow for QwenImage-Edit edit inpaint task.
+
+      Components:
+          pachifier (`QwenImagePachifier`) scheduler (`FlowMatchEulerDiscreteScheduler`) guider
+          (`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`)
+
+      Inputs:
+          num_images_per_prompt (`int`, *optional*, defaults to 1):
+              The number of images to generate per prompt.
+          prompt_embeds (`Tensor`):
+              text embeddings used to guide the image generation. Can be generated from text_encoder step.
+          prompt_embeds_mask (`Tensor`):
+              mask for the text embeddings. Can be generated from text_encoder step.
+          negative_prompt_embeds (`Tensor`, *optional*):
+              negative text embeddings used to guide the image generation. Can be generated from text_encoder step.
+          negative_prompt_embeds_mask (`Tensor`, *optional*):
+              mask for the negative text embeddings. Can be generated from text_encoder step.
+          height (`int`, *optional*):
+              The height in pixels of the generated image.
+          width (`int`, *optional*):
+              The width in pixels of the generated image.
+          image_latents (`Tensor`):
+              image latents used to guide the image generation. Can be generated from vae_encoder step.
+          processed_mask_image (`Tensor`, *optional*):
+              The processed mask image
+          latents (`Tensor`, *optional*):
+              Pre-generated noisy latents for image generation.
+          generator (`Generator`, *optional*):
+              Torch generator for deterministic generation.
+          num_inference_steps (`int`, *optional*, defaults to 50):
+              The number of denoising steps.
+          sigmas (`List`, *optional*):
+              Custom sigmas for the denoising process.
+          strength (`float`, *optional*, defaults to 0.9):
+              Strength for img2img/inpainting.
+          attention_kwargs (`Dict`, *optional*):
+              Additional kwargs for attention processors.
+          **denoiser_input_fields (`None`, *optional*):
+              conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
+
+      Outputs:
+          latents (`Tensor`):
+              Denoised latents.
+    """
+
    model_name = "qwenimage-edit"
    block_classes = [
        QwenImageEditInpaintInputStep(),
@@ -239,6 +565,12 @@ class QwenImageEditInpaintCoreDenoiseStep(SequentialPipelineBlocks):
    def description(self):
        return "Core denoising workflow for QwenImage-Edit edit inpaint task."

+    @property
+    def outputs(self):
+        return [
+            OutputParam.template("latents"),
+        ]
+

 # Auto core denoise step for QwenImage Edit
 class QwenImageEditAutoCoreDenoiseStep(ConditionalPipelineBlocks):
@@ -267,6 +599,12 @@ class QwenImageEditAutoCoreDenoiseStep(ConditionalPipelineBlocks):
            "Supports edit (img2img) and edit inpainting tasks for QwenImage-Edit."
        )

+    @property
+    def outputs(self):
+        return [
+            OutputParam.template("latents"),
+        ]
+

 # ====================
 # 4. DECODE
@@ -274,7 +612,26 @@ class QwenImageEditAutoCoreDenoiseStep(ConditionalPipelineBlocks):


 # Decode step (standard)
+# auto_docstring
 class QwenImageEditDecodeStep(SequentialPipelineBlocks):
+    """
+    Decode step that decodes the latents to images and postprocess the generated image.
+
+      Components:
+          vae (`AutoencoderKLQwenImage`) image_processor (`VaeImageProcessor`)
+
+      Inputs:
+          latents (`Tensor`):
+              The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise
+              step.
+          output_type (`str`, *optional*, defaults to pil):
+              Output format: 'pil', 'np', 'pt'.
+
+      Outputs:
+          images (`List`):
+              Generated images. (tensor output of the vae decoder.)
+    """
+
    model_name = "qwenimage-edit"
    block_classes = [QwenImageDecoderStep(), QwenImageProcessImagesOutputStep()]
    block_names = ["decode", "postprocess"]
@@ -285,7 +642,30 @@ class QwenImageEditDecodeStep(SequentialPipelineBlocks):


 # Inpaint decode step
+# auto_docstring
 class QwenImageEditInpaintDecodeStep(SequentialPipelineBlocks):
+    """
+    Decode step that decodes the latents to images and postprocess the generated image, optionally apply the mask
+    overlay to the original image.
+
+      Components:
+          vae (`AutoencoderKLQwenImage`) image_mask_processor (`InpaintProcessor`)
+
+      Inputs:
+          latents (`Tensor`):
+              The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise
+              step.
+          output_type (`str`, *optional*, defaults to pil):
+              Output format: 'pil', 'np', 'pt'.
+          mask_overlay_kwargs (`Dict`, *optional*):
+              The kwargs for the postprocess step to apply the mask overlay. generated in
+              InpaintProcessImagesInputStep.
+
+      Outputs:
+          images (`List`):
+              Generated images. (tensor output of the vae decoder.)
+    """
+
    model_name = "qwenimage-edit"
    block_classes = [QwenImageDecoderStep(), QwenImageInpaintProcessImagesOutputStep()]
    block_names = ["decode", "postprocess"]
@@ -313,9 +693,7 @@ class QwenImageEditAutoDecodeStep(AutoPipelineBlocks):
    @property
    def outputs(self):
        return [
-            OutputParam(
-                name="latents", type_hint=torch.Tensor, description="The latents generated by the denoising step"
-            ),
+            OutputParam.template("latents"),
        ]


@@ -333,7 +711,66 @@ EDIT_AUTO_BLOCKS = InsertableDict(
 )


+# auto_docstring
 class QwenImageEditAutoBlocks(SequentialPipelineBlocks):
+    """
+    Auto Modular pipeline for edit (img2img) and edit inpaint tasks using QwenImage-Edit.
+      - for edit (img2img) generation, you need to provide `image`
+      - for edit inpainting, you need to provide `mask_image` and `image`, optionally you can provide
+        `padding_mask_crop`
+
+      Components:
+          image_resize_processor (`VaeImageProcessor`) text_encoder (`Qwen2_5_VLForConditionalGeneration`) processor
+          (`Qwen2VLProcessor`) guider (`ClassifierFreeGuidance`) image_mask_processor (`InpaintProcessor`) vae
+          (`AutoencoderKLQwenImage`) image_processor (`VaeImageProcessor`) pachifier (`QwenImagePachifier`) scheduler
+          (`FlowMatchEulerDiscreteScheduler`) transformer (`QwenImageTransformer2DModel`)
+
+      Inputs:
+          image (`Union[Image, List]`):
+              Reference image(s) for denoising. Can be a single image or list of images.
+          prompt (`str`):
+              The prompt or prompts to guide image generation.
+          negative_prompt (`str`, *optional*):
+              The prompt or prompts not to guide the image generation.
+          mask_image (`Image`, *optional*):
+              Mask image for inpainting.
+          padding_mask_crop (`int`, *optional*):
+              Padding for mask cropping in inpainting.
+          generator (`Generator`, *optional*):
+              Torch generator for deterministic generation.
+          num_images_per_prompt (`int`, *optional*, defaults to 1):
+              The number of images to generate per prompt.
+          height (`int`):
+              The height in pixels of the generated image.
+          width (`int`):
+              The width in pixels of the generated image.
+          image_latents (`Tensor`):
+              image latents used to guide the image generation. Can be generated from vae_encoder step.
+          processed_mask_image (`Tensor`, *optional*):
+              The processed mask image
+          latents (`Tensor`):
+              Pre-generated noisy latents for image generation.
+          num_inference_steps (`int`):
+              The number of denoising steps.
+          sigmas (`List`, *optional*):
+              Custom sigmas for the denoising process.
+          strength (`float`, *optional*, defaults to 0.9):
+              Strength for img2img/inpainting.
+          attention_kwargs (`Dict`, *optional*):
+              Additional kwargs for attention processors.
+          **denoiser_input_fields (`None`, *optional*):
+              conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
+          output_type (`str`, *optional*, defaults to pil):
+              Output format: 'pil', 'np', 'pt'.
+          mask_overlay_kwargs (`Dict`, *optional*):
+              The kwargs for the postprocess step to apply the mask overlay. generated in
+              InpaintProcessImagesInputStep.
+
+      Outputs:
+          images (`List`):
+              Generated images.
+    """
+
    model_name = "qwenimage-edit"
    block_classes = EDIT_AUTO_BLOCKS.values()
    block_names = EDIT_AUTO_BLOCKS.keys()
@@ -349,5 +786,5 @@ class QwenImageEditAutoBlocks(SequentialPipelineBlocks):
    @property
    def outputs(self):
        return [
-            OutputParam(name="images", type_hint=List[List[PIL.Image.Image]], description="The generated images"),
+            OutputParam.template("images"),
        ]
--- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py
+++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py
@@ -12,11 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-from typing import List
-
-import PIL.Image
-import torch
-
 from ...utils import logging
 from ..modular_pipeline import SequentialPipelineBlocks
 from ..modular_pipeline_utils import InsertableDict, OutputParam
@@ -53,12 +48,41 @@ logger = logging.get_logger(__name__)
 # ====================


+# auto_docstring
 class QwenImageEditPlusVLEncoderStep(SequentialPipelineBlocks):
-    """VL encoder that takes both image and text prompts. Uses 384x384 target area."""
+    """
+    QwenImage-Edit Plus VL encoder step that encodes the image and text prompts together.
+
+      Components:
+          image_resize_processor (`VaeImageProcessor`) text_encoder (`Qwen2_5_VLForConditionalGeneration`) processor
+          (`Qwen2VLProcessor`) guider (`ClassifierFreeGuidance`)
+
+      Inputs:
+          image (`Union[Image, List]`):
+              Reference image(s) for denoising. Can be a single image or list of images.
+          prompt (`str`):
+              The prompt or prompts to guide image generation.
+          negative_prompt (`str`, *optional*):
+              The prompt or prompts not to guide the image generation.
+
+      Outputs:
+          resized_image (`List`):
+              Images resized to 1024x1024 target area for VAE encoding
+          resized_cond_image (`List`):
+              Images resized to 384x384 target area for VL text encoding
+          prompt_embeds (`Tensor`):
+              The prompt embeddings.
+          prompt_embeds_mask (`Tensor`):
+              The encoder attention mask.
+          negative_prompt_embeds (`Tensor`):
+              The negative prompt embeddings.
+          negative_prompt_embeds_mask (`Tensor`):
+              The negative prompt embeddings mask.
+    """

    model_name = "qwenimage-edit-plus"
    block_classes = [
-        QwenImageEditPlusResizeStep(target_area=384 * 384, output_name="resized_cond_image"),
+        QwenImageEditPlusResizeStep(),
        QwenImageEditPlusTextEncoderStep(),
    ]
    block_names = ["resize", "encode"]
@@ -73,12 +97,36 @@ class QwenImageEditPlusVLEncoderStep(SequentialPipelineBlocks):
 # ====================


+# auto_docstring
 class QwenImageEditPlusVaeEncoderStep(SequentialPipelineBlocks):
-    """VAE encoder that handles multiple images with different sizes. Uses 1024x1024 target area."""
+    """
+    VAE encoder step that encodes image inputs into latent representations.
+      Each image is resized independently based on its own aspect ratio to 1024x1024 target area.
+
+      Components:
+          image_resize_processor (`VaeImageProcessor`) image_processor (`VaeImageProcessor`) vae
+          (`AutoencoderKLQwenImage`)
+
+      Inputs:
+          image (`Union[Image, List]`):
+              Reference image(s) for denoising. Can be a single image or list of images.
+          generator (`Generator`, *optional*):
+              Torch generator for deterministic generation.
+
+      Outputs:
+          resized_image (`List`):
+              Images resized to 1024x1024 target area for VAE encoding
+          resized_cond_image (`List`):
+              Images resized to 384x384 target area for VL text encoding
+          processed_image (`Tensor`):
+              The processed image
+          image_latents (`Tensor`):
+              The latent representation of the input image.
+    """

    model_name = "qwenimage-edit-plus"
    block_classes = [
-        QwenImageEditPlusResizeStep(target_area=1024 * 1024, output_name="resized_image"),
+        QwenImageEditPlusResizeStep(),
        QwenImageEditPlusProcessImagesInputStep(),
        QwenImageVaeEncoderStep(),
    ]
@@ -98,11 +146,66 @@ class QwenImageEditPlusVaeEncoderStep(SequentialPipelineBlocks):


 # assemble input steps
+# auto_docstring
 class QwenImageEditPlusInputStep(SequentialPipelineBlocks):
+    """
+    Input step that prepares the inputs for the Edit Plus denoising step. It:
+       - Standardizes text embeddings batch size.
+       - Processes list of image latents: patchifies, concatenates along dim=1, expands batch.
+       - Outputs lists of image_height/image_width for RoPE calculation.
+       - Defaults height/width from last image in the list.
+
+      Components:
+          pachifier (`QwenImagePachifier`)
+
+      Inputs:
+          num_images_per_prompt (`int`, *optional*, defaults to 1):
+              The number of images to generate per prompt.
+          prompt_embeds (`Tensor`):
+              text embeddings used to guide the image generation. Can be generated from text_encoder step.
+          prompt_embeds_mask (`Tensor`):
+              mask for the text embeddings. Can be generated from text_encoder step.
+          negative_prompt_embeds (`Tensor`, *optional*):
+              negative text embeddings used to guide the image generation. Can be generated from text_encoder step.
+          negative_prompt_embeds_mask (`Tensor`, *optional*):
+              mask for the negative text embeddings. Can be generated from text_encoder step.
+          height (`int`, *optional*):
+              The height in pixels of the generated image.
+          width (`int`, *optional*):
+              The width in pixels of the generated image.
+          image_latents (`Tensor`):
+              image latents used to guide the image generation. Can be generated from vae_encoder step.
+
+      Outputs:
+          batch_size (`int`):
+              The batch size of the prompt embeddings
+          dtype (`dtype`):
+              The data type of the prompt embeddings
+          prompt_embeds (`Tensor`):
+              The prompt embeddings. (batch-expanded)
+          prompt_embeds_mask (`Tensor`):
+              The encoder attention mask. (batch-expanded)
+          negative_prompt_embeds (`Tensor`):
+              The negative prompt embeddings. (batch-expanded)
+          negative_prompt_embeds_mask (`Tensor`):
+              The negative prompt embeddings mask. (batch-expanded)
+          image_height (`List`):
+              The image heights calculated from the image latents dimension
+          image_width (`List`):
+              The image widths calculated from the image latents dimension
+          height (`int`):
+              if not provided, updated to image height
+          width (`int`):
+              if not provided, updated to image width
+          image_latents (`Tensor`):
+              image latents used to guide the image generation. Can be generated from vae_encoder step. (patchified,
+              concatenated, and batch-expanded)
+    """
+
    model_name = "qwenimage-edit-plus"
    block_classes = [
        QwenImageTextInputsStep(),
-        QwenImageEditPlusAdditionalInputsStep(image_latent_inputs=["image_latents"]),
+        QwenImageEditPlusAdditionalInputsStep(),
    ]
    block_names = ["text_inputs", "additional_inputs"]

@@ -118,7 +221,50 @@ class QwenImageEditPlusInputStep(SequentialPipelineBlocks):


 # Qwen Image Edit Plus (image2image) core denoise step
+# auto_docstring
 class QwenImageEditPlusCoreDenoiseStep(SequentialPipelineBlocks):
+    """
+    Core denoising workflow for QwenImage-Edit Plus edit (img2img) task.
+
+      Components:
+          pachifier (`QwenImagePachifier`) scheduler (`FlowMatchEulerDiscreteScheduler`) guider
+          (`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`)
+
+      Inputs:
+          num_images_per_prompt (`int`, *optional*, defaults to 1):
+              The number of images to generate per prompt.
+          prompt_embeds (`Tensor`):
+              text embeddings used to guide the image generation. Can be generated from text_encoder step.
+          prompt_embeds_mask (`Tensor`):
+              mask for the text embeddings. Can be generated from text_encoder step.
+          negative_prompt_embeds (`Tensor`, *optional*):
+              negative text embeddings used to guide the image generation. Can be generated from text_encoder step.
+          negative_prompt_embeds_mask (`Tensor`, *optional*):
+              mask for the negative text embeddings. Can be generated from text_encoder step.
+          height (`int`, *optional*):
+              The height in pixels of the generated image.
+          width (`int`, *optional*):
+              The width in pixels of the generated image.
+          image_latents (`Tensor`):
+              image latents used to guide the image generation. Can be generated from vae_encoder step.
+          latents (`Tensor`, *optional*):
+              Pre-generated noisy latents for image generation.
+          generator (`Generator`, *optional*):
+              Torch generator for deterministic generation.
+          num_inference_steps (`int`, *optional*, defaults to 50):
+              The number of denoising steps.
+          sigmas (`List`, *optional*):
+              Custom sigmas for the denoising process.
+          attention_kwargs (`Dict`, *optional*):
+              Additional kwargs for attention processors.
+          **denoiser_input_fields (`None`, *optional*):
+              conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
+
+      Outputs:
+          latents (`Tensor`):
+              Denoised latents.
+    """
+
    model_name = "qwenimage-edit-plus"
    block_classes = [
        QwenImageEditPlusInputStep(),
@@ -144,9 +290,7 @@ class QwenImageEditPlusCoreDenoiseStep(SequentialPipelineBlocks):
    @property
    def outputs(self):
        return [
-            OutputParam(
-                name="latents", type_hint=torch.Tensor, description="The latents generated by the denoising step"
-            ),
+            OutputParam.template("latents"),
        ]


@@ -155,7 +299,26 @@ class QwenImageEditPlusCoreDenoiseStep(SequentialPipelineBlocks):
 # ====================


+# auto_docstring
 class QwenImageEditPlusDecodeStep(SequentialPipelineBlocks):
+    """
+    Decode step that decodes the latents to images and postprocesses the generated image.
+
+      Components:
+          vae (`AutoencoderKLQwenImage`) image_processor (`VaeImageProcessor`)
+
+      Inputs:
+          latents (`Tensor`):
+              The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise
+              step.
+          output_type (`str`, *optional*, defaults to pil):
+              Output format: 'pil', 'np', 'pt'.
+
+      Outputs:
+          images (`List`):
+              Generated images. (tensor output of the vae decoder.)
+    """
+
    model_name = "qwenimage-edit-plus"
    block_classes = [QwenImageDecoderStep(), QwenImageProcessImagesOutputStep()]
    block_names = ["decode", "postprocess"]
@@ -179,7 +342,53 @@ EDIT_PLUS_AUTO_BLOCKS = InsertableDict(
 )


+# auto_docstring
 class QwenImageEditPlusAutoBlocks(SequentialPipelineBlocks):
+    """
+    Auto Modular pipeline for edit (img2img) tasks using QwenImage-Edit Plus.
+      - `image` is required input (can be single image or list of images).
+      - Each image is resized independently based on its own aspect ratio.
+      - VL encoder uses 384x384 target area, VAE encoder uses 1024x1024 target area.
+
+      Components:
+          image_resize_processor (`VaeImageProcessor`) text_encoder (`Qwen2_5_VLForConditionalGeneration`) processor
+          (`Qwen2VLProcessor`) guider (`ClassifierFreeGuidance`) image_processor (`VaeImageProcessor`) vae
+          (`AutoencoderKLQwenImage`) pachifier (`QwenImagePachifier`) scheduler (`FlowMatchEulerDiscreteScheduler`)
+          transformer (`QwenImageTransformer2DModel`)
+
+      Inputs:
+          image (`Union[Image, List]`):
+              Reference image(s) for denoising. Can be a single image or list of images.
+          prompt (`str`):
+              The prompt or prompts to guide image generation.
+          negative_prompt (`str`, *optional*):
+              The prompt or prompts not to guide the image generation.
+          generator (`Generator`, *optional*):
+              Torch generator for deterministic generation.
+          num_images_per_prompt (`int`, *optional*, defaults to 1):
+              The number of images to generate per prompt.
+          height (`int`, *optional*):
+              The height in pixels of the generated image.
+          width (`int`, *optional*):
+              The width in pixels of the generated image.
+          latents (`Tensor`, *optional*):
+              Pre-generated noisy latents for image generation.
+          num_inference_steps (`int`, *optional*, defaults to 50):
+              The number of denoising steps.
+          sigmas (`List`, *optional*):
+              Custom sigmas for the denoising process.
+          attention_kwargs (`Dict`, *optional*):
+              Additional kwargs for attention processors.
+          **denoiser_input_fields (`None`, *optional*):
+              conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
+          output_type (`str`, *optional*, defaults to pil):
+              Output format: 'pil', 'np', 'pt'.
+
+      Outputs:
+          images (`List`):
+              Generated images.
+    """
+
    model_name = "qwenimage-edit-plus"
    block_classes = EDIT_PLUS_AUTO_BLOCKS.values()
    block_names = EDIT_PLUS_AUTO_BLOCKS.keys()
@@ -196,5 +405,5 @@ class QwenImageEditPlusAutoBlocks(SequentialPipelineBlocks):
    @property
    def outputs(self):
        return [
-            OutputParam(name="images", type_hint=List[List[PIL.Image.Image]], description="The generated images"),
+            OutputParam.template("images"),
        ]
--- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py
+++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py
@@ -12,12 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-
-from typing import List
-
-import PIL.Image
-import torch
-
 from ...utils import logging
 from ..modular_pipeline import SequentialPipelineBlocks
 from ..modular_pipeline_utils import InsertableDict, OutputParam
@@ -55,8 +49,44 @@ logger = logging.get_logger(__name__)
 # ====================


+# auto_docstring
 class QwenImageLayeredTextEncoderStep(SequentialPipelineBlocks):
-    """Text encoder that takes text prompt, will generate a prompt based on image if not provided."""
+    """
+    QwenImage-Layered Text encoder step that encode the text prompt, will generate a prompt based on image if not
+    provided.
+
+      Components:
+          image_resize_processor (`VaeImageProcessor`) text_encoder (`Qwen2_5_VLForConditionalGeneration`) processor
+          (`Qwen2VLProcessor`) tokenizer (`Qwen2Tokenizer`): The tokenizer to use guider (`ClassifierFreeGuidance`)
+
+      Inputs:
+          image (`Union[Image, List]`):
+              Reference image(s) for denoising. Can be a single image or list of images.
+          resolution (`int`, *optional*, defaults to 640):
+              The target area to resize the image to, can be 1024 or 640
+          prompt (`str`, *optional*):
+              The prompt or prompts to guide image generation.
+          use_en_prompt (`bool`, *optional*, defaults to False):
+              Whether to use English prompt template
+          negative_prompt (`str`, *optional*):
+              The prompt or prompts not to guide the image generation.
+          max_sequence_length (`int`, *optional*, defaults to 1024):
+              Maximum sequence length for prompt encoding.
+
+      Outputs:
+          resized_image (`List`):
+              The resized images
+          prompt (`str`):
+              The prompt or prompts to guide image generation. If not provided, updated using image caption
+          prompt_embeds (`Tensor`):
+              The prompt embeddings.
+          prompt_embeds_mask (`Tensor`):
+              The encoder attention mask.
+          negative_prompt_embeds (`Tensor`):
+              The negative prompt embeddings.
+          negative_prompt_embeds_mask (`Tensor`):
+              The negative prompt embeddings mask.
+    """

    model_name = "qwenimage-layered"
    block_classes = [
@@ -77,7 +107,32 @@ class QwenImageLayeredTextEncoderStep(SequentialPipelineBlocks):


 # Edit VAE encoder
+# auto_docstring
 class QwenImageLayeredVaeEncoderStep(SequentialPipelineBlocks):
+    """
+    Vae encoder step that encode the image inputs into their latent representations.
+
+      Components:
+          image_resize_processor (`VaeImageProcessor`) image_processor (`VaeImageProcessor`) vae
+          (`AutoencoderKLQwenImage`)
+
+      Inputs:
+          image (`Union[Image, List]`):
+              Reference image(s) for denoising. Can be a single image or list of images.
+          resolution (`int`, *optional*, defaults to 640):
+              The target area to resize the image to, can be 1024 or 640
+          generator (`Generator`, *optional*):
+              Torch generator for deterministic generation.
+
+      Outputs:
+          resized_image (`List`):
+              The resized images
+          processed_image (`Tensor`):
+              The processed image
+          image_latents (`Tensor`):
+              The latent representation of the input image.
+    """
+
    model_name = "qwenimage-layered"
    block_classes = [
        QwenImageLayeredResizeStep(),
@@ -98,11 +153,60 @@ class QwenImageLayeredVaeEncoderStep(SequentialPipelineBlocks):


 # assemble input steps
+# auto_docstring
 class QwenImageLayeredInputStep(SequentialPipelineBlocks):
+    """
+    Input step that prepares the inputs for the layered denoising step. It:
+       - make sure the text embeddings have consistent batch size as well as the additional inputs.
+       - update height/width based `image_latents`, patchify `image_latents`.
+
+      Components:
+          pachifier (`QwenImageLayeredPachifier`)
+
+      Inputs:
+          num_images_per_prompt (`int`, *optional*, defaults to 1):
+              The number of images to generate per prompt.
+          prompt_embeds (`Tensor`):
+              text embeddings used to guide the image generation. Can be generated from text_encoder step.
+          prompt_embeds_mask (`Tensor`):
+              mask for the text embeddings. Can be generated from text_encoder step.
+          negative_prompt_embeds (`Tensor`, *optional*):
+              negative text embeddings used to guide the image generation. Can be generated from text_encoder step.
+          negative_prompt_embeds_mask (`Tensor`, *optional*):
+              mask for the negative text embeddings. Can be generated from text_encoder step.
+          image_latents (`Tensor`):
+              image latents used to guide the image generation. Can be generated from vae_encoder step.
+
+      Outputs:
+          batch_size (`int`):
+              The batch size of the prompt embeddings
+          dtype (`dtype`):
+              The data type of the prompt embeddings
+          prompt_embeds (`Tensor`):
+              The prompt embeddings. (batch-expanded)
+          prompt_embeds_mask (`Tensor`):
+              The encoder attention mask. (batch-expanded)
+          negative_prompt_embeds (`Tensor`):
+              The negative prompt embeddings. (batch-expanded)
+          negative_prompt_embeds_mask (`Tensor`):
+              The negative prompt embeddings mask. (batch-expanded)
+          image_height (`int`):
+              The image height calculated from the image latents dimension
+          image_width (`int`):
+              The image width calculated from the image latents dimension
+          height (`int`):
+              if not provided, updated to image height
+          width (`int`):
+              if not provided, updated to image width
+          image_latents (`Tensor`):
+              image latents used to guide the image generation. Can be generated from vae_encoder step. (patchified
+              with layered pachifier and batch-expanded)
+    """
+
    model_name = "qwenimage-layered"
    block_classes = [
        QwenImageTextInputsStep(),
-        QwenImageLayeredAdditionalInputsStep(image_latent_inputs=["image_latents"]),
+        QwenImageLayeredAdditionalInputsStep(),
    ]
    block_names = ["text_inputs", "additional_inputs"]

@@ -116,7 +220,48 @@ class QwenImageLayeredInputStep(SequentialPipelineBlocks):


 # Qwen Image Layered (image2image) core denoise step
+# auto_docstring
 class QwenImageLayeredCoreDenoiseStep(SequentialPipelineBlocks):
+    """
+    Core denoising workflow for QwenImage-Layered img2img task.
+
+      Components:
+          pachifier (`QwenImageLayeredPachifier`) scheduler (`FlowMatchEulerDiscreteScheduler`) guider
+          (`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`)
+
+      Inputs:
+          num_images_per_prompt (`int`, *optional*, defaults to 1):
+              The number of images to generate per prompt.
+          prompt_embeds (`Tensor`):
+              text embeddings used to guide the image generation. Can be generated from text_encoder step.
+          prompt_embeds_mask (`Tensor`):
+              mask for the text embeddings. Can be generated from text_encoder step.
+          negative_prompt_embeds (`Tensor`, *optional*):
+              negative text embeddings used to guide the image generation. Can be generated from text_encoder step.
+          negative_prompt_embeds_mask (`Tensor`, *optional*):
+              mask for the negative text embeddings. Can be generated from text_encoder step.
+          image_latents (`Tensor`):
+              image latents used to guide the image generation. Can be generated from vae_encoder step.
+          latents (`Tensor`, *optional*):
+              Pre-generated noisy latents for image generation.
+          layers (`int`, *optional*, defaults to 4):
+              Number of layers to extract from the image
+          generator (`Generator`, *optional*):
+              Torch generator for deterministic generation.
+          num_inference_steps (`int`, *optional*, defaults to 50):
+              The number of denoising steps.
+          sigmas (`List`, *optional*):
+              Custom sigmas for the denoising process.
+          attention_kwargs (`Dict`, *optional*):
+              Additional kwargs for attention processors.
+          **denoiser_input_fields (`None`, *optional*):
+              conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
+
+      Outputs:
+          latents (`Tensor`):
+              Denoised latents.
+    """
+
    model_name = "qwenimage-layered"
    block_classes = [
        QwenImageLayeredInputStep(),
@@ -142,9 +287,7 @@ class QwenImageLayeredCoreDenoiseStep(SequentialPipelineBlocks):
    @property
    def outputs(self):
        return [
-            OutputParam(
-                name="latents", type_hint=torch.Tensor, description="The latents generated by the denoising step"
-            ),
+            OutputParam.template("latents"),
        ]


@@ -162,7 +305,54 @@ LAYERED_AUTO_BLOCKS = InsertableDict(
 )


+# auto_docstring
 class QwenImageLayeredAutoBlocks(SequentialPipelineBlocks):
+    """
+    Auto Modular pipeline for layered denoising tasks using QwenImage-Layered.
+
+      Components:
+          image_resize_processor (`VaeImageProcessor`) text_encoder (`Qwen2_5_VLForConditionalGeneration`) processor
+          (`Qwen2VLProcessor`) tokenizer (`Qwen2Tokenizer`): The tokenizer to use guider (`ClassifierFreeGuidance`)
+          image_processor (`VaeImageProcessor`) vae (`AutoencoderKLQwenImage`) pachifier (`QwenImageLayeredPachifier`)
+          scheduler (`FlowMatchEulerDiscreteScheduler`) transformer (`QwenImageTransformer2DModel`)
+
+      Inputs:
+          image (`Union[Image, List]`):
+              Reference image(s) for denoising. Can be a single image or list of images.
+          resolution (`int`, *optional*, defaults to 640):
+              The target area to resize the image to, can be 1024 or 640
+          prompt (`str`, *optional*):
+              The prompt or prompts to guide image generation.
+          use_en_prompt (`bool`, *optional*, defaults to False):
+              Whether to use English prompt template
+          negative_prompt (`str`, *optional*):
+              The prompt or prompts not to guide the image generation.
+          max_sequence_length (`int`, *optional*, defaults to 1024):
+              Maximum sequence length for prompt encoding.
+          generator (`Generator`, *optional*):
+              Torch generator for deterministic generation.
+          num_images_per_prompt (`int`, *optional*, defaults to 1):
+              The number of images to generate per prompt.
+          latents (`Tensor`, *optional*):
+              Pre-generated noisy latents for image generation.
+          layers (`int`, *optional*, defaults to 4):
+              Number of layers to extract from the image
+          num_inference_steps (`int`, *optional*, defaults to 50):
+              The number of denoising steps.
+          sigmas (`List`, *optional*):
+              Custom sigmas for the denoising process.
+          attention_kwargs (`Dict`, *optional*):
+              Additional kwargs for attention processors.
+          **denoiser_input_fields (`None`, *optional*):
+              conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
+          output_type (`str`, *optional*, defaults to pil):
+              Output format: 'pil', 'np', 'pt'.
+
+      Outputs:
+          images (`List`):
+              Generated images.
+    """
+
    model_name = "qwenimage-layered"
    block_classes = LAYERED_AUTO_BLOCKS.values()
    block_names = LAYERED_AUTO_BLOCKS.keys()
@@ -174,5 +364,5 @@ class QwenImageLayeredAutoBlocks(SequentialPipelineBlocks):
    @property
    def outputs(self):
        return [
-            OutputParam(name="images", type_hint=List[List[PIL.Image.Image]], description="The generated images"),
+            OutputParam.template("images"),
        ]
--- a/src/diffusers/modular_pipelines/z_image/denoise.py
+++ b/src/diffusers/modular_pipelines/z_image/denoise.py
@@ -131,7 +131,7 @@ class ZImageLoopDenoiser(ModularPipelineBlocks):
            ),
            InputParam(
                kwargs_type="denoiser_input_fields",
-                description="conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.",
+                description="The conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.",
            ),
        ]
        guider_input_names = []
--- a/src/diffusers/pipelines/init.py
+++ b/src/diffusers/pipelines/init.py
@@ -129,7 +129,7 @@ else:
        "AnimateDiffVideoToVideoControlNetPipeline",
    ]
    _import_structure["bria"] = ["BriaPipeline"]
-    _import_structure["bria_fibo"] = ["BriaFiboPipeline", "BriaFiboEditPipeline"]
+    _import_structure["bria_fibo"] = ["BriaFiboPipeline"]
    _import_structure["flux2"] = ["Flux2Pipeline", "Flux2KleinPipeline"]
    _import_structure["flux"] = [
        "FluxControlPipeline",
@@ -597,7 +597,7 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
        from .aura_flow import AuraFlowPipeline
        from .blip_diffusion import BlipDiffusionPipeline
        from .bria import BriaPipeline
-        from .bria_fibo import BriaFiboEditPipeline, BriaFiboPipeline
+        from .bria_fibo import BriaFiboPipeline
        from .chroma import ChromaImg2ImgPipeline, ChromaInpaintPipeline, ChromaPipeline
        from .chronoedit import ChronoEditPipeline
        from .cogvideo import (
--- a/src/diffusers/pipelines/bria_fibo/init.py
+++ b/src/diffusers/pipelines/bria_fibo/init.py
@@ -23,8 +23,6 @@ except OptionalDependencyNotAvailable:
    _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
 else:
    _import_structure["pipeline_bria_fibo"] = ["BriaFiboPipeline"]
-    _import_structure["pipeline_bria_fibo_edit"] = ["BriaFiboEditPipeline"]
-

 if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
    try:
@@ -35,7 +33,6 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
        from ...utils.dummy_torch_and_transformers_objects import *
    else:
        from .pipeline_bria_fibo import BriaFiboPipeline
-        from .pipeline_bria_fibo_edit import BriaFiboEditPipeline

 else:
    import sys
--- a/src/diffusers/pipelines/bria_fibo/pipeline_bria_fibo_edit.py
+++ b/src/diffusers/pipelines/bria_fibo/pipeline_bria_fibo_edit.py
--- a/src/diffusers/utils/dummy_torch_and_transformers_objects.py
+++ b/src/diffusers/utils/dummy_torch_and_transformers_objects.py
@@ -17,51 +17,6 @@ class Flux2AutoBlocks(metaclass=DummyObject):
        requires_backends(cls, ["torch", "transformers"])


-class Flux2KleinAutoBlocks(metaclass=DummyObject):
-    _backends = ["torch", "transformers"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["torch", "transformers"])
-
-    @classmethod
-    def from_config(cls, *args, **kwargs):
-        requires_backends(cls, ["torch", "transformers"])
-
-    @classmethod
-    def from_pretrained(cls, *args, **kwargs):
-        requires_backends(cls, ["torch", "transformers"])
-
-
-class Flux2KleinBaseAutoBlocks(metaclass=DummyObject):
-    _backends = ["torch", "transformers"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["torch", "transformers"])
-
-    @classmethod
-    def from_config(cls, *args, **kwargs):
-        requires_backends(cls, ["torch", "transformers"])
-
-    @classmethod
-    def from_pretrained(cls, *args, **kwargs):
-        requires_backends(cls, ["torch", "transformers"])
-
-
-class Flux2KleinModularPipeline(metaclass=DummyObject):
-    _backends = ["torch", "transformers"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["torch", "transformers"])
-
-    @classmethod
-    def from_config(cls, *args, **kwargs):
-        requires_backends(cls, ["torch", "transformers"])
-
-    @classmethod
-    def from_pretrained(cls, *args, **kwargs):
-        requires_backends(cls, ["torch", "transformers"])
-
-
 class Flux2ModularPipeline(metaclass=DummyObject):
    _backends = ["torch", "transformers"]

@@ -632,21 +587,6 @@ class AuraFlowPipeline(metaclass=DummyObject):
        requires_backends(cls, ["torch", "transformers"])


-class BriaFiboEditPipeline(metaclass=DummyObject):
-    _backends = ["torch", "transformers"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["torch", "transformers"])
-
-    @classmethod
-    def from_config(cls, *args, **kwargs):
-        requires_backends(cls, ["torch", "transformers"])
-
-    @classmethod
-    def from_pretrained(cls, *args, **kwargs):
-        requires_backends(cls, ["torch", "transformers"])
-
-
 class BriaFiboPipeline(metaclass=DummyObject):
    _backends = ["torch", "transformers"]

--- a/tests/pipelines/bria_fibo_edit/init.py
+++ b/tests/pipelines/bria_fibo_edit/init.py
--- a/tests/pipelines/bria_fibo_edit/test_pipeline_bria_fibo_edit.py
+++ b/tests/pipelines/bria_fibo_edit/test_pipeline_bria_fibo_edit.py
@@ -1,192 +0,0 @@
-# Copyright 2024 Bria AI and The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import unittest
-
-import numpy as np
-import torch
-from PIL import Image
-from transformers import AutoTokenizer
-from transformers.models.smollm3.modeling_smollm3 import SmolLM3Config, SmolLM3ForCausalLM
-
-from diffusers import (
-    AutoencoderKLWan,
-    BriaFiboEditPipeline,
-    FlowMatchEulerDiscreteScheduler,
-)
-from diffusers.models.transformers.transformer_bria_fibo import BriaFiboTransformer2DModel
-from tests.pipelines.test_pipelines_common import PipelineTesterMixin
-
-from ...testing_utils import (
-    enable_full_determinism,
-    torch_device,
-)
-
-
-enable_full_determinism()
-
-
-class BriaFiboPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
-    pipeline_class = BriaFiboEditPipeline
-    params = frozenset(["prompt", "height", "width", "guidance_scale"])
-    batch_params = frozenset(["prompt"])
-    test_xformers_attention = False
-    test_layerwise_casting = False
-    test_group_offloading = False
-    supports_dduf = False
-
-    def get_dummy_components(self):
-        torch.manual_seed(0)
-        transformer = BriaFiboTransformer2DModel(
-            patch_size=1,
-            in_channels=16,
-            num_layers=1,
-            num_single_layers=1,
-            attention_head_dim=8,
-            num_attention_heads=2,
-            joint_attention_dim=64,
-            text_encoder_dim=32,
-            pooled_projection_dim=None,
-            axes_dims_rope=[0, 4, 4],
-        )
-
-        vae = AutoencoderKLWan(
-            base_dim=80,
-            decoder_base_dim=128,
-            dim_mult=[1, 2, 4, 4],
-            dropout=0.0,
-            in_channels=12,
-            latents_mean=[0.0] * 16,
-            latents_std=[1.0] * 16,
-            is_residual=True,
-            num_res_blocks=2,
-            out_channels=12,
-            patch_size=2,
-            scale_factor_spatial=16,
-            scale_factor_temporal=4,
-            temperal_downsample=[False, True, True],
-            z_dim=16,
-        )
-        scheduler = FlowMatchEulerDiscreteScheduler()
-        text_encoder = SmolLM3ForCausalLM(SmolLM3Config(hidden_size=32))
-        tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5")
-
-        components = {
-            "scheduler": scheduler,
-            "text_encoder": text_encoder,
-            "tokenizer": tokenizer,
-            "transformer": transformer,
-            "vae": vae,
-        }
-        return components
-
-    def get_dummy_inputs(self, device, seed=0):
-        if str(device).startswith("mps"):
-            generator = torch.manual_seed(seed)
-        else:
-            generator = torch.Generator(device="cpu").manual_seed(seed)
-        inputs = {
-            "prompt": '{"text": "A painting of a squirrel eating a burger","edit_instruction": "A painting of a squirrel eating a burger"}',
-            "negative_prompt": "bad, ugly",
-            "generator": generator,
-            "num_inference_steps": 2,
-            "guidance_scale": 5.0,
-            "height": 192,
-            "width": 336,
-            "output_type": "np",
-        }
-        image = Image.new("RGB", (336, 192), (255, 255, 255))
-        inputs["image"] = image
-        return inputs
-
-    @unittest.skip(reason="will not be supported due to dim-fusion")
-    def test_encode_prompt_works_in_isolation(self):
-        pass
-
-    @unittest.skip(reason="Batching is not supported yet")
-    def test_num_images_per_prompt(self):
-        pass
-
-    @unittest.skip(reason="Batching is not supported yet")
-    def test_inference_batch_consistent(self):
-        pass
-
-    @unittest.skip(reason="Batching is not supported yet")
-    def test_inference_batch_single_identical(self):
-        pass
-
-    def test_bria_fibo_different_prompts(self):
-        pipe = self.pipeline_class(**self.get_dummy_components())
-        pipe = pipe.to(torch_device)
-        inputs = self.get_dummy_inputs(torch_device)
-        output_same_prompt = pipe(**inputs).images[0]
-
-        inputs = self.get_dummy_inputs(torch_device)
-        inputs["prompt"] = {"edit_instruction": "a different prompt"}
-        output_different_prompts = pipe(**inputs).images[0]
-
-        max_diff = np.abs(output_same_prompt - output_different_prompts).max()
-        assert max_diff > 1e-6
-
-    def test_image_output_shape(self):
-        pipe = self.pipeline_class(**self.get_dummy_components())
-        pipe = pipe.to(torch_device)
-        inputs = self.get_dummy_inputs(torch_device)
-
-        height_width_pairs = [(32, 32), (64, 64), (32, 64)]
-        for height, width in height_width_pairs:
-            expected_height = height
-            expected_width = width
-
-            inputs.update({"height": height, "width": width})
-            image = pipe(**inputs).images[0]
-            output_height, output_width, _ = image.shape
-            assert (output_height, output_width) == (expected_height, expected_width)
-
-    def test_bria_fibo_edit_mask(self):
-        pipe = self.pipeline_class(**self.get_dummy_components())
-        pipe = pipe.to(torch_device)
-        inputs = self.get_dummy_inputs(torch_device)
-
-        mask = Image.fromarray((np.ones((192, 336)) * 255).astype(np.uint8), mode="L")
-
-        inputs.update({"mask": mask})
-        output = pipe(**inputs).images[0]
-
-        assert output.shape == (192, 336, 3)
-
-    def test_bria_fibo_edit_mask_image_size_mismatch(self):
-        pipe = self.pipeline_class(**self.get_dummy_components())
-        pipe = pipe.to(torch_device)
-        inputs = self.get_dummy_inputs(torch_device)
-
-        mask = Image.fromarray((np.ones((64, 64)) * 255).astype(np.uint8), mode="L")
-
-        inputs.update({"mask": mask})
-        with self.assertRaisesRegex(ValueError, "Mask and image must have the same size"):
-            pipe(**inputs)
-
-    def test_bria_fibo_edit_mask_no_image(self):
-        pipe = self.pipeline_class(**self.get_dummy_components())
-        pipe = pipe.to(torch_device)
-        inputs = self.get_dummy_inputs(torch_device)
-
-        mask = Image.fromarray((np.ones((32, 32)) * 255).astype(np.uint8), mode="L")
-
-        # Remove image from inputs if it's there (it shouldn't be by default from get_dummy_inputs)
-        inputs.pop("image", None)
-        inputs.update({"mask": mask})
-
-        with self.assertRaisesRegex(ValueError, "If mask is provided, image must also be provided"):
-            pipe(**inputs)
--- a/utils/modular_auto_docstring.py
+++ b/utils/modular_auto_docstring.py
@@ -0,0 +1,300 @@
+# coding=utf-8
+# Copyright 2025 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Auto Docstring Generator for Modular Pipeline Blocks
+
+This script scans Python files for classes that have `# auto_docstring` comment above them
+and inserts/updates the docstring from the class's `doc` property.
+
+Run from the root of the repo:
+    python utils/modular_auto_docstring.py [path] [--fix_and_overwrite]
+
+Examples:
+    # Check for auto_docstring markers (will error if found without proper docstring)
+    python utils/modular_auto_docstring.py
+
+    # Check specific directory
+    python utils/modular_auto_docstring.py src/diffusers/modular_pipelines/
+
+    # Fix and overwrite the docstrings
+    python utils/modular_auto_docstring.py --fix_and_overwrite
+
+Usage in code:
+    # auto_docstring
+    class QwenImageAutoVaeEncoderStep(AutoPipelineBlocks):
+        # docstring will be automatically inserted here
+
+        @property
+        def doc(self):
+            return "Your docstring content..."
+"""
+
+import argparse
+import ast
+import glob
+import importlib
+import os
+import re
+import sys
+
+
+# All paths are set with the intent you should run this script from the root of the repo
+DIFFUSERS_PATH = "src/diffusers"
+REPO_PATH = "."
+
+# Pattern to match the auto_docstring comment
+AUTO_DOCSTRING_PATTERN = re.compile(r"^\s*#\s*auto_docstring\s*$")
+
+
+def setup_diffusers_import():
+    """Setup import path to use the local diffusers module."""
+    src_path = os.path.join(REPO_PATH, "src")
+    if src_path not in sys.path:
+        sys.path.insert(0, src_path)
+
+
+def get_module_from_filepath(filepath: str) -> str:
+    """Convert a filepath to a module name."""
+    filepath = os.path.normpath(filepath)
+
+    if filepath.startswith("src" + os.sep):
+        filepath = filepath[4:]
+
+    if filepath.endswith(".py"):
+        filepath = filepath[:-3]
+
+    module_name = filepath.replace(os.sep, ".")
+    return module_name
+
+
+def load_module(filepath: str):
+    """Load a module from filepath."""
+    setup_diffusers_import()
+    module_name = get_module_from_filepath(filepath)
+
+    try:
+        module = importlib.import_module(module_name)
+        return module
+    except Exception as e:
+        print(f"Warning: Could not import module {module_name}: {e}")
+        return None
+
+
+def get_doc_from_class(module, class_name: str) -> str:
+    """Get the doc property from an instantiated class."""
+    if module is None:
+        return None
+
+    cls = getattr(module, class_name, None)
+    if cls is None:
+        return None
+
+    try:
+        instance = cls()
+        if hasattr(instance, "doc"):
+            return instance.doc
+    except Exception as e:
+        print(f"Warning: Could not instantiate {class_name}: {e}")
+
+    return None
+
+
+def find_auto_docstring_classes(filepath: str) -> list:
+    """
+    Find all classes in a file that have # auto_docstring comment above them.
+
+    Returns list of (class_name, class_line_number, has_existing_docstring, docstring_end_line)
+    """
+    with open(filepath, "r", encoding="utf-8", newline="\n") as f:
+        lines = f.readlines()
+
+    # Parse AST to find class locations and their docstrings
+    content = "".join(lines)
+    try:
+        tree = ast.parse(content)
+    except SyntaxError as e:
+        print(f"Syntax error in {filepath}: {e}")
+        return []
+
+    # Build a map of class_name -> (class_line, has_docstring, docstring_end_line)
+    class_info = {}
+    for node in ast.walk(tree):
+        if isinstance(node, ast.ClassDef):
+            has_docstring = False
+            docstring_end_line = node.lineno  # default to class line
+
+            if node.body and isinstance(node.body[0], ast.Expr):
+                first_stmt = node.body[0]
+                if isinstance(first_stmt.value, ast.Constant) and isinstance(first_stmt.value.value, str):
+                    has_docstring = True
+                    docstring_end_line = first_stmt.end_lineno or first_stmt.lineno
+
+            class_info[node.name] = (node.lineno, has_docstring, docstring_end_line)
+
+    # Now scan for # auto_docstring comments
+    classes_to_update = []
+
+    for i, line in enumerate(lines):
+        if AUTO_DOCSTRING_PATTERN.match(line):
+            # Found the marker, look for class definition on next non-empty, non-comment line
+            j = i + 1
+            while j < len(lines):
+                next_line = lines[j].strip()
+                if next_line and not next_line.startswith("#"):
+                    break
+                j += 1
+
+            if j < len(lines) and lines[j].strip().startswith("class "):
+                # Extract class name
+                match = re.match(r"class\s+(\w+)", lines[j].strip())
+                if match:
+                    class_name = match.group(1)
+                    if class_name in class_info:
+                        class_line, has_docstring, docstring_end_line = class_info[class_name]
+                        classes_to_update.append((class_name, class_line, has_docstring, docstring_end_line))
+
+    return classes_to_update
+
+
+def strip_class_name_line(doc: str, class_name: str) -> str:
+    """Remove the 'class ClassName' line from the doc if present."""
+    lines = doc.strip().split("\n")
+    if lines and lines[0].strip() == f"class {class_name}":
+        # Remove the class line and any blank line following it
+        lines = lines[1:]
+        while lines and not lines[0].strip():
+            lines = lines[1:]
+    return "\n".join(lines)
+
+
+def format_docstring(doc: str, indent: str = "    ") -> str:
+    """Format a doc string as a properly indented docstring."""
+    lines = doc.strip().split("\n")
+
+    if len(lines) == 1:
+        return f'{indent}"""{lines[0]}"""\n'
+    else:
+        result = [f'{indent}"""\n']
+        for line in lines:
+            if line.strip():
+                result.append(f"{indent}{line}\n")
+            else:
+                result.append("\n")
+        result.append(f'{indent}"""\n')
+        return "".join(result)
+
+
+def process_file(filepath: str, overwrite: bool = False) -> list:
+    """
+    Process a file and find/insert docstrings for # auto_docstring marked classes.
+
+    Returns list of classes that need updating.
+    """
+    classes_to_update = find_auto_docstring_classes(filepath)
+
+    if not classes_to_update:
+        return []
+
+    if not overwrite:
+        # Just return the list of classes that need updating
+        return [(filepath, cls_name, line) for cls_name, line, _, _ in classes_to_update]
+
+    # Load the module to get doc properties
+    module = load_module(filepath)
+
+    with open(filepath, "r", encoding="utf-8", newline="\n") as f:
+        lines = f.readlines()
+
+    # Process in reverse order to maintain line numbers
+    updated = False
+    for class_name, class_line, has_docstring, docstring_end_line in reversed(classes_to_update):
+        doc = get_doc_from_class(module, class_name)
+
+        if doc is None:
+            print(f"Warning: Could not get doc for {class_name} in {filepath}")
+            continue
+
+        # Remove the "class ClassName" line since it's redundant in a docstring
+        doc = strip_class_name_line(doc, class_name)
+
+        # Format the new docstring with 4-space indent
+        new_docstring = format_docstring(doc, "    ")
+
+        if has_docstring:
+            # Replace existing docstring (line after class definition to docstring_end_line)
+            # class_line is 1-indexed, we want to replace from class_line+1 to docstring_end_line
+            lines = lines[:class_line] + [new_docstring] + lines[docstring_end_line:]
+        else:
+            # Insert new docstring right after class definition line
+            # class_line is 1-indexed, so lines[class_line-1] is the class line
+            # Insert at position class_line (which is right after the class line)
+            lines = lines[:class_line] + [new_docstring] + lines[class_line:]
+
+        updated = True
+        print(f"Updated docstring for {class_name} in {filepath}")
+
+    if updated:
+        with open(filepath, "w", encoding="utf-8", newline="\n") as f:
+            f.writelines(lines)
+
+    return [(filepath, cls_name, line) for cls_name, line, _, _ in classes_to_update]
+
+
+def check_auto_docstrings(path: str = None, overwrite: bool = False):
+    """
+    Check all files for # auto_docstring markers and optionally fix them.
+    """
+    if path is None:
+        path = DIFFUSERS_PATH
+
+    if os.path.isfile(path):
+        all_files = [path]
+    else:
+        all_files = glob.glob(os.path.join(path, "**/*.py"), recursive=True)
+
+    all_markers = []
+
+    for filepath in all_files:
+        markers = process_file(filepath, overwrite)
+        all_markers.extend(markers)
+
+    if not overwrite and len(all_markers) > 0:
+        message = "\n".join([f"- {f}: {cls} at line {line}" for f, cls, line in all_markers])
+        raise ValueError(
+            f"Found the following # auto_docstring markers that need docstrings:\n{message}\n\n"
+            f"Run `python utils/modular_auto_docstring.py --fix_and_overwrite` to fix them."
+        )
+
+    if overwrite and len(all_markers) > 0:
+        print(f"\nUpdated {len(all_markers)} docstring(s).")
+    elif len(all_markers) == 0:
+        print("No # auto_docstring markers found.")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="Check and fix # auto_docstring markers in modular pipeline blocks",
+    )
+    parser.add_argument("path", nargs="?", default=None, help="File or directory to process (default: src/diffusers)")
+    parser.add_argument(
+        "--fix_and_overwrite",
+        action="store_true",
+        help="Whether to fix the docstrings by inserting them from doc property.",
+    )
+
+    args = parser.parse_args()
+
+    check_auto_docstrings(args.path, args.fix_and_overwrite)
Author	SHA1	Message	Date
github-actions[bot]	23d06423ab	Apply style fixes	2026-01-19 09:23:31 +00:00
YiYi Xu	aba551c868	Merge branch 'main' into modular-doc-improv	2026-01-18 23:20:36 -10:00
yiyixuxu	1f9576a2ca	fix	2026-01-19 09:56:14 +01:00
yiyixuxu	d75fbc43c7	Merge branch 'modular-doc-improv' of github.com:huggingface/diffusers into modular-doc-improv	2026-01-19 09:54:46 +01:00
yiyixuxu	b7127ce7a7	revert change in z	2026-01-19 09:54:40 +01:00
YiYi Xu	7e9d2b954e	Apply suggestions from code review	2026-01-18 22:44:44 -10:00
yiyixuxu	94525200fd	rmove space in make docstring	2026-01-19 09:35:39 +01:00
yiyixuxu	f056af1fbb	make style	2026-01-19 09:27:40 +01:00
yiyixuxu	8d45ff5bf6	apply auto docstring	2026-01-19 09:22:04 +01:00
yiyixuxu	fb15752d55	up up up	2026-01-19 08:10:31 +01:00
yiyixuxu	1f2dbc9dd2	up	2026-01-19 04:10:17 +01:00
yiyixuxu	002c3e8239	add template method	2026-01-19 03:24:34 +01:00
yiyixuxu	de03d7f100	refactor based on dhruv's feedback: remove the class method	2026-01-18 00:35:01 +01:00
yiyixuxu	25c968a38f	add TODO in the description for empty docstring	2026-01-17 09:57:56 +01:00
yiyixuxu	aea0d046f6	address feedbacks	2026-01-17 09:36:58 +01:00
yiyixuxu	1c90ce33f2	up	2026-01-10 12:21:26 +01:00
yiyixuxu	507953f415	more more	2026-01-10 12:19:14 +01:00
yiyixuxu	f0555af1c6	up up up	2026-01-10 12:15:53 +01:00
yiyixuxu	2a81f2ec54	style	2026-01-10 12:15:36 +01:00
yiyixuxu	d20f413f78	more auto docstring	2026-01-10 12:11:28 +01:00
yiyixuxu	ff09bf1a63	add modular_auto_docstring!	2026-01-10 11:55:03 +01:00
yiyixuxu	34a743e2dc	style	2026-01-10 10:57:27 +01:00
yiyixuxu	43ab14845d	update outputs	2026-01-10 10:56:54 +01:00
YiYi Xu	fbfe5c8d6b	Merge branch 'main' into modular-doc-improv	2026-01-09 23:54:23 -10:00
yiyixuxu	b29873dee7	up up	2026-01-10 10:52:53 +01:00
yiyixuxu	7b499de6d0	up	2026-01-10 03:35:15 +01:00