Compare commits


6 Commits

Author SHA1 Message Date
Sayak Paul    fe4c0be8a6    Merge branch 'main' into device-map-direct    2026-01-19 10:26:24 +05:30
Sayak Paul    b28d6d45fa    Merge branch 'main' into device-map-direct    2026-01-13 10:35:08 +05:30
Sayak Paul    3b334de68a    Merge branch 'main' into device-map-direct    2026-01-08 12:23:39 +05:30
Sayak Paul    c61e455ce7    Merge branch 'main' into device-map-direct    2025-12-23 13:16:10 +05:30
Sayak Paul    6f5eb0a933    Merge branch 'main' into device-map-direct    2025-12-11 14:47:09 +08:00
sayakpaul    83ec2fb793    support device type device_maps to work with offloading.    2025-12-09 11:10:41 +05:30
19 changed files with 664 additions and 4068 deletions

View File

@@ -75,27 +75,9 @@ jobs:
if: ${{ failure() }}
run: |
echo "Repo consistency check failed. Please ensure the right dependency versions are installed with 'pip install -e .[quality]' and run 'make fix-copies'" >> $GITHUB_STEP_SUMMARY
check_auto_docs:
runs-on: ubuntu-22.04
steps:
- uses: actions/checkout@v6
- name: Set up Python
uses: actions/setup-python@v6
with:
python-version: "3.10"
- name: Install dependencies
run: |
pip install --upgrade pip
pip install .[quality]
- name: Check auto docs
run: make modular-autodoctrings
- name: Check if failure
if: ${{ failure() }}
run: |
echo "Auto docstring checks failed. Please run `python utils/modular_auto_docstring.py --fix_and_overwrite`." >> $GITHUB_STEP_SUMMARY
run_fast_tests:
needs: [check_code_quality, check_repository_consistency, check_auto_docs]
needs: [check_code_quality, check_repository_consistency]
name: Fast PyTorch Modular Pipeline CPU tests
runs-on:

View File

@@ -70,10 +70,6 @@ fix-copies:
python utils/check_copies.py --fix_and_overwrite
python utils/check_dummies.py --fix_and_overwrite
# Auto docstrings in modular blocks
modular-autodoctrings:
python utils/modular_auto_docstring.py
# Run tests for the library
test:

View File

@@ -18,7 +18,6 @@ from collections import OrderedDict
from dataclasses import dataclass, field, fields
from typing import Any, Dict, List, Literal, Optional, Type, Union
import PIL.Image
import torch
from ..configuration_utils import ConfigMixin, FrozenDict
@@ -324,192 +323,11 @@ class ConfigSpec:
description: Optional[str] = None
# ======================================================
# InputParam and OutputParam templates
# ======================================================
INPUT_PARAM_TEMPLATES = {
"prompt": {
"type_hint": str,
"required": True,
"description": "The prompt or prompts to guide image generation.",
},
"negative_prompt": {
"type_hint": str,
"description": "The prompt or prompts not to guide the image generation.",
},
"max_sequence_length": {
"type_hint": int,
"default": 512,
"description": "Maximum sequence length for prompt encoding.",
},
"height": {
"type_hint": int,
"description": "The height in pixels of the generated image.",
},
"width": {
"type_hint": int,
"description": "The width in pixels of the generated image.",
},
"num_inference_steps": {
"type_hint": int,
"default": 50,
"description": "The number of denoising steps.",
},
"num_images_per_prompt": {
"type_hint": int,
"default": 1,
"description": "The number of images to generate per prompt.",
},
"generator": {
"type_hint": torch.Generator,
"description": "Torch generator for deterministic generation.",
},
"sigmas": {
"type_hint": List[float],
"description": "Custom sigmas for the denoising process.",
},
"strength": {
"type_hint": float,
"default": 0.9,
"description": "Strength for img2img/inpainting.",
},
"image": {
"type_hint": Union[PIL.Image.Image, List[PIL.Image.Image]],
"required": True,
"description": "Reference image(s) for denoising. Can be a single image or list of images.",
},
"latents": {
"type_hint": torch.Tensor,
"description": "Pre-generated noisy latents for image generation.",
},
"timesteps": {
"type_hint": torch.Tensor,
"description": "Timesteps for the denoising process.",
},
"output_type": {
"type_hint": str,
"default": "pil",
"description": "Output format: 'pil', 'np', 'pt'.",
},
"attention_kwargs": {
"type_hint": Dict[str, Any],
"description": "Additional kwargs for attention processors.",
},
"denoiser_input_fields": {
"name": None,
"kwargs_type": "denoiser_input_fields",
"description": "conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.",
},
# inpainting
"mask_image": {
"type_hint": PIL.Image.Image,
"required": True,
"description": "Mask image for inpainting.",
},
"padding_mask_crop": {
"type_hint": int,
"description": "Padding for mask cropping in inpainting.",
},
# controlnet
"control_image": {
"type_hint": PIL.Image.Image,
"required": True,
"description": "Control image for ControlNet conditioning.",
},
"control_guidance_start": {
"type_hint": float,
"default": 0.0,
"description": "When to start applying ControlNet.",
},
"control_guidance_end": {
"type_hint": float,
"default": 1.0,
"description": "When to stop applying ControlNet.",
},
"controlnet_conditioning_scale": {
"type_hint": float,
"default": 1.0,
"description": "Scale for ControlNet conditioning.",
},
"layers": {
"type_hint": int,
"default": 4,
"description": "Number of layers to extract from the image",
},
# common intermediate inputs
"prompt_embeds": {
"type_hint": torch.Tensor,
"required": True,
"description": "text embeddings used to guide the image generation. Can be generated from text_encoder step.",
},
"prompt_embeds_mask": {
"type_hint": torch.Tensor,
"required": True,
"description": "mask for the text embeddings. Can be generated from text_encoder step.",
},
"negative_prompt_embeds": {
"type_hint": torch.Tensor,
"description": "negative text embeddings used to guide the image generation. Can be generated from text_encoder step.",
},
"negative_prompt_embeds_mask": {
"type_hint": torch.Tensor,
"description": "mask for the negative text embeddings. Can be generated from text_encoder step.",
},
"image_latents": {
"type_hint": torch.Tensor,
"required": True,
"description": "image latents used to guide the image generation. Can be generated from vae_encoder step.",
},
"batch_size": {
"type_hint": int,
"default": 1,
"description": "Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can be generated in input step.",
},
"dtype": {
"type_hint": torch.dtype,
"default": torch.float32,
"description": "The dtype of the model inputs, can be generated in input step.",
},
}
OUTPUT_PARAM_TEMPLATES = {
"images": {
"type_hint": List[PIL.Image.Image],
"description": "Generated images.",
},
"latents": {
"type_hint": torch.Tensor,
"description": "Denoised latents.",
},
# intermediate outputs
"prompt_embeds": {
"type_hint": torch.Tensor,
"kwargs_type": "denoiser_input_fields",
"description": "The prompt embeddings.",
},
"prompt_embeds_mask": {
"type_hint": torch.Tensor,
"kwargs_type": "denoiser_input_fields",
"description": "The encoder attention mask.",
},
"negative_prompt_embeds": {
"type_hint": torch.Tensor,
"kwargs_type": "denoiser_input_fields",
"description": "The negative prompt embeddings.",
},
"negative_prompt_embeds_mask": {
"type_hint": torch.Tensor,
"kwargs_type": "denoiser_input_fields",
"description": "The negative prompt embeddings mask.",
},
"image_latents": {
"type_hint": torch.Tensor,
"description": "The latent representation of the input image.",
},
}
# YiYi Notes: both inputs and intermediate_inputs are InputParam objects
# however some fields are not relevant for intermediate_inputs
# e.g. unlike inputs, required only used in docstring for intermediate_inputs, we do not check if a required intermediate inputs is passed
# default is not used for intermediate_inputs, we only use default from inputs, so it is ignored if it is set for intermediate_inputs
# -> should we use different class for inputs and intermediate_inputs?
@dataclass
class InputParam:
"""Specification for an input parameter."""
@@ -519,31 +337,11 @@ class InputParam:
default: Any = None
required: bool = False
description: str = ""
kwargs_type: str = None
kwargs_type: str = None # YiYi Notes: remove this feature (maybe)
def __repr__(self):
return f"<{self.name}: {'required' if self.required else 'optional'}, default={self.default}>"
@classmethod
def template(cls, template_name: str, note: str = None, **overrides) -> "InputParam":
"""Get template for name if exists, otherwise raise ValueError."""
if template_name not in INPUT_PARAM_TEMPLATES:
raise ValueError(f"InputParam template for {template_name} not found")
template_kwargs = INPUT_PARAM_TEMPLATES[template_name].copy()
# Determine the actual param name:
# 1. From overrides if provided
# 2. From template if present
# 3. Fall back to template_name
name = overrides.pop("name", template_kwargs.pop("name", template_name))
if note and "description" in template_kwargs:
template_kwargs["description"] = f"{template_kwargs['description']} ({note})"
template_kwargs.update(overrides)
return cls(name=name, **template_kwargs)
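# Usage sketch (assumes the pre-change module above, where INPUT_PARAM_TEMPLATES
# and InputParam.template are still defined): the classmethod copies the template
# entry, resolves the name, and applies overrides, so
param = InputParam.template("height", required=True)
# produces the same object as constructing it explicitly:
param = InputParam(
    name="height",
    type_hint=int,
    required=True,
    description="The height in pixels of the generated image.",
)
# An optional `note` is appended to the template description in parentheses, e.g.
# InputParam.template("image_latents", note="Can be generated from vae encoder and updated in input step.").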
@dataclass
class OutputParam:
@@ -552,33 +350,13 @@ class OutputParam:
name: str
type_hint: Any = None
description: str = ""
kwargs_type: str = None
kwargs_type: str = None # YiYi notes: remove this feature (maybe)
def __repr__(self):
return (
f"<{self.name}: {self.type_hint.__name__ if hasattr(self.type_hint, '__name__') else str(self.type_hint)}>"
)
@classmethod
def template(cls, template_name: str, note: str = None, **overrides) -> "OutputParam":
"""Get template for name if exists, otherwise raise ValueError."""
if template_name not in OUTPUT_PARAM_TEMPLATES:
raise ValueError(f"OutputParam template for {template_name} not found")
template_kwargs = OUTPUT_PARAM_TEMPLATES[template_name].copy()
# Determine the actual param name:
# 1. From overrides if provided
# 2. From template if present
# 3. Fall back to template_name
name = overrides.pop("name", template_kwargs.pop("name", template_name))
if note and "description" in template_kwargs:
template_kwargs["description"] = f"{template_kwargs['description']} ({note})"
template_kwargs.update(overrides)
return cls(name=name, **template_kwargs)
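# Usage sketch (same assumption as above, using OUTPUT_PARAM_TEMPLATES): e.g.
out = OutputParam.template("latents", note="unpacked to B, C, layers+1, H, W")
# resolves to
out = OutputParam(
    name="latents",
    type_hint=torch.Tensor,
    description="Denoised latents. (unpacked to B, C, layers+1, H, W)",
)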
def format_inputs_short(inputs):
"""
@@ -731,12 +509,10 @@ def format_params(params, header="Args", indent_level=4, max_line_length=115):
desc = re.sub(r"\[(.*?)\]\((https?://[^\s\)]+)\)", r"[\1](\2)", param.description)
wrapped_desc = wrap_text(desc, desc_indent, max_line_length)
param_str += f"\n{desc_indent}{wrapped_desc}"
else:
param_str += f"\n{desc_indent}TODO: Add description."
formatted_params.append(param_str)
return "\n".join(formatted_params)
return "\n\n".join(formatted_params)
def format_input_params(input_params, indent_level=4, max_line_length=115):
@@ -806,7 +582,7 @@ def format_components(components, indent_level=4, max_line_length=115, add_empty
loading_field_values = []
for field_name in component.loading_fields():
field_value = getattr(component, field_name)
if field_value:
if field_value is not None:
loading_field_values.append(f"{field_name}={field_value}")
# Add loading field information if available
@@ -893,17 +669,17 @@ def make_doc_string(
# Add description
if description:
desc_lines = description.strip().split("\n")
aligned_desc = "\n".join(" " + line.rstrip() for line in desc_lines)
aligned_desc = "\n".join(" " + line for line in desc_lines)
output += aligned_desc + "\n\n"
# Add components section if provided
if expected_components and len(expected_components) > 0:
components_str = format_components(expected_components, indent_level=2, add_empty_lines=False)
components_str = format_components(expected_components, indent_level=2)
output += components_str + "\n\n"
# Add configs section if provided
if expected_configs and len(expected_configs) > 0:
configs_str = format_configs(expected_configs, indent_level=2, add_empty_lines=False)
configs_str = format_configs(expected_configs, indent_level=2)
output += configs_str + "\n\n"
# Add inputs section

View File

@@ -118,40 +118,7 @@ def get_timesteps(scheduler, num_inference_steps, strength):
# ====================
# auto_docstring
class QwenImagePrepareLatentsStep(ModularPipelineBlocks):
"""
Prepare initial random noise for the generation process
Components:
pachifier (`QwenImagePachifier`)
Inputs:
latents (`Tensor`, *optional*):
Pre-generated noisy latents for image generation.
height (`int`, *optional*):
The height in pixels of the generated image.
width (`int`, *optional*):
The width in pixels of the generated image.
num_images_per_prompt (`int`, *optional*, defaults to 1):
The number of images to generate per prompt.
generator (`Generator`, *optional*):
Torch generator for deterministic generation.
batch_size (`int`, *optional*, defaults to 1):
Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can
be generated in input step.
dtype (`dtype`, *optional*, defaults to torch.float32):
The dtype of the model inputs, can be generated in input step.
Outputs:
height (`int`):
if not set, updated to default value
width (`int`):
if not set, updated to default value
latents (`Tensor`):
The initial latents to use for the denoising process
"""
model_name = "qwenimage"
@property
@@ -167,20 +134,28 @@ class QwenImagePrepareLatentsStep(ModularPipelineBlocks):
@property
def inputs(self) -> List[InputParam]:
return [
InputParam.template("latents"),
InputParam.template("height"),
InputParam.template("width"),
InputParam.template("num_images_per_prompt"),
InputParam.template("generator"),
InputParam.template("batch_size"),
InputParam.template("dtype"),
InputParam("latents"),
InputParam(name="height"),
InputParam(name="width"),
InputParam(name="num_images_per_prompt", default=1),
InputParam(name="generator"),
InputParam(
name="batch_size",
required=True,
type_hint=int,
description="Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can be generated in input step.",
),
InputParam(
name="dtype",
required=True,
type_hint=torch.dtype,
description="The dtype of the model inputs, can be generated in input step.",
),
]
@property
def intermediate_outputs(self) -> List[OutputParam]:
return [
OutputParam(name="height", type_hint=int, description="if not set, updated to default value"),
OutputParam(name="width", type_hint=int, description="if not set, updated to default value"),
OutputParam(
name="latents",
type_hint=torch.Tensor,
@@ -234,42 +209,7 @@ class QwenImagePrepareLatentsStep(ModularPipelineBlocks):
return components, state
# auto_docstring
class QwenImageLayeredPrepareLatentsStep(ModularPipelineBlocks):
"""
Prepare initial random noise (B, layers+1, C, H, W) for the generation process
Components:
pachifier (`QwenImageLayeredPachifier`)
Inputs:
latents (`Tensor`, *optional*):
Pre-generated noisy latents for image generation.
height (`int`, *optional*):
The height in pixels of the generated image.
width (`int`, *optional*):
The width in pixels of the generated image.
layers (`int`, *optional*, defaults to 4):
Number of layers to extract from the image
num_images_per_prompt (`int`, *optional*, defaults to 1):
The number of images to generate per prompt.
generator (`Generator`, *optional*):
Torch generator for deterministic generation.
batch_size (`int`, *optional*, defaults to 1):
Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can
be generated in input step.
dtype (`dtype`, *optional*, defaults to torch.float32):
The dtype of the model inputs, can be generated in input step.
Outputs:
height (`int`):
if not set, updated to default value
width (`int`):
if not set, updated to default value
latents (`Tensor`):
The initial latents to use for the denoising process
"""
model_name = "qwenimage-layered"
@property
@@ -285,21 +225,29 @@ class QwenImageLayeredPrepareLatentsStep(ModularPipelineBlocks):
@property
def inputs(self) -> List[InputParam]:
return [
InputParam.template("latents"),
InputParam.template("height"),
InputParam.template("width"),
InputParam.template("layers"),
InputParam.template("num_images_per_prompt"),
InputParam.template("generator"),
InputParam.template("batch_size"),
InputParam.template("dtype"),
InputParam("latents"),
InputParam(name="height"),
InputParam(name="width"),
InputParam(name="layers", default=4),
InputParam(name="num_images_per_prompt", default=1),
InputParam(name="generator"),
InputParam(
name="batch_size",
required=True,
type_hint=int,
description="Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can be generated in input step.",
),
InputParam(
name="dtype",
required=True,
type_hint=torch.dtype,
description="The dtype of the model inputs, can be generated in input step.",
),
]
@property
def intermediate_outputs(self) -> List[OutputParam]:
return [
OutputParam(name="height", type_hint=int, description="if not set, updated to default value"),
OutputParam(name="width", type_hint=int, description="if not set, updated to default value"),
OutputParam(
name="latents",
type_hint=torch.Tensor,
@@ -353,31 +301,7 @@ class QwenImageLayeredPrepareLatentsStep(ModularPipelineBlocks):
return components, state
# auto_docstring
class QwenImagePrepareLatentsWithStrengthStep(ModularPipelineBlocks):
"""
Step that adds noise to image latents for image-to-image/inpainting. Should be run after set_timesteps and
prepare_latents. Both noise and image latents should already be patchified.
Components:
scheduler (`FlowMatchEulerDiscreteScheduler`)
Inputs:
latents (`Tensor`):
The initial random noise; can be generated in the prepare latents step.
image_latents (`Tensor`):
image latents used to guide the image generation. Can be generated from vae_encoder step. (Can be
generated from vae encoder and updated in input step.)
timesteps (`Tensor`):
The timesteps to use for the denoising process. Can be generated in set_timesteps step.
Outputs:
initial_noise (`Tensor`):
The initial random noise used for inpainting denoising.
latents (`Tensor`):
The scaled noisy latents to use for inpainting/image-to-image denoising.
"""
model_name = "qwenimage"
@property
@@ -399,7 +323,12 @@ class QwenImagePrepareLatentsWithStrengthStep(ModularPipelineBlocks):
type_hint=torch.Tensor,
description="The initial random noised, can be generated in prepare latent step.",
),
InputParam.template("image_latents", note="Can be generated from vae encoder and updated in input step."),
InputParam(
name="image_latents",
required=True,
type_hint=torch.Tensor,
description="The image latents to use for the denoising process. Can be generated in vae encoder and packed in input step.",
),
InputParam(
name="timesteps",
required=True,
@@ -416,11 +345,6 @@ class QwenImagePrepareLatentsWithStrengthStep(ModularPipelineBlocks):
type_hint=torch.Tensor,
description="The initial random noised used for inpainting denoising.",
),
OutputParam(
name="latents",
type_hint=torch.Tensor,
description="The scaled noisy latents to use for inpainting/image-to-image denoising.",
),
]
@staticmethod
@@ -458,29 +382,7 @@ class QwenImagePrepareLatentsWithStrengthStep(ModularPipelineBlocks):
return components, state
# auto_docstring
class QwenImageCreateMaskLatentsStep(ModularPipelineBlocks):
"""
Step that creates mask latents from preprocessed mask_image by interpolating to latent space.
Components:
pachifier (`QwenImagePachifier`)
Inputs:
processed_mask_image (`Tensor`):
The processed mask to use for the inpainting process.
height (`int`):
The height in pixels of the generated image.
width (`int`):
The width in pixels of the generated image.
dtype (`dtype`, *optional*, defaults to torch.float32):
The dtype of the model inputs, can be generated in input step.
Outputs:
mask (`Tensor`):
The mask to use for the inpainting process.
"""
model_name = "qwenimage"
@property
@@ -502,9 +404,9 @@ class QwenImageCreateMaskLatentsStep(ModularPipelineBlocks):
type_hint=torch.Tensor,
description="The processed mask to use for the inpainting process.",
),
InputParam.template("height", required=True),
InputParam.template("width", required=True),
InputParam.template("dtype"),
InputParam(name="height", required=True),
InputParam(name="width", required=True),
InputParam(name="dtype", required=True),
]
@property
@@ -548,28 +450,7 @@ class QwenImageCreateMaskLatentsStep(ModularPipelineBlocks):
# ====================
# auto_docstring
class QwenImageSetTimestepsStep(ModularPipelineBlocks):
"""
Step that sets the scheduler's timesteps for text-to-image generation. Should be run after the prepare latents
step.
Components:
scheduler (`FlowMatchEulerDiscreteScheduler`)
Inputs:
num_inference_steps (`int`, *optional*, defaults to 50):
The number of denoising steps.
sigmas (`List`, *optional*):
Custom sigmas for the denoising process.
latents (`Tensor`):
The initial random noised latents for the denoising process. Can be generated in prepare latents step.
Outputs:
timesteps (`Tensor`):
The timesteps to use for the denoising process
"""
model_name = "qwenimage"
@property
@@ -585,13 +466,13 @@ class QwenImageSetTimestepsStep(ModularPipelineBlocks):
@property
def inputs(self) -> List[InputParam]:
return [
InputParam.template("num_inference_steps"),
InputParam.template("sigmas"),
InputParam(name="num_inference_steps", default=50),
InputParam(name="sigmas"),
InputParam(
name="latents",
required=True,
type_hint=torch.Tensor,
description="The initial random noised latents for the denoising process. Can be generated in prepare latents step.",
description="The latents to use for the denoising process, used to calculate the image sequence length.",
),
]
@@ -635,27 +516,7 @@ class QwenImageSetTimestepsStep(ModularPipelineBlocks):
return components, state
# auto_docstring
class QwenImageLayeredSetTimestepsStep(ModularPipelineBlocks):
"""
Set timesteps step for QwenImage Layered with custom mu calculation based on image_latents.
Components:
scheduler (`FlowMatchEulerDiscreteScheduler`)
Inputs:
num_inference_steps (`int`, *optional*, defaults to 50):
The number of denoising steps.
sigmas (`List`, *optional*):
Custom sigmas for the denoising process.
image_latents (`Tensor`):
image latents used to guide the image generation. Can be generated from vae_encoder step.
Outputs:
timesteps (`Tensor`):
The timesteps to use for the denoising process.
"""
model_name = "qwenimage-layered"
@property
@@ -671,17 +532,15 @@ class QwenImageLayeredSetTimestepsStep(ModularPipelineBlocks):
@property
def inputs(self) -> List[InputParam]:
return [
InputParam.template("num_inference_steps"),
InputParam.template("sigmas"),
InputParam.template("image_latents"),
InputParam("num_inference_steps", default=50, type_hint=int),
InputParam("sigmas", type_hint=List[float]),
InputParam("image_latents", required=True, type_hint=torch.Tensor),
]
@property
def intermediate_outputs(self) -> List[OutputParam]:
return [
OutputParam(
name="timesteps", type_hint=torch.Tensor, description="The timesteps to use for the denoising process."
),
OutputParam(name="timesteps", type_hint=torch.Tensor),
]
@torch.no_grad()
@@ -715,32 +574,7 @@ class QwenImageLayeredSetTimestepsStep(ModularPipelineBlocks):
return components, state
# auto_docstring
class QwenImageSetTimestepsWithStrengthStep(ModularPipelineBlocks):
"""
Step that sets the scheduler's timesteps for image-to-image generation and inpainting. Should be run after the
prepare latents step.
Components:
scheduler (`FlowMatchEulerDiscreteScheduler`)
Inputs:
num_inference_steps (`int`, *optional*, defaults to 50):
The number of denoising steps.
sigmas (`List`, *optional*):
Custom sigmas for the denoising process.
latents (`Tensor`):
The latents to use for the denoising process. Can be generated in prepare latents step.
strength (`float`, *optional*, defaults to 0.9):
Strength for img2img/inpainting.
Outputs:
timesteps (`Tensor`):
The timesteps to use for the denoising process.
num_inference_steps (`int`):
The number of denoising steps to perform at inference time. Updated based on strength.
"""
model_name = "qwenimage"
@property
@@ -756,15 +590,15 @@ class QwenImageSetTimestepsWithStrengthStep(ModularPipelineBlocks):
@property
def inputs(self) -> List[InputParam]:
return [
InputParam.template("num_inference_steps"),
InputParam.template("sigmas"),
InputParam(name="num_inference_steps", default=50),
InputParam(name="sigmas"),
InputParam(
"latents",
name="latents",
required=True,
type_hint=torch.Tensor,
description="The latents to use for the denoising process. Can be generated in prepare latents step.",
description="The latents to use for the denoising process, used to calculate the image sequence length.",
),
InputParam.template("strength", default=0.9),
InputParam(name="strength", default=0.9),
]
@property
@@ -773,12 +607,7 @@ class QwenImageSetTimestepsWithStrengthStep(ModularPipelineBlocks):
OutputParam(
name="timesteps",
type_hint=torch.Tensor,
description="The timesteps to use for the denoising process.",
),
OutputParam(
name="num_inference_steps",
type_hint=int,
description="The number of denoising steps to perform at inference time. Updated based on strength.",
description="The timesteps to use for the denoising process. Can be generated in set_timesteps step.",
),
]
@@ -825,33 +654,7 @@ class QwenImageSetTimestepsWithStrengthStep(ModularPipelineBlocks):
## RoPE inputs for denoiser
# auto_docstring
class QwenImageRoPEInputsStep(ModularPipelineBlocks):
"""
Step that prepares the RoPE inputs for the denoising process. Should be placed after the prepare_latents step
Inputs:
batch_size (`int`, *optional*, defaults to 1):
Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can
be generated in input step.
height (`int`):
The height in pixels of the generated image.
width (`int`):
The width in pixels of the generated image.
prompt_embeds_mask (`Tensor`):
mask for the text embeddings. Can be generated from text_encoder step.
negative_prompt_embeds_mask (`Tensor`, *optional*):
mask for the negative text embeddings. Can be generated from text_encoder step.
Outputs:
img_shapes (`List`):
The shapes of the images latents, used for RoPE calculation
txt_seq_lens (`List`):
The sequence lengths of the prompt embeds, used for RoPE calculation
negative_txt_seq_lens (`List`):
The sequence lengths of the negative prompt embeds, used for RoPE calculation
"""
model_name = "qwenimage"
@property
@@ -863,11 +666,11 @@ class QwenImageRoPEInputsStep(ModularPipelineBlocks):
@property
def inputs(self) -> List[InputParam]:
return [
InputParam.template("batch_size"),
InputParam.template("height", required=True),
InputParam.template("width", required=True),
InputParam.template("prompt_embeds_mask"),
InputParam.template("negative_prompt_embeds_mask"),
InputParam(name="batch_size", required=True),
InputParam(name="height", required=True),
InputParam(name="width", required=True),
InputParam(name="prompt_embeds_mask"),
InputParam(name="negative_prompt_embeds_mask"),
]
@property
@@ -899,38 +702,7 @@ class QwenImageRoPEInputsStep(ModularPipelineBlocks):
return components, state
# auto_docstring
class QwenImageEditRoPEInputsStep(ModularPipelineBlocks):
"""
Step that prepares the RoPE inputs for the denoising process. This is used in QwenImage Edit. Should be placed
after the prepare_latents step
Inputs:
batch_size (`int`, *optional*, defaults to 1):
Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can
be generated in input step.
image_height (`int`):
The height of the reference image. Can be generated in input step.
image_width (`int`):
The width of the reference image. Can be generated in input step.
height (`int`):
The height in pixels of the generated image.
width (`int`):
The width in pixels of the generated image.
prompt_embeds_mask (`Tensor`):
mask for the text embeddings. Can be generated from text_encoder step.
negative_prompt_embeds_mask (`Tensor`, *optional*):
mask for the negative text embeddings. Can be generated from text_encoder step.
Outputs:
img_shapes (`List`):
The shapes of the images latents, used for RoPE calculation
txt_seq_lens (`List`):
The sequence lengths of the prompt embeds, used for RoPE calculation
negative_txt_seq_lens (`List`):
The sequence lengths of the negative prompt embeds, used for RoPE calculation
"""
model_name = "qwenimage"
@property
@@ -940,23 +712,13 @@ class QwenImageEditRoPEInputsStep(ModularPipelineBlocks):
@property
def inputs(self) -> List[InputParam]:
return [
InputParam.template("batch_size"),
InputParam(
name="image_height",
required=True,
type_hint=int,
description="The height of the reference image. Can be generated in input step.",
),
InputParam(
name="image_width",
required=True,
type_hint=int,
description="The width of the reference image. Can be generated in input step.",
),
InputParam.template("height", required=True),
InputParam.template("width", required=True),
InputParam.template("prompt_embeds_mask"),
InputParam.template("negative_prompt_embeds_mask"),
InputParam(name="batch_size", required=True),
InputParam(name="image_height", required=True),
InputParam(name="image_width", required=True),
InputParam(name="height", required=True),
InputParam(name="width", required=True),
InputParam(name="prompt_embeds_mask"),
InputParam(name="negative_prompt_embeds_mask"),
]
@property
@@ -994,39 +756,7 @@ class QwenImageEditRoPEInputsStep(ModularPipelineBlocks):
return components, state
# auto_docstring
class QwenImageEditPlusRoPEInputsStep(ModularPipelineBlocks):
"""
Step that prepares the RoPE inputs for the denoising process. This is used in QwenImage Edit Plus.
Unlike Edit, Edit Plus handles lists of image_height/image_width for multiple reference images. Should be placed
after the prepare_latents step.
Inputs:
batch_size (`int`, *optional*, defaults to 1):
Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can
be generated in input step.
image_height (`List`):
The heights of the reference images. Can be generated in input step.
image_width (`List`):
The widths of the reference images. Can be generated in input step.
height (`int`):
The height in pixels of the generated image.
width (`int`):
The width in pixels of the generated image.
prompt_embeds_mask (`Tensor`):
mask for the text embeddings. Can be generated from text_encoder step.
negative_prompt_embeds_mask (`Tensor`, *optional*):
mask for the negative text embeddings. Can be generated from text_encoder step.
Outputs:
img_shapes (`List`):
The shapes of the image latents, used for RoPE calculation
txt_seq_lens (`List`):
The sequence lengths of the prompt embeds, used for RoPE calculation
negative_txt_seq_lens (`List`):
The sequence lengths of the negative prompt embeds, used for RoPE calculation
"""
model_name = "qwenimage-edit-plus"
@property
@@ -1040,23 +770,13 @@ class QwenImageEditPlusRoPEInputsStep(ModularPipelineBlocks):
@property
def inputs(self) -> List[InputParam]:
return [
InputParam.template("batch_size"),
InputParam(
name="image_height",
required=True,
type_hint=List[int],
description="The heights of the reference images. Can be generated in input step.",
),
InputParam(
name="image_width",
required=True,
type_hint=List[int],
description="The widths of the reference images. Can be generated in input step.",
),
InputParam.template("height", required=True),
InputParam.template("width", required=True),
InputParam.template("prompt_embeds_mask"),
InputParam.template("negative_prompt_embeds_mask"),
InputParam(name="batch_size", required=True),
InputParam(name="image_height", required=True, type_hint=List[int]),
InputParam(name="image_width", required=True, type_hint=List[int]),
InputParam(name="height", required=True),
InputParam(name="width", required=True),
InputParam(name="prompt_embeds_mask"),
InputParam(name="negative_prompt_embeds_mask"),
]
@property
@@ -1112,37 +832,7 @@ class QwenImageEditPlusRoPEInputsStep(ModularPipelineBlocks):
return components, state
# auto_docstring
class QwenImageLayeredRoPEInputsStep(ModularPipelineBlocks):
"""
Step that prepares the RoPE inputs for the denoising process. Should be placed after the prepare_latents step
Inputs:
batch_size (`int`, *optional*, defaults to 1):
Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can
be generated in input step.
layers (`int`, *optional*, defaults to 4):
Number of layers to extract from the image
height (`int`):
The height in pixels of the generated image.
width (`int`):
The width in pixels of the generated image.
prompt_embeds_mask (`Tensor`):
mask for the text embeddings. Can be generated from text_encoder step.
negative_prompt_embeds_mask (`Tensor`, *optional*):
mask for the negative text embeddings. Can be generated from text_encoder step.
Outputs:
img_shapes (`List`):
The shapes of the image latents, used for RoPE calculation
txt_seq_lens (`List`):
The sequence lengths of the prompt embeds, used for RoPE calculation
negative_txt_seq_lens (`List`):
The sequence lengths of the negative prompt embeds, used for RoPE calculation
additional_t_cond (`Tensor`):
The additional t cond, used for RoPE calculation
"""
model_name = "qwenimage-layered"
@property
@@ -1154,12 +844,12 @@ class QwenImageLayeredRoPEInputsStep(ModularPipelineBlocks):
@property
def inputs(self) -> List[InputParam]:
return [
InputParam.template("batch_size"),
InputParam.template("layers"),
InputParam.template("height", required=True),
InputParam.template("width", required=True),
InputParam.template("prompt_embeds_mask"),
InputParam.template("negative_prompt_embeds_mask"),
InputParam(name="batch_size", required=True),
InputParam(name="layers", required=True),
InputParam(name="height", required=True),
InputParam(name="width", required=True),
InputParam(name="prompt_embeds_mask"),
InputParam(name="negative_prompt_embeds_mask"),
]
@property
@@ -1224,34 +914,7 @@ class QwenImageLayeredRoPEInputsStep(ModularPipelineBlocks):
## ControlNet inputs for denoiser
# auto_docstring
class QwenImageControlNetBeforeDenoiserStep(ModularPipelineBlocks):
"""
Step that prepares inputs for ControlNet. Insert before the denoise step, after the set_timesteps step.
Components:
controlnet (`QwenImageControlNetModel`)
Inputs:
control_guidance_start (`float`, *optional*, defaults to 0.0):
When to start applying ControlNet.
control_guidance_end (`float`, *optional*, defaults to 1.0):
When to stop applying ControlNet.
controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0):
Scale for ControlNet conditioning.
control_image_latents (`Tensor`):
The control image latents to use for the denoising process. Can be generated in controlnet vae encoder
step.
timesteps (`Tensor`):
The timesteps to use for the denoising process. Can be generated in set_timesteps step.
Outputs:
controlnet_keep (`List`):
The controlnet keep values
"""
model_name = "qwenimage"
@property
@@ -1267,17 +930,12 @@ class QwenImageControlNetBeforeDenoiserStep(ModularPipelineBlocks):
@property
def inputs(self) -> List[InputParam]:
return [
InputParam.template("control_guidance_start"),
InputParam.template("control_guidance_end"),
InputParam.template("controlnet_conditioning_scale"),
InputParam("control_guidance_start", default=0.0),
InputParam("control_guidance_end", default=1.0),
InputParam("controlnet_conditioning_scale", default=1.0),
InputParam("control_image_latents", required=True),
InputParam(
name="control_image_latents",
required=True,
type_hint=torch.Tensor,
description="The control image latents to use for the denoising process. Can be generated in controlnet vae encoder step.",
),
InputParam(
name="timesteps",
"timesteps",
required=True,
type_hint=torch.Tensor,
description="The timesteps to use for the denoising process. Can be generated in set_timesteps step.",

View File

@@ -12,8 +12,10 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Any, Dict, List
from typing import List, Union
import numpy as np
import PIL
import torch
from ...configuration_utils import FrozenDict
@@ -29,30 +31,7 @@ logger = logging.get_logger(__name__)
# after denoising loop (unpack latents)
# auto_docstring
class QwenImageAfterDenoiseStep(ModularPipelineBlocks):
"""
Step that unpacks the latents from a 3D tensor (batch_size, sequence_length, channels) into a 5D tensor
(batch_size, channels, 1, height, width)
Components:
pachifier (`QwenImagePachifier`)
Inputs:
height (`int`):
The height in pixels of the generated image.
width (`int`):
The width in pixels of the generated image.
latents (`Tensor`):
The latents to decode, can be generated in the denoise step.
Outputs:
latents (`Tensor`):
The denoised latents unpacked to B, C, 1, H, W
"""
model_name = "qwenimage"
@property
@@ -70,21 +49,13 @@ class QwenImageAfterDenoiseStep(ModularPipelineBlocks):
@property
def inputs(self) -> List[InputParam]:
return [
InputParam.template("height", required=True),
InputParam.template("width", required=True),
InputParam(name="height", required=True),
InputParam(name="width", required=True),
InputParam(
name="latents",
required=True,
type_hint=torch.Tensor,
description="The latents to decode, can be generated in the denoise step.",
),
]
@property
def intermediate_outputs(self) -> List[OutputParam]:
return [
OutputParam(
name="latents", type_hint=torch.Tensor, description="The denoisedlatents unpacked to B, C, 1, H, W"
description="The latents to decode, can be generated in the denoise step",
),
]
@@ -101,29 +72,7 @@ class QwenImageAfterDenoiseStep(ModularPipelineBlocks):
return components, state
# auto_docstring
class QwenImageLayeredAfterDenoiseStep(ModularPipelineBlocks):
"""
Unpack latents from (B, seq, C*4) to (B, C, layers+1, H, W) after denoising.
Components:
pachifier (`QwenImageLayeredPachifier`)
Inputs:
latents (`Tensor`):
The denoised latents to decode, can be generated in the denoise step.
height (`int`):
The height in pixels of the generated image.
width (`int`):
The width in pixels of the generated image.
layers (`int`, *optional*, defaults to 4):
Number of layers to extract from the image
Outputs:
latents (`Tensor`):
Denoised latents. (unpacked to B, C, layers+1, H, W)
"""
model_name = "qwenimage-layered"
@property
@@ -139,21 +88,10 @@ class QwenImageLayeredAfterDenoiseStep(ModularPipelineBlocks):
@property
def inputs(self) -> List[InputParam]:
return [
InputParam(
name="latents",
required=True,
type_hint=torch.Tensor,
description="The denoised latents to decode, can be generated in the denoise step.",
),
InputParam.template("height", required=True),
InputParam.template("width", required=True),
InputParam.template("layers"),
]
@property
def intermediate_outputs(self) -> List[OutputParam]:
return [
OutputParam.template("latents", note="unpacked to B, C, layers+1, H, W"),
InputParam("latents", required=True, type_hint=torch.Tensor),
InputParam("height", required=True, type_hint=int),
InputParam("width", required=True, type_hint=int),
InputParam("layers", required=True, type_hint=int),
]
@torch.no_grad()
@@ -174,26 +112,7 @@ class QwenImageLayeredAfterDenoiseStep(ModularPipelineBlocks):
# decode step
# auto_docstring
class QwenImageDecoderStep(ModularPipelineBlocks):
"""
Step that decodes the latents to images
Components:
vae (`AutoencoderKLQwenImage`)
Inputs:
latents (`Tensor`):
The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise
step.
Outputs:
images (`List`):
Generated images. (tensor output of the vae decoder.)
"""
model_name = "qwenimage"
@property
@@ -215,13 +134,19 @@ class QwenImageDecoderStep(ModularPipelineBlocks):
name="latents",
required=True,
type_hint=torch.Tensor,
description="The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise step.",
description="The latents to decode, can be generated in the denoise step",
),
]
@property
def intermediate_outputs(self) -> List[OutputParam]:
return [OutputParam.template("images", note="tensor output of the vae decoder.")]
def intermediate_outputs(self) -> List[str]:
return [
OutputParam(
"images",
type_hint=Union[List[PIL.Image.Image], List[torch.Tensor], List[np.array]],
description="The generated images, can be a PIL.Image.Image, torch.Tensor or a numpy array",
)
]
@torch.no_grad()
def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState:
@@ -251,26 +176,7 @@ class QwenImageDecoderStep(ModularPipelineBlocks):
return components, state
# auto_docstring
class QwenImageLayeredDecoderStep(ModularPipelineBlocks):
"""
Decode unpacked latents (B, C, layers+1, H, W) into layer images.
Components:
vae (`AutoencoderKLQwenImage`) image_processor (`VaeImageProcessor`)
Inputs:
latents (`Tensor`):
The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise
step.
output_type (`str`, *optional*, defaults to pil):
Output format: 'pil', 'np', 'pt'.
Outputs:
images (`List`):
Generated images.
"""
model_name = "qwenimage-layered"
@property
@@ -292,19 +198,14 @@ class QwenImageLayeredDecoderStep(ModularPipelineBlocks):
@property
def inputs(self) -> List[InputParam]:
return [
InputParam(
name="latents",
required=True,
type_hint=torch.Tensor,
description="The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise step.",
),
InputParam.template("output_type"),
InputParam("latents", required=True, type_hint=torch.Tensor),
InputParam("output_type", default="pil", type_hint=str),
]
@property
def intermediate_outputs(self) -> List[OutputParam]:
return [
OutputParam.template("images"),
OutputParam(name="images", type_hint=List[List[PIL.Image.Image]]),
]
@torch.no_grad()
@@ -350,27 +251,7 @@ class QwenImageLayeredDecoderStep(ModularPipelineBlocks):
# postprocess the decoded images
# auto_docstring
class QwenImageProcessImagesOutputStep(ModularPipelineBlocks):
"""
Postprocess the generated image
Components:
image_processor (`VaeImageProcessor`)
Inputs:
images (`Tensor`):
the generated image tensor from decoders step
output_type (`str`, *optional*, defaults to pil):
Output format: 'pil', 'np', 'pt'.
Outputs:
images (`List`):
Generated images.
"""
model_name = "qwenimage"
@property
@@ -391,19 +272,15 @@ class QwenImageProcessImagesOutputStep(ModularPipelineBlocks):
@property
def inputs(self) -> List[InputParam]:
return [
InputParam("images", required=True, description="the generated image from decoders step"),
InputParam(
name="images",
required=True,
type_hint=torch.Tensor,
description="the generated image tensor from decoders step",
name="output_type",
default="pil",
type_hint=str,
description="The type of the output images, can be 'pil', 'np', 'pt'",
),
InputParam.template("output_type"),
]
@property
def intermediate_outputs(self) -> List[OutputParam]:
return [OutputParam.template("images")]
@staticmethod
def check_inputs(output_type):
if output_type not in ["pil", "np", "pt"]:
@@ -424,28 +301,7 @@ class QwenImageProcessImagesOutputStep(ModularPipelineBlocks):
return components, state
# auto_docstring
class QwenImageInpaintProcessImagesOutputStep(ModularPipelineBlocks):
"""
Postprocess the generated image; optionally apply the mask overlay to the original image.
Components:
image_mask_processor (`InpaintProcessor`)
Inputs:
images (`Tensor`):
the generated image tensor from decoders step
output_type (`str`, *optional*, defaults to pil):
Output format: 'pil', 'np', 'pt'.
mask_overlay_kwargs (`Dict`, *optional*):
The kwargs for the postprocess step to apply the mask overlay. generated in
InpaintProcessImagesInputStep.
Outputs:
images (`List`):
Generated images.
"""
model_name = "qwenimage"
@property
@@ -466,24 +322,16 @@ class QwenImageInpaintProcessImagesOutputStep(ModularPipelineBlocks):
@property
def inputs(self) -> List[InputParam]:
return [
InputParam("images", required=True, description="the generated image from decoders step"),
InputParam(
name="images",
required=True,
type_hint=torch.Tensor,
description="the generated image tensor from decoders step",
),
InputParam.template("output_type"),
InputParam(
name="mask_overlay_kwargs",
type_hint=Dict[str, Any],
description="The kwargs for the postprocess step to apply the mask overlay. generated in InpaintProcessImagesInputStep.",
name="output_type",
default="pil",
type_hint=str,
description="The type of the output images, can be 'pil', 'np', 'pt'",
),
InputParam("mask_overlay_kwargs"),
]
@property
def intermediate_outputs(self) -> List[OutputParam]:
return [OutputParam.template("images")]
@staticmethod
def check_inputs(output_type, mask_overlay_kwargs):
if output_type not in ["pil", "np", "pt"]:

View File

@@ -50,7 +50,7 @@ class QwenImageLoopBeforeDenoiser(ModularPipelineBlocks):
def inputs(self) -> List[InputParam]:
return [
InputParam(
name="latents",
"latents",
required=True,
type_hint=torch.Tensor,
description="The initial latents to use for the denoising process. Can be generated in prepare_latent step.",
@@ -80,12 +80,17 @@ class QwenImageEditLoopBeforeDenoiser(ModularPipelineBlocks):
def inputs(self) -> List[InputParam]:
return [
InputParam(
name="latents",
"latents",
required=True,
type_hint=torch.Tensor,
description="The initial latents to use for the denoising process. Can be generated in prepare_latent step.",
),
InputParam.template("image_latents"),
InputParam(
"image_latents",
required=True,
type_hint=torch.Tensor,
description="The initial image latents to use for the denoising process. Can be encoded in vae_encoder step and packed in prepare_image_latents step.",
),
]
@torch.no_grad()
@@ -129,12 +134,29 @@ class QwenImageLoopBeforeDenoiserControlNet(ModularPipelineBlocks):
type_hint=torch.Tensor,
description="The control image to use for the denoising process. Can be generated in prepare_controlnet_inputs step.",
),
InputParam.template("controlnet_conditioning_scale", note="updated in prepare_controlnet_inputs step."),
InputParam(
name="controlnet_keep",
"controlnet_conditioning_scale",
type_hint=float,
description="The controlnet conditioning scale value to use for the denoising process. Can be generated in prepare_controlnet_inputs step.",
),
InputParam(
"controlnet_keep",
required=True,
type_hint=List[float],
description="The controlnet keep values. Can be generated in prepare_controlnet_inputs step.",
description="The controlnet keep values to use for the denoising process. Can be generated in prepare_controlnet_inputs step.",
),
InputParam(
"num_inference_steps",
required=True,
type_hint=int,
description="The number of inference steps to use for the denoising process. Can be generated in set_timesteps step.",
),
InputParam(
kwargs_type="denoiser_input_fields",
description=(
"All conditional model inputs for the denoiser. "
"It should contain prompt_embeds/negative_prompt_embeds."
),
),
]
@@ -195,13 +217,28 @@ class QwenImageLoopDenoiser(ModularPipelineBlocks):
@property
def inputs(self) -> List[InputParam]:
return [
InputParam.template("attention_kwargs"),
InputParam.template("denoiser_input_fields"),
InputParam("attention_kwargs"),
InputParam(
"latents",
required=True,
type_hint=torch.Tensor,
description="The latents to use for the denoising process. Can be generated in prepare_latents step.",
),
InputParam(
"num_inference_steps",
required=True,
type_hint=int,
description="The number of inference steps to use for the denoising process. Can be generated in set_timesteps step.",
),
InputParam(
kwargs_type="denoiser_input_fields",
description="conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.",
),
InputParam(
"img_shapes",
required=True,
type_hint=List[Tuple[int, int]],
description="The shape of the image latents for RoPE calculation. can be generated in prepare_additional_inputs step.",
description="The shape of the image latents for RoPE calculation. Can be generated in prepare_additional_inputs step.",
),
]
@@ -280,8 +317,23 @@ class QwenImageEditLoopDenoiser(ModularPipelineBlocks):
@property
def inputs(self) -> List[InputParam]:
return [
InputParam.template("attention_kwargs"),
InputParam.template("denoiser_input_fields"),
InputParam("attention_kwargs"),
InputParam(
"latents",
required=True,
type_hint=torch.Tensor,
description="The latents to use for the denoising process. Can be generated in prepare_latents step.",
),
InputParam(
"num_inference_steps",
required=True,
type_hint=int,
description="The number of inference steps to use for the denoising process. Can be generated in set_timesteps step.",
),
InputParam(
kwargs_type="denoiser_input_fields",
description="conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.",
),
InputParam(
"img_shapes",
required=True,
@@ -363,7 +415,7 @@ class QwenImageLoopAfterDenoiser(ModularPipelineBlocks):
@property
def intermediate_outputs(self) -> List[OutputParam]:
return [
OutputParam.template("latents"),
OutputParam("latents", type_hint=torch.Tensor, description="The denoised latents."),
]
@torch.no_grad()
@@ -404,19 +456,24 @@ class QwenImageLoopAfterDenoiserInpaint(ModularPipelineBlocks):
type_hint=torch.Tensor,
description="The mask to use for the inpainting process. Can be generated in inpaint prepare latents step.",
),
InputParam.template("image_latents"),
InputParam(
"image_latents",
required=True,
type_hint=torch.Tensor,
description="The image latents to use for the inpainting process. Can be generated in inpaint prepare latents step.",
),
InputParam(
"initial_noise",
required=True,
type_hint=torch.Tensor,
description="The initial noise to use for the inpainting process. Can be generated in inpaint prepare latents step.",
),
]
@property
def intermediate_outputs(self) -> List[OutputParam]:
return [
OutputParam.template("latents"),
InputParam(
"timesteps",
required=True,
type_hint=torch.Tensor,
description="The timesteps to use for the denoising process. Can be generated in set_timesteps step.",
),
]
@torch.no_grad()
@@ -458,12 +515,17 @@ class QwenImageDenoiseLoopWrapper(LoopSequentialPipelineBlocks):
def loop_inputs(self) -> List[InputParam]:
return [
InputParam(
name="timesteps",
"timesteps",
required=True,
type_hint=torch.Tensor,
description="The timesteps to use for the denoising process. Can be generated in set_timesteps step.",
),
InputParam.template("num_inference_steps", required=True),
InputParam(
"num_inference_steps",
required=True,
type_hint=int,
description="The number of inference steps to use for the denoising process. Can be generated in set_timesteps step.",
),
]
@torch.no_grad()
@@ -495,42 +557,7 @@ class QwenImageDenoiseLoopWrapper(LoopSequentialPipelineBlocks):
# Qwen Image (text2image, image2image)
# auto_docstring
class QwenImageDenoiseStep(QwenImageDenoiseLoopWrapper):
"""
Denoise step that iteratively denoises the latents.
Its loop logic is defined in the `QwenImageDenoiseLoopWrapper.__call__` method. At each iteration, it runs blocks
defined in `sub_blocks` sequentially:
- `QwenImageLoopBeforeDenoiser`
- `QwenImageLoopDenoiser`
- `QwenImageLoopAfterDenoiser`
This block supports text2image and image2image tasks for QwenImage.
Components:
guider (`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`) scheduler
(`FlowMatchEulerDiscreteScheduler`)
Inputs:
timesteps (`Tensor`):
The timesteps to use for the denoising process. Can be generated in set_timesteps step.
num_inference_steps (`int`):
The number of denoising steps.
latents (`Tensor`):
The initial latents to use for the denoising process. Can be generated in prepare_latent step.
attention_kwargs (`Dict`, *optional*):
Additional kwargs for attention processors.
**denoiser_input_fields (`None`, *optional*):
conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
img_shapes (`List`):
The shape of the image latents for RoPE calculation. can be generated in prepare_additional_inputs step.
Outputs:
latents (`Tensor`):
Denoised latents.
"""
model_name = "qwenimage"
block_classes = [
@@ -543,8 +570,8 @@ class QwenImageDenoiseStep(QwenImageDenoiseLoopWrapper):
@property
def description(self) -> str:
return (
"Denoise step that iteratively denoise the latents.\n"
"Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method\n"
"Denoise step that iteratively denoise the latents. \n"
"Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method \n"
"At each iteration, it runs blocks defined in `sub_blocks` sequencially:\n"
" - `QwenImageLoopBeforeDenoiser`\n"
" - `QwenImageLoopDenoiser`\n"
@@ -554,47 +581,7 @@ class QwenImageDenoiseStep(QwenImageDenoiseLoopWrapper):
# Qwen Image (inpainting)
# auto_docstring
class QwenImageInpaintDenoiseStep(QwenImageDenoiseLoopWrapper):
"""
Denoise step that iteratively denoises the latents.
Its loop logic is defined in the `QwenImageDenoiseLoopWrapper.__call__` method. At each iteration, it runs blocks
defined in `sub_blocks` sequentially:
- `QwenImageLoopBeforeDenoiser`
- `QwenImageLoopDenoiser`
- `QwenImageLoopAfterDenoiser`
- `QwenImageLoopAfterDenoiserInpaint`
This block supports inpainting tasks for QwenImage.
Components:
guider (`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`) scheduler
(`FlowMatchEulerDiscreteScheduler`)
Inputs:
timesteps (`Tensor`):
The timesteps to use for the denoising process. Can be generated in set_timesteps step.
num_inference_steps (`int`):
The number of denoising steps.
latents (`Tensor`):
The initial latents to use for the denoising process. Can be generated in prepare_latent step.
attention_kwargs (`Dict`, *optional*):
Additional kwargs for attention processors.
**denoiser_input_fields (`None`, *optional*):
conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
img_shapes (`List`):
The shape of the image latents for RoPE calculation. can be generated in prepare_additional_inputs step.
mask (`Tensor`):
The mask to use for the inpainting process. Can be generated in inpaint prepare latents step.
image_latents (`Tensor`):
image latents used to guide the image generation. Can be generated from vae_encoder step.
initial_noise (`Tensor`):
The initial noise to use for the inpainting process. Can be generated in inpaint prepare latents step.
Outputs:
latents (`Tensor`):
Denoised latents.
"""
model_name = "qwenimage"
block_classes = [
QwenImageLoopBeforeDenoiser,
@@ -619,47 +606,7 @@ class QwenImageInpaintDenoiseStep(QwenImageDenoiseLoopWrapper):
# Qwen Image (text2image, image2image) with controlnet
# auto_docstring
class QwenImageControlNetDenoiseStep(QwenImageDenoiseLoopWrapper):
"""
Denoise step that iteratively denoises the latents.
Its loop logic is defined in the `QwenImageDenoiseLoopWrapper.__call__` method. At each iteration, it runs blocks
defined in `sub_blocks` sequentially:
- `QwenImageLoopBeforeDenoiser`
- `QwenImageLoopBeforeDenoiserControlNet`
- `QwenImageLoopDenoiser`
- `QwenImageLoopAfterDenoiser`
This block supports text2img/img2img tasks with controlnet for QwenImage.
Components:
guider (`ClassifierFreeGuidance`) controlnet (`QwenImageControlNetModel`) transformer
(`QwenImageTransformer2DModel`) scheduler (`FlowMatchEulerDiscreteScheduler`)
Inputs:
timesteps (`Tensor`):
The timesteps to use for the denoising process. Can be generated in set_timesteps step.
num_inference_steps (`int`):
The number of denoising steps.
latents (`Tensor`):
The initial latents to use for the denoising process. Can be generated in prepare_latent step.
control_image_latents (`Tensor`):
The control image to use for the denoising process. Can be generated in prepare_controlnet_inputs step.
controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0):
Scale for ControlNet conditioning. (updated in prepare_controlnet_inputs step.)
controlnet_keep (`List`):
The controlnet keep values. Can be generated in prepare_controlnet_inputs step.
attention_kwargs (`Dict`, *optional*):
Additional kwargs for attention processors.
**denoiser_input_fields (`None`, *optional*):
conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
img_shapes (`List`):
The shape of the image latents for RoPE calculation. can be generated in prepare_additional_inputs step.
Outputs:
latents (`Tensor`):
Denoised latents.
"""
model_name = "qwenimage"
block_classes = [
QwenImageLoopBeforeDenoiser,
@@ -684,54 +631,7 @@ class QwenImageControlNetDenoiseStep(QwenImageDenoiseLoopWrapper):
# Qwen Image (inpainting) with controlnet
# auto_docstring
class QwenImageInpaintControlNetDenoiseStep(QwenImageDenoiseLoopWrapper):
"""
Denoise step that iteratively denoises the latents.
Its loop logic is defined in the `QwenImageDenoiseLoopWrapper.__call__` method. At each iteration, it runs blocks
defined in `sub_blocks` sequentially:
- `QwenImageLoopBeforeDenoiser`
- `QwenImageLoopBeforeDenoiserControlNet`
- `QwenImageLoopDenoiser`
- `QwenImageLoopAfterDenoiser`
- `QwenImageLoopAfterDenoiserInpaint`
This block supports inpainting tasks with controlnet for QwenImage.
Components:
guider (`ClassifierFreeGuidance`) controlnet (`QwenImageControlNetModel`) transformer
(`QwenImageTransformer2DModel`) scheduler (`FlowMatchEulerDiscreteScheduler`)
Inputs:
timesteps (`Tensor`):
The timesteps to use for the denoising process. Can be generated in set_timesteps step.
num_inference_steps (`int`):
The number of denoising steps.
latents (`Tensor`):
The initial latents to use for the denoising process. Can be generated in prepare_latent step.
control_image_latents (`Tensor`):
The control image to use for the denoising process. Can be generated in prepare_controlnet_inputs step.
controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0):
Scale for ControlNet conditioning. (updated in prepare_controlnet_inputs step.)
controlnet_keep (`List`):
The controlnet keep values. Can be generated in prepare_controlnet_inputs step.
attention_kwargs (`Dict`, *optional*):
Additional kwargs for attention processors.
**denoiser_input_fields (`None`, *optional*):
conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
img_shapes (`List`):
The shape of the image latents for RoPE calculation. Can be generated in the prepare_additional_inputs step.
mask (`Tensor`):
The mask to use for the inpainting process. Can be generated in inpaint prepare latents step.
image_latents (`Tensor`):
image latents used to guide the image generation. Can be generated from vae_encoder step.
initial_noise (`Tensor`):
The initial noise to use for the inpainting process. Can be generated in inpaint prepare latents step.
Outputs:
latents (`Tensor`):
Denoised latents.
"""
model_name = "qwenimage"
block_classes = [
QwenImageLoopBeforeDenoiser,
@@ -764,42 +664,7 @@ class QwenImageInpaintControlNetDenoiseStep(QwenImageDenoiseLoopWrapper):
# Qwen Image Edit (image2image)
# auto_docstring
class QwenImageEditDenoiseStep(QwenImageDenoiseLoopWrapper):
"""
Denoise step that iteratively denoises the latents.
Its loop logic is defined in the `QwenImageDenoiseLoopWrapper.__call__` method. At each iteration, it runs the blocks
defined in `sub_blocks` sequentially:
- `QwenImageEditLoopBeforeDenoiser`
- `QwenImageEditLoopDenoiser`
- `QwenImageLoopAfterDenoiser`
This block supports QwenImage Edit.
Components:
guider (`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`) scheduler
(`FlowMatchEulerDiscreteScheduler`)
Inputs:
timesteps (`Tensor`):
The timesteps to use for the denoising process. Can be generated in set_timesteps step.
num_inference_steps (`int`):
The number of denoising steps.
latents (`Tensor`):
The initial latents to use for the denoising process. Can be generated in prepare_latent step.
image_latents (`Tensor`):
image latents used to guide the image generation. Can be generated from vae_encoder step.
attention_kwargs (`Dict`, *optional*):
Additional kwargs for attention processors.
**denoiser_input_fields (`None`, *optional*):
conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
img_shapes (`List`):
The shape of the image latents for RoPE calculation. Can be generated in prepare_additional_inputs step.
Outputs:
latents (`Tensor`):
Denoised latents.
"""
model_name = "qwenimage-edit"
block_classes = [
QwenImageEditLoopBeforeDenoiser,
@@ -822,47 +687,7 @@ class QwenImageEditDenoiseStep(QwenImageDenoiseLoopWrapper):
# Qwen Image Edit (inpainting)
# auto_docstring
class QwenImageEditInpaintDenoiseStep(QwenImageDenoiseLoopWrapper):
"""
Denoise step that iteratively denoises the latents.
Its loop logic is defined in the `QwenImageDenoiseLoopWrapper.__call__` method. At each iteration, it runs the blocks
defined in `sub_blocks` sequentially:
- `QwenImageEditLoopBeforeDenoiser`
- `QwenImageEditLoopDenoiser`
- `QwenImageLoopAfterDenoiser`
- `QwenImageLoopAfterDenoiserInpaint`
This block supports inpainting tasks for QwenImage Edit.
Components:
guider (`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`) scheduler
(`FlowMatchEulerDiscreteScheduler`)
Inputs:
timesteps (`Tensor`):
The timesteps to use for the denoising process. Can be generated in set_timesteps step.
num_inference_steps (`int`):
The number of denoising steps.
latents (`Tensor`):
The initial latents to use for the denoising process. Can be generated in prepare_latent step.
image_latents (`Tensor`):
image latents used to guide the image generation. Can be generated from vae_encoder step.
attention_kwargs (`Dict`, *optional*):
Additional kwargs for attention processors.
**denoiser_input_fields (`None`, *optional*):
conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
img_shapes (`List`):
The shape of the image latents for RoPE calculation. Can be generated in prepare_additional_inputs step.
mask (`Tensor`):
The mask to use for the inpainting process. Can be generated in inpaint prepare latents step.
initial_noise (`Tensor`):
The initial noise to use for the inpainting process. Can be generated in inpaint prepare latents step.
Outputs:
latents (`Tensor`):
Denoised latents.
"""
model_name = "qwenimage-edit"
block_classes = [
QwenImageEditLoopBeforeDenoiser,
@@ -887,42 +712,7 @@ class QwenImageEditInpaintDenoiseStep(QwenImageDenoiseLoopWrapper):
# Qwen Image Layered (image2image)
# auto_docstring
class QwenImageLayeredDenoiseStep(QwenImageDenoiseLoopWrapper):
"""
Denoise step that iteratively denoises the latents.
Its loop logic is defined in the `QwenImageDenoiseLoopWrapper.__call__` method. At each iteration, it runs the blocks
defined in `sub_blocks` sequentially:
- `QwenImageEditLoopBeforeDenoiser`
- `QwenImageEditLoopDenoiser`
- `QwenImageLoopAfterDenoiser`
This block supports QwenImage Layered.
Components:
guider (`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`) scheduler
(`FlowMatchEulerDiscreteScheduler`)
Inputs:
timesteps (`Tensor`):
The timesteps to use for the denoising process. Can be generated in set_timesteps step.
num_inference_steps (`int`):
The number of denoising steps.
latents (`Tensor`):
The initial latents to use for the denoising process. Can be generated in prepare_latent step.
image_latents (`Tensor`):
image latents used to guide the image generation. Can be generated from vae_encoder step.
attention_kwargs (`Dict`, *optional*):
Additional kwargs for attention processors.
**denoiser_input_fields (`None`, *optional*):
conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
img_shapes (`List`):
The shape of the image latents for RoPE calculation. Can be generated in prepare_additional_inputs step.
Outputs:
latents (`Tensor`):
Denoised latents.
"""
model_name = "qwenimage-layered"
block_classes = [
QwenImageEditLoopBeforeDenoiser,

File diff suppressed because it is too large

View File

@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List, Optional, Tuple
from typing import List, Tuple
import torch
@@ -109,44 +109,7 @@ def calculate_dimension_from_latents(latents: torch.Tensor, vae_scale_factor: in
return height, width
# auto_docstring
class QwenImageTextInputsStep(ModularPipelineBlocks):
"""
Text input processing step that standardizes text embeddings for the pipeline.
This step:
1. Determines `batch_size` and `dtype` based on `prompt_embeds`
2. Ensures all text embeddings have consistent batch sizes (batch_size * num_images_per_prompt)
This block should be placed after all encoder steps to process the text embeddings before they are used in
subsequent pipeline steps.
Inputs:
num_images_per_prompt (`int`, *optional*, defaults to 1):
The number of images to generate per prompt.
prompt_embeds (`Tensor`):
text embeddings used to guide the image generation. Can be generated from text_encoder step.
prompt_embeds_mask (`Tensor`):
mask for the text embeddings. Can be generated from text_encoder step.
negative_prompt_embeds (`Tensor`, *optional*):
negative text embeddings used to guide the image generation. Can be generated from text_encoder step.
negative_prompt_embeds_mask (`Tensor`, *optional*):
mask for the negative text embeddings. Can be generated from text_encoder step.
Outputs:
batch_size (`int`):
The batch size of the prompt embeddings
dtype (`dtype`):
The data type of the prompt embeddings
prompt_embeds (`Tensor`):
The prompt embeddings. (batch-expanded)
prompt_embeds_mask (`Tensor`):
The encoder attention mask. (batch-expanded)
negative_prompt_embeds (`Tensor`):
The negative prompt embeddings. (batch-expanded)
negative_prompt_embeds_mask (`Tensor`):
The negative prompt embeddings mask. (batch-expanded)
"""
model_name = "qwenimage"
@property
@@ -166,22 +129,26 @@ class QwenImageTextInputsStep(ModularPipelineBlocks):
@property
def inputs(self) -> List[InputParam]:
return [
InputParam.template("num_images_per_prompt"),
InputParam.template("prompt_embeds"),
InputParam.template("prompt_embeds_mask"),
InputParam.template("negative_prompt_embeds"),
InputParam.template("negative_prompt_embeds_mask"),
InputParam(name="num_images_per_prompt", default=1),
InputParam(name="prompt_embeds", required=True, kwargs_type="denoiser_input_fields"),
InputParam(name="prompt_embeds_mask", required=True, kwargs_type="denoiser_input_fields"),
InputParam(name="negative_prompt_embeds", kwargs_type="denoiser_input_fields"),
InputParam(name="negative_prompt_embeds_mask", kwargs_type="denoiser_input_fields"),
]
@property
def intermediate_outputs(self) -> List[OutputParam]:
def intermediate_outputs(self) -> List[str]:
return [
OutputParam(name="batch_size", type_hint=int, description="The batch size of the prompt embeddings"),
OutputParam(name="dtype", type_hint=torch.dtype, description="The data type of the prompt embeddings"),
OutputParam.template("prompt_embeds", note="batch-expanded"),
OutputParam.template("prompt_embeds_mask", note="batch-expanded"),
OutputParam.template("negative_prompt_embeds", note="batch-expanded"),
OutputParam.template("negative_prompt_embeds_mask", note="batch-expanded"),
OutputParam(
"batch_size",
type_hint=int,
description="Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt",
),
OutputParam(
"dtype",
type_hint=torch.dtype,
description="Data type of model tensor inputs (determined by `prompt_embeds`)",
),
]
@staticmethod
@@ -254,76 +221,20 @@ class QwenImageTextInputsStep(ModularPipelineBlocks):
return components, state
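For readers new to the wording used here, "batch-expanded" means the embeddings are tiled so that every prompt appears once per requested image. A minimal sketch with made-up shapes (not the exact code inside `QwenImageTextInputsStep`):

```python
import torch

# Hypothetical inputs: 2 prompts, sequence length 10, hidden size 8, 3 images per prompt.
prompt_embeds = torch.randn(2, 10, 8)
prompt_embeds_mask = torch.ones(2, 10, dtype=torch.long)
num_images_per_prompt = 3

batch_size = prompt_embeds.shape[0]   # -> batch_size output
dtype = prompt_embeds.dtype           # -> dtype output

# Tile so the final batch is batch_size * num_images_per_prompt.
prompt_embeds = prompt_embeds.repeat_interleave(num_images_per_prompt, dim=0)
prompt_embeds_mask = prompt_embeds_mask.repeat_interleave(num_images_per_prompt, dim=0)

assert prompt_embeds.shape[0] == batch_size * num_images_per_prompt  # 6
```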
# auto_docstring
class QwenImageAdditionalInputsStep(ModularPipelineBlocks):
"""
Input processing step that:
1. For image latent inputs: Updates height/width if None, patchifies, and expands batch size
2. For additional batch inputs: Expands batch dimensions to match final batch size
Configured inputs:
- Image latent inputs: ['image_latents']
This block should be placed after the encoder steps and the text input step.
Components:
pachifier (`QwenImagePachifier`)
Inputs:
num_images_per_prompt (`int`, *optional*, defaults to 1):
The number of images to generate per prompt.
batch_size (`int`, *optional*, defaults to 1):
Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can
be generated in input step.
height (`int`, *optional*):
The height in pixels of the generated image.
width (`int`, *optional*):
The width in pixels of the generated image.
image_latents (`Tensor`):
image latents used to guide the image generation. Can be generated from vae_encoder step.
Outputs:
image_height (`int`):
The image height calculated from the image latents dimension
image_width (`int`):
The image width calculated from the image latents dimension
height (`int`):
if not provided, updated to image height
width (`int`):
if not provided, updated to image width
image_latents (`Tensor`):
image latents used to guide the image generation. Can be generated from vae_encoder step. (patchified and
batch-expanded)
"""
"""Input step for QwenImage: update height/width, expand batch, patchify."""
model_name = "qwenimage"
def __init__(
self,
image_latent_inputs: Optional[List[InputParam]] = None,
additional_batch_inputs: Optional[List[InputParam]] = None,
image_latent_inputs: List[str] = ["image_latents"],
additional_batch_inputs: List[str] = [],
):
# by default, process `image_latents`
if image_latent_inputs is None:
image_latent_inputs = [InputParam.template("image_latents")]
if additional_batch_inputs is None:
additional_batch_inputs = []
if not isinstance(image_latent_inputs, list):
raise ValueError(f"image_latent_inputs must be a list, but got {type(image_latent_inputs)}")
else:
for input_param in image_latent_inputs:
if not isinstance(input_param, InputParam):
raise ValueError(f"image_latent_inputs must be a list of InputParam, but got {type(input_param)}")
image_latent_inputs = [image_latent_inputs]
if not isinstance(additional_batch_inputs, list):
raise ValueError(f"additional_batch_inputs must be a list, but got {type(additional_batch_inputs)}")
else:
for input_param in additional_batch_inputs:
if not isinstance(input_param, InputParam):
raise ValueError(
f"additional_batch_inputs must be a list of InputParam, but got {type(input_param)}"
)
additional_batch_inputs = [additional_batch_inputs]
self._image_latent_inputs = image_latent_inputs
self._additional_batch_inputs = additional_batch_inputs
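A usage sketch of the string-based constructor variant shown above; the names simply refer to entries in the pipeline state, and the inpaint-style configuration mirrors the assembly that appears later in this diff. `QwenImageAdditionalInputsStep` is assumed to already be imported from the QwenImage modular blocks module.

```python
# Default configuration: only `image_latents` is patchified and batch-expanded.
step = QwenImageAdditionalInputsStep()

# Inpaint-style configuration: additionally batch-expand the processed mask.
inpaint_step = QwenImageAdditionalInputsStep(
    image_latent_inputs=["image_latents"],
    additional_batch_inputs=["processed_mask_image"],
)
```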
@@ -341,9 +252,9 @@ class QwenImageAdditionalInputsStep(ModularPipelineBlocks):
if self._image_latent_inputs or self._additional_batch_inputs:
inputs_info = "\n\nConfigured inputs:"
if self._image_latent_inputs:
inputs_info += f"\n - Image latent inputs: {[p.name for p in self._image_latent_inputs]}"
inputs_info += f"\n - Image latent inputs: {self._image_latent_inputs}"
if self._additional_batch_inputs:
inputs_info += f"\n - Additional batch inputs: {[p.name for p in self._additional_batch_inputs]}"
inputs_info += f"\n - Additional batch inputs: {self._additional_batch_inputs}"
placement_section = "\n\nThis block should be placed after the encoder steps and the text input step."
@@ -358,19 +269,23 @@ class QwenImageAdditionalInputsStep(ModularPipelineBlocks):
@property
def inputs(self) -> List[InputParam]:
inputs = [
InputParam.template("num_images_per_prompt"),
InputParam.template("batch_size"),
InputParam.template("height"),
InputParam.template("width"),
InputParam(name="num_images_per_prompt", default=1),
InputParam(name="batch_size", required=True),
InputParam(name="height"),
InputParam(name="width"),
]
# default is `image_latents`
inputs += self._image_latent_inputs + self._additional_batch_inputs
for image_latent_input_name in self._image_latent_inputs:
inputs.append(InputParam(name=image_latent_input_name))
for input_name in self._additional_batch_inputs:
inputs.append(InputParam(name=input_name))
return inputs
@property
def intermediate_outputs(self) -> List[OutputParam]:
outputs = [
return [
OutputParam(
name="image_height",
type_hint=int,
@@ -383,43 +298,11 @@ class QwenImageAdditionalInputsStep(ModularPipelineBlocks):
),
]
# `height`/`width` are not new outputs, but they will be updated if any image latent inputs are provided
if len(self._image_latent_inputs) > 0:
outputs.append(
OutputParam(name="height", type_hint=int, description="if not provided, updated to image height")
)
outputs.append(
OutputParam(name="width", type_hint=int, description="if not provided, updated to image width")
)
# image latent inputs are modified in place (patchified and batch-expanded)
for input_param in self._image_latent_inputs:
outputs.append(
OutputParam(
name=input_param.name,
type_hint=input_param.type_hint,
description=input_param.description + " (patchified and batch-expanded)",
)
)
# additional batch inputs (batch-expanded only)
for input_param in self._additional_batch_inputs:
outputs.append(
OutputParam(
name=input_param.name,
type_hint=input_param.type_hint,
description=input_param.description + " (batch-expanded)",
)
)
return outputs
def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState:
block_state = self.get_block_state(state)
# Process image latent inputs
for input_param in self._image_latent_inputs:
image_latent_input_name = input_param.name
for image_latent_input_name in self._image_latent_inputs:
image_latent_tensor = getattr(block_state, image_latent_input_name)
if image_latent_tensor is None:
continue
@@ -448,8 +331,7 @@ class QwenImageAdditionalInputsStep(ModularPipelineBlocks):
setattr(block_state, image_latent_input_name, image_latent_tensor)
# Process additional batch inputs (only batch expansion)
for input_param in self._additional_batch_inputs:
input_name = input_param.name
for input_name in self._additional_batch_inputs:
input_tensor = getattr(block_state, input_name)
if input_tensor is None:
continue
@@ -467,76 +349,20 @@ class QwenImageAdditionalInputsStep(ModularPipelineBlocks):
return components, state
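A minimal sketch of what "calculated from the image latents dimension" and "patchified" mean in practice. The 4D latent shape, the `vae_scale_factor` of 8, and the inline 2x2 packing are illustrative assumptions; the real step relies on the `calculate_dimension_from_latents` helper and the configured pachifier component.

```python
import torch

vae_scale_factor = 8                              # assumed value for illustration
image_latents = torch.randn(1, 16, 64, 64)        # (batch, channels, latent_h, latent_w), hypothetical

latent_height, latent_width = image_latents.shape[-2:]
image_height = latent_height * vae_scale_factor   # 512 -> image_height output
image_width = latent_width * vae_scale_factor     # 512 -> image_width output

# height/width fall back to the image dimensions when the user did not pass them.
requested_height, requested_width = None, None
height = requested_height if requested_height is not None else image_height
width = requested_width if requested_width is not None else image_width

# Patchify: fold each 2x2 latent patch into the channel dim and flatten to a sequence.
b, c, h, w = image_latents.shape
packed = image_latents.view(b, c, h // 2, 2, w // 2, 2)
packed = packed.permute(0, 2, 4, 1, 3, 5).reshape(b, (h // 2) * (w // 2), c * 4)

# Batch-expand to batch_size * num_images_per_prompt, as in the text inputs step.
num_images_per_prompt = 2
packed = packed.repeat_interleave(num_images_per_prompt, dim=0)
```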
# auto_docstring
class QwenImageEditPlusAdditionalInputsStep(ModularPipelineBlocks):
"""
Input processing step for Edit Plus that:
1. For image latent inputs (list): Collects heights/widths, patchifies each, concatenates, expands batch
2. For additional batch inputs: Expands batch dimensions to match final batch size
Height/width defaults to last image in the list.
Configured inputs:
- Image latent inputs: ['image_latents']
This block should be placed after the encoder steps and the text input step.
Components:
pachifier (`QwenImagePachifier`)
Inputs:
num_images_per_prompt (`int`, *optional*, defaults to 1):
The number of images to generate per prompt.
batch_size (`int`, *optional*, defaults to 1):
Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can
be generated in input step.
height (`int`, *optional*):
The height in pixels of the generated image.
width (`int`, *optional*):
The width in pixels of the generated image.
image_latents (`Tensor`):
image latents used to guide the image generation. Can be generated from vae_encoder step.
Outputs:
image_height (`List`):
The image heights calculated from the image latents dimension
image_width (`List`):
The image widths calculated from the image latents dimension
height (`int`):
if not provided, updated to image height
width (`int`):
if not provided, updated to image width
image_latents (`Tensor`):
image latents used to guide the image generation. Can be generated from vae_encoder step. (patchified,
concatenated, and batch-expanded)
"""
"""Input step for QwenImage Edit Plus: handles list of latents with different sizes."""
model_name = "qwenimage-edit-plus"
def __init__(
self,
image_latent_inputs: Optional[List[InputParam]] = None,
additional_batch_inputs: Optional[List[InputParam]] = None,
image_latent_inputs: List[str] = ["image_latents"],
additional_batch_inputs: List[str] = [],
):
if image_latent_inputs is None:
image_latent_inputs = [InputParam.template("image_latents")]
if additional_batch_inputs is None:
additional_batch_inputs = []
if not isinstance(image_latent_inputs, list):
raise ValueError(f"image_latent_inputs must be a list, but got {type(image_latent_inputs)}")
else:
for input_param in image_latent_inputs:
if not isinstance(input_param, InputParam):
raise ValueError(f"image_latent_inputs must be a list of InputParam, but got {type(input_param)}")
image_latent_inputs = [image_latent_inputs]
if not isinstance(additional_batch_inputs, list):
raise ValueError(f"additional_batch_inputs must be a list, but got {type(additional_batch_inputs)}")
else:
for input_param in additional_batch_inputs:
if not isinstance(input_param, InputParam):
raise ValueError(
f"additional_batch_inputs must be a list of InputParam, but got {type(input_param)}"
)
additional_batch_inputs = [additional_batch_inputs]
self._image_latent_inputs = image_latent_inputs
self._additional_batch_inputs = additional_batch_inputs
@@ -555,9 +381,9 @@ class QwenImageEditPlusAdditionalInputsStep(ModularPipelineBlocks):
if self._image_latent_inputs or self._additional_batch_inputs:
inputs_info = "\n\nConfigured inputs:"
if self._image_latent_inputs:
inputs_info += f"\n - Image latent inputs: {[p.name for p in self._image_latent_inputs]}"
inputs_info += f"\n - Image latent inputs: {self._image_latent_inputs}"
if self._additional_batch_inputs:
inputs_info += f"\n - Additional batch inputs: {[p.name for p in self._additional_batch_inputs]}"
inputs_info += f"\n - Additional batch inputs: {self._additional_batch_inputs}"
placement_section = "\n\nThis block should be placed after the encoder steps and the text input step."
@@ -572,20 +398,23 @@ class QwenImageEditPlusAdditionalInputsStep(ModularPipelineBlocks):
@property
def inputs(self) -> List[InputParam]:
inputs = [
InputParam.template("num_images_per_prompt"),
InputParam.template("batch_size"),
InputParam.template("height"),
InputParam.template("width"),
InputParam(name="num_images_per_prompt", default=1),
InputParam(name="batch_size", required=True),
InputParam(name="height"),
InputParam(name="width"),
]
# default is `image_latents`
inputs += self._image_latent_inputs + self._additional_batch_inputs
for image_latent_input_name in self._image_latent_inputs:
inputs.append(InputParam(name=image_latent_input_name))
for input_name in self._additional_batch_inputs:
inputs.append(InputParam(name=input_name))
return inputs
@property
def intermediate_outputs(self) -> List[OutputParam]:
outputs = [
return [
OutputParam(
name="image_height",
type_hint=List[int],
@@ -598,43 +427,11 @@ class QwenImageEditPlusAdditionalInputsStep(ModularPipelineBlocks):
),
]
# `height`/`width` are updated if any image latent inputs are provided
if len(self._image_latent_inputs) > 0:
outputs.append(
OutputParam(name="height", type_hint=int, description="if not provided, updated to image height")
)
outputs.append(
OutputParam(name="width", type_hint=int, description="if not provided, updated to image width")
)
# image latent inputs are modified in place (patchified, concatenated, and batch-expanded)
for input_param in self._image_latent_inputs:
outputs.append(
OutputParam(
name=input_param.name,
type_hint=input_param.type_hint,
description=input_param.description + " (patchified, concatenated, and batch-expanded)",
)
)
# additional batch inputs (batch-expanded only)
for input_param in self._additional_batch_inputs:
outputs.append(
OutputParam(
name=input_param.name,
type_hint=input_param.type_hint,
description=input_param.description + " (batch-expanded)",
)
)
return outputs
def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState:
block_state = self.get_block_state(state)
# Process image latent inputs
for input_param in self._image_latent_inputs:
image_latent_input_name = input_param.name
for image_latent_input_name in self._image_latent_inputs:
image_latent_tensor = getattr(block_state, image_latent_input_name)
if image_latent_tensor is None:
continue
@@ -679,8 +476,7 @@ class QwenImageEditPlusAdditionalInputsStep(ModularPipelineBlocks):
setattr(block_state, image_latent_input_name, packed_image_latent_tensors)
# Process additional batch inputs (only batch expansion)
for input_param in self._additional_batch_inputs:
input_name = input_param.name
for input_name in self._additional_batch_inputs:
input_tensor = getattr(block_state, input_name)
if input_tensor is None:
continue
@@ -698,75 +494,22 @@ class QwenImageEditPlusAdditionalInputsStep(ModularPipelineBlocks):
return components, state
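For the Edit Plus variant the same idea applies per image: each latent in the list is patchified on its own, the packed sequences are concatenated, and height/width default to the last image. A sketch under the same simplifying assumptions as above:

```python
import torch


def pack(latent):
    # Simplified 2x2 packing, matching the sketch for the non-Edit-Plus step above.
    b, c, h, w = latent.shape
    x = latent.view(b, c, h // 2, 2, w // 2, 2)
    return x.permute(0, 2, 4, 1, 3, 5).reshape(b, (h // 2) * (w // 2), c * 4)


vae_scale_factor = 8                                                        # assumed
image_latents = [torch.randn(1, 16, 64, 64), torch.randn(1, 16, 32, 48)]    # hypothetical, differently sized

image_height = [lat.shape[-2] * vae_scale_factor for lat in image_latents]  # list, one entry per image
image_width = [lat.shape[-1] * vae_scale_factor for lat in image_latents]

# height/width default to the last image in the list.
height, width = image_height[-1], image_width[-1]

# Patchify each latent, then concatenate along the sequence dimension.
packed = torch.cat([pack(lat) for lat in image_latents], dim=1)             # (1, 1024 + 384, 64)
```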
# same as QwenImageAdditionalInputsStep, but with layered pachifier.
# auto_docstring
# YiYi TODO: support defining a config default component from the ModularPipeline level.
# it is the same as QwenImageAdditionalInputsStep, but with layered pachifier.
class QwenImageLayeredAdditionalInputsStep(ModularPipelineBlocks):
"""
Input processing step for Layered that:
1. For image latent inputs: Updates height/width if None, patchifies with layered pachifier, and expands batch
size
2. For additional batch inputs: Expands batch dimensions to match final batch size
Configured inputs:
- Image latent inputs: ['image_latents']
This block should be placed after the encoder steps and the text input step.
Components:
pachifier (`QwenImageLayeredPachifier`)
Inputs:
num_images_per_prompt (`int`, *optional*, defaults to 1):
The number of images to generate per prompt.
batch_size (`int`, *optional*, defaults to 1):
Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can
be generated in input step.
image_latents (`Tensor`):
image latents used to guide the image generation. Can be generated from vae_encoder step.
Outputs:
image_height (`int`):
The image height calculated from the image latents dimension
image_width (`int`):
The image width calculated from the image latents dimension
height (`int`):
if not provided, updated to image height
width (`int`):
if not provided, updated to image width
image_latents (`Tensor`):
image latents used to guide the image generation. Can be generated from vae_encoder step. (patchified
with layered pachifier and batch-expanded)
"""
"""Input step for QwenImage Layered: update height/width, expand batch, patchify with layered pachifier."""
model_name = "qwenimage-layered"
def __init__(
self,
image_latent_inputs: Optional[List[InputParam]] = None,
additional_batch_inputs: Optional[List[InputParam]] = None,
image_latent_inputs: List[str] = ["image_latents"],
additional_batch_inputs: List[str] = [],
):
if image_latent_inputs is None:
image_latent_inputs = [InputParam.template("image_latents")]
if additional_batch_inputs is None:
additional_batch_inputs = []
if not isinstance(image_latent_inputs, list):
raise ValueError(f"image_latent_inputs must be a list, but got {type(image_latent_inputs)}")
else:
for input_param in image_latent_inputs:
if not isinstance(input_param, InputParam):
raise ValueError(f"image_latent_inputs must be a list of InputParam, but got {type(input_param)}")
image_latent_inputs = [image_latent_inputs]
if not isinstance(additional_batch_inputs, list):
raise ValueError(f"additional_batch_inputs must be a list, but got {type(additional_batch_inputs)}")
else:
for input_param in additional_batch_inputs:
if not isinstance(input_param, InputParam):
raise ValueError(
f"additional_batch_inputs must be a list of InputParam, but got {type(input_param)}"
)
additional_batch_inputs = [additional_batch_inputs]
self._image_latent_inputs = image_latent_inputs
self._additional_batch_inputs = additional_batch_inputs
@@ -784,9 +527,9 @@ class QwenImageLayeredAdditionalInputsStep(ModularPipelineBlocks):
if self._image_latent_inputs or self._additional_batch_inputs:
inputs_info = "\n\nConfigured inputs:"
if self._image_latent_inputs:
inputs_info += f"\n - Image latent inputs: {[p.name for p in self._image_latent_inputs]}"
inputs_info += f"\n - Image latent inputs: {self._image_latent_inputs}"
if self._additional_batch_inputs:
inputs_info += f"\n - Additional batch inputs: {[p.name for p in self._additional_batch_inputs]}"
inputs_info += f"\n - Additional batch inputs: {self._additional_batch_inputs}"
placement_section = "\n\nThis block should be placed after the encoder steps and the text input step."
@@ -801,18 +544,21 @@ class QwenImageLayeredAdditionalInputsStep(ModularPipelineBlocks):
@property
def inputs(self) -> List[InputParam]:
inputs = [
InputParam.template("num_images_per_prompt"),
InputParam.template("batch_size"),
InputParam(name="num_images_per_prompt", default=1),
InputParam(name="batch_size", required=True),
]
# default is `image_latents`
inputs += self._image_latent_inputs + self._additional_batch_inputs
for image_latent_input_name in self._image_latent_inputs:
inputs.append(InputParam(name=image_latent_input_name))
for input_name in self._additional_batch_inputs:
inputs.append(InputParam(name=input_name))
return inputs
@property
def intermediate_outputs(self) -> List[OutputParam]:
outputs = [
return [
OutputParam(
name="image_height",
type_hint=int,
@@ -823,44 +569,15 @@ class QwenImageLayeredAdditionalInputsStep(ModularPipelineBlocks):
type_hint=int,
description="The image width calculated from the image latents dimension",
),
OutputParam(name="height", type_hint=int, description="The height of the image output"),
OutputParam(name="width", type_hint=int, description="The width of the image output"),
]
if len(self._image_latent_inputs) > 0:
outputs.append(
OutputParam(name="height", type_hint=int, description="if not provided, updated to image height")
)
outputs.append(
OutputParam(name="width", type_hint=int, description="if not provided, updated to image width")
)
# Add outputs for image latent inputs (patchified with layered pachifier and batch-expanded)
for input_param in self._image_latent_inputs:
outputs.append(
OutputParam(
name=input_param.name,
type_hint=input_param.type_hint,
description=input_param.description + " (patchified with layered pachifier and batch-expanded)",
)
)
# Add outputs for additional batch inputs (batch-expanded only)
for input_param in self._additional_batch_inputs:
outputs.append(
OutputParam(
name=input_param.name,
type_hint=input_param.type_hint,
description=input_param.description + " (batch-expanded)",
)
)
return outputs
def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState:
block_state = self.get_block_state(state)
# Process image latent inputs
for input_param in self._image_latent_inputs:
image_latent_input_name = input_param.name
for image_latent_input_name in self._image_latent_inputs:
image_latent_tensor = getattr(block_state, image_latent_input_name)
if image_latent_tensor is None:
continue
@@ -891,8 +608,7 @@ class QwenImageLayeredAdditionalInputsStep(ModularPipelineBlocks):
setattr(block_state, image_latent_input_name, image_latent_tensor)
# Process additional batch inputs (only batch expansion)
for input_param in self._additional_batch_inputs:
input_name = input_param.name
for input_name in self._additional_batch_inputs:
input_tensor = getattr(block_state, input_name)
if input_tensor is None:
continue
@@ -910,34 +626,7 @@ class QwenImageLayeredAdditionalInputsStep(ModularPipelineBlocks):
return components, state
# auto_docstring
class QwenImageControlNetInputsStep(ModularPipelineBlocks):
"""
Prepare the `control_image_latents` for ControlNet. Insert after all the other input steps.
Inputs:
control_image_latents (`Tensor`):
The control image latents to use for the denoising process. Can be generated in controlnet vae encoder
step.
batch_size (`int`, *optional*, defaults to 1):
Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can
be generated in input step.
num_images_per_prompt (`int`, *optional*, defaults to 1):
The number of images to generate per prompt.
height (`int`, *optional*):
The height in pixels of the generated image.
width (`int`, *optional*):
The width in pixels of the generated image.
Outputs:
control_image_latents (`Tensor`):
The control image latents (patchified and batch-expanded).
height (`int`):
if not provided, updated to control image height
width (`int`):
if not provided, updated to control image width
"""
model_name = "qwenimage"
@property
@@ -947,28 +636,11 @@ class QwenImageControlNetInputsStep(ModularPipelineBlocks):
@property
def inputs(self) -> List[InputParam]:
return [
InputParam(
name="control_image_latents",
required=True,
type_hint=torch.Tensor,
description="The control image latents to use for the denoising process. Can be generated in controlnet vae encoder step.",
),
InputParam.template("batch_size"),
InputParam.template("num_images_per_prompt"),
InputParam.template("height"),
InputParam.template("width"),
]
@property
def intermediate_outputs(self) -> List[OutputParam]:
return [
OutputParam(
name="control_image_latents",
type_hint=torch.Tensor,
description="The control image latents (patchified and batch-expanded).",
),
OutputParam(name="height", type_hint=int, description="if not provided, updated to control image height"),
OutputParam(name="width", type_hint=int, description="if not provided, updated to control image width"),
InputParam(name="control_image_latents", required=True),
InputParam(name="batch_size", required=True),
InputParam(name="num_images_per_prompt", default=1),
InputParam(name="height"),
InputParam(name="width"),
]
@torch.no_grad()

View File

@@ -12,11 +12,14 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List
import PIL.Image
import torch
from ...utils import logging
from ..modular_pipeline import AutoPipelineBlocks, ConditionalPipelineBlocks, SequentialPipelineBlocks
from ..modular_pipeline_utils import InputParam, InsertableDict, OutputParam
from ..modular_pipeline_utils import InsertableDict, OutputParam
from .before_denoise import (
QwenImageControlNetBeforeDenoiserStep,
QwenImageCreateMaskLatentsStep,
@@ -56,91 +59,11 @@ logger = logging.get_logger(__name__)
# ====================
# 1. TEXT ENCODER
# 1. VAE ENCODER
# ====================
# auto_docstring
class QwenImageAutoTextEncoderStep(AutoPipelineBlocks):
"""
Text encoder step that encodes the text prompt into a text embedding. This is an auto pipeline block.
Components:
text_encoder (`Qwen2_5_VLForConditionalGeneration`): The text encoder to use tokenizer (`Qwen2Tokenizer`):
The tokenizer to use guider (`ClassifierFreeGuidance`)
Inputs:
prompt (`str`, *optional*):
The prompt or prompts to guide image generation.
negative_prompt (`str`, *optional*):
The prompt or prompts not to guide the image generation.
max_sequence_length (`int`, *optional*, defaults to 1024):
Maximum sequence length for prompt encoding.
Outputs:
prompt_embeds (`Tensor`):
The prompt embeddings.
prompt_embeds_mask (`Tensor`):
The encoder attention mask.
negative_prompt_embeds (`Tensor`):
The negative prompt embeddings.
negative_prompt_embeds_mask (`Tensor`):
The negative prompt embeddings mask.
"""
model_name = "qwenimage"
block_classes = [QwenImageTextEncoderStep()]
block_names = ["text_encoder"]
block_trigger_inputs = ["prompt"]
@property
def description(self) -> str:
return "Text encoder step that encodes the text prompt into a text embedding. This is an auto pipeline block."
" - `QwenImageTextEncoderStep` (text_encoder) is used when `prompt` is provided."
" - if `prompt` is not provided, step will be skipped."
# ====================
# 2. VAE ENCODER
# ====================
# auto_docstring
class QwenImageInpaintVaeEncoderStep(SequentialPipelineBlocks):
"""
This step is used for processing image and mask inputs for inpainting tasks. It:
- Resizes the image to the target size, based on `height` and `width`.
- Processes and updates `image` and `mask_image`.
- Creates `image_latents`.
Components:
image_mask_processor (`InpaintProcessor`) vae (`AutoencoderKLQwenImage`)
Inputs:
mask_image (`Image`):
Mask image for inpainting.
image (`Union[Image, List]`):
Reference image(s) for denoising. Can be a single image or list of images.
height (`int`, *optional*):
The height in pixels of the generated image.
width (`int`, *optional*):
The width in pixels of the generated image.
padding_mask_crop (`int`, *optional*):
Padding for mask cropping in inpainting.
generator (`Generator`, *optional*):
Torch generator for deterministic generation.
Outputs:
processed_image (`Tensor`):
The processed image
processed_mask_image (`Tensor`):
The processed mask image
mask_overlay_kwargs (`Dict`):
The kwargs for the postprocess step to apply the mask overlay
image_latents (`Tensor`):
The latent representation of the input image.
"""
model_name = "qwenimage"
block_classes = [QwenImageInpaintProcessImagesInputStep(), QwenImageVaeEncoderStep()]
block_names = ["preprocess", "encode"]
@@ -155,31 +78,7 @@ class QwenImageInpaintVaeEncoderStep(SequentialPipelineBlocks):
)
# auto_docstring
class QwenImageImg2ImgVaeEncoderStep(SequentialPipelineBlocks):
"""
VAE encoder step that preprocesses and encodes the image inputs into their latent representations.
Components:
image_processor (`VaeImageProcessor`) vae (`AutoencoderKLQwenImage`)
Inputs:
image (`Union[Image, List]`):
Reference image(s) for denoising. Can be a single image or list of images.
height (`int`, *optional*):
The height in pixels of the generated image.
width (`int`, *optional*):
The width in pixels of the generated image.
generator (`Generator`, *optional*):
Torch generator for deterministic generation.
Outputs:
processed_image (`Tensor`):
The processed image
image_latents (`Tensor`):
The latent representation of the input image.
"""
model_name = "qwenimage"
block_classes = [QwenImageProcessImagesInputStep(), QwenImageVaeEncoderStep()]
@@ -190,6 +89,7 @@ class QwenImageImg2ImgVaeEncoderStep(SequentialPipelineBlocks):
return "Vae encoder step that preprocess andencode the image inputs into their latent representations."
# Auto VAE encoder
class QwenImageAutoVaeEncoderStep(AutoPipelineBlocks):
block_classes = [QwenImageInpaintVaeEncoderStep, QwenImageImg2ImgVaeEncoderStep]
block_names = ["inpaint", "img2img"]
@@ -207,33 +107,7 @@ class QwenImageAutoVaeEncoderStep(AutoPipelineBlocks):
# optional controlnet vae encoder
# auto_docstring
class QwenImageOptionalControlNetVaeEncoderStep(AutoPipelineBlocks):
"""
VAE encoder step that encodes the image inputs into their latent representations.
This is an auto pipeline block.
- `QwenImageControlNetVaeEncoderStep` (controlnet) is used when `control_image` is provided.
- if `control_image` is not provided, step will be skipped.
Components:
vae (`AutoencoderKLQwenImage`) controlnet (`QwenImageControlNetModel`) control_image_processor
(`VaeImageProcessor`)
Inputs:
control_image (`Image`, *optional*):
Control image for ControlNet conditioning.
height (`int`, *optional*):
The height in pixels of the generated image.
width (`int`, *optional*):
The width in pixels of the generated image.
generator (`Generator`, *optional*):
Torch generator for deterministic generation.
Outputs:
control_image_latents (`Tensor`):
The latents representing the control image
"""
block_classes = [QwenImageControlNetVaeEncoderStep]
block_names = ["controlnet"]
block_trigger_inputs = ["control_image"]
@@ -249,65 +123,14 @@ class QwenImageOptionalControlNetVaeEncoderStep(AutoPipelineBlocks):
# ====================
# 3. DENOISE (input -> prepare_latents -> set_timesteps -> prepare_rope_inputs -> denoise -> after_denoise)
# 2. DENOISE (input -> prepare_latents -> set_timesteps -> prepare_rope_inputs -> denoise -> after_denoise)
# ====================
# assemble input steps
# auto_docstring
class QwenImageImg2ImgInputStep(SequentialPipelineBlocks):
"""
Input step that prepares the inputs for the img2img denoising step.
Components:
pachifier (`QwenImagePachifier`)
Inputs:
num_images_per_prompt (`int`, *optional*, defaults to 1):
The number of images to generate per prompt.
prompt_embeds (`Tensor`):
text embeddings used to guide the image generation. Can be generated from text_encoder step.
prompt_embeds_mask (`Tensor`):
mask for the text embeddings. Can be generated from text_encoder step.
negative_prompt_embeds (`Tensor`, *optional*):
negative text embeddings used to guide the image generation. Can be generated from text_encoder step.
negative_prompt_embeds_mask (`Tensor`, *optional*):
mask for the negative text embeddings. Can be generated from text_encoder step.
height (`int`, *optional*):
The height in pixels of the generated image.
width (`int`, *optional*):
The width in pixels of the generated image.
image_latents (`Tensor`):
image latents used to guide the image generation. Can be generated from vae_encoder step.
Outputs:
batch_size (`int`):
The batch size of the prompt embeddings
dtype (`dtype`):
The data type of the prompt embeddings
prompt_embeds (`Tensor`):
The prompt embeddings. (batch-expanded)
prompt_embeds_mask (`Tensor`):
The encoder attention mask. (batch-expanded)
negative_prompt_embeds (`Tensor`):
The negative prompt embeddings. (batch-expanded)
negative_prompt_embeds_mask (`Tensor`):
The negative prompt embeddings mask. (batch-expanded)
image_height (`int`):
The image height calculated from the image latents dimension
image_width (`int`):
The image width calculated from the image latents dimension
height (`int`):
if not provided, updated to image height
width (`int`):
if not provided, updated to image width
image_latents (`Tensor`):
image latents used to guide the image generation. Can be generated from vae_encoder step. (patchified and
batch-expanded)
"""
model_name = "qwenimage"
block_classes = [QwenImageTextInputsStep(), QwenImageAdditionalInputsStep()]
block_classes = [QwenImageTextInputsStep(), QwenImageAdditionalInputsStep(image_latent_inputs=["image_latents"])]
block_names = ["text_inputs", "additional_inputs"]
@property
@@ -317,69 +140,12 @@ class QwenImageImg2ImgInputStep(SequentialPipelineBlocks):
" - update height/width based `image_latents`, patchify `image_latents`."
# auto_docstring
class QwenImageInpaintInputStep(SequentialPipelineBlocks):
"""
Input step that prepares the inputs for the inpainting denoising step.
Components:
pachifier (`QwenImagePachifier`)
Inputs:
num_images_per_prompt (`int`, *optional*, defaults to 1):
The number of images to generate per prompt.
prompt_embeds (`Tensor`):
text embeddings used to guide the image generation. Can be generated from text_encoder step.
prompt_embeds_mask (`Tensor`):
mask for the text embeddings. Can be generated from text_encoder step.
negative_prompt_embeds (`Tensor`, *optional*):
negative text embeddings used to guide the image generation. Can be generated from text_encoder step.
negative_prompt_embeds_mask (`Tensor`, *optional*):
mask for the negative text embeddings. Can be generated from text_encoder step.
height (`int`, *optional*):
The height in pixels of the generated image.
width (`int`, *optional*):
The width in pixels of the generated image.
image_latents (`Tensor`, *optional*):
image latents used to guide the image generation. Can be generated from vae_encoder step.
processed_mask_image (`Tensor`, *optional*):
The processed mask image
Outputs:
batch_size (`int`):
The batch size of the prompt embeddings
dtype (`dtype`):
The data type of the prompt embeddings
prompt_embeds (`Tensor`):
The prompt embeddings. (batch-expanded)
prompt_embeds_mask (`Tensor`):
The encoder attention mask. (batch-expanded)
negative_prompt_embeds (`Tensor`):
The negative prompt embeddings. (batch-expanded)
negative_prompt_embeds_mask (`Tensor`):
The negative prompt embeddings mask. (batch-expanded)
image_height (`int`):
The image height calculated from the image latents dimension
image_width (`int`):
The image width calculated from the image latents dimension
height (`int`):
if not provided, updated to image height
width (`int`):
if not provided, updated to image width
image_latents (`Tensor`):
image latents used to guide the image generation. Can be generated from vae_encoder step. (patchified and
batch-expanded)
processed_mask_image (`Tensor`):
The processed mask image (batch-expanded)
"""
model_name = "qwenimage"
block_classes = [
QwenImageTextInputsStep(),
QwenImageAdditionalInputsStep(
additional_batch_inputs=[
InputParam(name="processed_mask_image", type_hint=torch.Tensor, description="The processed mask image")
]
image_latent_inputs=["image_latents"], additional_batch_inputs=["processed_mask_image"]
),
]
block_names = ["text_inputs", "additional_inputs"]
@@ -392,42 +158,7 @@ class QwenImageInpaintInputStep(SequentialPipelineBlocks):
# assemble prepare latents steps
# auto_docstring
class QwenImageInpaintPrepareLatentsStep(SequentialPipelineBlocks):
"""
This step prepares the latents/image_latents and mask inputs for the inpainting denoising step. It:
- Adds noise to the image latents to create the latents input for the denoiser.
- Creates the patchified latents `mask` based on the processed mask image.
Components:
scheduler (`FlowMatchEulerDiscreteScheduler`) pachifier (`QwenImagePachifier`)
Inputs:
latents (`Tensor`):
The initial random noise. Can be generated in the prepare latents step.
image_latents (`Tensor`):
image latents used to guide the image generation. Can be generated from the vae_encoder step and updated in the
input step.
timesteps (`Tensor`):
The timesteps to use for the denoising process. Can be generated in set_timesteps step.
processed_mask_image (`Tensor`):
The processed mask to use for the inpainting process.
height (`int`):
The height in pixels of the generated image.
width (`int`):
The width in pixels of the generated image.
dtype (`dtype`, *optional*, defaults to torch.float32):
The dtype of the model inputs, can be generated in input step.
Outputs:
initial_noise (`Tensor`):
The initial random noise used for inpainting denoising.
latents (`Tensor`):
The scaled noisy latents to use for inpainting/image-to-image denoising.
mask (`Tensor`):
The mask to use for the inpainting process.
"""
model_name = "qwenimage"
block_classes = [QwenImagePrepareLatentsWithStrengthStep(), QwenImageCreateMaskLatentsStep()]
block_names = ["add_noise_to_latents", "create_mask_latents"]
@@ -445,49 +176,7 @@ class QwenImageInpaintPrepareLatentsStep(SequentialPipelineBlocks):
# Qwen Image (text2image)
# auto_docstring
class QwenImageCoreDenoiseStep(SequentialPipelineBlocks):
"""
Step that denoises noise into an image for the text2image task. It includes the denoise loop as well as preparing
the inputs (timesteps, latents, RoPE inputs, etc.).
Components:
pachifier (`QwenImagePachifier`) scheduler (`FlowMatchEulerDiscreteScheduler`) guider
(`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`)
Inputs:
num_images_per_prompt (`int`, *optional*, defaults to 1):
The number of images to generate per prompt.
prompt_embeds (`Tensor`):
text embeddings used to guide the image generation. Can be generated from text_encoder step.
prompt_embeds_mask (`Tensor`):
mask for the text embeddings. Can be generated from text_encoder step.
negative_prompt_embeds (`Tensor`, *optional*):
negative text embeddings used to guide the image generation. Can be generated from text_encoder step.
negative_prompt_embeds_mask (`Tensor`, *optional*):
mask for the negative text embeddings. Can be generated from text_encoder step.
latents (`Tensor`, *optional*):
Pre-generated noisy latents for image generation.
height (`int`, *optional*):
The height in pixels of the generated image.
width (`int`, *optional*):
The width in pixels of the generated image.
generator (`Generator`, *optional*):
Torch generator for deterministic generation.
num_inference_steps (`int`, *optional*, defaults to 50):
The number of denoising steps.
sigmas (`List`, *optional*):
Custom sigmas for the denoising process.
attention_kwargs (`Dict`, *optional*):
Additional kwargs for attention processors.
**denoiser_input_fields (`None`, *optional*):
conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
Outputs:
latents (`Tensor`):
Denoised latents.
"""
model_name = "qwenimage"
block_classes = [
QwenImageTextInputsStep(),
@@ -510,63 +199,9 @@ class QwenImageCoreDenoiseStep(SequentialPipelineBlocks):
def description(self):
return "step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the inputs (timesteps, latents, rope inputs etc.)."
@property
def outputs(self):
return [
OutputParam.template("latents"),
]
# Qwen Image (inpainting)
# auto_docstring
class QwenImageInpaintCoreDenoiseStep(SequentialPipelineBlocks):
"""
Before-denoise step that prepares the inputs (timesteps, latents, RoPE inputs, etc.) for the denoise step for the
inpainting task.
Components:
pachifier (`QwenImagePachifier`) scheduler (`FlowMatchEulerDiscreteScheduler`) guider
(`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`)
Inputs:
num_images_per_prompt (`int`, *optional*, defaults to 1):
The number of images to generate per prompt.
prompt_embeds (`Tensor`):
text embeddings used to guide the image generation. Can be generated from text_encoder step.
prompt_embeds_mask (`Tensor`):
mask for the text embeddings. Can be generated from text_encoder step.
negative_prompt_embeds (`Tensor`, *optional*):
negative text embeddings used to guide the image generation. Can be generated from text_encoder step.
negative_prompt_embeds_mask (`Tensor`, *optional*):
mask for the negative text embeddings. Can be generated from text_encoder step.
height (`int`, *optional*):
The height in pixels of the generated image.
width (`int`, *optional*):
The width in pixels of the generated image.
image_latents (`Tensor`, *optional*):
image latents used to guide the image generation. Can be generated from vae_encoder step.
processed_mask_image (`Tensor`, *optional*):
The processed mask image
latents (`Tensor`, *optional*):
Pre-generated noisy latents for image generation.
generator (`Generator`, *optional*):
Torch generator for deterministic generation.
num_inference_steps (`int`, *optional*, defaults to 50):
The number of denoising steps.
sigmas (`List`, *optional*):
Custom sigmas for the denoising process.
strength (`float`, *optional*, defaults to 0.9):
Strength for img2img/inpainting.
attention_kwargs (`Dict`, *optional*):
Additional kwargs for attention processors.
**denoiser_input_fields (`None`, *optional*):
conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
Outputs:
latents (`Tensor`):
Denoised latents.
"""
model_name = "qwenimage"
block_classes = [
QwenImageInpaintInputStep(),
@@ -591,61 +226,9 @@ class QwenImageInpaintCoreDenoiseStep(SequentialPipelineBlocks):
def description(self):
return "Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for inpaint task."
@property
def outputs(self):
return [
OutputParam.template("latents"),
]
# Qwen Image (image2image)
# auto_docstring
class QwenImageImg2ImgCoreDenoiseStep(SequentialPipelineBlocks):
"""
Before-denoise step that prepares the inputs (timesteps, latents, RoPE inputs, etc.) for the denoise step for the
img2img task.
Components:
pachifier (`QwenImagePachifier`) scheduler (`FlowMatchEulerDiscreteScheduler`) guider
(`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`)
Inputs:
num_images_per_prompt (`int`, *optional*, defaults to 1):
The number of images to generate per prompt.
prompt_embeds (`Tensor`):
text embeddings used to guide the image generation. Can be generated from text_encoder step.
prompt_embeds_mask (`Tensor`):
mask for the text embeddings. Can be generated from text_encoder step.
negative_prompt_embeds (`Tensor`, *optional*):
negative text embeddings used to guide the image generation. Can be generated from text_encoder step.
negative_prompt_embeds_mask (`Tensor`, *optional*):
mask for the negative text embeddings. Can be generated from text_encoder step.
height (`int`, *optional*):
The height in pixels of the generated image.
width (`int`, *optional*):
The width in pixels of the generated image.
image_latents (`Tensor`):
image latents used to guide the image generation. Can be generated from vae_encoder step.
latents (`Tensor`, *optional*):
Pre-generated noisy latents for image generation.
generator (`Generator`, *optional*):
Torch generator for deterministic generation.
num_inference_steps (`int`, *optional*, defaults to 50):
The number of denoising steps.
sigmas (`List`, *optional*):
Custom sigmas for the denoising process.
strength (`float`, *optional*, defaults to 0.9):
Strength for img2img/inpainting.
attention_kwargs (`Dict`, *optional*):
Additional kwargs for attention processors.
**denoiser_input_fields (`None`, *optional*):
conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
Outputs:
latents (`Tensor`):
Denoised latents.
"""
model_name = "qwenimage"
block_classes = [
QwenImageImg2ImgInputStep(),
@@ -670,66 +253,9 @@ class QwenImageImg2ImgCoreDenoiseStep(SequentialPipelineBlocks):
def description(self):
return "Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for img2img task."
@property
def outputs(self):
return [
OutputParam.template("latents"),
]
# Qwen Image (text2image) with controlnet
# auto_docstring
class QwenImageControlNetCoreDenoiseStep(SequentialPipelineBlocks):
"""
Step that denoises noise into an image for the text2image task. It includes the denoise loop as well as preparing
the inputs (timesteps, latents, RoPE inputs, etc.).
Components:
pachifier (`QwenImagePachifier`) scheduler (`FlowMatchEulerDiscreteScheduler`) controlnet
(`QwenImageControlNetModel`) guider (`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`)
Inputs:
num_images_per_prompt (`int`, *optional*, defaults to 1):
The number of images to generate per prompt.
prompt_embeds (`Tensor`):
text embeddings used to guide the image generation. Can be generated from text_encoder step.
prompt_embeds_mask (`Tensor`):
mask for the text embeddings. Can be generated from text_encoder step.
negative_prompt_embeds (`Tensor`, *optional*):
negative text embeddings used to guide the image generation. Can be generated from text_encoder step.
negative_prompt_embeds_mask (`Tensor`, *optional*):
mask for the negative text embeddings. Can be generated from text_encoder step.
control_image_latents (`Tensor`):
The control image latents to use for the denoising process. Can be generated in controlnet vae encoder
step.
height (`int`, *optional*):
The height in pixels of the generated image.
width (`int`, *optional*):
The width in pixels of the generated image.
latents (`Tensor`, *optional*):
Pre-generated noisy latents for image generation.
generator (`Generator`, *optional*):
Torch generator for deterministic generation.
num_inference_steps (`int`, *optional*, defaults to 50):
The number of denoising steps.
sigmas (`List`, *optional*):
Custom sigmas for the denoising process.
control_guidance_start (`float`, *optional*, defaults to 0.0):
When to start applying ControlNet.
control_guidance_end (`float`, *optional*, defaults to 1.0):
When to stop applying ControlNet.
controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0):
Scale for ControlNet conditioning.
attention_kwargs (`Dict`, *optional*):
Additional kwargs for attention processors.
**denoiser_input_fields (`None`, *optional*):
conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
Outputs:
latents (`Tensor`):
Denoised latents.
"""
model_name = "qwenimage"
block_classes = [
QwenImageTextInputsStep(),
@@ -756,72 +282,9 @@ class QwenImageControlNetCoreDenoiseStep(SequentialPipelineBlocks):
def description(self):
return "step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the inputs (timesteps, latents, rope inputs etc.)."
@property
def outputs(self):
return [
OutputParam.template("latents"),
]
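The `control_guidance_start` / `control_guidance_end` inputs are typically converted into per-step `controlnet_keep` factors that gate the conditioning scale; the snippet below follows the usual diffusers ControlNet pattern and is meant as an illustration, not a copy of the QwenImage block.

```python
num_inference_steps = 10
control_guidance_start, control_guidance_end = 0.0, 0.8   # apply ControlNet for the first 80% of steps
controlnet_conditioning_scale = 1.0

controlnet_keep = []
for i in range(num_inference_steps):
    in_window = (i / num_inference_steps >= control_guidance_start) and (
        (i + 1) / num_inference_steps <= control_guidance_end
    )
    controlnet_keep.append(1.0 if in_window else 0.0)

# Effective scale applied to the ControlNet residuals at step i:
i = 5
cond_scale = controlnet_conditioning_scale * controlnet_keep[i]   # 1.0 here; 0.0 once past 80% of the steps
```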
# Qwen Image (inpainting) with controlnet
# auto_docstring
class QwenImageControlNetInpaintCoreDenoiseStep(SequentialPipelineBlocks):
"""
Before-denoise step that prepares the inputs (timesteps, latents, RoPE inputs, etc.) for the denoise step for the
inpainting task.
Components:
pachifier (`QwenImagePachifier`) scheduler (`FlowMatchEulerDiscreteScheduler`) controlnet
(`QwenImageControlNetModel`) guider (`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`)
Inputs:
num_images_per_prompt (`int`, *optional*, defaults to 1):
The number of images to generate per prompt.
prompt_embeds (`Tensor`):
text embeddings used to guide the image generation. Can be generated from text_encoder step.
prompt_embeds_mask (`Tensor`):
mask for the text embeddings. Can be generated from text_encoder step.
negative_prompt_embeds (`Tensor`, *optional*):
negative text embeddings used to guide the image generation. Can be generated from text_encoder step.
negative_prompt_embeds_mask (`Tensor`, *optional*):
mask for the negative text embeddings. Can be generated from text_encoder step.
height (`int`, *optional*):
The height in pixels of the generated image.
width (`int`, *optional*):
The width in pixels of the generated image.
image_latents (`Tensor`, *optional*):
image latents used to guide the image generation. Can be generated from vae_encoder step.
processed_mask_image (`Tensor`, *optional*):
The processed mask image
control_image_latents (`Tensor`):
The control image latents to use for the denoising process. Can be generated in controlnet vae encoder
step.
latents (`Tensor`, *optional*):
Pre-generated noisy latents for image generation.
generator (`Generator`, *optional*):
Torch generator for deterministic generation.
num_inference_steps (`int`, *optional*, defaults to 50):
The number of denoising steps.
sigmas (`List`, *optional*):
Custom sigmas for the denoising process.
strength (`float`, *optional*, defaults to 0.9):
Strength for img2img/inpainting.
control_guidance_start (`float`, *optional*, defaults to 0.0):
When to start applying ControlNet.
control_guidance_end (`float`, *optional*, defaults to 1.0):
When to stop applying ControlNet.
controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0):
Scale for ControlNet conditioning.
attention_kwargs (`Dict`, *optional*):
Additional kwargs for attention processors.
**denoiser_input_fields (`None`, *optional*):
conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
Outputs:
latents (`Tensor`):
Denoised latents.
"""
model_name = "qwenimage"
block_classes = [
QwenImageInpaintInputStep(),
@@ -850,70 +313,9 @@ class QwenImageControlNetInpaintCoreDenoiseStep(SequentialPipelineBlocks):
def description(self):
return "Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for inpaint task."
@property
def outputs(self):
return [
OutputParam.template("latents"),
]
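# Note: `strength` and `num_inference_steps` interact in the usual diffusers img2img/inpaint fashion
# (assumed to hold for these blocks as well): `strength` decides how many of the scheduled steps are
# actually run. A quick worked sketch of that convention, not this block's exact code:

num_inference_steps = 50
strength = 0.9
init_timestep = min(int(num_inference_steps * strength), num_inference_steps)  # 45
t_start = max(num_inference_steps - init_timestep, 0)                          # 5
# -> 45 denoising steps are run, starting from a partially noised image latent.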
# Qwen Image (image2image) with controlnet
# auto_docstring
class QwenImageControlNetImg2ImgCoreDenoiseStep(SequentialPipelineBlocks):
"""
Before denoise step that prepares the inputs (timesteps, latents, rope inputs, etc.) for the denoise step for the
img2img task.
Components:
pachifier (`QwenImagePachifier`) scheduler (`FlowMatchEulerDiscreteScheduler`) controlnet
(`QwenImageControlNetModel`) guider (`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`)
Inputs:
num_images_per_prompt (`int`, *optional*, defaults to 1):
The number of images to generate per prompt.
prompt_embeds (`Tensor`):
text embeddings used to guide the image generation. Can be generated from text_encoder step.
prompt_embeds_mask (`Tensor`):
mask for the text embeddings. Can be generated from text_encoder step.
negative_prompt_embeds (`Tensor`, *optional*):
negative text embeddings used to guide the image generation. Can be generated from text_encoder step.
negative_prompt_embeds_mask (`Tensor`, *optional*):
mask for the negative text embeddings. Can be generated from text_encoder step.
height (`int`, *optional*):
The height in pixels of the generated image.
width (`int`, *optional*):
The width in pixels of the generated image.
image_latents (`Tensor`):
image latents used to guide the image generation. Can be generated from vae_encoder step.
control_image_latents (`Tensor`):
The control image latents to use for the denoising process. Can be generated in controlnet vae encoder
step.
latents (`Tensor`, *optional*):
Pre-generated noisy latents for image generation.
generator (`Generator`, *optional*):
Torch generator for deterministic generation.
num_inference_steps (`int`, *optional*, defaults to 50):
The number of denoising steps.
sigmas (`List`, *optional*):
Custom sigmas for the denoising process.
strength (`float`, *optional*, defaults to 0.9):
Strength for img2img/inpainting.
control_guidance_start (`float`, *optional*, defaults to 0.0):
When to start applying ControlNet.
control_guidance_end (`float`, *optional*, defaults to 1.0):
When to stop applying ControlNet.
controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0):
Scale for ControlNet conditioning.
attention_kwargs (`Dict`, *optional*):
Additional kwargs for attention processors.
**denoiser_input_fields (`None`, *optional*):
conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
Outputs:
latents (`Tensor`):
Denoised latents.
"""
model_name = "qwenimage"
block_classes = [
QwenImageImg2ImgInputStep(),
@@ -942,12 +344,6 @@ class QwenImageControlNetImg2ImgCoreDenoiseStep(SequentialPipelineBlocks):
def description(self):
return "Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for img2img task."
@property
def outputs(self):
return [
OutputParam.template("latents"),
]
# Auto denoise step for QwenImage
class QwenImageAutoCoreDenoiseStep(ConditionalPipelineBlocks):
@@ -1006,36 +402,19 @@ class QwenImageAutoCoreDenoiseStep(ConditionalPipelineBlocks):
@property
def outputs(self):
return [
OutputParam.template("latents"),
OutputParam(
name="latents", type_hint=torch.Tensor, description="The latents generated by the denoising step"
),
]
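# For context on the recurring `OutputParam.template(...)` vs. explicit `OutputParam(...)` swaps in this
# diff: both appear to declare the same output spec, with the template form presumably resolving
# name/type/description from a shared registry of common parameters (that registry is an assumption,
# not shown in this hunk). Illustration only:

import torch
from diffusers.modular_pipelines.modular_pipeline_utils import OutputParam  # assumed import path

# Explicit form, as kept on this branch:
latents_out = OutputParam(
    name="latents", type_hint=torch.Tensor, description="The latents generated by the denoising step"
)
# Template form on the other side of the diff (assumed equivalent):
# latents_out = OutputParam.template("latents")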
# ====================
# 4. DECODE
# 3. DECODE
# ====================
# standard decode step works for most tasks except for inpaint
# auto_docstring
class QwenImageDecodeStep(SequentialPipelineBlocks):
"""
Decode step that decodes the latents to images and postprocesses the generated image.
Components:
vae (`AutoencoderKLQwenImage`) image_processor (`VaeImageProcessor`)
Inputs:
latents (`Tensor`):
The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise
step.
output_type (`str`, *optional*, defaults to pil):
Output format: 'pil', 'np', 'pt'.
Outputs:
images (`List`):
Generated images. (tensor output of the vae decoder.)
"""
model_name = "qwenimage"
block_classes = [QwenImageDecoderStep(), QwenImageProcessImagesOutputStep()]
block_names = ["decode", "postprocess"]
@@ -1046,30 +425,7 @@ class QwenImageDecodeStep(SequentialPipelineBlocks):
# Inpaint decode step
# auto_docstring
class QwenImageInpaintDecodeStep(SequentialPipelineBlocks):
"""
Decode step that decodes the latents to images and postprocesses the generated image, optionally applying the mask
overlay to the original image.
Components:
vae (`AutoencoderKLQwenImage`) image_mask_processor (`InpaintProcessor`)
Inputs:
latents (`Tensor`):
The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise
step.
output_type (`str`, *optional*, defaults to pil):
Output format: 'pil', 'np', 'pt'.
mask_overlay_kwargs (`Dict`, *optional*):
The kwargs for the postprocess step to apply the mask overlay. generated in
InpaintProcessImagesInputStep.
Outputs:
images (`List`):
Generated images. (tensor output of the vae decoder.)
"""
model_name = "qwenimage"
block_classes = [QwenImageDecoderStep(), QwenImageInpaintProcessImagesOutputStep()]
block_names = ["decode", "postprocess"]
@@ -1096,11 +452,11 @@ class QwenImageAutoDecodeStep(AutoPipelineBlocks):
# ====================
# 5. AUTO BLOCKS & PRESETS
# 4. AUTO BLOCKS & PRESETS
# ====================
AUTO_BLOCKS = InsertableDict(
[
("text_encoder", QwenImageAutoTextEncoderStep()),
("text_encoder", QwenImageTextEncoderStep()),
("vae_encoder", QwenImageAutoVaeEncoderStep()),
("controlnet_vae_encoder", QwenImageOptionalControlNetVaeEncoderStep()),
("denoise", QwenImageAutoCoreDenoiseStep()),
@@ -1109,89 +465,7 @@ AUTO_BLOCKS = InsertableDict(
)
# auto_docstring
class QwenImageAutoBlocks(SequentialPipelineBlocks):
"""
Auto Modular pipeline for text-to-image, image-to-image, inpainting, and controlnet tasks using QwenImage.
- for image-to-image generation, you need to provide `image`
- for inpainting, you need to provide `mask_image` and `image`, optionally you can provide `padding_mask_crop`.
- to run the controlnet workflow, you need to provide `control_image`
- for text-to-image generation, all you need to provide is `prompt`
Components:
text_encoder (`Qwen2_5_VLForConditionalGeneration`): The text encoder to use tokenizer (`Qwen2Tokenizer`):
The tokenizer to use guider (`ClassifierFreeGuidance`) image_mask_processor (`InpaintProcessor`) vae
(`AutoencoderKLQwenImage`) image_processor (`VaeImageProcessor`) controlnet (`QwenImageControlNetModel`)
control_image_processor (`VaeImageProcessor`) pachifier (`QwenImagePachifier`) scheduler
(`FlowMatchEulerDiscreteScheduler`) transformer (`QwenImageTransformer2DModel`)
Inputs:
prompt (`str`, *optional*):
The prompt or prompts to guide image generation.
negative_prompt (`str`, *optional*):
The prompt or prompts not to guide the image generation.
max_sequence_length (`int`, *optional*, defaults to 1024):
Maximum sequence length for prompt encoding.
mask_image (`Image`, *optional*):
Mask image for inpainting.
image (`Union[Image, List]`, *optional*):
Reference image(s) for denoising. Can be a single image or list of images.
height (`int`, *optional*):
The height in pixels of the generated image.
width (`int`, *optional*):
The width in pixels of the generated image.
padding_mask_crop (`int`, *optional*):
Padding for mask cropping in inpainting.
generator (`Generator`, *optional*):
Torch generator for deterministic generation.
control_image (`Image`, *optional*):
Control image for ControlNet conditioning.
num_images_per_prompt (`int`, *optional*, defaults to 1):
The number of images to generate per prompt.
prompt_embeds (`Tensor`):
text embeddings used to guide the image generation. Can be generated from text_encoder step.
prompt_embeds_mask (`Tensor`):
mask for the text embeddings. Can be generated from text_encoder step.
negative_prompt_embeds (`Tensor`, *optional*):
negative text embeddings used to guide the image generation. Can be generated from text_encoder step.
negative_prompt_embeds_mask (`Tensor`, *optional*):
mask for the negative text embeddings. Can be generated from text_encoder step.
latents (`Tensor`):
Pre-generated noisy latents for image generation.
num_inference_steps (`int`):
The number of denoising steps.
sigmas (`List`, *optional*):
Custom sigmas for the denoising process.
attention_kwargs (`Dict`, *optional*):
Additional kwargs for attention processors.
**denoiser_input_fields (`None`, *optional*):
conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
image_latents (`Tensor`, *optional*):
image latents used to guide the image generation. Can be generated from vae_encoder step.
processed_mask_image (`Tensor`, *optional*):
The processed mask image
strength (`float`, *optional*, defaults to 0.9):
Strength for img2img/inpainting.
control_image_latents (`Tensor`, *optional*):
The control image latents to use for the denoising process. Can be generated in controlnet vae encoder
step.
control_guidance_start (`float`, *optional*, defaults to 0.0):
When to start applying ControlNet.
control_guidance_end (`float`, *optional*, defaults to 1.0):
When to stop applying ControlNet.
controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0):
Scale for ControlNet conditioning.
output_type (`str`, *optional*, defaults to pil):
Output format: 'pil', 'np', 'pt'.
mask_overlay_kwargs (`Dict`, *optional*):
The kwargs for the postprocess step to apply the mask overlay. generated in
InpaintProcessImagesInputStep.
Outputs:
images (`List`):
Generated images.
"""
model_name = "qwenimage"
block_classes = AUTO_BLOCKS.values()
@@ -1202,7 +476,7 @@ class QwenImageAutoBlocks(SequentialPipelineBlocks):
return (
"Auto Modular pipeline for text-to-image, image-to-image, inpainting, and controlnet tasks using QwenImage.\n"
+ "- for image-to-image generation, you need to provide `image`\n"
+ "- for inpainting, you need to provide `mask_image` and `image`, optionally you can provide `padding_mask_crop`.\n"
+ "- for inpainting, you need to provide `mask_image` and `image`, optionally you can provide `padding_mask_crop` \n"
+ "- to run the controlnet workflow, you need to provide `control_image`\n"
+ "- for text-to-image generation, all you need to provide is `prompt`"
)
@@ -1210,5 +484,5 @@ class QwenImageAutoBlocks(SequentialPipelineBlocks):
@property
def outputs(self):
return [
OutputParam.template("images"),
OutputParam(name="images", type_hint=List[List[PIL.Image.Image]]),
]
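# Since the docstring above doubles as a call signature for `QwenImageAutoBlocks`, here is a rough usage
# sketch. The import path, the `init_pipeline`/`load_default_components` method names, and the checkpoint
# id are assumptions about the modular-pipeline API and may need adjusting:

import torch
from diffusers.modular_pipelines import QwenImageAutoBlocks  # assumed import path

blocks = QwenImageAutoBlocks()
pipe = blocks.init_pipeline("Qwen/Qwen-Image")            # assumed API / repo id
pipe.load_default_components(torch_dtype=torch.bfloat16)  # assumed loader name
pipe.to("cuda")

# text-to-image: only `prompt` is required
images = pipe(prompt="a cat wearing a spacesuit", output="images")

# img2img: additionally pass `image`; inpainting: `image` + `mask_image`;
# controlnet workflow: additionally pass `control_image`.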

View File

@@ -12,13 +12,14 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Optional
from typing import List, Optional
import PIL.Image
import torch
from ...utils import logging
from ..modular_pipeline import AutoPipelineBlocks, ConditionalPipelineBlocks, SequentialPipelineBlocks
from ..modular_pipeline_utils import InputParam, InsertableDict, OutputParam
from ..modular_pipeline_utils import InsertableDict, OutputParam
from .before_denoise import (
QwenImageCreateMaskLatentsStep,
QwenImageEditRoPEInputsStep,
@@ -58,35 +59,8 @@ logger = logging.get_logger(__name__)
# ====================
# auto_docstring
class QwenImageEditVLEncoderStep(SequentialPipelineBlocks):
"""
QwenImage-Edit VL encoder step that encodes the image and text prompts together.
Components:
image_resize_processor (`VaeImageProcessor`) text_encoder (`Qwen2_5_VLForConditionalGeneration`) processor
(`Qwen2VLProcessor`) guider (`ClassifierFreeGuidance`)
Inputs:
image (`Union[Image, List]`):
Reference image(s) for denoising. Can be a single image or list of images.
prompt (`str`):
The prompt or prompts to guide image generation.
negative_prompt (`str`, *optional*):
The prompt or prompts not to guide the image generation.
Outputs:
resized_image (`List`):
The resized images
prompt_embeds (`Tensor`):
The prompt embeddings.
prompt_embeds_mask (`Tensor`):
The encoder attention mask.
negative_prompt_embeds (`Tensor`):
The negative prompt embeddings.
negative_prompt_embeds_mask (`Tensor`):
The negative prompt embeddings mask.
"""
"""VL encoder that takes both image and text prompts."""
model_name = "qwenimage-edit"
block_classes = [
@@ -106,30 +80,7 @@ class QwenImageEditVLEncoderStep(SequentialPipelineBlocks):
# Edit VAE encoder
# auto_docstring
class QwenImageEditVaeEncoderStep(SequentialPipelineBlocks):
"""
VAE encoder step that encodes the image inputs into their latent representations.
Components:
image_resize_processor (`VaeImageProcessor`) image_processor (`VaeImageProcessor`) vae
(`AutoencoderKLQwenImage`)
Inputs:
image (`Union[Image, List]`):
Reference image(s) for denoising. Can be a single image or list of images.
generator (`Generator`, *optional*):
Torch generator for deterministic generation.
Outputs:
resized_image (`List`):
The resized images
processed_image (`Tensor`):
The processed image
image_latents (`Tensor`):
The latent representation of the input image.
"""
model_name = "qwenimage-edit"
block_classes = [
QwenImageEditResizeStep(),
@@ -144,46 +95,12 @@ class QwenImageEditVaeEncoderStep(SequentialPipelineBlocks):
# Edit Inpaint VAE encoder
# auto_docstring
class QwenImageEditInpaintVaeEncoderStep(SequentialPipelineBlocks):
"""
This step is used for processing image and mask inputs for QwenImage-Edit inpaint tasks. It:
- resizes the image to the target area (1024 * 1024) while maintaining the aspect ratio.
- processes the resized image and mask image.
- creates the image latents.
Components:
image_resize_processor (`VaeImageProcessor`) image_mask_processor (`InpaintProcessor`) vae
(`AutoencoderKLQwenImage`)
Inputs:
image (`Union[Image, List]`):
Reference image(s) for denoising. Can be a single image or list of images.
mask_image (`Image`):
Mask image for inpainting.
padding_mask_crop (`int`, *optional*):
Padding for mask cropping in inpainting.
generator (`Generator`, *optional*):
Torch generator for deterministic generation.
Outputs:
resized_image (`List`):
The resized images
processed_image (`Tensor`):
The processed image
processed_mask_image (`Tensor`):
The processed mask image
mask_overlay_kwargs (`Dict`):
The kwargs for the postprocess step to apply the mask overlay
image_latents (`Tensor`):
The latent representation of the input image.
"""
model_name = "qwenimage-edit"
block_classes = [
QwenImageEditResizeStep(),
QwenImageEditInpaintProcessImagesInputStep(),
QwenImageVaeEncoderStep(),
QwenImageVaeEncoderStep(input_name="processed_image", output_name="image_latents"),
]
block_names = ["resize", "preprocess", "encode"]
@@ -220,64 +137,11 @@ class QwenImageEditAutoVaeEncoderStep(AutoPipelineBlocks):
# assemble input steps
# auto_docstring
class QwenImageEditInputStep(SequentialPipelineBlocks):
"""
Input step that prepares the inputs for the edit denoising step. It:
- makes sure the text embeddings, as well as the additional inputs, have a consistent batch size.
- updates height/width based on `image_latents` and patchifies `image_latents`.
Components:
pachifier (`QwenImagePachifier`)
Inputs:
num_images_per_prompt (`int`, *optional*, defaults to 1):
The number of images to generate per prompt.
prompt_embeds (`Tensor`):
text embeddings used to guide the image generation. Can be generated from text_encoder step.
prompt_embeds_mask (`Tensor`):
mask for the text embeddings. Can be generated from text_encoder step.
negative_prompt_embeds (`Tensor`, *optional*):
negative text embeddings used to guide the image generation. Can be generated from text_encoder step.
negative_prompt_embeds_mask (`Tensor`, *optional*):
mask for the negative text embeddings. Can be generated from text_encoder step.
height (`int`, *optional*):
The height in pixels of the generated image.
width (`int`, *optional*):
The width in pixels of the generated image.
image_latents (`Tensor`):
image latents used to guide the image generation. Can be generated from vae_encoder step.
Outputs:
batch_size (`int`):
The batch size of the prompt embeddings
dtype (`dtype`):
The data type of the prompt embeddings
prompt_embeds (`Tensor`):
The prompt embeddings. (batch-expanded)
prompt_embeds_mask (`Tensor`):
The encoder attention mask. (batch-expanded)
negative_prompt_embeds (`Tensor`):
The negative prompt embeddings. (batch-expanded)
negative_prompt_embeds_mask (`Tensor`):
The negative prompt embeddings mask. (batch-expanded)
image_height (`int`):
The image height calculated from the image latents dimension
image_width (`int`):
The image width calculated from the image latents dimension
height (`int`):
if not provided, updated to image height
width (`int`):
if not provided, updated to image width
image_latents (`Tensor`):
image latents used to guide the image generation. Can be generated from vae_encoder step. (patchified and
batch-expanded)
"""
model_name = "qwenimage-edit"
block_classes = [
QwenImageTextInputsStep(),
QwenImageAdditionalInputsStep(),
QwenImageAdditionalInputsStep(image_latent_inputs=["image_latents"]),
]
block_names = ["text_inputs", "additional_inputs"]
@@ -290,71 +154,12 @@ class QwenImageEditInputStep(SequentialPipelineBlocks):
)
# auto_docstring
class QwenImageEditInpaintInputStep(SequentialPipelineBlocks):
"""
Input step that prepares the inputs for the edit inpaint denoising step. It:
- makes sure the text embeddings, as well as the additional inputs, have a consistent batch size.
- updates height/width based on `image_latents` and patchifies `image_latents`.
Components:
pachifier (`QwenImagePachifier`)
Inputs:
num_images_per_prompt (`int`, *optional*, defaults to 1):
The number of images to generate per prompt.
prompt_embeds (`Tensor`):
text embeddings used to guide the image generation. Can be generated from text_encoder step.
prompt_embeds_mask (`Tensor`):
mask for the text embeddings. Can be generated from text_encoder step.
negative_prompt_embeds (`Tensor`, *optional*):
negative text embeddings used to guide the image generation. Can be generated from text_encoder step.
negative_prompt_embeds_mask (`Tensor`, *optional*):
mask for the negative text embeddings. Can be generated from text_encoder step.
height (`int`, *optional*):
The height in pixels of the generated image.
width (`int`, *optional*):
The width in pixels of the generated image.
image_latents (`Tensor`):
image latents used to guide the image generation. Can be generated from vae_encoder step.
processed_mask_image (`Tensor`, *optional*):
The processed mask image
Outputs:
batch_size (`int`):
The batch size of the prompt embeddings
dtype (`dtype`):
The data type of the prompt embeddings
prompt_embeds (`Tensor`):
The prompt embeddings. (batch-expanded)
prompt_embeds_mask (`Tensor`):
The encoder attention mask. (batch-expanded)
negative_prompt_embeds (`Tensor`):
The negative prompt embeddings. (batch-expanded)
negative_prompt_embeds_mask (`Tensor`):
The negative prompt embeddings mask. (batch-expanded)
image_height (`int`):
The image height calculated from the image latents dimension
image_width (`int`):
The image width calculated from the image latents dimension
height (`int`):
if not provided, updated to image height
width (`int`):
if not provided, updated to image width
image_latents (`Tensor`):
image latents used to guide the image generation. Can be generated from vae_encoder step. (patchified and
batch-expanded)
processed_mask_image (`Tensor`):
The processed mask image (batch-expanded)
"""
model_name = "qwenimage-edit"
block_classes = [
QwenImageTextInputsStep(),
QwenImageAdditionalInputsStep(
additional_batch_inputs=[
InputParam(name="processed_mask_image", type_hint=torch.Tensor, description="The processed mask image")
]
image_latent_inputs=["image_latents"], additional_batch_inputs=["processed_mask_image"]
),
]
block_names = ["text_inputs", "additional_inputs"]
@@ -369,42 +174,7 @@ class QwenImageEditInpaintInputStep(SequentialPipelineBlocks):
# assemble prepare latents steps
# auto_docstring
class QwenImageEditInpaintPrepareLatentsStep(SequentialPipelineBlocks):
"""
This step prepares the latents/image_latents and mask inputs for the edit inpainting denoising step. It:
- Adds noise to the image latents to create the latents input for the denoiser.
- Creates the patchified latents `mask` based on the processed mask image.
Components:
scheduler (`FlowMatchEulerDiscreteScheduler`) pachifier (`QwenImagePachifier`)
Inputs:
latents (`Tensor`):
The initial random noised, can be generated in prepare latent step.
image_latents (`Tensor`):
image latents used to guide the image generation. Can be generated from vae_encoder step. (Can be
generated from vae encoder and updated in input step.)
timesteps (`Tensor`):
The timesteps to use for the denoising process. Can be generated in set_timesteps step.
processed_mask_image (`Tensor`):
The processed mask to use for the inpainting process.
height (`int`):
The height in pixels of the generated image.
width (`int`):
The width in pixels of the generated image.
dtype (`dtype`, *optional*, defaults to torch.float32):
The dtype of the model inputs, can be generated in input step.
Outputs:
initial_noise (`Tensor`):
The initial random noised used for inpainting denoising.
latents (`Tensor`):
The scaled noisy latents to use for inpainting/image-to-image denoising.
mask (`Tensor`):
The mask to use for the inpainting process.
"""
model_name = "qwenimage-edit"
block_classes = [QwenImagePrepareLatentsWithStrengthStep(), QwenImageCreateMaskLatentsStep()]
block_names = ["add_noise_to_latents", "create_mask_latents"]
@@ -419,50 +189,7 @@ class QwenImageEditInpaintPrepareLatentsStep(SequentialPipelineBlocks):
# Qwen Image Edit (image2image) core denoise step
# auto_docstring
class QwenImageEditCoreDenoiseStep(SequentialPipelineBlocks):
"""
Core denoising workflow for QwenImage-Edit edit (img2img) task.
Components:
pachifier (`QwenImagePachifier`) scheduler (`FlowMatchEulerDiscreteScheduler`) guider
(`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`)
Inputs:
num_images_per_prompt (`int`, *optional*, defaults to 1):
The number of images to generate per prompt.
prompt_embeds (`Tensor`):
text embeddings used to guide the image generation. Can be generated from text_encoder step.
prompt_embeds_mask (`Tensor`):
mask for the text embeddings. Can be generated from text_encoder step.
negative_prompt_embeds (`Tensor`, *optional*):
negative text embeddings used to guide the image generation. Can be generated from text_encoder step.
negative_prompt_embeds_mask (`Tensor`, *optional*):
mask for the negative text embeddings. Can be generated from text_encoder step.
height (`int`, *optional*):
The height in pixels of the generated image.
width (`int`, *optional*):
The width in pixels of the generated image.
image_latents (`Tensor`):
image latents used to guide the image generation. Can be generated from vae_encoder step.
latents (`Tensor`, *optional*):
Pre-generated noisy latents for image generation.
generator (`Generator`, *optional*):
Torch generator for deterministic generation.
num_inference_steps (`int`, *optional*, defaults to 50):
The number of denoising steps.
sigmas (`List`, *optional*):
Custom sigmas for the denoising process.
attention_kwargs (`Dict`, *optional*):
Additional kwargs for attention processors.
**denoiser_input_fields (`None`, *optional*):
conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
Outputs:
latents (`Tensor`):
Denoised latents.
"""
model_name = "qwenimage-edit"
block_classes = [
QwenImageEditInputStep(),
@@ -485,62 +212,9 @@ class QwenImageEditCoreDenoiseStep(SequentialPipelineBlocks):
def description(self):
return "Core denoising workflow for QwenImage-Edit edit (img2img) task."
@property
def outputs(self):
return [
OutputParam.template("latents"),
]
# Qwen Image Edit (inpainting) core denoise step
# auto_docstring
class QwenImageEditInpaintCoreDenoiseStep(SequentialPipelineBlocks):
"""
Core denoising workflow for QwenImage-Edit edit inpaint task.
Components:
pachifier (`QwenImagePachifier`) scheduler (`FlowMatchEulerDiscreteScheduler`) guider
(`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`)
Inputs:
num_images_per_prompt (`int`, *optional*, defaults to 1):
The number of images to generate per prompt.
prompt_embeds (`Tensor`):
text embeddings used to guide the image generation. Can be generated from text_encoder step.
prompt_embeds_mask (`Tensor`):
mask for the text embeddings. Can be generated from text_encoder step.
negative_prompt_embeds (`Tensor`, *optional*):
negative text embeddings used to guide the image generation. Can be generated from text_encoder step.
negative_prompt_embeds_mask (`Tensor`, *optional*):
mask for the negative text embeddings. Can be generated from text_encoder step.
height (`int`, *optional*):
The height in pixels of the generated image.
width (`int`, *optional*):
The width in pixels of the generated image.
image_latents (`Tensor`):
image latents used to guide the image generation. Can be generated from vae_encoder step.
processed_mask_image (`Tensor`, *optional*):
The processed mask image
latents (`Tensor`, *optional*):
Pre-generated noisy latents for image generation.
generator (`Generator`, *optional*):
Torch generator for deterministic generation.
num_inference_steps (`int`, *optional*, defaults to 50):
The number of denoising steps.
sigmas (`List`, *optional*):
Custom sigmas for the denoising process.
strength (`float`, *optional*, defaults to 0.9):
Strength for img2img/inpainting.
attention_kwargs (`Dict`, *optional*):
Additional kwargs for attention processors.
**denoiser_input_fields (`None`, *optional*):
conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
Outputs:
latents (`Tensor`):
Denoised latents.
"""
model_name = "qwenimage-edit"
block_classes = [
QwenImageEditInpaintInputStep(),
@@ -565,12 +239,6 @@ class QwenImageEditInpaintCoreDenoiseStep(SequentialPipelineBlocks):
def description(self):
return "Core denoising workflow for QwenImage-Edit edit inpaint task."
@property
def outputs(self):
return [
OutputParam.template("latents"),
]
# Auto core denoise step for QwenImage Edit
class QwenImageEditAutoCoreDenoiseStep(ConditionalPipelineBlocks):
@@ -599,12 +267,6 @@ class QwenImageEditAutoCoreDenoiseStep(ConditionalPipelineBlocks):
"Supports edit (img2img) and edit inpainting tasks for QwenImage-Edit."
)
@property
def outputs(self):
return [
OutputParam.template("latents"),
]
# ====================
# 4. DECODE
@@ -612,26 +274,7 @@ class QwenImageEditAutoCoreDenoiseStep(ConditionalPipelineBlocks):
# Decode step (standard)
# auto_docstring
class QwenImageEditDecodeStep(SequentialPipelineBlocks):
"""
Decode step that decodes the latents to images and postprocesses the generated image.
Components:
vae (`AutoencoderKLQwenImage`) image_processor (`VaeImageProcessor`)
Inputs:
latents (`Tensor`):
The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise
step.
output_type (`str`, *optional*, defaults to pil):
Output format: 'pil', 'np', 'pt'.
Outputs:
images (`List`):
Generated images. (tensor output of the vae decoder.)
"""
model_name = "qwenimage-edit"
block_classes = [QwenImageDecoderStep(), QwenImageProcessImagesOutputStep()]
block_names = ["decode", "postprocess"]
@@ -642,30 +285,7 @@ class QwenImageEditDecodeStep(SequentialPipelineBlocks):
# Inpaint decode step
# auto_docstring
class QwenImageEditInpaintDecodeStep(SequentialPipelineBlocks):
"""
Decode step that decodes the latents to images and postprocesses the generated image, optionally applying the mask
overlay to the original image.
Components:
vae (`AutoencoderKLQwenImage`) image_mask_processor (`InpaintProcessor`)
Inputs:
latents (`Tensor`):
The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise
step.
output_type (`str`, *optional*, defaults to pil):
Output format: 'pil', 'np', 'pt'.
mask_overlay_kwargs (`Dict`, *optional*):
The kwargs for the postprocess step to apply the mask overlay. generated in
InpaintProcessImagesInputStep.
Outputs:
images (`List`):
Generated images. (tensor output of the vae decoder.)
"""
model_name = "qwenimage-edit"
block_classes = [QwenImageDecoderStep(), QwenImageInpaintProcessImagesOutputStep()]
block_names = ["decode", "postprocess"]
@@ -693,7 +313,9 @@ class QwenImageEditAutoDecodeStep(AutoPipelineBlocks):
@property
def outputs(self):
return [
OutputParam.template("latents"),
OutputParam(
name="latents", type_hint=torch.Tensor, description="The latents generated by the denoising step"
),
]
@@ -711,66 +333,7 @@ EDIT_AUTO_BLOCKS = InsertableDict(
)
# auto_docstring
class QwenImageEditAutoBlocks(SequentialPipelineBlocks):
"""
Auto Modular pipeline for edit (img2img) and edit inpaint tasks using QwenImage-Edit.
- for edit (img2img) generation, you need to provide `image`
- for edit inpainting, you need to provide `mask_image` and `image`, optionally you can provide
`padding_mask_crop`
Components:
image_resize_processor (`VaeImageProcessor`) text_encoder (`Qwen2_5_VLForConditionalGeneration`) processor
(`Qwen2VLProcessor`) guider (`ClassifierFreeGuidance`) image_mask_processor (`InpaintProcessor`) vae
(`AutoencoderKLQwenImage`) image_processor (`VaeImageProcessor`) pachifier (`QwenImagePachifier`) scheduler
(`FlowMatchEulerDiscreteScheduler`) transformer (`QwenImageTransformer2DModel`)
Inputs:
image (`Union[Image, List]`):
Reference image(s) for denoising. Can be a single image or list of images.
prompt (`str`):
The prompt or prompts to guide image generation.
negative_prompt (`str`, *optional*):
The prompt or prompts not to guide the image generation.
mask_image (`Image`, *optional*):
Mask image for inpainting.
padding_mask_crop (`int`, *optional*):
Padding for mask cropping in inpainting.
generator (`Generator`, *optional*):
Torch generator for deterministic generation.
num_images_per_prompt (`int`, *optional*, defaults to 1):
The number of images to generate per prompt.
height (`int`):
The height in pixels of the generated image.
width (`int`):
The width in pixels of the generated image.
image_latents (`Tensor`):
image latents used to guide the image generation. Can be generated from vae_encoder step.
processed_mask_image (`Tensor`, *optional*):
The processed mask image
latents (`Tensor`):
Pre-generated noisy latents for image generation.
num_inference_steps (`int`):
The number of denoising steps.
sigmas (`List`, *optional*):
Custom sigmas for the denoising process.
strength (`float`, *optional*, defaults to 0.9):
Strength for img2img/inpainting.
attention_kwargs (`Dict`, *optional*):
Additional kwargs for attention processors.
**denoiser_input_fields (`None`, *optional*):
conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
output_type (`str`, *optional*, defaults to pil):
Output format: 'pil', 'np', 'pt'.
mask_overlay_kwargs (`Dict`, *optional*):
The kwargs for the postprocess step to apply the mask overlay. generated in
InpaintProcessImagesInputStep.
Outputs:
images (`List`):
Generated images.
"""
model_name = "qwenimage-edit"
block_classes = EDIT_AUTO_BLOCKS.values()
block_names = EDIT_AUTO_BLOCKS.keys()
@@ -786,5 +349,5 @@ class QwenImageEditAutoBlocks(SequentialPipelineBlocks):
@property
def outputs(self):
return [
OutputParam.template("images"),
OutputParam(name="images", type_hint=List[List[PIL.Image.Image]], description="The generated images"),
]

View File

@@ -12,6 +12,11 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List
import PIL.Image
import torch
from ...utils import logging
from ..modular_pipeline import SequentialPipelineBlocks
from ..modular_pipeline_utils import InsertableDict, OutputParam
@@ -48,41 +53,12 @@ logger = logging.get_logger(__name__)
# ====================
# auto_docstring
class QwenImageEditPlusVLEncoderStep(SequentialPipelineBlocks):
"""
QwenImage-Edit Plus VL encoder step that encodes the image and text prompts together.
Components:
image_resize_processor (`VaeImageProcessor`) text_encoder (`Qwen2_5_VLForConditionalGeneration`) processor
(`Qwen2VLProcessor`) guider (`ClassifierFreeGuidance`)
Inputs:
image (`Union[Image, List]`):
Reference image(s) for denoising. Can be a single image or list of images.
prompt (`str`):
The prompt or prompts to guide image generation.
negative_prompt (`str`, *optional*):
The prompt or prompts not to guide the image generation.
Outputs:
resized_image (`List`):
Images resized to 1024x1024 target area for VAE encoding
resized_cond_image (`List`):
Images resized to 384x384 target area for VL text encoding
prompt_embeds (`Tensor`):
The prompt embeddings.
prompt_embeds_mask (`Tensor`):
The encoder attention mask.
negative_prompt_embeds (`Tensor`):
The negative prompt embeddings.
negative_prompt_embeds_mask (`Tensor`):
The negative prompt embeddings mask.
"""
"""VL encoder that takes both image and text prompts. Uses 384x384 target area."""
model_name = "qwenimage-edit-plus"
block_classes = [
QwenImageEditPlusResizeStep(),
QwenImageEditPlusResizeStep(target_area=384 * 384, output_name="resized_cond_image"),
QwenImageEditPlusTextEncoderStep(),
]
block_names = ["resize", "encode"]
@@ -97,36 +73,12 @@ class QwenImageEditPlusVLEncoderStep(SequentialPipelineBlocks):
# ====================
# auto_docstring
class QwenImageEditPlusVaeEncoderStep(SequentialPipelineBlocks):
"""
VAE encoder step that encodes image inputs into latent representations.
Each image is resized independently based on its own aspect ratio to 1024x1024 target area.
Components:
image_resize_processor (`VaeImageProcessor`) image_processor (`VaeImageProcessor`) vae
(`AutoencoderKLQwenImage`)
Inputs:
image (`Union[Image, List]`):
Reference image(s) for denoising. Can be a single image or list of images.
generator (`Generator`, *optional*):
Torch generator for deterministic generation.
Outputs:
resized_image (`List`):
Images resized to 1024x1024 target area for VAE encoding
resized_cond_image (`List`):
Images resized to 384x384 target area for VL text encoding
processed_image (`Tensor`):
The processed image
image_latents (`Tensor`):
The latent representation of the input image.
"""
"""VAE encoder that handles multiple images with different sizes. Uses 1024x1024 target area."""
model_name = "qwenimage-edit-plus"
block_classes = [
QwenImageEditPlusResizeStep(),
QwenImageEditPlusResizeStep(target_area=1024 * 1024, output_name="resized_image"),
QwenImageEditPlusProcessImagesInputStep(),
QwenImageVaeEncoderStep(),
]
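# The two resize steps differ only in target area (1024x1024 for the VAE path, 384x384 for the VL/text-encoder
# path); each image is scaled to roughly that pixel count while keeping its own aspect ratio. The arithmetic,
# sketched below (the rounding multiple is a guess, not taken from the implementation):

import math

def resize_to_target_area(width, height, target_area, multiple=32):
    # Scale so width * height is approximately target_area, preserving aspect ratio.
    scale = math.sqrt(target_area / (width * height))
    new_w = round(width * scale / multiple) * multiple
    new_h = round(height * scale / multiple) * multiple
    return new_w, new_h

print(resize_to_target_area(1536, 1024, 1024 * 1024))  # (1248, 832) for the VAE encoding path
print(resize_to_target_area(1536, 1024, 384 * 384))    # (480, 320) for the VL encoder path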
@@ -146,66 +98,11 @@ class QwenImageEditPlusVaeEncoderStep(SequentialPipelineBlocks):
# assemble input steps
# auto_docstring
class QwenImageEditPlusInputStep(SequentialPipelineBlocks):
"""
Input step that prepares the inputs for the Edit Plus denoising step. It:
- Standardizes text embeddings batch size.
- Processes the list of image latents: patchifies, concatenates along dim=1, and expands the batch.
- Outputs lists of image_height/image_width for RoPE calculation.
- Defaults height/width from the last image in the list.
Components:
pachifier (`QwenImagePachifier`)
Inputs:
num_images_per_prompt (`int`, *optional*, defaults to 1):
The number of images to generate per prompt.
prompt_embeds (`Tensor`):
text embeddings used to guide the image generation. Can be generated from text_encoder step.
prompt_embeds_mask (`Tensor`):
mask for the text embeddings. Can be generated from text_encoder step.
negative_prompt_embeds (`Tensor`, *optional*):
negative text embeddings used to guide the image generation. Can be generated from text_encoder step.
negative_prompt_embeds_mask (`Tensor`, *optional*):
mask for the negative text embeddings. Can be generated from text_encoder step.
height (`int`, *optional*):
The height in pixels of the generated image.
width (`int`, *optional*):
The width in pixels of the generated image.
image_latents (`Tensor`):
image latents used to guide the image generation. Can be generated from vae_encoder step.
Outputs:
batch_size (`int`):
The batch size of the prompt embeddings
dtype (`dtype`):
The data type of the prompt embeddings
prompt_embeds (`Tensor`):
The prompt embeddings. (batch-expanded)
prompt_embeds_mask (`Tensor`):
The encoder attention mask. (batch-expanded)
negative_prompt_embeds (`Tensor`):
The negative prompt embeddings. (batch-expanded)
negative_prompt_embeds_mask (`Tensor`):
The negative prompt embeddings mask. (batch-expanded)
image_height (`List`):
The image heights calculated from the image latents dimension
image_width (`List`):
The image widths calculated from the image latents dimension
height (`int`):
if not provided, updated to image height
width (`int`):
if not provided, updated to image width
image_latents (`Tensor`):
image latents used to guide the image generation. Can be generated from vae_encoder step. (patchified,
concatenated, and batch-expanded)
"""
model_name = "qwenimage-edit-plus"
block_classes = [
QwenImageTextInputsStep(),
QwenImageEditPlusAdditionalInputsStep(),
QwenImageEditPlusAdditionalInputsStep(image_latent_inputs=["image_latents"]),
]
block_names = ["text_inputs", "additional_inputs"]
@@ -221,50 +118,7 @@ class QwenImageEditPlusInputStep(SequentialPipelineBlocks):
# Qwen Image Edit Plus (image2image) core denoise step
# auto_docstring
class QwenImageEditPlusCoreDenoiseStep(SequentialPipelineBlocks):
"""
Core denoising workflow for QwenImage-Edit Plus edit (img2img) task.
Components:
pachifier (`QwenImagePachifier`) scheduler (`FlowMatchEulerDiscreteScheduler`) guider
(`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`)
Inputs:
num_images_per_prompt (`int`, *optional*, defaults to 1):
The number of images to generate per prompt.
prompt_embeds (`Tensor`):
text embeddings used to guide the image generation. Can be generated from text_encoder step.
prompt_embeds_mask (`Tensor`):
mask for the text embeddings. Can be generated from text_encoder step.
negative_prompt_embeds (`Tensor`, *optional*):
negative text embeddings used to guide the image generation. Can be generated from text_encoder step.
negative_prompt_embeds_mask (`Tensor`, *optional*):
mask for the negative text embeddings. Can be generated from text_encoder step.
height (`int`, *optional*):
The height in pixels of the generated image.
width (`int`, *optional*):
The width in pixels of the generated image.
image_latents (`Tensor`):
image latents used to guide the image generation. Can be generated from vae_encoder step.
latents (`Tensor`, *optional*):
Pre-generated noisy latents for image generation.
generator (`Generator`, *optional*):
Torch generator for deterministic generation.
num_inference_steps (`int`, *optional*, defaults to 50):
The number of denoising steps.
sigmas (`List`, *optional*):
Custom sigmas for the denoising process.
attention_kwargs (`Dict`, *optional*):
Additional kwargs for attention processors.
**denoiser_input_fields (`None`, *optional*):
conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
Outputs:
latents (`Tensor`):
Denoised latents.
"""
model_name = "qwenimage-edit-plus"
block_classes = [
QwenImageEditPlusInputStep(),
@@ -290,7 +144,9 @@ class QwenImageEditPlusCoreDenoiseStep(SequentialPipelineBlocks):
@property
def outputs(self):
return [
OutputParam.template("latents"),
OutputParam(
name="latents", type_hint=torch.Tensor, description="The latents generated by the denoising step"
),
]
@@ -299,26 +155,7 @@ class QwenImageEditPlusCoreDenoiseStep(SequentialPipelineBlocks):
# ====================
# auto_docstring
class QwenImageEditPlusDecodeStep(SequentialPipelineBlocks):
"""
Decode step that decodes the latents to images and postprocesses the generated image.
Components:
vae (`AutoencoderKLQwenImage`) image_processor (`VaeImageProcessor`)
Inputs:
latents (`Tensor`):
The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise
step.
output_type (`str`, *optional*, defaults to pil):
Output format: 'pil', 'np', 'pt'.
Outputs:
images (`List`):
Generated images. (tensor output of the vae decoder.)
"""
model_name = "qwenimage-edit-plus"
block_classes = [QwenImageDecoderStep(), QwenImageProcessImagesOutputStep()]
block_names = ["decode", "postprocess"]
@@ -342,53 +179,7 @@ EDIT_PLUS_AUTO_BLOCKS = InsertableDict(
)
# auto_docstring
class QwenImageEditPlusAutoBlocks(SequentialPipelineBlocks):
"""
Auto Modular pipeline for edit (img2img) tasks using QwenImage-Edit Plus.
- `image` is required input (can be single image or list of images).
- Each image is resized independently based on its own aspect ratio.
- VL encoder uses 384x384 target area, VAE encoder uses 1024x1024 target area.
Components:
image_resize_processor (`VaeImageProcessor`) text_encoder (`Qwen2_5_VLForConditionalGeneration`) processor
(`Qwen2VLProcessor`) guider (`ClassifierFreeGuidance`) image_processor (`VaeImageProcessor`) vae
(`AutoencoderKLQwenImage`) pachifier (`QwenImagePachifier`) scheduler (`FlowMatchEulerDiscreteScheduler`)
transformer (`QwenImageTransformer2DModel`)
Inputs:
image (`Union[Image, List]`):
Reference image(s) for denoising. Can be a single image or list of images.
prompt (`str`):
The prompt or prompts to guide image generation.
negative_prompt (`str`, *optional*):
The prompt or prompts not to guide the image generation.
generator (`Generator`, *optional*):
Torch generator for deterministic generation.
num_images_per_prompt (`int`, *optional*, defaults to 1):
The number of images to generate per prompt.
height (`int`, *optional*):
The height in pixels of the generated image.
width (`int`, *optional*):
The width in pixels of the generated image.
latents (`Tensor`, *optional*):
Pre-generated noisy latents for image generation.
num_inference_steps (`int`, *optional*, defaults to 50):
The number of denoising steps.
sigmas (`List`, *optional*):
Custom sigmas for the denoising process.
attention_kwargs (`Dict`, *optional*):
Additional kwargs for attention processors.
**denoiser_input_fields (`None`, *optional*):
conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
output_type (`str`, *optional*, defaults to pil):
Output format: 'pil', 'np', 'pt'.
Outputs:
images (`List`):
Generated images.
"""
model_name = "qwenimage-edit-plus"
block_classes = EDIT_PLUS_AUTO_BLOCKS.values()
block_names = EDIT_PLUS_AUTO_BLOCKS.keys()
@@ -405,5 +196,5 @@ class QwenImageEditPlusAutoBlocks(SequentialPipelineBlocks):
@property
def outputs(self):
return [
OutputParam.template("images"),
OutputParam(name="images", type_hint=List[List[PIL.Image.Image]], description="The generated images"),
]

View File

@@ -12,6 +12,12 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List
import PIL.Image
import torch
from ...utils import logging
from ..modular_pipeline import SequentialPipelineBlocks
from ..modular_pipeline_utils import InsertableDict, OutputParam
@@ -49,44 +55,8 @@ logger = logging.get_logger(__name__)
# ====================
# auto_docstring
class QwenImageLayeredTextEncoderStep(SequentialPipelineBlocks):
"""
QwenImage-Layered text encoder step that encodes the text prompt and will generate a prompt from the image if none
is provided.
Components:
image_resize_processor (`VaeImageProcessor`) text_encoder (`Qwen2_5_VLForConditionalGeneration`) processor
(`Qwen2VLProcessor`) tokenizer (`Qwen2Tokenizer`): The tokenizer to use guider (`ClassifierFreeGuidance`)
Inputs:
image (`Union[Image, List]`):
Reference image(s) for denoising. Can be a single image or list of images.
resolution (`int`, *optional*, defaults to 640):
The target area to resize the image to, can be 1024 or 640
prompt (`str`, *optional*):
The prompt or prompts to guide image generation.
use_en_prompt (`bool`, *optional*, defaults to False):
Whether to use English prompt template
negative_prompt (`str`, *optional*):
The prompt or prompts not to guide the image generation.
max_sequence_length (`int`, *optional*, defaults to 1024):
Maximum sequence length for prompt encoding.
Outputs:
resized_image (`List`):
The resized images
prompt (`str`):
The prompt or prompts to guide image generation. If not provided, updated using image caption
prompt_embeds (`Tensor`):
The prompt embeddings.
prompt_embeds_mask (`Tensor`):
The encoder attention mask.
negative_prompt_embeds (`Tensor`):
The negative prompt embeddings.
negative_prompt_embeds_mask (`Tensor`):
The negative prompt embeddings mask.
"""
"""Text encoder that takes text prompt, will generate a prompt based on image if not provided."""
model_name = "qwenimage-layered"
block_classes = [
@@ -107,32 +77,7 @@ class QwenImageLayeredTextEncoderStep(SequentialPipelineBlocks):
# Edit VAE encoder
# auto_docstring
class QwenImageLayeredVaeEncoderStep(SequentialPipelineBlocks):
"""
VAE encoder step that encodes the image inputs into their latent representations.
Components:
image_resize_processor (`VaeImageProcessor`) image_processor (`VaeImageProcessor`) vae
(`AutoencoderKLQwenImage`)
Inputs:
image (`Union[Image, List]`):
Reference image(s) for denoising. Can be a single image or list of images.
resolution (`int`, *optional*, defaults to 640):
The target area to resize the image to, can be 1024 or 640
generator (`Generator`, *optional*):
Torch generator for deterministic generation.
Outputs:
resized_image (`List`):
The resized images
processed_image (`Tensor`):
The processed image
image_latents (`Tensor`):
The latent representation of the input image.
"""
model_name = "qwenimage-layered"
block_classes = [
QwenImageLayeredResizeStep(),
@@ -153,60 +98,11 @@ class QwenImageLayeredVaeEncoderStep(SequentialPipelineBlocks):
# assemble input steps
# auto_docstring
class QwenImageLayeredInputStep(SequentialPipelineBlocks):
"""
Input step that prepares the inputs for the layered denoising step. It:
- makes sure the text embeddings, as well as the additional inputs, have a consistent batch size.
- updates height/width based on `image_latents` and patchifies `image_latents`.
Components:
pachifier (`QwenImageLayeredPachifier`)
Inputs:
num_images_per_prompt (`int`, *optional*, defaults to 1):
The number of images to generate per prompt.
prompt_embeds (`Tensor`):
text embeddings used to guide the image generation. Can be generated from text_encoder step.
prompt_embeds_mask (`Tensor`):
mask for the text embeddings. Can be generated from text_encoder step.
negative_prompt_embeds (`Tensor`, *optional*):
negative text embeddings used to guide the image generation. Can be generated from text_encoder step.
negative_prompt_embeds_mask (`Tensor`, *optional*):
mask for the negative text embeddings. Can be generated from text_encoder step.
image_latents (`Tensor`):
image latents used to guide the image generation. Can be generated from vae_encoder step.
Outputs:
batch_size (`int`):
The batch size of the prompt embeddings
dtype (`dtype`):
The data type of the prompt embeddings
prompt_embeds (`Tensor`):
The prompt embeddings. (batch-expanded)
prompt_embeds_mask (`Tensor`):
The encoder attention mask. (batch-expanded)
negative_prompt_embeds (`Tensor`):
The negative prompt embeddings. (batch-expanded)
negative_prompt_embeds_mask (`Tensor`):
The negative prompt embeddings mask. (batch-expanded)
image_height (`int`):
The image height calculated from the image latents dimension
image_width (`int`):
The image width calculated from the image latents dimension
height (`int`):
if not provided, updated to image height
width (`int`):
if not provided, updated to image width
image_latents (`Tensor`):
image latents used to guide the image generation. Can be generated from vae_encoder step. (patchified
with layered pachifier and batch-expanded)
"""
model_name = "qwenimage-layered"
block_classes = [
QwenImageTextInputsStep(),
QwenImageLayeredAdditionalInputsStep(),
QwenImageLayeredAdditionalInputsStep(image_latent_inputs=["image_latents"]),
]
block_names = ["text_inputs", "additional_inputs"]
@@ -220,48 +116,7 @@ class QwenImageLayeredInputStep(SequentialPipelineBlocks):
# Qwen Image Layered (image2image) core denoise step
# auto_docstring
class QwenImageLayeredCoreDenoiseStep(SequentialPipelineBlocks):
"""
Core denoising workflow for QwenImage-Layered img2img task.
Components:
pachifier (`QwenImageLayeredPachifier`) scheduler (`FlowMatchEulerDiscreteScheduler`) guider
(`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`)
Inputs:
num_images_per_prompt (`int`, *optional*, defaults to 1):
The number of images to generate per prompt.
prompt_embeds (`Tensor`):
text embeddings used to guide the image generation. Can be generated from text_encoder step.
prompt_embeds_mask (`Tensor`):
mask for the text embeddings. Can be generated from text_encoder step.
negative_prompt_embeds (`Tensor`, *optional*):
negative text embeddings used to guide the image generation. Can be generated from text_encoder step.
negative_prompt_embeds_mask (`Tensor`, *optional*):
mask for the negative text embeddings. Can be generated from text_encoder step.
image_latents (`Tensor`):
image latents used to guide the image generation. Can be generated from vae_encoder step.
latents (`Tensor`, *optional*):
Pre-generated noisy latents for image generation.
layers (`int`, *optional*, defaults to 4):
Number of layers to extract from the image
generator (`Generator`, *optional*):
Torch generator for deterministic generation.
num_inference_steps (`int`, *optional*, defaults to 50):
The number of denoising steps.
sigmas (`List`, *optional*):
Custom sigmas for the denoising process.
attention_kwargs (`Dict`, *optional*):
Additional kwargs for attention processors.
**denoiser_input_fields (`None`, *optional*):
conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
Outputs:
latents (`Tensor`):
Denoised latents.
"""
model_name = "qwenimage-layered"
block_classes = [
QwenImageLayeredInputStep(),
@@ -287,7 +142,9 @@ class QwenImageLayeredCoreDenoiseStep(SequentialPipelineBlocks):
@property
def outputs(self):
return [
OutputParam.template("latents"),
OutputParam(
name="latents", type_hint=torch.Tensor, description="The latents generated by the denoising step"
),
]
@@ -305,54 +162,7 @@ LAYERED_AUTO_BLOCKS = InsertableDict(
)
# auto_docstring
class QwenImageLayeredAutoBlocks(SequentialPipelineBlocks):
"""
Auto Modular pipeline for layered denoising tasks using QwenImage-Layered.
Components:
image_resize_processor (`VaeImageProcessor`) text_encoder (`Qwen2_5_VLForConditionalGeneration`) processor
(`Qwen2VLProcessor`) tokenizer (`Qwen2Tokenizer`): The tokenizer to use guider (`ClassifierFreeGuidance`)
image_processor (`VaeImageProcessor`) vae (`AutoencoderKLQwenImage`) pachifier (`QwenImageLayeredPachifier`)
scheduler (`FlowMatchEulerDiscreteScheduler`) transformer (`QwenImageTransformer2DModel`)
Inputs:
image (`Union[Image, List]`):
Reference image(s) for denoising. Can be a single image or list of images.
resolution (`int`, *optional*, defaults to 640):
The target area to resize the image to, can be 1024 or 640
prompt (`str`, *optional*):
The prompt or prompts to guide image generation.
use_en_prompt (`bool`, *optional*, defaults to False):
Whether to use English prompt template
negative_prompt (`str`, *optional*):
The prompt or prompts not to guide the image generation.
max_sequence_length (`int`, *optional*, defaults to 1024):
Maximum sequence length for prompt encoding.
generator (`Generator`, *optional*):
Torch generator for deterministic generation.
num_images_per_prompt (`int`, *optional*, defaults to 1):
The number of images to generate per prompt.
latents (`Tensor`, *optional*):
Pre-generated noisy latents for image generation.
layers (`int`, *optional*, defaults to 4):
Number of layers to extract from the image
num_inference_steps (`int`, *optional*, defaults to 50):
The number of denoising steps.
sigmas (`List`, *optional*):
Custom sigmas for the denoising process.
attention_kwargs (`Dict`, *optional*):
Additional kwargs for attention processors.
**denoiser_input_fields (`None`, *optional*):
conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
output_type (`str`, *optional*, defaults to pil):
Output format: 'pil', 'np', 'pt'.
Outputs:
images (`List`):
Generated images.
"""
model_name = "qwenimage-layered"
block_classes = LAYERED_AUTO_BLOCKS.values()
block_names = LAYERED_AUTO_BLOCKS.keys()
@@ -364,5 +174,5 @@ class QwenImageLayeredAutoBlocks(SequentialPipelineBlocks):
@property
def outputs(self):
return [
OutputParam.template("images"),
OutputParam(name="images", type_hint=List[List[PIL.Image.Image]], description="The generated images"),
]
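# A rough call sketch for the layered workflow described above; the import path, repo id, and loader
# method names are assumptions about the modular-pipeline API:

import torch
from PIL import Image
from diffusers.modular_pipelines import QwenImageLayeredAutoBlocks  # assumed import path

pipe = QwenImageLayeredAutoBlocks().init_pipeline("Qwen/Qwen-Image")  # placeholder repo id
pipe.load_default_components(torch_dtype=torch.bfloat16)              # assumed loader name
pipe.to("cuda")

# `prompt` is optional: when omitted, the text-encoder step captions the image itself.
layer_images = pipe(image=Image.open("poster.png").convert("RGB"),
                    resolution=640, layers=4, output="images")
# Output is a nested list (List[List[PIL.Image.Image]]): one list of layer images per batch entry.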

View File

@@ -131,7 +131,7 @@ class ZImageLoopDenoiser(ModularPipelineBlocks):
),
InputParam(
kwargs_type="denoiser_input_fields",
description="The conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.",
description="conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.",
),
]
guider_input_names = []

View File

@@ -84,6 +84,7 @@ EXAMPLE_DOC_STRING = """
>>> from diffusers import ControlNetModel, StableDiffusionXLControlNetImg2ImgPipeline, AutoencoderKL
>>> from diffusers.utils import load_image
>>> depth_estimator = DPTForDepthEstimation.from_pretrained("Intel/dpt-hybrid-midas").to("cuda")
>>> feature_extractor = DPTImageProcessor.from_pretrained("Intel/dpt-hybrid-midas")
>>> controlnet = ControlNetModel.from_pretrained(

View File

@@ -53,6 +53,7 @@ EXAMPLE_DOC_STRING = """
>>> from transformers import AutoTokenizer, LlamaForCausalLM
>>> from diffusers import HiDreamImagePipeline
>>> tokenizer_4 = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3.1-8B-Instruct")
>>> text_encoder_4 = LlamaForCausalLM.from_pretrained(
... "meta-llama/Meta-Llama-3.1-8B-Instruct",

View File

@@ -85,6 +85,7 @@ EXAMPLE_DOC_STRING = """
>>> from diffusers import ControlNetModel, StableDiffusionXLControlNetPAGImg2ImgPipeline, AutoencoderKL
>>> from diffusers.utils import load_image
>>> depth_estimator = DPTForDepthEstimation.from_pretrained("Intel/dpt-hybrid-midas").to("cuda")
>>> feature_extractor = DPTFeatureExtractor.from_pretrained("Intel/dpt-hybrid-midas")
>>> controlnet = ControlNetModel.from_pretrained(

View File

@@ -111,7 +111,7 @@ LIBRARIES = []
for library in LOADABLE_CLASSES:
LIBRARIES.append(library)
SUPPORTED_DEVICE_MAP = ["balanced"] + [get_device()]
SUPPORTED_DEVICE_MAP = ["balanced"] + [get_device(), "cpu"]
logger = logging.get_logger(__name__)
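# Adding "cpu" (alongside "balanced" and the current accelerator type from get_device()) means a plain
# device type is now a valid device_map. Combined with the _is_pipeline_device_mapped() change below,
# this lets a pipeline be placed explicitly at load time and still use offloading afterwards. A hedged
# sketch of the intended usage:

import torch
from diffusers import DiffusionPipeline

# Place everything on CPU at load time (helpful in low-VRAM setups where quantized
# models would otherwise initialize directly on the accelerator).
pipe = DiffusionPipeline.from_pretrained(
    "Qwen/Qwen-Image", torch_dtype=torch.bfloat16, device_map="cpu"
)

# A device-type map does not count as a per-module device map, so offloading remains available.
pipe.enable_model_cpu_offload()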
@@ -467,8 +467,7 @@ class DiffusionPipeline(ConfigMixin, PushToHubMixin):
pipeline_is_sequentially_offloaded = any(
module_is_sequentially_offloaded(module) for _, module in self.components.items()
)
is_pipeline_device_mapped = self.hf_device_map is not None and len(self.hf_device_map) > 1
is_pipeline_device_mapped = self._is_pipeline_device_mapped()
if is_pipeline_device_mapped:
raise ValueError(
"It seems like you have activated a device mapping strategy on the pipeline which doesn't allow explicit device placement using `to()`. You can call `reset_device_map()` to remove the existing device map from the pipeline."
@@ -1187,7 +1186,7 @@ class DiffusionPipeline(ConfigMixin, PushToHubMixin):
"""
self._maybe_raise_error_if_group_offload_active(raise_error=True)
is_pipeline_device_mapped = self.hf_device_map is not None and len(self.hf_device_map) > 1
is_pipeline_device_mapped = self._is_pipeline_device_mapped()
if is_pipeline_device_mapped:
raise ValueError(
"It seems like you have activated a device mapping strategy on the pipeline so calling `enable_model_cpu_offload() isn't allowed. You can call `reset_device_map()` first and then call `enable_model_cpu_offload()`."
@@ -1311,7 +1310,7 @@ class DiffusionPipeline(ConfigMixin, PushToHubMixin):
raise ImportError("`enable_sequential_cpu_offload` requires `accelerate v0.14.0` or higher")
self.remove_all_hooks()
is_pipeline_device_mapped = self.hf_device_map is not None and len(self.hf_device_map) > 1
is_pipeline_device_mapped = self._is_pipeline_device_mapped()
if is_pipeline_device_mapped:
raise ValueError(
"It seems like you have activated a device mapping strategy on the pipeline so calling `enable_sequential_cpu_offload() isn't allowed. You can call `reset_device_map()` first and then call `enable_sequential_cpu_offload()`."
@@ -2200,6 +2199,21 @@ class DiffusionPipeline(ConfigMixin, PushToHubMixin):
                return True
        return False

    def _is_pipeline_device_mapped(self):
        # We support passing a device type directly, e.g., `device_map="cuda"`. This is helpful when users want to
        # pass `device_map="cpu"` while initializing a pipeline. Such an explicit declaration is desirable in
        # limited-VRAM environments because quantized models often initialize directly on the accelerator.
        device_map = self.hf_device_map
        is_device_type_map = False
        if isinstance(device_map, str):
            try:
                torch.device(device_map)
                is_device_type_map = True
            except RuntimeError:
                pass

        return not is_device_type_map and isinstance(device_map, dict) and len(device_map) > 1


class StableDiffusionMixin:
    r"""

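For reference, a standalone sketch of how the new check behaves; the free-standing function name below is assumed for illustration and is not part of this diff. A device-type string is no longer treated as a device-mapped pipeline, so `.to()` and the offloading helpers stay usable, while a multi-entry dict map still raises:

import torch

# Mirrors the logic of `_is_pipeline_device_mapped` on a plain value instead of `self.hf_device_map`.
def is_pipeline_device_mapped(hf_device_map):
    is_device_type_map = False
    if isinstance(hf_device_map, str):
        try:
            torch.device(hf_device_map)
            is_device_type_map = True
        except RuntimeError:
            pass
    return not is_device_type_map and isinstance(hf_device_map, dict) and len(hf_device_map) > 1

print(is_pipeline_device_mapped("cuda"))                          # False -> offloading still allowed
print(is_pipeline_device_mapped({"unet": 0, "text_encoder": 1}))  # True  -> offloading raises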
View File

@@ -459,6 +459,7 @@ class StableDiffusionLatentUpscalePipeline(DiffusionPipeline, StableDiffusionMix
>>> from diffusers import StableDiffusionLatentUpscalePipeline, StableDiffusionPipeline
>>> import torch
>>> pipeline = StableDiffusionPipeline.from_pretrained(
... "CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16
... )

View File

@@ -1,300 +0,0 @@
# coding=utf-8
# Copyright 2025 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Auto Docstring Generator for Modular Pipeline Blocks
This script scans Python files for classes that have `# auto_docstring` comment above them
and inserts/updates the docstring from the class's `doc` property.
Run from the root of the repo:
python utils/modular_auto_docstring.py [path] [--fix_and_overwrite]
Examples:
# Check for auto_docstring markers (will error if found without proper docstring)
python utils/modular_auto_docstring.py
# Check specific directory
python utils/modular_auto_docstring.py src/diffusers/modular_pipelines/
# Fix and overwrite the docstrings
python utils/modular_auto_docstring.py --fix_and_overwrite
Usage in code:
# auto_docstring
class QwenImageAutoVaeEncoderStep(AutoPipelineBlocks):
# docstring will be automatically inserted here
@property
def doc(self):
return "Your docstring content..."
"""
import argparse
import ast
import glob
import importlib
import os
import re
import sys
# All paths are set with the intent you should run this script from the root of the repo
DIFFUSERS_PATH = "src/diffusers"
REPO_PATH = "."
# Pattern to match the auto_docstring comment
AUTO_DOCSTRING_PATTERN = re.compile(r"^\s*#\s*auto_docstring\s*$")


def setup_diffusers_import():
    """Setup import path to use the local diffusers module."""
    src_path = os.path.join(REPO_PATH, "src")
    if src_path not in sys.path:
        sys.path.insert(0, src_path)


def get_module_from_filepath(filepath: str) -> str:
    """Convert a filepath to a module name."""
    filepath = os.path.normpath(filepath)
    if filepath.startswith("src" + os.sep):
        filepath = filepath[4:]
    if filepath.endswith(".py"):
        filepath = filepath[:-3]
    module_name = filepath.replace(os.sep, ".")
    return module_name


def load_module(filepath: str):
    """Load a module from filepath."""
    setup_diffusers_import()
    module_name = get_module_from_filepath(filepath)
    try:
        module = importlib.import_module(module_name)
        return module
    except Exception as e:
        print(f"Warning: Could not import module {module_name}: {e}")
        return None


def get_doc_from_class(module, class_name: str) -> str:
    """Get the doc property from an instantiated class."""
    if module is None:
        return None
    cls = getattr(module, class_name, None)
    if cls is None:
        return None
    try:
        instance = cls()
        if hasattr(instance, "doc"):
            return instance.doc
    except Exception as e:
        print(f"Warning: Could not instantiate {class_name}: {e}")
    return None


def find_auto_docstring_classes(filepath: str) -> list:
    """
    Find all classes in a file that have # auto_docstring comment above them.

    Returns list of (class_name, class_line_number, has_existing_docstring, docstring_end_line)
    """
    with open(filepath, "r", encoding="utf-8", newline="\n") as f:
        lines = f.readlines()

    # Parse AST to find class locations and their docstrings
    content = "".join(lines)
    try:
        tree = ast.parse(content)
    except SyntaxError as e:
        print(f"Syntax error in {filepath}: {e}")
        return []

    # Build a map of class_name -> (class_line, has_docstring, docstring_end_line)
    class_info = {}
    for node in ast.walk(tree):
        if isinstance(node, ast.ClassDef):
            has_docstring = False
            docstring_end_line = node.lineno  # default to class line
            if node.body and isinstance(node.body[0], ast.Expr):
                first_stmt = node.body[0]
                if isinstance(first_stmt.value, ast.Constant) and isinstance(first_stmt.value.value, str):
                    has_docstring = True
                    docstring_end_line = first_stmt.end_lineno or first_stmt.lineno
            class_info[node.name] = (node.lineno, has_docstring, docstring_end_line)

    # Now scan for # auto_docstring comments
    classes_to_update = []
    for i, line in enumerate(lines):
        if AUTO_DOCSTRING_PATTERN.match(line):
            # Found the marker, look for class definition on next non-empty, non-comment line
            j = i + 1
            while j < len(lines):
                next_line = lines[j].strip()
                if next_line and not next_line.startswith("#"):
                    break
                j += 1
            if j < len(lines) and lines[j].strip().startswith("class "):
                # Extract class name
                match = re.match(r"class\s+(\w+)", lines[j].strip())
                if match:
                    class_name = match.group(1)
                    if class_name in class_info:
                        class_line, has_docstring, docstring_end_line = class_info[class_name]
                        classes_to_update.append((class_name, class_line, has_docstring, docstring_end_line))

    return classes_to_update


def strip_class_name_line(doc: str, class_name: str) -> str:
    """Remove the 'class ClassName' line from the doc if present."""
    lines = doc.strip().split("\n")
    if lines and lines[0].strip() == f"class {class_name}":
        # Remove the class line and any blank line following it
        lines = lines[1:]
        while lines and not lines[0].strip():
            lines = lines[1:]
    return "\n".join(lines)


def format_docstring(doc: str, indent: str = "    ") -> str:
    """Format a doc string as a properly indented docstring."""
    lines = doc.strip().split("\n")
    if len(lines) == 1:
        return f'{indent}"""{lines[0]}"""\n'
    else:
        result = [f'{indent}"""\n']
        for line in lines:
            if line.strip():
                result.append(f"{indent}{line}\n")
            else:
                result.append("\n")
        result.append(f'{indent}"""\n')
        return "".join(result)


def process_file(filepath: str, overwrite: bool = False) -> list:
    """
    Process a file and find/insert docstrings for # auto_docstring marked classes.

    Returns list of classes that need updating.
    """
    classes_to_update = find_auto_docstring_classes(filepath)
    if not classes_to_update:
        return []

    if not overwrite:
        # Just return the list of classes that need updating
        return [(filepath, cls_name, line) for cls_name, line, _, _ in classes_to_update]

    # Load the module to get doc properties
    module = load_module(filepath)

    with open(filepath, "r", encoding="utf-8", newline="\n") as f:
        lines = f.readlines()

    # Process in reverse order to maintain line numbers
    updated = False
    for class_name, class_line, has_docstring, docstring_end_line in reversed(classes_to_update):
        doc = get_doc_from_class(module, class_name)
        if doc is None:
            print(f"Warning: Could not get doc for {class_name} in {filepath}")
            continue

        # Remove the "class ClassName" line since it's redundant in a docstring
        doc = strip_class_name_line(doc, class_name)

        # Format the new docstring with 4-space indent
        new_docstring = format_docstring(doc, "    ")

        if has_docstring:
            # Replace existing docstring (line after class definition to docstring_end_line)
            # class_line is 1-indexed, we want to replace from class_line+1 to docstring_end_line
            lines = lines[:class_line] + [new_docstring] + lines[docstring_end_line:]
        else:
            # Insert new docstring right after class definition line
            # class_line is 1-indexed, so lines[class_line-1] is the class line
            # Insert at position class_line (which is right after the class line)
            lines = lines[:class_line] + [new_docstring] + lines[class_line:]
        updated = True
        print(f"Updated docstring for {class_name} in {filepath}")

    if updated:
        with open(filepath, "w", encoding="utf-8", newline="\n") as f:
            f.writelines(lines)

    return [(filepath, cls_name, line) for cls_name, line, _, _ in classes_to_update]


def check_auto_docstrings(path: str = None, overwrite: bool = False):
    """
    Check all files for # auto_docstring markers and optionally fix them.
    """
    if path is None:
        path = DIFFUSERS_PATH

    if os.path.isfile(path):
        all_files = [path]
    else:
        all_files = glob.glob(os.path.join(path, "**/*.py"), recursive=True)

    all_markers = []
    for filepath in all_files:
        markers = process_file(filepath, overwrite)
        all_markers.extend(markers)

    if not overwrite and len(all_markers) > 0:
        message = "\n".join([f"- {f}: {cls} at line {line}" for f, cls, line in all_markers])
        raise ValueError(
            f"Found the following # auto_docstring markers that need docstrings:\n{message}\n\n"
            f"Run `python utils/modular_auto_docstring.py --fix_and_overwrite` to fix them."
        )
    if overwrite and len(all_markers) > 0:
        print(f"\nUpdated {len(all_markers)} docstring(s).")
    elif len(all_markers) == 0:
        print("No # auto_docstring markers found.")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Check and fix # auto_docstring markers in modular pipeline blocks",
    )
    parser.add_argument("path", nargs="?", default=None, help="File or directory to process (default: src/diffusers)")
    parser.add_argument(
        "--fix_and_overwrite",
        action="store_true",
        help="Whether to fix the docstrings by inserting them from doc property.",
    )
    args = parser.parse_args()

    check_auto_docstrings(args.path, args.fix_and_overwrite)
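As a reference for the splicing logic in process_file above, a small self-contained sketch (toy data, not from the repo) showing how the 1-indexed class_line maps onto Python's 0-indexed list slicing when a class has no docstring yet:

lines = [
    "# auto_docstring\n",     # file line 1
    "class ExampleBlock:\n",  # file line 2 -> class_line == 2
    "    pass\n",             # file line 3
]
new_docstring = '    """Example docstring."""\n'
class_line = 2
# lines[:class_line] keeps file lines 1-2 (marker + class statement); the generated
# docstring is spliced in immediately after the class definition line.
patched = lines[:class_line] + [new_docstring] + lines[class_line:]
print("".join(patched))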