Merge branch 'main' into custom-modular-tests

resolve conflicts
2025-12-22 12:24:39 +08:00 · 2025-12-15 20:27:45 +08:00 · 2025-12-08 16:29:06 +08:00 · 2025-12-02 21:29:28 +08:00 · 2025-11-11 17:54:01 +05:30 · 2025-11-11 17:52:53 +05:30
57 changed files with 2692 additions and 9359 deletions
--- a/.github/workflows/pr_modular_tests.yml
+++ b/.github/workflows/pr_modular_tests.yml
@@ -77,63 +77,47 @@ jobs:

  run_fast_tests:
    needs: [check_code_quality, check_repository_consistency]
-    strategy:
-      fail-fast: false
-      matrix:
-        config:
-          - name: Fast PyTorch Modular Pipeline CPU tests
-            framework: pytorch_pipelines
-            runner: aws-highmemory-32-plus
-            image: diffusers/diffusers-pytorch-cpu
-            report: torch_cpu_modular_pipelines
-
-    name: ${{ matrix.config.name }}
-
+    name: Fast PyTorch Modular Pipeline CPU tests
    runs-on:
-      group: ${{ matrix.config.runner }}
-
+      group: aws-highmemory-32-plus
    container:
-      image: ${{ matrix.config.image }}
+      image: diffusers/diffusers-pytorch-cpu
      options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/
-
    defaults:
      run:
        shell: bash

    steps:
-    - name: Checkout diffusers
-      uses: actions/checkout@v3
-      with:
-        fetch-depth: 2
+      - name: Checkout diffusers
+        uses: actions/checkout@v3
+        with:
+          fetch-depth: 2

-    - name: Install dependencies
-      run: |
-        uv pip install -e ".[quality]"
-        #uv pip uninstall transformers huggingface_hub && uv pip install --prerelease allow -U transformers@git+https://github.com/huggingface/transformers.git
-        uv pip uninstall transformers huggingface_hub && uv pip install transformers==4.57.1
-        uv pip uninstall accelerate && uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git --no-deps
+      - name: Install dependencies
+        run: |
+          uv pip install -e ".[quality]"
+          #uv pip uninstall transformers huggingface_hub && uv pip install --prerelease allow -U transformers@git+https://github.com/huggingface/transformers.git
+          uv pip uninstall transformers huggingface_hub && uv pip install transformers==4.57.1
+          uv pip uninstall accelerate && uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git --no-deps

-    - name: Environment
-      run: |
-        python utils/print_env.py
+      - name: Environment
+        run: |
+          python utils/print_env.py

-    - name: Run fast PyTorch Pipeline CPU tests
-      if: ${{ matrix.config.framework == 'pytorch_pipelines' }}
-      run: |
-        pytest -n 8 --max-worker-restart=0 --dist=loadfile \
-          -k "not Flax and not Onnx" \
-          --make-reports=tests_${{ matrix.config.report }} \
-          tests/modular_pipelines
-
-    - name: Failure short reports
-      if: ${{ failure() }}
-      run: cat reports/tests_${{ matrix.config.report }}_failures_short.txt
-
-    - name: Test suite reports artifacts
-      if: ${{ always() }}
-      uses: actions/upload-artifact@v4
-      with:
-        name: pr_${{ matrix.config.framework }}_${{ matrix.config.report }}_test_reports
-        path: reports
+      - name: Run fast PyTorch Pipeline CPU tests
+        run: |
+          pytest -n 8 --max-worker-restart=0 --dist=loadfile \
+            -s -v \
+            --make-reports=tests_torch_cpu_modular_pipelines \
+            tests/modular_pipelines

+      - name: Failure short reports
+        if: ${{ failure() }}
+        run: cat reports/tests_torch_cpu_modular_pipelines_failures_short.txt

+      - name: Test suite reports artifacts
+        if: ${{ always() }}
+        uses: actions/upload-artifact@v4
+        with:
+          name: pr_pytorch_pipelines_torch_cpu_modular_pipelines_test_reports
+          path: reports
--- a/docs/source/en/_toctree.yml
+++ b/docs/source/en/_toctree.yml
@@ -365,8 +365,6 @@
        title: HunyuanVideoTransformer3DModel
      - local: api/models/latte_transformer3d
        title: LatteTransformer3DModel
-      - local: api/models/longcat_image_transformer2d
-        title: LongCatImageTransformer2DModel
      - local: api/models/ltx_video_transformer3d
        title: LTXVideoTransformer3DModel
      - local: api/models/lumina2_transformer2d
@@ -404,7 +402,7 @@
      - local: api/models/wan_transformer_3d
        title: WanTransformer3DModel
      - local: api/models/z_image_transformer2d
        title: ZImageTransformer2DModel
      title: Transformers
    - sections:
      - local: api/models/stable_cascade_unet
@@ -565,8 +563,6 @@
        title: Latent Diffusion
      - local: api/pipelines/ledits_pp
        title: LEDITS++
-      - local: api/pipelines/longcat_image
-        title: LongCat-Image
      - local: api/pipelines/lumina2
        title: Lumina 2.0
      - local: api/pipelines/lumina
--- a/docs/source/en/api/models/longcat_image_transformer2d.md
+++ b/docs/source/en/api/models/longcat_image_transformer2d.md
@@ -1,25 +0,0 @@
-<!--Copyright 2025 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-->
-
-# LongCatImageTransformer2DModel
-
-The model can be loaded with the following code snippet.
-
-```python
-from diffusers import LongCatImageTransformer2DModel
-
-transformer = LongCatImageTransformer2DModel.from_pretrained("meituan-longcat/LongCat-Image ", subfolder="transformer", torch_dtype=torch.bfloat16)
-```
-
-## LongCatImageTransformer2DModel
-
-[[autodoc]] LongCatImageTransformer2DModel
--- a/docs/source/en/api/pipelines/cosmos.md
+++ b/docs/source/en/api/pipelines/cosmos.md
@@ -70,12 +70,6 @@ output.save("output.png")
  - all
  - __call__

-## Cosmos2_5_PredictBasePipeline
-
-[[autodoc]] Cosmos2_5_PredictBasePipeline
-  - all
-  - __call__
-
 ## CosmosPipelineOutput

 [[autodoc]] pipelines.cosmos.pipeline_output.CosmosPipelineOutput
--- a/docs/source/en/api/pipelines/longcat_image.md
+++ b/docs/source/en/api/pipelines/longcat_image.md
@@ -1,114 +0,0 @@
-<!--Copyright 2025 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-->
-
-# LongCat-Image
-
-<div class="flex flex-wrap space-x-1">
-  <img alt="LoRA" src="https://img.shields.io/badge/LoRA-d8b4fe?style=flat"/>
-</div>
-
-
-We introduce LongCat-Image, a pioneering open-source and bilingual (Chinese-English) foundation model for image generation, designed to address core challenges in multilingual text rendering, photorealism, deployment efficiency, and developer accessibility prevalent in current leading models.
-
-
-### Key Features
- 🌟 **Exceptional Efficiency and Performance**: With only **6B parameters**, LongCat-Image surpasses numerous open-source models that are several times larger across multiple benchmarks, demonstrating the immense potential of efficient model design.
- 🌟 **Superior Editing Performance**: LongCat-Image-Edit model achieves state-of-the-art performance among open-source models, delivering leading instruction-following and image quality with superior visual consistency.
- 🌟 **Powerful Chinese Text Rendering**: LongCat-Image demonstrates superior accuracy and stability in rendering common Chinese characters compared to existing SOTA open-source models and achieves industry-leading coverage of the Chinese dictionary.
- 🌟 **Remarkable Photorealism**: Through an innovative data strategy and training framework, LongCat-Image achieves remarkable photorealism in generated images.
- 🌟 **Comprehensive Open-Source Ecosystem**: We provide a complete toolchain, from intermediate checkpoints to full training code, significantly lowering the barrier for further research and development.
-
-For more details, please refer to the comprehensive [***LongCat-Image Technical Report***](https://arxiv.org/abs/2412.11963)
-
-
-## Usage Example
-
-```py
-import torch
-import diffusers
-from diffusers import LongCatImagePipeline
-
-weight_dtype = torch.bfloat16
-pipe = LongCatImagePipeline.from_pretrained("meituan-longcat/LongCat-Image", torch_dtype=torch.bfloat16 )
-pipe.to('cuda')
-# pipe.enable_model_cpu_offload()
-
-prompt = '一个年轻的亚裔女性，身穿黄色针织衫，搭配白色项链。她的双手放在膝盖上，表情恬静。背景是一堵粗糙的砖墙，午后的阳光温暖地洒在她身上，营造出一种宁静而温馨的氛围。镜头采用中距离视角，突出她的神态和服饰的细节。光线柔和地打在她的脸上，强调她的五官和饰品的质感，增加画面的层次感与亲和力。整个画面构图简洁，砖墙的纹理与阳光的光影效果相得益彰，突显出人物的优雅与从容。'
-image = pipe(
-    prompt,
-    height=768,
-    width=1344,
-    guidance_scale=4.0,
-    num_inference_steps=50,
-    num_images_per_prompt=1,
-    generator=torch.Generator("cpu").manual_seed(43),
-    enable_cfg_renorm=True,
-    enable_prompt_rewrite=True,
-).images[0]
-image.save(f'./longcat_image_t2i_example.png')
-```
-
-
-This pipeline was contributed by LongCat-Image Team. The original codebase can be found [here](https://github.com/meituan-longcat/LongCat-Image).
-
-Available models:
-<div style="overflow-x: auto; margin-bottom: 16px;">
-  <table style="border-collapse: collapse; width: 100%;">
-    <thead>
-      <tr>
-        <th style="white-space: nowrap; padding: 8px; border: 1px solid #d0d7de; background-color: #f6f8fa;">Models</th>
-        <th style="white-space: nowrap; padding: 8px; border: 1px solid #d0d7de; background-color: #f6f8fa;">Type</th>
-        <th style="padding: 8px; border: 1px solid #d0d7de; background-color: #f6f8fa;">Description</th>
-        <th style="padding: 8px; border: 1px solid #d0d7de; background-color: #f6f8fa;">Download Link</th>
-      </tr>
-    </thead>
-    <tbody>
-      <tr>
-        <td style="white-space: nowrap; padding: 8px; border: 1px solid #d0d7de;">LongCat&#8209;Image</td>
-        <td style="white-space: nowrap; padding: 8px; border: 1px solid #d0d7de;">Text&#8209;to&#8209;Image</td>
-        <td style="padding: 8px; border: 1px solid #d0d7de;">Final Release. The standard model for out&#8209;of&#8209;the&#8209;box inference.</td>
-        <td style="padding: 8px; border: 1px solid #d0d7de;">
-          <span style="white-space: nowrap;">🤗&nbsp;<a href="https://huggingface.co/meituan-longcat/LongCat-Image">Huggingface</a></span>
-        </td>
-      </tr>
-      <tr>
-        <td style="white-space: nowrap; padding: 8px; border: 1px solid #d0d7de;">LongCat&#8209;Image&#8209;Dev</td>
-        <td style="white-space: nowrap; padding: 8px; border: 1px solid #d0d7de;">Text&#8209;to&#8209;Image</td>
-        <td style="padding: 8px; border: 1px solid #d0d7de;">Development. Mid-training checkpoint, suitable for fine-tuning.</td>
-        <td style="padding: 8px; border: 1px solid #d0d7de;">
-          <span style="white-space: nowrap;">🤗&nbsp;<a href="https://huggingface.co/meituan-longcat/LongCat-Image-Dev">Huggingface</a></span>
-        </td>
-      </tr>
-      <tr>
-        <td style="white-space: nowrap; padding: 8px; border: 1px solid #d0d7de;">LongCat&#8209;Image&#8209;Edit</td>
-        <td style="white-space: nowrap; padding: 8px; border: 1px solid #d0d7de;">Image Editing</td>
-        <td style="padding: 8px; border: 1px solid #d0d7de;">Specialized model for image editing.</td>
-        <td style="padding: 8px; border: 1px solid #d0d7de;">
-          <span style="white-space: nowrap;">🤗&nbsp;<a href="https://huggingface.co/meituan-longcat/LongCat-Image-Edit">Huggingface</a></span>
-        </td>
-      </tr>
-    </tbody>
-  </table>
-</div>
-
-## LongCatImagePipeline
-
-[[autodoc]] LongCatImagePipeline
- all
- __call__
-
-## LongCatImagePipelineOutput
-
-[[autodoc]] pipelines.longcat_image.pipeline_output.LongCatImagePipelineOutput
-
-
-
--- a/scripts/convert_cosmos_to_diffusers.py
+++ b/scripts/convert_cosmos_to_diffusers.py
@@ -1,55 +1,11 @@
-"""
-# Cosmos 2 Predict
-
-Download checkpoint
-```bash
-hf download nvidia/Cosmos-Predict2-2B-Text2Image
-```
-
-convert checkpoint
-```bash
-transformer_ckpt_path=~/.cache/huggingface/hub/models--nvidia--Cosmos-Predict2-2B-Text2Image/snapshots/acdb5fde992a73ef0355f287977d002cbfd127e0/model.pt
-
-python scripts/convert_cosmos_to_diffusers.py \
-    --transformer_ckpt_path $transformer_ckpt_path \
-    --transformer_type Cosmos-2.0-Diffusion-2B-Text2Image \
-    --text_encoder_path google-t5/t5-11b \
-    --tokenizer_path google-t5/t5-11b \
-    --vae_type wan2.1 \
-    --output_path converted/cosmos-p2-t2i-2b \
-    --save_pipeline
-```
-
-# Cosmos 2.5 Predict
-
-Download checkpoint
-```bash
-hf download nvidia/Cosmos-Predict2.5-2B
-```
-
-Convert checkpoint
-```bash
-transformer_ckpt_path=~/.cache/huggingface/hub/models--nvidia--Cosmos-Predict2.5-2B/snapshots/865baf084d4c9e850eac59a021277d5a9b9e8b63/base/pre-trained/d20b7120-df3e-4911-919d-db6e08bad31c_ema_bf16.pt
-
-python scripts/convert_cosmos_to_diffusers.py \
-    --transformer_type Cosmos-2.5-Predict-Base-2B \
-    --transformer_ckpt_path $transformer_ckpt_path \
-    --vae_type wan2.1 \
-    --output_path converted/cosmos-p2.5-base-2b \
-    --save_pipeline
-```
-
-"""
-
 import argparse
 import pathlib
-import sys
 from typing import Any, Dict

 import torch
 from accelerate import init_empty_weights
 from huggingface_hub import snapshot_download
-from transformers import AutoTokenizer, Qwen2_5_VLForConditionalGeneration, T5EncoderModel, T5TokenizerFast
+from transformers import T5EncoderModel, T5TokenizerFast

 from diffusers import (
    AutoencoderKLCosmos,
@@ -61,9 +17,7 @@ from diffusers import (
    CosmosVideoToWorldPipeline,
    EDMEulerScheduler,
    FlowMatchEulerDiscreteScheduler,
-    UniPCMultistepScheduler,
 )
-from diffusers.pipelines.cosmos.pipeline_cosmos2_5_predict import Cosmos2_5_PredictBasePipeline


 def remove_keys_(key: str, state_dict: Dict[str, Any]):
@@ -279,25 +233,6 @@ TRANSFORMER_CONFIGS = {
        "concat_padding_mask": True,
        "extra_pos_embed_type": None,
    },
-    "Cosmos-2.5-Predict-Base-2B": {
-        "in_channels": 16 + 1,
-        "out_channels": 16,
-        "num_attention_heads": 16,
-        "attention_head_dim": 128,
-        "num_layers": 28,
-        "mlp_ratio": 4.0,
-        "text_embed_dim": 1024,
-        "adaln_lora_dim": 256,
-        "max_size": (128, 240, 240),
-        "patch_size": (1, 2, 2),
-        "rope_scale": (1.0, 3.0, 3.0),
-        "concat_padding_mask": True,
-        # NOTE: source config has pos_emb_learnable: 'True' - but params are missing
-        "extra_pos_embed_type": None,
-        "use_crossattn_projection": True,
-        "crossattn_proj_in_channels": 100352,
-        "encoder_hidden_states_channels": 1024,
-    },
 }

 VAE_KEYS_RENAME_DICT = {
@@ -399,9 +334,6 @@ def convert_transformer(transformer_type: str, ckpt_path: str, weights_only: boo
    elif "Cosmos-2.0" in transformer_type:
        TRANSFORMER_KEYS_RENAME_DICT = TRANSFORMER_KEYS_RENAME_DICT_COSMOS_2_0
        TRANSFORMER_SPECIAL_KEYS_REMAP = TRANSFORMER_SPECIAL_KEYS_REMAP_COSMOS_2_0
-    elif "Cosmos-2.5" in transformer_type:
-        TRANSFORMER_KEYS_RENAME_DICT = TRANSFORMER_KEYS_RENAME_DICT_COSMOS_2_0
-        TRANSFORMER_SPECIAL_KEYS_REMAP = TRANSFORMER_SPECIAL_KEYS_REMAP_COSMOS_2_0
    else:
        assert False

@@ -415,7 +347,6 @@ def convert_transformer(transformer_type: str, ckpt_path: str, weights_only: boo
            new_key = new_key.removeprefix(PREFIX_KEY)
        for replace_key, rename_key in TRANSFORMER_KEYS_RENAME_DICT.items():
            new_key = new_key.replace(replace_key, rename_key)
-        print(key, "->", new_key, flush=True)
        update_state_dict_(original_state_dict, key, new_key)

    for key in list(original_state_dict.keys()):
@@ -424,21 +355,6 @@ def convert_transformer(transformer_type: str, ckpt_path: str, weights_only: boo
                continue
            handler_fn_inplace(key, original_state_dict)

-    expected_keys = set(transformer.state_dict().keys())
-    mapped_keys = set(original_state_dict.keys())
-    missing_keys = expected_keys - mapped_keys
-    unexpected_keys = mapped_keys - expected_keys
-    if missing_keys:
-        print(f"ERROR: missing keys ({len(missing_keys)} from state_dict:", flush=True, file=sys.stderr)
-        for k in missing_keys:
-            print(k)
-        sys.exit(1)
-    if unexpected_keys:
-        print(f"ERROR: unexpected keys ({len(unexpected_keys)}) from state_dict:", flush=True, file=sys.stderr)
-        for k in unexpected_keys:
-            print(k)
-        sys.exit(2)
-
    transformer.load_state_dict(original_state_dict, strict=True, assign=True)
    return transformer

@@ -528,34 +444,6 @@ def save_pipeline_cosmos_2_0(args, transformer, vae):
    pipe.save_pretrained(args.output_path, safe_serialization=True, max_shard_size="5GB")


-def save_pipeline_cosmos2_5(args, transformer, vae):
-    text_encoder_path = args.text_encoder_path or "nvidia/Cosmos-Reason1-7B"
-    tokenizer_path = args.tokenizer_path or "Qwen/Qwen2.5-VL-7B-Instruct"
-
-    text_encoder = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-        text_encoder_path, torch_dtype="auto", device_map="cpu"
-    )
-    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
-
-    scheduler = UniPCMultistepScheduler(
-        use_karras_sigmas=True,
-        use_flow_sigmas=True,
-        prediction_type="flow_prediction",
-        sigma_max=200.0,
-        sigma_min=0.01,
-    )
-
-    pipe = Cosmos2_5_PredictBasePipeline(
-        text_encoder=text_encoder,
-        tokenizer=tokenizer,
-        transformer=transformer,
-        vae=vae,
-        scheduler=scheduler,
-        safety_checker=lambda *args, **kwargs: None,
-    )
-    pipe.save_pretrained(args.output_path, safe_serialization=True, max_shard_size="5GB")
-
-
 def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--transformer_type", type=str, default=None, choices=list(TRANSFORMER_CONFIGS.keys()))
@@ -563,10 +451,10 @@ def get_args():
        "--transformer_ckpt_path", type=str, default=None, help="Path to original transformer checkpoint"
    )
    parser.add_argument(
-        "--vae_type", type=str, default="wan2.1", choices=["wan2.1", *list(VAE_CONFIGS.keys())], help="Type of VAE"
+        "--vae_type", type=str, default=None, choices=["none", *list(VAE_CONFIGS.keys())], help="Type of VAE"
    )
-    parser.add_argument("--text_encoder_path", type=str, default=None)
-    parser.add_argument("--tokenizer_path", type=str, default=None)
+    parser.add_argument("--text_encoder_path", type=str, default="google-t5/t5-11b")
+    parser.add_argument("--tokenizer_path", type=str, default="google-t5/t5-11b")
    parser.add_argument("--save_pipeline", action="store_true")
    parser.add_argument("--output_path", type=str, required=True, help="Path where converted model should be saved")
    parser.add_argument("--dtype", default="bf16", help="Torch dtype to save the transformer in.")
@@ -589,6 +477,8 @@ if __name__ == "__main__":
    if args.save_pipeline:
        assert args.transformer_ckpt_path is not None
        assert args.vae_type is not None
+        assert args.text_encoder_path is not None
+        assert args.tokenizer_path is not None

    if args.transformer_ckpt_path is not None:
        weights_only = "Cosmos-1.0" in args.transformer_type
@@ -600,26 +490,17 @@ if __name__ == "__main__":
    if args.vae_type is not None:
        if "Cosmos-1.0" in args.transformer_type:
            vae = convert_vae(args.vae_type)
-        elif "Cosmos-2.0" in args.transformer_type or "Cosmos-2.5" in args.transformer_type:
+        else:
            vae = AutoencoderKLWan.from_pretrained(
                "Wan-AI/Wan2.1-T2V-1.3B-Diffusers", subfolder="vae", torch_dtype=torch.float32
            )
-        else:
-            raise AssertionError(f"{args.transformer_type} not supported")
-
        if not args.save_pipeline:
            vae.save_pretrained(args.output_path, safe_serialization=True, max_shard_size="5GB")

    if args.save_pipeline:
        if "Cosmos-1.0" in args.transformer_type:
-            assert args.text_encoder_path is not None
-            assert args.tokenizer_path is not None
            save_pipeline_cosmos_1_0(args, transformer, vae)
        elif "Cosmos-2.0" in args.transformer_type:
-            assert args.text_encoder_path is not None
-            assert args.tokenizer_path is not None
            save_pipeline_cosmos_2_0(args, transformer, vae)
-        elif "Cosmos-2.5" in args.transformer_type:
-            save_pipeline_cosmos2_5(args, transformer, vae)
        else:
-            raise AssertionError(f"{args.transformer_type} not supported")
+            assert False
--- a/src/diffusers/init.py
+++ b/src/diffusers/init.py
@@ -235,7 +235,6 @@ else:
            "Kandinsky3UNet",
            "Kandinsky5Transformer3DModel",
            "LatteTransformer3DModel",
-            "LongCatImageTransformer2DModel",
            "LTXVideoTransformer3DModel",
            "Lumina2Transformer2DModel",
            "LuminaNextDiT2DModel",
@@ -279,7 +278,6 @@ else:
            "WanAnimateTransformer3DModel",
            "WanTransformer3DModel",
            "WanVACETransformer3DModel",
-            "ZImageControlNetModel",
            "ZImageTransformer2DModel",
            "attention_backend",
        ]
@@ -463,7 +461,6 @@ else:
            "CogView4ControlPipeline",
            "CogView4Pipeline",
            "ConsisIDPipeline",
-            "Cosmos2_5_PredictBasePipeline",
            "Cosmos2TextToImagePipeline",
            "Cosmos2VideoToWorldPipeline",
            "CosmosTextToWorldPipeline",
@@ -535,8 +532,6 @@ else:
            "LDMTextToImagePipeline",
            "LEditsPPPipelineStableDiffusion",
            "LEditsPPPipelineStableDiffusionXL",
-            "LongCatImageEditPipeline",
-            "LongCatImagePipeline",
            "LTXConditionPipeline",
            "LTXImageToVideoPipeline",
            "LTXLatentUpsamplePipeline",
@@ -566,7 +561,6 @@ else:
            "QwenImageEditPlusPipeline",
            "QwenImageImg2ImgPipeline",
            "QwenImageInpaintPipeline",
-            "QwenImageLayeredPipeline",
            "QwenImagePipeline",
            "ReduxImageEncoder",
            "SanaControlNetPipeline",
@@ -672,8 +666,6 @@ else:
            "WuerstchenCombinedPipeline",
            "WuerstchenDecoderPipeline",
            "WuerstchenPriorPipeline",
-            "ZImageControlNetInpaintPipeline",
-            "ZImageControlNetPipeline",
            "ZImageImg2ImgPipeline",
            "ZImagePipeline",
        ]
@@ -978,7 +970,6 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
            Kandinsky3UNet,
            Kandinsky5Transformer3DModel,
            LatteTransformer3DModel,
-            LongCatImageTransformer2DModel,
            LTXVideoTransformer3DModel,
            Lumina2Transformer2DModel,
            LuminaNextDiT2DModel,
@@ -1021,7 +1012,6 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
            WanAnimateTransformer3DModel,
            WanTransformer3DModel,
            WanVACETransformer3DModel,
-            ZImageControlNetModel,
            ZImageTransformer2DModel,
            attention_backend,
        )
@@ -1176,7 +1166,6 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
            CogView4ControlPipeline,
            CogView4Pipeline,
            ConsisIDPipeline,
-            Cosmos2_5_PredictBasePipeline,
            Cosmos2TextToImagePipeline,
            Cosmos2VideoToWorldPipeline,
            CosmosTextToWorldPipeline,
@@ -1248,8 +1237,6 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
            LDMTextToImagePipeline,
            LEditsPPPipelineStableDiffusion,
            LEditsPPPipelineStableDiffusionXL,
-            LongCatImageEditPipeline,
-            LongCatImagePipeline,
            LTXConditionPipeline,
            LTXImageToVideoPipeline,
            LTXLatentUpsamplePipeline,
@@ -1279,7 +1266,6 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
            QwenImageEditPlusPipeline,
            QwenImageImg2ImgPipeline,
            QwenImageInpaintPipeline,
-            QwenImageLayeredPipeline,
            QwenImagePipeline,
            ReduxImageEncoder,
            SanaControlNetPipeline,
@@ -1383,8 +1369,6 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
            WuerstchenCombinedPipeline,
            WuerstchenDecoderPipeline,
            WuerstchenPriorPipeline,
-            ZImageControlNetInpaintPipeline,
-            ZImageControlNetPipeline,
            ZImageImg2ImgPipeline,
            ZImagePipeline,
        )
--- a/src/diffusers/loaders/single_file_model.py
+++ b/src/diffusers/loaders/single_file_model.py
@@ -49,7 +49,6 @@ from .single_file_utils import (
    convert_stable_cascade_unet_single_file_to_diffusers,
    convert_wan_transformer_to_diffusers,
    convert_wan_vae_to_diffusers,
-    convert_z_image_controlnet_checkpoint_to_diffusers,
    convert_z_image_transformer_checkpoint_to_diffusers,
    create_controlnet_diffusers_config_from_ldm,
    create_unet_diffusers_config_from_ldm,
@@ -173,18 +172,11 @@ SINGLE_FILE_LOADABLE_CLASSES = {
        "checkpoint_mapping_fn": convert_z_image_transformer_checkpoint_to_diffusers,
        "default_subfolder": "transformer",
    },
-    "ZImageControlNetModel": {
-        "checkpoint_mapping_fn": convert_z_image_controlnet_checkpoint_to_diffusers,
-    },
 }


 def _should_convert_state_dict_to_diffusers(model_state_dict, checkpoint_state_dict):
-    model_state_dict_keys = set(model_state_dict.keys())
-    checkpoint_state_dict_keys = set(checkpoint_state_dict.keys())
-    is_subset = model_state_dict_keys.issubset(checkpoint_state_dict_keys)
-    is_match = model_state_dict_keys == checkpoint_state_dict_keys
-    return not (is_subset and is_match)
+    return not set(model_state_dict.keys()).issubset(set(checkpoint_state_dict.keys()))


 def _get_single_file_loadable_mapping_class(cls):
--- a/src/diffusers/loaders/single_file_utils.py
+++ b/src/diffusers/loaders/single_file_utils.py
@@ -121,8 +121,6 @@ CHECKPOINT_KEY_NAMES = {
    "instruct-pix2pix": "model.diffusion_model.input_blocks.0.0.weight",
    "lumina2": ["model.diffusion_model.cap_embedder.0.weight", "cap_embedder.0.weight"],
    "z-image-turbo": "cap_embedder.0.weight",
-    "z-image-turbo-controlnet": "control_all_x_embedder.2-1.weight",
-    "z-image-turbo-controlnet-2.x": "control_layers.14.adaLN_modulation.0.weight",
    "sana": [
        "blocks.0.cross_attn.q_linear.weight",
        "blocks.0.cross_attn.q_linear.bias",
@@ -222,8 +220,6 @@ DIFFUSERS_DEFAULT_PIPELINE_PATHS = {
    "cosmos-2.0-v2w-2B": {"pretrained_model_name_or_path": "nvidia/Cosmos-Predict2-2B-Video2World"},
    "cosmos-2.0-v2w-14B": {"pretrained_model_name_or_path": "nvidia/Cosmos-Predict2-14B-Video2World"},
    "z-image-turbo": {"pretrained_model_name_or_path": "Tongyi-MAI/Z-Image-Turbo"},
-    "z-image-turbo-controlnet": {"pretrained_model_name_or_path": "hlky/Z-Image-Turbo-Fun-Controlnet-Union"},
-    "z-image-turbo-controlnet-2.x": {"pretrained_model_name_or_path": "hlky/Z-Image-Turbo-Fun-Controlnet-Union-2.1"},
 }

 # Use to configure model sample size when original config is provided
@@ -783,12 +779,6 @@ def infer_diffusers_model_type(checkpoint):
        else:
            raise ValueError(f"Unexpected x_embedder shape: {x_embedder_shape} when loading Cosmos 2.0 model.")

-    elif CHECKPOINT_KEY_NAMES["z-image-turbo-controlnet-2.x"] in checkpoint:
-        model_type = "z-image-turbo-controlnet-2.x"
-
-    elif CHECKPOINT_KEY_NAMES["z-image-turbo-controlnet"] in checkpoint:
-        model_type = "z-image-turbo-controlnet"
-
    else:
        model_type = "v1"

@@ -3895,17 +3885,3 @@ def convert_z_image_transformer_checkpoint_to_diffusers(checkpoint, **kwargs):
            handler_fn_inplace(key, converted_state_dict)

    return converted_state_dict
-
-
-def convert_z_image_controlnet_checkpoint_to_diffusers(checkpoint, config, **kwargs):
-    if config["add_control_noise_refiner"] is None:
-        return checkpoint
-    elif config["add_control_noise_refiner"] == "control_noise_refiner":
-        return checkpoint
-    elif config["add_control_noise_refiner"] == "control_layers":
-        converted_state_dict = {
-            key: checkpoint.pop(key) for key in list(checkpoint.keys()) if not key.startswith("control_noise_refiner.")
-        }
-        return converted_state_dict
-    else:
-        raise ValueError("Unknown Z-Image Turbo ControlNet type.")
--- a/src/diffusers/models/init.py
+++ b/src/diffusers/models/init.py
@@ -66,7 +66,6 @@ if is_torch_available():
    _import_structure["controlnets.controlnet_sparsectrl"] = ["SparseControlNetModel"]
    _import_structure["controlnets.controlnet_union"] = ["ControlNetUnionModel"]
    _import_structure["controlnets.controlnet_xs"] = ["ControlNetXSAdapter", "UNetControlNetXSModel"]
-    _import_structure["controlnets.controlnet_z_image"] = ["ZImageControlNetModel"]
    _import_structure["controlnets.multicontrolnet"] = ["MultiControlNetModel"]
    _import_structure["controlnets.multicontrolnet_union"] = ["MultiControlNetUnionModel"]
    _import_structure["embeddings"] = ["ImageProjection"]
@@ -102,7 +101,6 @@ if is_torch_available():
    _import_structure["transformers.transformer_hunyuan_video_framepack"] = ["HunyuanVideoFramepackTransformer3DModel"]
    _import_structure["transformers.transformer_hunyuanimage"] = ["HunyuanImageTransformer2DModel"]
    _import_structure["transformers.transformer_kandinsky"] = ["Kandinsky5Transformer3DModel"]
-    _import_structure["transformers.transformer_longcat_image"] = ["LongCatImageTransformer2DModel"]
    _import_structure["transformers.transformer_ltx"] = ["LTXVideoTransformer3DModel"]
    _import_structure["transformers.transformer_lumina2"] = ["Lumina2Transformer2DModel"]
    _import_structure["transformers.transformer_mochi"] = ["MochiTransformer3DModel"]
@@ -182,7 +180,6 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
            SD3MultiControlNetModel,
            SparseControlNetModel,
            UNetControlNetXSModel,
-            ZImageControlNetModel,
        )
        from .embeddings import ImageProjection
        from .modeling_utils import ModelMixin
@@ -211,7 +208,6 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
            HunyuanVideoTransformer3DModel,
            Kandinsky5Transformer3DModel,
            LatteTransformer3DModel,
-            LongCatImageTransformer2DModel,
            LTXVideoTransformer3DModel,
            Lumina2Transformer2DModel,
            LuminaNextDiT2DModel,
--- a/src/diffusers/models/autoencoders/autoencoder_kl_qwenimage.py
+++ b/src/diffusers/models/autoencoders/autoencoder_kl_qwenimage.py
@@ -394,7 +394,6 @@ class QwenImageEncoder3d(nn.Module):
        attn_scales=[],
        temperal_downsample=[True, True, False],
        dropout=0.0,
-        input_channels=3,
        non_linearity: str = "silu",
    ):
        super().__init__()
@@ -411,7 +410,7 @@ class QwenImageEncoder3d(nn.Module):
        scale = 1.0

        # init block
-        self.conv_in = QwenImageCausalConv3d(input_channels, dims[0], 3, padding=1)
+        self.conv_in = QwenImageCausalConv3d(3, dims[0], 3, padding=1)

        # downsample blocks
        self.down_blocks = nn.ModuleList([])
@@ -571,7 +570,6 @@ class QwenImageDecoder3d(nn.Module):
        attn_scales=[],
        temperal_upsample=[False, True, True],
        dropout=0.0,
-        input_channels=3,
        non_linearity: str = "silu",
    ):
        super().__init__()
@@ -623,7 +621,7 @@ class QwenImageDecoder3d(nn.Module):

        # output blocks
        self.norm_out = QwenImageRMS_norm(out_dim, images=False)
-        self.conv_out = QwenImageCausalConv3d(out_dim, input_channels, 3, padding=1)
+        self.conv_out = QwenImageCausalConv3d(out_dim, 3, 3, padding=1)

        self.gradient_checkpointing = False

@@ -686,7 +684,6 @@ class AutoencoderKLQwenImage(ModelMixin, AutoencoderMixin, ConfigMixin, FromOrig
        attn_scales: List[float] = [],
        temperal_downsample: List[bool] = [False, True, True],
        dropout: float = 0.0,
-        input_channels: int = 3,
        latents_mean: List[float] = [-0.7571, -0.7089, -0.9113, 0.1075, -0.1745, 0.9653, -0.1517, 1.5508, 0.4134, -0.0715, 0.5517, -0.3632, -0.1922, -0.9497, 0.2503, -0.2921],
        latents_std: List[float] = [2.8184, 1.4541, 2.3275, 2.6558, 1.2196, 1.7708, 2.6052, 2.0743, 3.2687, 2.1526, 2.8652, 1.5579, 1.6382, 1.1253, 2.8251, 1.9160],
    ) -> None:
@@ -698,13 +695,13 @@ class AutoencoderKLQwenImage(ModelMixin, AutoencoderMixin, ConfigMixin, FromOrig
        self.temperal_upsample = temperal_downsample[::-1]

        self.encoder = QwenImageEncoder3d(
-            base_dim, z_dim * 2, dim_mult, num_res_blocks, attn_scales, self.temperal_downsample, dropout, input_channels
+            base_dim, z_dim * 2, dim_mult, num_res_blocks, attn_scales, self.temperal_downsample, dropout
        )
        self.quant_conv = QwenImageCausalConv3d(z_dim * 2, z_dim * 2, 1)
        self.post_quant_conv = QwenImageCausalConv3d(z_dim, z_dim, 1)

        self.decoder = QwenImageDecoder3d(
-            base_dim, z_dim, dim_mult, num_res_blocks, attn_scales, self.temperal_upsample, dropout, input_channels
+            base_dim, z_dim, dim_mult, num_res_blocks, attn_scales, self.temperal_upsample, dropout
        )

        self.spatial_compression_ratio = 2 ** len(self.temperal_downsample)
--- a/src/diffusers/models/controlnets/init.py
+++ b/src/diffusers/models/controlnets/init.py
@@ -19,7 +19,6 @@ if is_torch_available():
    )
    from .controlnet_union import ControlNetUnionModel
    from .controlnet_xs import ControlNetXSAdapter, ControlNetXSOutput, UNetControlNetXSModel
-    from .controlnet_z_image import ZImageControlNetModel
    from .multicontrolnet import MultiControlNetModel
    from .multicontrolnet_union import MultiControlNetUnionModel

--- a/src/diffusers/models/controlnets/controlnet_z_image.py
+++ b/src/diffusers/models/controlnets/controlnet_z_image.py
@@ -1,824 +0,0 @@
-# Copyright 2025 Alibaba Z-Image Team and The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import math
-from typing import List, Literal, Optional
-
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-from torch.nn.utils.rnn import pad_sequence
-
-from ...configuration_utils import ConfigMixin, register_to_config
-from ...loaders import PeftAdapterMixin
-from ...loaders.single_file_model import FromOriginalModelMixin
-from ...models.attention_processor import Attention
-from ...models.normalization import RMSNorm
-from ...utils.torch_utils import maybe_allow_in_graph
-from ..attention_dispatch import dispatch_attention_fn
-from ..controlnets.controlnet import zero_module
-from ..modeling_utils import ModelMixin
-
-
-ADALN_EMBED_DIM = 256
-SEQ_MULTI_OF = 32
-
-
-# Copied from diffusers.models.transformers.transformer_z_image.TimestepEmbedder
-class TimestepEmbedder(nn.Module):
-    def __init__(self, out_size, mid_size=None, frequency_embedding_size=256):
-        super().__init__()
-        if mid_size is None:
-            mid_size = out_size
-        self.mlp = nn.Sequential(
-            nn.Linear(frequency_embedding_size, mid_size, bias=True),
-            nn.SiLU(),
-            nn.Linear(mid_size, out_size, bias=True),
-        )
-
-        self.frequency_embedding_size = frequency_embedding_size
-
-    @staticmethod
-    def timestep_embedding(t, dim, max_period=10000):
-        with torch.amp.autocast("cuda", enabled=False):
-            half = dim // 2
-            freqs = torch.exp(
-                -math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32, device=t.device) / half
-            )
-            args = t[:, None].float() * freqs[None]
-            embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
-            if dim % 2:
-                embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
-            return embedding
-
-    def forward(self, t):
-        t_freq = self.timestep_embedding(t, self.frequency_embedding_size)
-        weight_dtype = self.mlp[0].weight.dtype
-        compute_dtype = getattr(self.mlp[0], "compute_dtype", None)
-        if weight_dtype.is_floating_point:
-            t_freq = t_freq.to(weight_dtype)
-        elif compute_dtype is not None:
-            t_freq = t_freq.to(compute_dtype)
-        t_emb = self.mlp(t_freq)
-        return t_emb
-
-
-# Copied from diffusers.models.transformers.transformer_z_image.ZSingleStreamAttnProcessor
-class ZSingleStreamAttnProcessor:
-    """
-    Processor for Z-Image single stream attention that adapts the existing Attention class to match the behavior of the
-    original Z-ImageAttention module.
-    """
-
-    _attention_backend = None
-    _parallel_config = None
-
-    def __init__(self):
-        if not hasattr(F, "scaled_dot_product_attention"):
-            raise ImportError(
-                "ZSingleStreamAttnProcessor requires PyTorch 2.0. To use it, please upgrade PyTorch to version 2.0 or higher."
-            )
-
-    def __call__(
-        self,
-        attn: Attention,
-        hidden_states: torch.Tensor,
-        encoder_hidden_states: Optional[torch.Tensor] = None,
-        attention_mask: Optional[torch.Tensor] = None,
-        freqs_cis: Optional[torch.Tensor] = None,
-    ) -> torch.Tensor:
-        query = attn.to_q(hidden_states)
-        key = attn.to_k(hidden_states)
-        value = attn.to_v(hidden_states)
-
-        query = query.unflatten(-1, (attn.heads, -1))
-        key = key.unflatten(-1, (attn.heads, -1))
-        value = value.unflatten(-1, (attn.heads, -1))
-
-        # Apply Norms
-        if attn.norm_q is not None:
-            query = attn.norm_q(query)
-        if attn.norm_k is not None:
-            key = attn.norm_k(key)
-
-        # Apply RoPE
-        def apply_rotary_emb(x_in: torch.Tensor, freqs_cis: torch.Tensor) -> torch.Tensor:
-            with torch.amp.autocast("cuda", enabled=False):
-                x = torch.view_as_complex(x_in.float().reshape(*x_in.shape[:-1], -1, 2))
-                freqs_cis = freqs_cis.unsqueeze(2)
-                x_out = torch.view_as_real(x * freqs_cis).flatten(3)
-                return x_out.type_as(x_in)  # todo
-
-        if freqs_cis is not None:
-            query = apply_rotary_emb(query, freqs_cis)
-            key = apply_rotary_emb(key, freqs_cis)
-
-        # Cast to correct dtype
-        dtype = query.dtype
-        query, key = query.to(dtype), key.to(dtype)
-
-        # From [batch, seq_len] to [batch, 1, 1, seq_len] -> broadcast to [batch, heads, seq_len, seq_len]
-        if attention_mask is not None and attention_mask.ndim == 2:
-            attention_mask = attention_mask[:, None, None, :]
-
-        # Compute joint attention
-        hidden_states = dispatch_attention_fn(
-            query,
-            key,
-            value,
-            attn_mask=attention_mask,
-            dropout_p=0.0,
-            is_causal=False,
-            backend=self._attention_backend,
-            parallel_config=self._parallel_config,
-        )
-
-        # Reshape back
-        hidden_states = hidden_states.flatten(2, 3)
-        hidden_states = hidden_states.to(dtype)
-
-        output = attn.to_out[0](hidden_states)
-        if len(attn.to_out) > 1:  # dropout
-            output = attn.to_out[1](output)
-
-        return output
-
-
-# Copied from diffusers.models.transformers.transformer_z_image.FeedForward
-class FeedForward(nn.Module):
-    def __init__(self, dim: int, hidden_dim: int):
-        super().__init__()
-        self.w1 = nn.Linear(dim, hidden_dim, bias=False)
-        self.w2 = nn.Linear(hidden_dim, dim, bias=False)
-        self.w3 = nn.Linear(dim, hidden_dim, bias=False)
-
-    def _forward_silu_gating(self, x1, x3):
-        return F.silu(x1) * x3
-
-    def forward(self, x):
-        return self.w2(self._forward_silu_gating(self.w1(x), self.w3(x)))
-
-
-@maybe_allow_in_graph
-# Copied from diffusers.models.transformers.transformer_z_image.ZImageTransformerBlock
-class ZImageTransformerBlock(nn.Module):
-    def __init__(
-        self,
-        layer_id: int,
-        dim: int,
-        n_heads: int,
-        n_kv_heads: int,
-        norm_eps: float,
-        qk_norm: bool,
-        modulation=True,
-    ):
-        super().__init__()
-        self.dim = dim
-        self.head_dim = dim // n_heads
-
-        # Refactored to use diffusers Attention with custom processor
-        # Original Z-Image params: dim, n_heads, n_kv_heads, qk_norm
-        self.attention = Attention(
-            query_dim=dim,
-            cross_attention_dim=None,
-            dim_head=dim // n_heads,
-            heads=n_heads,
-            qk_norm="rms_norm" if qk_norm else None,
-            eps=1e-5,
-            bias=False,
-            out_bias=False,
-            processor=ZSingleStreamAttnProcessor(),
-        )
-
-        self.feed_forward = FeedForward(dim=dim, hidden_dim=int(dim / 3 * 8))
-        self.layer_id = layer_id
-
-        self.attention_norm1 = RMSNorm(dim, eps=norm_eps)
-        self.ffn_norm1 = RMSNorm(dim, eps=norm_eps)
-
-        self.attention_norm2 = RMSNorm(dim, eps=norm_eps)
-        self.ffn_norm2 = RMSNorm(dim, eps=norm_eps)
-
-        self.modulation = modulation
-        if modulation:
-            self.adaLN_modulation = nn.Sequential(nn.Linear(min(dim, ADALN_EMBED_DIM), 4 * dim, bias=True))
-
-    def forward(
-        self,
-        x: torch.Tensor,
-        attn_mask: torch.Tensor,
-        freqs_cis: torch.Tensor,
-        adaln_input: Optional[torch.Tensor] = None,
-    ):
-        if self.modulation:
-            assert adaln_input is not None
-            scale_msa, gate_msa, scale_mlp, gate_mlp = self.adaLN_modulation(adaln_input).unsqueeze(1).chunk(4, dim=2)
-            gate_msa, gate_mlp = gate_msa.tanh(), gate_mlp.tanh()
-            scale_msa, scale_mlp = 1.0 + scale_msa, 1.0 + scale_mlp
-
-            # Attention block
-            attn_out = self.attention(
-                self.attention_norm1(x) * scale_msa, attention_mask=attn_mask, freqs_cis=freqs_cis
-            )
-            x = x + gate_msa * self.attention_norm2(attn_out)
-
-            # FFN block
-            x = x + gate_mlp * self.ffn_norm2(self.feed_forward(self.ffn_norm1(x) * scale_mlp))
-        else:
-            # Attention block
-            attn_out = self.attention(self.attention_norm1(x), attention_mask=attn_mask, freqs_cis=freqs_cis)
-            x = x + self.attention_norm2(attn_out)
-
-            # FFN block
-            x = x + self.ffn_norm2(self.feed_forward(self.ffn_norm1(x)))
-
-        return x
-
-
-# Copied from diffusers.models.transformers.transformer_z_image.RopeEmbedder
-class RopeEmbedder:
-    def __init__(
-        self,
-        theta: float = 256.0,
-        axes_dims: List[int] = (16, 56, 56),
-        axes_lens: List[int] = (64, 128, 128),
-    ):
-        self.theta = theta
-        self.axes_dims = axes_dims
-        self.axes_lens = axes_lens
-        assert len(axes_dims) == len(axes_lens), "axes_dims and axes_lens must have the same length"
-        self.freqs_cis = None
-
-    @staticmethod
-    def precompute_freqs_cis(dim: List[int], end: List[int], theta: float = 256.0):
-        with torch.device("cpu"):
-            freqs_cis = []
-            for i, (d, e) in enumerate(zip(dim, end)):
-                freqs = 1.0 / (theta ** (torch.arange(0, d, 2, dtype=torch.float64, device="cpu") / d))
-                timestep = torch.arange(e, device=freqs.device, dtype=torch.float64)
-                freqs = torch.outer(timestep, freqs).float()
-                freqs_cis_i = torch.polar(torch.ones_like(freqs), freqs).to(torch.complex64)  # complex64
-                freqs_cis.append(freqs_cis_i)
-
-            return freqs_cis
-
-    def __call__(self, ids: torch.Tensor):
-        assert ids.ndim == 2
-        assert ids.shape[-1] == len(self.axes_dims)
-        device = ids.device
-
-        if self.freqs_cis is None:
-            self.freqs_cis = self.precompute_freqs_cis(self.axes_dims, self.axes_lens, theta=self.theta)
-            self.freqs_cis = [freqs_cis.to(device) for freqs_cis in self.freqs_cis]
-        else:
-            # Ensure freqs_cis are on the same device as ids
-            if self.freqs_cis[0].device != device:
-                self.freqs_cis = [freqs_cis.to(device) for freqs_cis in self.freqs_cis]
-
-        result = []
-        for i in range(len(self.axes_dims)):
-            index = ids[:, i]
-            result.append(self.freqs_cis[i][index])
-        return torch.cat(result, dim=-1)
-
-
-@maybe_allow_in_graph
-class ZImageControlTransformerBlock(nn.Module):
-    def __init__(
-        self,
-        layer_id: int,
-        dim: int,
-        n_heads: int,
-        n_kv_heads: int,
-        norm_eps: float,
-        qk_norm: bool,
-        modulation=True,
-        block_id=0,
-    ):
-        super().__init__()
-        self.dim = dim
-        self.head_dim = dim // n_heads
-
-        # Refactored to use diffusers Attention with custom processor
-        # Original Z-Image params: dim, n_heads, n_kv_heads, qk_norm
-        self.attention = Attention(
-            query_dim=dim,
-            cross_attention_dim=None,
-            dim_head=dim // n_heads,
-            heads=n_heads,
-            qk_norm="rms_norm" if qk_norm else None,
-            eps=1e-5,
-            bias=False,
-            out_bias=False,
-            processor=ZSingleStreamAttnProcessor(),
-        )
-
-        self.feed_forward = FeedForward(dim=dim, hidden_dim=int(dim / 3 * 8))
-        self.layer_id = layer_id
-
-        self.attention_norm1 = RMSNorm(dim, eps=norm_eps)
-        self.ffn_norm1 = RMSNorm(dim, eps=norm_eps)
-
-        self.attention_norm2 = RMSNorm(dim, eps=norm_eps)
-        self.ffn_norm2 = RMSNorm(dim, eps=norm_eps)
-
-        self.modulation = modulation
-        if modulation:
-            self.adaLN_modulation = nn.Sequential(nn.Linear(min(dim, ADALN_EMBED_DIM), 4 * dim, bias=True))
-
-        # Control variant start
-        self.block_id = block_id
-        if block_id == 0:
-            self.before_proj = zero_module(nn.Linear(self.dim, self.dim))
-        self.after_proj = zero_module(nn.Linear(self.dim, self.dim))
-
-    def forward(
-        self,
-        c: torch.Tensor,
-        x: torch.Tensor,
-        attn_mask: torch.Tensor,
-        freqs_cis: torch.Tensor,
-        adaln_input: Optional[torch.Tensor] = None,
-    ):
-        # Control
-        if self.block_id == 0:
-            c = self.before_proj(c) + x
-            all_c = []
-        else:
-            all_c = list(torch.unbind(c))
-            c = all_c.pop(-1)
-
-        # Compared to `ZImageTransformerBlock` x -> c
-        if self.modulation:
-            assert adaln_input is not None
-            scale_msa, gate_msa, scale_mlp, gate_mlp = self.adaLN_modulation(adaln_input).unsqueeze(1).chunk(4, dim=2)
-            gate_msa, gate_mlp = gate_msa.tanh(), gate_mlp.tanh()
-            scale_msa, scale_mlp = 1.0 + scale_msa, 1.0 + scale_mlp
-
-            # Attention block
-            attn_out = self.attention(
-                self.attention_norm1(c) * scale_msa, attention_mask=attn_mask, freqs_cis=freqs_cis
-            )
-            c = c + gate_msa * self.attention_norm2(attn_out)
-
-            # FFN block
-            c = c + gate_mlp * self.ffn_norm2(self.feed_forward(self.ffn_norm1(c) * scale_mlp))
-        else:
-            # Attention block
-            attn_out = self.attention(self.attention_norm1(c), attention_mask=attn_mask, freqs_cis=freqs_cis)
-            c = c + self.attention_norm2(attn_out)
-
-            # FFN block
-            c = c + self.ffn_norm2(self.feed_forward(self.ffn_norm1(c)))
-
-        # Control
-        c_skip = self.after_proj(c)
-        all_c += [c_skip, c]
-        c = torch.stack(all_c)
-        return c
-
-
-class ZImageControlNetModel(ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginalModelMixin):
-    _supports_gradient_checkpointing = True
-
-    @register_to_config
-    def __init__(
-        self,
-        control_layers_places: List[int] = None,
-        control_refiner_layers_places: List[int] = None,
-        control_in_dim=None,
-        add_control_noise_refiner: Optional[Literal["control_layers", "control_noise_refiner"]] = None,
-        all_patch_size=(2,),
-        all_f_patch_size=(1,),
-        dim=3840,
-        n_refiner_layers=2,
-        n_heads=30,
-        n_kv_heads=30,
-        norm_eps=1e-5,
-        qk_norm=True,
-    ):
-        super().__init__()
-        self.control_layers_places = control_layers_places
-        self.control_in_dim = control_in_dim
-        self.control_refiner_layers_places = control_refiner_layers_places
-        self.add_control_noise_refiner = add_control_noise_refiner
-
-        assert 0 in self.control_layers_places
-
-        # control blocks
-        self.control_layers = nn.ModuleList(
-            [
-                ZImageControlTransformerBlock(i, dim, n_heads, n_kv_heads, norm_eps, qk_norm, block_id=i)
-                for i in self.control_layers_places
-            ]
-        )
-
-        # control patch embeddings
-        all_x_embedder = {}
-        for patch_idx, (patch_size, f_patch_size) in enumerate(zip(all_patch_size, all_f_patch_size)):
-            x_embedder = nn.Linear(f_patch_size * patch_size * patch_size * self.control_in_dim, dim, bias=True)
-            all_x_embedder[f"{patch_size}-{f_patch_size}"] = x_embedder
-
-        self.control_all_x_embedder = nn.ModuleDict(all_x_embedder)
-        if self.add_control_noise_refiner == "control_layers":
-            self.control_noise_refiner = None
-        elif self.add_control_noise_refiner == "control_noise_refiner":
-            self.control_noise_refiner = nn.ModuleList(
-                [
-                    ZImageControlTransformerBlock(
-                        1000 + layer_id,
-                        dim,
-                        n_heads,
-                        n_kv_heads,
-                        norm_eps,
-                        qk_norm,
-                        modulation=True,
-                        block_id=layer_id,
-                    )
-                    for layer_id in range(n_refiner_layers)
-                ]
-            )
-        else:
-            self.control_noise_refiner = nn.ModuleList(
-                [
-                    ZImageTransformerBlock(
-                        1000 + layer_id,
-                        dim,
-                        n_heads,
-                        n_kv_heads,
-                        norm_eps,
-                        qk_norm,
-                        modulation=True,
-                    )
-                    for layer_id in range(n_refiner_layers)
-                ]
-            )
-
-        self.t_scale: Optional[float] = None
-        self.t_embedder: Optional[TimestepEmbedder] = None
-        self.all_x_embedder: Optional[nn.ModuleDict] = None
-        self.cap_embedder: Optional[nn.Sequential] = None
-        self.rope_embedder: Optional[RopeEmbedder] = None
-        self.noise_refiner: Optional[nn.ModuleList] = None
-        self.context_refiner: Optional[nn.ModuleList] = None
-        self.x_pad_token: Optional[nn.Parameter] = None
-        self.cap_pad_token: Optional[nn.Parameter] = None
-
-    @classmethod
-    def from_transformer(cls, controlnet, transformer):
-        controlnet.t_scale = transformer.t_scale
-        controlnet.t_embedder = transformer.t_embedder
-        controlnet.all_x_embedder = transformer.all_x_embedder
-        controlnet.cap_embedder = transformer.cap_embedder
-        controlnet.rope_embedder = transformer.rope_embedder
-        controlnet.noise_refiner = transformer.noise_refiner
-        controlnet.context_refiner = transformer.context_refiner
-        controlnet.x_pad_token = transformer.x_pad_token
-        controlnet.cap_pad_token = transformer.cap_pad_token
-        return controlnet
-
-    @staticmethod
-    # Copied from diffusers.models.transformers.transformer_z_image.ZImageTransformer2DModel.create_coordinate_grid
-    def create_coordinate_grid(size, start=None, device=None):
-        if start is None:
-            start = (0 for _ in size)
-
-        axes = [torch.arange(x0, x0 + span, dtype=torch.int32, device=device) for x0, span in zip(start, size)]
-        grids = torch.meshgrid(axes, indexing="ij")
-        return torch.stack(grids, dim=-1)
-
-    # Copied from diffusers.models.transformers.transformer_z_image.ZImageTransformer2DModel.patchify_and_embed
-    def patchify_and_embed(
-        self,
-        all_image: List[torch.Tensor],
-        all_cap_feats: List[torch.Tensor],
-        patch_size: int,
-        f_patch_size: int,
-    ):
-        pH = pW = patch_size
-        pF = f_patch_size
-        device = all_image[0].device
-
-        all_image_out = []
-        all_image_size = []
-        all_image_pos_ids = []
-        all_image_pad_mask = []
-        all_cap_pos_ids = []
-        all_cap_pad_mask = []
-        all_cap_feats_out = []
-
-        for i, (image, cap_feat) in enumerate(zip(all_image, all_cap_feats)):
-            ### Process Caption
-            cap_ori_len = len(cap_feat)
-            cap_padding_len = (-cap_ori_len) % SEQ_MULTI_OF
-            # padded position ids
-            cap_padded_pos_ids = self.create_coordinate_grid(
-                size=(cap_ori_len + cap_padding_len, 1, 1),
-                start=(1, 0, 0),
-                device=device,
-            ).flatten(0, 2)
-            all_cap_pos_ids.append(cap_padded_pos_ids)
-            # pad mask
-            cap_pad_mask = torch.cat(
-                [
-                    torch.zeros((cap_ori_len,), dtype=torch.bool, device=device),
-                    torch.ones((cap_padding_len,), dtype=torch.bool, device=device),
-                ],
-                dim=0,
-            )
-            all_cap_pad_mask.append(
-                cap_pad_mask if cap_padding_len > 0 else torch.zeros((cap_ori_len,), dtype=torch.bool, device=device)
-            )
-
-            # padded feature
-            cap_padded_feat = torch.cat([cap_feat, cap_feat[-1:].repeat(cap_padding_len, 1)], dim=0)
-            all_cap_feats_out.append(cap_padded_feat)
-
-            ### Process Image
-            C, F, H, W = image.size()
-            all_image_size.append((F, H, W))
-            F_tokens, H_tokens, W_tokens = F // pF, H // pH, W // pW
-
-            image = image.view(C, F_tokens, pF, H_tokens, pH, W_tokens, pW)
-            # "c f pf h ph w pw -> (f h w) (pf ph pw c)"
-            image = image.permute(1, 3, 5, 2, 4, 6, 0).reshape(F_tokens * H_tokens * W_tokens, pF * pH * pW * C)
-
-            image_ori_len = len(image)
-            image_padding_len = (-image_ori_len) % SEQ_MULTI_OF
-
-            image_ori_pos_ids = self.create_coordinate_grid(
-                size=(F_tokens, H_tokens, W_tokens),
-                start=(cap_ori_len + cap_padding_len + 1, 0, 0),
-                device=device,
-            ).flatten(0, 2)
-            image_padded_pos_ids = torch.cat(
-                [
-                    image_ori_pos_ids,
-                    self.create_coordinate_grid(size=(1, 1, 1), start=(0, 0, 0), device=device)
-                    .flatten(0, 2)
-                    .repeat(image_padding_len, 1),
-                ],
-                dim=0,
-            )
-            all_image_pos_ids.append(image_padded_pos_ids if image_padding_len > 0 else image_ori_pos_ids)
-            # pad mask
-            image_pad_mask = torch.cat(
-                [
-                    torch.zeros((image_ori_len,), dtype=torch.bool, device=device),
-                    torch.ones((image_padding_len,), dtype=torch.bool, device=device),
-                ],
-                dim=0,
-            )
-            all_image_pad_mask.append(
-                image_pad_mask
-                if image_padding_len > 0
-                else torch.zeros((image_ori_len,), dtype=torch.bool, device=device)
-            )
-            # padded feature
-            image_padded_feat = torch.cat(
-                [image, image[-1:].repeat(image_padding_len, 1)],
-                dim=0,
-            )
-            all_image_out.append(image_padded_feat if image_padding_len > 0 else image)
-
-        return (
-            all_image_out,
-            all_cap_feats_out,
-            all_image_size,
-            all_image_pos_ids,
-            all_cap_pos_ids,
-            all_image_pad_mask,
-            all_cap_pad_mask,
-        )
-
-    def patchify(
-        self,
-        all_image: List[torch.Tensor],
-        patch_size: int,
-        f_patch_size: int,
-    ):
-        pH = pW = patch_size
-        pF = f_patch_size
-        all_image_out = []
-
-        for i, image in enumerate(all_image):
-            ### Process Image
-            C, F, H, W = image.size()
-            F_tokens, H_tokens, W_tokens = F // pF, H // pH, W // pW
-
-            image = image.view(C, F_tokens, pF, H_tokens, pH, W_tokens, pW)
-            # "c f pf h ph w pw -> (f h w) (pf ph pw c)"
-            image = image.permute(1, 3, 5, 2, 4, 6, 0).reshape(F_tokens * H_tokens * W_tokens, pF * pH * pW * C)
-
-            image_ori_len = len(image)
-            image_padding_len = (-image_ori_len) % SEQ_MULTI_OF
-
-            # padded feature
-            image_padded_feat = torch.cat([image, image[-1:].repeat(image_padding_len, 1)], dim=0)
-            all_image_out.append(image_padded_feat)
-
-        return all_image_out
-
-    def forward(
-        self,
-        x: List[torch.Tensor],
-        t,
-        cap_feats: List[torch.Tensor],
-        control_context: List[torch.Tensor],
-        conditioning_scale: float = 1.0,
-        patch_size=2,
-        f_patch_size=1,
-    ):
-        if (
-            self.t_scale is None
-            or self.t_embedder is None
-            or self.all_x_embedder is None
-            or self.cap_embedder is None
-            or self.rope_embedder is None
-            or self.noise_refiner is None
-            or self.context_refiner is None
-            or self.x_pad_token is None
-            or self.cap_pad_token is None
-        ):
-            raise ValueError(
-                "Required modules are `None`, use `from_transformer` to share required modules from `transformer`."
-            )
-
-        assert patch_size in self.config.all_patch_size
-        assert f_patch_size in self.config.all_f_patch_size
-
-        bsz = len(x)
-        device = x[0].device
-        t = t * self.t_scale
-        t = self.t_embedder(t)
-
-        (
-            x,
-            cap_feats,
-            x_size,
-            x_pos_ids,
-            cap_pos_ids,
-            x_inner_pad_mask,
-            cap_inner_pad_mask,
-        ) = self.patchify_and_embed(x, cap_feats, patch_size, f_patch_size)
-
-        x_item_seqlens = [len(_) for _ in x]
-        assert all(_ % SEQ_MULTI_OF == 0 for _ in x_item_seqlens)
-        x_max_item_seqlen = max(x_item_seqlens)
-
-        control_context = self.patchify(control_context, patch_size, f_patch_size)
-        control_context = torch.cat(control_context, dim=0)
-        control_context = self.control_all_x_embedder[f"{patch_size}-{f_patch_size}"](control_context)
-
-        control_context[torch.cat(x_inner_pad_mask)] = self.x_pad_token
-        control_context = list(control_context.split(x_item_seqlens, dim=0))
-
-        control_context = pad_sequence(control_context, batch_first=True, padding_value=0.0)
-
-        # x embed & refine
-        x = torch.cat(x, dim=0)
-        x = self.all_x_embedder[f"{patch_size}-{f_patch_size}"](x)
-
-        # Match t_embedder output dtype to x for layerwise casting compatibility
-        adaln_input = t.type_as(x)
-        x[torch.cat(x_inner_pad_mask)] = self.x_pad_token
-        x = list(x.split(x_item_seqlens, dim=0))
-        x_freqs_cis = list(self.rope_embedder(torch.cat(x_pos_ids, dim=0)).split([len(_) for _ in x_pos_ids], dim=0))
-
-        x = pad_sequence(x, batch_first=True, padding_value=0.0)
-        x_freqs_cis = pad_sequence(x_freqs_cis, batch_first=True, padding_value=0.0)
-        # Clarify the length matches to satisfy Dynamo due to "Symbolic Shape Inference" to avoid compilation errors
-        x_freqs_cis = x_freqs_cis[:, : x.shape[1]]
-
-        x_attn_mask = torch.zeros((bsz, x_max_item_seqlen), dtype=torch.bool, device=device)
-        for i, seq_len in enumerate(x_item_seqlens):
-            x_attn_mask[i, :seq_len] = 1
-
-        if self.add_control_noise_refiner is not None:
-            if self.add_control_noise_refiner == "control_layers":
-                layers = self.control_layers
-            elif self.add_control_noise_refiner == "control_noise_refiner":
-                layers = self.control_noise_refiner
-            else:
-                raise ValueError(f"Unsupported `add_control_noise_refiner` type: {self.add_control_noise_refiner}.")
-            for layer in layers:
-                if torch.is_grad_enabled() and self.gradient_checkpointing:
-                    control_context = self._gradient_checkpointing_func(
-                        layer, control_context, x, x_attn_mask, x_freqs_cis, adaln_input
-                    )
-                else:
-                    control_context = layer(control_context, x, x_attn_mask, x_freqs_cis, adaln_input)
-
-            hints = torch.unbind(control_context)[:-1]
-            control_context = torch.unbind(control_context)[-1]
-            noise_refiner_block_samples = {
-                layer_idx: hints[idx] * conditioning_scale
-                for idx, layer_idx in enumerate(self.control_refiner_layers_places)
-            }
-        else:
-            noise_refiner_block_samples = None
-
-        if torch.is_grad_enabled() and self.gradient_checkpointing:
-            for layer_idx, layer in enumerate(self.noise_refiner):
-                x = self._gradient_checkpointing_func(layer, x, x_attn_mask, x_freqs_cis, adaln_input)
-                if noise_refiner_block_samples is not None:
-                    if layer_idx in noise_refiner_block_samples:
-                        x = x + noise_refiner_block_samples[layer_idx]
-        else:
-            for layer_idx, layer in enumerate(self.noise_refiner):
-                x = layer(x, x_attn_mask, x_freqs_cis, adaln_input)
-                if noise_refiner_block_samples is not None:
-                    if layer_idx in noise_refiner_block_samples:
-                        x = x + noise_refiner_block_samples[layer_idx]
-
-        # cap embed & refine
-        cap_item_seqlens = [len(_) for _ in cap_feats]
-        cap_max_item_seqlen = max(cap_item_seqlens)
-
-        cap_feats = torch.cat(cap_feats, dim=0)
-        cap_feats = self.cap_embedder(cap_feats)
-        cap_feats[torch.cat(cap_inner_pad_mask)] = self.cap_pad_token
-        cap_feats = list(cap_feats.split(cap_item_seqlens, dim=0))
-        cap_freqs_cis = list(
-            self.rope_embedder(torch.cat(cap_pos_ids, dim=0)).split([len(_) for _ in cap_pos_ids], dim=0)
-        )
-
-        cap_feats = pad_sequence(cap_feats, batch_first=True, padding_value=0.0)
-        cap_freqs_cis = pad_sequence(cap_freqs_cis, batch_first=True, padding_value=0.0)
-        # Clarify the length matches to satisfy Dynamo due to "Symbolic Shape Inference" to avoid compilation errors
-        cap_freqs_cis = cap_freqs_cis[:, : cap_feats.shape[1]]
-
-        cap_attn_mask = torch.zeros((bsz, cap_max_item_seqlen), dtype=torch.bool, device=device)
-        for i, seq_len in enumerate(cap_item_seqlens):
-            cap_attn_mask[i, :seq_len] = 1
-
-        if torch.is_grad_enabled() and self.gradient_checkpointing:
-            for layer in self.context_refiner:
-                cap_feats = self._gradient_checkpointing_func(layer, cap_feats, cap_attn_mask, cap_freqs_cis)
-        else:
-            for layer in self.context_refiner:
-                cap_feats = layer(cap_feats, cap_attn_mask, cap_freqs_cis)
-
-        # unified
-        unified = []
-        unified_freqs_cis = []
-        for i in range(bsz):
-            x_len = x_item_seqlens[i]
-            cap_len = cap_item_seqlens[i]
-            unified.append(torch.cat([x[i][:x_len], cap_feats[i][:cap_len]]))
-            unified_freqs_cis.append(torch.cat([x_freqs_cis[i][:x_len], cap_freqs_cis[i][:cap_len]]))
-        unified_item_seqlens = [a + b for a, b in zip(cap_item_seqlens, x_item_seqlens)]
-        assert unified_item_seqlens == [len(_) for _ in unified]
-        unified_max_item_seqlen = max(unified_item_seqlens)
-
-        unified = pad_sequence(unified, batch_first=True, padding_value=0.0)
-        unified_freqs_cis = pad_sequence(unified_freqs_cis, batch_first=True, padding_value=0.0)
-        unified_attn_mask = torch.zeros((bsz, unified_max_item_seqlen), dtype=torch.bool, device=device)
-        for i, seq_len in enumerate(unified_item_seqlens):
-            unified_attn_mask[i, :seq_len] = 1
-
-        ## ControlNet start
-        if not self.add_control_noise_refiner:
-            if torch.is_grad_enabled() and self.gradient_checkpointing:
-                for layer in self.control_noise_refiner:
-                    control_context = self._gradient_checkpointing_func(
-                        layer, control_context, x_attn_mask, x_freqs_cis, adaln_input
-                    )
-            else:
-                for layer in self.control_noise_refiner:
-                    control_context = layer(control_context, x_attn_mask, x_freqs_cis, adaln_input)
-
-        # unified
-        control_context_unified = []
-        for i in range(bsz):
-            x_len = x_item_seqlens[i]
-            cap_len = cap_item_seqlens[i]
-            control_context_unified.append(torch.cat([control_context[i][:x_len], cap_feats[i][:cap_len]]))
-        control_context_unified = pad_sequence(control_context_unified, batch_first=True, padding_value=0.0)
-
-        for layer in self.control_layers:
-            if torch.is_grad_enabled() and self.gradient_checkpointing:
-                control_context_unified = self._gradient_checkpointing_func(
-                    layer, control_context_unified, unified, unified_attn_mask, unified_freqs_cis, adaln_input
-                )
-            else:
-                control_context_unified = layer(
-                    control_context_unified, unified, unified_attn_mask, unified_freqs_cis, adaln_input
-                )
-
-        hints = torch.unbind(control_context_unified)[:-1]
-        controlnet_block_samples = {
-            layer_idx: hints[idx] * conditioning_scale for idx, layer_idx in enumerate(self.control_layers_places)
-        }
-        return controlnet_block_samples
--- a/src/diffusers/models/transformers/init.py
+++ b/src/diffusers/models/transformers/init.py
@@ -33,7 +33,6 @@ if is_torch_available():
    from .transformer_hunyuan_video_framepack import HunyuanVideoFramepackTransformer3DModel
    from .transformer_hunyuanimage import HunyuanImageTransformer2DModel
    from .transformer_kandinsky import Kandinsky5Transformer3DModel
-    from .transformer_longcat_image import LongCatImageTransformer2DModel
    from .transformer_ltx import LTXVideoTransformer3DModel
    from .transformer_lumina2 import Lumina2Transformer2DModel
    from .transformer_mochi import MochiTransformer3DModel
--- a/src/diffusers/models/transformers/transformer_cosmos.py
+++ b/src/diffusers/models/transformers/transformer_cosmos.py
@@ -439,9 +439,6 @@ class CosmosTransformer3DModel(ModelMixin, ConfigMixin, FromOriginalModelMixin):
        rope_scale: Tuple[float, float, float] = (2.0, 1.0, 1.0),
        concat_padding_mask: bool = True,
        extra_pos_embed_type: Optional[str] = "learnable",
-        use_crossattn_projection: bool = False,
-        crossattn_proj_in_channels: int = 1024,
-        encoder_hidden_states_channels: int = 1024,
    ) -> None:
        super().__init__()
        hidden_size = num_attention_heads * attention_head_dim
@@ -488,12 +485,6 @@ class CosmosTransformer3DModel(ModelMixin, ConfigMixin, FromOriginalModelMixin):
            hidden_size, patch_size[0] * patch_size[1] * patch_size[2] * out_channels, bias=False
        )

-        if self.config.use_crossattn_projection:
-            self.crossattn_proj = nn.Sequential(
-                nn.Linear(crossattn_proj_in_channels, encoder_hidden_states_channels, bias=True),
-                nn.GELU(),
-            )
-
        self.gradient_checkpointing = False

    def forward(
@@ -533,7 +524,6 @@ class CosmosTransformer3DModel(ModelMixin, ConfigMixin, FromOriginalModelMixin):
        post_patch_num_frames = num_frames // p_t
        post_patch_height = height // p_h
        post_patch_width = width // p_w
-
        hidden_states = self.patch_embed(hidden_states)
        hidden_states = hidden_states.flatten(1, 3)  # [B, T, H, W, C] -> [B, THW, C]

@@ -556,9 +546,6 @@ class CosmosTransformer3DModel(ModelMixin, ConfigMixin, FromOriginalModelMixin):
        else:
            assert False

-        if self.config.use_crossattn_projection:
-            encoder_hidden_states = self.crossattn_proj(encoder_hidden_states)
-
        # 5. Transformer blocks
        for block in self.transformer_blocks:
            if torch.is_grad_enabled() and self.gradient_checkpointing:
--- a/src/diffusers/models/transformers/transformer_longcat_image.py
+++ b/src/diffusers/models/transformers/transformer_longcat_image.py
@@ -1,548 +0,0 @@
-# Copyright 2025 MeiTuan LongCat-Image Team and The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import inspect
-from typing import Any, Dict, List, Optional, Tuple, Union
-
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-
-from ...configuration_utils import ConfigMixin, register_to_config
-from ...loaders import FromOriginalModelMixin, PeftAdapterMixin
-from ...utils import is_torch_npu_available, logging
-from ...utils.torch_utils import maybe_allow_in_graph
-from ..attention import AttentionModuleMixin, FeedForward
-from ..attention_dispatch import dispatch_attention_fn
-from ..cache_utils import CacheMixin
-from ..embeddings import TimestepEmbedding, Timesteps, apply_rotary_emb, get_1d_rotary_pos_embed
-from ..modeling_outputs import Transformer2DModelOutput
-from ..modeling_utils import ModelMixin
-from ..normalization import AdaLayerNormContinuous, AdaLayerNormZero, AdaLayerNormZeroSingle
-
-
-logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
-
-
-def _get_projections(attn: "LongCatImageAttention", hidden_states, encoder_hidden_states=None):
-    query = attn.to_q(hidden_states)
-    key = attn.to_k(hidden_states)
-    value = attn.to_v(hidden_states)
-
-    encoder_query = encoder_key = encoder_value = None
-    if encoder_hidden_states is not None and attn.added_kv_proj_dim is not None:
-        encoder_query = attn.add_q_proj(encoder_hidden_states)
-        encoder_key = attn.add_k_proj(encoder_hidden_states)
-        encoder_value = attn.add_v_proj(encoder_hidden_states)
-
-    return query, key, value, encoder_query, encoder_key, encoder_value
-
-
-def _get_fused_projections(attn: "LongCatImageAttention", hidden_states, encoder_hidden_states=None):
-    query, key, value = attn.to_qkv(hidden_states).chunk(3, dim=-1)
-
-    encoder_query = encoder_key = encoder_value = (None,)
-    if encoder_hidden_states is not None and hasattr(attn, "to_added_qkv"):
-        encoder_query, encoder_key, encoder_value = attn.to_added_qkv(encoder_hidden_states).chunk(3, dim=-1)
-
-    return query, key, value, encoder_query, encoder_key, encoder_value
-
-
-def _get_qkv_projections(attn: "LongCatImageAttention", hidden_states, encoder_hidden_states=None):
-    if attn.fused_projections:
-        return _get_fused_projections(attn, hidden_states, encoder_hidden_states)
-    return _get_projections(attn, hidden_states, encoder_hidden_states)
-
-
-class LongCatImageAttnProcessor:
-    _attention_backend = None
-    _parallel_config = None
-
-    def __init__(self):
-        if not hasattr(F, "scaled_dot_product_attention"):
-            raise ImportError(f"{self.__class__.__name__} requires PyTorch 2.0. Please upgrade your pytorch version.")
-
-    def __call__(
-        self,
-        attn: "LongCatImageAttention",
-        hidden_states: torch.Tensor,
-        encoder_hidden_states: torch.Tensor = None,
-        attention_mask: Optional[torch.Tensor] = None,
-        image_rotary_emb: Optional[torch.Tensor] = None,
-    ) -> torch.Tensor:
-        query, key, value, encoder_query, encoder_key, encoder_value = _get_qkv_projections(
-            attn, hidden_states, encoder_hidden_states
-        )
-
-        query = query.unflatten(-1, (attn.heads, -1))
-        key = key.unflatten(-1, (attn.heads, -1))
-        value = value.unflatten(-1, (attn.heads, -1))
-
-        query = attn.norm_q(query)
-        key = attn.norm_k(key)
-
-        if attn.added_kv_proj_dim is not None:
-            encoder_query = encoder_query.unflatten(-1, (attn.heads, -1))
-            encoder_key = encoder_key.unflatten(-1, (attn.heads, -1))
-            encoder_value = encoder_value.unflatten(-1, (attn.heads, -1))
-
-            encoder_query = attn.norm_added_q(encoder_query)
-            encoder_key = attn.norm_added_k(encoder_key)
-
-            query = torch.cat([encoder_query, query], dim=1)
-            key = torch.cat([encoder_key, key], dim=1)
-            value = torch.cat([encoder_value, value], dim=1)
-
-        if image_rotary_emb is not None:
-            query = apply_rotary_emb(query, image_rotary_emb, sequence_dim=1)
-            key = apply_rotary_emb(key, image_rotary_emb, sequence_dim=1)
-
-        hidden_states = dispatch_attention_fn(
-            query,
-            key,
-            value,
-            attn_mask=attention_mask,
-            backend=self._attention_backend,
-            parallel_config=self._parallel_config,
-        )
-        hidden_states = hidden_states.flatten(2, 3)
-        hidden_states = hidden_states.to(query.dtype)
-
-        if encoder_hidden_states is not None:
-            encoder_hidden_states, hidden_states = hidden_states.split_with_sizes(
-                [encoder_hidden_states.shape[1], hidden_states.shape[1] - encoder_hidden_states.shape[1]], dim=1
-            )
-            hidden_states = attn.to_out[0](hidden_states)
-            hidden_states = attn.to_out[1](hidden_states)
-            encoder_hidden_states = attn.to_add_out(encoder_hidden_states)
-
-            return hidden_states, encoder_hidden_states
-        else:
-            return hidden_states
-
-
-class LongCatImageAttention(torch.nn.Module, AttentionModuleMixin):
-    _default_processor_cls = LongCatImageAttnProcessor
-    _available_processors = [
-        LongCatImageAttnProcessor,
-    ]
-
-    def __init__(
-        self,
-        query_dim: int,
-        heads: int = 8,
-        dim_head: int = 64,
-        dropout: float = 0.0,
-        bias: bool = False,
-        added_kv_proj_dim: Optional[int] = None,
-        added_proj_bias: Optional[bool] = True,
-        out_bias: bool = True,
-        eps: float = 1e-5,
-        out_dim: int = None,
-        context_pre_only: Optional[bool] = None,
-        pre_only: bool = False,
-        elementwise_affine: bool = True,
-        processor=None,
-    ):
-        super().__init__()
-
-        self.head_dim = dim_head
-        self.inner_dim = out_dim if out_dim is not None else dim_head * heads
-        self.query_dim = query_dim
-        self.use_bias = bias
-        self.dropout = dropout
-        self.out_dim = out_dim if out_dim is not None else query_dim
-        self.context_pre_only = context_pre_only
-        self.pre_only = pre_only
-        self.heads = out_dim // dim_head if out_dim is not None else heads
-        self.added_kv_proj_dim = added_kv_proj_dim
-        self.added_proj_bias = added_proj_bias
-
-        self.norm_q = torch.nn.RMSNorm(dim_head, eps=eps, elementwise_affine=elementwise_affine)
-        self.norm_k = torch.nn.RMSNorm(dim_head, eps=eps, elementwise_affine=elementwise_affine)
-        self.to_q = torch.nn.Linear(query_dim, self.inner_dim, bias=bias)
-        self.to_k = torch.nn.Linear(query_dim, self.inner_dim, bias=bias)
-        self.to_v = torch.nn.Linear(query_dim, self.inner_dim, bias=bias)
-
-        if not self.pre_only:
-            self.to_out = torch.nn.ModuleList([])
-            self.to_out.append(torch.nn.Linear(self.inner_dim, self.out_dim, bias=out_bias))
-            self.to_out.append(torch.nn.Dropout(dropout))
-
-        if added_kv_proj_dim is not None:
-            self.norm_added_q = torch.nn.RMSNorm(dim_head, eps=eps)
-            self.norm_added_k = torch.nn.RMSNorm(dim_head, eps=eps)
-            self.add_q_proj = torch.nn.Linear(added_kv_proj_dim, self.inner_dim, bias=added_proj_bias)
-            self.add_k_proj = torch.nn.Linear(added_kv_proj_dim, self.inner_dim, bias=added_proj_bias)
-            self.add_v_proj = torch.nn.Linear(added_kv_proj_dim, self.inner_dim, bias=added_proj_bias)
-            self.to_add_out = torch.nn.Linear(self.inner_dim, query_dim, bias=out_bias)
-
-        if processor is None:
-            processor = self._default_processor_cls()
-        self.set_processor(processor)
-
-    def forward(
-        self,
-        hidden_states: torch.Tensor,
-        encoder_hidden_states: Optional[torch.Tensor] = None,
-        attention_mask: Optional[torch.Tensor] = None,
-        image_rotary_emb: Optional[torch.Tensor] = None,
-        **kwargs,
-    ) -> torch.Tensor:
-        attn_parameters = set(inspect.signature(self.processor.__call__).parameters.keys())
-        quiet_attn_parameters = {"ip_adapter_masks", "ip_hidden_states"}
-        unused_kwargs = [k for k, _ in kwargs.items() if k not in attn_parameters and k not in quiet_attn_parameters]
-        if len(unused_kwargs) > 0:
-            logger.warning(
-                f"joint_attention_kwargs {unused_kwargs} are not expected by {self.processor.__class__.__name__} and will be ignored."
-            )
-        kwargs = {k: w for k, w in kwargs.items() if k in attn_parameters}
-        return self.processor(self, hidden_states, encoder_hidden_states, attention_mask, image_rotary_emb, **kwargs)
-
-
-@maybe_allow_in_graph
-class LongCatImageSingleTransformerBlock(nn.Module):
-    def __init__(self, dim: int, num_attention_heads: int, attention_head_dim: int, mlp_ratio: float = 4.0):
-        super().__init__()
-        self.mlp_hidden_dim = int(dim * mlp_ratio)
-
-        self.norm = AdaLayerNormZeroSingle(dim)
-        self.proj_mlp = nn.Linear(dim, self.mlp_hidden_dim)
-        self.act_mlp = nn.GELU(approximate="tanh")
-        self.proj_out = nn.Linear(dim + self.mlp_hidden_dim, dim)
-
-        self.attn = LongCatImageAttention(
-            query_dim=dim,
-            dim_head=attention_head_dim,
-            heads=num_attention_heads,
-            out_dim=dim,
-            bias=True,
-            processor=LongCatImageAttnProcessor(),
-            eps=1e-6,
-            pre_only=True,
-        )
-
-    def forward(
-        self,
-        hidden_states: torch.Tensor,
-        encoder_hidden_states: torch.Tensor,
-        temb: torch.Tensor,
-        image_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
-        joint_attention_kwargs: Optional[Dict[str, Any]] = None,
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
-        text_seq_len = encoder_hidden_states.shape[1]
-        hidden_states = torch.cat([encoder_hidden_states, hidden_states], dim=1)
-
-        residual = hidden_states
-        norm_hidden_states, gate = self.norm(hidden_states, emb=temb)
-        mlp_hidden_states = self.act_mlp(self.proj_mlp(norm_hidden_states))
-        joint_attention_kwargs = joint_attention_kwargs or {}
-        attn_output = self.attn(
-            hidden_states=norm_hidden_states,
-            image_rotary_emb=image_rotary_emb,
-            **joint_attention_kwargs,
-        )
-
-        hidden_states = torch.cat([attn_output, mlp_hidden_states], dim=2)
-        gate = gate.unsqueeze(1)
-        hidden_states = gate * self.proj_out(hidden_states)
-        hidden_states = residual + hidden_states
-        if hidden_states.dtype == torch.float16:
-            hidden_states = hidden_states.clip(-65504, 65504)
-
-        encoder_hidden_states, hidden_states = hidden_states[:, :text_seq_len], hidden_states[:, text_seq_len:]
-        return encoder_hidden_states, hidden_states
-
-
-@maybe_allow_in_graph
-class LongCatImageTransformerBlock(nn.Module):
-    def __init__(
-        self, dim: int, num_attention_heads: int, attention_head_dim: int, qk_norm: str = "rms_norm", eps: float = 1e-6
-    ):
-        super().__init__()
-
-        self.norm1 = AdaLayerNormZero(dim)
-        self.norm1_context = AdaLayerNormZero(dim)
-
-        self.attn = LongCatImageAttention(
-            query_dim=dim,
-            added_kv_proj_dim=dim,
-            dim_head=attention_head_dim,
-            heads=num_attention_heads,
-            out_dim=dim,
-            context_pre_only=False,
-            bias=True,
-            processor=LongCatImageAttnProcessor(),
-            eps=eps,
-        )
-
-        self.norm2 = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
-        self.ff = FeedForward(dim=dim, dim_out=dim, activation_fn="gelu-approximate")
-
-        self.norm2_context = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
-        self.ff_context = FeedForward(dim=dim, dim_out=dim, activation_fn="gelu-approximate")
-
-    def forward(
-        self,
-        hidden_states: torch.Tensor,
-        encoder_hidden_states: torch.Tensor,
-        temb: torch.Tensor,
-        image_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
-        joint_attention_kwargs: Optional[Dict[str, Any]] = None,
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
-        norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1(hidden_states, emb=temb)
-
-        norm_encoder_hidden_states, c_gate_msa, c_shift_mlp, c_scale_mlp, c_gate_mlp = self.norm1_context(
-            encoder_hidden_states, emb=temb
-        )
-        joint_attention_kwargs = joint_attention_kwargs or {}
-
-        # Attention.
-        attention_outputs = self.attn(
-            hidden_states=norm_hidden_states,
-            encoder_hidden_states=norm_encoder_hidden_states,
-            image_rotary_emb=image_rotary_emb,
-            **joint_attention_kwargs,
-        )
-
-        if len(attention_outputs) == 2:
-            attn_output, context_attn_output = attention_outputs
-        elif len(attention_outputs) == 3:
-            attn_output, context_attn_output, ip_attn_output = attention_outputs
-
-        # Process attention outputs for the `hidden_states`.
-        attn_output = gate_msa.unsqueeze(1) * attn_output
-        hidden_states = hidden_states + attn_output
-
-        norm_hidden_states = self.norm2(hidden_states)
-        norm_hidden_states = norm_hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None]
-
-        ff_output = self.ff(norm_hidden_states)
-        ff_output = gate_mlp.unsqueeze(1) * ff_output
-
-        hidden_states = hidden_states + ff_output
-        if len(attention_outputs) == 3:
-            hidden_states = hidden_states + ip_attn_output
-
-        # Process attention outputs for the `encoder_hidden_states`.
-        context_attn_output = c_gate_msa.unsqueeze(1) * context_attn_output
-        encoder_hidden_states = encoder_hidden_states + context_attn_output
-
-        norm_encoder_hidden_states = self.norm2_context(encoder_hidden_states)
-        norm_encoder_hidden_states = norm_encoder_hidden_states * (1 + c_scale_mlp[:, None]) + c_shift_mlp[:, None]
-
-        context_ff_output = self.ff_context(norm_encoder_hidden_states)
-        encoder_hidden_states = encoder_hidden_states + c_gate_mlp.unsqueeze(1) * context_ff_output
-        if encoder_hidden_states.dtype == torch.float16:
-            encoder_hidden_states = encoder_hidden_states.clip(-65504, 65504)
-
-        return encoder_hidden_states, hidden_states
-
-
-class LongCatImagePosEmbed(nn.Module):
-    def __init__(self, theta: int, axes_dim: List[int]):
-        super().__init__()
-        self.theta = theta
-        self.axes_dim = axes_dim
-
-    def forward(self, ids: torch.Tensor) -> torch.Tensor:
-        n_axes = ids.shape[-1]
-        cos_out = []
-        sin_out = []
-        pos = ids.float()
-        is_mps = ids.device.type == "mps"
-        is_npu = ids.device.type == "npu"
-        freqs_dtype = torch.float32 if (is_mps or is_npu) else torch.float64
-        for i in range(n_axes):
-            cos, sin = get_1d_rotary_pos_embed(
-                self.axes_dim[i],
-                pos[:, i],
-                theta=self.theta,
-                repeat_interleave_real=True,
-                use_real=True,
-                freqs_dtype=freqs_dtype,
-            )
-            cos_out.append(cos)
-            sin_out.append(sin)
-        freqs_cos = torch.cat(cos_out, dim=-1).to(ids.device)
-        freqs_sin = torch.cat(sin_out, dim=-1).to(ids.device)
-        return freqs_cos, freqs_sin
-
-
-class LongCatImageTimestepEmbeddings(nn.Module):
-    def __init__(self, embedding_dim):
-        super().__init__()
-
-        self.time_proj = Timesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=0)
-        self.timestep_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=embedding_dim)
-
-    def forward(self, timestep, hidden_dtype):
-        timesteps_proj = self.time_proj(timestep)
-        timesteps_emb = self.timestep_embedder(timesteps_proj.to(dtype=hidden_dtype))  # (N, D)
-
-        return timesteps_emb
-
-
-class LongCatImageTransformer2DModel(
-    ModelMixin,
-    ConfigMixin,
-    PeftAdapterMixin,
-    FromOriginalModelMixin,
-    CacheMixin,
-):
-    """
-    The Transformer model introduced in Longcat-Image.
-    """
-
-    _supports_gradient_checkpointing = True
-
-    @register_to_config
-    def __init__(
-        self,
-        patch_size: int = 1,
-        in_channels: int = 64,
-        num_layers: int = 19,
-        num_single_layers: int = 38,
-        attention_head_dim: int = 128,
-        num_attention_heads: int = 24,
-        joint_attention_dim: int = 3584,
-        pooled_projection_dim: int = 3584,
-        axes_dims_rope: List[int] = [16, 56, 56],
-    ):
-        super().__init__()
-        self.out_channels = in_channels
-        self.inner_dim = num_attention_heads * attention_head_dim
-        self.pooled_projection_dim = pooled_projection_dim
-
-        self.pos_embed = LongCatImagePosEmbed(theta=10000, axes_dim=axes_dims_rope)
-
-        self.time_embed = LongCatImageTimestepEmbeddings(embedding_dim=self.inner_dim)
-
-        self.context_embedder = nn.Linear(joint_attention_dim, self.inner_dim)
-        self.x_embedder = torch.nn.Linear(in_channels, self.inner_dim)
-
-        self.transformer_blocks = nn.ModuleList(
-            [
-                LongCatImageTransformerBlock(
-                    dim=self.inner_dim,
-                    num_attention_heads=num_attention_heads,
-                    attention_head_dim=attention_head_dim,
-                )
-                for i in range(num_layers)
-            ]
-        )
-
-        self.single_transformer_blocks = nn.ModuleList(
-            [
-                LongCatImageSingleTransformerBlock(
-                    dim=self.inner_dim,
-                    num_attention_heads=num_attention_heads,
-                    attention_head_dim=attention_head_dim,
-                )
-                for i in range(num_single_layers)
-            ]
-        )
-
-        self.norm_out = AdaLayerNormContinuous(self.inner_dim, self.inner_dim, elementwise_affine=False, eps=1e-6)
-        self.proj_out = nn.Linear(self.inner_dim, patch_size * patch_size * self.out_channels, bias=True)
-
-        self.gradient_checkpointing = False
-        self.use_checkpoint = [True] * num_layers
-        self.use_single_checkpoint = [True] * num_single_layers
-
-    def forward(
-        self,
-        hidden_states: torch.Tensor,
-        encoder_hidden_states: torch.Tensor = None,
-        timestep: torch.LongTensor = None,
-        img_ids: torch.Tensor = None,
-        txt_ids: torch.Tensor = None,
-        guidance: torch.Tensor = None,
-        return_dict: bool = True,
-    ) -> Union[torch.FloatTensor, Transformer2DModelOutput]:
-        """
-        The forward method.
-
-        Args:
-            hidden_states (`torch.FloatTensor` of shape `(batch size, channel, height, width)`):
-                Input `hidden_states`.
-            encoder_hidden_states (`torch.FloatTensor` of shape `(batch size, sequence_len, embed_dims)`):
-                Conditional embeddings (embeddings computed from the input conditions such as prompts) to use.
-            timestep ( `torch.LongTensor`):
-                Used to indicate denoising step.
-            block_controlnet_hidden_states: (`list` of `torch.Tensor`):
-                A list of tensors that if specified are added to the residuals of transformer blocks.
-            return_dict (`bool`, *optional*, defaults to `True`):
-                Whether or not to return a [`~models.transformer_2d.Transformer2DModelOutput`] instead of a plain
-                tuple.
-
-        Returns:
-            If `return_dict` is True, an [`~models.transformer_2d.Transformer2DModelOutput`] is returned, otherwise a
-            `tuple` where the first element is the sample tensor.
-        """
-        hidden_states = self.x_embedder(hidden_states)
-
-        timestep = timestep.to(hidden_states.dtype) * 1000
-
-        temb = self.time_embed(timestep, hidden_states.dtype)
-        encoder_hidden_states = self.context_embedder(encoder_hidden_states)
-
-        ids = torch.cat((txt_ids, img_ids), dim=0)
-        if is_torch_npu_available():
-            freqs_cos, freqs_sin = self.pos_embed(ids.cpu())
-            image_rotary_emb = (freqs_cos.npu(), freqs_sin.npu())
-        else:
-            image_rotary_emb = self.pos_embed(ids)
-
-        for index_block, block in enumerate(self.transformer_blocks):
-            if torch.is_grad_enabled() and self.gradient_checkpointing and self.use_checkpoint[index_block]:
-                encoder_hidden_states, hidden_states = self._gradient_checkpointing_func(
-                    block,
-                    hidden_states,
-                    encoder_hidden_states,
-                    temb,
-                    image_rotary_emb,
-                )
-            else:
-                encoder_hidden_states, hidden_states = block(
-                    hidden_states=hidden_states,
-                    encoder_hidden_states=encoder_hidden_states,
-                    temb=temb,
-                    image_rotary_emb=image_rotary_emb,
-                )
-
-        for index_block, block in enumerate(self.single_transformer_blocks):
-            if torch.is_grad_enabled() and self.gradient_checkpointing and self.use_single_checkpoint[index_block]:
-                encoder_hidden_states, hidden_states = self._gradient_checkpointing_func(
-                    block,
-                    hidden_states,
-                    encoder_hidden_states,
-                    temb,
-                    image_rotary_emb,
-                )
-            else:
-                encoder_hidden_states, hidden_states = block(
-                    hidden_states=hidden_states,
-                    encoder_hidden_states=encoder_hidden_states,
-                    temb=temb,
-                    image_rotary_emb=image_rotary_emb,
-                )
-
-        hidden_states = self.norm_out(hidden_states, temb)
-        output = self.proj_out(hidden_states)
-
-        if not return_dict:
-            return (output,)
-
-        return Transformer2DModelOutput(sample=output)
--- a/src/diffusers/models/transformers/transformer_qwenimage.py
+++ b/src/diffusers/models/transformers/transformer_qwenimage.py
@@ -143,26 +143,17 @@ def apply_rotary_emb_qwen(


 class QwenTimestepProjEmbeddings(nn.Module):
-    def __init__(self, embedding_dim, use_additional_t_cond=False):
+    def __init__(self, embedding_dim):
        super().__init__()

        self.time_proj = Timesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=0, scale=1000)
        self.timestep_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=embedding_dim)
-        self.use_additional_t_cond = use_additional_t_cond
-        if use_additional_t_cond:
-            self.addition_t_embedding = nn.Embedding(2, embedding_dim)

-    def forward(self, timestep, hidden_states, addition_t_cond=None):
+    def forward(self, timestep, hidden_states):
        timesteps_proj = self.time_proj(timestep)
        timesteps_emb = self.timestep_embedder(timesteps_proj.to(dtype=hidden_states.dtype))  # (N, D)

        conditioning = timesteps_emb
-        if self.use_additional_t_cond:
-            if addition_t_cond is None:
-                raise ValueError("When additional_t_cond is True, addition_t_cond must be provided.")
-            addition_t_emb = self.addition_t_embedding(addition_t_cond)
-            addition_t_emb = addition_t_emb.to(dtype=hidden_states.dtype)
-            conditioning = conditioning + addition_t_emb

        return conditioning

@@ -268,120 +259,6 @@ class QwenEmbedRope(nn.Module):
        return freqs.clone().contiguous()


-class QwenEmbedLayer3DRope(nn.Module):
-    def __init__(self, theta: int, axes_dim: List[int], scale_rope=False):
-        super().__init__()
-        self.theta = theta
-        self.axes_dim = axes_dim
-        pos_index = torch.arange(4096)
-        neg_index = torch.arange(4096).flip(0) * -1 - 1
-        self.pos_freqs = torch.cat(
-            [
-                self.rope_params(pos_index, self.axes_dim[0], self.theta),
-                self.rope_params(pos_index, self.axes_dim[1], self.theta),
-                self.rope_params(pos_index, self.axes_dim[2], self.theta),
-            ],
-            dim=1,
-        )
-        self.neg_freqs = torch.cat(
-            [
-                self.rope_params(neg_index, self.axes_dim[0], self.theta),
-                self.rope_params(neg_index, self.axes_dim[1], self.theta),
-                self.rope_params(neg_index, self.axes_dim[2], self.theta),
-            ],
-            dim=1,
-        )
-
-        self.scale_rope = scale_rope
-
-    def rope_params(self, index, dim, theta=10000):
-        """
-        Args:
-            index: [0, 1, 2, 3] 1D Tensor representing the position index of the token
-        """
-        assert dim % 2 == 0
-        freqs = torch.outer(index, 1.0 / torch.pow(theta, torch.arange(0, dim, 2).to(torch.float32).div(dim)))
-        freqs = torch.polar(torch.ones_like(freqs), freqs)
-        return freqs
-
-    def forward(self, video_fhw, txt_seq_lens, device):
-        """
-        Args: video_fhw: [frame, height, width] a list of 3 integers representing the shape of the video Args:
-        txt_length: [bs] a list of 1 integers representing the length of the text
-        """
-        if self.pos_freqs.device != device:
-            self.pos_freqs = self.pos_freqs.to(device)
-            self.neg_freqs = self.neg_freqs.to(device)
-
-        if isinstance(video_fhw, list):
-            video_fhw = video_fhw[0]
-        if not isinstance(video_fhw, list):
-            video_fhw = [video_fhw]
-
-        vid_freqs = []
-        max_vid_index = 0
-        layer_num = len(video_fhw) - 1
-        for idx, fhw in enumerate(video_fhw):
-            frame, height, width = fhw
-            if idx != layer_num:
-                video_freq = self._compute_video_freqs(frame, height, width, idx)
-            else:
-                ### For the condition image, we set the layer index to -1
-                video_freq = self._compute_condition_freqs(frame, height, width)
-            video_freq = video_freq.to(device)
-            vid_freqs.append(video_freq)
-
-            if self.scale_rope:
-                max_vid_index = max(height // 2, width // 2, max_vid_index)
-            else:
-                max_vid_index = max(height, width, max_vid_index)
-
-        max_vid_index = max(max_vid_index, layer_num)
-        max_len = max(txt_seq_lens)
-        txt_freqs = self.pos_freqs[max_vid_index : max_vid_index + max_len, ...]
-        vid_freqs = torch.cat(vid_freqs, dim=0)
-
-        return vid_freqs, txt_freqs
-
-    @functools.lru_cache(maxsize=None)
-    def _compute_video_freqs(self, frame, height, width, idx=0):
-        seq_lens = frame * height * width
-        freqs_pos = self.pos_freqs.split([x // 2 for x in self.axes_dim], dim=1)
-        freqs_neg = self.neg_freqs.split([x // 2 for x in self.axes_dim], dim=1)
-
-        freqs_frame = freqs_pos[0][idx : idx + frame].view(frame, 1, 1, -1).expand(frame, height, width, -1)
-        if self.scale_rope:
-            freqs_height = torch.cat([freqs_neg[1][-(height - height // 2) :], freqs_pos[1][: height // 2]], dim=0)
-            freqs_height = freqs_height.view(1, height, 1, -1).expand(frame, height, width, -1)
-            freqs_width = torch.cat([freqs_neg[2][-(width - width // 2) :], freqs_pos[2][: width // 2]], dim=0)
-            freqs_width = freqs_width.view(1, 1, width, -1).expand(frame, height, width, -1)
-        else:
-            freqs_height = freqs_pos[1][:height].view(1, height, 1, -1).expand(frame, height, width, -1)
-            freqs_width = freqs_pos[2][:width].view(1, 1, width, -1).expand(frame, height, width, -1)
-
-        freqs = torch.cat([freqs_frame, freqs_height, freqs_width], dim=-1).reshape(seq_lens, -1)
-        return freqs.clone().contiguous()
-
-    @functools.lru_cache(maxsize=None)
-    def _compute_condition_freqs(self, frame, height, width):
-        seq_lens = frame * height * width
-        freqs_pos = self.pos_freqs.split([x // 2 for x in self.axes_dim], dim=1)
-        freqs_neg = self.neg_freqs.split([x // 2 for x in self.axes_dim], dim=1)
-
-        freqs_frame = freqs_neg[0][-1:].view(frame, 1, 1, -1).expand(frame, height, width, -1)
-        if self.scale_rope:
-            freqs_height = torch.cat([freqs_neg[1][-(height - height // 2) :], freqs_pos[1][: height // 2]], dim=0)
-            freqs_height = freqs_height.view(1, height, 1, -1).expand(frame, height, width, -1)
-            freqs_width = torch.cat([freqs_neg[2][-(width - width // 2) :], freqs_pos[2][: width // 2]], dim=0)
-            freqs_width = freqs_width.view(1, 1, width, -1).expand(frame, height, width, -1)
-        else:
-            freqs_height = freqs_pos[1][:height].view(1, height, 1, -1).expand(frame, height, width, -1)
-            freqs_width = freqs_pos[2][:width].view(1, 1, width, -1).expand(frame, height, width, -1)
-
-        freqs = torch.cat([freqs_frame, freqs_height, freqs_width], dim=-1).reshape(seq_lens, -1)
-        return freqs.clone().contiguous()
-
-
 class QwenDoubleStreamAttnProcessor2_0:
    """
    Attention processor for Qwen double-stream architecture, matching DoubleStreamLayerMegatron logic. This processor
@@ -701,21 +578,14 @@ class QwenImageTransformer2DModel(
        guidance_embeds: bool = False,  # TODO: this should probably be removed
        axes_dims_rope: Tuple[int, int, int] = (16, 56, 56),
        zero_cond_t: bool = False,
-        use_additional_t_cond: bool = False,
-        use_layer3d_rope: bool = False,
    ):
        super().__init__()
        self.out_channels = out_channels or in_channels
        self.inner_dim = num_attention_heads * attention_head_dim

-        if not use_layer3d_rope:
-            self.pos_embed = QwenEmbedRope(theta=10000, axes_dim=list(axes_dims_rope), scale_rope=True)
-        else:
-            self.pos_embed = QwenEmbedLayer3DRope(theta=10000, axes_dim=list(axes_dims_rope), scale_rope=True)
+        self.pos_embed = QwenEmbedRope(theta=10000, axes_dim=list(axes_dims_rope), scale_rope=True)

-        self.time_text_embed = QwenTimestepProjEmbeddings(
-            embedding_dim=self.inner_dim, use_additional_t_cond=use_additional_t_cond
-        )
+        self.time_text_embed = QwenTimestepProjEmbeddings(embedding_dim=self.inner_dim)

        self.txt_norm = RMSNorm(joint_attention_dim, eps=1e-6)

@@ -751,7 +621,6 @@ class QwenImageTransformer2DModel(
        guidance: torch.Tensor = None,  # TODO: this should probably be removed
        attention_kwargs: Optional[Dict[str, Any]] = None,
        controlnet_block_samples=None,
-        additional_t_cond=None,
        return_dict: bool = True,
    ) -> Union[torch.Tensor, Transformer2DModelOutput]:
        """
@@ -814,9 +683,9 @@ class QwenImageTransformer2DModel(
            guidance = guidance.to(hidden_states.dtype) * 1000

        temb = (
-            self.time_text_embed(timestep, hidden_states, additional_t_cond)
+            self.time_text_embed(timestep, hidden_states)
            if guidance is None
-            else self.time_text_embed(timestep, guidance, hidden_states, additional_t_cond)
+            else self.time_text_embed(timestep, guidance, hidden_states)
        )

        image_rotary_emb = self.pos_embed(img_shapes, txt_seq_lens, device=hidden_states.device)
--- a/src/diffusers/models/transformers/transformer_z_image.py
+++ b/src/diffusers/models/transformers/transformer_z_image.py
@@ -13,7 +13,7 @@
 # limitations under the License.

 import math
-from typing import Dict, List, Optional, Tuple
+from typing import List, Optional, Tuple

 import torch
 import torch.nn as nn
@@ -536,7 +536,6 @@ class ZImageTransformer2DModel(ModelMixin, ConfigMixin, PeftAdapterMixin, FromOr
        x: List[torch.Tensor],
        t,
        cap_feats: List[torch.Tensor],
-        controlnet_block_samples: Optional[Dict[int, torch.Tensor]] = None,
        patch_size=2,
        f_patch_size=1,
        return_dict: bool = True,
@@ -636,19 +635,13 @@ class ZImageTransformer2DModel(ModelMixin, ConfigMixin, PeftAdapterMixin, FromOr
            unified_attn_mask[i, :seq_len] = 1

        if torch.is_grad_enabled() and self.gradient_checkpointing:
-            for layer_idx, layer in enumerate(self.layers):
+            for layer in self.layers:
                unified = self._gradient_checkpointing_func(
                    layer, unified, unified_attn_mask, unified_freqs_cis, adaln_input
                )
-                if controlnet_block_samples is not None:
-                    if layer_idx in controlnet_block_samples:
-                        unified = unified + controlnet_block_samples[layer_idx]
        else:
-            for layer_idx, layer in enumerate(self.layers):
+            for layer in self.layers:
                unified = layer(unified, unified_attn_mask, unified_freqs_cis, adaln_input)
-                if controlnet_block_samples is not None:
-                    if layer_idx in controlnet_block_samples:
-                        unified = unified + controlnet_block_samples[layer_idx]

        unified = self.all_final_layer[f"{patch_size}-{f_patch_size}"](unified, adaln_input)
        unified = list(unified.unbind(dim=0))
--- a/src/diffusers/modular_pipelines/flux/modular_blocks.py
+++ b/src/diffusers/modular_pipelines/flux/modular_blocks.py
@@ -360,7 +360,7 @@ class FluxKontextCoreDenoiseStep(SequentialPipelineBlocks):
 AUTO_BLOCKS = InsertableDict(
    [
        ("text_encoder", FluxTextEncoderStep()),
-        ("vae_encoder", FluxAutoVaeEncoderStep()),
+        ("image_encoder", FluxAutoVaeEncoderStep()),
        ("denoise", FluxCoreDenoiseStep()),
        ("decode", FluxDecodeStep()),
    ]
@@ -369,7 +369,7 @@ AUTO_BLOCKS = InsertableDict(
 AUTO_BLOCKS_KONTEXT = InsertableDict(
    [
        ("text_encoder", FluxTextEncoderStep()),
-        ("vae_encoder", FluxKontextAutoVaeEncoderStep()),
+        ("image_encoder", FluxKontextAutoVaeEncoderStep()),
        ("denoise", FluxKontextCoreDenoiseStep()),
        ("decode", FluxDecodeStep()),
    ]
--- a/src/diffusers/modular_pipelines/mellon_node_utils.py
+++ b/src/diffusers/modular_pipelines/mellon_node_utils.py
--- a/src/diffusers/modular_pipelines/modular_pipeline.py
+++ b/src/diffusers/modular_pipelines/modular_pipeline.py
@@ -231,7 +231,7 @@ class BlockState:

 class ModularPipelineBlocks(ConfigMixin, PushToHubMixin):
    """
-    Base class for all Pipeline Blocks: ConditionalPipelineBlocks, AutoPipelineBlocks, SequentialPipelineBlocks,
+    Base class for all Pipeline Blocks: PipelineBlock, AutoPipelineBlocks, SequentialPipelineBlocks,
    LoopSequentialPipelineBlocks

    [`ModularPipelineBlocks`] provides method to load and save the definition of pipeline blocks.
@@ -501,19 +501,15 @@ class ModularPipelineBlocks(ConfigMixin, PushToHubMixin):

    @property
    def input_names(self) -> List[str]:
-        return [input_param.name for input_param in self.inputs if input_param.name is not None]
+        return [input_param.name for input_param in self.inputs]

    @property
    def intermediate_output_names(self) -> List[str]:
-        return [output_param.name for output_param in self.intermediate_outputs if output_param.name is not None]
+        return [output_param.name for output_param in self.intermediate_outputs]

    @property
    def output_names(self) -> List[str]:
-        return [output_param.name for output_param in self.outputs if output_param.name is not None]
-
-    @property
-    def component_names(self) -> List[str]:
-        return [component.name for component in self.expected_components]
+        return [output_param.name for output_param in self.outputs]

    @property
    def doc(self):
@@ -527,10 +523,9 @@ class ModularPipelineBlocks(ConfigMixin, PushToHubMixin):
        )


-class ConditionalPipelineBlocks(ModularPipelineBlocks):
+class AutoPipelineBlocks(ModularPipelineBlocks):
    """
-    A Pipeline Blocks that conditionally selects a block to run based on the inputs.
-    Subclasses must implement the `select_block` method to define the logic for selecting the block.
+    A Pipeline Blocks that automatically selects a block to run based on the inputs.

    This class inherits from [`ModularPipelineBlocks`]. Check the superclass documentation for the generic methods the
    library implements for all the pipeline blocks (such as loading or saving etc.)
@@ -540,13 +535,12 @@ class ConditionalPipelineBlocks(ModularPipelineBlocks):
    Attributes:
        block_classes: List of block classes to be used
        block_names: List of prefixes for each block
-        block_trigger_inputs: List of input names that select_block() uses to determine which block to run
+        block_trigger_inputs: List of input names that trigger specific blocks, with None for default
    """

    block_classes = []
    block_names = []
    block_trigger_inputs = []
-    default_block_name = None # name of the default block if no trigger inputs are provided, if None, this block can be skipped if no trigger inputs are provided

    def __init__(self):
        sub_blocks = InsertableDict()
@@ -556,15 +550,26 @@ class ConditionalPipelineBlocks(ModularPipelineBlocks):
            else:
                sub_blocks[block_name] = block
        self.sub_blocks = sub_blocks
-        if not (len(self.block_classes) == len(self.block_names)):
+        if not (len(self.block_classes) == len(self.block_names) == len(self.block_trigger_inputs)):
            raise ValueError(
-                f"In {self.__class__.__name__}, the number of block_classes and block_names must be the same."
+                f"In {self.__class__.__name__}, the number of block_classes, block_names, and block_trigger_inputs must be the same."
            )
-        if self.default_block_name is not None and self.default_block_name not in self.block_names:
+        default_blocks = [t for t in self.block_trigger_inputs if t is None]
+        # can only have 1 or 0 default block, and has to put in the last
+        # the order of blocks matters here because the first block with matching trigger will be dispatched
+        # e.g. blocks = [inpaint, img2img] and block_trigger_inputs = ["mask", "image"]
+        # as long as mask is provided, it is inpaint; if only image is provided, it is img2img
+        if len(default_blocks) > 1 or (len(default_blocks) == 1 and self.block_trigger_inputs[-1] is not None):
            raise ValueError(
-                f"In {self.__class__.__name__}, default_block_name '{self.default_block_name}' must be one of block_names: {self.block_names}"
+                f"In {self.__class__.__name__}, at most one None may be specified in block_trigger_inputs, "
+                "and it must be the last element."
            )

+        # Map trigger inputs to block objects
+        self.trigger_to_block_map = dict(zip(self.block_trigger_inputs, self.sub_blocks.values()))
+        self.trigger_to_block_name_map = dict(zip(self.block_trigger_inputs, self.sub_blocks.keys()))
+        self.block_to_trigger_map = dict(zip(self.sub_blocks.keys(), self.block_trigger_inputs))
+
    @property
    def model_name(self):
        return next(iter(self.sub_blocks.values())).model_name
@@ -593,11 +598,8 @@ class ConditionalPipelineBlocks(ModularPipelineBlocks):

    @property
    def required_inputs(self) -> List[str]:
-
-        # no default block means this conditional block can be skipped entirely
-        if self.default_block_name is None:
+        if None not in self.block_trigger_inputs:
            return []
-        
        first_block = next(iter(self.sub_blocks.values()))
        required_by_all = set(getattr(first_block, "required_inputs", set()))

@@ -608,7 +610,7 @@ class ConditionalPipelineBlocks(ModularPipelineBlocks):

        return list(required_by_all)

-
+    # YiYi TODO: add test for this
    @property
    def inputs(self) -> List[Tuple[str, Any]]:
        named_inputs = [(name, block.inputs) for name, block in self.sub_blocks.items()]
@@ -633,69 +635,22 @@ class ConditionalPipelineBlocks(ModularPipelineBlocks):
        combined_outputs = self.combine_outputs(*named_outputs)
        return combined_outputs

-    def _get_trigger_inputs(self) -> set:
-        """
-        Returns a set of all unique trigger input values found in this block and nested blocks.
-        """
-
-        def fn_recursive_get_trigger(blocks):
-            trigger_values = set()
-
-            if blocks is not None:
-                for name, block in blocks.items():
-                    # Check if current block has block_trigger_inputs
-                    if hasattr(block, "block_trigger_inputs") and block.block_trigger_inputs is not None:
-                        trigger_values.update(t for t in block.block_trigger_inputs if t is not None)
-
-                    # If block has sub_blocks, recursively check them
-                    if block.sub_blocks:
-                        nested_triggers = fn_recursive_get_trigger(block.sub_blocks)
-                        trigger_values.update(nested_triggers)
-
-            return trigger_values
-
-        # Start with this block's block_trigger_inputs
-        all_triggers = set(t for t in self.block_trigger_inputs if t is not None)
-        # Add nested triggers
-        all_triggers.update(fn_recursive_get_trigger(self.sub_blocks))
-
-        return all_triggers
-
-    @property
-    def trigger_inputs(self):
-        """All trigger inputs including from nested blocks."""
-        return self._get_trigger_inputs()
-
-    def select_block(self, **kwargs) -> Optional[str]:
-        """
-        Select the block to run based on the trigger inputs.
-        Subclasses must implement this method to define the logic for selecting the block.
-
-        Args:
-            **kwargs: Trigger input names and their values from the state.
-
-        Returns:
-            Optional[str]: The name of the block to run, or None to use default/skip.
-        """
-        raise NotImplementedError(f"Subclass {self.__class__.__name__} must implement the `select_block` method.")
-
    @torch.no_grad()
    def __call__(self, pipeline, state: PipelineState) -> PipelineState:
-        
-        trigger_kwargs = {name: state.get(name) for name in self.block_trigger_inputs if name is not None}
-        block_name = self.select_block(**trigger_kwargs)
+        # Find default block first (if any)

-        if block_name is None:
-            block_name = self.default_block_name
+        block = self.trigger_to_block_map.get(None)
+        for input_name in self.block_trigger_inputs:
+            if input_name is not None and state.get(input_name) is not None:
+                block = self.trigger_to_block_map[input_name]
+                break

-        if block_name is None:
-            logger.info(f"skipping conditional block: {self.__class__.__name__}")
+        if block is None:
+            logger.info(f"skipping auto block: {self.__class__.__name__}")
            return pipeline, state
-        
-        block = self.sub_blocks[block_name]

        try:
-            logger.info(f"Running block: {block.__class__.__name__}")
+            logger.info(f"Running block: {block.__class__.__name__}, trigger: {input_name}")
            return block(pipeline, state)
        except Exception as e:
            error_msg = (
@@ -706,6 +661,38 @@ class ConditionalPipelineBlocks(ModularPipelineBlocks):
            logger.error(error_msg)
            raise

+    def _get_trigger_inputs(self):
+        """
+        Collects the unique trigger inputs declared by this block and, recursively, by any nested sub-blocks. The
+        returned set keeps this block's own None (default) entry if present; nested None entries are dropped.
+        """
+
+        def fn_recursive_get_trigger(blocks):
+            trigger_values = set()
+
+            if blocks is not None:
+                for name, block in blocks.items():
+                    # Only blocks exposing `block_trigger_inputs` (i.e. auto blocks) contribute triggers directly
+                    if hasattr(block, "block_trigger_inputs") and block.block_trigger_inputs is not None:
+                        # None marks a default block rather than a real input name, so it is filtered out here
+                        trigger_values.update(t for t in block.block_trigger_inputs if t is not None)
+
+                    # Descend into nested sub_blocks so triggers at any depth are collected
+                    if block.sub_blocks:
+                        nested_triggers = fn_recursive_get_trigger(block.sub_blocks)
+                        trigger_values.update(nested_triggers)
+
+            return trigger_values
+
+        trigger_inputs = set(self.block_trigger_inputs)
+        trigger_inputs.update(fn_recursive_get_trigger(self.sub_blocks))
+
+        return trigger_inputs
+
+    @property
+    def trigger_inputs(self):
+        return self._get_trigger_inputs()
+
    def __repr__(self):
        class_name = self.__class__.__name__
        base_class = self.__class__.__bases__[0].__name__
@@ -717,7 +704,7 @@ class ConditionalPipelineBlocks(ModularPipelineBlocks):
            header += "\n"
            header += "  " + "=" * 100 + "\n"
            header += "  This pipeline contains blocks that are selected at runtime based on inputs.\n"
-            header += f"  Trigger Inputs: {sorted(self.trigger_inputs)}\n"
+            header += f"  Trigger Inputs: {[inp for inp in self.trigger_inputs if inp is not None]}\n"
            header += "  " + "=" * 100 + "\n\n"

        # Format description with proper indentation
@@ -738,20 +725,31 @@ class ConditionalPipelineBlocks(ModularPipelineBlocks):
        expected_configs = getattr(self, "expected_configs", [])
        configs_str = format_configs(expected_configs, indent_level=2, add_empty_lines=False)

-        # Blocks section 
+        # Blocks section - moved to the end with simplified format
        blocks_str = "  Sub-Blocks:\n"
        for i, (name, block) in enumerate(self.sub_blocks.items()):
-            if name == self.default_block_name:
-                addtional_str  = " [default]"
+            # Get trigger input for this block
+            trigger = None
+            if hasattr(self, "block_to_trigger_map"):
+                trigger = self.block_to_trigger_map.get(name)
+                # Format the trigger info
+                if trigger is None:
+                    trigger_str = "[default]"
+                elif isinstance(trigger, (list, tuple)):
+                    trigger_str = f"[trigger: {', '.join(str(t) for t in trigger)}]"
+                else:
+                    trigger_str = f"[trigger: {trigger}]"
+                # For AutoPipelineBlocks, add bullet points
+                blocks_str += f"    • {name} {trigger_str} ({block.__class__.__name__})\n"
            else:
-                addtional_str = ""
-            blocks_str += f"    • {name}{addtional_str} ({block.__class__.__name__})\n"
+                # For SequentialPipelineBlocks, show execution order
+                blocks_str += f"    [{i}] {name} ({block.__class__.__name__})\n"

            # Add block description
-            block_desc_lines = block.description.split("\n")
-            indented_desc = block_desc_lines[0]
-            if len(block_desc_lines) > 1:
-                indented_desc += "\n" + "\n".join("                   " + line for line in block_desc_lines[1:])
+            desc_lines = block.description.split("\n")
+            indented_desc = desc_lines[0]
+            if len(desc_lines) > 1:
+                indented_desc += "\n" + "\n".join("                   " + line for line in desc_lines[1:])
            blocks_str += f"       Description: {indented_desc}\n\n"

        # Build the representation with conditional sections
@@ -782,35 +780,6 @@ class ConditionalPipelineBlocks(ModularPipelineBlocks):
        )


-class AutoPipelineBlocks(ConditionalPipelineBlocks):
-    """
-    A Pipeline Blocks that automatically selects a block to run based on the presence of trigger inputs.
-    """
-
-    def __init__(self):
-        super().__init__()
-
-        if not (len(self.block_classes) == len(self.block_names) == len(self.block_trigger_inputs)):
-            raise ValueError(
-                f"In {self.__class__.__name__}, the number of block_classes, block_names, and block_trigger_inputs must be the same."
-            )
-
-    @property
-    def default_block_name(self) -> Optional[str]:
-        """Derive default_block_name from block_trigger_inputs (None entry)."""
-        if None in self.block_trigger_inputs:
-            idx = self.block_trigger_inputs.index(None)
-            return self.block_names[idx]
-        return None
-
-    def select_block(self, **kwargs) -> Optional[str]:
-        """Select block based on which trigger input is present (not None)."""
-        for trigger_input, block_name in zip(self.block_trigger_inputs, self.block_names):
-            if trigger_input is not None and kwargs.get(trigger_input) is not None:
-                return block_name
-        return None
-
-
 class SequentialPipelineBlocks(ModularPipelineBlocks):
    """
    A Pipeline Blocks that combines multiple pipeline block classes into one. When called, it will call each block in
@@ -912,8 +881,7 @@ class SequentialPipelineBlocks(ModularPipelineBlocks):

            # Only add outputs if the block cannot be skipped
            should_add_outputs = True
-            if isinstance(block, ConditionalPipelineBlocks) and block.default_block_name is None:
-                # ConditionalPipelineBlocks without default can be skipped
+            if hasattr(block, "block_trigger_inputs") and None not in block.block_trigger_inputs:
                should_add_outputs = False

            if should_add_outputs:
@@ -976,7 +944,8 @@ class SequentialPipelineBlocks(ModularPipelineBlocks):

    def _get_trigger_inputs(self):
        """
-        Returns a set of all unique trigger input values found in the blocks.
+        Returns a set of all unique trigger input values found in the blocks. Returns: Set[str] containing all unique
+        block_trigger_inputs values
        """

        def fn_recursive_get_trigger(blocks):
@@ -984,8 +953,9 @@ class SequentialPipelineBlocks(ModularPipelineBlocks):

            if blocks is not None:
                for name, block in blocks.items():
-                    # Check if current block has block_trigger_inputs (ConditionalPipelineBlocks)
+                    # Check if current block has trigger inputs(i.e. auto block)
                    if hasattr(block, "block_trigger_inputs") and block.block_trigger_inputs is not None:
+                        # Add all non-None values from the trigger inputs list
                        trigger_values.update(t for t in block.block_trigger_inputs if t is not None)

                    # If block has sub_blocks, recursively check them
@@ -1001,85 +971,82 @@ class SequentialPipelineBlocks(ModularPipelineBlocks):
    def trigger_inputs(self):
        return self._get_trigger_inputs()

-    def _traverse_trigger_blocks(self, active_inputs):
-        """
-        Traverse blocks and select which ones would run given the active inputs.
+    def _traverse_trigger_blocks(self, trigger_inputs):
+        # Convert trigger_inputs to a set for easier manipulation
+        active_triggers = set(trigger_inputs)

-        Args:
-            active_inputs: Dict of input names to values that are "present"
-
-        Returns:
-            OrderedDict of block_name -> block that would execute
-        """
-
-        def fn_recursive_traverse(block, block_name, active_inputs):
+        def fn_recursive_traverse(block, block_name, active_triggers):
            result_blocks = OrderedDict()

-            # ConditionalPipelineBlocks (includes AutoPipelineBlocks)
-            if isinstance(block, ConditionalPipelineBlocks):
-                trigger_kwargs = {name: active_inputs.get(name) for name in block.block_trigger_inputs}
-                selected_block_name = block.select_block(**trigger_kwargs)
-
-                if selected_block_name is None:
-                    selected_block_name = block.default_block_name
-
-                if selected_block_name is None:
-                    return result_blocks
-
-                selected_block = block.sub_blocks[selected_block_name]
-
-                if selected_block.sub_blocks:
-                    result_blocks.update(fn_recursive_traverse(selected_block, block_name, active_inputs))
+            # sequential(include loopsequential) or PipelineBlock
+            if not hasattr(block, "block_trigger_inputs"):
+                if block.sub_blocks:
+                    # sequential or LoopSequentialPipelineBlocks (keep traversing)
+                    for sub_block_name, sub_block in block.sub_blocks.items():
+                        # traverse each sub-block exactly once: the call mutates `active_triggers` as a side effect
+                        blocks_to_update = fn_recursive_traverse(sub_block, sub_block_name, active_triggers)
+                        blocks_to_update = {f"{block_name}.{k}": v for k, v in blocks_to_update.items()}
+                        result_blocks.update(blocks_to_update)
                else:
-                    result_blocks[block_name] = selected_block
-                    if hasattr(selected_block, "outputs"):
-                        for out in selected_block.outputs:
-                            active_inputs[out.name] = True
-
+                    # PipelineBlock
+                    result_blocks[block_name] = block
+                    # Add this block's output names to active triggers if defined
+                    if hasattr(block, "outputs"):
+                        active_triggers.update(out.name for out in block.outputs)
                return result_blocks

-            # SequentialPipelineBlocks or LoopSequentialPipelineBlocks
-            if block.sub_blocks:
-                for sub_block_name, sub_block in block.sub_blocks.items():
-                    blocks_to_update = fn_recursive_traverse(sub_block, sub_block_name, active_inputs)
-                    blocks_to_update = {f"{block_name}.{k}": v for k, v in blocks_to_update.items()}
-                    result_blocks.update(blocks_to_update)
+            # auto
            else:
-                result_blocks[block_name] = block
-                if hasattr(block, "outputs"):
-                    for out in block.outputs:
-                        active_inputs[out.name] = True
+                # Find first block_trigger_input that matches any value in our active_triggers
+                this_block = None
+                for trigger_input in block.block_trigger_inputs:
+                    if trigger_input is not None and trigger_input in active_triggers:
+                        this_block = block.trigger_to_block_map[trigger_input]
+                        break
+
+                # If no matches found, try to get the default (None) block
+                if this_block is None and None in block.block_trigger_inputs:
+                    this_block = block.trigger_to_block_map[None]
+
+                if this_block is not None:
+                    # sequential/auto (keep traversing)
+                    if this_block.sub_blocks:
+                        result_blocks.update(fn_recursive_traverse(this_block, block_name, active_triggers))
+                    else:
+                        # PipelineBlock
+                        result_blocks[block_name] = this_block
+                        # Add this block's output names to active triggers if defined
+                        # YiYi TODO: do we need outputs here? can it just be intermediate_outputs? can we get rid of outputs attribute?
+                        if hasattr(this_block, "outputs"):
+                            active_triggers.update(out.name for out in this_block.outputs)

            return result_blocks

        all_blocks = OrderedDict()
        for block_name, block in self.sub_blocks.items():
-            blocks_to_update = fn_recursive_traverse(block, block_name, active_inputs)
+            blocks_to_update = fn_recursive_traverse(block, block_name, active_triggers)
            all_blocks.update(blocks_to_update)
        return all_blocks

-    def get_execution_blocks(self, **kwargs):
-        """
-        Get the blocks that would execute given the specified inputs.
+    def get_execution_blocks(self, *trigger_inputs):
+        trigger_inputs_all = self.trigger_inputs

-        Args:
-            **kwargs: Input names and values. Only trigger inputs affect block selection.
-                    Pass any inputs that would be non-None at runtime.
+        if trigger_inputs is not None:
+            if not isinstance(trigger_inputs, (list, tuple, set)):
+                trigger_inputs = [trigger_inputs]
+            invalid_inputs = [x for x in trigger_inputs if x not in trigger_inputs_all]
+            if invalid_inputs:
+                logger.warning(
+                    f"The following trigger inputs will be ignored as they are not supported: {invalid_inputs}"
+                )
+                trigger_inputs = [x for x in trigger_inputs if x in trigger_inputs_all]

-        Returns:
-            SequentialPipelineBlocks containing only the blocks that would execute
-        
-        Example:
-            # Get blocks for inpainting workflow
-            blocks = pipeline.get_execution_blocks(prompt="a cat", mask=mask, image=image)
-            
-            # Get blocks for text2image workflow
-            blocks = pipeline.get_execution_blocks(prompt="a cat")
-        """
-        # Filter out None values
-        active_inputs = {k: v for k, v in kwargs.items() if v is not None}
-        
-        blocks_triggered = self._traverse_trigger_blocks(active_inputs)
+        # `*trigger_inputs` always arrives as a tuple (never None), so "nothing selected" is an empty sequence.
+        # Normalize that case to an empty list so the traversal below starts with no active triggers.
+        # (The former `is None` branch was unreachable and indexed `trigger_inputs_all`, documented as a set.)
+        if not trigger_inputs:
+            trigger_inputs = []
+        blocks_triggered = self._traverse_trigger_blocks(trigger_inputs)
        return SequentialPipelineBlocks.from_blocks_dict(blocks_triggered)

    def __repr__(self):
@@ -1096,7 +1063,7 @@ class SequentialPipelineBlocks(ModularPipelineBlocks):
            header += f"  Trigger Inputs: {[inp for inp in self.trigger_inputs if inp is not None]}\n"
            # Get first trigger input as example
            example_input = next(t for t in self.trigger_inputs if t is not None)
-            header += f"  Use `get_execution_blocks()` to see selected blocks (e.g. `get_execution_blocks({example_input}=...)`).\n"
+            header += f"  Use `get_execution_blocks()` with input names to see selected blocks (e.g. `get_execution_blocks('{example_input}')`).\n"
            header += "  " + "=" * 100 + "\n\n"

        # Format description with proper indentation
@@ -1120,9 +1087,22 @@ class SequentialPipelineBlocks(ModularPipelineBlocks):
        # Blocks section - moved to the end with simplified format
        blocks_str = "  Sub-Blocks:\n"
        for i, (name, block) in enumerate(self.sub_blocks.items()):
-
-            # show execution order
-            blocks_str += f"    [{i}] {name} ({block.__class__.__name__})\n"
+            # Get trigger input for this block
+            trigger = None
+            if hasattr(self, "block_to_trigger_map"):
+                trigger = self.block_to_trigger_map.get(name)
+                # Format the trigger info
+                if trigger is None:
+                    trigger_str = "[default]"
+                elif isinstance(trigger, (list, tuple)):
+                    trigger_str = f"[trigger: {', '.join(str(t) for t in trigger)}]"
+                else:
+                    trigger_str = f"[trigger: {trigger}]"
+                # For AutoPipelineBlocks, add bullet points
+                blocks_str += f"    • {name} {trigger_str} ({block.__class__.__name__})\n"
+            else:
+                # For SequentialPipelineBlocks, show execution order
+                blocks_str += f"    [{i}] {name} ({block.__class__.__name__})\n"

            # Add block description
            desc_lines = block.description.split("\n")
@@ -1246,9 +1226,15 @@ class LoopSequentialPipelineBlocks(ModularPipelineBlocks):
                if inp.name not in outputs and inp not in inputs:
                    inputs.append(inp)

-            # Add this block's outputs
-            block_intermediate_outputs = [out.name for out in block.intermediate_outputs]
-            outputs.update(block_intermediate_outputs)
+            # Only add outputs if the block cannot be skipped
+            should_add_outputs = True
+            if hasattr(block, "block_trigger_inputs") and None not in block.block_trigger_inputs:
+                should_add_outputs = False
+
+            if should_add_outputs:
+                # Add this block's outputs
+                block_intermediate_outputs = [out.name for out in block.intermediate_outputs]
+                outputs.update(block_intermediate_outputs)

        for input_param in inputs:
            if input_param.name in self.required_inputs:
@@ -1305,14 +1291,6 @@ class LoopSequentialPipelineBlocks(ModularPipelineBlocks):
                sub_blocks[block_name] = block
        self.sub_blocks = sub_blocks

-        # Validate that sub_blocks are only leaf blocks
-        for block_name, block in self.sub_blocks.items():
-            if block.sub_blocks:
-                raise ValueError(
-                    f"In {self.__class__.__name__}, sub_blocks must be leaf blocks (no sub_blocks). "
-                    f"Block '{block_name}' ({block.__class__.__name__}) has sub_blocks."
-                )
-
    @classmethod
    def from_blocks_dict(cls, blocks_dict: Dict[str, Any]) -> "LoopSequentialPipelineBlocks":
        """
@@ -1547,8 +1525,10 @@ class ModularPipeline(ConfigMixin, PushToHubMixin):
        if blocks is None:
            if modular_config_dict is not None:
                blocks_class_name = modular_config_dict.get("_blocks_class_name")
-            else:
+            elif config_dict is not None:
                blocks_class_name = self.get_default_blocks_name(config_dict)
+            else:
+                blocks_class_name = None
            if blocks_class_name is not None:
                diffusers_module = importlib.import_module("diffusers")
                blocks_class = getattr(diffusers_module, blocks_class_name)
@@ -1645,10 +1625,7 @@ class ModularPipeline(ConfigMixin, PushToHubMixin):
            return None, config_dict

        except EnvironmentError as e:
-            raise EnvironmentError(
-                f"Failed to load config from '{pretrained_model_name_or_path}'. "
-                f"Could not find or load 'modular_model_index.json' or 'model_index.json'."
-            ) from e
+            logger.debug(f" model_index.json not found in the repo: {e}")

        return None, None

@@ -2573,11 +2550,7 @@ class ModularPipeline(ConfigMixin, PushToHubMixin):
            kwargs_type = expected_input_param.kwargs_type
            if name in passed_kwargs:
                state.set(name, passed_kwargs.pop(name), kwargs_type)
-            elif kwargs_type is not None and kwargs_type in passed_kwargs:
-                kwargs_dict = passed_kwargs.pop(kwargs_type)
-                for k, v in kwargs_dict.items():
-                    state.set(k, v, kwargs_type)
-            elif name is not None and name not in state.values:
+            elif name not in state.values:
                state.set(name, default, kwargs_type)

        # Warn about unexpected inputs
--- a/src/diffusers/modular_pipelines/qwenimage/init.py
+++ b/src/diffusers/modular_pipelines/qwenimage/init.py
@@ -21,16 +21,21 @@ except OptionalDependencyNotAvailable:

    _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
 else:
-    _import_structure["modular_blocks_qwenimage"] = [
+    _import_structure["encoders"] = ["QwenImageTextEncoderStep"]
+    _import_structure["modular_blocks"] = [
+        "ALL_BLOCKS",
        "AUTO_BLOCKS",
-        "QwenImageAutoBlocks",
-    ]
-    _import_structure["modular_blocks_qwenimage_edit"] = [
+        "CONTROLNET_BLOCKS",
        "EDIT_AUTO_BLOCKS",
-        "QwenImageEditAutoBlocks",
-    ]
-    _import_structure["modular_blocks_qwenimage_edit_plus"] = [
+        "EDIT_BLOCKS",
+        "EDIT_INPAINT_BLOCKS",
        "EDIT_PLUS_AUTO_BLOCKS",
+        "EDIT_PLUS_BLOCKS",
+        "IMAGE2IMAGE_BLOCKS",
+        "INPAINT_BLOCKS",
+        "TEXT2IMAGE_BLOCKS",
+        "QwenImageAutoBlocks",
+        "QwenImageEditAutoBlocks",
        "QwenImageEditPlusAutoBlocks",
    ]
    _import_structure["modular_pipeline"] = [
@@ -46,16 +51,23 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
    except OptionalDependencyNotAvailable:
        from ...utils.dummy_torch_and_transformers_objects import *  # noqa F403
    else:
-        from .modular_blocks_qwenimage import (
+        from .encoders import (
+            QwenImageTextEncoderStep,
+        )
+        from .modular_blocks import (
+            ALL_BLOCKS,
            AUTO_BLOCKS,
-            QwenImageAutoBlocks,
-        )
-        from .modular_blocks_qwenimage_edit import (
+            CONTROLNET_BLOCKS,
            EDIT_AUTO_BLOCKS,
-            QwenImageEditAutoBlocks,
-        )
-        from .modular_blocks_qwenimage_edit_plus import (
+            EDIT_BLOCKS,
+            EDIT_INPAINT_BLOCKS,
            EDIT_PLUS_AUTO_BLOCKS,
+            EDIT_PLUS_BLOCKS,
+            IMAGE2IMAGE_BLOCKS,
+            INPAINT_BLOCKS,
+            TEXT2IMAGE_BLOCKS,
+            QwenImageAutoBlocks,
+            QwenImageEditAutoBlocks,
            QwenImageEditPlusAutoBlocks,
        )
        from .modular_pipeline import (
@@ -74,4 +86,4 @@ else:
    )

    for name, value in _dummy_objects.items():
-        setattr(sys.modules[__name__], name, value)
+        setattr(sys.modules[__name__], name, value)
--- a/src/diffusers/modular_pipelines/qwenimage/before_denoise.py
+++ b/src/diffusers/modular_pipelines/qwenimage/before_denoise.py
@@ -639,65 +639,19 @@ class QwenImageEditRoPEInputsStep(ModularPipelineBlocks):
        return components, state


-class QwenImageEditPlusRoPEInputsStep(ModularPipelineBlocks):
-    """RoPE inputs step for Edit Plus that handles lists of image heights/widths."""
-
+class QwenImageEditPlusRoPEInputsStep(QwenImageEditRoPEInputsStep):
    model_name = "qwenimage-edit-plus"

-    @property
-    def description(self) -> str:
-        return (
-            "Step that prepares the RoPE inputs for denoising process. This is used in QwenImage Edit Plus.\n"
-            "Unlike Edit, Edit Plus handles lists of image_height/image_width for multiple reference images.\n"
-            "Should be placed after prepare_latents step."
-        )
-
-    @property
-    def inputs(self) -> List[InputParam]:
-        return [
-            InputParam(name="batch_size", required=True),
-            InputParam(name="image_height", required=True, type_hint=List[int]),
-            InputParam(name="image_width", required=True, type_hint=List[int]),
-            InputParam(name="height", required=True),
-            InputParam(name="width", required=True),
-            InputParam(name="prompt_embeds_mask"),
-            InputParam(name="negative_prompt_embeds_mask"),
-        ]
-
-    @property
-    def intermediate_outputs(self) -> List[OutputParam]:
-        return [
-            OutputParam(
-                name="img_shapes",
-                type_hint=List[List[Tuple[int, int, int]]],
-                description="The shapes of the image latents, used for RoPE calculation",
-            ),
-            OutputParam(
-                name="txt_seq_lens",
-                kwargs_type="denoiser_input_fields",
-                type_hint=List[int],
-                description="The sequence lengths of the prompt embeds, used for RoPE calculation",
-            ),
-            OutputParam(
-                name="negative_txt_seq_lens",
-                kwargs_type="denoiser_input_fields",
-                type_hint=List[int],
-                description="The sequence lengths of the negative prompt embeds, used for RoPE calculation",
-            ),
-        ]
-
    def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState:
        block_state = self.get_block_state(state)

        vae_scale_factor = components.vae_scale_factor
-
-        # Edit Plus: image_height and image_width are lists
        block_state.img_shapes = [
            [
                (1, block_state.height // vae_scale_factor // 2, block_state.width // vae_scale_factor // 2),
                *[
-                    (1, img_height // vae_scale_factor // 2, img_width // vae_scale_factor // 2)
-                    for img_height, img_width in zip(block_state.image_height, block_state.image_width)
+                    (1, vae_height // vae_scale_factor // 2, vae_width // vae_scale_factor // 2)
+                    for vae_height, vae_width in zip(block_state.image_height, block_state.image_width)
                ],
            ]
        ] * block_state.batch_size
--- a/src/diffusers/modular_pipelines/qwenimage/decoders.py
+++ b/src/diffusers/modular_pipelines/qwenimage/decoders.py
@@ -30,47 +30,6 @@ from .modular_pipeline import QwenImageModularPipeline, QwenImagePachifier
 logger = logging.get_logger(__name__)


-class QwenImageAfterDenoiseStep(ModularPipelineBlocks):
-    model_name = "qwenimage"
-
-    @property
-    def description(self) -> str:
-        return "Step that unpack the latents from 3D tensor (batch_size, sequence_length, channels) into 5D tensor (batch_size, channels, 1, height, width)"
-
-    @property
-    def expected_components(self) -> List[ComponentSpec]:
-        components = [
-            ComponentSpec("pachifier", QwenImagePachifier, default_creation_method="from_config"),
-        ]
-
-        return components
-
-    @property
-    def inputs(self) -> List[InputParam]:
-        return [
-            InputParam(name="height", required=True),
-            InputParam(name="width", required=True),
-            InputParam(
-                name="latents",
-                required=True,
-                type_hint=torch.Tensor,
-                description="The latents to decode, can be generated in the denoise step",
-            ),
-        ]
-
-    @torch.no_grad()
-    def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState:
-        block_state = self.get_block_state(state)
-
-        vae_scale_factor = components.vae_scale_factor
-        block_state.latents = components.pachifier.unpack_latents(
-            block_state.latents, block_state.height, block_state.width, vae_scale_factor=vae_scale_factor
-        )
-
-        self.set_block_state(state, block_state)
-        return components, state
-
-
 class QwenImageDecoderStep(ModularPipelineBlocks):
    model_name = "qwenimage"

@@ -82,6 +41,7 @@ class QwenImageDecoderStep(ModularPipelineBlocks):
    def expected_components(self) -> List[ComponentSpec]:
        components = [
            ComponentSpec("vae", AutoencoderKLQwenImage),
+            ComponentSpec("pachifier", QwenImagePachifier, default_creation_method="from_config"),
        ]

        return components
@@ -89,6 +49,8 @@ class QwenImageDecoderStep(ModularPipelineBlocks):
    @property
    def inputs(self) -> List[InputParam]:
        return [
+            InputParam(name="height", required=True),
+            InputParam(name="width", required=True),
            InputParam(
                name="latents",
                required=True,
@@ -112,12 +74,10 @@ class QwenImageDecoderStep(ModularPipelineBlocks):
        block_state = self.get_block_state(state)

        # YiYi Notes: remove support for output_type = "latents', we can just skip decode/encode step in modular
-        if block_state.latents.ndim == 4:
-            block_state.latents = block_state.latents.unsqueeze(dim=1)
-        elif block_state.latents.ndim != 5:
-            raise ValueError(
-                f"expect latents to be a 4D or 5D tensor but got: {block_state.latents.shape}. Please make sure the latents are unpacked before decode step."
-            )
+        vae_scale_factor = components.vae_scale_factor
+        block_state.latents = components.pachifier.unpack_latents(
+            block_state.latents, block_state.height, block_state.width, vae_scale_factor=vae_scale_factor
+        )
        block_state.latents = block_state.latents.to(components.vae.dtype)

        latents_mean = (
--- a/src/diffusers/modular_pipelines/qwenimage/encoders.py
+++ b/src/diffusers/modular_pipelines/qwenimage/encoders.py
@@ -244,19 +244,18 @@ def encode_vae_image(
 class QwenImageEditResizeDynamicStep(ModularPipelineBlocks):
    model_name = "qwenimage"

-    def __init__(
-        self, 
-        input_name: str = "image", 
-        output_name: str = "resized_image",
-        target_area: int = 1024 * 1024,
-    ):
-        """Create a configurable step for resizing images to the target area while maintaining the aspect ratio.
+    def __init__(self, input_name: str = "image", output_name: str = "resized_image"):
+        """Create a configurable step for resizing images to the target area (1024 * 1024) while maintaining the aspect ratio.
+
+        This block resizes an input image tensor and exposes the resized result under configurable input and output
+        names. Use this when you need to wire the resize step to different image fields (e.g., "image",
+        "control_image")
+
        Args:
            input_name (str, optional): Name of the image field to read from the
                pipeline state. Defaults to "image".
            output_name (str, optional): Name of the resized image field to write
                back to the pipeline state. Defaults to "resized_image".
-            target_area (int, optional): Target area in pixels. Defaults to 1024*1024.
        """
        if not isinstance(input_name, str) or not isinstance(output_name, str):
            raise ValueError(
@@ -264,12 +263,11 @@ class QwenImageEditResizeDynamicStep(ModularPipelineBlocks):
            )
        self._image_input_name = input_name
        self._resized_image_output_name = output_name
-        self._target_area = target_area
        super().__init__()

    @property
    def description(self) -> str:
+        """Return a one-line summary of this resize step that names the configured input image field."""
-        return f"Image Resize step that resize the {self._image_input_name} to the target area {self._target_area} while maintaining the aspect ratio."
+        return f"Image Resize step that resize the {self._image_input_name} to the target area (1024 * 1024) while maintaining the aspect ratio."

    @property
    def expected_components(self) -> List[ComponentSpec]:
@@ -322,67 +320,48 @@ class QwenImageEditResizeDynamicStep(ModularPipelineBlocks):
        self.set_block_state(state, block_state)
        return components, state

-class QwenImageEditPlusResizeDynamicStep(ModularPipelineBlocks):
-    """Resize each image independently based on its own aspect ratio. For QwenImage Edit Plus."""

-    model_name = "qwenimage-edit-plus"
+class QwenImageEditPlusResizeDynamicStep(QwenImageEditResizeDynamicStep):
+    model_name = "qwenimage"

    def __init__(
-        self, 
-        input_name: str = "image", 
+        self,
+        input_name: str = "image",
        output_name: str = "resized_image",
-        target_area: int = 1024 * 1024,
+        vae_image_output_name: str = "vae_image",
    ):
-        """Create a step for resizing images to a target area.
+        """Create a configurable step for resizing images to the target area (384 * 384) while maintaining the aspect ratio.

-        Each image is resized independently based on its own aspect ratio.
-        This is suitable for Edit Plus where multiple reference images can have different dimensions.
+        This block resizes an input image or a list of input images and exposes the resized result under
+        configurable input and output names. Use this when you need to wire the resize step to different image
+        fields (e.g., "image", "control_image").

        Args:
-            input_name (str, optional): Name of the image field to read. Defaults to "image".
-            output_name (str, optional): Name of the resized image field to write. Defaults to "resized_image".
-            target_area (int, optional): Target area in pixels. Defaults to 1024*1024.
+            input_name (str, optional): Name of the image field to read from the
+                pipeline state. Defaults to "image".
+            output_name (str, optional): Name of the resized image field to write
+                back to the pipeline state. Defaults to "resized_image".
+            vae_image_output_name (str, optional): Name of the field that receives the list of input images
+                without resizing. This is used by the VAE encoder step later on. QwenImage Edit Plus
+                processes the input image(s) differently for the VL and the VAE. Defaults to "vae_image".
+
+        Raises:
+            ValueError: If `input_name` or `output_name` is not a string.
        """
        if not isinstance(input_name, str) or not isinstance(output_name, str):
            raise ValueError(
                f"input_name and output_name must be strings but are {type(input_name)} and {type(output_name)}"
            )
+        # Target area handed to `calculate_dimensions` when sizing the condition (resized) images.
+        self.condition_image_size = 384 * 384
        self._image_input_name = input_name
        self._resized_image_output_name = output_name
-        self._target_area = target_area
+        self._vae_image_output_name = vae_image_output_name
        super().__init__()

-    @property
-    def description(self) -> str:
-        return (
-            f"Image Resize step that resizes {self._image_input_name} to target area {self._target_area}.\n"
-            "Each image is resized independently based on its own aspect ratio."
-        )
-
-    @property
-    def expected_components(self) -> List[ComponentSpec]:
-        return [
-            ComponentSpec(
-                "image_resize_processor",
-                VaeImageProcessor,
-                config=FrozenDict({"vae_scale_factor": 16}),
-                default_creation_method="from_config",
-            ),
-        ]
-
-    @property
-    def inputs(self) -> List[InputParam]:
-        return [
-            InputParam(
-                name=self._image_input_name, required=True, type_hint=torch.Tensor, description="The image(s) to resize"
-            ),
-        ]
-
    @property
    def intermediate_outputs(self) -> List[OutputParam]:
+        """Return the parent's outputs plus the image list stored under the configured VAE image output name."""
-        return [
+        return super().intermediate_outputs + [
            OutputParam(
-                name=self._resized_image_output_name, type_hint=List[PIL.Image.Image], description="The resized images"
+                name=self._vae_image_output_name,
+                type_hint=List[PIL.Image.Image],
+                description="The images to be processed which will be further used by the VAE encoder.",
            ),
        ]

@@ -395,21 +374,26 @@ class QwenImageEditPlusResizeDynamicStep(ModularPipelineBlocks):
        if not is_valid_image_imagelist(images):
            raise ValueError(f"Images must be image or list of images but are {type(images)}")

-        if is_valid_image(images):
+        if (
+            not isinstance(images, torch.Tensor)
+            and isinstance(images, PIL.Image.Image)
+            and not isinstance(images, list)
+        ):
            images = [images]

-        # Resize each image independently based on its own aspect ratio
-        resized_images = []
-        for image in images:
-            image_width, image_height = image.size
-            calculated_width, calculated_height, _ = calculate_dimensions(
-                self._target_area, image_width / image_height
-            )
-            resized_images.append(
-                components.image_resize_processor.resize(image, height=calculated_height, width=calculated_width)
+        # TODO (sayakpaul): revisit this when the inputs are `torch.Tensor`s
+        condition_images = []
+        vae_images = []
+        for img in images:
+            image_width, image_height = img.size
+            condition_width, condition_height, _ = calculate_dimensions(
+                self.condition_image_size, image_width / image_height
            )
+            condition_images.append(components.image_resize_processor.resize(img, condition_height, condition_width))
+            vae_images.append(img)

-        setattr(block_state, self._resized_image_output_name, resized_images)
+        setattr(block_state, self._resized_image_output_name, condition_images)
+        setattr(block_state, self._vae_image_output_name, vae_images)
        self.set_block_state(state, block_state)
        return components, state

@@ -663,30 +647,8 @@ class QwenImageEditTextEncoderStep(ModularPipelineBlocks):
        return components, state


-class QwenImageEditPlusTextEncoderStep(ModularPipelineBlocks):
-    """Text encoder for QwenImage Edit Plus that handles multiple reference images."""
-
-    model_name = "qwenimage-edit-plus"
-
-    @property
-    def description(self) -> str:
-        return (
-            "Text Encoder step for QwenImage Edit Plus that processes prompt and multiple images together "
-            "to generate text embeddings for guiding image generation."
-        )
-
-    @property
-    def expected_components(self) -> List[ComponentSpec]:
-        return [
-            ComponentSpec("text_encoder", Qwen2_5_VLForConditionalGeneration),
-            ComponentSpec("processor", Qwen2VLProcessor),
-            ComponentSpec(
-                "guider",
-                ClassifierFreeGuidance,
-                config=FrozenDict({"guidance_scale": 4.0}),
-                default_creation_method="from_config",
-            ),
-        ]
+class QwenImageEditPlusTextEncoderStep(QwenImageEditTextEncoderStep):
+    model_name = "qwenimage"

    @property
    def expected_configs(self) -> List[ConfigSpec]:
@@ -702,60 +664,6 @@ class QwenImageEditPlusTextEncoderStep(ModularPipelineBlocks):
            ConfigSpec(name="prompt_template_encode_start_idx", default=64),
        ]

-    @property
-    def inputs(self) -> List[InputParam]:
-        return [
-            InputParam(name="prompt", required=True, type_hint=str, description="The prompt to encode"),
-            InputParam(name="negative_prompt", type_hint=str, description="The negative prompt to encode"),
-            InputParam(
-                name="resized_cond_image",
-                required=True,
-                type_hint=torch.Tensor,
-                description="The image(s) to encode, can be a single image or list of images, should be resized to 384x384 using resize step",
-            ),
-        ]
-
-    @property
-    def intermediate_outputs(self) -> List[OutputParam]:
-        return [
-            OutputParam(
-                name="prompt_embeds",
-                kwargs_type="denoiser_input_fields",
-                type_hint=torch.Tensor,
-                description="The prompt embeddings",
-            ),
-            OutputParam(
-                name="prompt_embeds_mask",
-                kwargs_type="denoiser_input_fields",
-                type_hint=torch.Tensor,
-                description="The encoder attention mask",
-            ),
-            OutputParam(
-                name="negative_prompt_embeds",
-                kwargs_type="denoiser_input_fields",
-                type_hint=torch.Tensor,
-                description="The negative prompt embeddings",
-            ),
-            OutputParam(
-                name="negative_prompt_embeds_mask",
-                kwargs_type="denoiser_input_fields",
-                type_hint=torch.Tensor,
-                description="The negative prompt embeddings mask",
-            ),
-        ]
-
-    @staticmethod
-    def check_inputs(prompt, negative_prompt):
-        if not isinstance(prompt, str) and not isinstance(prompt, list):
-            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
-
-        if (
-            negative_prompt is not None
-            and not isinstance(negative_prompt, str)
-            and not isinstance(negative_prompt, list)
-        ):
-            raise ValueError(f"`negative_prompt` has to be of type `str` or `list` but is {type(negative_prompt)}")
-
    @torch.no_grad()
    def __call__(self, components: QwenImageModularPipeline, state: PipelineState):
        block_state = self.get_block_state(state)
@@ -768,7 +676,7 @@ class QwenImageEditPlusTextEncoderStep(ModularPipelineBlocks):
            components.text_encoder,
            components.processor,
            prompt=block_state.prompt,
-            image=block_state.resized_cond_image,
+            image=block_state.resized_image,
            prompt_template_encode=components.config.prompt_template_encode,
            img_template_encode=components.config.img_template_encode,
            prompt_template_encode_start_idx=components.config.prompt_template_encode_start_idx,
@@ -784,7 +692,7 @@ class QwenImageEditPlusTextEncoderStep(ModularPipelineBlocks):
                    components.text_encoder,
                    components.processor,
                    prompt=negative_prompt,
-                    image=block_state.resized_cond_image,
+                    image=block_state.resized_image,
                    prompt_template_encode=components.config.prompt_template_encode,
                    img_template_encode=components.config.img_template_encode,
                    prompt_template_encode_start_idx=components.config.prompt_template_encode_start_idx,
@@ -938,60 +846,60 @@ class QwenImageProcessImagesInputStep(ModularPipelineBlocks):
        self.set_block_state(state, block_state)
        return components, state

-class QwenImageEditPlusProcessImagesInputStep(ModularPipelineBlocks):
+
+class QwenImageEditPlusProcessImagesInputStep(QwenImageProcessImagesInputStep):
+    """Image preprocess step for QwenImage Edit Plus.
+
+    When `vae_image` is set, every image in it is preprocessed at its own size, computed by
+    `calculate_dimensions` from `self.vae_image_size` and the image's aspect ratio. Otherwise `image` is
+    preprocessed at the requested `height`/`width`, falling back to the component defaults.
+    """
+
    model_name = "qwenimage-edit-plus"

+    def __init__(self):
+        # Target area handed to `calculate_dimensions` for each VAE input image.
+        self.vae_image_size = 1024 * 1024
+        super().__init__()
+
    @property
    def description(self) -> str:
-        return "Image Preprocess step. Images can be resized first using QwenImageEditResizeDynamicStep."
-
-    @property
-    def expected_components(self) -> List[ComponentSpec]:
-        return [
-            ComponentSpec(
-                "image_processor",
-                VaeImageProcessor,
-                config=FrozenDict({"vae_scale_factor": 16}),
-                default_creation_method="from_config",
-            ),
-        ]
+        return "Image Preprocess step for QwenImage Edit Plus. Unlike QwenImage Edit, QwenImage Edit Plus doesn't use the same resized image for further preprocessing."

    @property
    def inputs(self) -> List[InputParam]:
-        return [InputParam("resized_image")]
-
-    @property
-    def intermediate_outputs(self) -> List[OutputParam]:
-        return [OutputParam(name="processed_image")]
+        return [InputParam("vae_image"), InputParam("image"), InputParam("height"), InputParam("width")]

    @torch.no_grad()
    def __call__(self, components: QwenImageModularPipeline, state: PipelineState):
        block_state = self.get_block_state(state)

+        # At least one image source is required: `vae_image` takes precedence, `image` is the fallback.
+        if block_state.vae_image is None and block_state.image is None:
+            raise ValueError("`vae_image` and `image` cannot be None at the same time")

-
-        image = block_state.resized_image
-
-        is_image_list = isinstance(image, list)
-        if not is_image_list:
-            image = [image]
-
-        processed_images = []
-        for img in image:
-            img_width, img_height = img.size
-            processed_images.append(components.image_processor.preprocess(image=img, height=img_height, width=img_width))
-        block_state.processed_image = processed_images
-        if is_image_list:
-            block_state.processed_image = processed_images
+        # Stays None on the `image` fallback path; becomes a list of (width, height) tuples otherwise.
+        vae_image_sizes = None
+        if block_state.vae_image is None:
+            image = block_state.image
+            # `check_inputs` is not defined in this class; presumably inherited from the parent step - confirm.
+            self.check_inputs(
+                height=block_state.height, width=block_state.width, vae_scale_factor=components.vae_scale_factor
+            )
+            height = block_state.height or components.default_height
+            width = block_state.width or components.default_width
+            block_state.processed_image = components.image_processor.preprocess(
+                image=image, height=height, width=width
+            )
        else:
-            block_state.processed_image = processed_images[0]
+            # QwenImage Edit Plus can allow multiple input images with varied resolutions
+            processed_images = []
+            vae_image_sizes = []
+            # Iterates `vae_image` directly, so it is assumed to be a list of PIL images - TODO confirm with caller.
+            for img in block_state.vae_image:
+                width, height = img.size
+                vae_width, vae_height, _ = calculate_dimensions(self.vae_image_size, width / height)
+                vae_image_sizes.append((vae_width, vae_height))
+                processed_images.append(
+                    components.image_processor.preprocess(image=img, height=vae_height, width=vae_width)
+                )
+            block_state.processed_image = processed_images
+
+        # NOTE(review): `vae_image_sizes` is not declared by any property visible in this class; confirm the
+        # inherited `intermediate_outputs` exposes it so the value is kept by `set_block_state`.
+        block_state.vae_image_sizes = vae_image_sizes

        self.set_block_state(state, block_state)
        return components, state

-class QwenImageVaeEncoderDynamicStep(ModularPipelineBlocks):
-    """VAE encoder that handles both single images and lists of images with varied resolutions."""

+class QwenImageVaeEncoderDynamicStep(ModularPipelineBlocks):
    model_name = "qwenimage"

    def __init__(
@@ -1001,12 +909,21 @@ class QwenImageVaeEncoderDynamicStep(ModularPipelineBlocks):
    ):
        """Initialize a VAE encoder step for converting images to latent representations.

-        Handles both single images and lists of images. When input is a list, outputs a list of latents.
-        When input is a single tensor, outputs a single latent tensor.
+        Both the input and output names are configurable so this block can be configured to process different image
+        inputs (e.g., "processed_image" -> "image_latents", "processed_control_image" -> "control_image_latents").

        Args:
-            input_name (str, optional): Name of the input image tensor or list. Defaults to "processed_image".
-            output_name (str, optional): Name of the output latent tensor or list. Defaults to "image_latents".
+            input_name (str, optional): Name of the input image tensor. Defaults to "processed_image".
+                Examples: "processed_image" or "processed_control_image"
+            output_name (str, optional): Name of the output latent tensor. Defaults to "image_latents".
+                Examples: "image_latents" or "control_image_latents"
+
+        Examples:
+            # Basic usage with default settings (includes image processor) QwenImageVaeEncoderDynamicStep()
+
+            # Custom input/output names for control image QwenImageVaeEncoderDynamicStep(
+                input_name="processed_control_image", output_name="control_image_latents"
+            )
        """
        self._image_input_name = input_name
        self._image_latents_output_name = output_name
@@ -1014,18 +931,17 @@ class QwenImageVaeEncoderDynamicStep(ModularPipelineBlocks):

    @property
    def description(self) -> str:
-        return (
-            f"VAE Encoder step that converts {self._image_input_name} into latent representations {self._image_latents_output_name}.\n"
-            "Handles both single images and lists of images with varied resolutions."
-        )
+        return f"Dynamic VAE Encoder step that converts {self._image_input_name} into latent representations {self._image_latents_output_name}.\n"

    @property
    def expected_components(self) -> List[ComponentSpec]:
-        return [ComponentSpec("vae", AutoencoderKLQwenImage)]
+        components = [ComponentSpec("vae", AutoencoderKLQwenImage)]
+        return components

    @property
    def inputs(self) -> List[InputParam]:
-        return [InputParam(self._image_input_name, required=True), InputParam("generator")]
+        inputs = [InputParam(self._image_input_name, required=True), InputParam("generator")]
+        return inputs

    @property
    def intermediate_outputs(self) -> List[OutputParam]:
@@ -1033,7 +949,7 @@ class QwenImageVaeEncoderDynamicStep(ModularPipelineBlocks):
            OutputParam(
                self._image_latents_output_name,
                type_hint=torch.Tensor,
-                description="The latents representing the reference image(s). Single tensor or list depending on input.",
+                description="The latents representing the reference image",
            )
        ]

@@ -1045,11 +961,47 @@ class QwenImageVaeEncoderDynamicStep(ModularPipelineBlocks):
        dtype = components.vae.dtype

        image = getattr(block_state, self._image_input_name)
-        is_image_list = isinstance(image, list)
-        if not is_image_list:
-            image = [image]

-        # Handle both single image and list of images
+        # Encode image into latents
+        image_latents = encode_vae_image(
+            image=image,
+            vae=components.vae,
+            generator=block_state.generator,
+            device=device,
+            dtype=dtype,
+            latent_channels=components.num_channels_latents,
+        )
+        setattr(block_state, self._image_latents_output_name, image_latents)
+
+        self.set_block_state(state, block_state)
+
+        return components, state
+
+
+class QwenImageEditPlusVaeEncoderDynamicStep(QwenImageVaeEncoderDynamicStep):
+    model_name = "qwenimage-edit-plus"
+
+    @property
+    def intermediate_outputs(self) -> List[OutputParam]:
+        # Each reference image latent can have varied resolutions hence we return this as a list.
+        return [
+            OutputParam(
+                self._image_latents_output_name,
+                type_hint=List[torch.Tensor],
+                description="The latents representing the reference image(s).",
+            )
+        ]
+
+    @torch.no_grad()
+    def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState:
+        block_state = self.get_block_state(state)
+
+        device = components._execution_device
+        dtype = components.vae.dtype
+
+        image = getattr(block_state, self._image_input_name)
+
+        # Encode image into latents
        image_latents = []
        for img in image:
            image_latents.append(
@@ -1062,12 +1014,9 @@ class QwenImageVaeEncoderDynamicStep(ModularPipelineBlocks):
                    latent_channels=components.num_channels_latents,
                )
            )
-        if not is_image_list:
-            image_latents = image_latents[0]

        setattr(block_state, self._image_latents_output_name, image_latents)

-
        self.set_block_state(state, block_state)

        return components, state
--- a/src/diffusers/modular_pipelines/qwenimage/inputs.py
+++ b/src/diffusers/modular_pipelines/qwenimage/inputs.py
@@ -222,15 +222,36 @@ class QwenImageTextInputsStep(ModularPipelineBlocks):


 class QwenImageInputsDynamicStep(ModularPipelineBlocks):
-    """Input step for QwenImage: update height/width, expand batch, patchify."""
-
    model_name = "qwenimage"

-    def __init__(
-        self,
-        image_latent_inputs: List[str] = ["image_latents"],
-        additional_batch_inputs: List[str] = [],
-    ):
+    def __init__(self, image_latent_inputs: List[str] = ["image_latents"], additional_batch_inputs: List[str] = []):
+        """Initialize a configurable step that standardizes the inputs for the denoising step.
+
+        This step handles multiple common tasks to prepare inputs for the denoising step:
+        1. For encoded image latents: uses them to update height/width if None, patchifies, and expands batch size
+        2. For additional_batch_inputs: Only expands batch dimensions to match final batch size
+
+        This is a dynamic block that allows you to configure which inputs to process.
+
+        Args:
+            image_latent_inputs (List[str], optional): Names of image latent tensors to process.
+                These will be used to determine height/width, patchified, and batch-expanded. Can be a single string or
+                list of strings. Defaults to ["image_latents"]. Examples: ["image_latents"], ["control_image_latents"]
+            additional_batch_inputs (List[str], optional):
+                Names of additional conditional input tensors to expand batch size. These tensors will only have their
+                batch dimensions adjusted to match the final batch size. Can be a single string or list of strings.
+                Defaults to []. Examples: ["processed_mask_image"]
+
+        Examples:
+            # Configure to process image_latents (default behavior) QwenImageInputsDynamicStep()
+
+            # Configure to process multiple image latent inputs
+            QwenImageInputsDynamicStep(image_latent_inputs=["image_latents", "control_image_latents"])
+
+            # Configure to process image latents and additional batch inputs QwenImageInputsDynamicStep(
+                image_latent_inputs=["image_latents"], additional_batch_inputs=["processed_mask_image"]
+            )
+        """
        if not isinstance(image_latent_inputs, list):
            image_latent_inputs = [image_latent_inputs]
        if not isinstance(additional_batch_inputs, list):
@@ -242,12 +263,14 @@ class QwenImageInputsDynamicStep(ModularPipelineBlocks):

    @property
    def description(self) -> str:
+        # Functionality section
        summary_section = (
            "Input processing step that:\n"
-            "  1. For image latent inputs: Updates height/width if None, patchifies, and expands batch size\n"
+            "  1. For image latent inputs: Updates height/width if None, patchifies latents, and expands batch size\n"
            "  2. For additional batch inputs: Expands batch dimensions to match final batch size"
        )

+        # Inputs info
        inputs_info = ""
        if self._image_latent_inputs or self._additional_batch_inputs:
            inputs_info = "\n\nConfigured inputs:"
@@ -256,16 +279,11 @@ class QwenImageInputsDynamicStep(ModularPipelineBlocks):
            if self._additional_batch_inputs:
                inputs_info += f"\n  - Additional batch inputs: {self._additional_batch_inputs}"

+        # Placement guidance
        placement_section = "\n\nThis block should be placed after the encoder steps and the text input step."

        return summary_section + inputs_info + placement_section

-    @property
-    def expected_components(self) -> List[ComponentSpec]:
-        return [
-            ComponentSpec("pachifier", QwenImagePachifier, default_creation_method="from_config"),
-        ]
-
    @property
    def inputs(self) -> List[InputParam]:
        inputs = [
@@ -275,9 +293,11 @@ class QwenImageInputsDynamicStep(ModularPipelineBlocks):
            InputParam(name="width"),
        ]

+        # Add image latent inputs
        for image_latent_input_name in self._image_latent_inputs:
            inputs.append(InputParam(name=image_latent_input_name))

+        # Add additional batch inputs
        for input_name in self._additional_batch_inputs:
            inputs.append(InputParam(name=input_name))

@@ -290,16 +310,22 @@ class QwenImageInputsDynamicStep(ModularPipelineBlocks):
            OutputParam(name="image_width", type_hint=int, description="The width of the image latents"),
        ]

+    @property
+    def expected_components(self) -> List[ComponentSpec]:
+        return [
+            ComponentSpec("pachifier", QwenImagePachifier, default_creation_method="from_config"),
+        ]
+
    def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState:
        block_state = self.get_block_state(state)

-        # Process image latent inputs
+        # Process image latent inputs (height/width calculation, patchify, and batch expansion)
        for image_latent_input_name in self._image_latent_inputs:
            image_latent_tensor = getattr(block_state, image_latent_input_name)
            if image_latent_tensor is None:
                continue

-            # 1. Calculate height/width from latents and update if not provided
+            # 1. Calculate height/width from latents
            height, width = calculate_dimension_from_latents(image_latent_tensor, components.vae_scale_factor)
            block_state.height = block_state.height or height
            block_state.width = block_state.width or width
@@ -309,7 +335,7 @@ class QwenImageInputsDynamicStep(ModularPipelineBlocks):
            if not hasattr(block_state, "image_width"):
                block_state.image_width = width

-            # 2. Patchify
+            # 2. Patchify the image latent tensor
            image_latent_tensor = components.pachifier.pack_latents(image_latent_tensor)

            # 3. Expand batch size
@@ -328,6 +354,7 @@ class QwenImageInputsDynamicStep(ModularPipelineBlocks):
            if input_tensor is None:
                continue

+            # Only expand batch size
            input_tensor = repeat_tensor_to_batch_size(
                input_name=input_name,
                input_tensor=input_tensor,
@@ -341,130 +368,63 @@ class QwenImageInputsDynamicStep(ModularPipelineBlocks):
        return components, state


-class QwenImageEditPlusInputsDynamicStep(ModularPipelineBlocks):
-    """Input step for QwenImage Edit Plus: handles list of latents with different sizes."""
-
+class QwenImageEditPlusInputsDynamicStep(QwenImageInputsDynamicStep):
    model_name = "qwenimage-edit-plus"

-    def __init__(
-        self,
-        image_latent_inputs: List[str] = ["image_latents"],
-        additional_batch_inputs: List[str] = [],
-    ):
-        if not isinstance(image_latent_inputs, list):
-            image_latent_inputs = [image_latent_inputs]
-        if not isinstance(additional_batch_inputs, list):
-            additional_batch_inputs = [additional_batch_inputs]
-
-        self._image_latent_inputs = image_latent_inputs
-        self._additional_batch_inputs = additional_batch_inputs
-        super().__init__()
-
-    @property
-    def description(self) -> str:
-        summary_section = (
-            "Input processing step for Edit Plus that:\n"
-            "  1. For image latent inputs (list): Collects heights/widths, patchifies each, concatenates, expands batch\n"
-            "  2. For additional batch inputs: Expands batch dimensions to match final batch size\n"
-            "  Height/width defaults to last image in the list."
-        )
-
-        inputs_info = ""
-        if self._image_latent_inputs or self._additional_batch_inputs:
-            inputs_info = "\n\nConfigured inputs:"
-            if self._image_latent_inputs:
-                inputs_info += f"\n  - Image latent inputs: {self._image_latent_inputs}"
-            if self._additional_batch_inputs:
-                inputs_info += f"\n  - Additional batch inputs: {self._additional_batch_inputs}"
-
-        placement_section = "\n\nThis block should be placed after the encoder steps and the text input step."
-
-        return summary_section + inputs_info + placement_section
-
-    @property
-    def expected_components(self) -> List[ComponentSpec]:
-        return [
-            ComponentSpec("pachifier", QwenImagePachifier, default_creation_method="from_config"),
-        ]
-
-    @property
-    def inputs(self) -> List[InputParam]:
-        inputs = [
-            InputParam(name="num_images_per_prompt", default=1),
-            InputParam(name="batch_size", required=True),
-            InputParam(name="height"),
-            InputParam(name="width"),
-        ]
-
-        for image_latent_input_name in self._image_latent_inputs:
-            inputs.append(InputParam(name=image_latent_input_name))
-
-        for input_name in self._additional_batch_inputs:
-            inputs.append(InputParam(name=input_name))
-
-        return inputs
-
    @property
    def intermediate_outputs(self) -> List[OutputParam]:
        return [
-            OutputParam(name="image_height", type_hint=List[int], description="The heights of the image latents"),
-            OutputParam(name="image_width", type_hint=List[int], description="The widths of the image latents"),
+            OutputParam(name="image_height", type_hint=List[int], description="The height of the image latents"),
+            OutputParam(name="image_width", type_hint=List[int], description="The width of the image latents"),
        ]

    def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState:
        block_state = self.get_block_state(state)

-        # Process image latent inputs
+        # Process image latent inputs (height/width calculation, patchify, and batch expansion)
        for image_latent_input_name in self._image_latent_inputs:
            image_latent_tensor = getattr(block_state, image_latent_input_name)
            if image_latent_tensor is None:
                continue

-            is_list = isinstance(image_latent_tensor, list)
-            if not is_list:
-                image_latent_tensor = [image_latent_tensor]
-
+            # Each image latent can have a different size in QwenImage Edit Plus.
            image_heights = []
            image_widths = []
            packed_image_latent_tensors = []

-            for i, img_latent_tensor in enumerate(image_latent_tensor):
+            for img_latent_tensor in image_latent_tensor:
                # 1. Calculate height/width from latents
                height, width = calculate_dimension_from_latents(img_latent_tensor, components.vae_scale_factor)
                image_heights.append(height)
                image_widths.append(width)

-                # 2. Patchify
+                # 2. Patchify the image latent tensor
                img_latent_tensor = components.pachifier.pack_latents(img_latent_tensor)

                # 3. Expand batch size
                img_latent_tensor = repeat_tensor_to_batch_size(
-                    input_name=f"{image_latent_input_name}[{i}]",
+                    input_name=image_latent_input_name,
                    input_tensor=img_latent_tensor,
                    num_images_per_prompt=block_state.num_images_per_prompt,
                    batch_size=block_state.batch_size,
                )
                packed_image_latent_tensors.append(img_latent_tensor)

-            # Concatenate all packed latents along dim=1
            packed_image_latent_tensors = torch.cat(packed_image_latent_tensors, dim=1)
-
-            # Output lists of heights/widths
            block_state.image_height = image_heights
            block_state.image_width = image_widths
+            setattr(block_state, image_latent_input_name, packed_image_latent_tensors)

-            # Default height/width from last image
            block_state.height = block_state.height or image_heights[-1]
            block_state.width = block_state.width or image_widths[-1]

-            setattr(block_state, image_latent_input_name, packed_image_latent_tensors)
-
        # Process additional batch inputs (only batch expansion)
        for input_name in self._additional_batch_inputs:
            input_tensor = getattr(block_state, input_name)
            if input_tensor is None:
                continue

+            # Only expand batch size
            input_tensor = repeat_tensor_to_batch_size(
                input_name=input_name,
                input_tensor=input_tensor,
@@ -476,6 +436,8 @@ class QwenImageEditPlusInputsDynamicStep(ModularPipelineBlocks):

        self.set_block_state(state, block_state)
        return components, state
+
+
 class QwenImageControlNetInputsStep(ModularPipelineBlocks):
    model_name = "qwenimage"

--- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks.py
+++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks.py
--- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py
+++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py
@@ -1,465 +0,0 @@
-# Copyright 2025 Qwen-Image Team and The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from ...utils import logging
-from ..modular_pipeline import AutoPipelineBlocks, SequentialPipelineBlocks, ConditionalPipelineBlocks
-from ..modular_pipeline_utils import InsertableDict
-from .before_denoise import (
-    QwenImageControlNetBeforeDenoiserStep,
-    QwenImageCreateMaskLatentsStep,
-    QwenImagePrepareLatentsStep,
-    QwenImagePrepareLatentsWithStrengthStep,
-    QwenImageRoPEInputsStep,
-    QwenImageSetTimestepsStep,
-    QwenImageSetTimestepsWithStrengthStep,
-)
-from .decoders import (
-    QwenImageAfterDenoiseStep,
-    QwenImageDecoderStep,
-    QwenImageInpaintProcessImagesOutputStep,
-    QwenImageProcessImagesOutputStep,
-)
-from .denoise import (
-    QwenImageControlNetDenoiseStep,
-    QwenImageDenoiseStep,
-    QwenImageInpaintControlNetDenoiseStep,
-    QwenImageInpaintDenoiseStep,
-    QwenImageLoopBeforeDenoiserControlNet,
-)
-from .encoders import (
-    QwenImageControlNetVaeEncoderStep,
-    QwenImageInpaintProcessImagesInputStep,
-    QwenImageProcessImagesInputStep,
-    QwenImageTextEncoderStep,
-    QwenImageVaeEncoderDynamicStep,
-)
-from .inputs import (
-    QwenImageControlNetInputsStep,
-    QwenImageInputsDynamicStep,
-    QwenImageTextInputsStep,
-)
-
-
-logger = logging.get_logger(__name__)
-
-
-
-# 1. VAE ENCODER
-
-# inpaint vae encoder
-class QwenImageInpaintVaeEncoderStep(SequentialPipelineBlocks):
-    model_name = "qwenimage"
-    block_classes = [QwenImageInpaintProcessImagesInputStep(), QwenImageVaeEncoderDynamicStep()]
-    block_names = ["preprocess", "encode"]
-
-    @property
-    def description(self) -> str:
-        return (
-            "This step is used for processing image and mask inputs for inpainting tasks. It:\n"
-            " - Resizes the image to the target size, based on `height` and `width`.\n"
-            " - Processes and updates `image` and `mask_image`.\n"
-            " - Creates `image_latents`."
-        )
-
-
-# img2img vae encoder
-class QwenImageImg2ImgVaeEncoderStep(SequentialPipelineBlocks):
-    model_name = "qwenimage"
-
-    block_classes = [QwenImageProcessImagesInputStep(), QwenImageVaeEncoderDynamicStep()]
-    block_names = ["preprocess", "encode"]
-
-    @property
-    def description(self) -> str:
-        return "Vae encoder step that preprocess andencode the image inputs into their latent representations."
-
-
-# auto vae encoder
-class QwenImageAutoVaeEncoderStep(AutoPipelineBlocks):
-    block_classes = [QwenImageInpaintVaeEncoderStep, QwenImageImg2ImgVaeEncoderStep]
-    block_names = ["inpaint", "img2img"]
-    block_trigger_inputs = ["mask_image", "image"]
-
-    @property
-    def description(self):
-        return (
-            "Vae encoder step that encode the image inputs into their latent representations.\n"
-            + "This is an auto pipeline block.\n"
-            + " - `QwenImageInpaintVaeEncoderStep` (inpaint) is used when `mask_image` is provided.\n"
-            + " - `QwenImageImg2ImgVaeEncoderStep` (img2img) is used when `image` is provided.\n"
-            + " - if `mask_image` or `image` is not provided, step will be skipped."
-        )
-
-
-# optional controlnet vae encoder
-class QwenImageOptionalControlNetVaeEncoderStep(AutoPipelineBlocks):
-    block_classes = [QwenImageControlNetVaeEncoderStep]
-    block_names = ["controlnet"]
-    block_trigger_inputs = ["control_image"]
-
-    @property
-    def description(self):
-        return (
-            "Vae encoder step that encode the image inputs into their latent representations.\n"
-            + "This is an auto pipeline block.\n"
-            + " - `QwenImageControlNetVaeEncoderStep` (controlnet) is used when `control_image` is provided.\n"
-            + " - if `control_image` is not provided, step will be skipped."
-        )
-
-# 2. DENOISE
-# input -> prepare_latents -> set_timesteps -> prepare_rope_inputs -> denoise -> after_denoise
-
-# img2img input
-class QwenImageImg2ImgInputStep(SequentialPipelineBlocks):
-    model_name = "qwenimage"
-    block_classes = [QwenImageTextInputsStep(), QwenImageInputsDynamicStep(image_latent_inputs=["image_latents"])]
-    block_names = ["text_inputs", "additional_inputs"]
-
-    @property
-    def description(self):
-        return "Input step that prepares the inputs for the img2img denoising step. It:\n"
-        " - make sure the text embeddings have consistent batch size as well as the additional inputs (`image_latents`).\n"
-        " - update height/width based `image_latents`, patchify `image_latents`."
-
-
-# inpaint input
-class QwenImageInpaintInputStep(SequentialPipelineBlocks):
-    model_name = "qwenimage"
-    block_classes = [QwenImageTextInputsStep(), QwenImageInputsDynamicStep(image_latent_inputs=["image_latents"], additional_batch_inputs=["processed_mask_image"])]
-    block_names = ["text_inputs", "additional_inputs"]
-
-    @property
-    def description(self):
-        return "Input step that prepares the inputs for the inpainting denoising step. It:\n"
-        " - make sure the text embeddings have consistent batch size as well as the additional inputs (`image_latents` and `processed_mask_image`).\n"
-        " - update height/width based `image_latents`, patchify `image_latents`."
-
-# inpaint prepare latents
-class QwenImageInpaintPrepareLatentsStep(SequentialPipelineBlocks):
-    model_name = "qwenimage"
-    block_classes = [QwenImagePrepareLatentsWithStrengthStep(), QwenImageCreateMaskLatentsStep()]
-    block_names = ["add_noise_to_latents", "create_mask_latents"]
-
-    @property
-    def description(self) -> str:
-        return (
-            "This step prepares the latents/image_latents and mask inputs for the inpainting denoising step. It:\n"
-            " - Add noise to the image latents to create the latents input for the denoiser.\n"
-            " - Create the pachified latents `mask` based on the processedmask image.\n"
-        )
-
-# CoreDenoiseStep: 
-# (input +  prepare_latents + set_timesteps + prepare_rope_inputs + denoise + after_denoise)
-
-# 1. text2image
-class QwenImageCoreDenoiseStep(SequentialPipelineBlocks):
-    model_name = "qwenimage"
-    block_classes = [
-        QwenImageTextInputsStep(),
-        QwenImagePrepareLatentsStep(), 
-        QwenImageSetTimestepsStep(), 
-        QwenImageRoPEInputsStep(), 
-        QwenImageDenoiseStep(),
-        QwenImageAfterDenoiseStep(),
-    ]
-    block_names = [
-        "input",
-        "prepare_latents", 
-        "set_timesteps", 
-        "prepare_rope_inputs", 
-        "denoise", 
-        "after_denoise",
-    ]
-
-    @property
-    def description(self):
-        return "step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the inputs (timesteps, latents, rope inputs etc.)."
-
-
-# 2.inpaint
-class QwenImageInpaintCoreDenoiseStep(SequentialPipelineBlocks):
-    model_name = "qwenimage"
-    block_classes = [
-        QwenImageInpaintInputStep(),
-        QwenImagePrepareLatentsStep(), 
-        QwenImageSetTimestepsWithStrengthStep(), 
-        QwenImageInpaintPrepareLatentsStep(), 
-        QwenImageRoPEInputsStep(),
-        QwenImageInpaintDenoiseStep(),
-        QwenImageAfterDenoiseStep(),
-        ]
-    block_names = [
-        "input",
-        "prepare_latents",
-        "set_timesteps", 
-        "prepare_inpaint_latents", 
-        "prepare_rope_inputs",
-        "denoise",
-        "after_denoise",
-        ]
-
-    @property
-    def description(self):
-        return "Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for inpaint task."
-
-
-# 3. img2img
-class QwenImageImg2ImgCoreDenoiseStep(SequentialPipelineBlocks):
-    model_name = "qwenimage"
-    block_classes = [
-        QwenImageImg2ImgInputStep(),
-        QwenImagePrepareLatentsStep(), 
-        QwenImageSetTimestepsWithStrengthStep(), 
-        QwenImagePrepareLatentsWithStrengthStep(), 
-        QwenImageRoPEInputsStep(),
-        QwenImageDenoiseStep(),
-        QwenImageAfterDenoiseStep(),
-        ]
-    block_names = [
-        "input",
-        "prepare_latents", 
-        "set_timesteps", 
-        "prepare_img2img_latents", 
-        "prepare_rope_inputs",
-        "denoise",
-        "after_denoise",
-        ]
-
-    @property
-    def description(self):
-        return "Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for img2img task."
-
-
-
-# 4. text2image + controlnet
-class QwenImageControlNetCoreDenoiseStep(SequentialPipelineBlocks):
-    model_name = "qwenimage"
-    block_classes = [
-        QwenImageTextInputsStep(),
-        QwenImageControlNetInputsStep(),
-        QwenImagePrepareLatentsStep(), 
-        QwenImageSetTimestepsStep(), 
-        QwenImageRoPEInputsStep(), 
-        QwenImageControlNetBeforeDenoiserStep(),
-        QwenImageControlNetDenoiseStep(),
-        QwenImageAfterDenoiseStep(),
-    ]
-    block_names = [
-        "input",
-        "controlnet_input",
-        "prepare_latents", 
-        "set_timesteps", 
-        "prepare_rope_inputs", 
-        "controlnet_before_denoise",
-        "controlnet_denoise",
-        "after_denoise",
-    ]
-
-    @property
-    def description(self):
-        return "step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the inputs (timesteps, latents, rope inputs etc.)."
-
-
-# 5. inpaint + controlnet
-class QwenImageControlNetInpaintCoreDenoiseStep(SequentialPipelineBlocks):
-    model_name = "qwenimage"
-    block_classes = [
-        QwenImageInpaintInputStep(),
-        QwenImageControlNetInputsStep(),
-        QwenImagePrepareLatentsStep(), 
-        QwenImageSetTimestepsWithStrengthStep(), 
-        QwenImageInpaintPrepareLatentsStep(), 
-        QwenImageRoPEInputsStep(),
-        QwenImageControlNetBeforeDenoiserStep(),
-        QwenImageInpaintControlNetDenoiseStep(),
-        QwenImageAfterDenoiseStep(),
-        ]
-    block_names = [
-        "input",
-        "controlnet_input",
-        "prepare_latents",
-        "set_timesteps", 
-        "prepare_inpaint_latents", 
-        "prepare_rope_inputs",
-        "controlnet_before_denoise",
-        "controlnet_denoise",
-        "after_denoise",
-        ]
-
-    @property
-    def description(self):
-        return "Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for inpaint task."
-
-
-# 6. img2img + controlnet
-class QwenImageControlNetImg2ImgCoreDenoiseStep(SequentialPipelineBlocks):
-    model_name = "qwenimage"
-    block_classes = [
-        QwenImageImg2ImgInputStep(),
-        QwenImageControlNetInputsStep(),
-        QwenImagePrepareLatentsStep(), 
-        QwenImageSetTimestepsWithStrengthStep(), 
-        QwenImagePrepareLatentsWithStrengthStep(), 
-        QwenImageRoPEInputsStep(),
-        QwenImageControlNetBeforeDenoiserStep(),
-        QwenImageControlNetDenoiseStep(),
-        QwenImageAfterDenoiseStep(),
-        ]
-    block_names = [
-        "input",
-        "controlnet_input",
-        "prepare_latents", 
-        "set_timesteps", 
-        "prepare_img2img_latents", 
-        "prepare_rope_inputs",
-        "controlnet_before_denoise",
-        "controlnet_denoise",
-        "after_denoise",
-        ]
-
-    @property
-    def description(self):
-        return "Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for img2img task."
-
-
-# auto denoise
-# auto denoise step for controlnet tasks: works for all tasks with controlnet
-class QwenImageAutoCoreDenoiseStep(ConditionalPipelineBlocks):
-    block_classes = [
-        QwenImageCoreDenoiseStep,
-        QwenImageInpaintCoreDenoiseStep, 
-        QwenImageImg2ImgCoreDenoiseStep,
-        QwenImageControlNetCoreDenoiseStep,
-        QwenImageControlNetInpaintCoreDenoiseStep,
-        QwenImageControlNetImg2ImgCoreDenoiseStep,
-    ]
-    block_names = [
-        "text2image",
-        "inpaint",
-        "img2img",
-        "controlnet_text2image",
-        "controlnet_inpaint",
-        "controlnet_img2img"]
-    block_trigger_inputs = ["control_image_latents", "processed_mask_image", "image_latents"]
-    default_block_name = "text2image"
-
-    def select_block(self, control_image_latents=None, processed_mask_image=None, image_latents=None):
-
-        if control_image_latents is not None:
-            if processed_mask_image is not None:
-                return "controlnet_inpaint"
-            elif image_latents is not None:
-                return "controlnet_img2img"
-            else:
-                return "controlnet_text2image"
-        else:
-            if processed_mask_image is not None:
-                return "inpaint"
-            elif image_latents is not None:
-                return "img2img"
-            else:
-                return "text2image"
-
-    @property
-    def description(self):
-        return (
-            "Core step that performs the denoising process. \n"
-            + " - `QwenImageCoreDenoiseStep` (text2image) for text2image tasks.\n"
-            + " - `QwenImageInpaintCoreDenoiseStep` (inpaint) for inpaint tasks.\n"
-            + " - `QwenImageImg2ImgCoreDenoiseStep` (img2img) for img2img tasks.\n"
-            + " - `QwenImageControlNetCoreDenoiseStep` (controlnet_text2image) for text2image tasks with controlnet.\n"
-            + " - `QwenImageControlNetInpaintCoreDenoiseStep` (controlnet_inpaint) for inpaint tasks with controlnet.\n"
-            + " - `QwenImageControlNetImg2ImgCoreDenoiseStep` (controlnet_img2img) for img2img tasks with controlnet.\n"
-            + "This step support text-to-image, image-to-image, inpainting, and controlnet tasks for QwenImage:\n"
-            + " - for image-to-image generation, you need to provide `image_latents`\n"
-            + " - for inpainting, you need to provide `processed_mask_image` and `image_latents`\n"
-            + " - to run the controlnet workflow, you need to provide `control_image_latents`\n"
-            + " - for text-to-image generation, all you need to provide is prompt embeddings"
-        )
-
-
-# 4. DECODE
-
-## 1.1 text2image
-
-#### decode
-#### (standard decode step works for most tasks except for inpaint)
-
-class QwenImageDecodeStep(SequentialPipelineBlocks):
-    model_name = "qwenimage"
-    block_classes = [QwenImageDecoderStep(), QwenImageProcessImagesOutputStep()]
-    block_names = ["decode", "postprocess"]
-
-    @property
-    def description(self):
-        return "Decode step that decodes the latents to images and postprocess the generated image."
-
-
-
-#### inpaint decode
-
-class QwenImageInpaintDecodeStep(SequentialPipelineBlocks):
-    model_name = "qwenimage"
-    block_classes = [QwenImageDecoderStep(), QwenImageInpaintProcessImagesOutputStep()]
-    block_names = ["decode", "postprocess"]
-
-    @property
-    def description(self):
-        return "Decode step that decodes the latents to images and postprocess the generated image, optional apply the mask overally to the original image."
-
-
-# auto decode step for inpaint and text2image tasks
-class QwenImageAutoDecodeStep(AutoPipelineBlocks):
-    block_classes = [QwenImageInpaintDecodeStep, QwenImageDecodeStep]
-    block_names = ["inpaint_decode", "decode"]
-    block_trigger_inputs = ["mask", None]
-
-    @property
-    def description(self):
-        return (
-            "Decode step that decode the latents into images. \n"
-            " This is an auto pipeline block that works for inpaint/text2image/img2img tasks, for both QwenImage and QwenImage-Edit.\n"
-            + " - `QwenImageInpaintDecodeStep` (inpaint) is used when `mask` is provided.\n"
-            + " - `QwenImageDecodeStep` (text2image/img2img) is used when `mask` is not provided.\n"
-        )
-
-
-
-## 1.10 QwenImage/auto block & presets
-AUTO_BLOCKS = InsertableDict(
-    [
-        ("text_encoder", QwenImageTextEncoderStep()),
-        ("vae_encoder", QwenImageAutoVaeEncoderStep()),
-        ("controlnet_vae_encoder", QwenImageOptionalControlNetVaeEncoderStep()),
-        ("denoise", QwenImageAutoCoreDenoiseStep()),
-        ("decode", QwenImageAutoDecodeStep()),
-    ]
-)
-
-
-class QwenImageAutoBlocks(SequentialPipelineBlocks):
-    model_name = "qwenimage"
-
-    block_classes = AUTO_BLOCKS.values()
-    block_names = AUTO_BLOCKS.keys()
-
-    @property
-    def description(self):
-        return (
-            "Auto Modular pipeline for text-to-image, image-to-image, inpainting, and controlnet tasks using QwenImage.\n"
-            + "- for image-to-image generation, you need to provide `image`\n"
-            + "- for inpainting, you need to provide `mask_image` and `image`, optionally you can provide `padding_mask_crop` \n"
-            + "- to run the controlnet workflow, you need to provide `control_image`\n"
-            + "- for text-to-image generation, all you need to provide is `prompt`"
-        )
--- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py
+++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py
@@ -1,329 +0,0 @@
-# Copyright 2025 Qwen-Image Team and The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from typing import Optional
-
-from ...utils import logging
-from ..modular_pipeline import AutoPipelineBlocks, ConditionalPipelineBlocks, SequentialPipelineBlocks
-from ..modular_pipeline_utils import InsertableDict
-from .before_denoise import (
-    QwenImageCreateMaskLatentsStep,
-    QwenImageEditRoPEInputsStep,
-    QwenImagePrepareLatentsStep,
-    QwenImagePrepareLatentsWithStrengthStep,
-    QwenImageSetTimestepsStep,
-    QwenImageSetTimestepsWithStrengthStep,
-)
-from .decoders import (
-    QwenImageAfterDenoiseStep,
-    QwenImageDecoderStep,
-    QwenImageInpaintProcessImagesOutputStep,
-    QwenImageProcessImagesOutputStep,
-)
-from .denoise import (
-    QwenImageEditDenoiseStep,
-    QwenImageEditInpaintDenoiseStep,
-)
-from .encoders import (
-    QwenImageEditResizeDynamicStep,
-    QwenImageEditTextEncoderStep,
-    QwenImageInpaintProcessImagesInputStep,
-    QwenImageProcessImagesInputStep,
-    QwenImageVaeEncoderDynamicStep,
-)
-from .inputs import (
-    QwenImageInputsDynamicStep,
-    QwenImageTextInputsStep,
-)
-
-
-logger = logging.get_logger(__name__)
-
-
-# ====================
-# 1. TEXT ENCODER
-# ====================
-
-class QwenImageEditVLEncoderStep(SequentialPipelineBlocks):
-    """VL encoder that takes both image and text prompts."""
-    model_name = "qwenimage-edit"
-    block_classes = [
-        QwenImageEditResizeDynamicStep(),
-        QwenImageEditTextEncoderStep(),
-    ]
-    block_names = ["resize", "encode"]
-
-    @property
-    def description(self) -> str:
-        return "QwenImage-Edit VL encoder step that encode the image and text prompts together."
-
-
-# ====================
-# 2. VAE ENCODER
-# ====================
-
-# Edit VAE encoder
-class QwenImageEditVaeEncoderStep(SequentialPipelineBlocks):
-    model_name = "qwenimage-edit"
-    block_classes = [
-        QwenImageEditResizeDynamicStep(),
-        QwenImageProcessImagesInputStep(),
-        QwenImageVaeEncoderDynamicStep(),
-    ]
-    block_names = ["resize", "preprocess", "encode"]
-
-    @property
-    def description(self) -> str:
-        return "Vae encoder step that encode the image inputs into their latent representations."
-
-
-# Edit Inpaint VAE encoder
-class QwenImageEditInpaintVaeEncoderStep(SequentialPipelineBlocks):
-    model_name = "qwenimage-edit"
-    block_classes = [
-        QwenImageEditResizeDynamicStep(),
-        QwenImageInpaintProcessImagesInputStep(),
-        QwenImageVaeEncoderDynamicStep(input_name="processed_image", output_name="image_latents"),
-    ]
-    block_names = ["resize", "preprocess", "encode"]
-
-    @property
-    def description(self) -> str:
-        return (
-            "This step is used for processing image and mask inputs for QwenImage-Edit inpaint tasks. It:\n"
-            " - resize the image for target area (1024 * 1024) while maintaining the aspect ratio.\n"
-            " - process the resized image and mask image.\n"
-            " - create image latents."
-        )
-
-
-# Auto VAE encoder
-class QwenImageEditAutoVaeEncoderStep(AutoPipelineBlocks):
-    block_classes = [QwenImageEditInpaintVaeEncoderStep, QwenImageEditVaeEncoderStep]
-    block_names = ["edit_inpaint", "edit"]
-    block_trigger_inputs = ["mask_image", "image"]
-
-    @property
-    def description(self):
-        return (
-            "Vae encoder step that encode the image inputs into their latent representations.\n"
-            "This is an auto pipeline block.\n"
-            " - `QwenImageEditInpaintVaeEncoderStep` (edit_inpaint) is used when `mask_image` is provided.\n"
-            " - `QwenImageEditVaeEncoderStep` (edit) is used when `image` is provided.\n"
-            " - if `mask_image` or `image` is not provided, step will be skipped."
-        )
-
-
-# ====================
-# 3. DENOISE - input -> prepare_latents -> set_timesteps -> prepare_rope_inputs -> denoise -> after_denoise
-# ====================
-
-# Edit input step
-class QwenImageEditInputStep(SequentialPipelineBlocks):
-    model_name = "qwenimage-edit"
-    block_classes = [
-        QwenImageTextInputsStep(),
-        QwenImageInputsDynamicStep(image_latent_inputs=["image_latents"]),
-    ]
-    block_names = ["text_inputs", "additional_inputs"]
-
-    @property
-    def description(self):
-        return (
-            "Input step that prepares the inputs for the edit denoising step. It:\n"
-            " - make sure the text embeddings have consistent batch size as well as the additional inputs.\n"
-            " - update height/width based `image_latents`, patchify `image_latents`."
-        )
-
-
-# Edit Inpaint input step
-class QwenImageEditInpaintInputStep(SequentialPipelineBlocks):
-    model_name = "qwenimage-edit"
-    block_classes = [
-        QwenImageTextInputsStep(),
-        QwenImageInputsDynamicStep(image_latent_inputs=["image_latents"], additional_batch_inputs=["processed_mask_image"]),
-    ]
-    block_names = ["text_inputs", "additional_inputs"]
-
-    @property
-    def description(self):
-        return (
-            "Input step that prepares the inputs for the edit inpaint denoising step. It:\n"
-            " - make sure the text embeddings have consistent batch size as well as the additional inputs.\n"
-            " - update height/width based `image_latents`, patchify `image_latents`."
-        )
-
-
-# Edit Inpaint prepare latents step
-class QwenImageEditInpaintPrepareLatentsStep(SequentialPipelineBlocks):
-    model_name = "qwenimage-edit"
-    block_classes = [QwenImagePrepareLatentsWithStrengthStep(), QwenImageCreateMaskLatentsStep()]
-    block_names = ["add_noise_to_latents", "create_mask_latents"]
-
-    @property
-    def description(self) -> str:
-        return (
-            "This step prepares the latents/image_latents and mask inputs for the edit inpainting denoising step. It:\n"
-            " - Add noise to the image latents to create the latents input for the denoiser.\n"
-            " - Create the patchified latents `mask` based on the processed mask image.\n"
-        )
-
-
-# 1. Edit (img2img) core denoise
-class QwenImageEditCoreDenoiseStep(SequentialPipelineBlocks):
-    model_name = "qwenimage-edit"
-    block_classes = [
-        QwenImageEditInputStep(),
-        QwenImagePrepareLatentsStep(),
-        QwenImageSetTimestepsStep(),
-        QwenImageEditRoPEInputsStep(),
-        QwenImageEditDenoiseStep(),
-        QwenImageAfterDenoiseStep(),
-    ]
-    block_names = [
-        "input",
-        "prepare_latents",
-        "set_timesteps",
-        "prepare_rope_inputs",
-        "denoise",
-        "after_denoise",
-    ]
-
-    @property
-    def description(self):
-        return "Core denoising workflow for QwenImage-Edit edit (img2img) task."
-
-
-# 2. Edit Inpaint core denoise
-class QwenImageEditInpaintCoreDenoiseStep(SequentialPipelineBlocks):
-    model_name = "qwenimage-edit"
-    block_classes = [
-        QwenImageEditInpaintInputStep(),
-        QwenImagePrepareLatentsStep(),
-        QwenImageSetTimestepsWithStrengthStep(),
-        QwenImageEditInpaintPrepareLatentsStep(),
-        QwenImageEditRoPEInputsStep(),
-        QwenImageEditInpaintDenoiseStep(),
-        QwenImageAfterDenoiseStep(),
-    ]
-    block_names = [
-        "input",
-        "prepare_latents",
-        "set_timesteps",
-        "prepare_inpaint_latents",
-        "prepare_rope_inputs",
-        "denoise",
-        "after_denoise",
-    ]
-
-    @property
-    def description(self):
-        return "Core denoising workflow for QwenImage-Edit edit inpaint task."
-
-
-# Auto core denoise step
-class QwenImageEditAutoCoreDenoiseStep(ConditionalPipelineBlocks):
-    block_classes = [
-        QwenImageEditInpaintCoreDenoiseStep,
-        QwenImageEditCoreDenoiseStep,
-    ]
-    block_names = ["edit_inpaint", "edit"]
-    block_trigger_inputs = ["processed_mask_image", "image_latents"]
-    default_block_name = "edit"
-
-    def select_block(self, processed_mask_image=None, image_latents=None) -> Optional[str]:
-        if processed_mask_image is not None:
-            return "edit_inpaint"
-        elif image_latents is not None:
-            return "edit"
-        return None
-
-    @property
-    def description(self):
-        return (
-            "Auto core denoising step that selects the appropriate workflow based on inputs.\n"
-            " - `QwenImageEditInpaintCoreDenoiseStep` when `processed_mask_image` is provided\n"
-            " - `QwenImageEditCoreDenoiseStep` when `image_latents` is provided\n"
-            "Supports edit (img2img) and edit inpainting tasks for QwenImage-Edit."
-        )
-
-
-# ====================
-# 4. DECODE
-# ====================
-
-# Decode step (standard)
-class QwenImageEditDecodeStep(SequentialPipelineBlocks):
-    model_name = "qwenimage-edit"
-    block_classes = [QwenImageDecoderStep(), QwenImageProcessImagesOutputStep()]
-    block_names = ["decode", "postprocess"]
-
-    @property
-    def description(self):
-        return "Decode step that decodes the latents to images and postprocess the generated image."
-
-
-# Inpaint decode step
-class QwenImageEditInpaintDecodeStep(SequentialPipelineBlocks):
-    model_name = "qwenimage-edit"
-    block_classes = [QwenImageDecoderStep(), QwenImageInpaintProcessImagesOutputStep()]
-    block_names = ["decode", "postprocess"]
-
-    @property
-    def description(self):
-        return "Decode step that decodes the latents to images and postprocess the generated image, optionally apply the mask overlay to the original image."
-
-
-# Auto decode step
-class QwenImageEditAutoDecodeStep(AutoPipelineBlocks):
-    block_classes = [QwenImageEditInpaintDecodeStep, QwenImageEditDecodeStep]
-    block_names = ["inpaint_decode", "decode"]
-    block_trigger_inputs = ["mask", None]
-
-    @property
-    def description(self):
-        return (
-            "Decode step that decode the latents into images.\n"
-            "This is an auto pipeline block.\n"
-            " - `QwenImageEditInpaintDecodeStep` (inpaint) is used when `mask` is provided.\n"
-            " - `QwenImageEditDecodeStep` (edit) is used when `mask` is not provided.\n"
-        )
-
-
-# ====================
-# 5. AUTO BLOCKS & PRESETS
-# ====================
-
-EDIT_AUTO_BLOCKS = InsertableDict(
-    [
-        ("text_encoder", QwenImageEditVLEncoderStep()),
-        ("vae_encoder", QwenImageEditAutoVaeEncoderStep()),
-        ("denoise", QwenImageEditAutoCoreDenoiseStep()),
-        ("decode", QwenImageEditAutoDecodeStep()),
-    ]
-)
-
-
-class QwenImageEditAutoBlocks(SequentialPipelineBlocks):
-    model_name = "qwenimage-edit"
-    block_classes = EDIT_AUTO_BLOCKS.values()
-    block_names = EDIT_AUTO_BLOCKS.keys()
-
-    @property
-    def description(self):
-        return (
-            "Auto Modular pipeline for edit (img2img) and edit inpaint tasks using QwenImage-Edit.\n"
-            "- for edit (img2img) generation, you need to provide `image`\n"
-            "- for edit inpainting, you need to provide `mask_image` and `image`, optionally you can provide `padding_mask_crop`\n"
-        )
--- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py
+++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py
@@ -1,175 +0,0 @@
-# Copyright 2025 Qwen-Image Team and The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from ...utils import logging
-from ..modular_pipeline import AutoPipelineBlocks, SequentialPipelineBlocks
-from ..modular_pipeline_utils import InsertableDict
-from .before_denoise import (
-    QwenImageEditPlusRoPEInputsStep,
-    QwenImagePrepareLatentsStep,
-    QwenImageSetTimestepsStep,
-)
-from .decoders import (
-    QwenImageAfterDenoiseStep,
-    QwenImageDecoderStep,
-    QwenImageProcessImagesOutputStep,
-)
-from .denoise import (
-    QwenImageEditDenoiseStep,
-)
-from .encoders import (
-    QwenImageEditPlusResizeDynamicStep,
-    QwenImageEditPlusTextEncoderStep,
-    QwenImageEditPlusProcessImagesInputStep,
-    QwenImageVaeEncoderDynamicStep,
-)
-from .inputs import (
-    QwenImageEditPlusInputsDynamicStep,
-    QwenImageTextInputsStep,
-)
-
-
-logger = logging.get_logger(__name__)
-
-
-# ====================
-# 1. TEXT ENCODER
-# ====================
-
-class QwenImageEditPlusVLEncoderStep(SequentialPipelineBlocks):
-    """VL encoder that takes both image and text prompts. Uses 384x384 target area."""
-    model_name = "qwenimage-edit-plus"
-    block_classes = [
-        QwenImageEditPlusResizeDynamicStep(target_area=384 * 384, output_name="resized_cond_image"),
-        QwenImageEditPlusTextEncoderStep(),
-    ]
-    block_names = ["resize", "encode"]
-
-    @property
-    def description(self) -> str:
-        return "QwenImage-Edit Plus VL encoder step that encodes the image and text prompts together."
-
-
-# ====================
-# 2. VAE ENCODER
-# ====================
-
-class QwenImageEditPlusVaeEncoderStep(SequentialPipelineBlocks):
-    """VAE encoder that handles multiple images with different sizes. Uses 1024x1024 target area."""
-    model_name = "qwenimage-edit-plus"
-    block_classes = [
-        QwenImageEditPlusResizeDynamicStep(target_area=1024 * 1024, output_name="resized_image"),
-        QwenImageEditPlusProcessImagesInputStep(),
-        QwenImageVaeEncoderDynamicStep(),
-    ]
-    block_names = ["resize", "preprocess", "encode"]
-
-    @property
-    def description(self) -> str:
-        return (
-            "VAE encoder step that encodes image inputs into latent representations.\n"
-            "Each image is resized independently based on its own aspect ratio to 1024x1024 target area."
-        )
-
-
-# ====================
-# 3. DENOISE - input -> prepare_latents -> set_timesteps -> prepare_rope_inputs -> denoise -> after_denoise
-# ====================
-
-# Edit Plus input step
-class QwenImageEditPlusInputStep(SequentialPipelineBlocks):
-    model_name = "qwenimage-edit-plus"
-    block_classes = [
-        QwenImageTextInputsStep(),
-        QwenImageEditPlusInputsDynamicStep(image_latent_inputs=["image_latents"]),
-    ]
-    block_names = ["text_inputs", "additional_inputs"]
-
-    @property
-    def description(self):
-        return (
-            "Input step that prepares the inputs for the Edit Plus denoising step. It:\n"
-            " - Standardizes text embeddings batch size.\n"
-            " - Processes list of image latents: patchifies, concatenates along dim=1, expands batch.\n"
-            " - Outputs lists of image_height/image_width for RoPE calculation.\n"
-            " - Defaults height/width from last image in the list."
-        )
-
-
-# Edit Plus core denoise
-class QwenImageEditPlusCoreDenoiseStep(SequentialPipelineBlocks):
-    model_name = "qwenimage-edit-plus"
-    block_classes = [
-        QwenImageEditPlusInputStep(),
-        QwenImagePrepareLatentsStep(),
-        QwenImageSetTimestepsStep(),
-        QwenImageEditPlusRoPEInputsStep(),
-        QwenImageEditDenoiseStep(),
-        QwenImageAfterDenoiseStep(),
-    ]
-    block_names = [
-        "input",
-        "prepare_latents",
-        "set_timesteps",
-        "prepare_rope_inputs",
-        "denoise",
-        "after_denoise",
-    ]
-
-    @property
-    def description(self):
-        return "Core denoising workflow for QwenImage-Edit Plus edit (img2img) task."
-
-
-# ====================
-# 4. DECODE
-# ====================
-
-class QwenImageEditPlusDecodeStep(SequentialPipelineBlocks):
-    model_name = "qwenimage-edit-plus"
-    block_classes = [QwenImageDecoderStep(), QwenImageProcessImagesOutputStep()]
-    block_names = ["decode", "postprocess"]
-
-    @property
-    def description(self):
-        return "Decode step that decodes the latents to images and postprocesses the generated image."
-
-
-# ====================
-# 5. AUTO BLOCKS & PRESETS
-# ====================
-
-EDIT_PLUS_AUTO_BLOCKS = InsertableDict(
-    [
-        ("text_encoder", QwenImageEditPlusVLEncoderStep()),
-        ("vae_encoder", QwenImageEditPlusVaeEncoderStep()),
-        ("denoise", QwenImageEditPlusCoreDenoiseStep()),
-        ("decode", QwenImageEditPlusDecodeStep()),
-    ]
-)
-
-
-class QwenImageEditPlusAutoBlocks(SequentialPipelineBlocks):
-    model_name = "qwenimage-edit-plus"
-    block_classes = EDIT_PLUS_AUTO_BLOCKS.values()
-    block_names = EDIT_PLUS_AUTO_BLOCKS.keys()
-
-    @property
-    def description(self):
-        return (
-            "Auto Modular pipeline for edit (img2img) tasks using QwenImage-Edit Plus.\n"
-            "- `image` is required input (can be single image or list of images).\n"
-            "- Each image is resized independently based on its own aspect ratio.\n"
-            "- VL encoder uses 384x384 target area, VAE encoder uses 1024x1024 target area."
-        )
--- a/src/diffusers/modular_pipelines/qwenimage/node_utils.py
+++ b/src/diffusers/modular_pipelines/qwenimage/node_utils.py
@@ -0,0 +1,95 @@
+# Copyright 2025 Qwen-Image Team and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# mellon nodes
+QwenImage_NODE_TYPES_PARAMS_MAP = {
+    "controlnet": {
+        "inputs": [
+            "control_image",
+            "controlnet_conditioning_scale",
+            "control_guidance_start",
+            "control_guidance_end",
+            "height",
+            "width",
+        ],
+        "model_inputs": [
+            "controlnet",
+            "vae",
+        ],
+        "outputs": [
+            "controlnet_out",
+        ],
+        "block_names": ["controlnet_vae_encoder"],
+    },
+    "denoise": {
+        "inputs": [
+            "embeddings",
+            "width",
+            "height",
+            "seed",
+            "num_inference_steps",
+            "guidance_scale",
+            "image_latents",
+            "strength",
+            "controlnet",
+        ],
+        "model_inputs": [
+            "unet",
+            "guider",
+            "scheduler",
+        ],
+        "outputs": [
+            "latents",
+            "latents_preview",
+        ],
+        "block_names": ["denoise"],
+    },
+    "vae_encoder": {
+        "inputs": [
+            "image",
+            "width",
+            "height",
+        ],
+        "model_inputs": [
+            "vae",
+        ],
+        "outputs": [
+            "image_latents",
+        ],
+    },
+    "text_encoder": {
+        "inputs": [
+            "prompt",
+            "negative_prompt",
+        ],
+        "model_inputs": [
+            "text_encoders",
+        ],
+        "outputs": [
+            "embeddings",
+        ],
+    },
+    "decoder": {
+        "inputs": [
+            "latents",
+        ],
+        "model_inputs": [
+            "vae",
+        ],
+        "outputs": [
+            "images",
+        ],
+    },
+}
--- a/src/diffusers/modular_pipelines/stable_diffusion_xl/node_utils.py
+++ b/src/diffusers/modular_pipelines/stable_diffusion_xl/node_utils.py
@@ -0,0 +1,99 @@
+# Copyright 2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+SDXL_NODE_TYPES_PARAMS_MAP = {
+    "controlnet": {
+        "inputs": [
+            "control_image",
+            "controlnet_conditioning_scale",
+            "control_guidance_start",
+            "control_guidance_end",
+            "height",
+            "width",
+        ],
+        "model_inputs": [
+            "controlnet",
+        ],
+        "outputs": [
+            "controlnet_out",
+        ],
+        "block_names": [None],
+    },
+    "denoise": {
+        "inputs": [
+            "embeddings",
+            "width",
+            "height",
+            "seed",
+            "num_inference_steps",
+            "guidance_scale",
+            "image_latents",
+            "strength",
+            # custom adapters coming in as inputs
+            "controlnet",
+            # ip_adapter is optional and custom; include if available
+            "ip_adapter",
+        ],
+        "model_inputs": [
+            "unet",
+            "guider",
+            "scheduler",
+        ],
+        "outputs": [
+            "latents",
+            "latents_preview",
+        ],
+        "block_names": ["denoise"],
+    },
+    "vae_encoder": {
+        "inputs": [
+            "image",
+            "width",
+            "height",
+        ],
+        "model_inputs": [
+            "vae",
+        ],
+        "outputs": [
+            "image_latents",
+        ],
+        "block_names": ["vae_encoder"],
+    },
+    "text_encoder": {
+        "inputs": [
+            "prompt",
+            "negative_prompt",
+        ],
+        "model_inputs": [
+            "text_encoders",
+        ],
+        "outputs": [
+            "embeddings",
+        ],
+        "block_names": ["text_encoder"],
+    },
+    "decoder": {
+        "inputs": [
+            "latents",
+        ],
+        "model_inputs": [
+            "vae",
+        ],
+        "outputs": [
+            "images",
+        ],
+        "block_names": ["decode"],
+    },
+}
--- a/src/diffusers/modular_pipelines/z_image/denoise.py
+++ b/src/diffusers/modular_pipelines/z_image/denoise.py
@@ -129,10 +129,6 @@ class ZImageLoopDenoiser(ModularPipelineBlocks):
                type_hint=int,
                description="The number of inference steps to use for the denoising process. Can be generated in set_timesteps step.",
            ),
-            InputParam(
-                kwargs_type="denoiser_input_fields",
-                description="conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.",
-            ),
        ]
        guider_input_names = []
        uncond_guider_input_names = []
--- a/src/diffusers/modular_pipelines/z_image/modular_blocks.py
+++ b/src/diffusers/modular_pipelines/z_image/modular_blocks.py
@@ -119,7 +119,7 @@ class ZImageAutoDenoiseStep(AutoPipelineBlocks):

 class ZImageAutoVaeImageEncoderStep(AutoPipelineBlocks):
    block_classes = [ZImageVaeImageEncoderStep]
-    block_names = ["vae_encoder"]
+    block_names = ["vae_image_encoder"]
    block_trigger_inputs = ["image"]

    @property
@@ -137,7 +137,7 @@ class ZImageAutoBlocks(SequentialPipelineBlocks):
        ZImageAutoDenoiseStep,
        ZImageVaeDecoderStep,
    ]
-    block_names = ["text_encoder", "vae_encoder", "denoise", "decode"]
+    block_names = ["text_encoder", "vae_image_encoder", "denoise", "decode"]

    @property
    def description(self) -> str:
@@ -162,7 +162,7 @@ TEXT2IMAGE_BLOCKS = InsertableDict(
 IMAGE2IMAGE_BLOCKS = InsertableDict(
    [
        ("text_encoder", ZImageTextEncoderStep),
-        ("vae_encoder", ZImageVaeImageEncoderStep),
+        ("vae_image_encoder", ZImageVaeImageEncoderStep),
        ("input", ZImageTextInputStep),
        ("additional_inputs", ZImageAdditionalInputsStep(image_latent_inputs=["image_latents"])),
        ("prepare_latents", ZImagePrepareLatentsStep),
@@ -178,7 +178,7 @@ IMAGE2IMAGE_BLOCKS = InsertableDict(
 AUTO_BLOCKS = InsertableDict(
    [
        ("text_encoder", ZImageTextEncoderStep),
-        ("vae_encoder", ZImageAutoVaeImageEncoderStep),
+        ("vae_image_encoder", ZImageAutoVaeImageEncoderStep),
        ("denoise", ZImageAutoDenoiseStep),
        ("decode", ZImageVaeDecoderStep),
    ]
--- a/src/diffusers/pipelines/__init__.py
+++ b/src/diffusers/pipelines/__init__.py
@@ -165,7 +165,6 @@ else:
    _import_structure["cogview4"] = ["CogView4Pipeline", "CogView4ControlPipeline"]
    _import_structure["consisid"] = ["ConsisIDPipeline"]
    _import_structure["cosmos"] = [
-        "Cosmos2_5_PredictBasePipeline",
        "Cosmos2TextToImagePipeline",
        "CosmosTextToWorldPipeline",
        "CosmosVideoToWorldPipeline",
@@ -292,7 +291,6 @@ else:
    _import_structure["lumina"] = ["LuminaPipeline", "LuminaText2ImgPipeline"]
    _import_structure["lumina2"] = ["Lumina2Pipeline", "Lumina2Text2ImgPipeline"]
    _import_structure["lucy"] = ["LucyEditPipeline"]
-    _import_structure["longcat_image"] = ["LongCatImagePipeline", "LongCatImageEditPipeline"]
    _import_structure["marigold"].extend(
        [
            "MarigoldDepthPipeline",
@@ -406,12 +404,7 @@ else:
        "Kandinsky5T2IPipeline",
        "Kandinsky5I2IPipeline",
    ]
-    _import_structure["z_image"] = [
-        "ZImageImg2ImgPipeline",
-        "ZImagePipeline",
-        "ZImageControlNetPipeline",
-        "ZImageControlNetInpaintPipeline",
-    ]
+    _import_structure["z_image"] = ["ZImageImg2ImgPipeline", "ZImagePipeline"]
    _import_structure["skyreels_v2"] = [
        "SkyReelsV2DiffusionForcingPipeline",
        "SkyReelsV2DiffusionForcingImageToVideoPipeline",
@@ -428,7 +421,6 @@ else:
        "QwenImageEditInpaintPipeline",
        "QwenImageControlNetInpaintPipeline",
        "QwenImageControlNetPipeline",
-        "QwenImageLayeredPipeline",
    ]
    _import_structure["chronoedit"] = ["ChronoEditPipeline"]
 try:
@@ -623,7 +615,6 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
            StableDiffusionXLControlNetXSPipeline,
        )
        from .cosmos import (
-            Cosmos2_5_PredictBasePipeline,
            Cosmos2TextToImagePipeline,
            Cosmos2VideoToWorldPipeline,
            CosmosTextToWorldPipeline,
@@ -727,7 +718,6 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
            LEditsPPPipelineStableDiffusion,
            LEditsPPPipelineStableDiffusionXL,
        )
-        from .longcat_image import LongCatImageEditPipeline, LongCatImagePipeline
        from .ltx import LTXConditionPipeline, LTXImageToVideoPipeline, LTXLatentUpsamplePipeline, LTXPipeline
        from .lucy import LucyEditPipeline
        from .lumina import LuminaPipeline, LuminaText2ImgPipeline
@@ -772,7 +762,6 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
            QwenImageEditPlusPipeline,
            QwenImageImg2ImgPipeline,
            QwenImageInpaintPipeline,
-            QwenImageLayeredPipeline,
            QwenImagePipeline,
        )
        from .sana import (
@@ -852,12 +841,7 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
            WuerstchenDecoderPipeline,
            WuerstchenPriorPipeline,
        )
-        from .z_image import (
-            ZImageControlNetInpaintPipeline,
-            ZImageControlNetPipeline,
-            ZImageImg2ImgPipeline,
-            ZImagePipeline,
-        )
+        from .z_image import ZImageImg2ImgPipeline, ZImagePipeline

        try:
            if not is_onnx_available():
--- a/src/diffusers/pipelines/cosmos/__init__.py
+++ b/src/diffusers/pipelines/cosmos/__init__.py
@@ -22,9 +22,6 @@ except OptionalDependencyNotAvailable:

    _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
 else:
-    _import_structure["pipeline_cosmos2_5_predict"] = [
-        "Cosmos2_5_PredictBasePipeline",
-    ]
    _import_structure["pipeline_cosmos2_text2image"] = ["Cosmos2TextToImagePipeline"]
    _import_structure["pipeline_cosmos2_video2world"] = ["Cosmos2VideoToWorldPipeline"]
    _import_structure["pipeline_cosmos_text2world"] = ["CosmosTextToWorldPipeline"]
@@ -38,9 +35,6 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
    except OptionalDependencyNotAvailable:
        from ...utils.dummy_torch_and_transformers_objects import *
    else:
-        from .pipeline_cosmos2_5_predict import (
-            Cosmos2_5_PredictBasePipeline,
-        )
        from .pipeline_cosmos2_text2image import Cosmos2TextToImagePipeline
        from .pipeline_cosmos2_video2world import Cosmos2VideoToWorldPipeline
        from .pipeline_cosmos_text2world import CosmosTextToWorldPipeline
--- a/src/diffusers/pipelines/cosmos/pipeline_cosmos2_5_predict.py
+++ b/src/diffusers/pipelines/cosmos/pipeline_cosmos2_5_predict.py
@@ -1,847 +0,0 @@
-# Copyright 2025 The NVIDIA Team and The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from typing import Callable, Dict, List, Optional, Union
-
-import numpy as np
-import torch
-import torchvision
-import torchvision.transforms
-import torchvision.transforms.functional
-from transformers import AutoTokenizer, Qwen2_5_VLForConditionalGeneration
-
-from ...callbacks import MultiPipelineCallbacks, PipelineCallback
-from ...image_processor import PipelineImageInput
-from ...models import AutoencoderKLWan, CosmosTransformer3DModel
-from ...schedulers import UniPCMultistepScheduler
-from ...utils import is_cosmos_guardrail_available, is_torch_xla_available, logging, replace_example_docstring
-from ...utils.torch_utils import randn_tensor
-from ...video_processor import VideoProcessor
-from ..pipeline_utils import DiffusionPipeline
-from .pipeline_output import CosmosPipelineOutput
-
-
-if is_cosmos_guardrail_available():
-    from cosmos_guardrail import CosmosSafetyChecker
-else:
-
-    class CosmosSafetyChecker:
-        def __init__(self, *args, **kwargs):
-            raise ImportError(
-                "`cosmos_guardrail` is not installed. Please install it to use the safety checker for Cosmos: `pip install cosmos_guardrail`."
-            )
-
-
-if is_torch_xla_available():
-    import torch_xla.core.xla_model as xm
-
-    XLA_AVAILABLE = True
-else:
-    XLA_AVAILABLE = False
-
-logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
-
-
-# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
-def retrieve_latents(
-    encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
-):
-    if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
-        return encoder_output.latent_dist.sample(generator)
-    elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax":
-        return encoder_output.latent_dist.mode()
-    elif hasattr(encoder_output, "latents"):
-        return encoder_output.latents
-    else:
-        raise AttributeError("Could not access latents of provided encoder_output")
-
-
-EXAMPLE_DOC_STRING = """
-    Examples:
-        ```python
-        >>> import torch
-        >>> from diffusers import Cosmos2_5_PredictBasePipeline
-        >>> from diffusers.utils import export_to_video, load_image, load_video
-
-        >>> model_id = "nvidia/Cosmos-Predict2.5-2B"
-        >>> pipe = Cosmos2_5_PredictBasePipeline.from_pretrained(
-        ...     model_id, revision="diffusers/base/pre-trianed", torch_dtype=torch.bfloat16
-        ... )
-        >>> pipe = pipe.to("cuda")
-
-        >>> # Common negative prompt reused across modes.
-        >>> negative_prompt = (
-        ...     "The video captures a series of frames showing ugly scenes, static with no motion, motion blur, "
-        ...     "over-saturation, shaky footage, low resolution, grainy texture, pixelated images, poorly lit areas, "
-        ...     "underexposed and overexposed scenes, poor color balance, washed out colors, choppy sequences, jerky "
-        ...     "movements, low frame rate, artifacting, color banding, unnatural transitions, outdated special effects, "
-        ...     "fake elements, unconvincing visuals, poorly edited content, jump cuts, visual noise, and flickering. "
-        ...     "Overall, the video is of poor quality."
-        ... )
-
-        >>> # Text2World: generate a 93-frame world video from text only.
-        >>> prompt = (
-        ...     "As the red light shifts to green, the red bus at the intersection begins to move forward, its headlights "
-        ...     "cutting through the falling snow. The snowy tire tracks deepen as the vehicle inches ahead, casting fresh "
-        ...     "lines onto the slushy road. Around it, streetlights glow warmer, illuminating the drifting flakes and wet "
-        ...     "reflections on the asphalt. Other cars behind start to edge forward, their beams joining the scene. "
-        ...     "The stillness of the urban street transitions into motion as the quiet snowfall is punctuated by the slow "
-        ...     "advance of traffic through the frosty city corridor."
-        ... )
-        >>> video = pipe(
-        ...     image=None,
-        ...     video=None,
-        ...     prompt=prompt,
-        ...     negative_prompt=negative_prompt,
-        ...     num_frames=93,
-        ...     generator=torch.Generator().manual_seed(1),
-        ... ).frames[0]
-        >>> export_to_video(video, "text2world.mp4", fps=16)
-
-        >>> # Image2World: condition on a single image and generate a 93-frame world video.
-        >>> prompt = (
-        ...     "A high-definition video captures the precision of robotic welding in an industrial setting. "
-        ...     "The first frame showcases a robotic arm, equipped with a welding torch, positioned over a large metal structure. "
-        ...     "The welding process is in full swing, with bright sparks and intense light illuminating the scene, creating a vivid "
-        ...     "display of blue and white hues. A significant amount of smoke billows around the welding area, partially obscuring "
-        ...     "the view but emphasizing the heat and activity. The background reveals parts of the workshop environment, including a "
-        ...     "ventilation system and various pieces of machinery, indicating a busy and functional industrial workspace. As the video "
-        ...     "progresses, the robotic arm maintains its steady position, continuing the welding process and moving to its left. "
-        ...     "The welding torch consistently emits sparks and light, and the smoke continues to rise, diffusing slightly as it moves upward. "
-        ...     "The metal surface beneath the torch shows ongoing signs of heating and melting. The scene retains its industrial ambiance, with "
-        ...     "the welding sparks and smoke dominating the visual field, underscoring the ongoing nature of the welding operation."
-        ... )
-        >>> image = load_image(
-        ...     "https://media.githubusercontent.com/media/nvidia-cosmos/cosmos-predict2.5/refs/heads/main/assets/base/robot_welding.jpg"
-        ... )
-        >>> video = pipe(
-        ...     image=image,
-        ...     video=None,
-        ...     prompt=prompt,
-        ...     negative_prompt=negative_prompt,
-        ...     num_frames=93,
-        ...     generator=torch.Generator().manual_seed(1),
-        ... ).frames[0]
-        >>> # export_to_video(video, "image2world.mp4", fps=16)
-
-        >>> # Video2World: condition on an input clip and predict a 93-frame world video.
-        >>> prompt = (
-        ...     "The video opens with an aerial view of a large-scale sand mining construction operation, showcasing extensive piles "
-        ...     "of brown sand meticulously arranged in parallel rows. A central water channel, fed by a water pipe, flows through the "
-        ...     "middle of these sand heaps, creating ripples and movement as it cascades down. The surrounding area features dense green "
-        ...     "vegetation on the left, contrasting with the sandy terrain, while a body of water is visible in the background on the right. "
-        ...     "As the video progresses, a piece of heavy machinery, likely a bulldozer, enters the frame from the right, moving slowly along "
-        ...     "the edge of the sand piles. This machinery's presence indicates ongoing construction work in the operation. The final frame "
-        ...     "captures the same scene, with the water continuing its flow and the bulldozer still in motion, maintaining the dynamic yet "
-        ...     "steady pace of the construction activity."
-        ... )
-        >>> input_video = load_video(
-        ...     "https://github.com/nvidia-cosmos/cosmos-predict2.5/raw/refs/heads/main/assets/base/sand_mining.mp4"
-        ... )
-        >>> video = pipe(
-        ...     image=None,
-        ...     video=input_video,
-        ...     prompt=prompt,
-        ...     negative_prompt=negative_prompt,
-        ...     num_frames=93,
-        ...     generator=torch.Generator().manual_seed(1),
-        ... ).frames[0]
-        >>> export_to_video(video, "video2world.mp4", fps=16)
-
-        >>> # To produce an image instead of a world (video) clip, set num_frames=1 and
-        >>> # save the first frame: pipe(..., num_frames=1).frames[0][0].
-        ```
-"""
-
-
-class Cosmos2_5_PredictBasePipeline(DiffusionPipeline):
-    r"""
-    Pipeline for [Cosmos Predict2.5](https://github.com/nvidia-cosmos/cosmos-predict2.5) base model.
-
-    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
-    implemented for all pipelines (downloading, saving, running on a particular device, etc.).
-
-    Args:
-        text_encoder ([`Qwen2_5_VLForConditionalGeneration`]):
-            Frozen text-encoder. Cosmos Predict2.5 uses the [Qwen2.5
-            VL](https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct) encoder.
-        tokenizer (`AutoTokenizer`):
-            Tokenizer associated with the Qwen2.5 VL encoder.
-        transformer ([`CosmosTransformer3DModel`]):
-            Conditional Transformer to denoise the encoded image latents.
-        scheduler ([`UniPCMultistepScheduler`]):
-            A scheduler to be used in combination with `transformer` to denoise the encoded image latents.
-        vae ([`AutoencoderKLWan`]):
-            Variational Auto-Encoder (VAE) Model to encode and decode videos to and from latent representations.
-    """
-
-    model_cpu_offload_seq = "text_encoder->transformer->vae"
-    _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"]
-    # We mark safety_checker as optional here to get around some test failures, but it is not really optional
-    _optional_components = ["safety_checker"]
-    _exclude_from_cpu_offload = ["safety_checker"]
-
-    def __init__(
-        self,
-        text_encoder: Qwen2_5_VLForConditionalGeneration,
-        tokenizer: AutoTokenizer,
-        transformer: CosmosTransformer3DModel,
-        vae: AutoencoderKLWan,
-        scheduler: UniPCMultistepScheduler,
-        safety_checker: CosmosSafetyChecker = None,
-    ):
-        super().__init__()
-
-        if safety_checker is None:
-            safety_checker = CosmosSafetyChecker()
-
-        self.register_modules(
-            vae=vae,
-            text_encoder=text_encoder,
-            tokenizer=tokenizer,
-            transformer=transformer,
-            scheduler=scheduler,
-            safety_checker=safety_checker,
-        )
-
-        self.vae_scale_factor_temporal = 2 ** sum(self.vae.temperal_downsample) if getattr(self, "vae", None) else 4
-        self.vae_scale_factor_spatial = 2 ** len(self.vae.temperal_downsample) if getattr(self, "vae", None) else 8
-        self.video_processor = VideoProcessor(vae_scale_factor=self.vae_scale_factor_spatial)
-
-        latents_mean = (
-            torch.tensor(self.vae.config.latents_mean).view(1, self.vae.config.z_dim, 1, 1, 1).float()
-            if getattr(self.vae.config, "latents_mean", None) is not None
-            else None
-        )
-        latents_std = (
-            torch.tensor(self.vae.config.latents_std).view(1, self.vae.config.z_dim, 1, 1, 1).float()
-            if getattr(self.vae.config, "latents_std", None) is not None
-            else None
-        )
-        self.latents_mean = latents_mean
-        self.latents_std = latents_std
-
-        if self.latents_mean is None or self.latents_std is None:
-            raise ValueError("VAE configuration must define both `latents_mean` and `latents_std`.")
-
-    def _get_prompt_embeds(
-        self,
-        prompt: Union[str, List[str]] = None,
-        max_sequence_length: int = 512,
-        device: Optional[torch.device] = None,
-        dtype: Optional[torch.dtype] = None,
-    ):
-        device = device or self._execution_device
-        dtype = dtype or self.text_encoder.dtype
-        prompt = [prompt] if isinstance(prompt, str) else prompt
-
-        input_ids_batch = []
-
-        for sample_idx in range(len(prompt)):
-            conversations = [
-                {
-                    "role": "system",
-                    "content": [
-                        {
-                            "type": "text",
-                            "text": "You are a helpful assistant who will provide prompts to an image generator.",
-                        }
-                    ],
-                },
-                {
-                    "role": "user",
-                    "content": [
-                        {
-                            "type": "text",
-                            "text": prompt[sample_idx],
-                        }
-                    ],
-                },
-            ]
-            input_ids = self.tokenizer.apply_chat_template(
-                conversations,
-                tokenize=True,
-                add_generation_prompt=False,
-                add_vision_id=False,
-                max_length=max_sequence_length,
-                truncation=True,
-                padding="max_length",
-            )
-            input_ids = torch.LongTensor(input_ids)
-            input_ids_batch.append(input_ids)
-
-        input_ids_batch = torch.stack(input_ids_batch, dim=0)
-
-        outputs = self.text_encoder(
-            input_ids_batch.to(device),
-            output_hidden_states=True,
-        )
-        hidden_states = outputs.hidden_states
-
-        normalized_hidden_states = []
-        for layer_idx in range(1, len(hidden_states)):
-            normalized_state = (hidden_states[layer_idx] - hidden_states[layer_idx].mean(dim=-1, keepdim=True)) / (
-                hidden_states[layer_idx].std(dim=-1, keepdim=True) + 1e-8
-            )
-            normalized_hidden_states.append(normalized_state)
-
-        prompt_embeds = torch.cat(normalized_hidden_states, dim=-1)
-        prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
-
-        return prompt_embeds
-
-    # Modified from diffusers.pipelines.cosmos.pipeline_cosmos_text2world.CosmosTextToWorldPipeline.encode_prompt
-    def encode_prompt(
-        self,
-        prompt: Union[str, List[str]],
-        negative_prompt: Optional[Union[str, List[str]]] = None,
-        do_classifier_free_guidance: bool = True,
-        num_videos_per_prompt: int = 1,
-        prompt_embeds: Optional[torch.Tensor] = None,
-        negative_prompt_embeds: Optional[torch.Tensor] = None,
-        max_sequence_length: int = 512,
-        device: Optional[torch.device] = None,
-        dtype: Optional[torch.dtype] = None,
-    ):
-        r"""
-        Encodes the prompt into text encoder hidden states.
-
-        Args:
-            prompt (`str` or `List[str]`, *optional*):
-                prompt to be encoded
-            negative_prompt (`str` or `List[str]`, *optional*):
-                The prompt or prompts not to guide the image generation. If not defined, one has to pass
-                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
-                less than `1`).
-            do_classifier_free_guidance (`bool`, *optional*, defaults to `True`):
-                Whether to use classifier free guidance or not.
-            num_videos_per_prompt (`int`, *optional*, defaults to 1):
-                Number of videos that should be generated per prompt. torch device to place the resulting embeddings on
-            prompt_embeds (`torch.Tensor`, *optional*):
-                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
-                provided, text embeddings will be generated from `prompt` input argument.
-            negative_prompt_embeds (`torch.Tensor`, *optional*):
-                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
-                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
-                argument.
-            device: (`torch.device`, *optional*):
-                torch device
-            dtype: (`torch.dtype`, *optional*):
-                torch dtype
-        """
-        device = device or self._execution_device
-
-        prompt = [prompt] if isinstance(prompt, str) else prompt
-        if prompt is not None:
-            batch_size = len(prompt)
-        else:
-            batch_size = prompt_embeds.shape[0]
-
-        if prompt_embeds is None:
-            prompt_embeds = self._get_prompt_embeds(
-                prompt=prompt, max_sequence_length=max_sequence_length, device=device, dtype=dtype
-            )
-
-            # duplicate text embeddings for each generation per prompt, using mps friendly method
-            _, seq_len, _ = prompt_embeds.shape
-            prompt_embeds = prompt_embeds.repeat(1, num_videos_per_prompt, 1)
-            prompt_embeds = prompt_embeds.view(batch_size * num_videos_per_prompt, seq_len, -1)
-
-        if do_classifier_free_guidance and negative_prompt_embeds is None:
-            negative_prompt = negative_prompt or ""
-            negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt
-
-            if prompt is not None and type(prompt) is not type(negative_prompt):
-                raise TypeError(
-                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
-                    f" {type(prompt)}."
-                )
-            elif batch_size != len(negative_prompt):
-                raise ValueError(
-                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
-                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
-                    " the batch size of `prompt`."
-                )
-
-            negative_prompt_embeds = self._get_prompt_embeds(
-                prompt=negative_prompt, max_sequence_length=max_sequence_length, device=device, dtype=dtype
-            )
-
-            # duplicate text embeddings for each generation per prompt, using mps friendly method
-            _, seq_len, _ = negative_prompt_embeds.shape
-            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_videos_per_prompt, 1)
-            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_videos_per_prompt, seq_len, -1)
-
-        return prompt_embeds, negative_prompt_embeds
-
-    # Modified from diffusers.pipelines.cosmos.pipeline_cosmos2_video2world.Cosmos2VideoToWorldPipeline.prepare_latents and
-    # diffusers.pipelines.cosmos.pipeline_cosmos2_video2world.Cosmos2TextToImagePipeline.prepare_latents
-    def prepare_latents(
-        self,
-        video: Optional[torch.Tensor],
-        batch_size: int,
-        num_channels_latents: int = 16,
-        height: int = 704,
-        width: int = 1280,
-        num_frames_in: int = 93,
-        num_frames_out: int = 93,
-        do_classifier_free_guidance: bool = True,
-        dtype: Optional[torch.dtype] = None,
-        device: Optional[torch.device] = None,
-        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
-        latents: Optional[torch.Tensor] = None,
-    ) -> torch.Tensor:
-        if isinstance(generator, list) and len(generator) != batch_size:
-            raise ValueError(
-                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
-                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
-            )
-
-        B = batch_size
-        C = num_channels_latents
-        T = (num_frames_out - 1) // self.vae_scale_factor_temporal + 1
-        H = height // self.vae_scale_factor_spatial
-        W = width // self.vae_scale_factor_spatial
-        shape = (B, C, T, H, W)
-
-        if num_frames_in == 0:
-            if latents is None:
-                latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
-
-            cond_mask = torch.zeros((B, 1, T, H, W), dtype=latents.dtype, device=latents.device)
-            cond_indicator = torch.zeros((B, 1, T, 1, 1), dtype=latents.dtype, device=latents.device)
-
-            cond_latents = torch.zeros_like(latents)
-
-            return (
-                latents,
-                cond_latents,
-                cond_mask,
-                cond_indicator,
-            )
-        else:
-            if video is None:
-                raise ValueError("`video` must be provided when `num_frames_in` is greater than 0.")
-            needs_preprocessing = not (isinstance(video, torch.Tensor) and video.ndim == 5 and video.shape[1] == 3)
-            if needs_preprocessing:
-                video = self.video_processor.preprocess_video(video, height, width)
-            video = video.to(device=device, dtype=self.vae.dtype)
-            if isinstance(generator, list):
-                cond_latents = [
-                    retrieve_latents(self.vae.encode(video[i].unsqueeze(0)), generator=generator[i])
-                    for i in range(batch_size)
-                ]
-            else:
-                cond_latents = [retrieve_latents(self.vae.encode(vid.unsqueeze(0)), generator) for vid in video]
-
-            cond_latents = torch.cat(cond_latents, dim=0).to(dtype)
-
-            latents_mean = self.latents_mean.to(device=device, dtype=dtype)
-            latents_std = self.latents_std.to(device=device, dtype=dtype)
-            cond_latents = (cond_latents - latents_mean) / latents_std
-
-            if latents is None:
-                latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
-            else:
-                latents = latents.to(device=device, dtype=dtype)
-
-            padding_shape = (B, 1, T, H, W)
-            ones_padding = latents.new_ones(padding_shape)
-            zeros_padding = latents.new_zeros(padding_shape)
-
-            num_cond_latent_frames = (num_frames_in - 1) // self.vae_scale_factor_temporal + 1
-            cond_indicator = latents.new_zeros(1, 1, latents.size(2), 1, 1)
-            cond_indicator[:, :, 0:num_cond_latent_frames] = 1.0
-            cond_mask = cond_indicator * ones_padding + (1 - cond_indicator) * zeros_padding
-
-            return (
-                latents,
-                cond_latents,
-                cond_mask,
-                cond_indicator,
-            )
-
-    # Copied from diffusers.pipelines.cosmos.pipeline_cosmos_text2world.CosmosTextToWorldPipeline.check_inputs
-    def check_inputs(
-        self,
-        prompt,
-        height,
-        width,
-        prompt_embeds=None,
-        callback_on_step_end_tensor_inputs=None,
-    ):
-        if height % 16 != 0 or width % 16 != 0:
-            raise ValueError(f"`height` and `width` have to be divisible by 16 but are {height} and {width}.")
-
-        if callback_on_step_end_tensor_inputs is not None and not all(
-            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
-        ):
-            raise ValueError(
-                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
-            )
-
-        if prompt is not None and prompt_embeds is not None:
-            raise ValueError(
-                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
-                " only forward one of the two."
-            )
-        elif prompt is None and prompt_embeds is None:
-            raise ValueError(
-                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
-            )
-        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
-            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
-
-    @property
-    def guidance_scale(self):
-        return self._guidance_scale
-
-    @property
-    def do_classifier_free_guidance(self):
-        return self._guidance_scale > 1.0
-
-    @property
-    def num_timesteps(self):
-        return self._num_timesteps
-
-    @property
-    def current_timestep(self):
-        return self._current_timestep
-
-    @property
-    def interrupt(self):
-        return self._interrupt
-
-    @torch.no_grad()
-    @replace_example_docstring(EXAMPLE_DOC_STRING)
-    def __call__(
-        self,
-        image: PipelineImageInput | None = None,
-        video: List[PipelineImageInput] | None = None,
-        prompt: Union[str, List[str]] | None = None,
-        negative_prompt: Optional[Union[str, List[str]]] = None,
-        height: int = 704,
-        width: int = 1280,
-        num_frames: int = 93,
-        num_inference_steps: int = 36,
-        guidance_scale: float = 7.0,
-        num_videos_per_prompt: Optional[int] = 1,
-        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
-        latents: Optional[torch.Tensor] = None,
-        prompt_embeds: Optional[torch.Tensor] = None,
-        negative_prompt_embeds: Optional[torch.Tensor] = None,
-        output_type: Optional[str] = "pil",
-        return_dict: bool = True,
-        callback_on_step_end: Optional[
-            Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
-        ] = None,
-        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
-        max_sequence_length: int = 512,
-        conditional_frame_timestep: float = 0.1,
-    ):
-        r"""
-        The call function to the pipeline for generation. Supports three modes:
-
-        - **Text2World**: `image=None`, `video=None`, `prompt` provided. Generates a world clip.
-        - **Image2World**: `image` provided, `video=None`, `prompt` provided. Conditions on a single frame.
-        - **Video2World**: `video` provided, `image=None`, `prompt` provided. Conditions on an input clip.
-
-        Set `num_frames=93` (default) to produce a world video, or `num_frames=1` to produce a single image frame (the
-        above in "*2Image mode").
-
-        Outputs follow `output_type` (e.g., `"pil"` returns a list of `num_frames` PIL images per prompt).
-
-        Args:
-            image (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, *optional*):
-                Optional single image for Image2World conditioning. Must be `None` when `video` is provided.
-            video (`List[PIL.Image.Image]`, `np.ndarray`, `torch.Tensor`, *optional*):
-                Optional input video for Video2World conditioning. Must be `None` when `image` is provided.
-            prompt (`str` or `List[str]`, *optional*):
-                The prompt or prompts to guide generation. Required unless `prompt_embeds` is supplied.
-            height (`int`, defaults to `704`):
-                The height in pixels of the generated image.
-            width (`int`, defaults to `1280`):
-                The width in pixels of the generated image.
-            num_frames (`int`, defaults to `93`):
-                Number of output frames. Use `93` for world (video) generation; set to `1` to return a single frame.
-            num_inference_steps (`int`, defaults to `35`):
-                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
-                expense of slower inference.
-            guidance_scale (`float`, defaults to `7.0`):
-                Guidance scale as defined in [Classifier-Free Diffusion
-                Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
-                of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
-                `guidance_scale > 1`.
-            num_videos_per_prompt (`int`, *optional*, defaults to 1):
-                The number of images to generate per prompt.
-            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
-                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
-                generation deterministic.
-            latents (`torch.Tensor`, *optional*):
-                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
-                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
-                tensor is generated by sampling using the supplied random `generator`.
-            prompt_embeds (`torch.Tensor`, *optional*):
-                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
-                provided, text embeddings will be generated from `prompt` input argument.
-            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
-                Pre-generated negative text embeddings. For PixArt-Sigma this negative prompt should be "". If not
-                provided, negative_prompt_embeds will be generated from `negative_prompt` input argument.
-            output_type (`str`, *optional*, defaults to `"pil"`):
-                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
-            return_dict (`bool`, *optional*, defaults to `True`):
-                Whether or not to return a [`CosmosPipelineOutput`] instead of a plain tuple.
-            callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*):
-                A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of
-                each denoising step during the inference. with the following arguments: `callback_on_step_end(self:
-                DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a
-                list of all tensors as specified by `callback_on_step_end_tensor_inputs`.
-            callback_on_step_end_tensor_inputs (`List`, *optional*):
-                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
-                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
-                `._callback_tensor_inputs` attribute of your pipeline class.
-            max_sequence_length (`int`, defaults to `512`):
-                The maximum number of tokens in the prompt. If the prompt exceeds this length, it will be truncated. If
-                the prompt is shorter than this length, it will be padded.
-
-        Examples:
-
-        Returns:
-            [`~CosmosPipelineOutput`] or `tuple`:
-                If `return_dict` is `True`, [`CosmosPipelineOutput`] is returned, otherwise a `tuple` is returned where
-                the first element is a list with the generated images and the second element is a list of `bool`s
-                indicating whether the corresponding generated image contains "not-safe-for-work" (nsfw) content.
-        """
-        if self.safety_checker is None:
-            raise ValueError(
-                f"You have disabled the safety checker for {self.__class__}. This is in violation of the "
-                "[NVIDIA Open Model License Agreement](https://www.nvidia.com/en-us/agreements/enterprise-software/nvidia-open-model-license). "
-                f"Please ensure that you are compliant with the license agreement."
-            )
-
-        if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
-            callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
-
-        # Check inputs. Raise error if not correct
-        self.check_inputs(prompt, height, width, prompt_embeds, callback_on_step_end_tensor_inputs)
-
-        self._guidance_scale = guidance_scale
-        self._current_timestep = None
-        self._interrupt = False
-
-        device = self._execution_device
-
-        if self.safety_checker is not None:
-            self.safety_checker.to(device)
-            if prompt is not None:
-                prompt_list = [prompt] if isinstance(prompt, str) else prompt
-                for p in prompt_list:
-                    if not self.safety_checker.check_text_safety(p):
-                        raise ValueError(
-                            f"Cosmos Guardrail detected unsafe text in the prompt: {p}. Please ensure that the "
-                            f"prompt abides by the NVIDIA Open Model License Agreement."
-                        )
-
-        # Define call parameters
-        if prompt is not None and isinstance(prompt, str):
-            batch_size = 1
-        elif prompt is not None and isinstance(prompt, list):
-            batch_size = len(prompt)
-        else:
-            batch_size = prompt_embeds.shape[0]
-
-        # Encode input prompt
-        (
-            prompt_embeds,
-            negative_prompt_embeds,
-        ) = self.encode_prompt(
-            prompt=prompt,
-            negative_prompt=negative_prompt,
-            do_classifier_free_guidance=self.do_classifier_free_guidance,
-            num_videos_per_prompt=num_videos_per_prompt,
-            prompt_embeds=prompt_embeds,
-            negative_prompt_embeds=negative_prompt_embeds,
-            device=device,
-            max_sequence_length=max_sequence_length,
-        )
-
-        vae_dtype = self.vae.dtype
-        transformer_dtype = self.transformer.dtype
-
-        num_frames_in = None
-        if image is not None:
-            if batch_size != 1:
-                raise ValueError(f"batch_size must be 1 for image input (given {batch_size})")
-
-            image = torchvision.transforms.functional.to_tensor(image).unsqueeze(0)
-            video = torch.cat([image, torch.zeros_like(image).repeat(num_frames - 1, 1, 1, 1)], dim=0)
-            video = video.unsqueeze(0)
-            num_frames_in = 1
-        elif video is None:
-            video = torch.zeros(batch_size, num_frames, 3, height, width, dtype=torch.uint8)
-            num_frames_in = 0
-        else:
-            num_frames_in = len(video)
-
-            if batch_size != 1:
-                raise ValueError(f"batch_size must be 1 for video input (given {batch_size})")
-
-        assert video is not None
-        video = self.video_processor.preprocess_video(video, height, width)
-
-        # pad with last frame (for video2world)
-        num_frames_out = num_frames
-        if video.shape[2] < num_frames_out:
-            n_pad_frames = num_frames_out - num_frames_in
-            last_frame = video[0, :, -1:, :, :]  # [C, T==1, H, W]
-            pad_frames = last_frame.repeat(1, 1, n_pad_frames, 1, 1)  # [B, C, T, H, W]
-            video = torch.cat((video, pad_frames), dim=2)
-
-        assert num_frames_in <= num_frames_out, f"expected ({num_frames_in=}) <= ({num_frames_out=})"
-
-        video = video.to(device=device, dtype=vae_dtype)
-
-        num_channels_latents = self.transformer.config.in_channels - 1
-        latents, cond_latent, cond_mask, cond_indicator = self.prepare_latents(
-            video=video,
-            batch_size=batch_size * num_videos_per_prompt,
-            num_channels_latents=num_channels_latents,
-            height=height,
-            width=width,
-            num_frames_in=num_frames_in,
-            num_frames_out=num_frames,
-            do_classifier_free_guidance=self.do_classifier_free_guidance,
-            dtype=torch.float32,
-            device=device,
-            generator=generator,
-            latents=latents,
-        )
-        cond_timestep = torch.ones_like(cond_indicator) * conditional_frame_timestep
-        cond_mask = cond_mask.to(transformer_dtype)
-
-        padding_mask = latents.new_zeros(1, 1, height, width, dtype=transformer_dtype)
-
-        # Denoising loop
-        self.scheduler.set_timesteps(num_inference_steps, device=device)
-        timesteps = self.scheduler.timesteps
-        self._num_timesteps = len(timesteps)
-        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
-
-        gt_velocity = (latents - cond_latent) * cond_mask
-        with self.progress_bar(total=num_inference_steps) as progress_bar:
-            for i, t in enumerate(timesteps):
-                if self.interrupt:
-                    continue
-
-                self._current_timestep = t.cpu().item()
-
-                # NOTE: assumes sigma(t) \in [0, 1]
-                sigma_t = (
-                    torch.tensor(self.scheduler.sigmas[i].item())
-                    .unsqueeze(0)
-                    .to(device=device, dtype=transformer_dtype)
-                )
-
-                in_latents = cond_mask * cond_latent + (1 - cond_mask) * latents
-                in_latents = in_latents.to(transformer_dtype)
-                in_timestep = cond_indicator * cond_timestep + (1 - cond_indicator) * sigma_t
-                noise_pred = self.transformer(
-                    hidden_states=in_latents,
-                    condition_mask=cond_mask,
-                    timestep=in_timestep,
-                    encoder_hidden_states=prompt_embeds,
-                    padding_mask=padding_mask,
-                    return_dict=False,
-                )[0]
-                # NOTE: replace velocity (noise_pred) with gt_velocity for conditioning inputs only
-                noise_pred = gt_velocity + noise_pred * (1 - cond_mask)
-
-                if self.do_classifier_free_guidance:
-                    noise_pred_neg = self.transformer(
-                        hidden_states=in_latents,
-                        condition_mask=cond_mask,
-                        timestep=in_timestep,
-                        encoder_hidden_states=negative_prompt_embeds,
-                        padding_mask=padding_mask,
-                        return_dict=False,
-                    )[0]
-                    # NOTE: replace velocity (noise_pred_neg) with gt_velocity for conditioning inputs only
-                    noise_pred_neg = gt_velocity + noise_pred_neg * (1 - cond_mask)
-                    noise_pred = noise_pred + self.guidance_scale * (noise_pred - noise_pred_neg)
-
-                latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]
-
-                if callback_on_step_end is not None:
-                    callback_kwargs = {}
-                    for k in callback_on_step_end_tensor_inputs:
-                        callback_kwargs[k] = locals()[k]
-                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
-
-                    latents = callback_outputs.pop("latents", latents)
-                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
-                    negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
-
-                # call the callback, if provided
-                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
-                    progress_bar.update()
-
-                if XLA_AVAILABLE:
-                    xm.mark_step()
-
-        self._current_timestep = None
-
-        if not output_type == "latent":
-            latents_mean = self.latents_mean.to(latents.device, latents.dtype)
-            latents_std = self.latents_std.to(latents.device, latents.dtype)
-            latents = latents * latents_std + latents_mean
-            video = self.vae.decode(latents.to(self.vae.dtype), return_dict=False)[0]
-            video = self._match_num_frames(video, num_frames)
-
-            assert self.safety_checker is not None
-            self.safety_checker.to(device)
-            video = self.video_processor.postprocess_video(video, output_type="np")
-            video = (video * 255).astype(np.uint8)
-            video_batch = []
-            for vid in video:
-                vid = self.safety_checker.check_video_safety(vid)
-                video_batch.append(vid)
-            video = np.stack(video_batch).astype(np.float32) / 255.0 * 2 - 1
-            video = torch.from_numpy(video).permute(0, 4, 1, 2, 3)
-            video = self.video_processor.postprocess_video(video, output_type=output_type)
-        else:
-            video = latents
-
-        # Offload all models
-        self.maybe_free_model_hooks()
-
-        if not return_dict:
-            return (video,)
-
-        return CosmosPipelineOutput(frames=video)
-
-    def _match_num_frames(self, video: torch.Tensor, target_num_frames: int) -> torch.Tensor:
-        if target_num_frames <= 0 or video.shape[2] == target_num_frames:
-            return video
-
-        frames_per_latent = max(self.vae_scale_factor_temporal, 1)
-        video = torch.repeat_interleave(video, repeats=frames_per_latent, dim=2)
-
-        current_frames = video.shape[2]
-        if current_frames < target_num_frames:
-            pad = video[:, :, -1:, :, :].repeat(1, 1, target_num_frames - current_frames, 1, 1)
-            video = torch.cat([video, pad], dim=2)
-        elif current_frames > target_num_frames:
-            video = video[:, :, :target_num_frames]
-
-        return video
--- a/src/diffusers/pipelines/longcat_image/init.py
+++ b/src/diffusers/pipelines/longcat_image/init.py
@@ -1,51 +0,0 @@
-from typing import TYPE_CHECKING
-
-from ...utils import (
-    DIFFUSERS_SLOW_IMPORT,
-    OptionalDependencyNotAvailable,
-    _LazyModule,
-    get_objects_from_module,
-    is_torch_available,
-    is_transformers_available,
-)
-
-
-_dummy_objects = {}
-_import_structure = {}
-
-try:
-    if not (is_transformers_available() and is_torch_available()):
-        raise OptionalDependencyNotAvailable()
-except OptionalDependencyNotAvailable:
-    from ...utils import dummy_torch_and_transformers_objects  # noqa: F403
-
-    _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
-else:
-    _import_structure["pipeline_longcat_image"] = ["LongCatImagePipeline"]
-    _import_structure["pipeline_longcat_image_edit"] = ["LongCatImageEditPipeline"]
-    _import_structure["pipeline_output"] = ["LongCatImagePipelineOutput"]
-
-if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
-    try:
-        if not (is_transformers_available() and is_torch_available()):
-            raise OptionalDependencyNotAvailable()
-
-    except OptionalDependencyNotAvailable:
-        from ...utils.dummy_torch_and_transformers_objects import *
-    else:
-        from .pipeline_longcat_image import LongCatImagePipeline
-        from .pipeline_longcat_image_edit import LongCatImageEditPipeline
-        from .pipeline_output import LongCatImagePipelineOutput
-
-else:
-    import sys
-
-    sys.modules[__name__] = _LazyModule(
-        __name__,
-        globals()["__file__"],
-        _import_structure,
-        module_spec=__spec__,
-    )
-
-    for name, value in _dummy_objects.items():
-        setattr(sys.modules[__name__], name, value)
--- a/src/diffusers/pipelines/longcat_image/pipeline_longcat_image.py
+++ b/src/diffusers/pipelines/longcat_image/pipeline_longcat_image.py
@@ -1,666 +0,0 @@
-# Copyright 2025 MeiTuan LongCat-Image Team and The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import inspect
-import re
-from typing import Any, Dict, List, Optional, Union
-
-import numpy as np
-import torch
-from transformers import Qwen2_5_VLForConditionalGeneration, Qwen2Tokenizer, Qwen2VLProcessor
-
-from ...image_processor import VaeImageProcessor
-from ...loaders import FromSingleFileMixin
-from ...models.autoencoders import AutoencoderKL
-from ...models.transformers import LongCatImageTransformer2DModel
-from ...pipelines.pipeline_utils import DiffusionPipeline
-from ...schedulers import FlowMatchEulerDiscreteScheduler
-from ...utils import is_torch_xla_available, logging, replace_example_docstring
-from ...utils.torch_utils import randn_tensor
-from .pipeline_output import LongCatImagePipelineOutput
-from .system_messages import SYSTEM_PROMPT_EN, SYSTEM_PROMPT_ZH
-
-
-if is_torch_xla_available():
-    import torch_xla.core.xla_model as xm
-
-    XLA_AVAILABLE = True
-else:
-    XLA_AVAILABLE = False
-
-logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
-
-
-EXAMPLE_DOC_STRING = """
-    Examples:
-        ```py
-        >>> import torch
-        >>> from diffusers import LongCatImagePipeline
-
-        >>> pipe = LongCatImagePipeline.from_pretrained("meituan-longcat/LongCat-Image", torch_dtype=torch.bfloat16)
-        >>> pipe.to("cuda")
-
-        >>> prompt = "一个年轻的亚裔女性，身穿黄色针织衫，搭配白色项链。她的双手放在膝盖上，表情恬静。背景是一堵粗糙的砖墙，午后的阳光温暖地洒在她身上，营造出一种宁静而温馨的氛围。镜头采用中距离视角，突出她的神态和服饰的细节。光线柔和地打在她的脸上，强调她的五官和饰品的质感，增加画面的层次感与亲和力。整个画面构图简洁，砖墙的纹理与阳光的光影效果相得益彰，突显出人物的优雅与从容。"
-        >>> image = pipe(
-        ...     prompt,
-        ...     height=768,
-        ...     width=1344,
-        ...     num_inference_steps=50,
-        ...     guidance_scale=4.5,
-        ...     generator=torch.Generator("cpu").manual_seed(43),
-        ...     enable_cfg_renorm=True,
-        ... ).images[0]
-        >>> image.save("longcat_image.png")
-        ```
-"""
-
-
-def get_prompt_language(prompt):
-    pattern = re.compile(r"[\u4e00-\u9fff]")
-    if bool(pattern.search(prompt)):
-        return "zh"
-    return "en"
-
-
-def split_quotation(prompt, quote_pairs=None):
-    """
-    Implement a regex-based string splitting algorithm that identifies delimiters defined by single or double quote
-    pairs. Examples::
-        >>> prompt_en = "Please write 'Hello' on the blackboard for me." >>> print(split_quotation(prompt_en)) >>> #
-        output: [('Please write ', False), ("'Hello'", True), (' on the blackboard for me.', False)]
-    """
-    word_internal_quote_pattern = re.compile(r"[a-zA-Z]+'[a-zA-Z]+")
-    matches_word_internal_quote_pattern = word_internal_quote_pattern.findall(prompt)
-    mapping_word_internal_quote = []
-
-    for i, word_src in enumerate(set(matches_word_internal_quote_pattern)):
-        word_tgt = "longcat_$##$_longcat" * (i + 1)
-        prompt = prompt.replace(word_src, word_tgt)
-        mapping_word_internal_quote.append([word_src, word_tgt])
-
-    if quote_pairs is None:
-        quote_pairs = [("'", "'"), ('"', '"'), ("‘", "’"), ("“", "”")]
-    pattern = "|".join([re.escape(q1) + r"[^" + re.escape(q1 + q2) + r"]*?" + re.escape(q2) for q1, q2 in quote_pairs])
-    parts = re.split(f"({pattern})", prompt)
-
-    result = []
-    for part in parts:
-        for word_src, word_tgt in mapping_word_internal_quote:
-            part = part.replace(word_tgt, word_src)
-        if re.match(pattern, part):
-            if len(part):
-                result.append((part, True))
-        else:
-            if len(part):
-                result.append((part, False))
-    return result
-
-
-def prepare_pos_ids(modality_id=0, type="text", start=(0, 0), num_token=None, height=None, width=None):
-    if type == "text":
-        assert num_token
-        if height or width:
-            print('Warning: The parameters of height and width will be ignored in "text" type.')
-        pos_ids = torch.zeros(num_token, 3)
-        pos_ids[..., 0] = modality_id
-        pos_ids[..., 1] = torch.arange(num_token) + start[0]
-        pos_ids[..., 2] = torch.arange(num_token) + start[1]
-    elif type == "image":
-        assert height and width
-        if num_token:
-            print('Warning: The parameter of num_token will be ignored in "image" type.')
-        pos_ids = torch.zeros(height, width, 3)
-        pos_ids[..., 0] = modality_id
-        pos_ids[..., 1] = pos_ids[..., 1] + torch.arange(height)[:, None] + start[0]
-        pos_ids[..., 2] = pos_ids[..., 2] + torch.arange(width)[None, :] + start[1]
-        pos_ids = pos_ids.reshape(height * width, 3)
-    else:
-        raise KeyError(f'Unknow type {type}, only support "text" or "image".')
-    return pos_ids
-
-
-def calculate_shift(
-    image_seq_len,
-    base_seq_len: int = 256,
-    max_seq_len: int = 4096,
-    base_shift: float = 0.5,
-    max_shift: float = 1.15,
-):
-    m = (max_shift - base_shift) / (max_seq_len - base_seq_len)
-    b = base_shift - m * base_seq_len
-    mu = image_seq_len * m + b
-    return mu
-
-
-# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
-def retrieve_timesteps(
-    scheduler,
-    num_inference_steps: Optional[int] = None,
-    device: Optional[Union[str, torch.device]] = None,
-    timesteps: Optional[List[int]] = None,
-    sigmas: Optional[List[float]] = None,
-    **kwargs,
-):
-    r"""
-    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
-    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
-
-    Args:
-        scheduler (`SchedulerMixin`):
-            The scheduler to get timesteps from.
-        num_inference_steps (`int`):
-            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
-            must be `None`.
-        device (`str` or `torch.device`, *optional*):
-            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
-        timesteps (`List[int]`, *optional*):
-            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
-            `num_inference_steps` and `sigmas` must be `None`.
-        sigmas (`List[float]`, *optional*):
-            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
-            `num_inference_steps` and `timesteps` must be `None`.
-
-    Returns:
-        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
-        second element is the number of inference steps.
-    """
-    if timesteps is not None and sigmas is not None:
-        raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
-    if timesteps is not None:
-        accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
-        if not accepts_timesteps:
-            raise ValueError(
-                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
-                f" timestep schedules. Please check whether you are using the correct scheduler."
-            )
-        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
-        timesteps = scheduler.timesteps
-        num_inference_steps = len(timesteps)
-    elif sigmas is not None:
-        accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
-        if not accept_sigmas:
-            raise ValueError(
-                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
-                f" sigmas schedules. Please check whether you are using the correct scheduler."
-            )
-        scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
-        timesteps = scheduler.timesteps
-        num_inference_steps = len(timesteps)
-    else:
-        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
-        timesteps = scheduler.timesteps
-    return timesteps, num_inference_steps
-
-
-class LongCatImagePipeline(DiffusionPipeline, FromSingleFileMixin):
-    r"""
-    The pipeline for text-to-image generation.
-    """
-
-    model_cpu_offload_seq = "text_encoder->transformer->vae"
-    _optional_components = []
-    _callback_tensor_inputs = ["latents", "prompt_embeds"]
-
-    def __init__(
-        self,
-        scheduler: FlowMatchEulerDiscreteScheduler,
-        vae: AutoencoderKL,
-        text_encoder: Qwen2_5_VLForConditionalGeneration,
-        tokenizer: Qwen2Tokenizer,
-        text_processor: Qwen2VLProcessor,
-        transformer: LongCatImageTransformer2DModel,
-    ):
-        super().__init__()
-
-        self.register_modules(
-            vae=vae,
-            text_encoder=text_encoder,
-            tokenizer=tokenizer,
-            transformer=transformer,
-            scheduler=scheduler,
-            text_processor=text_processor,
-        )
-
-        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) if getattr(self, "vae", None) else 8
-        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor * 2)
-
-        self.prompt_template_encode_prefix = "<|im_start|>system\nAs an image captioning expert, generate a descriptive text prompt based on an image content, suitable for input to a text-to-image model.<|im_end|>\n<|im_start|>user\n"
-        self.prompt_template_encode_suffix = "<|im_end|>\n<|im_start|>assistant\n"
-        self.default_sample_size = 128
-        self.tokenizer_max_length = 512
-
-    def rewire_prompt(self, prompt, device):
-        prompt = [prompt] if isinstance(prompt, str) else prompt
-        all_text = []
-        for each_prompt in prompt:
-            language = get_prompt_language(each_prompt)
-            if language == "zh":
-                question = SYSTEM_PROMPT_ZH + f"\n用户输入为：{each_prompt}\n改写后的prompt为："
-            else:
-                question = SYSTEM_PROMPT_EN + f"\nUser Input: {each_prompt}\nRewritten prompt:"
-            message = [
-                {
-                    "role": "user",
-                    "content": [
-                        {"type": "text", "text": question},
-                    ],
-                }
-            ]
-            # Preparation for inference
-            text = self.text_processor.apply_chat_template(message, tokenize=False, add_generation_prompt=True)
-            all_text.append(text)
-
-        inputs = self.text_processor(text=all_text, padding=True, return_tensors="pt").to(device)
-
-        self.text_encoder.to(device)
-        generated_ids = self.text_encoder.generate(**inputs, max_new_tokens=self.tokenizer_max_length)
-        generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
-        output_text = self.text_processor.batch_decode(
-            generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
-        )
-        rewrite_prompt = output_text
-        return rewrite_prompt
-
-    def _encode_prompt(self, prompt: List[str]):
-        batch_all_tokens = []
-
-        for each_prompt in prompt:
-            all_tokens = []
-            for clean_prompt_sub, matched in split_quotation(each_prompt):
-                if matched:
-                    for sub_word in clean_prompt_sub:
-                        tokens = self.tokenizer(sub_word, add_special_tokens=False)["input_ids"]
-                        all_tokens.extend(tokens)
-                else:
-                    tokens = self.tokenizer(clean_prompt_sub, add_special_tokens=False)["input_ids"]
-                    all_tokens.extend(tokens)
-
-            if len(all_tokens) > self.tokenizer_max_length:
-                logger.warning(
-                    "Your input was truncated because `max_sequence_length` is set to "
-                    f" {self.tokenizer_max_length} input token nums : {len(all_tokens)}"
-                )
-                all_tokens = all_tokens[: self.tokenizer_max_length]
-            batch_all_tokens.append(all_tokens)
-
-        text_tokens_and_mask = self.tokenizer.pad(
-            {"input_ids": batch_all_tokens},
-            max_length=self.tokenizer_max_length,
-            padding="max_length",
-            return_attention_mask=True,
-            return_tensors="pt",
-        )
-
-        prefix_tokens = self.tokenizer(self.prompt_template_encode_prefix, add_special_tokens=False)["input_ids"]
-        suffix_tokens = self.tokenizer(self.prompt_template_encode_suffix, add_special_tokens=False)["input_ids"]
-        prefix_len = len(prefix_tokens)
-        suffix_len = len(suffix_tokens)
-
-        prefix_tokens_mask = torch.tensor([1] * len(prefix_tokens), dtype=text_tokens_and_mask.attention_mask[0].dtype)
-        suffix_tokens_mask = torch.tensor([1] * len(suffix_tokens), dtype=text_tokens_and_mask.attention_mask[0].dtype)
-
-        prefix_tokens = torch.tensor(prefix_tokens, dtype=text_tokens_and_mask.input_ids.dtype)
-        suffix_tokens = torch.tensor(suffix_tokens, dtype=text_tokens_and_mask.input_ids.dtype)
-
-        batch_size = text_tokens_and_mask.input_ids.size(0)
-
-        prefix_tokens_batch = prefix_tokens.unsqueeze(0).expand(batch_size, -1)
-        suffix_tokens_batch = suffix_tokens.unsqueeze(0).expand(batch_size, -1)
-        prefix_mask_batch = prefix_tokens_mask.unsqueeze(0).expand(batch_size, -1)
-        suffix_mask_batch = suffix_tokens_mask.unsqueeze(0).expand(batch_size, -1)
-
-        input_ids = torch.cat((prefix_tokens_batch, text_tokens_and_mask.input_ids, suffix_tokens_batch), dim=-1)
-        attention_mask = torch.cat((prefix_mask_batch, text_tokens_and_mask.attention_mask, suffix_mask_batch), dim=-1)
-
-        input_ids = input_ids.to(self.device)
-        attention_mask = attention_mask.to(self.device)
-
-        text_output = self.text_encoder(input_ids=input_ids, attention_mask=attention_mask, output_hidden_states=True)
-        # [max_sequence_length, batch, hidden_size] -> [batch, max_sequence_length, hidden_size]
-        # clone to have a contiguous tensor
-        prompt_embeds = text_output.hidden_states[-1].detach()
-        prompt_embeds = prompt_embeds[:, prefix_len:-suffix_len, :]
-        return prompt_embeds
-
-    def encode_prompt(
-        self,
-        prompt: Union[str, List[str]] = None,
-        num_images_per_prompt: Optional[int] = 1,
-        prompt_embeds: Optional[torch.Tensor] = None,
-    ):
-        prompt = [prompt] if isinstance(prompt, str) else prompt
-        batch_size = len(prompt)
-        # If prompt_embeds is provided and prompt is None, skip encoding
-        if prompt_embeds is None:
-            prompt_embeds = self._encode_prompt(prompt)
-
-        _, seq_len, _ = prompt_embeds.shape
-        # duplicate text embeddings and attention mask for each generation per prompt, using mps friendly method
-        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
-        prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
-
-        text_ids = prepare_pos_ids(modality_id=0, type="text", start=(0, 0), num_token=prompt_embeds.shape[1]).to(
-            self.device
-        )
-        return prompt_embeds.to(self.device), text_ids
-
-    @staticmethod
-    def _pack_latents(latents, batch_size, num_channels_latents, height, width):
-        latents = latents.view(batch_size, num_channels_latents, height // 2, 2, width // 2, 2)
-        latents = latents.permute(0, 2, 4, 1, 3, 5)
-        latents = latents.reshape(batch_size, (height // 2) * (width // 2), num_channels_latents * 4)
-
-        return latents
-
-    @staticmethod
-    def _unpack_latents(latents, height, width, vae_scale_factor):
-        batch_size, num_patches, channels = latents.shape
-
-        # VAE applies 8x compression on images but we must also account for packing which requires
-        # latent height and width to be divisible by 2.
-        height = 2 * (int(height) // (vae_scale_factor * 2))
-        width = 2 * (int(width) // (vae_scale_factor * 2))
-
-        latents = latents.view(batch_size, height // 2, width // 2, channels // 4, 2, 2)
-        latents = latents.permute(0, 3, 1, 4, 2, 5)
-
-        latents = latents.reshape(batch_size, channels // (2 * 2), height, width)
-
-        return latents
-
-    @property
-    def do_classifier_free_guidance(self):
-        return self._guidance_scale > 1
-
-    def prepare_latents(
-        self,
-        batch_size,
-        num_channels_latents,
-        height,
-        width,
-        dtype,
-        device,
-        generator,
-        latents=None,
-    ):
-        # VAE applies 8x compression on images but we must also account for packing which requires
-        # latent height and width to be divisible by 2.
-        height = 2 * (int(height) // (self.vae_scale_factor * 2))
-        width = 2 * (int(width) // (self.vae_scale_factor * 2))
-
-        shape = (batch_size, num_channels_latents, height, width)
-        latent_image_ids = prepare_pos_ids(
-            modality_id=1,
-            type="image",
-            start=(self.tokenizer_max_length, self.tokenizer_max_length),
-            height=height // 2,
-            width=width // 2,
-        ).to(device)
-
-        if latents is not None:
-            return latents.to(device=device, dtype=dtype), latent_image_ids
-
-        if isinstance(generator, list) and len(generator) != batch_size:
-            raise ValueError(
-                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
-                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
-            )
-
-        latents = randn_tensor(shape, generator=generator, device=device)
-        latents = latents.to(dtype=dtype)
-        latents = self._pack_latents(latents, batch_size, num_channels_latents, height, width)
-
-        return latents, latent_image_ids
-
-    @property
-    def guidance_scale(self):
-        return self._guidance_scale
-
-    @property
-    def joint_attention_kwargs(self):
-        return self._joint_attention_kwargs
-
-    @property
-    def num_timesteps(self):
-        return self._num_timesteps
-
-    @property
-    def current_timestep(self):
-        return self._current_timestep
-
-    @property
-    def interrupt(self):
-        return self._interrupt
-
-    def check_inputs(
-        self, prompt, height, width, negative_prompt=None, prompt_embeds=None, negative_prompt_embeds=None
-    ):
-        if height % (self.vae_scale_factor * 2) != 0 or width % (self.vae_scale_factor * 2) != 0:
-            logger.warning(
-                f"`height` and `width` have to be divisible by {self.vae_scale_factor * 2} but are {height} and {width}. Dimensions will be resized accordingly"
-            )
-
-        if prompt is not None and prompt_embeds is not None:
-            raise ValueError(
-                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
-                " only forward one of the two."
-            )
-        elif prompt is None and prompt_embeds is None:
-            raise ValueError(
-                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
-            )
-        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
-            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
-
-        if negative_prompt is not None and negative_prompt_embeds is not None:
-            raise ValueError(
-                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
-                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
-            )
-
-    @replace_example_docstring(EXAMPLE_DOC_STRING)
-    @torch.no_grad()
-    def __call__(
-        self,
-        prompt: Union[str, List[str]] = None,
-        negative_prompt: Union[str, List[str]] = None,
-        height: Optional[int] = None,
-        width: Optional[int] = None,
-        num_inference_steps: int = 50,
-        sigmas: Optional[List[float]] = None,
-        guidance_scale: float = 4.5,
-        num_images_per_prompt: Optional[int] = 1,
-        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
-        latents: Optional[torch.FloatTensor] = None,
-        prompt_embeds: Optional[torch.FloatTensor] = None,
-        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
-        output_type: Optional[str] = "pil",
-        return_dict: bool = True,
-        joint_attention_kwargs: Optional[Dict[str, Any]] = None,
-        enable_cfg_renorm: Optional[bool] = True,
-        cfg_renorm_min: Optional[float] = 0.0,
-        enable_prompt_rewrite: Optional[bool] = True,
-    ):
-        r"""
-        Function invoked when calling the pipeline for generation.
-
-        Args:
-            enable_cfg_renorm: Whether to enable cfg_renorm. Enabling cfg_renorm will improve image quality,
-                but it may lead to a decrease in the stability of some image outputs..
-            cfg_renorm_min: The minimum value of the cfg_renorm_scale range (0-1).
-                cfg_renorm_min = 1.0, renorm has no effect, while cfg_renorm_min=0.0, the renorm range is larger.
-            enable_prompt_rewrite: whether to enable prompt rewrite.
-        Examples:
-
-        Returns:
-            [`~pipelines.LongCatImagePipelineOutput`] or `tuple`: [`~pipelines.LongCatImagePipelineOutput`] if
-            `return_dict` is True, otherwise a `tuple`. When returning a tuple, the first element is a list with the
-            generated images.
-        """
-
-        height = height or self.default_sample_size * self.vae_scale_factor
-        width = width or self.default_sample_size * self.vae_scale_factor
-
-        # 1. Check inputs. Raise error if not correct
-        self.check_inputs(
-            prompt,
-            height,
-            width,
-            negative_prompt=negative_prompt,
-            prompt_embeds=prompt_embeds,
-            negative_prompt_embeds=negative_prompt_embeds,
-        )
-
-        self._guidance_scale = guidance_scale
-        self._joint_attention_kwargs = joint_attention_kwargs
-        self._current_timestep = None
-        self._interrupt = False
-
-        # 2. Define call parameters
-        if prompt is not None and isinstance(prompt, str):
-            batch_size = 1
-        elif prompt is not None and isinstance(prompt, list):
-            batch_size = len(prompt)
-        else:
-            batch_size = prompt_embeds.shape[0]
-
-        device = self._execution_device
-        if enable_prompt_rewrite:
-            prompt = self.rewire_prompt(prompt, device)
-            logger.info(f"Rewrite prompt {prompt}!")
-
-        negative_prompt = "" if negative_prompt is None else negative_prompt
-        (prompt_embeds, text_ids) = self.encode_prompt(
-            prompt=prompt, prompt_embeds=prompt_embeds, num_images_per_prompt=num_images_per_prompt
-        )
-        if self.do_classifier_free_guidance:
-            (negative_prompt_embeds, negative_text_ids) = self.encode_prompt(
-                prompt=negative_prompt,
-                prompt_embeds=negative_prompt_embeds,
-                num_images_per_prompt=num_images_per_prompt,
-            )
-
-        # 4. Prepare latent variables
-        num_channels_latents = 16
-        latents, latent_image_ids = self.prepare_latents(
-            batch_size * num_images_per_prompt,
-            num_channels_latents,
-            height,
-            width,
-            prompt_embeds.dtype,
-            device,
-            generator,
-            latents,
-        )
-
-        # 5. Prepare timesteps
-        sigmas = np.linspace(1.0, 1.0 / num_inference_steps, num_inference_steps) if sigmas is None else sigmas
-        image_seq_len = latents.shape[1]
-        mu = calculate_shift(
-            image_seq_len,
-            self.scheduler.config.get("base_image_seq_len", 256),
-            self.scheduler.config.get("max_image_seq_len", 4096),
-            self.scheduler.config.get("base_shift", 0.5),
-            self.scheduler.config.get("max_shift", 1.15),
-        )
-        timesteps, num_inference_steps = retrieve_timesteps(
-            self.scheduler,
-            num_inference_steps,
-            device,
-            sigmas=sigmas,
-            mu=mu,
-        )
-        num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
-        self._num_timesteps = len(timesteps)
-
-        # handle guidance
-        guidance = None
-
-        if self.joint_attention_kwargs is None:
-            self._joint_attention_kwargs = {}
-
-        # 6. Denoising loop
-        with self.progress_bar(total=num_inference_steps) as progress_bar:
-            for i, t in enumerate(timesteps):
-                if self.interrupt:
-                    continue
-
-                self._current_timestep = t
-                timestep = t.expand(latents.shape[0]).to(latents.dtype)
-                with self.transformer.cache_context("cond"):
-                    noise_pred_text = self.transformer(
-                        hidden_states=latents,
-                        timestep=timestep / 1000,
-                        guidance=guidance,
-                        encoder_hidden_states=prompt_embeds,
-                        txt_ids=text_ids,
-                        img_ids=latent_image_ids,
-                        return_dict=False,
-                    )[0]
-                if self.do_classifier_free_guidance:
-                    with self.transformer.cache_context("uncond"):
-                        noise_pred_uncond = self.transformer(
-                            hidden_states=latents,
-                            timestep=timestep / 1000,
-                            encoder_hidden_states=negative_prompt_embeds,
-                            txt_ids=negative_text_ids,
-                            img_ids=latent_image_ids,
-                            return_dict=False,
-                        )[0]
-                    noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
-
-                    if enable_cfg_renorm:
-                        cond_norm = torch.norm(noise_pred_text, dim=-1, keepdim=True)
-                        noise_norm = torch.norm(noise_pred, dim=-1, keepdim=True)
-                        scale = (cond_norm / (noise_norm + 1e-8)).clamp(min=cfg_renorm_min, max=1.0)
-                        noise_pred = noise_pred * scale
-                else:
-                    noise_pred = noise_pred_text
-
-                # compute the previous noisy sample x_t -> x_t-1
-                latents_dtype = latents.dtype
-                latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]
-
-                if latents.dtype != latents_dtype:
-                    if torch.backends.mps.is_available():
-                        # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
-                        latents = latents.to(latents_dtype)
-
-                # call the callback, if provided
-                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
-                    progress_bar.update()
-
-                if XLA_AVAILABLE:
-                    xm.mark_step()
-
-        self._current_timestep = None
-
-        if output_type == "latent":
-            image = latents
-        else:
-            latents = self._unpack_latents(latents, height, width, self.vae_scale_factor)
-            latents = (latents / self.vae.config.scaling_factor) + self.vae.config.shift_factor
-
-            if latents.dtype != self.vae.dtype:
-                latents = latents.to(dtype=self.vae.dtype)
-
-            image = self.vae.decode(latents, return_dict=False)[0]
-            image = self.image_processor.postprocess(image, output_type=output_type)
-
-        # Offload all models
-        self.maybe_free_model_hooks()
-
-        if not return_dict:
-            return (image,)
-
-        return LongCatImagePipelineOutput(images=image)
--- a/src/diffusers/pipelines/longcat_image/pipeline_longcat_image_edit.py
+++ b/src/diffusers/pipelines/longcat_image/pipeline_longcat_image_edit.py
@@ -1,727 +0,0 @@
-# Copyright 2025 MeiTuan LongCat-Image Team and The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import inspect
-import math
-import re
-from typing import Any, Dict, List, Optional, Union
-
-import numpy as np
-import PIL
-import torch
-from transformers import Qwen2_5_VLForConditionalGeneration, Qwen2Tokenizer, Qwen2VLProcessor
-
-from ...image_processor import VaeImageProcessor
-from ...loaders import FromSingleFileMixin
-from ...models.autoencoders import AutoencoderKL
-from ...models.transformers import LongCatImageTransformer2DModel
-from ...pipelines.pipeline_utils import DiffusionPipeline
-from ...schedulers import FlowMatchEulerDiscreteScheduler
-from ...utils import is_torch_xla_available, logging, replace_example_docstring
-from ...utils.torch_utils import randn_tensor
-from .pipeline_output import LongCatImagePipelineOutput
-
-
-if is_torch_xla_available():
-    import torch_xla.core.xla_model as xm
-
-    XLA_AVAILABLE = True
-else:
-    XLA_AVAILABLE = False
-
-logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
-
-
-EXAMPLE_DOC_STRING = """
-    Examples:
-        ```py
-        >>> from PIL import Image
-        >>> import torch
-        >>> from diffusers import LongCatImageEditPipeline
-
-        >>> pipe = LongCatImageEditPipeline.from_pretrained(
-        ...     "meituan-longcat/LongCat-Image-Edit", torch_dtype=torch.bfloat16
-        ... )
-        >>> pipe.to("cuda")
-
-        >>> prompt = "change the cat to dog."
-        >>> input_image = Image.open("test.jpg").convert("RGB")
-        >>> image = pipe(
-        ...     input_image,
-        ...     prompt,
-        ...     num_inference_steps=50,
-        ...     guidance_scale=4.5,
-        ...     generator=torch.Generator("cpu").manual_seed(43),
-        ... ).images[0]
-        >>> image.save("longcat_image_edit.png")
-        ```
-"""
-
-
-# Copied from diffusers.pipelines.longcat_image.pipeline_longcat_image.split_quotation
-def split_quotation(prompt, quote_pairs=None):
-    """
-    Implement a regex-based string splitting algorithm that identifies delimiters defined by single or double quote
-    pairs. Examples::
-        >>> prompt_en = "Please write 'Hello' on the blackboard for me." >>> print(split_quotation(prompt_en)) >>> #
-        output: [('Please write ', False), ("'Hello'", True), (' on the blackboard for me.', False)]
-    """
-    word_internal_quote_pattern = re.compile(r"[a-zA-Z]+'[a-zA-Z]+")
-    matches_word_internal_quote_pattern = word_internal_quote_pattern.findall(prompt)
-    mapping_word_internal_quote = []
-
-    for i, word_src in enumerate(set(matches_word_internal_quote_pattern)):
-        word_tgt = "longcat_$##$_longcat" * (i + 1)
-        prompt = prompt.replace(word_src, word_tgt)
-        mapping_word_internal_quote.append([word_src, word_tgt])
-
-    if quote_pairs is None:
-        quote_pairs = [("'", "'"), ('"', '"'), ("‘", "’"), ("“", "”")]
-    pattern = "|".join([re.escape(q1) + r"[^" + re.escape(q1 + q2) + r"]*?" + re.escape(q2) for q1, q2 in quote_pairs])
-    parts = re.split(f"({pattern})", prompt)
-
-    result = []
-    for part in parts:
-        for word_src, word_tgt in mapping_word_internal_quote:
-            part = part.replace(word_tgt, word_src)
-        if re.match(pattern, part):
-            if len(part):
-                result.append((part, True))
-        else:
-            if len(part):
-                result.append((part, False))
-    return result
-
-
-# Copied from diffusers.pipelines.longcat_image.pipeline_longcat_image.prepare_pos_ids
-def prepare_pos_ids(modality_id=0, type="text", start=(0, 0), num_token=None, height=None, width=None):
-    if type == "text":
-        assert num_token
-        if height or width:
-            print('Warning: The parameters of height and width will be ignored in "text" type.')
-        pos_ids = torch.zeros(num_token, 3)
-        pos_ids[..., 0] = modality_id
-        pos_ids[..., 1] = torch.arange(num_token) + start[0]
-        pos_ids[..., 2] = torch.arange(num_token) + start[1]
-    elif type == "image":
-        assert height and width
-        if num_token:
-            print('Warning: The parameter of num_token will be ignored in "image" type.')
-        pos_ids = torch.zeros(height, width, 3)
-        pos_ids[..., 0] = modality_id
-        pos_ids[..., 1] = pos_ids[..., 1] + torch.arange(height)[:, None] + start[0]
-        pos_ids[..., 2] = pos_ids[..., 2] + torch.arange(width)[None, :] + start[1]
-        pos_ids = pos_ids.reshape(height * width, 3)
-    else:
-        raise KeyError(f'Unknow type {type}, only support "text" or "image".')
-    return pos_ids
-
-
-# Copied from diffusers.pipelines.longcat_image.pipeline_longcat_image.calculate_shift
-def calculate_shift(
-    image_seq_len,
-    base_seq_len: int = 256,
-    max_seq_len: int = 4096,
-    base_shift: float = 0.5,
-    max_shift: float = 1.15,
-):
-    m = (max_shift - base_shift) / (max_seq_len - base_seq_len)
-    b = base_shift - m * base_seq_len
-    mu = image_seq_len * m + b
-    return mu
-
-
-# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
-def retrieve_timesteps(
-    scheduler,
-    num_inference_steps: Optional[int] = None,
-    device: Optional[Union[str, torch.device]] = None,
-    timesteps: Optional[List[int]] = None,
-    sigmas: Optional[List[float]] = None,
-    **kwargs,
-):
-    r"""
-    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
-    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
-
-    Args:
-        scheduler (`SchedulerMixin`):
-            The scheduler to get timesteps from.
-        num_inference_steps (`int`):
-            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
-            must be `None`.
-        device (`str` or `torch.device`, *optional*):
-            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
-        timesteps (`List[int]`, *optional*):
-            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
-            `num_inference_steps` and `sigmas` must be `None`.
-        sigmas (`List[float]`, *optional*):
-            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
-            `num_inference_steps` and `timesteps` must be `None`.
-
-    Returns:
-        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
-        second element is the number of inference steps.
-    """
-    if timesteps is not None and sigmas is not None:
-        raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
-    if timesteps is not None:
-        accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
-        if not accepts_timesteps:
-            raise ValueError(
-                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
-                f" timestep schedules. Please check whether you are using the correct scheduler."
-            )
-        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
-        timesteps = scheduler.timesteps
-        num_inference_steps = len(timesteps)
-    elif sigmas is not None:
-        accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
-        if not accept_sigmas:
-            raise ValueError(
-                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
-                f" sigmas schedules. Please check whether you are using the correct scheduler."
-            )
-        scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
-        timesteps = scheduler.timesteps
-        num_inference_steps = len(timesteps)
-    else:
-        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
-        timesteps = scheduler.timesteps
-    return timesteps, num_inference_steps
-
-
-# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
-def retrieve_latents(
-    encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
-):
-    if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
-        return encoder_output.latent_dist.sample(generator)
-    elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax":
-        return encoder_output.latent_dist.mode()
-    elif hasattr(encoder_output, "latents"):
-        return encoder_output.latents
-    else:
-        raise AttributeError("Could not access latents of provided encoder_output")
-
-
-def calculate_dimensions(target_area, ratio):
-    width = math.sqrt(target_area * ratio)
-    height = width / ratio
-
-    width = width if width % 16 == 0 else (width // 16 + 1) * 16
-    height = height if height % 16 == 0 else (height // 16 + 1) * 16
-
-    width = int(width)
-    height = int(height)
-
-    return width, height
-
-
-class LongCatImageEditPipeline(DiffusionPipeline, FromSingleFileMixin):
-    r"""
-    The LongCat-Image-Edit pipeline for image editing.
-    """
-
-    model_cpu_offload_seq = "text_encoder->image_encoder->transformer->vae"
-    _optional_components = []
-    _callback_tensor_inputs = ["latents", "prompt_embeds"]
-
-    def __init__(
-        self,
-        scheduler: FlowMatchEulerDiscreteScheduler,
-        vae: AutoencoderKL,
-        text_encoder: Qwen2_5_VLForConditionalGeneration,
-        tokenizer: Qwen2Tokenizer,
-        text_processor: Qwen2VLProcessor,
-        transformer: LongCatImageTransformer2DModel,
-    ):
-        super().__init__()
-
-        self.register_modules(
-            vae=vae,
-            text_encoder=text_encoder,
-            tokenizer=tokenizer,
-            transformer=transformer,
-            scheduler=scheduler,
-            text_processor=text_processor,
-        )
-
-        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) if getattr(self, "vae", None) else 8
-        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor * 2)
-        self.image_processor_vl = text_processor.image_processor
-
-        self.image_token = "<|image_pad|>"
-        self.prompt_template_encode_prefix = "<|im_start|>system\nAs an image editing expert, first analyze the content and attributes of the input image(s). Then, based on the user's editing instructions, clearly and precisely determine how to modify the given image(s), ensuring that only the specified parts are altered and all other aspects remain consistent with the original(s).<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>"
-        self.prompt_template_encode_suffix = "<|im_end|>\n<|im_start|>assistant\n"
-        self.default_sample_size = 128
-        self.tokenizer_max_length = 512
-
-    def _encode_prompt(self, prompt, image):
-        raw_vl_input = self.image_processor_vl(images=image, return_tensors="pt")
-        pixel_values = raw_vl_input["pixel_values"]
-        image_grid_thw = raw_vl_input["image_grid_thw"]
-        all_tokens = []
-        for clean_prompt_sub, matched in split_quotation(prompt[0]):
-            if matched:
-                for sub_word in clean_prompt_sub:
-                    tokens = self.tokenizer(sub_word, add_special_tokens=False)["input_ids"]
-                    all_tokens.extend(tokens)
-            else:
-                tokens = self.tokenizer(clean_prompt_sub, add_special_tokens=False)["input_ids"]
-                all_tokens.extend(tokens)
-
-        if len(all_tokens) > self.tokenizer_max_length:
-            logger.warning(
-                "Your input was truncated because `max_sequence_length` is set to "
-                f" {self.tokenizer_max_length} input token nums : {len(len(all_tokens))}"
-            )
-            all_tokens = all_tokens[: self.tokenizer_max_length]
-
-        text_tokens_and_mask = self.tokenizer.pad(
-            {"input_ids": [all_tokens]},
-            max_length=self.tokenizer_max_length,
-            padding="max_length",
-            return_attention_mask=True,
-            return_tensors="pt",
-        )
-
-        text = self.prompt_template_encode_prefix
-
-        merge_length = self.image_processor_vl.merge_size**2
-        while self.image_token in text:
-            num_image_tokens = image_grid_thw.prod() // merge_length
-            text = text.replace(self.image_token, "<|placeholder|>" * num_image_tokens, 1)
-        text = text.replace("<|placeholder|>", self.image_token)
-
-        prefix_tokens = self.tokenizer(text, add_special_tokens=False)["input_ids"]
-        suffix_tokens = self.tokenizer(self.prompt_template_encode_suffix, add_special_tokens=False)["input_ids"]
-
-        vision_start_token_id = self.tokenizer.convert_tokens_to_ids("<|vision_start|>")
-        prefix_len = prefix_tokens.index(vision_start_token_id)
-        suffix_len = len(suffix_tokens)
-
-        prefix_tokens_mask = torch.tensor([1] * len(prefix_tokens), dtype=text_tokens_and_mask.attention_mask[0].dtype)
-        suffix_tokens_mask = torch.tensor([1] * len(suffix_tokens), dtype=text_tokens_and_mask.attention_mask[0].dtype)
-
-        prefix_tokens = torch.tensor(prefix_tokens, dtype=text_tokens_and_mask.input_ids.dtype)
-        suffix_tokens = torch.tensor(suffix_tokens, dtype=text_tokens_and_mask.input_ids.dtype)
-
-        input_ids = torch.cat((prefix_tokens, text_tokens_and_mask.input_ids[0], suffix_tokens), dim=-1)
-        attention_mask = torch.cat(
-            (prefix_tokens_mask, text_tokens_and_mask.attention_mask[0], suffix_tokens_mask), dim=-1
-        )
-
-        input_ids = input_ids.unsqueeze(0).to(self.device)
-        attention_mask = attention_mask.unsqueeze(0).to(self.device)
-
-        pixel_values = pixel_values.to(self.device)
-        image_grid_thw = image_grid_thw.to(self.device)
-
-        text_output = self.text_encoder(
-            input_ids=input_ids,
-            attention_mask=attention_mask,
-            pixel_values=pixel_values,
-            image_grid_thw=image_grid_thw,
-            output_hidden_states=True,
-        )
-        # [max_sequence_length, batch, hidden_size] -> [batch, max_sequence_length, hidden_size]
-        # clone to have a contiguous tensor
-        prompt_embeds = text_output.hidden_states[-1].detach()
-        prompt_embeds = prompt_embeds[:, prefix_len:-suffix_len, :]
-        return prompt_embeds
-
-    def encode_prompt(
-        self,
-        prompt: List[str] = None,
-        image: Optional[torch.Tensor] = None,
-        num_images_per_prompt: Optional[int] = 1,
-        prompt_embeds: Optional[torch.Tensor] = None,
-    ):
-        prompt = [prompt] if isinstance(prompt, str) else prompt
-        batch_size = len(prompt)
-        # If prompt_embeds is provided and prompt is None, skip encoding
-        if prompt_embeds is None:
-            prompt_embeds = self._encode_prompt(prompt, image)
-
-        _, seq_len, _ = prompt_embeds.shape
-        # duplicate text embeddings and attention mask for each generation per prompt, using mps friendly method
-        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
-        prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
-
-        text_ids = prepare_pos_ids(modality_id=0, type="text", start=(0, 0), num_token=prompt_embeds.shape[1]).to(
-            self.device
-        )
-        return prompt_embeds, text_ids
-
-    @staticmethod
-    def _pack_latents(latents, batch_size, num_channels_latents, height, width):
-        latents = latents.view(batch_size, num_channels_latents, height // 2, 2, width // 2, 2)
-        latents = latents.permute(0, 2, 4, 1, 3, 5)
-        latents = latents.reshape(batch_size, (height // 2) * (width // 2), num_channels_latents * 4)
-
-        return latents
-
-    @staticmethod
-    def _unpack_latents(latents, height, width, vae_scale_factor):
-        batch_size, num_patches, channels = latents.shape
-
-        # VAE applies 8x compression on images but we must also account for packing which requires
-        # latent height and width to be divisible by 2.
-        height = 2 * (int(height) // (vae_scale_factor * 2))
-        width = 2 * (int(width) // (vae_scale_factor * 2))
-
-        latents = latents.view(batch_size, height // 2, width // 2, channels // 4, 2, 2)
-        latents = latents.permute(0, 3, 1, 4, 2, 5)
-
-        latents = latents.reshape(batch_size, channels // (2 * 2), height, width)
-
-        return latents
-
-    def _encode_vae_image(self, image: torch.Tensor, generator: torch.Generator):
-        if isinstance(generator, list):
-            image_latents = [
-                retrieve_latents(self.vae.encode(image[i : i + 1]), generator=generator[i], sample_mode="argmax")
-                for i in range(image.shape[0])
-            ]
-            image_latents = torch.cat(image_latents, dim=0)
-        else:
-            image_latents = retrieve_latents(self.vae.encode(image), generator=generator, sample_mode="argmax")
-        image_latents = (image_latents - self.vae.config.shift_factor) * self.vae.config.scaling_factor
-
-        return image_latents
-
-    @property
-    def do_classifier_free_guidance(self):
-        return self._guidance_scale > 1
-
-    def prepare_latents(
-        self,
-        image,
-        batch_size,
-        num_channels_latents,
-        height,
-        width,
-        dtype,
-        prompt_embeds_length,
-        device,
-        generator,
-        latents=None,
-    ):
-        # VAE applies 8x compression on images but we must also account for packing which requires
-        # latent height and width to be divisible by 2.
-        height = 2 * (int(height) // (self.vae_scale_factor * 2))
-        width = 2 * (int(width) // (self.vae_scale_factor * 2))
-
-        image_latents, image_latents_ids = None, None
-
-        if image is not None:
-            image = image.to(device=self.device, dtype=dtype)
-
-            if image.shape[1] != self.vae.config.latent_channels:
-                image_latents = self._encode_vae_image(image=image, generator=generator)
-            else:
-                image_latents = image
-            if batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] == 0:
-                additional_image_per_prompt = batch_size // image_latents.shape[0]
-                image_latents = torch.cat([image_latents] * additional_image_per_prompt, dim=0)
-            elif batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] != 0:
-                raise ValueError(
-                    f"Cannot duplicate `image` of batch size {image_latents.shape[0]} to {batch_size} text prompts."
-                )
-            else:
-                image_latents = torch.cat([image_latents], dim=0)
-
-            image_latents = self._pack_latents(image_latents, batch_size, num_channels_latents, height, width)
-
-            image_latents_ids = prepare_pos_ids(
-                modality_id=2,
-                type="image",
-                start=(prompt_embeds_length, prompt_embeds_length),
-                height=height // 2,
-                width=width // 2,
-            ).to(device, dtype=torch.float64)
-
-        shape = (batch_size, num_channels_latents, height, width)
-        latents_ids = prepare_pos_ids(
-            modality_id=1,
-            type="image",
-            start=(prompt_embeds_length, prompt_embeds_length),
-            height=height // 2,
-            width=width // 2,
-        ).to(device)
-
-        if isinstance(generator, list) and len(generator) != batch_size:
-            raise ValueError(
-                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
-                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
-            )
-
-        if latents is None:
-            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
-            latents = self._pack_latents(latents, batch_size, num_channels_latents, height, width)
-        else:
-            latents = latents.to(device=device, dtype=dtype)
-
-        return latents, image_latents, latents_ids, image_latents_ids
-
-    @property
-    def guidance_scale(self):
-        return self._guidance_scale
-
-    @property
-    def joint_attention_kwargs(self):
-        return self._joint_attention_kwargs
-
-    @property
-    def num_timesteps(self):
-        return self._num_timesteps
-
-    @property
-    def current_timestep(self):
-        return self._current_timestep
-
-    @property
-    def interrupt(self):
-        return self._interrupt
-
-    def check_inputs(
-        self, prompt, height, width, negative_prompt=None, prompt_embeds=None, negative_prompt_embeds=None
-    ):
-        if height % (self.vae_scale_factor * 2) != 0 or width % (self.vae_scale_factor * 2) != 0:
-            logger.warning(
-                f"`height` and `width` have to be divisible by {self.vae_scale_factor * 2} but are {height} and {width}. Dimensions will be resized accordingly"
-            )
-
-        if prompt is not None and prompt_embeds is not None:
-            raise ValueError(
-                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
-                " only forward one of the two."
-            )
-        elif prompt is None and prompt_embeds is None:
-            raise ValueError(
-                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
-            )
-        elif prompt is not None:
-            if isinstance(prompt, str):
-                pass
-            elif isinstance(prompt, list) and len(prompt) == 1:
-                pass
-            else:
-                raise ValueError(
-                    f"`prompt` must be a `str` or a `list` of length 1, but is {prompt} (type: {type(prompt)})"
-                )
-
-        if negative_prompt is not None and negative_prompt_embeds is not None:
-            raise ValueError(
-                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
-                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
-            )
-
-    @replace_example_docstring(EXAMPLE_DOC_STRING)
-    @torch.no_grad()
-    def __call__(
-        self,
-        image: Optional[PIL.Image.Image] = None,
-        prompt: Union[str, List[str]] = None,
-        negative_prompt: Union[str, List[str]] = None,
-        num_inference_steps: int = 50,
-        sigmas: Optional[List[float]] = None,
-        guidance_scale: float = 4.5,
-        num_images_per_prompt: Optional[int] = 1,
-        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
-        latents: Optional[torch.FloatTensor] = None,
-        prompt_embeds: Optional[torch.FloatTensor] = None,
-        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
-        output_type: Optional[str] = "pil",
-        return_dict: bool = True,
-        joint_attention_kwargs: Optional[Dict[str, Any]] = None,
-    ):
-        r"""
-        Function invoked when calling the pipeline for generation.
-
-        Examples:
-
-        Returns:
-            [`~pipelines.LongCatImagePipelineOutput`] or `tuple`: [`~pipelines.LongCatImagePipelineOutput`] if
-            `return_dict` is True, otherwise a `tuple`. When returning a tuple, the first element is a list with the
-            generated images.
-        """
-
-        image_size = image[0].size if isinstance(image, list) else image.size
-        calculated_width, calculated_height = calculate_dimensions(1024 * 1024, image_size[0] * 1.0 / image_size[1])
-
-        # 1. Check inputs. Raise error if not correct
-        self.check_inputs(
-            prompt,
-            calculated_height,
-            calculated_width,
-            negative_prompt=negative_prompt,
-            prompt_embeds=prompt_embeds,
-            negative_prompt_embeds=negative_prompt_embeds,
-        )
-
-        self._guidance_scale = guidance_scale
-        self._joint_attention_kwargs = joint_attention_kwargs
-        self._current_timestep = None
-        self._interrupt = False
-
-        # 2. Define call parameters
-        if prompt is not None and isinstance(prompt, str):
-            batch_size = 1
-        elif prompt is not None and isinstance(prompt, list):
-            batch_size = len(prompt)
-        else:
-            batch_size = prompt_embeds.shape[0]
-
-        device = self._execution_device
-
-        # 3. Preprocess image
-        if image is not None and not (isinstance(image, torch.Tensor) and image.size(1) == self.latent_channels):
-            image = self.image_processor.resize(image, calculated_height, calculated_width)
-            prompt_image = self.image_processor.resize(image, calculated_height // 2, calculated_width // 2)
-            image = self.image_processor.preprocess(image, calculated_height, calculated_width)
-
-        negative_prompt = "" if negative_prompt is None else negative_prompt
-        (prompt_embeds, text_ids) = self.encode_prompt(
-            prompt=prompt, image=prompt_image, prompt_embeds=prompt_embeds, num_images_per_prompt=num_images_per_prompt
-        )
-        if self.do_classifier_free_guidance:
-            (negative_prompt_embeds, negative_text_ids) = self.encode_prompt(
-                prompt=negative_prompt,
-                image=prompt_image,
-                prompt_embeds=negative_prompt_embeds,
-                num_images_per_prompt=num_images_per_prompt,
-            )
-
-        # 4. Prepare latent variables
-        num_channels_latents = 16
-        latents, image_latents, latents_ids, image_latents_ids = self.prepare_latents(
-            image,
-            batch_size * num_images_per_prompt,
-            num_channels_latents,
-            calculated_height,
-            calculated_width,
-            prompt_embeds.dtype,
-            prompt_embeds.shape[1],
-            device,
-            generator,
-            latents,
-        )
-
-        # 5. Prepare timesteps
-        sigmas = np.linspace(1.0, 1.0 / num_inference_steps, num_inference_steps) if sigmas is None else sigmas
-        image_seq_len = latents.shape[1]
-        mu = calculate_shift(
-            image_seq_len,
-            self.scheduler.config.get("base_image_seq_len", 256),
-            self.scheduler.config.get("max_image_seq_len", 4096),
-            self.scheduler.config.get("base_shift", 0.5),
-            self.scheduler.config.get("max_shift", 1.15),
-        )
-        timesteps, num_inference_steps = retrieve_timesteps(
-            self.scheduler,
-            num_inference_steps,
-            device,
-            sigmas=sigmas,
-            mu=mu,
-        )
-        num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
-        self._num_timesteps = len(timesteps)
-
-        # handle guidance
-        guidance = None
-
-        if self.joint_attention_kwargs is None:
-            self._joint_attention_kwargs = {}
-
-        if image is not None:
-            latent_image_ids = torch.cat([latents_ids, image_latents_ids], dim=0)
-        else:
-            latent_image_ids = latents_ids
-
-        # 6. Denoising loop
-        with self.progress_bar(total=num_inference_steps) as progress_bar:
-            for i, t in enumerate(timesteps):
-                if self.interrupt:
-                    continue
-
-                self._current_timestep = t
-
-                latent_model_input = latents
-                if image_latents is not None:
-                    latent_model_input = torch.cat([latents, image_latents], dim=1)
-
-                timestep = t.expand(latent_model_input.shape[0]).to(latents.dtype)
-                with self.transformer.cache_context("cond"):
-                    noise_pred_text = self.transformer(
-                        hidden_states=latent_model_input,
-                        timestep=timestep / 1000,
-                        guidance=guidance,
-                        encoder_hidden_states=prompt_embeds,
-                        txt_ids=text_ids,
-                        img_ids=latent_image_ids,
-                        return_dict=False,
-                    )[0]
-                    noise_pred_text = noise_pred_text[:, :image_seq_len]
-                if self.do_classifier_free_guidance:
-                    with self.transformer.cache_context("uncond"):
-                        noise_pred_uncond = self.transformer(
-                            hidden_states=latent_model_input,
-                            timestep=timestep / 1000,
-                            encoder_hidden_states=negative_prompt_embeds,
-                            txt_ids=negative_text_ids,
-                            img_ids=latent_image_ids,
-                            return_dict=False,
-                        )[0]
-                        noise_pred_uncond = noise_pred_uncond[:, :image_seq_len]
-                    noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
-                else:
-                    noise_pred = noise_pred_text
-                # compute the previous noisy sample x_t -> x_t-1
-                latents_dtype = latents.dtype
-                latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]
-
-                if latents.dtype != latents_dtype:
-                    if torch.backends.mps.is_available():
-                        # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
-                        latents = latents.to(latents_dtype)
-
-                # call the callback, if provided
-                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
-                    progress_bar.update()
-
-                if XLA_AVAILABLE:
-                    xm.mark_step()
-
-        self._current_timestep = None
-
-        if output_type == "latent":
-            image = latents
-        else:
-            latents = self._unpack_latents(latents, calculated_height, calculated_width, self.vae_scale_factor)
-            latents = (latents / self.vae.config.scaling_factor) + self.vae.config.shift_factor
-
-            if latents.dtype != self.vae.dtype:
-                latents = latents.to(dtype=self.vae.dtype)
-
-            image = self.vae.decode(latents, return_dict=False)[0]
-            image = self.image_processor.postprocess(image, output_type=output_type)
-
-        # Offload all models
-        self.maybe_free_model_hooks()
-
-        if not return_dict:
-            return (image,)
-
-        return LongCatImagePipelineOutput(images=image)
--- a/src/diffusers/pipelines/longcat_image/pipeline_output.py
+++ b/src/diffusers/pipelines/longcat_image/pipeline_output.py
@@ -1,21 +0,0 @@
-from dataclasses import dataclass
-from typing import List, Union
-
-import numpy as np
-import PIL.Image
-
-from diffusers.utils import BaseOutput
-
-
-@dataclass
-class LongCatImagePipelineOutput(BaseOutput):
-    """
-    Output class for Stable Diffusion pipelines.
-
-    Args:
-        images (`List[PIL.Image.Image]` or `np.ndarray`)
-            List of denoised PIL images of length `batch_size` or numpy array of shape `(batch_size, height, width,
-            num_channels)`. PIL images or numpy array present the denoised images of the diffusion pipeline.
-    """
-
-    images: Union[List[PIL.Image.Image], np.ndarray]
--- a/src/diffusers/pipelines/longcat_image/system_messages.py
+++ b/src/diffusers/pipelines/longcat_image/system_messages.py
@@ -1,142 +0,0 @@
-SYSTEM_PROMPT_EN = """
-You are a prompt engineering expert for text-to-image models. Since text-to-image models have limited capabilities in
-understanding user prompts, you need to identify the core theme and intent of the user's input and improve the model's
-understanding accuracy and generation quality through optimization and rewriting. The rewrite must strictly retain all
-information from the user's original prompt without deleting or distorting any details. Specific requirements are as
-follows:
-1. The rewrite must not affect any information expressed in the user's original prompt; the rewritten prompt should use
-   coherent natural language, avoid low-information redundant descriptions, and keep the rewritten prompt length as
-   concise as possible.
-2. Ensure consistency between input and output languages: Chinese input yields Chinese output, and English input yields
-   English output. The rewritten token count should not exceed 512.
-3. The rewritten description should further refine subject characteristics and aesthetic techniques appearing in the
-   original prompt, such as lighting and textures.
-4. If the original prompt does not specify an image style, ensure the rewritten prompt uses a **realistic photography
-   style**. If the user specifies a style, retain the user's style.
-5. When the original prompt requires reasoning to clarify user intent, use logical reasoning based on world knowledge
-   to convert vague abstract descriptions into specific tangible objects (e.g., convert "the tallest animal" to "a
-   giraffe").
-6. When the original prompt requires text generation, please use double quotes to enclose the text part (e.g., `"50%
-   OFF"`).
-7. When the original prompt requires generating text-heavy scenes like webpages, logos, UIs, or posters, and no
-   specific text content is specified, you need to infer appropriate text content and enclose it in double quotes. For
-   example, if the user inputs: "A tourism flyer with a grassland theme," it should be rewritten as: "A tourism flyer
-   with the image title 'Grassland'."
-8. When negative words exist in the original prompt, ensure the rewritten prompt does not contain negative words. For
-   example, "a lakeside without boats" should be rewritten such that the word "boat" does not appear at all.
-9. Except for text content explicitly requested by the user, **adding any extra text content is prohibited**.
-Here are examples of rewrites for different types of prompts: # Examples (Few-Shot Learning)
-  1. User Input: An animal with nine lives.
-    Rewrite Output: A cat bathed in soft sunlight, its fur soft and glossy. The background is a comfortable home
-    environment with light from the window filtering through curtains, creating a warm light and shadow effect. The
-    shot uses a medium distance perspective to highlight the cat's leisurely and stretched posture. Light cleverly hits
-    the cat's face, emphasizing its spirited eyes and delicate whiskers, adding depth and affinity to the image.
-  2. User Input: Create an anime-style tourism flyer with a grassland theme.
-    Rewrite Output: In the lower right of the center, a short-haired girl sits sideways on a gray, irregularly shaped
-    rock. She wears a white short-sleeved dress and brown flat shoes, holding a bunch of small white flowers in her
-    left hand, smiling with her legs hanging naturally. The girl has dark brown shoulder-length hair with bangs
-    covering her forehead, brown eyes, and a slightly open mouth. The rock surface has textures of varying depths. To
-    the girl's left and front is lush grass, with long, yellow-green blades, some glowing golden in the sunlight. The
-    grass extends into the distance, forming rolling green hills that fade in color as they recede. The sky occupies
-    the upper half of the picture, pale blue dotted with a few fluffy white clouds. In the upper left corner, there is
-    a line of text in italic, dark green font reading "Explore Nature's Peace". Colors are dominated by green, blue,
-    and yellow, fluid lines, and distinct light and shadow contrast, creating a quiet and comfortable atmosphere.
-  3. User Input: A Christmas sale poster with a red background, promoting a Buy 1 Get 1 Free milk tea offer.
-    Rewrite Output: The poster features an overall red tone, embellished with white snowflake patterns on the top and
-    left side. The upper right features a bunch of holly leaves with red berries and a pine cone. In the upper center,
-    golden 3D text reads "Christmas Heartwarming Feedback" centered, along with red bold text "Buy 1 Get 1". Below, two
-    transparent cups filled with bubble tea are placed side by side; the tea is light brown with dark brown pearls
-    scattered at the bottom and middle. Below the cups, white snow piles up, decorated with pine branches, red berries,
-    and pine cones. A blurry Christmas tree is faintly visible in the lower right corner. The image has high clarity,
-    accurate text content, a unified design style, a prominent Christmas theme, and a reasonable layout, providing
-    strong visual appeal.
-  4. User Input: A woman indoors shot in natural light, smiling with arms crossed, showing a relaxed and confident
-     posture.
-    Rewrite Output: The image features a young Asian woman with long dark brown hair naturally falling over her
-    shoulders, with some strands illuminated by light, showing a soft sheen. Her features are delicate, with long
-    eyebrows, bright and spirited dark brown eyes looking directly at the camera, revealing peace and confidence. She
-    has a high nose bridge, full lips with nude lipstick, and corners of the mouth slightly raised in a faint smile.
-    Her skin is fair, with cheeks and collarbones illuminated by warm light, showing a healthy ruddiness. She wears a
-    black spaghetti strap tank top revealing graceful collarbone lines, and a thin gold necklace with small beads and
-    metal bars glinting in the light. Her outer layer is a beige knitted cardigan, soft in texture with visible
-    knitting patterns on the sleeves. Her arms are crossed over her chest, hands covered by the cardigan sleeves, in a
-    relaxed posture. The background is a pure dark brown without extra decoration, making the figure the absolute
-    focus. The figure is located in the center of the frame. Light enters from the upper right, creating bright spots
-    on her left cheek, neck, and collarbone, while the right side is slightly shadowed, creating a three-dimensional
-    and soft tone. Image details are clear, showcasing skin texture, hair, and clothing materials well. Colors are
-    dominated by warm tones, with the combination of beige and dark brown creating a warm and comfortable atmosphere.
-    The overall style is natural, elegant, and artistic.
-  5. User Input: Create a series of images showing the growth process of an apple from seed to fruit. The series should
-     include four stages: 1. Sowing, 2. Seedling growth, 3. Plant maturity, 4. Fruit harvesting.
-    Rewrite Output: A 4-panel exquisite illustration depicting the growth process of an apple, capturing each stage
-    precisely and clearly. 1. "Sowing": A close-up shot of a hand gently placing a small apple seed into fertile dark
-    soil, with visible soil texture and the seed's smooth surface. The background is a soft-focus garden dotted with
-    green leaves and sunlight filtering through. 2. "Seedling Growth": A young apple sapling breaks through the soil,
-    stretching tender green leaves toward the sky. The scene is set in a vibrant garden illuminated by warm golden
-    light, highlighting the seedling's delicate structure. 3. "Plant Maturity": A mature apple tree, lush with branches
-    and leaves, covered in tender green foliage and developing small apples. The background is a vibrant orchard under
-    a clear blue sky, with dappled sunlight creating a peaceful atmosphere. 4. "Fruit Harvesting": A hand reaches into
-    the tree to pick a ripe red apple, its smooth skin glistening in the sun. The scene shows the abundance of the
-    orchard, with baskets of apples in the background, giving a sense of fulfillment. Each illustration uses a
-    realistic style, focusing on details and harmonious colors to showcase the natural beauty and development of the
-    apple's life cycle.
-  6. User Input: If 1 represents red, 2 represents green, 3 represents purple, and 4 represents yellow, please generate
-     a four-color rainbow based on this rule. The color order from top to bottom is 3142.
-    Rewrite Output: The image consists of four horizontally arranged colored stripes, ordered from top to bottom as
-    purple, red, yellow, and green. A white number is centered on each stripe. The top purple stripe features the
-    number "3", the red stripe below it has the number "1", the yellow stripe further down has the number "4", and the
-    bottom green stripe has the number "2". All numbers use a sans-serif font in pure white, forming a sharp contrast
-    with the background colors to ensure good readability. The stripes have high color saturation and a slight texture.
-    The overall layout is simple and clear, with distinct visual effects and no extra decorative elements, emphasizing
-    the numerical information. The image is high definition, with accurate colors and a consistent style, offering
-    strong visual appeal.
-  7. User Input: A stone tablet carved with "Guan Guan Ju Jiu, On the River Isle", natural light, background is a
-     Chinese garden.
-    Rewrite Output: An ancient stone tablet carved with "Guan Guan Ju Jiu, On the River Isle", the surface covered with
-    traces of time, the writing clear and deep. Natural light falls from above, softly illuminating every detail of the
-    stone tablet and enhancing its sense of history. The background is an elegant Chinese garden featuring lush bamboo
-    forests, winding paths, and quiet pools, creating a serene and distant atmosphere. The overall picture uses a
-    realistic style with rich details and natural light and shadow effects, highlighting the cultural heritage of the
-    stone tablet and the classical beauty of the garden.
-# Output Format Please directly output the rewritten and optimized Prompt content. Do not include any explanatory
-language or JSON formatting, and do not add opening or closing quotes yourself."""
-
-
-SYSTEM_PROMPT_ZH = """
-你是一名文生图模型的prompt
-engineering专家。由于文生图模型对用户prompt的理解能力有限，你需要识别用户输入的核心主题和意图，并通过优化改写提升模型的理解准确性和生成质量。改写必须严格保留用户原始prompt的所有信息，不得删减或曲解任何细节。
-具体要求如下：
-1. 改写不能影响用户原始prompt里表达的任何信息，改写后的prompt应该使用连贯的自然语言表达,不要出现低信息量的冗余描述，尽可能保持改写后prompt长度精简。
-2. 请确保输入和输出的语言类型一致，中文输入中文输出，英文输入英文输出，改写后的token数量不要超过512个;
-3. 改写后的描述应当进一步完善原始prompt中出现的主体特征、美学技巧，如打光、纹理等；
-4. 如果原始prompt没有指定图片风格时，确保改写后的prompt使用真实摄影风格，如果用户指定了图片风格，则保留用户风格；
-5. 当原始prompt需要推理才能明确用户意图时，根据世界知识进行适当逻辑推理，将模糊抽象描述转化为具体指向事物（例：将"最高的动物"转化为"一头长颈鹿"）。
-6. 当原始prompt需要生成文字时，请使用双引号圈定文字部分，例：`"限时5折"`）。
-7. 当原始prompt需要生成网页、logo、ui、海报等文字场景时，且没有指定具体的文字内容时，需要推断出合适的文字内容，并使用双引号圈定，如用户输入：一个旅游宣传单，以草原为主题。应该改写成：一个旅游宣传单，图片标题为“草原”。
-8. 当原始prompt中存在否定词时，需要确保改写后的prompt不存在否定词，如没有船的湖边，改写后的prompt不能出现船这个词汇。
-9. 除非用户指定生成品牌logo，否则不要增加额外的品牌logo.
-10. 除了用户明确要求书写的文字内容外，**禁止增加任何额外的文字内容**。
-以下是针对不同类型prompt改写的示例：
-
-# Examples (Few-Shot Learning)
-  1. 用户输入: 九条命的动物。
-    改写输出:
-    一只猫，被柔和的阳光笼罩着，毛发柔软而富有光泽。背景是一个舒适的家居环境，窗外的光线透过窗帘，形成温馨的光影效果。镜头采用中距离视角，突出猫悠闲舒展的姿态。光线巧妙地打在猫的脸部，强调它灵动的眼睛和精致的胡须，增加画面的层次感与亲和力。
-  2. 用户输入: 制作一个动画风格的旅游宣传单，以草原为主题。
-    改写输出:
-    画面中央偏右下角，一个短发女孩侧身坐在灰色的不规则形状岩石上，她穿着白色短袖连衣裙和棕色平底鞋，左手拿着一束白色小花，面带微笑，双腿自然垂下。女孩的头发为深棕色，齐肩短发，刘海覆盖额头，眼睛呈棕色，嘴巴微张。岩石表面有深浅不一的纹理。女孩的左侧和前方是茂盛的草地，草叶细长，呈黄绿色，部分草叶在阳光下泛着金色的光芒，仿佛被阳光照亮。草地向远处延伸，形成连绵起伏的绿色山丘，山丘的颜色由近及远逐渐变浅。天空占据了画面的上半部分，呈淡蓝色，点缀着几朵白色蓬松的云彩。画面的左上角有一行文字，文字内容是斜体、深绿色的“Explore
-    Nature's Peace”。色彩以绿色、蓝色和黄色为主，线条流畅，光影明暗对比明显，营造出一种宁静、舒适的氛围。
-  3. 用户输入: 一张以红色为背景的圣诞节促销海报，主要宣传奶茶买一送一的优惠活动。
-    改写输出: 海报整体呈现红色调，上方和左侧点缀着白色雪花图案，右上方有一束冬青叶和红色浆果，以及一个松果。海报中央偏上位置，金色立体字样“圣诞节
-    暖心回馈”居中排列，和红色粗体字“买1送1”。海报下方，两个装满珍珠奶茶的透明杯子并排摆放，杯中奶茶呈浅棕色，底部和中间散布着深棕色珍珠。杯子下方，堆积着白色雪花，雪花上装饰着松枝、红色浆果和松果。右下角隐约可见一棵模糊的圣诞树。图片清晰度高，文字内容准确，整体设计风格统一，圣诞主题突出，排版布局合理，具有较强的视觉吸引力。
-  4. 用户输入: 一位女性在室内以自然光线拍摄，她面带微笑，双臂交叉，展现出轻松自信的姿态。
-    改写输出:
-    画面中是一位年轻的亚洲女性，她拥有深棕色的长发，发丝自然地垂落在双肩，部分发丝被光线照亮，呈现出柔和的光泽。她的五官精致，眉毛修长，眼睛明亮有神，瞳孔呈深棕色，眼神直视镜头，流露出平和与自信。鼻梁挺拔，嘴唇丰满，涂有裸色系唇膏，嘴角微微上扬，展现出浅浅的微笑。她的肤色白皙，脸颊和锁骨处被暖色调的光线照亮，呈现出健康的红润感。她穿着一件黑色的细吊带背心，肩带纤细，露出优美的锁骨线条。脖颈上佩戴着一条金色的细项链，项链由小珠子和几个细长的金属条组成，在光线下闪烁着光泽。她的外搭是一件米黄色的针织开衫，材质柔软，袖子部分有明显的针织纹理。她双臂交叉在胸前，双手被开衫的袖子覆盖，姿态放松。背景是纯粹的深棕色，没有多余的装饰，使得人物成为画面的绝对焦点。人物位于画面中央。光线从画面的右上方射入，在人物的左侧脸颊、脖颈和锁骨处形成明亮的光斑，右侧则略显阴影，营造出立体感和柔和的影调。图像细节清晰，人物的皮肤纹理、发丝以及衣物材质都得到了很好的展现。色彩以暖色调为主，米黄色和深棕色的搭配营造出温馨舒适的氛围。整体呈现出一种自然、优雅且富有亲和力的艺术风格。
-  5. 用户输入：创作一系列图片，展现苹果从种子到结果的生长过程。该系列图片应包含以下四个阶段：1. 播种，2. 幼苗生长，3. 植物成熟，4. 果实采摘。
-    改写输出：一个4宫格的精美插图，描绘苹果的生长过程，精确清晰地捕捉每个阶段。1.“播种”：特写镜头，一只手轻轻地将一颗小小的苹果种子放入肥沃的深色土壤中，土壤的纹理和种子光滑的表面清晰可见。背景是花园的柔焦画面，点缀着绿色的树叶和透过树叶洒下的阳光。2.“幼苗生长”：一棵幼小的苹果树苗破土而出，嫩绿的叶子向天空舒展。场景设定在一个生机勃勃的花园中，温暖的金光照亮了它。幼苗的纤细结构。3.“植物的成熟”：一棵成熟的苹果树，枝繁叶茂，挂满了嫩绿的叶子和正在萌发的小苹果。背景是一片生机勃勃的果园，湛蓝的天空下，斑驳的阳光营造出宁静祥和的氛围。4.“采摘果实”：一只手伸向树上，摘下一个成熟的红苹果，苹果光滑的果皮在阳光下闪闪发光。画面展现了果园的丰收景象，背景中摆放着一篮篮的苹果，给人一种圆满满足的感觉。每幅插图都采用写实风格，注重细节，色彩和谐，展现了苹果生命周期的自然之美和发展过程。
-  6. 用户输入： 如果1代表红色，2代表绿色，3代表紫色，4代表黄色，请按照此规则生成四色彩虹。它的颜色顺序从上到下是3142
-    改写输出：图片由四个水平排列的彩色条纹组成，从上到下依次为紫色、红色、黄色和绿色。每个条纹上都居中放置一个白色数字。最上方的紫色条纹上是数字“3”，其下方红色条纹上是数字“1”，再下方黄色条纹上是数字“4”，最下方的绿色条纹上是数字“2”。所有数字均采用无衬线字体，颜色为纯白色，与背景色形成鲜明对比，确保了良好的可读性。条纹的颜色饱和度高，且带有轻微的纹理感，整体排版简洁明了，视觉效果清晰，没有多余的装饰元素，强调了数字信息本身。图片整体清晰度高，色彩准确，风格一致，具有较强的视觉吸引力。
-  7. 用户输入：石碑上刻着“关关雎鸠，在河之洲”，自然光照，背景是中式园林
-    改写输出：一块古老的石碑上刻着“关关雎鸠，在河之洲”，石碑表面布满岁月的痕迹，字迹清晰而深刻。自然光线从上方洒下，柔和地照亮石碑的每一个细节，增强了其历史感。背景是一座典雅的中式园林，园林中有翠绿的竹林、蜿蜒的小径和静谧的水池，营造出一种宁静而悠远的氛围。整体画面采用写实风格，细节丰富，光影效果自然，突出了石碑的文化底蕴和园林的古典美。
-# 输出格式 请直接输出改写优化后的 Prompt 内容，不要包含任何解释性语言或 JSON 格式，不要自行添加开头或结尾的引号。
-"""
--- a/src/diffusers/pipelines/qwenimage/init.py
+++ b/src/diffusers/pipelines/qwenimage/init.py
@@ -31,7 +31,6 @@ else:
    _import_structure["pipeline_qwenimage_edit_plus"] = ["QwenImageEditPlusPipeline"]
    _import_structure["pipeline_qwenimage_img2img"] = ["QwenImageImg2ImgPipeline"]
    _import_structure["pipeline_qwenimage_inpaint"] = ["QwenImageInpaintPipeline"]
-    _import_structure["pipeline_qwenimage_layered"] = ["QwenImageLayeredPipeline"]

 if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
    try:
@@ -48,7 +47,6 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
        from .pipeline_qwenimage_edit_plus import QwenImageEditPlusPipeline
        from .pipeline_qwenimage_img2img import QwenImageImg2ImgPipeline
        from .pipeline_qwenimage_inpaint import QwenImageInpaintPipeline
-        from .pipeline_qwenimage_layered import QwenImageLayeredPipeline
 else:
    import sys

--- a/src/diffusers/pipelines/qwenimage/pipeline_qwenimage_layered.py
+++ b/src/diffusers/pipelines/qwenimage/pipeline_qwenimage_layered.py
@@ -1,905 +0,0 @@
-# Copyright 2025 Qwen-Image Team and The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import inspect
-import math
-from typing import Any, Callable, Dict, List, Optional, Union
-
-import numpy as np
-import torch
-from transformers import Qwen2_5_VLForConditionalGeneration, Qwen2Tokenizer, Qwen2VLProcessor
-
-from ...image_processor import PipelineImageInput, VaeImageProcessor
-from ...loaders import QwenImageLoraLoaderMixin
-from ...models import AutoencoderKLQwenImage, QwenImageTransformer2DModel
-from ...schedulers import FlowMatchEulerDiscreteScheduler
-from ...utils import is_torch_xla_available, logging, replace_example_docstring
-from ...utils.torch_utils import randn_tensor
-from ..pipeline_utils import DiffusionPipeline
-from .pipeline_output import QwenImagePipelineOutput
-
-
-if is_torch_xla_available():
-    import torch_xla.core.xla_model as xm
-
-    XLA_AVAILABLE = True
-else:
-    XLA_AVAILABLE = False
-
-
-logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
-
-EXAMPLE_DOC_STRING = """
-    Examples:
-        ```py
-        >>> import torch
-        >>> from PIL import Image
-        >>> from diffusers import QwenImageLayeredPipeline
-        >>> from diffusers.utils import load_image
-
-        >>> pipe = QwenImageLayeredPipeline.from_pretrained("Qwen/Qwen-Image-Layered", torch_dtype=torch.bfloat16)
-        >>> pipe.to("cuda")
-        >>> image = load_image(
-        ...     "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/yarn-art-pikachu.png"
-        ... ).convert("RGBA")
-        >>> prompt = ""
-        >>> # Depending on the variant being used, the pipeline call will slightly vary.
-        >>> # Refer to the pipeline documentation for more details.
-        >>> images = pipe(
-        ...     image,
-        ...     prompt,
-        ...     num_inference_steps=50,
-        ...     true_cfg_scale=4.0,
-        ...     layers=4,
-        ...     resolution=640,
-        ...     cfg_normalize=False,
-        ...     use_en_prompt=True,
-        ... ).images[0]
-        >>> for i, image in enumerate(images):
-        ...     image.save(f"{i}.out.png")
-        ```
-"""
-
-
-# Copied from diffusers.pipelines.qwenimage.pipeline_qwenimage.calculate_shift
-def calculate_shift(
-    image_seq_len,
-    base_seq_len: int = 256,
-    max_seq_len: int = 4096,
-    base_shift: float = 0.5,
-    max_shift: float = 1.15,
-):
-    m = (max_shift - base_shift) / (max_seq_len - base_seq_len)
-    b = base_shift - m * base_seq_len
-    mu = image_seq_len * m + b
-    return mu
-
-
-# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
-def retrieve_timesteps(
-    scheduler,
-    num_inference_steps: Optional[int] = None,
-    device: Optional[Union[str, torch.device]] = None,
-    timesteps: Optional[List[int]] = None,
-    sigmas: Optional[List[float]] = None,
-    **kwargs,
-):
-    r"""
-    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
-    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
-
-    Args:
-        scheduler (`SchedulerMixin`):
-            The scheduler to get timesteps from.
-        num_inference_steps (`int`):
-            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
-            must be `None`.
-        device (`str` or `torch.device`, *optional*):
-            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
-        timesteps (`List[int]`, *optional*):
-            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
-            `num_inference_steps` and `sigmas` must be `None`.
-        sigmas (`List[float]`, *optional*):
-            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
-            `num_inference_steps` and `timesteps` must be `None`.
-
-    Returns:
-        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
-        second element is the number of inference steps.
-    """
-    if timesteps is not None and sigmas is not None:
-        raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
-    if timesteps is not None:
-        accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
-        if not accepts_timesteps:
-            raise ValueError(
-                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
-                f" timestep schedules. Please check whether you are using the correct scheduler."
-            )
-        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
-        timesteps = scheduler.timesteps
-        num_inference_steps = len(timesteps)
-    elif sigmas is not None:
-        accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
-        if not accept_sigmas:
-            raise ValueError(
-                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
-                f" sigmas schedules. Please check whether you are using the correct scheduler."
-            )
-        scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
-        timesteps = scheduler.timesteps
-        num_inference_steps = len(timesteps)
-    else:
-        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
-        timesteps = scheduler.timesteps
-    return timesteps, num_inference_steps
-
-
-# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
-def retrieve_latents(
-    encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
-):
-    if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
-        return encoder_output.latent_dist.sample(generator)
-    elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax":
-        return encoder_output.latent_dist.mode()
-    elif hasattr(encoder_output, "latents"):
-        return encoder_output.latents
-    else:
-        raise AttributeError("Could not access latents of provided encoder_output")
-
-
-# Copied from diffusers.pipelines.qwenimage.pipeline_qwenimage_edit_plus.calculate_dimensions
-def calculate_dimensions(target_area, ratio):
-    width = math.sqrt(target_area * ratio)
-    height = width / ratio
-
-    width = round(width / 32) * 32
-    height = round(height / 32) * 32
-
-    return width, height
-
-
-class QwenImageLayeredPipeline(DiffusionPipeline, QwenImageLoraLoaderMixin):
-    r"""
-    The Qwen-Image-Layered pipeline for image decomposing.
-
-    Args:
-        transformer ([`QwenImageTransformer2DModel`]):
-            Conditional Transformer (MMDiT) architecture to denoise the encoded image latents.
-        scheduler ([`FlowMatchEulerDiscreteScheduler`]):
-            A scheduler to be used in combination with `transformer` to denoise the encoded image latents.
-        vae ([`AutoencoderKL`]):
-            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
-        text_encoder ([`Qwen2.5-VL-7B-Instruct`]):
-            [Qwen2.5-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct), specifically the
-            [Qwen2.5-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct) variant.
-        tokenizer (`QwenTokenizer`):
-            Tokenizer of class
-            [CLIPTokenizer](https://huggingface.co/docs/transformers/en/model_doc/clip#transformers.CLIPTokenizer).
-    """
-
-    model_cpu_offload_seq = "text_encoder->transformer->vae"
-    _callback_tensor_inputs = ["latents", "prompt_embeds"]
-
-    def __init__(
-        self,
-        scheduler: FlowMatchEulerDiscreteScheduler,
-        vae: AutoencoderKLQwenImage,
-        text_encoder: Qwen2_5_VLForConditionalGeneration,
-        tokenizer: Qwen2Tokenizer,
-        processor: Qwen2VLProcessor,
-        transformer: QwenImageTransformer2DModel,
-    ):
-        super().__init__()
-
-        self.register_modules(
-            vae=vae,
-            text_encoder=text_encoder,
-            tokenizer=tokenizer,
-            processor=processor,
-            transformer=transformer,
-            scheduler=scheduler,
-        )
-        self.vae_scale_factor = 2 ** len(self.vae.temperal_downsample) if getattr(self, "vae", None) else 8
-        self.latent_channels = self.vae.config.z_dim if getattr(self, "vae", None) else 16
-        # QwenImage latents are turned into 2x2 patches and packed. This means the latent width and height has to be divisible
-        # by the patch size. So the vae scale factor is multiplied by the patch size to account for this
-        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor * 2)
-        self.vl_processor = processor
-        self.tokenizer_max_length = 1024
-
-        self.prompt_template_encode = "<|im_start|>system\nDescribe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|>\n<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n"
-        self.prompt_template_encode_start_idx = 34
-        self.image_caption_prompt_cn = """<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n# 图像标注器\n你是一个专业的图像标注器。请基于输入图像，撰写图注:\n1.
-使用自然、描述性的语言撰写图注，不要使用结构化形式或富文本形式。\n2. 通过加入以下内容，丰富图注细节：\n - 对象的属性：如数量、颜色、形状、大小、位置、材质、状态、动作等\n -
-对象间的视觉关系：如空间关系、功能关系、动作关系、从属关系、比较关系、因果关系等\n - 环境细节：例如天气、光照、颜色、纹理、气氛等\n - 文字内容：识别图像中清晰可见的文字，不做翻译和解释，用引号在图注中强调\n3.
-保持真实性与准确性：\n - 不要使用笼统的描述\n -
-描述图像中所有可见的信息，但不要加入没有在图像中出现的内容\n<|vision_start|><|image_pad|><|vision_end|><|im_end|>\n<|im_start|>assistant\n"""
-        self.image_caption_prompt_en = """<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n# Image Annotator\nYou are a professional
-image annotator. Please write an image caption based on the input image:\n1. Write the caption using natural,
-descriptive language without structured formats or rich text.\n2. Enrich caption details by including: \n - Object
-attributes, such as quantity, color, shape, size, material, state, position, actions, and so on\n - Vision Relations
-between objects, such as spatial relations, functional relations, possessive relations, attachment relations, action
-relations, comparative relations, causal relations, and so on\n - Environmental details, such as weather, lighting,
-colors, textures, atmosphere, and so on\n - Identify the text clearly visible in the image, without translation or
-explanation, and highlight it in the caption with quotation marks\n3. Maintain authenticity and accuracy:\n - Avoid
-generalizations\n - Describe all visible information in the image, while do not add information not explicitly shown in
-the image\n<|vision_start|><|image_pad|><|vision_end|><|im_end|>\n<|im_start|>assistant\n"""
-        self.default_sample_size = 128
-
-    # Copied from diffusers.pipelines.qwenimage.pipeline_qwenimage.QwenImagePipeline._extract_masked_hidden
-    def _extract_masked_hidden(self, hidden_states: torch.Tensor, mask: torch.Tensor):
-        bool_mask = mask.bool()
-        valid_lengths = bool_mask.sum(dim=1)
-        selected = hidden_states[bool_mask]
-        split_result = torch.split(selected, valid_lengths.tolist(), dim=0)
-
-        return split_result
-
-    def _get_qwen_prompt_embeds(
-        self,
-        prompt: Union[str, List[str]] = None,
-        device: Optional[torch.device] = None,
-        dtype: Optional[torch.dtype] = None,
-    ):
-        device = device or self._execution_device
-        dtype = dtype or self.text_encoder.dtype
-
-        prompt = [prompt] if isinstance(prompt, str) else prompt
-
-        template = self.prompt_template_encode
-        drop_idx = self.prompt_template_encode_start_idx
-        txt = [template.format(e) for e in prompt]
-        txt_tokens = self.tokenizer(
-            txt,
-            padding=True,
-            return_tensors="pt",
-        ).to(device)
-        encoder_hidden_states = self.text_encoder(
-            input_ids=txt_tokens.input_ids,
-            attention_mask=txt_tokens.attention_mask,
-            output_hidden_states=True,
-        )
-        hidden_states = encoder_hidden_states.hidden_states[-1]
-        split_hidden_states = self._extract_masked_hidden(hidden_states, txt_tokens.attention_mask)
-        split_hidden_states = [e[drop_idx:] for e in split_hidden_states]
-        attn_mask_list = [torch.ones(e.size(0), dtype=torch.long, device=e.device) for e in split_hidden_states]
-        max_seq_len = max([e.size(0) for e in split_hidden_states])
-        prompt_embeds = torch.stack(
-            [torch.cat([u, u.new_zeros(max_seq_len - u.size(0), u.size(1))]) for u in split_hidden_states]
-        )
-        encoder_attention_mask = torch.stack(
-            [torch.cat([u, u.new_zeros(max_seq_len - u.size(0))]) for u in attn_mask_list]
-        )
-
-        prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
-
-        return prompt_embeds, encoder_attention_mask
-
-    # Copied from diffusers.pipelines.qwenimage.pipeline_qwenimage.QwenImagePipeline.encode_prompt
-    def encode_prompt(
-        self,
-        prompt: Union[str, List[str]],
-        device: Optional[torch.device] = None,
-        num_images_per_prompt: int = 1,
-        prompt_embeds: Optional[torch.Tensor] = None,
-        prompt_embeds_mask: Optional[torch.Tensor] = None,
-        max_sequence_length: int = 1024,
-    ):
-        r"""
-
-        Args:
-            prompt (`str` or `List[str]`, *optional*):
-                prompt to be encoded
-            device: (`torch.device`):
-                torch device
-            num_images_per_prompt (`int`):
-                number of images that should be generated per prompt
-            prompt_embeds (`torch.Tensor`, *optional*):
-                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
-                provided, text embeddings will be generated from `prompt` input argument.
-        """
-        device = device or self._execution_device
-
-        prompt = [prompt] if isinstance(prompt, str) else prompt
-        batch_size = len(prompt) if prompt_embeds is None else prompt_embeds.shape[0]
-
-        if prompt_embeds is None:
-            prompt_embeds, prompt_embeds_mask = self._get_qwen_prompt_embeds(prompt, device)
-
-        prompt_embeds = prompt_embeds[:, :max_sequence_length]
-        prompt_embeds_mask = prompt_embeds_mask[:, :max_sequence_length]
-
-        _, seq_len, _ = prompt_embeds.shape
-        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
-        prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
-        prompt_embeds_mask = prompt_embeds_mask.repeat(1, num_images_per_prompt, 1)
-        prompt_embeds_mask = prompt_embeds_mask.view(batch_size * num_images_per_prompt, seq_len)
-
-        return prompt_embeds, prompt_embeds_mask
-
-    def get_image_caption(self, prompt_image, use_en_prompt=True, device=None):
-        if use_en_prompt:
-            prompt = self.image_caption_prompt_en
-        else:
-            prompt = self.image_caption_prompt_cn
-        model_inputs = self.vl_processor(
-            text=prompt,
-            images=prompt_image,
-            padding=True,
-            return_tensors="pt",
-        ).to(device)
-        generated_ids = self.text_encoder.generate(**model_inputs, max_new_tokens=512)
-        generated_ids_trimmed = [
-            out_ids[len(in_ids) :] for in_ids, out_ids in zip(model_inputs.input_ids, generated_ids)
-        ]
-        output_text = self.vl_processor.batch_decode(
-            generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
-        )[0]
-        return output_text.strip()
-
-    def check_inputs(
-        self,
-        height,
-        width,
-        negative_prompt=None,
-        prompt_embeds=None,
-        negative_prompt_embeds=None,
-        prompt_embeds_mask=None,
-        negative_prompt_embeds_mask=None,
-        callback_on_step_end_tensor_inputs=None,
-        max_sequence_length=None,
-    ):
-        if height % (self.vae_scale_factor * 2) != 0 or width % (self.vae_scale_factor * 2) != 0:
-            logger.warning(
-                f"`height` and `width` have to be divisible by {self.vae_scale_factor * 2} but are {height} and {width}. Dimensions will be resized accordingly"
-            )
-
-        if callback_on_step_end_tensor_inputs is not None and not all(
-            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
-        ):
-            raise ValueError(
-                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
-            )
-
-        if negative_prompt is not None and negative_prompt_embeds is not None:
-            raise ValueError(
-                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
-                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
-            )
-
-        if prompt_embeds is not None and prompt_embeds_mask is None:
-            raise ValueError(
-                "If `prompt_embeds` are provided, `prompt_embeds_mask` also have to be passed. Make sure to generate `prompt_embeds_mask` from the same text encoder that was used to generate `prompt_embeds`."
-            )
-        if negative_prompt_embeds is not None and negative_prompt_embeds_mask is None:
-            raise ValueError(
-                "If `negative_prompt_embeds` are provided, `negative_prompt_embeds_mask` also have to be passed. Make sure to generate `negative_prompt_embeds_mask` from the same text encoder that was used to generate `negative_prompt_embeds`."
-            )
-
-        if max_sequence_length is not None and max_sequence_length > 1024:
-            raise ValueError(f"`max_sequence_length` cannot be greater than 1024 but is {max_sequence_length}")
-
-    @staticmethod
-    def _pack_latents(latents, batch_size, num_channels_latents, height, width, layers):
-        latents = latents.view(batch_size, layers, num_channels_latents, height // 2, 2, width // 2, 2)
-        latents = latents.permute(0, 1, 3, 5, 2, 4, 6)
-        latents = latents.reshape(batch_size, layers * (height // 2) * (width // 2), num_channels_latents * 4)
-
-        return latents
-
-    @staticmethod
-    def _unpack_latents(latents, height, width, layers, vae_scale_factor):
-        batch_size, num_patches, channels = latents.shape
-
-        # VAE applies 8x compression on images but we must also account for packing which requires
-        # latent height and width to be divisible by 2.
-        height = 2 * (int(height) // (vae_scale_factor * 2))
-        width = 2 * (int(width) // (vae_scale_factor * 2))
-
-        latents = latents.view(batch_size, layers + 1, height // 2, width // 2, channels // 4, 2, 2)
-        latents = latents.permute(0, 1, 4, 2, 5, 3, 6)
-
-        latents = latents.reshape(batch_size, layers + 1, channels // (2 * 2), height, width)
-        latents = latents.permute(0, 2, 1, 3, 4)  # (b, c, f, h, w)
-
-        return latents
-
-    # Copied from diffusers.pipelines.qwenimage.pipeline_qwenimage_edit.QwenImageEditPipeline._encode_vae_image
-    def _encode_vae_image(self, image: torch.Tensor, generator: torch.Generator):
-        if isinstance(generator, list):
-            image_latents = [
-                retrieve_latents(self.vae.encode(image[i : i + 1]), generator=generator[i], sample_mode="argmax")
-                for i in range(image.shape[0])
-            ]
-            image_latents = torch.cat(image_latents, dim=0)
-        else:
-            image_latents = retrieve_latents(self.vae.encode(image), generator=generator, sample_mode="argmax")
-        latents_mean = (
-            torch.tensor(self.vae.config.latents_mean)
-            .view(1, self.latent_channels, 1, 1, 1)
-            .to(image_latents.device, image_latents.dtype)
-        )
-        latents_std = (
-            torch.tensor(self.vae.config.latents_std)
-            .view(1, self.latent_channels, 1, 1, 1)
-            .to(image_latents.device, image_latents.dtype)
-        )
-        image_latents = (image_latents - latents_mean) / latents_std
-
-        return image_latents
-
-    def prepare_latents(
-        self,
-        image,
-        batch_size,
-        num_channels_latents,
-        height,
-        width,
-        layers,
-        dtype,
-        device,
-        generator,
-        latents=None,
-    ):
-        # VAE applies 8x compression on images but we must also account for packing which requires
-        # latent height and width to be divisible by 2.
-        height = 2 * (int(height) // (self.vae_scale_factor * 2))
-        width = 2 * (int(width) // (self.vae_scale_factor * 2))
-
-        shape = (
-            batch_size,
-            layers + 1,
-            num_channels_latents,
-            height,
-            width,
-        )  ### the generated first image is combined image
-
-        image_latents = None
-        if image is not None:
-            image = image.to(device=device, dtype=dtype)
-            if image.shape[1] != self.latent_channels:
-                image_latents = self._encode_vae_image(image=image, generator=generator)
-            else:
-                image_latents = image
-            if batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] == 0:
-                # expand init_latents for batch_size
-                additional_image_per_prompt = batch_size // image_latents.shape[0]
-                image_latents = torch.cat([image_latents] * additional_image_per_prompt, dim=0)
-            elif batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] != 0:
-                raise ValueError(
-                    f"Cannot duplicate `image` of batch size {image_latents.shape[0]} to {batch_size} text prompts."
-                )
-            else:
-                image_latents = torch.cat([image_latents], dim=0)
-
-            image_latent_height, image_latent_width = image_latents.shape[3:]
-            image_latents = image_latents.permute(0, 2, 1, 3, 4)  # (b, c, f, h, w) -> (b, f, c, h, w)
-            image_latents = self._pack_latents(
-                image_latents, batch_size, num_channels_latents, image_latent_height, image_latent_width, 1
-            )
-
-        if isinstance(generator, list) and len(generator) != batch_size:
-            raise ValueError(
-                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
-                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
-            )
-        if latents is None:
-            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
-            latents = self._pack_latents(latents, batch_size, num_channels_latents, height, width, layers + 1)
-        else:
-            latents = latents.to(device=device, dtype=dtype)
-
-        return latents, image_latents
-
-    @property
-    def guidance_scale(self):
-        return self._guidance_scale
-
-    @property
-    def attention_kwargs(self):
-        return self._attention_kwargs
-
-    @property
-    def num_timesteps(self):
-        return self._num_timesteps
-
-    @property
-    def current_timestep(self):
-        return self._current_timestep
-
-    @property
-    def interrupt(self):
-        return self._interrupt
-
-    @torch.no_grad()
-    @replace_example_docstring(EXAMPLE_DOC_STRING)
-    def __call__(
-        self,
-        image: Optional[PipelineImageInput] = None,
-        prompt: Union[str, List[str]] = None,
-        negative_prompt: Union[str, List[str]] = None,
-        true_cfg_scale: float = 4.0,
-        layers: Optional[int] = 4,
-        num_inference_steps: int = 50,
-        sigmas: Optional[List[float]] = None,
-        guidance_scale: Optional[float] = None,
-        num_images_per_prompt: int = 1,
-        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
-        latents: Optional[torch.Tensor] = None,
-        prompt_embeds: Optional[torch.Tensor] = None,
-        prompt_embeds_mask: Optional[torch.Tensor] = None,
-        negative_prompt_embeds: Optional[torch.Tensor] = None,
-        negative_prompt_embeds_mask: Optional[torch.Tensor] = None,
-        output_type: Optional[str] = "pil",
-        return_dict: bool = True,
-        attention_kwargs: Optional[Dict[str, Any]] = None,
-        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
-        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
-        max_sequence_length: int = 512,
-        resolution: int = 640,
-        cfg_normalize: bool = False,
-        use_en_prompt: bool = False,
-    ):
-        r"""
-        Function invoked when calling the pipeline for generation.
-
-        Args:
-            image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
-                `Image`, numpy array or tensor representing an image batch to be used as the starting point. For both
-                numpy array and pytorch tensor, the expected value range is between `[0, 1]` If it's a tensor or a list
-                or tensors, the expected shape should be `(B, C, H, W)` or `(C, H, W)`. If it is a numpy array or a
-                list of arrays, the expected shape should be `(B, H, W, C)` or `(H, W, C)` It can also accept image
-                latents as `image`, but if passing latents directly it is not encoded again.
-            prompt (`str` or `List[str]`, *optional*):
-                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
-                instead.
-            negative_prompt (`str` or `List[str]`, *optional*):
-                The prompt or prompts not to guide the image generation. If not defined, one has to pass
-                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `true_cfg_scale` is
-                not greater than `1`).
-            true_cfg_scale (`float`, *optional*, defaults to 1.0):
-                true_cfg_scale (`float`, *optional*, defaults to 1.0): Guidance scale as defined in [Classifier-Free
-                Diffusion Guidance](https://huggingface.co/papers/2207.12598). `true_cfg_scale` is defined as `w` of
-                equation 2. of [Imagen Paper](https://huggingface.co/papers/2205.11487). Classifier-free guidance is
-                enabled by setting `true_cfg_scale > 1` and a provided `negative_prompt`. Higher guidance scale
-                encourages to generate images that are closely linked to the text `prompt`, usually at the expense of
-                lower image quality.
-            num_inference_steps (`int`, *optional*, defaults to 50):
-                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
-                expense of slower inference.
-            sigmas (`List[float]`, *optional*):
-                Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
-                their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
-                will be used.
-            guidance_scale (`float`, *optional*, defaults to None):
-                A guidance scale value for guidance distilled models. Unlike the traditional classifier-free guidance
-                where the guidance scale is applied during inference through noise prediction rescaling, guidance
-                distilled models take the guidance scale directly as an input parameter during forward pass. Guidance
-                scale is enabled by setting `guidance_scale > 1`. Higher guidance scale encourages to generate images
-                that are closely linked to the text `prompt`, usually at the expense of lower image quality. This
-                parameter in the pipeline is there to support future guidance-distilled models when they come up. It is
-                ignored when not using guidance distilled models. To enable traditional classifier-free guidance,
-                please pass `true_cfg_scale > 1.0` and `negative_prompt` (even an empty negative prompt like " " should
-                enable classifier-free guidance computations).
-            num_images_per_prompt (`int`, *optional*, defaults to 1):
-                The number of images to generate per prompt.
-            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
-                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
-                to make generation deterministic.
-            latents (`torch.Tensor`, *optional*):
-                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
-                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
-                tensor will be generated by sampling using the supplied random `generator`.
-            prompt_embeds (`torch.Tensor`, *optional*):
-                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
-                provided, text embeddings will be generated from `prompt` input argument.
-            negative_prompt_embeds (`torch.Tensor`, *optional*):
-                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
-                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
-                argument.
-            output_type (`str`, *optional*, defaults to `"pil"`):
-                The output format of the generate image. Choose between
-                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
-            return_dict (`bool`, *optional*, defaults to `True`):
-                Whether or not to return a [`~pipelines.qwenimage.QwenImagePipelineOutput`] instead of a plain tuple.
-            attention_kwargs (`dict`, *optional*):
-                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
-                `self.processor` in
-                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
-            callback_on_step_end (`Callable`, *optional*):
-                A function that calls at the end of each denoising steps during the inference. The function is called
-                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
-                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
-                `callback_on_step_end_tensor_inputs`.
-            callback_on_step_end_tensor_inputs (`List`, *optional*):
-                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
-                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
-                `._callback_tensor_inputs` attribute of your pipeline class.
-            max_sequence_length (`int` defaults to 512): Maximum sequence length to use with the `prompt`.
-            resolution (`int`, *optional*, defaults to 640):
-                using different bucket in (640, 1024) to determin the condition and output resolution
-            cfg_normalize (`bool`, *optional*, defaults to `False`)
-                whether enable cfg normalization.
-            use_en_prompt (`bool`, *optional*, defaults to `False`)
-                automatic caption language if user does not provide caption
-
-        Examples:
-
-        Returns:
-            [`~pipelines.qwenimage.QwenImagePipelineOutput`] or `tuple`:
-            [`~pipelines.qwenimage.QwenImagePipelineOutput`] if `return_dict` is True, otherwise a `tuple`. When
-            returning a tuple, the first element is a list with the generated images.
-        """
-        image_size = image[0].size if isinstance(image, list) else image.size
-        assert resolution in [640, 1024], f"resolution must be either 640 or 1024, but got {resolution}"
-        calculated_width, calculated_height = calculate_dimensions(
-            resolution * resolution, image_size[0] / image_size[1]
-        )
-        height = calculated_height
-        width = calculated_width
-
-        multiple_of = self.vae_scale_factor * 2
-        width = width // multiple_of * multiple_of
-        height = height // multiple_of * multiple_of
-
-        # 1. Check inputs. Raise error if not correct
-        self.check_inputs(
-            height,
-            width,
-            negative_prompt=negative_prompt,
-            prompt_embeds=prompt_embeds,
-            negative_prompt_embeds=negative_prompt_embeds,
-            prompt_embeds_mask=prompt_embeds_mask,
-            negative_prompt_embeds_mask=negative_prompt_embeds_mask,
-            callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
-            max_sequence_length=max_sequence_length,
-        )
-
-        self._guidance_scale = guidance_scale
-        self._attention_kwargs = attention_kwargs
-        self._current_timestep = None
-        self._interrupt = False
-
-        device = self._execution_device
-        # 2. Preprocess image
-        if image is not None and not (isinstance(image, torch.Tensor) and image.size(1) == self.latent_channels):
-            image = self.image_processor.resize(image, calculated_height, calculated_width)
-            prompt_image = image
-            image = self.image_processor.preprocess(image, calculated_height, calculated_width)
-            image = image.unsqueeze(2)
-            image = image.to(dtype=self.text_encoder.dtype)
-
-        if prompt is None or prompt == "" or prompt == " ":
-            prompt = self.get_image_caption(prompt_image, use_en_prompt=use_en_prompt, device=device)
-
-        # 3. Define call parameters
-        if prompt is not None and isinstance(prompt, str):
-            batch_size = 1
-        elif prompt is not None and isinstance(prompt, list):
-            batch_size = len(prompt)
-        else:
-            batch_size = prompt_embeds.shape[0]
-
-        has_neg_prompt = negative_prompt is not None or (
-            negative_prompt_embeds is not None and negative_prompt_embeds_mask is not None
-        )
-
-        if true_cfg_scale > 1 and not has_neg_prompt:
-            logger.warning(
-                f"true_cfg_scale is passed as {true_cfg_scale}, but classifier-free guidance is not enabled since no negative_prompt is provided."
-            )
-        elif true_cfg_scale <= 1 and has_neg_prompt:
-            logger.warning(
-                " negative_prompt is passed but classifier-free guidance is not enabled since true_cfg_scale <= 1"
-            )
-
-        do_true_cfg = true_cfg_scale > 1 and has_neg_prompt
-        prompt_embeds, prompt_embeds_mask = self.encode_prompt(
-            prompt=prompt,
-            prompt_embeds=prompt_embeds,
-            prompt_embeds_mask=prompt_embeds_mask,
-            device=device,
-            num_images_per_prompt=num_images_per_prompt,
-            max_sequence_length=max_sequence_length,
-        )
-        if do_true_cfg:
-            negative_prompt_embeds, negative_prompt_embeds_mask = self.encode_prompt(
-                prompt=negative_prompt,
-                prompt_embeds=negative_prompt_embeds,
-                prompt_embeds_mask=negative_prompt_embeds_mask,
-                device=device,
-                num_images_per_prompt=num_images_per_prompt,
-                max_sequence_length=max_sequence_length,
-            )
-
-        # 4. Prepare latent variables
-        num_channels_latents = self.transformer.config.in_channels // 4
-        latents, image_latents = self.prepare_latents(
-            image,
-            batch_size * num_images_per_prompt,
-            num_channels_latents,
-            height,
-            width,
-            layers,
-            prompt_embeds.dtype,
-            device,
-            generator,
-            latents,
-        )
-        img_shapes = [
-            [
-                *[
-                    (1, height // self.vae_scale_factor // 2, width // self.vae_scale_factor // 2)
-                    for _ in range(layers + 1)
-                ],
-                (1, calculated_height // self.vae_scale_factor // 2, calculated_width // self.vae_scale_factor // 2),
-            ]
-        ] * batch_size
-
-        # 5. Prepare timesteps
-        sigmas = np.linspace(1.0, 0, num_inference_steps + 1)[:-1] if sigmas is None else sigmas
-        image_seq_len = latents.shape[1]
-        base_seqlen = 256 * 256 / 16 / 16
-        mu = (image_latents.shape[1] / base_seqlen) ** 0.5
-        timesteps, num_inference_steps = retrieve_timesteps(
-            self.scheduler,
-            num_inference_steps,
-            device,
-            sigmas=sigmas,
-            mu=mu,
-        )
-        num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
-        self._num_timesteps = len(timesteps)
-
-        # handle guidance
-        if self.transformer.config.guidance_embeds and guidance_scale is None:
-            raise ValueError("guidance_scale is required for guidance-distilled model.")
-        elif self.transformer.config.guidance_embeds:
-            guidance = torch.full([1], guidance_scale, device=device, dtype=torch.float32)
-            guidance = guidance.expand(latents.shape[0])
-        elif not self.transformer.config.guidance_embeds and guidance_scale is not None:
-            logger.warning(
-                f"guidance_scale is passed as {guidance_scale}, but ignored since the model is not guidance-distilled."
-            )
-            guidance = None
-        elif not self.transformer.config.guidance_embeds and guidance_scale is None:
-            guidance = None
-
-        if self.attention_kwargs is None:
-            self._attention_kwargs = {}
-
-        txt_seq_lens = prompt_embeds_mask.sum(dim=1).tolist() if prompt_embeds_mask is not None else None
-        negative_txt_seq_lens = (
-            negative_prompt_embeds_mask.sum(dim=1).tolist() if negative_prompt_embeds_mask is not None else None
-        )
-        is_rgb = torch.tensor([0] * batch_size).to(device=device, dtype=torch.long)
-        # 6. Denoising loop
-        self.scheduler.set_begin_index(0)
-        with self.progress_bar(total=num_inference_steps) as progress_bar:
-            for i, t in enumerate(timesteps):
-                if self.interrupt:
-                    continue
-
-                self._current_timestep = t
-
-                latent_model_input = latents
-                if image_latents is not None:
-                    latent_model_input = torch.cat([latents, image_latents], dim=1)
-
-                # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
-                timestep = t.expand(latents.shape[0]).to(latents.dtype)
-                with self.transformer.cache_context("cond"):
-                    noise_pred = self.transformer(
-                        hidden_states=latent_model_input,
-                        timestep=timestep / 1000,
-                        guidance=guidance,
-                        encoder_hidden_states_mask=prompt_embeds_mask,
-                        encoder_hidden_states=prompt_embeds,
-                        img_shapes=img_shapes,
-                        txt_seq_lens=txt_seq_lens,
-                        attention_kwargs=self.attention_kwargs,
-                        additional_t_cond=is_rgb,
-                        return_dict=False,
-                    )[0]
-                    noise_pred = noise_pred[:, : latents.size(1)]
-
-                if do_true_cfg:
-                    with self.transformer.cache_context("uncond"):
-                        neg_noise_pred = self.transformer(
-                            hidden_states=latent_model_input,
-                            timestep=timestep / 1000,
-                            guidance=guidance,
-                            encoder_hidden_states_mask=negative_prompt_embeds_mask,
-                            encoder_hidden_states=negative_prompt_embeds,
-                            img_shapes=img_shapes,
-                            txt_seq_lens=negative_txt_seq_lens,
-                            attention_kwargs=self.attention_kwargs,
-                            additional_t_cond=is_rgb,
-                            return_dict=False,
-                        )[0]
-                    neg_noise_pred = neg_noise_pred[:, : latents.size(1)]
-                    comb_pred = neg_noise_pred + true_cfg_scale * (noise_pred - neg_noise_pred)
-
-                    if cfg_normalize:
-                        cond_norm = torch.norm(noise_pred, dim=-1, keepdim=True)
-                        noise_norm = torch.norm(comb_pred, dim=-1, keepdim=True)
-                        noise_pred = comb_pred * (cond_norm / noise_norm)
-                    else:
-                        noise_pred = comb_pred
-
-                # compute the previous noisy sample x_t -> x_t-1
-                latents_dtype = latents.dtype
-                latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]
-
-                if latents.dtype != latents_dtype:
-                    if torch.backends.mps.is_available():
-                        # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
-                        latents = latents.to(latents_dtype)
-
-                if callback_on_step_end is not None:
-                    callback_kwargs = {}
-                    for k in callback_on_step_end_tensor_inputs:
-                        callback_kwargs[k] = locals()[k]
-                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
-
-                    latents = callback_outputs.pop("latents", latents)
-                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
-
-                # call the callback, if provided
-                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
-                    progress_bar.update()
-
-                if XLA_AVAILABLE:
-                    xm.mark_step()
-
-        self._current_timestep = None
-        if output_type == "latent":
-            image = latents
-        else:
-            latents = self._unpack_latents(latents, height, width, layers, self.vae_scale_factor)
-            latents = latents.to(self.vae.dtype)
-            latents_mean = (
-                torch.tensor(self.vae.config.latents_mean)
-                .view(1, self.vae.config.z_dim, 1, 1, 1)
-                .to(latents.device, latents.dtype)
-            )
-            latents_std = 1.0 / torch.tensor(self.vae.config.latents_std).view(1, self.vae.config.z_dim, 1, 1, 1).to(
-                latents.device, latents.dtype
-            )
-            latents = latents / latents_std + latents_mean
-
-            b, c, f, h, w = latents.shape
-
-            latents = latents[:, :, 1:]  # remove the first frame as it is the orgin input
-
-            latents = latents.permute(0, 2, 1, 3, 4).view(-1, c, 1, h, w)
-
-            image = self.vae.decode(latents, return_dict=False)[0]  # (b f) c 1 h w
-
-            image = image.squeeze(2)
-
-            image = self.image_processor.postprocess(image, output_type=output_type)
-            images = []
-            for bidx in range(b):
-                images.append(image[bidx * f : (bidx + 1) * f])
-
-        # Offload all models
-        self.maybe_free_model_hooks()
-
-        if not return_dict:
-            return (images,)
-
-        return QwenImagePipelineOutput(images=images)
--- a/src/diffusers/pipelines/z_image/init.py
+++ b/src/diffusers/pipelines/z_image/init.py
@@ -23,8 +23,6 @@ except OptionalDependencyNotAvailable:
 else:
    _import_structure["pipeline_output"] = ["ZImagePipelineOutput"]
    _import_structure["pipeline_z_image"] = ["ZImagePipeline"]
-    _import_structure["pipeline_z_image_controlnet"] = ["ZImageControlNetPipeline"]
-    _import_structure["pipeline_z_image_controlnet_inpaint"] = ["ZImageControlNetInpaintPipeline"]
    _import_structure["pipeline_z_image_img2img"] = ["ZImageImg2ImgPipeline"]


@@ -38,8 +36,6 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
    else:
        from .pipeline_output import ZImagePipelineOutput
        from .pipeline_z_image import ZImagePipeline
-        from .pipeline_z_image_controlnet import ZImageControlNetPipeline
-        from .pipeline_z_image_controlnet_inpaint import ZImageControlNetInpaintPipeline
        from .pipeline_z_image_img2img import ZImageImg2ImgPipeline

 else:
--- a/src/diffusers/pipelines/z_image/pipeline_z_image_controlnet.py
+++ b/src/diffusers/pipelines/z_image/pipeline_z_image_controlnet.py
@@ -1,725 +0,0 @@
-# Copyright 2025 Alibaba Z-Image Team and The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import inspect
-from typing import Any, Callable, Dict, List, Optional, Union
-
-import torch
-from transformers import AutoTokenizer, PreTrainedModel
-
-from ...image_processor import PipelineImageInput, VaeImageProcessor
-from ...loaders import FromSingleFileMixin
-from ...models.autoencoders import AutoencoderKL
-from ...models.controlnets import ZImageControlNetModel
-from ...models.transformers import ZImageTransformer2DModel
-from ...pipelines.pipeline_utils import DiffusionPipeline
-from ...schedulers import FlowMatchEulerDiscreteScheduler
-from ...utils import logging, replace_example_docstring
-from ...utils.torch_utils import randn_tensor
-from .pipeline_output import ZImagePipelineOutput
-
-
-logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
-
-EXAMPLE_DOC_STRING = """
-    Examples:
-        ```py
-        >>> import torch
-        >>> from diffusers import ZImageControlNetPipeline
-        >>> from diffusers import ZImageControlNetModel
-        >>> from diffusers.utils import load_image
-        >>> from huggingface_hub import hf_hub_download
-
-        >>> controlnet = ZImageControlNetModel.from_single_file(
-        ...     hf_hub_download(
-        ...         "alibaba-pai/Z-Image-Turbo-Fun-Controlnet-Union",
-        ...         filename="Z-Image-Turbo-Fun-Controlnet-Union.safetensors",
-        ...     ),
-        ...     torch_dtype=torch.bfloat16,
-        ... )
-
-        >>> # 2.1
-        >>> # controlnet = ZImageControlNetModel.from_single_file(
-        >>> #     hf_hub_download(
-        >>> #         "alibaba-pai/Z-Image-Turbo-Fun-Controlnet-Union-2.0",
-        >>> #         filename="Z-Image-Turbo-Fun-Controlnet-Union-2.1.safetensors",
-        >>> #     ),
-        >>> #     torch_dtype=torch.bfloat16,
-        >>> # )
-
-        >>> # 2.0 - `config` is required
-        >>> # controlnet = ZImageControlNetModel.from_single_file(
-        >>> #     hf_hub_download(
-        >>> #         "alibaba-pai/Z-Image-Turbo-Fun-Controlnet-Union-2.0",
-        >>> #         filename="Z-Image-Turbo-Fun-Controlnet-Union-2.0.safetensors",
-        >>> #     ),
-        >>> #     torch_dtype=torch.bfloat16,
-        >>> #     config="hlky/Z-Image-Turbo-Fun-Controlnet-Union-2.0",
-        >>> # )
-
-        >>> pipe = ZImageControlNetPipeline.from_pretrained(
-        ...     "Tongyi-MAI/Z-Image-Turbo", controlnet=controlnet, torch_dtype=torch.bfloat16
-        ... )
-        >>> pipe.to("cuda")
-
-        >>> # Optionally, set the attention backend to flash-attn 2 or 3, default is SDPA in PyTorch.
-        >>> # (1) Use flash attention 2
-        >>> # pipe.transformer.set_attention_backend("flash")
-        >>> # (2) Use flash attention 3
-        >>> # pipe.transformer.set_attention_backend("_flash_3")
-
-        >>> control_image = load_image(
-        ...     "https://huggingface.co/alibaba-pai/Z-Image-Turbo-Fun-Controlnet-Union/resolve/main/asset/pose.jpg?download=true"
-        ... )
-        >>> prompt = "一位年轻女子站在阳光明媚的海岸线上，白裙在轻拂的海风中微微飘动。她拥有一头鲜艳的紫色长发，在风中轻盈舞动，发间系着一个精致的黑色蝴蝶结，与身后柔和的蔚蓝天空形成鲜明对比。她面容清秀，眉目精致，透着一股甜美的青春气息；神情柔和，略带羞涩，目光静静地凝望着远方的地平线，双手自然交叠于身前，仿佛沉浸在思绪之中。在她身后，是辽阔无垠、波光粼粼的大海，阳光洒在海面上，映出温暖的金色光晕。"
-        >>> image = pipe(
-        ...     prompt,
-        ...     control_image=control_image,
-        ...     controlnet_conditioning_scale=0.75,
-        ...     height=1728,
-        ...     width=992,
-        ...     num_inference_steps=9,
-        ...     guidance_scale=0.0,
-        ...     generator=torch.Generator("cuda").manual_seed(43),
-        ... ).images[0]
-        >>> image.save("zimage.png")
-        ```
-"""
-
-
-# Copied from diffusers.pipelines.flux.pipeline_flux.calculate_shift
-def calculate_shift(
-    image_seq_len,
-    base_seq_len: int = 256,
-    max_seq_len: int = 4096,
-    base_shift: float = 0.5,
-    max_shift: float = 1.15,
-):
-    m = (max_shift - base_shift) / (max_seq_len - base_seq_len)
-    b = base_shift - m * base_seq_len
-    mu = image_seq_len * m + b
-    return mu
-
-
-# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
-def retrieve_latents(
-    encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
-):
-    if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
-        return encoder_output.latent_dist.sample(generator)
-    elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax":
-        return encoder_output.latent_dist.mode()
-    elif hasattr(encoder_output, "latents"):
-        return encoder_output.latents
-    else:
-        raise AttributeError("Could not access latents of provided encoder_output")
-
-
-# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
-def retrieve_timesteps(
-    scheduler,
-    num_inference_steps: Optional[int] = None,
-    device: Optional[Union[str, torch.device]] = None,
-    timesteps: Optional[List[int]] = None,
-    sigmas: Optional[List[float]] = None,
-    **kwargs,
-):
-    r"""
-    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
-    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
-
-    Args:
-        scheduler (`SchedulerMixin`):
-            The scheduler to get timesteps from.
-        num_inference_steps (`int`):
-            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
-            must be `None`.
-        device (`str` or `torch.device`, *optional*):
-            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
-        timesteps (`List[int]`, *optional*):
-            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
-            `num_inference_steps` and `sigmas` must be `None`.
-        sigmas (`List[float]`, *optional*):
-            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
-            `num_inference_steps` and `timesteps` must be `None`.
-
-    Returns:
-        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
-        second element is the number of inference steps.
-    """
-    if timesteps is not None and sigmas is not None:
-        raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
-    if timesteps is not None:
-        accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
-        if not accepts_timesteps:
-            raise ValueError(
-                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
-                f" timestep schedules. Please check whether you are using the correct scheduler."
-            )
-        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
-        timesteps = scheduler.timesteps
-        num_inference_steps = len(timesteps)
-    elif sigmas is not None:
-        accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
-        if not accept_sigmas:
-            raise ValueError(
-                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
-                f" sigmas schedules. Please check whether you are using the correct scheduler."
-            )
-        scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
-        timesteps = scheduler.timesteps
-        num_inference_steps = len(timesteps)
-    else:
-        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
-        timesteps = scheduler.timesteps
-    return timesteps, num_inference_steps
-
-
-class ZImageControlNetPipeline(DiffusionPipeline, FromSingleFileMixin):
-    model_cpu_offload_seq = "text_encoder->transformer->vae"
-    _optional_components = []
-    _callback_tensor_inputs = ["latents", "prompt_embeds"]
-
-    def __init__(
-        self,
-        scheduler: FlowMatchEulerDiscreteScheduler,
-        vae: AutoencoderKL,
-        text_encoder: PreTrainedModel,
-        tokenizer: AutoTokenizer,
-        transformer: ZImageTransformer2DModel,
-        controlnet: ZImageControlNetModel,
-    ):
-        super().__init__()
-        controlnet = ZImageControlNetModel.from_transformer(controlnet, transformer)
-
-        self.register_modules(
-            vae=vae,
-            text_encoder=text_encoder,
-            tokenizer=tokenizer,
-            scheduler=scheduler,
-            transformer=transformer,
-            controlnet=controlnet,
-        )
-        self.vae_scale_factor = (
-            2 ** (len(self.vae.config.block_out_channels) - 1) if hasattr(self, "vae") and self.vae is not None else 8
-        )
-        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor * 2)
-
-    def encode_prompt(
-        self,
-        prompt: Union[str, List[str]],
-        device: Optional[torch.device] = None,
-        do_classifier_free_guidance: bool = True,
-        negative_prompt: Optional[Union[str, List[str]]] = None,
-        prompt_embeds: Optional[List[torch.FloatTensor]] = None,
-        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
-        max_sequence_length: int = 512,
-    ):
-        prompt = [prompt] if isinstance(prompt, str) else prompt
-        prompt_embeds = self._encode_prompt(
-            prompt=prompt,
-            device=device,
-            prompt_embeds=prompt_embeds,
-            max_sequence_length=max_sequence_length,
-        )
-
-        if do_classifier_free_guidance:
-            if negative_prompt is None:
-                negative_prompt = ["" for _ in prompt]
-            else:
-                negative_prompt = [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt
-            assert len(prompt) == len(negative_prompt)
-            negative_prompt_embeds = self._encode_prompt(
-                prompt=negative_prompt,
-                device=device,
-                prompt_embeds=negative_prompt_embeds,
-                max_sequence_length=max_sequence_length,
-            )
-        else:
-            negative_prompt_embeds = []
-        return prompt_embeds, negative_prompt_embeds
-
-    def _encode_prompt(
-        self,
-        prompt: Union[str, List[str]],
-        device: Optional[torch.device] = None,
-        prompt_embeds: Optional[List[torch.FloatTensor]] = None,
-        max_sequence_length: int = 512,
-    ) -> List[torch.FloatTensor]:
-        device = device or self._execution_device
-
-        if prompt_embeds is not None:
-            return prompt_embeds
-
-        if isinstance(prompt, str):
-            prompt = [prompt]
-
-        for i, prompt_item in enumerate(prompt):
-            messages = [
-                {"role": "user", "content": prompt_item},
-            ]
-            prompt_item = self.tokenizer.apply_chat_template(
-                messages,
-                tokenize=False,
-                add_generation_prompt=True,
-                enable_thinking=True,
-            )
-            prompt[i] = prompt_item
-
-        text_inputs = self.tokenizer(
-            prompt,
-            padding="max_length",
-            max_length=max_sequence_length,
-            truncation=True,
-            return_tensors="pt",
-        )
-
-        text_input_ids = text_inputs.input_ids.to(device)
-        prompt_masks = text_inputs.attention_mask.to(device).bool()
-
-        prompt_embeds = self.text_encoder(
-            input_ids=text_input_ids,
-            attention_mask=prompt_masks,
-            output_hidden_states=True,
-        ).hidden_states[-2]
-
-        embeddings_list = []
-
-        for i in range(len(prompt_embeds)):
-            embeddings_list.append(prompt_embeds[i][prompt_masks[i]])
-
-        return embeddings_list
-
-    def prepare_latents(
-        self,
-        batch_size,
-        num_channels_latents,
-        height,
-        width,
-        dtype,
-        device,
-        generator,
-        latents=None,
-    ):
-        height = 2 * (int(height) // (self.vae_scale_factor * 2))
-        width = 2 * (int(width) // (self.vae_scale_factor * 2))
-
-        shape = (batch_size, num_channels_latents, height, width)
-
-        if latents is None:
-            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
-        else:
-            if latents.shape != shape:
-                raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}")
-            latents = latents.to(device)
-        return latents
-
-    # Copied from diffusers.pipelines.controlnet_sd3.pipeline_stable_diffusion_3_controlnet.StableDiffusion3ControlNetPipeline.prepare_image
-    def prepare_image(
-        self,
-        image,
-        width,
-        height,
-        batch_size,
-        num_images_per_prompt,
-        device,
-        dtype,
-        do_classifier_free_guidance=False,
-        guess_mode=False,
-    ):
-        if isinstance(image, torch.Tensor):
-            pass
-        else:
-            image = self.image_processor.preprocess(image, height=height, width=width)
-
-        image_batch_size = image.shape[0]
-
-        if image_batch_size == 1:
-            repeat_by = batch_size
-        else:
-            # image batch size is the same as prompt batch size
-            repeat_by = num_images_per_prompt
-
-        image = image.repeat_interleave(repeat_by, dim=0)
-
-        image = image.to(device=device, dtype=dtype)
-
-        if do_classifier_free_guidance and not guess_mode:
-            image = torch.cat([image] * 2)
-
-        return image
-
-    @property
-    def guidance_scale(self):
-        return self._guidance_scale
-
-    @property
-    def do_classifier_free_guidance(self):
-        return self._guidance_scale > 1
-
-    @property
-    def joint_attention_kwargs(self):
-        return self._joint_attention_kwargs
-
-    @property
-    def num_timesteps(self):
-        return self._num_timesteps
-
-    @property
-    def interrupt(self):
-        return self._interrupt
-
-    @torch.no_grad()
-    @replace_example_docstring(EXAMPLE_DOC_STRING)
-    def __call__(
-        self,
-        prompt: Union[str, List[str]] = None,
-        height: Optional[int] = None,
-        width: Optional[int] = None,
-        num_inference_steps: int = 50,
-        sigmas: Optional[List[float]] = None,
-        guidance_scale: float = 5.0,
-        control_image: PipelineImageInput = None,
-        controlnet_conditioning_scale: Union[float, List[float]] = 0.75,
-        cfg_normalization: bool = False,
-        cfg_truncation: float = 1.0,
-        negative_prompt: Optional[Union[str, List[str]]] = None,
-        num_images_per_prompt: Optional[int] = 1,
-        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
-        latents: Optional[torch.FloatTensor] = None,
-        prompt_embeds: Optional[List[torch.FloatTensor]] = None,
-        negative_prompt_embeds: Optional[List[torch.FloatTensor]] = None,
-        output_type: Optional[str] = "pil",
-        return_dict: bool = True,
-        joint_attention_kwargs: Optional[Dict[str, Any]] = None,
-        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
-        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
-        max_sequence_length: int = 512,
-    ):
-        r"""
-        Function invoked when calling the pipeline for generation.
-
-        Args:
-            prompt (`str` or `List[str]`, *optional*):
-                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
-                instead.
-            height (`int`, *optional*, defaults to 1024):
-                The height in pixels of the generated image.
-            width (`int`, *optional*, defaults to 1024):
-                The width in pixels of the generated image.
-            num_inference_steps (`int`, *optional*, defaults to 50):
-                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
-                expense of slower inference.
-            sigmas (`List[float]`, *optional*):
-                Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
-                their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
-                will be used.
-            guidance_scale (`float`, *optional*, defaults to 5.0):
-                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
-                `guidance_scale` is defined as `w` of equation 2. of [Imagen
-                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
-                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
-                usually at the expense of lower image quality.
-            cfg_normalization (`bool`, *optional*, defaults to False):
-                Whether to apply configuration normalization.
-            cfg_truncation (`float`, *optional*, defaults to 1.0):
-                The truncation value for configuration.
-            negative_prompt (`str` or `List[str]`, *optional*):
-                The prompt or prompts not to guide the image generation. If not defined, one has to pass
-                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
-                less than `1`).
-            num_images_per_prompt (`int`, *optional*, defaults to 1):
-                The number of images to generate per prompt.
-            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
-                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
-                to make generation deterministic.
-            latents (`torch.FloatTensor`, *optional*):
-                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
-                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
-                tensor will be generated by sampling using the supplied random `generator`.
-            prompt_embeds (`List[torch.FloatTensor]`, *optional*):
-                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
-                provided, text embeddings will be generated from `prompt` input argument.
-            negative_prompt_embeds (`List[torch.FloatTensor]`, *optional*):
-                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
-                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
-                argument.
-            output_type (`str`, *optional*, defaults to `"pil"`):
-                The output format of the generate image. Choose between
-                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
-            return_dict (`bool`, *optional*, defaults to `True`):
-                Whether or not to return a [`~pipelines.stable_diffusion.ZImagePipelineOutput`] instead of a plain
-                tuple.
-            joint_attention_kwargs (`dict`, *optional*):
-                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
-                `self.processor` in
-                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
-            callback_on_step_end (`Callable`, *optional*):
-                A function that calls at the end of each denoising steps during the inference. The function is called
-                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
-                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
-                `callback_on_step_end_tensor_inputs`.
-            callback_on_step_end_tensor_inputs (`List`, *optional*):
-                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
-                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
-                `._callback_tensor_inputs` attribute of your pipeline class.
-            max_sequence_length (`int`, *optional*, defaults to 512):
-                Maximum sequence length to use with the `prompt`.
-
-        Examples:
-
-        Returns:
-            [`~pipelines.z_image.ZImagePipelineOutput`] or `tuple`: [`~pipelines.z_image.ZImagePipelineOutput`] if
-            `return_dict` is True, otherwise a `tuple`. When returning a tuple, the first element is a list with the
-            generated images.
-        """
-        height = height or 1024
-        width = width or 1024
-
-        vae_scale = self.vae_scale_factor * 2
-        if height % vae_scale != 0:
-            raise ValueError(
-                f"Height must be divisible by {vae_scale} (got {height}). "
-                f"Please adjust the height to a multiple of {vae_scale}."
-            )
-        if width % vae_scale != 0:
-            raise ValueError(
-                f"Width must be divisible by {vae_scale} (got {width}). "
-                f"Please adjust the width to a multiple of {vae_scale}."
-            )
-
-        device = self._execution_device
-
-        self._guidance_scale = guidance_scale
-        self._joint_attention_kwargs = joint_attention_kwargs
-        self._interrupt = False
-        self._cfg_normalization = cfg_normalization
-        self._cfg_truncation = cfg_truncation
-        # 2. Define call parameters
-        if prompt is not None and isinstance(prompt, str):
-            batch_size = 1
-        elif prompt is not None and isinstance(prompt, list):
-            batch_size = len(prompt)
-        else:
-            batch_size = len(prompt_embeds)
-
-        # If prompt_embeds is provided and prompt is None, skip encoding
-        if prompt_embeds is not None and prompt is None:
-            if self.do_classifier_free_guidance and negative_prompt_embeds is None:
-                raise ValueError(
-                    "When `prompt_embeds` is provided without `prompt`, "
-                    "`negative_prompt_embeds` must also be provided for classifier-free guidance."
-                )
-        else:
-            (
-                prompt_embeds,
-                negative_prompt_embeds,
-            ) = self.encode_prompt(
-                prompt=prompt,
-                negative_prompt=negative_prompt,
-                do_classifier_free_guidance=self.do_classifier_free_guidance,
-                prompt_embeds=prompt_embeds,
-                negative_prompt_embeds=negative_prompt_embeds,
-                device=device,
-                max_sequence_length=max_sequence_length,
-            )
-
-        # 4. Prepare latent variables
-        num_channels_latents = self.transformer.in_channels
-
-        control_image = self.prepare_image(
-            image=control_image,
-            width=width,
-            height=height,
-            batch_size=batch_size * num_images_per_prompt,
-            num_images_per_prompt=num_images_per_prompt,
-            device=device,
-            dtype=self.vae.dtype,
-        )
-        height, width = control_image.shape[-2:]
-        control_image = retrieve_latents(self.vae.encode(control_image), generator=generator, sample_mode="argmax")
-        control_image = (control_image - self.vae.config.shift_factor) * self.vae.config.scaling_factor
-        control_image = control_image.unsqueeze(2)
-
-        if num_channels_latents != self.controlnet.config.control_in_dim:
-            # For model version 2.0
-            control_image = torch.cat(
-                [
-                    control_image,
-                    torch.zeros(
-                        control_image.shape[0],
-                        self.controlnet.config.control_in_dim - num_channels_latents,
-                        *control_image.shape[2:],
-                    ).to(device=control_image.device, dtype=control_image.dtype),
-                ],
-                dim=1,
-            )
-
-        latents = self.prepare_latents(
-            batch_size * num_images_per_prompt,
-            num_channels_latents,
-            height,
-            width,
-            torch.float32,
-            device,
-            generator,
-            latents,
-        )
-
-        # Repeat prompt_embeds for num_images_per_prompt
-        if num_images_per_prompt > 1:
-            prompt_embeds = [pe for pe in prompt_embeds for _ in range(num_images_per_prompt)]
-            if self.do_classifier_free_guidance and negative_prompt_embeds:
-                negative_prompt_embeds = [npe for npe in negative_prompt_embeds for _ in range(num_images_per_prompt)]
-
-        actual_batch_size = batch_size * num_images_per_prompt
-        image_seq_len = (latents.shape[2] // 2) * (latents.shape[3] // 2)
-
-        # 5. Prepare timesteps
-        mu = calculate_shift(
-            image_seq_len,
-            self.scheduler.config.get("base_image_seq_len", 256),
-            self.scheduler.config.get("max_image_seq_len", 4096),
-            self.scheduler.config.get("base_shift", 0.5),
-            self.scheduler.config.get("max_shift", 1.15),
-        )
-        self.scheduler.sigma_min = 0.0
-        scheduler_kwargs = {"mu": mu}
-        timesteps, num_inference_steps = retrieve_timesteps(
-            self.scheduler,
-            num_inference_steps,
-            device,
-            sigmas=sigmas,
-            **scheduler_kwargs,
-        )
-        num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
-        self._num_timesteps = len(timesteps)
-
-        # 6. Denoising loop
-        with self.progress_bar(total=num_inference_steps) as progress_bar:
-            for i, t in enumerate(timesteps):
-                if self.interrupt:
-                    continue
-
-                # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
-                timestep = t.expand(latents.shape[0])
-                timestep = (1000 - timestep) / 1000
-                # Normalized time for time-aware config (0 at start, 1 at end)
-                t_norm = timestep[0].item()
-
-                # Handle cfg truncation
-                current_guidance_scale = self.guidance_scale
-                if (
-                    self.do_classifier_free_guidance
-                    and self._cfg_truncation is not None
-                    and float(self._cfg_truncation) <= 1
-                ):
-                    if t_norm > self._cfg_truncation:
-                        current_guidance_scale = 0.0
-
-                # Run CFG only if configured AND scale is non-zero
-                apply_cfg = self.do_classifier_free_guidance and current_guidance_scale > 0
-
-                if apply_cfg:
-                    latents_typed = latents.to(self.transformer.dtype)
-                    latent_model_input = latents_typed.repeat(2, 1, 1, 1)
-                    prompt_embeds_model_input = prompt_embeds + negative_prompt_embeds
-                    timestep_model_input = timestep.repeat(2)
-                else:
-                    latent_model_input = latents.to(self.transformer.dtype)
-                    prompt_embeds_model_input = prompt_embeds
-                    timestep_model_input = timestep
-
-                latent_model_input = latent_model_input.unsqueeze(2)
-                latent_model_input_list = list(latent_model_input.unbind(dim=0))
-
-                controlnet_block_samples = self.controlnet(
-                    latent_model_input_list,
-                    timestep_model_input,
-                    prompt_embeds_model_input,
-                    control_image,
-                    conditioning_scale=controlnet_conditioning_scale,
-                )
-
-                model_out_list = self.transformer(
-                    latent_model_input_list,
-                    timestep_model_input,
-                    prompt_embeds_model_input,
-                    controlnet_block_samples=controlnet_block_samples,
-                )[0]
-
-                if apply_cfg:
-                    # Perform CFG
-                    pos_out = model_out_list[:actual_batch_size]
-                    neg_out = model_out_list[actual_batch_size:]
-
-                    noise_pred = []
-                    for j in range(actual_batch_size):
-                        pos = pos_out[j].float()
-                        neg = neg_out[j].float()
-
-                        pred = pos + current_guidance_scale * (pos - neg)
-
-                        # Renormalization
-                        if self._cfg_normalization and float(self._cfg_normalization) > 0.0:
-                            ori_pos_norm = torch.linalg.vector_norm(pos)
-                            new_pos_norm = torch.linalg.vector_norm(pred)
-                            max_new_norm = ori_pos_norm * float(self._cfg_normalization)
-                            if new_pos_norm > max_new_norm:
-                                pred = pred * (max_new_norm / new_pos_norm)
-
-                        noise_pred.append(pred)
-
-                    noise_pred = torch.stack(noise_pred, dim=0)
-                else:
-                    noise_pred = torch.stack([t.float() for t in model_out_list], dim=0)
-
-                noise_pred = noise_pred.squeeze(2)
-                noise_pred = -noise_pred
-
-                # compute the previous noisy sample x_t -> x_t-1
-                latents = self.scheduler.step(noise_pred.to(torch.float32), t, latents, return_dict=False)[0]
-                assert latents.dtype == torch.float32
-
-                if callback_on_step_end is not None:
-                    callback_kwargs = {}
-                    for k in callback_on_step_end_tensor_inputs:
-                        callback_kwargs[k] = locals()[k]
-                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
-
-                    latents = callback_outputs.pop("latents", latents)
-                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
-                    negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
-
-                # call the callback, if provided
-                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
-                    progress_bar.update()
-
-        if output_type == "latent":
-            image = latents
-
-        else:
-            latents = latents.to(self.vae.dtype)
-            latents = (latents / self.vae.config.scaling_factor) + self.vae.config.shift_factor
-
-            image = self.vae.decode(latents, return_dict=False)[0]
-            image = self.image_processor.postprocess(image, output_type=output_type)
-
-        # Offload all models
-        self.maybe_free_model_hooks()
-
-        if not return_dict:
-            return (image,)
-
-        return ZImagePipelineOutput(images=image)
--- a/src/diffusers/pipelines/z_image/pipeline_z_image_controlnet_inpaint.py
+++ b/src/diffusers/pipelines/z_image/pipeline_z_image_controlnet_inpaint.py
@@ -1,747 +0,0 @@
-# Copyright 2025 Alibaba Z-Image Team and The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import inspect
-from typing import Any, Callable, Dict, List, Optional, Union
-
-import torch
-import torch.nn.functional as F
-from transformers import AutoTokenizer, PreTrainedModel
-
-from ...image_processor import PipelineImageInput, VaeImageProcessor
-from ...loaders import FromSingleFileMixin
-from ...models.autoencoders import AutoencoderKL
-from ...models.controlnets import ZImageControlNetModel
-from ...models.transformers import ZImageTransformer2DModel
-from ...pipelines.pipeline_utils import DiffusionPipeline
-from ...schedulers import FlowMatchEulerDiscreteScheduler
-from ...utils import logging, replace_example_docstring
-from ...utils.torch_utils import randn_tensor
-from .pipeline_output import ZImagePipelineOutput
-
-
-logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
-
-EXAMPLE_DOC_STRING = """
-    Examples:
-        ```py
-        >>> import torch
-        >>> from diffusers import ZImageControlNetInpaintPipeline
-        >>> from diffusers import ZImageControlNetModel
-        >>> from diffusers.utils import load_image
-        >>> from huggingface_hub import hf_hub_download
-
-        >>> controlnet = ZImageControlNetModel.from_single_file(
-        ...     hf_hub_download(
-        ...         "alibaba-pai/Z-Image-Turbo-Fun-Controlnet-Union-2.0",
-        ...         filename="Z-Image-Turbo-Fun-Controlnet-Union-2.1.safetensors",
-        ...     ),
-        ...     torch_dtype=torch.bfloat16,
-        ... )
-
-        >>> # 2.0 - `config` is required
-        >>> # controlnet = ZImageControlNetModel.from_single_file(
-        >>> #     hf_hub_download(
-        >>> #         "alibaba-pai/Z-Image-Turbo-Fun-Controlnet-Union-2.0",
-        >>> #         filename="Z-Image-Turbo-Fun-Controlnet-Union-2.0.safetensors",
-        >>> #     ),
-        >>> #     torch_dtype=torch.bfloat16,
-        >>> #     config="hlky/Z-Image-Turbo-Fun-Controlnet-Union-2.0",
-        >>> # )
-
-        >>> pipe = ZImageControlNetInpaintPipeline.from_pretrained(
-        ...     "Tongyi-MAI/Z-Image-Turbo", controlnet=controlnet, torch_dtype=torch.bfloat16
-        ... )
-        >>> pipe.to("cuda")
-
-        >>> # Optionally, set the attention backend to flash-attn 2 or 3, default is SDPA in PyTorch.
-        >>> # (1) Use flash attention 2
-        >>> # pipe.transformer.set_attention_backend("flash")
-        >>> # (2) Use flash attention 3
-        >>> # pipe.transformer.set_attention_backend("_flash_3")
-
-        >>> image = load_image(
-        ...     "https://huggingface.co/alibaba-pai/Z-Image-Turbo-Fun-Controlnet-Union-2.0/resolve/main/asset/inpaint.jpg?download=true"
-        ... )
-        >>> mask_image = load_image(
-        ...     "https://huggingface.co/alibaba-pai/Z-Image-Turbo-Fun-Controlnet-Union-2.0/resolve/main/asset/mask.jpg?download=true"
-        ... )
-        >>> control_image = load_image(
-        ...     "https://huggingface.co/alibaba-pai/Z-Image-Turbo-Fun-Controlnet-Union-2.0/resolve/main/asset/pose.jpg?download=true"
-        ... )
-        >>> prompt = "一位年轻女子站在阳光明媚的海岸线上，画面为全身竖构图，身体微微侧向右侧，左手自然下垂，右臂弯曲扶在腰间，她的手指清晰可见，站姿放松而略带羞涩。她身穿轻盈的白色连衣裙，裙摆在海风中轻轻飘动，布料半透、质感柔软。女子拥有一头鲜艳的及腰紫色长发，被海风吹起，在身侧轻盈飞舞，发间系着一个精致的黑色蝴蝶结，与发色形成对比。她面容清秀，眉目精致，肤色白皙细腻，表情温柔略显羞涩，微微低头，眼神静静望向远处的海平线，流露出甜美的青春气息与若有所思的神情。背景是辽阔无垠的海洋与蔚蓝天空，阳光从侧前方洒下，海面波光粼粼，泛着温暖的金色光晕，天空清澈明亮，云朵稀薄，整体色调清新唯美。"
-        >>> image = pipe(
-        ...     prompt,
-        ...     image=image,
-        ...     mask_image=mask_image,
-        ...     control_image=control_image,
-        ...     controlnet_conditioning_scale=0.75,
-        ...     height=1728,
-        ...     width=992,
-        ...     num_inference_steps=25,
-        ...     guidance_scale=0.0,
-        ...     generator=torch.Generator("cuda").manual_seed(43),
-        ... ).images[0]
-        >>> image.save("zimage-inpaint.png")
-        ```
-"""
-
-
-# Copied from diffusers.pipelines.flux.pipeline_flux.calculate_shift
-def calculate_shift(
-    image_seq_len,
-    base_seq_len: int = 256,
-    max_seq_len: int = 4096,
-    base_shift: float = 0.5,
-    max_shift: float = 1.15,
-):
-    m = (max_shift - base_shift) / (max_seq_len - base_seq_len)
-    b = base_shift - m * base_seq_len
-    mu = image_seq_len * m + b
-    return mu
-
-
-# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
-def retrieve_latents(
-    encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
-):
-    if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
-        return encoder_output.latent_dist.sample(generator)
-    elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax":
-        return encoder_output.latent_dist.mode()
-    elif hasattr(encoder_output, "latents"):
-        return encoder_output.latents
-    else:
-        raise AttributeError("Could not access latents of provided encoder_output")
-
-
-# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
-def retrieve_timesteps(
-    scheduler,
-    num_inference_steps: Optional[int] = None,
-    device: Optional[Union[str, torch.device]] = None,
-    timesteps: Optional[List[int]] = None,
-    sigmas: Optional[List[float]] = None,
-    **kwargs,
-):
-    r"""
-    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
-    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
-
-    Args:
-        scheduler (`SchedulerMixin`):
-            The scheduler to get timesteps from.
-        num_inference_steps (`int`):
-            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
-            must be `None`.
-        device (`str` or `torch.device`, *optional*):
-            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
-        timesteps (`List[int]`, *optional*):
-            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
-            `num_inference_steps` and `sigmas` must be `None`.
-        sigmas (`List[float]`, *optional*):
-            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
-            `num_inference_steps` and `timesteps` must be `None`.
-
-    Returns:
-        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
-        second element is the number of inference steps.
-    """
-    if timesteps is not None and sigmas is not None:
-        raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
-    if timesteps is not None:
-        accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
-        if not accepts_timesteps:
-            raise ValueError(
-                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
-                f" timestep schedules. Please check whether you are using the correct scheduler."
-            )
-        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
-        timesteps = scheduler.timesteps
-        num_inference_steps = len(timesteps)
-    elif sigmas is not None:
-        accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
-        if not accept_sigmas:
-            raise ValueError(
-                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
-                f" sigmas schedules. Please check whether you are using the correct scheduler."
-            )
-        scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
-        timesteps = scheduler.timesteps
-        num_inference_steps = len(timesteps)
-    else:
-        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
-        timesteps = scheduler.timesteps
-    return timesteps, num_inference_steps
-
-
-class ZImageControlNetInpaintPipeline(DiffusionPipeline, FromSingleFileMixin):
-    model_cpu_offload_seq = "text_encoder->transformer->vae"
-    _optional_components = []
-    _callback_tensor_inputs = ["latents", "prompt_embeds"]
-
-    def __init__(
-        self,
-        scheduler: FlowMatchEulerDiscreteScheduler,
-        vae: AutoencoderKL,
-        text_encoder: PreTrainedModel,
-        tokenizer: AutoTokenizer,
-        transformer: ZImageTransformer2DModel,
-        controlnet: ZImageControlNetModel,
-    ):
-        super().__init__()
-        if transformer.in_channels == controlnet.config.control_in_dim:
-            raise ValueError(
-                "ZImageControlNetInpaintPipeline is not compatible with `alibaba-pai/Z-Image-Turbo-Fun-Controlnet-Union`, use `alibaba-pai/Z-Image-Turbo-Fun-Controlnet-Union-2.0`."
-            )
-        controlnet = ZImageControlNetModel.from_transformer(controlnet, transformer)
-
-        self.register_modules(
-            vae=vae,
-            text_encoder=text_encoder,
-            tokenizer=tokenizer,
-            scheduler=scheduler,
-            transformer=transformer,
-            controlnet=controlnet,
-        )
-        self.vae_scale_factor = (
-            2 ** (len(self.vae.config.block_out_channels) - 1) if hasattr(self, "vae") and self.vae is not None else 8
-        )
-        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor * 2)
-        self.mask_processor = VaeImageProcessor(
-            vae_scale_factor=self.vae_scale_factor, do_normalize=False, do_binarize=True, do_convert_grayscale=True
-        )
-
-    def encode_prompt(
-        self,
-        prompt: Union[str, List[str]],
-        device: Optional[torch.device] = None,
-        do_classifier_free_guidance: bool = True,
-        negative_prompt: Optional[Union[str, List[str]]] = None,
-        prompt_embeds: Optional[List[torch.FloatTensor]] = None,
-        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
-        max_sequence_length: int = 512,
-    ):
-        prompt = [prompt] if isinstance(prompt, str) else prompt
-        prompt_embeds = self._encode_prompt(
-            prompt=prompt,
-            device=device,
-            prompt_embeds=prompt_embeds,
-            max_sequence_length=max_sequence_length,
-        )
-
-        if do_classifier_free_guidance:
-            if negative_prompt is None:
-                negative_prompt = ["" for _ in prompt]
-            else:
-                negative_prompt = [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt
-            assert len(prompt) == len(negative_prompt)
-            negative_prompt_embeds = self._encode_prompt(
-                prompt=negative_prompt,
-                device=device,
-                prompt_embeds=negative_prompt_embeds,
-                max_sequence_length=max_sequence_length,
-            )
-        else:
-            negative_prompt_embeds = []
-        return prompt_embeds, negative_prompt_embeds
-
-    def _encode_prompt(
-        self,
-        prompt: Union[str, List[str]],
-        device: Optional[torch.device] = None,
-        prompt_embeds: Optional[List[torch.FloatTensor]] = None,
-        max_sequence_length: int = 512,
-    ) -> List[torch.FloatTensor]:
-        device = device or self._execution_device
-
-        if prompt_embeds is not None:
-            return prompt_embeds
-
-        if isinstance(prompt, str):
-            prompt = [prompt]
-
-        for i, prompt_item in enumerate(prompt):
-            messages = [
-                {"role": "user", "content": prompt_item},
-            ]
-            prompt_item = self.tokenizer.apply_chat_template(
-                messages,
-                tokenize=False,
-                add_generation_prompt=True,
-                enable_thinking=True,
-            )
-            prompt[i] = prompt_item
-
-        text_inputs = self.tokenizer(
-            prompt,
-            padding="max_length",
-            max_length=max_sequence_length,
-            truncation=True,
-            return_tensors="pt",
-        )
-
-        text_input_ids = text_inputs.input_ids.to(device)
-        prompt_masks = text_inputs.attention_mask.to(device).bool()
-
-        prompt_embeds = self.text_encoder(
-            input_ids=text_input_ids,
-            attention_mask=prompt_masks,
-            output_hidden_states=True,
-        ).hidden_states[-2]
-
-        embeddings_list = []
-
-        for i in range(len(prompt_embeds)):
-            embeddings_list.append(prompt_embeds[i][prompt_masks[i]])
-
-        return embeddings_list
-
-    def prepare_latents(
-        self,
-        batch_size,
-        num_channels_latents,
-        height,
-        width,
-        dtype,
-        device,
-        generator,
-        latents=None,
-    ):
-        height = 2 * (int(height) // (self.vae_scale_factor * 2))
-        width = 2 * (int(width) // (self.vae_scale_factor * 2))
-
-        shape = (batch_size, num_channels_latents, height, width)
-
-        if latents is None:
-            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
-        else:
-            if latents.shape != shape:
-                raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}")
-            latents = latents.to(device)
-        return latents
-
-    # Copied from diffusers.pipelines.controlnet_sd3.pipeline_stable_diffusion_3_controlnet.StableDiffusion3ControlNetPipeline.prepare_image
-    def prepare_image(
-        self,
-        image,
-        width,
-        height,
-        batch_size,
-        num_images_per_prompt,
-        device,
-        dtype,
-        do_classifier_free_guidance=False,
-        guess_mode=False,
-    ):
-        if isinstance(image, torch.Tensor):
-            pass
-        else:
-            image = self.image_processor.preprocess(image, height=height, width=width)
-
-        image_batch_size = image.shape[0]
-
-        if image_batch_size == 1:
-            repeat_by = batch_size
-        else:
-            # image batch size is the same as prompt batch size
-            repeat_by = num_images_per_prompt
-
-        image = image.repeat_interleave(repeat_by, dim=0)
-
-        image = image.to(device=device, dtype=dtype)
-
-        if do_classifier_free_guidance and not guess_mode:
-            image = torch.cat([image] * 2)
-
-        return image
-
-    @property
-    def guidance_scale(self):
-        return self._guidance_scale
-
-    @property
-    def do_classifier_free_guidance(self):
-        return self._guidance_scale > 1
-
-    @property
-    def joint_attention_kwargs(self):
-        return self._joint_attention_kwargs
-
-    @property
-    def num_timesteps(self):
-        return self._num_timesteps
-
-    @property
-    def interrupt(self):
-        return self._interrupt
-
-    @torch.no_grad()
-    @replace_example_docstring(EXAMPLE_DOC_STRING)
-    def __call__(
-        self,
-        prompt: Union[str, List[str]] = None,
-        height: Optional[int] = None,
-        width: Optional[int] = None,
-        num_inference_steps: int = 50,
-        sigmas: Optional[List[float]] = None,
-        guidance_scale: float = 5.0,
-        image: PipelineImageInput = None,
-        mask_image: PipelineImageInput = None,
-        control_image: PipelineImageInput = None,
-        controlnet_conditioning_scale: Union[float, List[float]] = 0.75,
-        cfg_normalization: bool = False,
-        cfg_truncation: float = 1.0,
-        negative_prompt: Optional[Union[str, List[str]]] = None,
-        num_images_per_prompt: Optional[int] = 1,
-        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
-        latents: Optional[torch.FloatTensor] = None,
-        prompt_embeds: Optional[List[torch.FloatTensor]] = None,
-        negative_prompt_embeds: Optional[List[torch.FloatTensor]] = None,
-        output_type: Optional[str] = "pil",
-        return_dict: bool = True,
-        joint_attention_kwargs: Optional[Dict[str, Any]] = None,
-        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
-        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
-        max_sequence_length: int = 512,
-    ):
-        r"""
-        Function invoked when calling the pipeline for generation.
-
-        Args:
-            prompt (`str` or `List[str]`, *optional*):
-                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
-                instead.
-            height (`int`, *optional*, defaults to 1024):
-                The height in pixels of the generated image.
-            width (`int`, *optional*, defaults to 1024):
-                The width in pixels of the generated image.
-            num_inference_steps (`int`, *optional*, defaults to 50):
-                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
-                expense of slower inference.
-            sigmas (`List[float]`, *optional*):
-                Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
-                their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
-                will be used.
-            guidance_scale (`float`, *optional*, defaults to 5.0):
-                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
-                `guidance_scale` is defined as `w` of equation 2. of [Imagen
-                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
-                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
-                usually at the expense of lower image quality.
-            cfg_normalization (`bool`, *optional*, defaults to False):
-                Whether to apply configuration normalization.
-            cfg_truncation (`float`, *optional*, defaults to 1.0):
-                The truncation value for configuration.
-            negative_prompt (`str` or `List[str]`, *optional*):
-                The prompt or prompts not to guide the image generation. If not defined, one has to pass
-                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
-                less than `1`).
-            num_images_per_prompt (`int`, *optional*, defaults to 1):
-                The number of images to generate per prompt.
-            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
-                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
-                to make generation deterministic.
-            latents (`torch.FloatTensor`, *optional*):
-                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
-                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
-                tensor will be generated by sampling using the supplied random `generator`.
-            prompt_embeds (`List[torch.FloatTensor]`, *optional*):
-                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
-                provided, text embeddings will be generated from `prompt` input argument.
-            negative_prompt_embeds (`List[torch.FloatTensor]`, *optional*):
-                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
-                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
-                argument.
-            output_type (`str`, *optional*, defaults to `"pil"`):
-                The output format of the generate image. Choose between
-                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
-            return_dict (`bool`, *optional*, defaults to `True`):
-                Whether or not to return a [`~pipelines.stable_diffusion.ZImagePipelineOutput`] instead of a plain
-                tuple.
-            joint_attention_kwargs (`dict`, *optional*):
-                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
-                `self.processor` in
-                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
-            callback_on_step_end (`Callable`, *optional*):
-                A function that calls at the end of each denoising steps during the inference. The function is called
-                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
-                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
-                `callback_on_step_end_tensor_inputs`.
-            callback_on_step_end_tensor_inputs (`List`, *optional*):
-                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
-                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
-                `._callback_tensor_inputs` attribute of your pipeline class.
-            max_sequence_length (`int`, *optional*, defaults to 512):
-                Maximum sequence length to use with the `prompt`.
-
-        Examples:
-
-        Returns:
-            [`~pipelines.z_image.ZImagePipelineOutput`] or `tuple`: [`~pipelines.z_image.ZImagePipelineOutput`] if
-            `return_dict` is True, otherwise a `tuple`. When returning a tuple, the first element is a list with the
-            generated images.
-        """
-        height = height or 1024
-        width = width or 1024
-
-        vae_scale = self.vae_scale_factor * 2
-        if height % vae_scale != 0:
-            raise ValueError(
-                f"Height must be divisible by {vae_scale} (got {height}). "
-                f"Please adjust the height to a multiple of {vae_scale}."
-            )
-        if width % vae_scale != 0:
-            raise ValueError(
-                f"Width must be divisible by {vae_scale} (got {width}). "
-                f"Please adjust the width to a multiple of {vae_scale}."
-            )
-
-        device = self._execution_device
-
-        self._guidance_scale = guidance_scale
-        self._joint_attention_kwargs = joint_attention_kwargs
-        self._interrupt = False
-        self._cfg_normalization = cfg_normalization
-        self._cfg_truncation = cfg_truncation
-        # 2. Define call parameters
-        if prompt is not None and isinstance(prompt, str):
-            batch_size = 1
-        elif prompt is not None and isinstance(prompt, list):
-            batch_size = len(prompt)
-        else:
-            batch_size = len(prompt_embeds)
-
-        # If prompt_embeds is provided and prompt is None, skip encoding
-        if prompt_embeds is not None and prompt is None:
-            if self.do_classifier_free_guidance and negative_prompt_embeds is None:
-                raise ValueError(
-                    "When `prompt_embeds` is provided without `prompt`, "
-                    "`negative_prompt_embeds` must also be provided for classifier-free guidance."
-                )
-        else:
-            (
-                prompt_embeds,
-                negative_prompt_embeds,
-            ) = self.encode_prompt(
-                prompt=prompt,
-                negative_prompt=negative_prompt,
-                do_classifier_free_guidance=self.do_classifier_free_guidance,
-                prompt_embeds=prompt_embeds,
-                negative_prompt_embeds=negative_prompt_embeds,
-                device=device,
-                max_sequence_length=max_sequence_length,
-            )
-
-        # 4. Prepare latent variables
-        num_channels_latents = self.transformer.in_channels
-
-        control_image = self.prepare_image(
-            image=control_image,
-            width=width,
-            height=height,
-            batch_size=batch_size * num_images_per_prompt,
-            num_images_per_prompt=num_images_per_prompt,
-            device=device,
-            dtype=self.vae.dtype,
-        )
-        height, width = control_image.shape[-2:]
-        control_image = retrieve_latents(self.vae.encode(control_image), generator=generator, sample_mode="argmax")
-        control_image = (control_image - self.vae.config.shift_factor) * self.vae.config.scaling_factor
-        control_image = control_image.unsqueeze(2)
-
-        mask_condition = self.mask_processor.preprocess(mask_image, height=height, width=width)
-        mask_condition = torch.tile(mask_condition, [1, 3, 1, 1]).to(
-            device=control_image.device, dtype=control_image.dtype
-        )
-
-        init_image = self.prepare_image(
-            image=image,
-            width=width,
-            height=height,
-            batch_size=batch_size * num_images_per_prompt,
-            num_images_per_prompt=num_images_per_prompt,
-            device=device,
-            dtype=self.vae.dtype,
-        )
-        height, width = init_image.shape[-2:]
-        init_image = init_image * (mask_condition < 0.5)
-        init_image = retrieve_latents(self.vae.encode(init_image), generator=generator, sample_mode="argmax")
-        init_image = (init_image - self.vae.config.shift_factor) * self.vae.config.scaling_factor
-        init_image = init_image.unsqueeze(2)
-
-        mask_condition = F.interpolate(1 - mask_condition[:, :1], size=init_image.size()[-2:], mode="nearest").to(
-            device=control_image.device, dtype=control_image.dtype
-        )
-        mask_condition = mask_condition.unsqueeze(2)
-
-        control_image = torch.cat([control_image, mask_condition, init_image], dim=1)
-
-        latents = self.prepare_latents(
-            batch_size * num_images_per_prompt,
-            num_channels_latents,
-            height,
-            width,
-            torch.float32,
-            device,
-            generator,
-            latents,
-        )
-
-        # Repeat prompt_embeds for num_images_per_prompt
-        if num_images_per_prompt > 1:
-            prompt_embeds = [pe for pe in prompt_embeds for _ in range(num_images_per_prompt)]
-            if self.do_classifier_free_guidance and negative_prompt_embeds:
-                negative_prompt_embeds = [npe for npe in negative_prompt_embeds for _ in range(num_images_per_prompt)]
-
-        actual_batch_size = batch_size * num_images_per_prompt
-        image_seq_len = (latents.shape[2] // 2) * (latents.shape[3] // 2)
-
-        # 5. Prepare timesteps
-        mu = calculate_shift(
-            image_seq_len,
-            self.scheduler.config.get("base_image_seq_len", 256),
-            self.scheduler.config.get("max_image_seq_len", 4096),
-            self.scheduler.config.get("base_shift", 0.5),
-            self.scheduler.config.get("max_shift", 1.15),
-        )
-        self.scheduler.sigma_min = 0.0
-        scheduler_kwargs = {"mu": mu}
-        timesteps, num_inference_steps = retrieve_timesteps(
-            self.scheduler,
-            num_inference_steps,
-            device,
-            sigmas=sigmas,
-            **scheduler_kwargs,
-        )
-        num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
-        self._num_timesteps = len(timesteps)
-
-        # 6. Denoising loop
-        with self.progress_bar(total=num_inference_steps) as progress_bar:
-            for i, t in enumerate(timesteps):
-                if self.interrupt:
-                    continue
-
-                # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
-                timestep = t.expand(latents.shape[0])
-                timestep = (1000 - timestep) / 1000
-                # Normalized time for time-aware config (0 at start, 1 at end)
-                t_norm = timestep[0].item()
-
-                # Handle cfg truncation
-                current_guidance_scale = self.guidance_scale
-                if (
-                    self.do_classifier_free_guidance
-                    and self._cfg_truncation is not None
-                    and float(self._cfg_truncation) <= 1
-                ):
-                    if t_norm > self._cfg_truncation:
-                        current_guidance_scale = 0.0
-
-                # Run CFG only if configured AND scale is non-zero
-                apply_cfg = self.do_classifier_free_guidance and current_guidance_scale > 0
-
-                if apply_cfg:
-                    latents_typed = latents.to(self.transformer.dtype)
-                    latent_model_input = latents_typed.repeat(2, 1, 1, 1)
-                    prompt_embeds_model_input = prompt_embeds + negative_prompt_embeds
-                    timestep_model_input = timestep.repeat(2)
-                else:
-                    latent_model_input = latents.to(self.transformer.dtype)
-                    prompt_embeds_model_input = prompt_embeds
-                    timestep_model_input = timestep
-
-                latent_model_input = latent_model_input.unsqueeze(2)
-                latent_model_input_list = list(latent_model_input.unbind(dim=0))
-
-                controlnet_block_samples = self.controlnet(
-                    latent_model_input_list,
-                    timestep_model_input,
-                    prompt_embeds_model_input,
-                    control_image,
-                    conditioning_scale=controlnet_conditioning_scale,
-                )
-
-                model_out_list = self.transformer(
-                    latent_model_input_list,
-                    timestep_model_input,
-                    prompt_embeds_model_input,
-                    controlnet_block_samples=controlnet_block_samples,
-                )[0]
-
-                if apply_cfg:
-                    # Perform CFG
-                    pos_out = model_out_list[:actual_batch_size]
-                    neg_out = model_out_list[actual_batch_size:]
-
-                    noise_pred = []
-                    for j in range(actual_batch_size):
-                        pos = pos_out[j].float()
-                        neg = neg_out[j].float()
-
-                        pred = pos + current_guidance_scale * (pos - neg)
-
-                        # Renormalization
-                        if self._cfg_normalization and float(self._cfg_normalization) > 0.0:
-                            ori_pos_norm = torch.linalg.vector_norm(pos)
-                            new_pos_norm = torch.linalg.vector_norm(pred)
-                            max_new_norm = ori_pos_norm * float(self._cfg_normalization)
-                            if new_pos_norm > max_new_norm:
-                                pred = pred * (max_new_norm / new_pos_norm)
-
-                        noise_pred.append(pred)
-
-                    noise_pred = torch.stack(noise_pred, dim=0)
-                else:
-                    noise_pred = torch.stack([t.float() for t in model_out_list], dim=0)
-
-                noise_pred = noise_pred.squeeze(2)
-                noise_pred = -noise_pred
-
-                # compute the previous noisy sample x_t -> x_t-1
-                latents = self.scheduler.step(noise_pred.to(torch.float32), t, latents, return_dict=False)[0]
-                assert latents.dtype == torch.float32
-
-                if callback_on_step_end is not None:
-                    callback_kwargs = {}
-                    for k in callback_on_step_end_tensor_inputs:
-                        callback_kwargs[k] = locals()[k]
-                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
-
-                    latents = callback_outputs.pop("latents", latents)
-                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
-                    negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
-
-                # call the callback, if provided
-                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
-                    progress_bar.update()
-
-        if output_type == "latent":
-            image = latents
-
-        else:
-            latents = latents.to(self.vae.dtype)
-            latents = (latents / self.vae.config.scaling_factor) + self.vae.config.shift_factor
-
-            image = self.vae.decode(latents, return_dict=False)[0]
-            image = self.image_processor.postprocess(image, output_type=output_type)
-
-        # Offload all models
-        self.maybe_free_model_hooks()
-
-        if not return_dict:
-            return (image,)
-
-        return ZImagePipelineOutput(images=image)
--- a/src/diffusers/schedulers/scheduling_unipc_multistep.py
+++ b/src/diffusers/schedulers/scheduling_unipc_multistep.py
@@ -217,8 +217,6 @@ class UniPCMultistepScheduler(SchedulerMixin, ConfigMixin):
        rescale_betas_zero_snr: bool = False,
        use_dynamic_shifting: bool = False,
        time_shift_type: Literal["exponential"] = "exponential",
-        sigma_min: Optional[float] = None,
-        sigma_max: Optional[float] = None,
    ) -> None:
        if self.config.use_beta_sigmas and not is_scipy_available():
            raise ImportError("Make sure to install scipy if you want to use beta sigmas.")
@@ -352,12 +350,7 @@ class UniPCMultistepScheduler(SchedulerMixin, ConfigMixin):
            log_sigmas = np.log(sigmas)
            sigmas = np.flip(sigmas).copy()
            sigmas = self._convert_to_karras(in_sigmas=sigmas, num_inference_steps=num_inference_steps)
-            if self.config.use_flow_sigmas:
-                sigmas = sigmas / (sigmas + 1)
-                timesteps = (sigmas * self.config.num_train_timesteps).copy()
-            else:
-                timesteps = np.array([self._sigma_to_t(sigma, log_sigmas) for sigma in sigmas]).round()
-
+            timesteps = np.array([self._sigma_to_t(sigma, log_sigmas) for sigma in sigmas]).round()
            if self.config.final_sigmas_type == "sigma_min":
                sigma_last = sigmas[-1]
            elif self.config.final_sigmas_type == "zero":
--- a/src/diffusers/utils/dummy_pt_objects.py
+++ b/src/diffusers/utils/dummy_pt_objects.py
@@ -1132,21 +1132,6 @@ class LatteTransformer3DModel(metaclass=DummyObject):
        requires_backends(cls, ["torch"])


-class LongCatImageTransformer2DModel(metaclass=DummyObject):
-    _backends = ["torch"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["torch"])
-
-    @classmethod
-    def from_config(cls, *args, **kwargs):
-        requires_backends(cls, ["torch"])
-
-    @classmethod
-    def from_pretrained(cls, *args, **kwargs):
-        requires_backends(cls, ["torch"])
-
-
 class LTXVideoTransformer3DModel(metaclass=DummyObject):
    _backends = ["torch"]

@@ -1777,21 +1762,6 @@ class WanVACETransformer3DModel(metaclass=DummyObject):
        requires_backends(cls, ["torch"])


-class ZImageControlNetModel(metaclass=DummyObject):
-    _backends = ["torch"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["torch"])
-
-    @classmethod
-    def from_config(cls, *args, **kwargs):
-        requires_backends(cls, ["torch"])
-
-    @classmethod
-    def from_pretrained(cls, *args, **kwargs):
-        requires_backends(cls, ["torch"])
-
-
 class ZImageTransformer2DModel(metaclass=DummyObject):
    _backends = ["torch"]

--- a/src/diffusers/utils/dummy_torch_and_transformers_objects.py
+++ b/src/diffusers/utils/dummy_torch_and_transformers_objects.py
@@ -767,21 +767,6 @@ class ConsisIDPipeline(metaclass=DummyObject):
        requires_backends(cls, ["torch", "transformers"])


-class Cosmos2_5_PredictBasePipeline(metaclass=DummyObject):
-    _backends = ["torch", "transformers"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["torch", "transformers"])
-
-    @classmethod
-    def from_config(cls, *args, **kwargs):
-        requires_backends(cls, ["torch", "transformers"])
-
-    @classmethod
-    def from_pretrained(cls, *args, **kwargs):
-        requires_backends(cls, ["torch", "transformers"])
-
-
 class Cosmos2TextToImagePipeline(metaclass=DummyObject):
    _backends = ["torch", "transformers"]

@@ -1847,36 +1832,6 @@ class LEditsPPPipelineStableDiffusionXL(metaclass=DummyObject):
        requires_backends(cls, ["torch", "transformers"])


-class LongCatImageEditPipeline(metaclass=DummyObject):
-    _backends = ["torch", "transformers"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["torch", "transformers"])
-
-    @classmethod
-    def from_config(cls, *args, **kwargs):
-        requires_backends(cls, ["torch", "transformers"])
-
-    @classmethod
-    def from_pretrained(cls, *args, **kwargs):
-        requires_backends(cls, ["torch", "transformers"])
-
-
-class LongCatImagePipeline(metaclass=DummyObject):
-    _backends = ["torch", "transformers"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["torch", "transformers"])
-
-    @classmethod
-    def from_config(cls, *args, **kwargs):
-        requires_backends(cls, ["torch", "transformers"])
-
-    @classmethod
-    def from_pretrained(cls, *args, **kwargs):
-        requires_backends(cls, ["torch", "transformers"])
-
-
 class LTXConditionPipeline(metaclass=DummyObject):
    _backends = ["torch", "transformers"]

@@ -2312,21 +2267,6 @@ class QwenImageInpaintPipeline(metaclass=DummyObject):
        requires_backends(cls, ["torch", "transformers"])


-class QwenImageLayeredPipeline(metaclass=DummyObject):
-    _backends = ["torch", "transformers"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["torch", "transformers"])
-
-    @classmethod
-    def from_config(cls, *args, **kwargs):
-        requires_backends(cls, ["torch", "transformers"])
-
-    @classmethod
-    def from_pretrained(cls, *args, **kwargs):
-        requires_backends(cls, ["torch", "transformers"])
-
-
 class QwenImagePipeline(metaclass=DummyObject):
    _backends = ["torch", "transformers"]

@@ -3872,36 +3812,6 @@ class WuerstchenPriorPipeline(metaclass=DummyObject):
        requires_backends(cls, ["torch", "transformers"])


-class ZImageControlNetInpaintPipeline(metaclass=DummyObject):
-    _backends = ["torch", "transformers"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["torch", "transformers"])
-
-    @classmethod
-    def from_config(cls, *args, **kwargs):
-        requires_backends(cls, ["torch", "transformers"])
-
-    @classmethod
-    def from_pretrained(cls, *args, **kwargs):
-        requires_backends(cls, ["torch", "transformers"])
-
-
-class ZImageControlNetPipeline(metaclass=DummyObject):
-    _backends = ["torch", "transformers"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["torch", "transformers"])
-
-    @classmethod
-    def from_config(cls, *args, **kwargs):
-        requires_backends(cls, ["torch", "transformers"])
-
-    @classmethod
-    def from_pretrained(cls, *args, **kwargs):
-        requires_backends(cls, ["torch", "transformers"])
-
-
 class ZImageImg2ImgPipeline(metaclass=DummyObject):
    _backends = ["torch", "transformers"]

--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -32,6 +32,8 @@ warnings.simplefilter(action="ignore", category=FutureWarning)

 def pytest_configure(config):
    config.addinivalue_line("markers", "big_accelerator: marks tests as requiring big accelerator resources")
+    config.addinivalue_line("markers", "slow: mark test as slow")
+    config.addinivalue_line("markers", "nightly: mark test as nightly")


 def pytest_addoption(parser):
--- a/tests/modular_pipelines/test_modular_pipelines_custom_blocks.py
+++ b/tests/modular_pipelines/test_modular_pipelines_custom_blocks.py
@@ -0,0 +1,272 @@
+# Copyright 2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+import os
+import tempfile
+from collections import deque
+from typing import List
+
+import numpy as np
+import torch
+
+from diffusers import FluxTransformer2DModel
+from diffusers.modular_pipelines import (
+    ComponentSpec,
+    InputParam,
+    ModularPipelineBlocks,
+    OutputParam,
+    PipelineState,
+    WanModularPipeline,
+)
+
+from ..testing_utils import nightly, require_torch, slow
+
+
+class DummyCustomBlockSimple(ModularPipelineBlocks):
+    """Minimal custom block that reads `prompt` and writes a prefixed copy to `output_prompt`."""
+
+    def __init__(self, use_dummy_model_component=False):
+        # Keep this assignment ahead of `super().__init__()`; the base initializer may read the properties below.
+        self.use_dummy_model_component = use_dummy_model_component
+        super().__init__()
+
+    @property
+    def expected_components(self):
+        """A single `transformer` spec when the dummy component is enabled, otherwise an empty list."""
+        if not self.use_dummy_model_component:
+            return []
+        return [ComponentSpec("transformer", FluxTransformer2DModel)]
+
+    @property
+    def inputs(self) -> List[InputParam]:
+        prompt_param = InputParam("prompt", type_hint=str, required=True, description="Prompt to use")
+        return [prompt_param]
+
+    @property
+    def intermediate_inputs(self) -> List[InputParam]:
+        return []
+
+    @property
+    def intermediate_outputs(self) -> List[OutputParam]:
+        prompt_out = OutputParam("output_prompt", type_hint=str, description="Modified prompt")
+        return [prompt_out]
+
+    def __call__(self, components, state: PipelineState) -> PipelineState:
+        """Prepend a fixed prefix to the prompt held in `state` and store it as `output_prompt`."""
+        block_state = self.get_block_state(state)
+        prefixed_prompt = "Modular diffusers + " + block_state.prompt
+        block_state.output_prompt = prefixed_prompt
+        self.set_block_state(state, block_state)
+
+        # Despite the annotation, the value handed back is the `(components, state)` pair.
+        return components, state
+
+
+# Standalone source of the `DummyCustomBlockSimple` block. The save/load test writes it next to
+# `modular_config.json` so that `ModularPipelineBlocks.from_pretrained(..., trust_remote_code=True)` can load
+# the block from that file. Because it is loaded from its own file, every name the block references must be
+# imported inside the string itself, including `FluxTransformer2DModel`.
+CODE_STR = """
+from typing import List
+
+from diffusers import FluxTransformer2DModel
+from diffusers.modular_pipelines import ComponentSpec, InputParam, ModularPipelineBlocks, OutputParam, PipelineState
+
+
+class DummyCustomBlockSimple(ModularPipelineBlocks):
+    def __init__(self, use_dummy_model_component=False):
+        self.use_dummy_model_component = use_dummy_model_component
+        super().__init__()
+
+    @property
+    def expected_components(self):
+        if self.use_dummy_model_component:
+            return [ComponentSpec("transformer", FluxTransformer2DModel)]
+        else:
+            return []
+
+    @property
+    def inputs(self) -> List[InputParam]:
+        return [InputParam("prompt", type_hint=str, required=True, description="Prompt to use")]
+
+    @property
+    def intermediate_inputs(self) -> List[InputParam]:
+        return []
+
+    @property
+    def intermediate_outputs(self) -> List[OutputParam]:
+        return [
+            OutputParam(
+                "output_prompt",
+                type_hint=str,
+                description="Modified prompt",
+            )
+        ]
+
+    def __call__(self, components, state: PipelineState) -> PipelineState:
+        block_state = self.get_block_state(state)
+
+        old_prompt = block_state.prompt
+        block_state.output_prompt = "Modular diffusers + " + old_prompt
+        self.set_block_state(state, block_state)
+
+        return components, state
+"""
+
+
+class TestModularCustomBlocks:
+    """Tests for defining, running, saving and reloading a custom `ModularPipelineBlocks` subclass."""
+
+    def _test_block_properties(self, block):
+        """Check that `block` declares no components or intermediate inputs, one input and one output."""
+        assert not block.expected_components
+        assert not block.intermediate_inputs
+
+        input_names = [inp.name for inp in block.inputs]
+        output_names = [out.name for out in block.intermediate_outputs]
+        assert input_names == ["prompt"]
+        assert output_names == ["output_prompt"]
+
+    def _test_pipeline_output(self, block):
+        """Run `block` as a pipeline and check the keys of the returned state and the rewritten prompt."""
+        pipe = block.init_pipeline()
+        prompt = "Diffusers is nice"
+        output = pipe(prompt=prompt)
+
+        input_names = [inp.name for inp in block.inputs]
+        output_names = [out.name for out in block.intermediate_outputs]
+        assert sorted(output.values) == sorted(input_names + output_names)
+
+        output_prompt = output.values["output_prompt"]
+        assert output_prompt.startswith("Modular diffusers + ")
+
+    def test_custom_block_properties(self):
+        self._test_block_properties(DummyCustomBlockSimple())
+
+    def test_custom_block_output(self):
+        self._test_pipeline_output(DummyCustomBlockSimple())
+
+    def test_custom_block_saving_loading(self):
+        """Round-trip the block through `save_pretrained`/`from_pretrained` and run the reloaded copy."""
+        custom_block = DummyCustomBlockSimple()
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            custom_block.save_pretrained(tmpdir)
+            assert any("modular_config.json" in k for k in os.listdir(tmpdir))
+
+            with open(os.path.join(tmpdir, "modular_config.json"), "r") as f:
+                config = json.load(f)
+            auto_map = config["auto_map"]
+            assert auto_map == {"ModularPipelineBlocks": "test_modular_pipelines_custom_blocks.DummyCustomBlockSimple"}
+
+            # For now, the Python script that implements the custom block has to be manually pushed to the Hub.
+            # This is why we have to separately save the Python script here.
+            code_path = os.path.join(tmpdir, "test_modular_pipelines_custom_blocks.py")
+            with open(code_path, "w") as f:
+                f.write(CODE_STR)
+
+            loaded_custom_block = ModularPipelineBlocks.from_pretrained(tmpdir, trust_remote_code=True)
+
+        # The temporary directory has been removed by now; the reloaded block must still run.
+        self._test_pipeline_output(loaded_custom_block)
+
+    def test_custom_block_supported_components(self):
+        """With the dummy component enabled, `load_components` yields exactly one `transformer` component."""
+        custom_block = DummyCustomBlockSimple(use_dummy_model_component=True)
+        pipe = custom_block.init_pipeline("hf-internal-testing/tiny-flux-kontext-pipe")
+        pipe.load_components()
+
+        assert len(pipe.components) == 1
+        assert pipe.component_names[0] == "transformer"
+
+    def test_custom_block_loads_from_hub(self):
+        """Load a published custom block with `trust_remote_code=True` and run it as a pipeline."""
+        repo_id = "hf-internal-testing/tiny-modular-diffusers-block"
+        block = ModularPipelineBlocks.from_pretrained(repo_id, trust_remote_code=True)
+        self._test_block_properties(block)
+
+        pipe = block.init_pipeline()
+
+        prompt = "Diffusers is nice"
+        output = pipe(prompt=prompt)
+        output_prompt = output.values["output_prompt"]
+        assert output_prompt.startswith("Modular diffusers + ")
+
+
+@slow
+@nightly
+@require_torch
+class TestKreaCustomBlocksIntegration:
+    """Integration tests that load the Krea realtime video blocks from the Hub and run them on CUDA."""
+
+    repo_id = "krea/krea-realtime-video"
+
+    def _load_pipeline(self, blocks):
+        """Wrap `blocks` in a `WanModularPipeline` and load its components with the test's dtype map."""
+        pipe = WanModularPipeline(blocks, self.repo_id)
+        dtype_map = {"default": torch.bfloat16, "vae": torch.float16}
+        pipe.load_components(trust_remote_code=True, device_map="cuda", torch_dtype=dtype_map)
+        return pipe
+
+    def test_loading_from_hub(self):
+        blocks = ModularPipelineBlocks.from_pretrained(self.repo_id, trust_remote_code=True)
+        expected_blocks = ["text_encoder", "before_denoise", "denoise", "decode"]
+        assert sorted(blocks.sub_blocks) == sorted(expected_blocks)
+
+        pipe = self._load_pipeline(blocks)
+        assert len(pipe.components) == 7
+        expected = ["text_encoder", "tokenizer", "guider", "scheduler", "vae", "transformer", "video_processor"]
+        assert sorted(pipe.components) == sorted(expected)
+
+    def test_forward(self):
+        """Generate two consecutive blocks of frames and compare shape and edge values with references."""
+        blocks = ModularPipelineBlocks.from_pretrained(self.repo_id, trust_remote_code=True)
+        pipe = self._load_pipeline(blocks)
+
+        num_frames_per_block = 2
+        num_blocks = 2
+
+        # The state returned by one call is fed into the next; it starts with a bounded frame cache.
+        state = PipelineState()
+        state.set("frame_cache_context", deque(maxlen=pipe.config.frame_cache_len))
+
+        prompt = ["a cat sitting on a boat"]
+
+        for block in pipe.transformer.blocks:
+            block.self_attn.fuse_projections()
+
+        # Per block: expected array shape, then the first four and last four flattened values.
+        references = [
+            ((5, 480, 832, 3), [211, 229, 238, 208, 195, 180, 188, 193]),
+            ((8, 480, 832, 3), [179, 203, 214, 176, 194, 181, 187, 191]),
+        ]
+
+        for block_idx in range(num_blocks):
+            state = pipe(
+                state,
+                prompt=prompt,
+                num_inference_steps=2,
+                num_blocks=num_blocks,
+                num_frames_per_block=num_frames_per_block,
+                block_idx=block_idx,
+                generator=torch.manual_seed(42),
+            )
+            frames = np.array(state.values["videos"][0])
+            flat_frames = frames.flatten()
+            actual_slices = np.concatenate([flat_frames[:4], flat_frames[-4:]]).tolist()
+
+            expected_shape, expected_values = references[block_idx]
+            assert frames.shape == expected_shape
+            assert np.allclose(actual_slices, np.array(expected_values))
--- a/tests/pipelines/cosmos/cosmos_guardrail.py
+++ b/tests/pipelines/cosmos/cosmos_guardrail.py
@@ -27,7 +27,7 @@ class DummyCosmosSafetyChecker(ModelMixin, ConfigMixin):
    def __init__(self) -> None:
        super().__init__()

-        self.register_buffer("_device_tracker", torch.zeros(1, dtype=torch.float32), persistent=False)
+        self._dtype = torch.float32

    def check_text_safety(self, prompt: str) -> bool:
        return True
@@ -35,14 +35,13 @@ class DummyCosmosSafetyChecker(ModelMixin, ConfigMixin):
    def check_video_safety(self, frames: np.ndarray) -> np.ndarray:
        return frames

-    def to(self, device: Union[str, torch.device] = None, dtype: torch.dtype = None):
-        module = super().to(device=device, dtype=dtype)
-        return module
+    def to(self, device: Union[str, torch.device] = None, dtype: torch.dtype = None) -> None:
+        self._dtype = dtype

    @property
    def device(self) -> torch.device:
-        return self._device_tracker.device
+        return None

    @property
    def dtype(self) -> torch.dtype:
-        return self._device_tracker.dtype
+        return self._dtype
--- a/tests/pipelines/cosmos/test_cosmos2_5_predict.py
+++ b/tests/pipelines/cosmos/test_cosmos2_5_predict.py
@@ -1,337 +0,0 @@
-# Copyright 2025 The HuggingFace Team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import inspect
-import json
-import os
-import tempfile
-import unittest
-
-import numpy as np
-import torch
-from transformers import Qwen2_5_VLConfig, Qwen2_5_VLForConditionalGeneration, Qwen2Tokenizer
-
-from diffusers import (
-    AutoencoderKLWan,
-    Cosmos2_5_PredictBasePipeline,
-    CosmosTransformer3DModel,
-    UniPCMultistepScheduler,
-)
-
-from ...testing_utils import enable_full_determinism, torch_device
-from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS
-from ..test_pipelines_common import PipelineTesterMixin, to_np
-from .cosmos_guardrail import DummyCosmosSafetyChecker
-
-
-enable_full_determinism()
-
-
-class Cosmos2_5_PredictBaseWrapper(Cosmos2_5_PredictBasePipeline):
-    @staticmethod
-    def from_pretrained(*args, **kwargs):
-        if "safety_checker" not in kwargs or kwargs["safety_checker"] is None:
-            safety_checker = DummyCosmosSafetyChecker()
-            device_map = kwargs.get("device_map", "cpu")
-            torch_dtype = kwargs.get("torch_dtype")
-            if device_map is not None or torch_dtype is not None:
-                safety_checker = safety_checker.to(device_map, dtype=torch_dtype)
-            kwargs["safety_checker"] = safety_checker
-        return Cosmos2_5_PredictBasePipeline.from_pretrained(*args, **kwargs)
-
-
-class Cosmos2_5_PredictPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
-    pipeline_class = Cosmos2_5_PredictBaseWrapper
-    params = TEXT_TO_IMAGE_PARAMS - {"cross_attention_kwargs"}
-    batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
-    image_params = TEXT_TO_IMAGE_IMAGE_PARAMS
-    image_latents_params = TEXT_TO_IMAGE_IMAGE_PARAMS
-    required_optional_params = frozenset(
-        [
-            "num_inference_steps",
-            "generator",
-            "latents",
-            "return_dict",
-            "callback_on_step_end",
-            "callback_on_step_end_tensor_inputs",
-        ]
-    )
-    supports_dduf = False
-    test_xformers_attention = False
-    test_layerwise_casting = True
-    test_group_offloading = True
-
-    def get_dummy_components(self):
-        torch.manual_seed(0)
-        transformer = CosmosTransformer3DModel(
-            in_channels=16 + 1,
-            out_channels=16,
-            num_attention_heads=2,
-            attention_head_dim=16,
-            num_layers=2,
-            mlp_ratio=2,
-            text_embed_dim=32,
-            adaln_lora_dim=4,
-            max_size=(4, 32, 32),
-            patch_size=(1, 2, 2),
-            rope_scale=(2.0, 1.0, 1.0),
-            concat_padding_mask=True,
-            extra_pos_embed_type="learnable",
-        )
-
-        torch.manual_seed(0)
-        vae = AutoencoderKLWan(
-            base_dim=3,
-            z_dim=16,
-            dim_mult=[1, 1, 1, 1],
-            num_res_blocks=1,
-            temperal_downsample=[False, True, True],
-        )
-
-        torch.manual_seed(0)
-        scheduler = UniPCMultistepScheduler()
-
-        torch.manual_seed(0)
-        config = Qwen2_5_VLConfig(
-            text_config={
-                "hidden_size": 16,
-                "intermediate_size": 16,
-                "num_hidden_layers": 2,
-                "num_attention_heads": 2,
-                "num_key_value_heads": 2,
-                "rope_scaling": {
-                    "mrope_section": [1, 1, 2],
-                    "rope_type": "default",
-                    "type": "default",
-                },
-                "rope_theta": 1000000.0,
-            },
-            vision_config={
-                "depth": 2,
-                "hidden_size": 16,
-                "intermediate_size": 16,
-                "num_heads": 2,
-                "out_hidden_size": 16,
-            },
-            hidden_size=16,
-            vocab_size=152064,
-            vision_end_token_id=151653,
-            vision_start_token_id=151652,
-            vision_token_id=151654,
-        )
-        text_encoder = Qwen2_5_VLForConditionalGeneration(config)
-        tokenizer = Qwen2Tokenizer.from_pretrained("hf-internal-testing/tiny-random-Qwen2VLForConditionalGeneration")
-
-        components = {
-            "transformer": transformer,
-            "vae": vae,
-            "scheduler": scheduler,
-            "text_encoder": text_encoder,
-            "tokenizer": tokenizer,
-            "safety_checker": DummyCosmosSafetyChecker(),
-        }
-        return components
-
-    def get_dummy_inputs(self, device, seed=0):
-        if str(device).startswith("mps"):
-            generator = torch.manual_seed(seed)
-        else:
-            generator = torch.Generator(device=device).manual_seed(seed)
-
-        inputs = {
-            "prompt": "dance monkey",
-            "negative_prompt": "bad quality",
-            "generator": generator,
-            "num_inference_steps": 2,
-            "guidance_scale": 3.0,
-            "height": 32,
-            "width": 32,
-            "num_frames": 3,
-            "max_sequence_length": 16,
-            "output_type": "pt",
-        }
-
-        return inputs
-
-    def test_components_function(self):
-        init_components = self.get_dummy_components()
-        init_components = {k: v for k, v in init_components.items() if not isinstance(v, (str, int, float))}
-        pipe = self.pipeline_class(**init_components)
-        self.assertTrue(hasattr(pipe, "components"))
-        self.assertTrue(set(pipe.components.keys()) == set(init_components.keys()))
-
-    def test_inference(self):
-        device = "cpu"
-
-        components = self.get_dummy_components()
-        pipe = self.pipeline_class(**components)
-        pipe.to(device)
-        pipe.set_progress_bar_config(disable=None)
-
-        inputs = self.get_dummy_inputs(device)
-        video = pipe(**inputs).frames
-        generated_video = video[0]
-        self.assertEqual(generated_video.shape, (3, 3, 32, 32))
-        self.assertTrue(torch.isfinite(generated_video).all())
-
-    def test_callback_inputs(self):
-        sig = inspect.signature(self.pipeline_class.__call__)
-        has_callback_tensor_inputs = "callback_on_step_end_tensor_inputs" in sig.parameters
-        has_callback_step_end = "callback_on_step_end" in sig.parameters
-
-        if not (has_callback_tensor_inputs and has_callback_step_end):
-            return
-
-        components = self.get_dummy_components()
-        pipe = self.pipeline_class(**components)
-        pipe = pipe.to(torch_device)
-        pipe.set_progress_bar_config(disable=None)
-        self.assertTrue(
-            hasattr(pipe, "_callback_tensor_inputs"),
-            f" {self.pipeline_class} should have `_callback_tensor_inputs` that defines a list of tensor variables its callback function can use as inputs",
-        )
-
-        def callback_inputs_subset(pipe, i, t, callback_kwargs):
-            for tensor_name in callback_kwargs.keys():
-                assert tensor_name in pipe._callback_tensor_inputs
-            return callback_kwargs
-
-        def callback_inputs_all(pipe, i, t, callback_kwargs):
-            for tensor_name in pipe._callback_tensor_inputs:
-                assert tensor_name in callback_kwargs
-            for tensor_name in callback_kwargs.keys():
-                assert tensor_name in pipe._callback_tensor_inputs
-            return callback_kwargs
-
-        inputs = self.get_dummy_inputs(torch_device)
-
-        inputs["callback_on_step_end"] = callback_inputs_subset
-        inputs["callback_on_step_end_tensor_inputs"] = ["latents"]
-        _ = pipe(**inputs)[0]
-
-        inputs["callback_on_step_end"] = callback_inputs_all
-        inputs["callback_on_step_end_tensor_inputs"] = pipe._callback_tensor_inputs
-        _ = pipe(**inputs)[0]
-
-        def callback_inputs_change_tensor(pipe, i, t, callback_kwargs):
-            is_last = i == (pipe.num_timesteps - 1)
-            if is_last:
-                callback_kwargs["latents"] = torch.zeros_like(callback_kwargs["latents"])
-            return callback_kwargs
-
-        inputs["callback_on_step_end"] = callback_inputs_change_tensor
-        inputs["callback_on_step_end_tensor_inputs"] = pipe._callback_tensor_inputs
-        output = pipe(**inputs)[0]
-        assert output.abs().sum() < 1e10
-
-    def test_inference_batch_single_identical(self):
-        self._test_inference_batch_single_identical(batch_size=2, expected_max_diff=1e-2)
-
-    def test_attention_slicing_forward_pass(
-        self, test_max_difference=True, test_mean_pixel_difference=True, expected_max_diff=1e-3
-    ):
-        if not getattr(self, "test_attention_slicing", True):
-            return
-
-        components = self.get_dummy_components()
-        pipe = self.pipeline_class(**components)
-        for component in pipe.components.values():
-            if hasattr(component, "set_default_attn_processor"):
-                component.set_default_attn_processor()
-        pipe.to(torch_device)
-        pipe.set_progress_bar_config(disable=None)
-
-        generator_device = "cpu"
-        inputs = self.get_dummy_inputs(generator_device)
-        output_without_slicing = pipe(**inputs)[0]
-
-        pipe.enable_attention_slicing(slice_size=1)
-        inputs = self.get_dummy_inputs(generator_device)
-        output_with_slicing1 = pipe(**inputs)[0]
-
-        pipe.enable_attention_slicing(slice_size=2)
-        inputs = self.get_dummy_inputs(generator_device)
-        output_with_slicing2 = pipe(**inputs)[0]
-
-        if test_max_difference:
-            max_diff1 = np.abs(to_np(output_with_slicing1) - to_np(output_without_slicing)).max()
-            max_diff2 = np.abs(to_np(output_with_slicing2) - to_np(output_without_slicing)).max()
-            self.assertLess(
-                max(max_diff1, max_diff2),
-                expected_max_diff,
-                "Attention slicing should not affect the inference results",
-            )
-
-    def test_save_load_optional_components(self, expected_max_difference=1e-4):
-        self.pipeline_class._optional_components.remove("safety_checker")
-        super().test_save_load_optional_components(expected_max_difference=expected_max_difference)
-        self.pipeline_class._optional_components.append("safety_checker")
-
-    def test_serialization_with_variants(self):
-        components = self.get_dummy_components()
-        pipe = self.pipeline_class(**components)
-        model_components = [
-            component_name
-            for component_name, component in pipe.components.items()
-            if isinstance(component, torch.nn.Module)
-        ]
-        model_components.remove("safety_checker")
-        variant = "fp16"
-
-        with tempfile.TemporaryDirectory() as tmpdir:
-            pipe.save_pretrained(tmpdir, variant=variant, safe_serialization=False)
-
-            with open(f"{tmpdir}/model_index.json", "r") as f:
-                config = json.load(f)
-
-            for subfolder in os.listdir(tmpdir):
-                if not os.path.isfile(subfolder) and subfolder in model_components:
-                    folder_path = os.path.join(tmpdir, subfolder)
-                    is_folder = os.path.isdir(folder_path) and subfolder in config
-                    assert is_folder and any(p.split(".")[1].startswith(variant) for p in os.listdir(folder_path))
-
-    def test_torch_dtype_dict(self):
-        components = self.get_dummy_components()
-        if not components:
-            self.skipTest("No dummy components defined.")
-
-        pipe = self.pipeline_class(**components)
-
-        specified_key = next(iter(components.keys()))
-
-        with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as tmpdirname:
-            pipe.save_pretrained(tmpdirname, safe_serialization=False)
-            torch_dtype_dict = {specified_key: torch.bfloat16, "default": torch.float16}
-            loaded_pipe = self.pipeline_class.from_pretrained(
-                tmpdirname, safety_checker=DummyCosmosSafetyChecker(), torch_dtype=torch_dtype_dict
-            )
-
-        for name, component in loaded_pipe.components.items():
-            if name == "safety_checker":
-                continue
-            if isinstance(component, torch.nn.Module) and hasattr(component, "dtype"):
-                expected_dtype = torch_dtype_dict.get(name, torch_dtype_dict.get("default", torch.float32))
-                self.assertEqual(
-                    component.dtype,
-                    expected_dtype,
-                    f"Component '{name}' has dtype {component.dtype} but expected {expected_dtype}",
-                )
-
-    @unittest.skip(
-        "The pipeline should not be runnable without a safety checker. The test creates a pipeline without passing in "
-        "a safety checker, which makes the pipeline default to the actual Cosmos Guardrail. The Cosmos Guardrail is "
-        "too large and slow to run on CI."
-    )
-    def test_encode_prompt_works_in_isolation(self):
-        pass
--- a/tests/pipelines/longcat_image/init.py
+++ b/tests/pipelines/longcat_image/init.py
--- a/tests/quantization/torchao/test_torchao.py
+++ b/tests/quantization/torchao/test_torchao.py
@@ -35,7 +35,6 @@ from diffusers.models.attention_processor import Attention
 from diffusers.quantizers import PipelineQuantizationConfig

 from ...testing_utils import (
-    Expectations,
    backend_empty_cache,
    backend_synchronize,
    enable_full_determinism,
@@ -498,23 +497,8 @@ class TorchAoTest(unittest.TestCase):

    def test_model_memory_usage(self):
        model_id = "hf-internal-testing/tiny-flux-pipe"
-        expected_memory_saving_ratios = Expectations(
-            {
-                # XPU: For this tiny model, per-tensor overheads (alignment, fragmentation, metadata) become visible.
-                # While XPU doesn't have the large fixed cuBLAS workspace of A100, these small overheads prevent reaching the ideal 2.0 ratio.
-                # Observed ~1.27x (158k vs 124k) for model size.
-                # The runtime memory overhead is ~88k for both bf16 and int8wo. Adding this to model size: (158k+88k)/(124k+88k) ≈ 1.15.
-                ("xpu", None): 1.15,
-                # On Ampere, the cuBLAS kernels used for matrix multiplication often allocate a fixed-size workspace.
-                # Since the tiny-flux model weights are likely smaller than or comparable to this workspace, the total memory is dominated by the workspace.
-                ("cuda", 8): 1.02,
-                # On Hopper, TorchAO utilizes newer, highly optimized kernels (via Triton or CUTLASS 3.x) that are designed to be workspace-free or use negligible extra memory.
-                # Additionally, Triton kernels often handle unaligned memory better, avoiding the padding overhead seen on other backends for tiny tensors.
-                # This allows it to achieve the near-ideal 2.0x compression ratio.
-                ("cuda", 9): 2.0,
-            }
-        )
-        expected_memory_saving_ratio = expected_memory_saving_ratios.get_expectation()
+        expected_memory_saving_ratio = 2.0
+
        inputs = self.get_dummy_tensor_inputs(device=torch_device)

        transformer_bf16 = self.get_dummy_components(None, model_id=model_id)["transformer"]
--- a/tests/schedulers/test_scheduler_unipc.py
+++ b/tests/schedulers/test_scheduler_unipc.py
@@ -399,32 +399,3 @@ class UniPCMultistepScheduler1DTest(UniPCMultistepSchedulerTest):

    def test_exponential_sigmas(self):
        self.check_over_configs(use_exponential_sigmas=True)
-
-    def test_flow_and_karras_sigmas(self):
-        self.check_over_configs(use_flow_sigmas=True, use_karras_sigmas=True)
-
-    def test_flow_and_karras_sigmas_values(self):
-        num_train_timesteps = 1000
-        num_inference_steps = 5
-        scheduler = UniPCMultistepScheduler(
-            sigma_min=0.01,
-            sigma_max=200.0,
-            use_flow_sigmas=True,
-            use_karras_sigmas=True,
-            num_train_timesteps=num_train_timesteps,
-        )
-        scheduler.set_timesteps(num_inference_steps=num_inference_steps)
-
-        expected_sigmas = [
-            0.9950248599052429,
-            0.9787454605102539,
-            0.8774884343147278,
-            0.3604971766471863,
-            0.009900986216962337,
-            0.0,  # 0 appended as default
-        ]
-        expected_sigmas = torch.tensor(expected_sigmas)
-        expected_timesteps = (expected_sigmas * num_train_timesteps).to(torch.int64)
-        expected_timesteps = expected_timesteps[0:-1]
-        self.assertTrue(torch.allclose(scheduler.sigmas, expected_sigmas))
-        self.assertTrue(torch.all(expected_timesteps == scheduler.timesteps))
Author	SHA1	Message	Date
Sayak Paul	6899fe3b48	Merge branch 'main' into custom-modular-tests	2025-12-15 20:27:45 +08:00
Sayak Paul	67536f9d9b	Merge branch 'main' into custom-modular-tests	2025-12-08 16:29:06 +08:00
sayakpaul	3eb1f0efe9	resolve conflicts	2025-12-02 21:29:28 +08:00
sayakpaul	1c91475008	up	2025-11-11 17:54:01 +05:30
sayakpaul	6375c02130	resolve conflicts.,	2025-11-11 17:52:53 +05:30
Sayak Paul	e0b1383868	Merge branch 'main' into custom-modular-tests	2025-11-11 09:39:22 +05:30
Sayak Paul	54ddce87fd	Merge branch 'main' into custom-modular-tests	2025-11-10 09:56:58 +05:30
Sayak Paul	c0ce538afc	Apply suggestions from code review	2025-11-03 08:31:06 +05:30
Sayak Paul	fd88f3d3fc	Merge branch 'main' into custom-modular-tests	2025-11-03 08:28:52 +05:30
Sayak Paul	ea4f29f0e8	Merge branch 'main' into custom-modular-tests	2025-10-31 15:53:03 +05:30
sayakpaul	b8809f76d5	up	2025-10-31 15:52:19 +05:30
Sayak Paul	728655ca01	Merge branch 'main' into custom-modular-tests	2025-10-30 08:47:18 +05:30
sayakpaul	9f113f8138	up	2025-10-29 21:25:21 +05:30
sayakpaul	b5f13d9b59	up	2025-10-29 18:28:06 +05:30
sayakpaul	ddb5ba734d	up	2025-10-29 18:27:31 +05:30
sayakpaul	5f1afc11ac	up	2025-10-29 18:19:07 +05:30
sayakpaul	ecdd843044	up	2025-10-29 17:10:10 +05:30
sayakpaul	316b71ff2b	style.	2025-10-29 17:03:34 +05:30
sayakpaul	1be88f036f	up	2025-10-29 17:03:02 +05:30
sayakpaul	77e50155e6	simplify modular workflow ci.	2025-10-29 16:43:39 +05:30
sayakpaul	760a9149a7	start custom block testing.	2025-10-29 16:40:53 +05:30