Compare commits

..

2 Commits

Author     SHA1        Message                          Date
sayakpaul  f70010ca5d  up                               2025-12-18 11:37:01 +05:30
sayakpaul  2f0b35fd84  start qwenimage layer testsing.  2025-12-18 09:58:17 +05:30
30 changed files with 2598 additions and 3470 deletions

View File

@@ -70,12 +70,6 @@ output.save("output.png")
- all
- __call__
## Cosmos2_5_PredictBasePipeline
[[autodoc]] Cosmos2_5_PredictBasePipeline
- all
- __call__
## CosmosPipelineOutput
[[autodoc]] pipelines.cosmos.pipeline_output.CosmosPipelineOutput

View File

@@ -1,55 +1,11 @@
"""
# Cosmos 2 Predict
Download checkpoint
```bash
hf download nvidia/Cosmos-Predict2-2B-Text2Image
```
convert checkpoint
```bash
transformer_ckpt_path=~/.cache/huggingface/hub/models--nvidia--Cosmos-Predict2-2B-Text2Image/snapshots/acdb5fde992a73ef0355f287977d002cbfd127e0/model.pt
python scripts/convert_cosmos_to_diffusers.py \
--transformer_ckpt_path $transformer_ckpt_path \
--transformer_type Cosmos-2.0-Diffusion-2B-Text2Image \
--text_encoder_path google-t5/t5-11b \
--tokenizer_path google-t5/t5-11b \
--vae_type wan2.1 \
--output_path converted/cosmos-p2-t2i-2b \
--save_pipeline
```
# Cosmos 2.5 Predict
Download checkpoint
```bash
hf download nvidia/Cosmos-Predict2.5-2B
```
Convert checkpoint
```bash
transformer_ckpt_path=~/.cache/huggingface/hub/models--nvidia--Cosmos-Predict2.5-2B/snapshots/865baf084d4c9e850eac59a021277d5a9b9e8b63/base/pre-trained/d20b7120-df3e-4911-919d-db6e08bad31c_ema_bf16.pt
python scripts/convert_cosmos_to_diffusers.py \
--transformer_type Cosmos-2.5-Predict-Base-2B \
--transformer_ckpt_path $transformer_ckpt_path \
--vae_type wan2.1 \
--output_path converted/cosmos-p2.5-base-2b \
--save_pipeline
```
"""
import argparse
import pathlib
import sys
from typing import Any, Dict
import torch
from accelerate import init_empty_weights
from huggingface_hub import snapshot_download
from transformers import AutoTokenizer, Qwen2_5_VLForConditionalGeneration, T5EncoderModel, T5TokenizerFast
from transformers import T5EncoderModel, T5TokenizerFast
from diffusers import (
AutoencoderKLCosmos,
@@ -61,9 +17,7 @@ from diffusers import (
CosmosVideoToWorldPipeline,
EDMEulerScheduler,
FlowMatchEulerDiscreteScheduler,
UniPCMultistepScheduler,
)
from diffusers.pipelines.cosmos.pipeline_cosmos2_5_predict import Cosmos2_5_PredictBasePipeline
def remove_keys_(key: str, state_dict: Dict[str, Any]):
@@ -279,25 +233,6 @@ TRANSFORMER_CONFIGS = {
"concat_padding_mask": True,
"extra_pos_embed_type": None,
},
"Cosmos-2.5-Predict-Base-2B": {
"in_channels": 16 + 1,
"out_channels": 16,
"num_attention_heads": 16,
"attention_head_dim": 128,
"num_layers": 28,
"mlp_ratio": 4.0,
"text_embed_dim": 1024,
"adaln_lora_dim": 256,
"max_size": (128, 240, 240),
"patch_size": (1, 2, 2),
"rope_scale": (1.0, 3.0, 3.0),
"concat_padding_mask": True,
# NOTE: source config has pos_emb_learnable: 'True' - but params are missing
"extra_pos_embed_type": None,
"use_crossattn_projection": True,
"crossattn_proj_in_channels": 100352,
"encoder_hidden_states_channels": 1024,
},
}
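Each `TRANSFORMER_CONFIGS` entry is a set of `CosmosTransformer3DModel` constructor kwargs. A sketch of how such an entry can be materialized as an empty model before remapped weights are loaded; the meta-device context comes from `accelerate`, which this script already imports:
```python
# Sketch (assumption: this mirrors how the converter builds the empty model):
# instantiate the transformer from a config entry without allocating weights.
from accelerate import init_empty_weights
from diffusers import CosmosTransformer3DModel

config = TRANSFORMER_CONFIGS["Cosmos-2.0-Diffusion-2B-Text2Image"]
with init_empty_weights():
    transformer = CosmosTransformer3DModel(**config)
```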
VAE_KEYS_RENAME_DICT = {
@@ -399,9 +334,6 @@ def convert_transformer(transformer_type: str, ckpt_path: str, weights_only: boo
elif "Cosmos-2.0" in transformer_type:
TRANSFORMER_KEYS_RENAME_DICT = TRANSFORMER_KEYS_RENAME_DICT_COSMOS_2_0
TRANSFORMER_SPECIAL_KEYS_REMAP = TRANSFORMER_SPECIAL_KEYS_REMAP_COSMOS_2_0
elif "Cosmos-2.5" in transformer_type:
TRANSFORMER_KEYS_RENAME_DICT = TRANSFORMER_KEYS_RENAME_DICT_COSMOS_2_0
TRANSFORMER_SPECIAL_KEYS_REMAP = TRANSFORMER_SPECIAL_KEYS_REMAP_COSMOS_2_0
else:
assert False
@@ -415,7 +347,6 @@ def convert_transformer(transformer_type: str, ckpt_path: str, weights_only: boo
new_key = new_key.removeprefix(PREFIX_KEY)
for replace_key, rename_key in TRANSFORMER_KEYS_RENAME_DICT.items():
new_key = new_key.replace(replace_key, rename_key)
print(key, "->", new_key, flush=True)
update_state_dict_(original_state_dict, key, new_key)
for key in list(original_state_dict.keys()):
@@ -424,21 +355,6 @@ def convert_transformer(transformer_type: str, ckpt_path: str, weights_only: boo
continue
handler_fn_inplace(key, original_state_dict)
expected_keys = set(transformer.state_dict().keys())
mapped_keys = set(original_state_dict.keys())
missing_keys = expected_keys - mapped_keys
unexpected_keys = mapped_keys - expected_keys
if missing_keys:
print(f"ERROR: missing keys ({len(missing_keys)} from state_dict:", flush=True, file=sys.stderr)
for k in missing_keys:
print(k)
sys.exit(1)
if unexpected_keys:
print(f"ERROR: unexpected keys ({len(unexpected_keys)}) from state_dict:", flush=True, file=sys.stderr)
for k in unexpected_keys:
print(k)
sys.exit(2)
transformer.load_state_dict(original_state_dict, strict=True, assign=True)
return transformer
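The rename pass in `convert_transformer` is a plain substring replacement applied key by key. A toy illustration; the prefix and dictionary entries here are hypothetical, not the real `TRANSFORMER_KEYS_RENAME_DICT` contents:
```python
# Toy rename pass; "net." as the prefix and the dict entries are hypothetical.
PREFIX_KEY = "net."
rename_dict = {"blocks.": "transformer_blocks.", "t_embedder.": "time_embed."}

key = "net.blocks.0.attn.to_q.weight"
new_key = key.removeprefix(PREFIX_KEY)
for replace_key, rename_key in rename_dict.items():
    new_key = new_key.replace(replace_key, rename_key)
print(new_key)  # transformer_blocks.0.attn.to_q.weight
```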
@@ -528,34 +444,6 @@ def save_pipeline_cosmos_2_0(args, transformer, vae):
pipe.save_pretrained(args.output_path, safe_serialization=True, max_shard_size="5GB")
def save_pipeline_cosmos2_5(args, transformer, vae):
text_encoder_path = args.text_encoder_path or "nvidia/Cosmos-Reason1-7B"
tokenizer_path = args.tokenizer_path or "Qwen/Qwen2.5-VL-7B-Instruct"
text_encoder = Qwen2_5_VLForConditionalGeneration.from_pretrained(
text_encoder_path, torch_dtype="auto", device_map="cpu"
)
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
scheduler = UniPCMultistepScheduler(
use_karras_sigmas=True,
use_flow_sigmas=True,
prediction_type="flow_prediction",
sigma_max=200.0,
sigma_min=0.01,
)
pipe = Cosmos2_5_PredictBasePipeline(
text_encoder=text_encoder,
tokenizer=tokenizer,
transformer=transformer,
vae=vae,
scheduler=scheduler,
safety_checker=lambda *args, **kwargs: None,
)
pipe.save_pretrained(args.output_path, safe_serialization=True, max_shard_size="5GB")
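A hedged sketch of reloading what this function writes; the class import path appears at the top of this script, and the local path matches the `--output_path` in the module docstring:
```python
# Hedged reload sketch for the pipeline saved above.
import torch
from diffusers.pipelines.cosmos.pipeline_cosmos2_5_predict import Cosmos2_5_PredictBasePipeline

pipe = Cosmos2_5_PredictBasePipeline.from_pretrained(
    "converted/cosmos-p2.5-base-2b", torch_dtype=torch.bfloat16
)
```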
def get_args():
parser = argparse.ArgumentParser()
parser.add_argument("--transformer_type", type=str, default=None, choices=list(TRANSFORMER_CONFIGS.keys()))
@@ -563,10 +451,10 @@ def get_args():
"--transformer_ckpt_path", type=str, default=None, help="Path to original transformer checkpoint"
)
parser.add_argument(
"--vae_type", type=str, default="wan2.1", choices=["wan2.1", *list(VAE_CONFIGS.keys())], help="Type of VAE"
"--vae_type", type=str, default=None, choices=["none", *list(VAE_CONFIGS.keys())], help="Type of VAE"
)
parser.add_argument("--text_encoder_path", type=str, default=None)
parser.add_argument("--tokenizer_path", type=str, default=None)
parser.add_argument("--text_encoder_path", type=str, default="google-t5/t5-11b")
parser.add_argument("--tokenizer_path", type=str, default="google-t5/t5-11b")
parser.add_argument("--save_pipeline", action="store_true")
parser.add_argument("--output_path", type=str, required=True, help="Path where converted model should be saved")
parser.add_argument("--dtype", default="bf16", help="Torch dtype to save the transformer in.")
@@ -589,6 +477,8 @@ if __name__ == "__main__":
if args.save_pipeline:
assert args.transformer_ckpt_path is not None
assert args.vae_type is not None
assert args.text_encoder_path is not None
assert args.tokenizer_path is not None
if args.transformer_ckpt_path is not None:
weights_only = "Cosmos-1.0" in args.transformer_type
@@ -600,26 +490,17 @@ if __name__ == "__main__":
if args.vae_type is not None:
if "Cosmos-1.0" in args.transformer_type:
vae = convert_vae(args.vae_type)
elif "Cosmos-2.0" in args.transformer_type or "Cosmos-2.5" in args.transformer_type:
else:
vae = AutoencoderKLWan.from_pretrained(
"Wan-AI/Wan2.1-T2V-1.3B-Diffusers", subfolder="vae", torch_dtype=torch.float32
)
else:
raise AssertionError(f"{args.transformer_type} not supported")
if not args.save_pipeline:
vae.save_pretrained(args.output_path, safe_serialization=True, max_shard_size="5GB")
if args.save_pipeline:
if "Cosmos-1.0" in args.transformer_type:
assert args.text_encoder_path is not None
assert args.tokenizer_path is not None
save_pipeline_cosmos_1_0(args, transformer, vae)
elif "Cosmos-2.0" in args.transformer_type:
assert args.text_encoder_path is not None
assert args.tokenizer_path is not None
save_pipeline_cosmos_2_0(args, transformer, vae)
elif "Cosmos-2.5" in args.transformer_type:
save_pipeline_cosmos2_5(args, transformer, vae)
else:
raise AssertionError(f"{args.transformer_type} not supported")
assert False

View File

@@ -463,7 +463,6 @@ else:
"CogView4ControlPipeline",
"CogView4Pipeline",
"ConsisIDPipeline",
"Cosmos2_5_PredictBasePipeline",
"Cosmos2TextToImagePipeline",
"Cosmos2VideoToWorldPipeline",
"CosmosTextToWorldPipeline",
@@ -1176,7 +1175,6 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
CogView4ControlPipeline,
CogView4Pipeline,
ConsisIDPipeline,
Cosmos2_5_PredictBasePipeline,
Cosmos2TextToImagePipeline,
Cosmos2VideoToWorldPipeline,
CosmosTextToWorldPipeline,

View File

@@ -439,9 +439,6 @@ class CosmosTransformer3DModel(ModelMixin, ConfigMixin, FromOriginalModelMixin):
rope_scale: Tuple[float, float, float] = (2.0, 1.0, 1.0),
concat_padding_mask: bool = True,
extra_pos_embed_type: Optional[str] = "learnable",
use_crossattn_projection: bool = False,
crossattn_proj_in_channels: int = 1024,
encoder_hidden_states_channels: int = 1024,
) -> None:
super().__init__()
hidden_size = num_attention_heads * attention_head_dim
@@ -488,12 +485,6 @@ class CosmosTransformer3DModel(ModelMixin, ConfigMixin, FromOriginalModelMixin):
hidden_size, patch_size[0] * patch_size[1] * patch_size[2] * out_channels, bias=False
)
if self.config.use_crossattn_projection:
self.crossattn_proj = nn.Sequential(
nn.Linear(crossattn_proj_in_channels, encoder_hidden_states_channels, bias=True),
nn.GELU(),
)
self.gradient_checkpointing = False
def forward(
@@ -533,7 +524,6 @@ class CosmosTransformer3DModel(ModelMixin, ConfigMixin, FromOriginalModelMixin):
post_patch_num_frames = num_frames // p_t
post_patch_height = height // p_h
post_patch_width = width // p_w
hidden_states = self.patch_embed(hidden_states)
hidden_states = hidden_states.flatten(1, 3) # [B, T, H, W, C] -> [B, THW, C]
@@ -556,9 +546,6 @@ class CosmosTransformer3DModel(ModelMixin, ConfigMixin, FromOriginalModelMixin):
else:
assert False
if self.config.use_crossattn_projection:
encoder_hidden_states = self.crossattn_proj(encoder_hidden_states)
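The projection used above is a single Linear followed by GELU that maps the stacked VLM hidden states down to the transformer's text width. A standalone shape check, with 100352 and 1024 taken from the Cosmos-2.5 config earlier in this diff:
```python
# Standalone shape check of the crossattn projection; channel sizes come from
# crossattn_proj_in_channels / encoder_hidden_states_channels in the 2.5 config.
import torch
import torch.nn as nn

crossattn_proj = nn.Sequential(nn.Linear(100352, 1024, bias=True), nn.GELU())
encoder_hidden_states = torch.randn(2, 512, 100352)
print(crossattn_proj(encoder_hidden_states).shape)  # torch.Size([2, 512, 1024])
```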
# 5. Transformer blocks
for block in self.transformer_blocks:
if torch.is_grad_enabled() and self.gradient_checkpointing:

View File

@@ -360,7 +360,7 @@ class FluxKontextCoreDenoiseStep(SequentialPipelineBlocks):
AUTO_BLOCKS = InsertableDict(
[
("text_encoder", FluxTextEncoderStep()),
("vae_encoder", FluxAutoVaeEncoderStep()),
("image_encoder", FluxAutoVaeEncoderStep()),
("denoise", FluxCoreDenoiseStep()),
("decode", FluxDecodeStep()),
]
@@ -369,7 +369,7 @@ AUTO_BLOCKS = InsertableDict(
AUTO_BLOCKS_KONTEXT = InsertableDict(
[
("text_encoder", FluxTextEncoderStep()),
("vae_encoder", FluxKontextAutoVaeEncoderStep()),
("image_encoder", FluxKontextAutoVaeEncoderStep()),
("denoise", FluxKontextCoreDenoiseStep()),
("decode", FluxDecodeStep()),
]
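A hedged sketch of assembling such a preset into sequential blocks; `from_blocks_dict` appears elsewhere in this diff, and the import path is an assumption:
```python
# Hedged sketch: turn the AUTO_BLOCKS preset above into sequential blocks.
from diffusers.modular_pipelines import SequentialPipelineBlocks

flux_blocks = SequentialPipelineBlocks.from_blocks_dict(AUTO_BLOCKS)
print(list(flux_blocks.sub_blocks.keys()))
# ['text_encoder', 'image_encoder', 'denoise', 'decode']
```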

File diff suppressed because it is too large

View File

@@ -231,7 +231,7 @@ class BlockState:
class ModularPipelineBlocks(ConfigMixin, PushToHubMixin):
"""
Base class for all Pipeline Blocks: ConditionalPipelineBlocks, AutoPipelineBlocks, SequentialPipelineBlocks,
Base class for all Pipeline Blocks: PipelineBlock, AutoPipelineBlocks, SequentialPipelineBlocks,
LoopSequentialPipelineBlocks
[`ModularPipelineBlocks`] provides method to load and save the definition of pipeline blocks.
@@ -501,19 +501,15 @@ class ModularPipelineBlocks(ConfigMixin, PushToHubMixin):
@property
def input_names(self) -> List[str]:
return [input_param.name for input_param in self.inputs if input_param.name is not None]
return [input_param.name for input_param in self.inputs]
@property
def intermediate_output_names(self) -> List[str]:
return [output_param.name for output_param in self.intermediate_outputs if output_param.name is not None]
return [output_param.name for output_param in self.intermediate_outputs]
@property
def output_names(self) -> List[str]:
return [output_param.name for output_param in self.outputs if output_param.name is not None]
@property
def component_names(self) -> List[str]:
return [component.name for component in self.expected_components]
return [output_param.name for output_param in self.outputs]
@property
def doc(self):
@@ -527,10 +523,9 @@ class ModularPipelineBlocks(ConfigMixin, PushToHubMixin):
)
class ConditionalPipelineBlocks(ModularPipelineBlocks):
class AutoPipelineBlocks(ModularPipelineBlocks):
"""
A Pipeline Blocks that conditionally selects a block to run based on the inputs.
Subclasses must implement the `select_block` method to define the logic for selecting the block.
A Pipeline Blocks that automatically selects a block to run based on the inputs.
This class inherits from [`ModularPipelineBlocks`]. Check the superclass documentation for the generic methods the
library implements for all the pipeline blocks (such as loading or saving etc.)
@@ -540,13 +535,12 @@ class ConditionalPipelineBlocks(ModularPipelineBlocks):
Attributes:
block_classes: List of block classes to be used
block_names: List of prefixes for each block
block_trigger_inputs: List of input names that select_block() uses to determine which block to run
block_trigger_inputs: List of input names that trigger specific blocks, with None for default
"""
block_classes = []
block_names = []
block_trigger_inputs = []
default_block_name = None # name of the default block if no trigger inputs are provided, if None, this block can be skipped if no trigger inputs are provided
def __init__(self):
sub_blocks = InsertableDict()
@@ -556,15 +550,26 @@ class ConditionalPipelineBlocks(ModularPipelineBlocks):
else:
sub_blocks[block_name] = block
self.sub_blocks = sub_blocks
if not (len(self.block_classes) == len(self.block_names)):
if not (len(self.block_classes) == len(self.block_names) == len(self.block_trigger_inputs)):
raise ValueError(
f"In {self.__class__.__name__}, the number of block_classes and block_names must be the same."
f"In {self.__class__.__name__}, the number of block_classes, block_names, and block_trigger_inputs must be the same."
)
if self.default_block_name is not None and self.default_block_name not in self.block_names:
default_blocks = [t for t in self.block_trigger_inputs if t is None]
# can only have 1 or 0 default blocks, and the default has to be placed last
# the order of blocks matters here because the first block with matching trigger will be dispatched
# e.g. blocks = [inpaint, img2img] and block_trigger_inputs = ["mask", "image"]
# as long as mask is provided, it is inpaint; if only image is provided, it is img2img
if len(default_blocks) > 1 or (len(default_blocks) == 1 and self.block_trigger_inputs[-1] is not None):
raise ValueError(
f"In {self.__class__.__name__}, default_block_name '{self.default_block_name}' must be one of block_names: {self.block_names}"
f"In {self.__class__.__name__}, exactly one None must be specified as the last element "
"in block_trigger_inputs."
)
# Map trigger inputs to block objects
self.trigger_to_block_map = dict(zip(self.block_trigger_inputs, self.sub_blocks.values()))
self.trigger_to_block_name_map = dict(zip(self.block_trigger_inputs, self.sub_blocks.keys()))
self.block_to_trigger_map = dict(zip(self.sub_blocks.keys(), self.block_trigger_inputs))
@property
def model_name(self):
return next(iter(self.sub_blocks.values())).model_name
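The ordering rule from the comments above can be sketched independently of the library: the first trigger input present wins, and `None` marks the default. A minimal standalone model of that dispatch:
```python
# Standalone sketch of the trigger dispatch (not the library class); names
# echo the inpaint/img2img example in the comments above.
block_trigger_inputs = ["mask", "image", None]
trigger_to_block_map = {"mask": "inpaint", "image": "img2img", None: "text2img"}

def select_block(state: dict) -> str:
    for trigger in block_trigger_inputs:
        if trigger is not None and state.get(trigger) is not None:
            return trigger_to_block_map[trigger]
    return trigger_to_block_map[None]

print(select_block({"mask": 1, "image": 1}))  # inpaint: mask is matched first
print(select_block({"image": 1}))             # img2img
print(select_block({}))                       # text2img (the None default)
```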
@@ -593,11 +598,8 @@ class ConditionalPipelineBlocks(ModularPipelineBlocks):
@property
def required_inputs(self) -> List[str]:
# no default block means this conditional block can be skipped entirely
if self.default_block_name is None:
if None not in self.block_trigger_inputs:
return []
first_block = next(iter(self.sub_blocks.values()))
required_by_all = set(getattr(first_block, "required_inputs", set()))
@@ -608,7 +610,7 @@ class ConditionalPipelineBlocks(ModularPipelineBlocks):
return list(required_by_all)
# YiYi TODO: add test for this
@property
def inputs(self) -> List[Tuple[str, Any]]:
named_inputs = [(name, block.inputs) for name, block in self.sub_blocks.items()]
@@ -633,69 +635,22 @@ class ConditionalPipelineBlocks(ModularPipelineBlocks):
combined_outputs = self.combine_outputs(*named_outputs)
return combined_outputs
def _get_trigger_inputs(self) -> set:
"""
Returns a set of all unique trigger input values found in this block and nested blocks.
"""
def fn_recursive_get_trigger(blocks):
trigger_values = set()
if blocks is not None:
for name, block in blocks.items():
# Check if current block has block_trigger_inputs
if hasattr(block, "block_trigger_inputs") and block.block_trigger_inputs is not None:
trigger_values.update(t for t in block.block_trigger_inputs if t is not None)
# If block has sub_blocks, recursively check them
if block.sub_blocks:
nested_triggers = fn_recursive_get_trigger(block.sub_blocks)
trigger_values.update(nested_triggers)
return trigger_values
# Start with this block's block_trigger_inputs
all_triggers = set(t for t in self.block_trigger_inputs if t is not None)
# Add nested triggers
all_triggers.update(fn_recursive_get_trigger(self.sub_blocks))
return all_triggers
@property
def trigger_inputs(self):
"""All trigger inputs including from nested blocks."""
return self._get_trigger_inputs()
def select_block(self, **kwargs) -> Optional[str]:
"""
Select the block to run based on the trigger inputs.
Subclasses must implement this method to define the logic for selecting the block.
Args:
**kwargs: Trigger input names and their values from the state.
Returns:
Optional[str]: The name of the block to run, or None to use default/skip.
"""
raise NotImplementedError(f"Subclass {self.__class__.__name__} must implement the `select_block` method.")
@torch.no_grad()
def __call__(self, pipeline, state: PipelineState) -> PipelineState:
trigger_kwargs = {name: state.get(name) for name in self.block_trigger_inputs if name is not None}
block_name = self.select_block(**trigger_kwargs)
# Find default block first (if any)
if block_name is None:
block_name = self.default_block_name
block = self.trigger_to_block_map.get(None)
for input_name in self.block_trigger_inputs:
if input_name is not None and state.get(input_name) is not None:
block = self.trigger_to_block_map[input_name]
break
if block_name is None:
logger.info(f"skipping conditional block: {self.__class__.__name__}")
if block is None:
logger.info(f"skipping auto block: {self.__class__.__name__}")
return pipeline, state
block = self.sub_blocks[block_name]
try:
logger.info(f"Running block: {block.__class__.__name__}")
logger.info(f"Running block: {block.__class__.__name__}, trigger: {input_name}")
return block(pipeline, state)
except Exception as e:
error_msg = (
@@ -706,6 +661,38 @@ class ConditionalPipelineBlocks(ModularPipelineBlocks):
logger.error(error_msg)
raise
def _get_trigger_inputs(self):
"""
Returns a set of all unique trigger input values found in the blocks. Returns: Set[str] containing all unique
block_trigger_inputs values
"""
def fn_recursive_get_trigger(blocks):
trigger_values = set()
if blocks is not None:
for name, block in blocks.items():
# Check if current block has trigger inputs (i.e. auto block)
if hasattr(block, "block_trigger_inputs") and block.block_trigger_inputs is not None:
# Add all non-None values from the trigger inputs list
trigger_values.update(t for t in block.block_trigger_inputs if t is not None)
# If block has sub_blocks, recursively check them
if block.sub_blocks:
nested_triggers = fn_recursive_get_trigger(block.sub_blocks)
trigger_values.update(nested_triggers)
return trigger_values
trigger_inputs = set(self.block_trigger_inputs)
trigger_inputs.update(fn_recursive_get_trigger(self.sub_blocks))
return trigger_inputs
@property
def trigger_inputs(self):
return self._get_trigger_inputs()
def __repr__(self):
class_name = self.__class__.__name__
base_class = self.__class__.__bases__[0].__name__
@@ -717,7 +704,7 @@ class ConditionalPipelineBlocks(ModularPipelineBlocks):
header += "\n"
header += " " + "=" * 100 + "\n"
header += " This pipeline contains blocks that are selected at runtime based on inputs.\n"
header += f" Trigger Inputs: {sorted(self.trigger_inputs)}\n"
header += f" Trigger Inputs: {[inp for inp in self.trigger_inputs if inp is not None]}\n"
header += " " + "=" * 100 + "\n\n"
# Format description with proper indentation
@@ -738,20 +725,31 @@ class ConditionalPipelineBlocks(ModularPipelineBlocks):
expected_configs = getattr(self, "expected_configs", [])
configs_str = format_configs(expected_configs, indent_level=2, add_empty_lines=False)
# Blocks section
# Blocks section - moved to the end with simplified format
blocks_str = " Sub-Blocks:\n"
for i, (name, block) in enumerate(self.sub_blocks.items()):
if name == self.default_block_name:
addtional_str = " [default]"
# Get trigger input for this block
trigger = None
if hasattr(self, "block_to_trigger_map"):
trigger = self.block_to_trigger_map.get(name)
# Format the trigger info
if trigger is None:
trigger_str = "[default]"
elif isinstance(trigger, (list, tuple)):
trigger_str = f"[trigger: {', '.join(str(t) for t in trigger)}]"
else:
trigger_str = f"[trigger: {trigger}]"
# For AutoPipelineBlocks, add bullet points
blocks_str += f"{name} {trigger_str} ({block.__class__.__name__})\n"
else:
addtional_str = ""
blocks_str += f" {name}{addtional_str} ({block.__class__.__name__})\n"
# For SequentialPipelineBlocks, show execution order
blocks_str += f" [{i}] {name} ({block.__class__.__name__})\n"
# Add block description
block_desc_lines = block.description.split("\n")
indented_desc = block_desc_lines[0]
if len(block_desc_lines) > 1:
indented_desc += "\n" + "\n".join(" " + line for line in block_desc_lines[1:])
desc_lines = block.description.split("\n")
indented_desc = desc_lines[0]
if len(desc_lines) > 1:
indented_desc += "\n" + "\n".join(" " + line for line in desc_lines[1:])
blocks_str += f" Description: {indented_desc}\n\n"
# Build the representation with conditional sections
@@ -782,35 +780,6 @@ class ConditionalPipelineBlocks(ModularPipelineBlocks):
)
class AutoPipelineBlocks(ConditionalPipelineBlocks):
"""
A Pipeline Blocks that automatically selects a block to run based on the presence of trigger inputs.
"""
def __init__(self):
super().__init__()
if not (len(self.block_classes) == len(self.block_names) == len(self.block_trigger_inputs)):
raise ValueError(
f"In {self.__class__.__name__}, the number of block_classes, block_names, and block_trigger_inputs must be the same."
)
@property
def default_block_name(self) -> Optional[str]:
"""Derive default_block_name from block_trigger_inputs (None entry)."""
if None in self.block_trigger_inputs:
idx = self.block_trigger_inputs.index(None)
return self.block_names[idx]
return None
def select_block(self, **kwargs) -> Optional[str]:
"""Select block based on which trigger input is present (not None)."""
for trigger_input, block_name in zip(self.block_trigger_inputs, self.block_names):
if trigger_input is not None and kwargs.get(trigger_input) is not None:
return block_name
return None
class SequentialPipelineBlocks(ModularPipelineBlocks):
"""
A Pipeline Blocks that combines multiple pipeline block classes into one. When called, it will call each block in
@@ -912,8 +881,7 @@ class SequentialPipelineBlocks(ModularPipelineBlocks):
# Only add outputs if the block cannot be skipped
should_add_outputs = True
if isinstance(block, ConditionalPipelineBlocks) and block.default_block_name is None:
# ConditionalPipelineBlocks without default can be skipped
if hasattr(block, "block_trigger_inputs") and None not in block.block_trigger_inputs:
should_add_outputs = False
if should_add_outputs:
@@ -976,7 +944,8 @@ class SequentialPipelineBlocks(ModularPipelineBlocks):
def _get_trigger_inputs(self):
"""
Returns a set of all unique trigger input values found in the blocks.
Returns a set of all unique trigger input values found in the blocks. Returns: Set[str] containing all unique
block_trigger_inputs values
"""
def fn_recursive_get_trigger(blocks):
@@ -984,8 +953,9 @@ class SequentialPipelineBlocks(ModularPipelineBlocks):
if blocks is not None:
for name, block in blocks.items():
# Check if current block has block_trigger_inputs (ConditionalPipelineBlocks)
# Check if current block has trigger inputs (i.e. auto block)
if hasattr(block, "block_trigger_inputs") and block.block_trigger_inputs is not None:
# Add all non-None values from the trigger inputs list
trigger_values.update(t for t in block.block_trigger_inputs if t is not None)
# If block has sub_blocks, recursively check them
@@ -1001,85 +971,82 @@ class SequentialPipelineBlocks(ModularPipelineBlocks):
def trigger_inputs(self):
return self._get_trigger_inputs()
def _traverse_trigger_blocks(self, active_inputs):
"""
Traverse blocks and select which ones would run given the active inputs.
def _traverse_trigger_blocks(self, trigger_inputs):
# Convert trigger_inputs to a set for easier manipulation
active_triggers = set(trigger_inputs)
Args:
active_inputs: Dict of input names to values that are "present"
Returns:
OrderedDict of block_name -> block that would execute
"""
def fn_recursive_traverse(block, block_name, active_inputs):
def fn_recursive_traverse(block, block_name, active_triggers):
result_blocks = OrderedDict()
# ConditionalPipelineBlocks (includes AutoPipelineBlocks)
if isinstance(block, ConditionalPipelineBlocks):
trigger_kwargs = {name: active_inputs.get(name) for name in block.block_trigger_inputs}
selected_block_name = block.select_block(**trigger_kwargs)
if selected_block_name is None:
selected_block_name = block.default_block_name
if selected_block_name is None:
return result_blocks
selected_block = block.sub_blocks[selected_block_name]
if selected_block.sub_blocks:
result_blocks.update(fn_recursive_traverse(selected_block, block_name, active_inputs))
# sequential(include loopsequential) or PipelineBlock
if not hasattr(block, "block_trigger_inputs"):
if block.sub_blocks:
# sequential or LoopSequentialPipelineBlocks (keep traversing)
for sub_block_name, sub_block in block.sub_blocks.items():
blocks_to_update = fn_recursive_traverse(sub_block, sub_block_name, active_triggers)
blocks_to_update = {f"{block_name}.{k}": v for k, v in blocks_to_update.items()}
result_blocks.update(blocks_to_update)
else:
result_blocks[block_name] = selected_block
if hasattr(selected_block, "outputs"):
for out in selected_block.outputs:
active_inputs[out.name] = True
# PipelineBlock
result_blocks[block_name] = block
# Add this block's output names to active triggers if defined
if hasattr(block, "outputs"):
active_triggers.update(out.name for out in block.outputs)
return result_blocks
# SequentialPipelineBlocks or LoopSequentialPipelineBlocks
if block.sub_blocks:
for sub_block_name, sub_block in block.sub_blocks.items():
blocks_to_update = fn_recursive_traverse(sub_block, sub_block_name, active_inputs)
blocks_to_update = {f"{block_name}.{k}": v for k, v in blocks_to_update.items()}
result_blocks.update(blocks_to_update)
# auto
else:
result_blocks[block_name] = block
if hasattr(block, "outputs"):
for out in block.outputs:
active_inputs[out.name] = True
# Find first block_trigger_input that matches any value in our active_triggers
this_block = None
for trigger_input in block.block_trigger_inputs:
if trigger_input is not None and trigger_input in active_triggers:
this_block = block.trigger_to_block_map[trigger_input]
break
# If no matches found, try to get the default (None) block
if this_block is None and None in block.block_trigger_inputs:
this_block = block.trigger_to_block_map[None]
if this_block is not None:
# sequential/auto (keep traversing)
if this_block.sub_blocks:
result_blocks.update(fn_recursive_traverse(this_block, block_name, active_triggers))
else:
# PipelineBlock
result_blocks[block_name] = this_block
# Add this block's output names to active triggers if defined
# YiYi TODO: do we need outputs here? can it just be intermediate_outputs? can we get rid of outputs attribute?
if hasattr(this_block, "outputs"):
active_triggers.update(out.name for out in this_block.outputs)
return result_blocks
all_blocks = OrderedDict()
for block_name, block in self.sub_blocks.items():
blocks_to_update = fn_recursive_traverse(block, block_name, active_inputs)
blocks_to_update = fn_recursive_traverse(block, block_name, active_triggers)
all_blocks.update(blocks_to_update)
return all_blocks
def get_execution_blocks(self, **kwargs):
"""
Get the blocks that would execute given the specified inputs.
def get_execution_blocks(self, *trigger_inputs):
trigger_inputs_all = self.trigger_inputs
Args:
**kwargs: Input names and values. Only trigger inputs affect block selection.
Pass any inputs that would be non-None at runtime.
if trigger_inputs is not None:
if not isinstance(trigger_inputs, (list, tuple, set)):
trigger_inputs = [trigger_inputs]
invalid_inputs = [x for x in trigger_inputs if x not in trigger_inputs_all]
if invalid_inputs:
logger.warning(
f"The following trigger inputs will be ignored as they are not supported: {invalid_inputs}"
)
trigger_inputs = [x for x in trigger_inputs if x in trigger_inputs_all]
Returns:
SequentialPipelineBlocks containing only the blocks that would execute
Example:
# Get blocks for inpainting workflow
blocks = pipeline.get_execution_blocks(prompt="a cat", mask=mask, image=image)
# Get blocks for text2image workflow
blocks = pipeline.get_execution_blocks(prompt="a cat")
"""
# Filter out None values
active_inputs = {k: v for k, v in kwargs.items() if v is not None}
blocks_triggered = self._traverse_trigger_blocks(active_inputs)
if trigger_inputs is None:
if None in trigger_inputs_all:
trigger_inputs = [None]
else:
trigger_inputs = [trigger_inputs_all[0]]
blocks_triggered = self._traverse_trigger_blocks(trigger_inputs)
return SequentialPipelineBlocks.from_blocks_dict(blocks_triggered)
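A hedged usage sketch of this signature. `QwenImageAutoBlocks` is exported elsewhere in this diff and is assumed importable from `diffusers.modular_pipelines`; the trigger name is illustrative (unsupported names are warned about and ignored by the code above):
```python
# Hedged usage sketch; the import path and trigger name are assumptions.
from diffusers.modular_pipelines import QwenImageAutoBlocks

blocks = QwenImageAutoBlocks()
print(blocks.trigger_inputs)                    # trigger names, plus None for defaults
img2img = blocks.get_execution_blocks("image")  # path selected when `image` is provided
default = blocks.get_execution_blocks()         # default path
```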
def __repr__(self):
@@ -1096,7 +1063,7 @@ class SequentialPipelineBlocks(ModularPipelineBlocks):
header += f" Trigger Inputs: {[inp for inp in self.trigger_inputs if inp is not None]}\n"
# Get first trigger input as example
example_input = next(t for t in self.trigger_inputs if t is not None)
header += f" Use `get_execution_blocks()` to see selected blocks (e.g. `get_execution_blocks({example_input}=...)`).\n"
header += f" Use `get_execution_blocks()` with input names to see selected blocks (e.g. `get_execution_blocks('{example_input}')`).\n"
header += " " + "=" * 100 + "\n\n"
# Format description with proper indentation
@@ -1120,9 +1087,22 @@ class SequentialPipelineBlocks(ModularPipelineBlocks):
# Blocks section - moved to the end with simplified format
blocks_str = " Sub-Blocks:\n"
for i, (name, block) in enumerate(self.sub_blocks.items()):
# show execution order
blocks_str += f" [{i}] {name} ({block.__class__.__name__})\n"
# Get trigger input for this block
trigger = None
if hasattr(self, "block_to_trigger_map"):
trigger = self.block_to_trigger_map.get(name)
# Format the trigger info
if trigger is None:
trigger_str = "[default]"
elif isinstance(trigger, (list, tuple)):
trigger_str = f"[trigger: {', '.join(str(t) for t in trigger)}]"
else:
trigger_str = f"[trigger: {trigger}]"
# For AutoPipelineBlocks, add bullet points
blocks_str += f"{name} {trigger_str} ({block.__class__.__name__})\n"
else:
# For SequentialPipelineBlocks, show execution order
blocks_str += f" [{i}] {name} ({block.__class__.__name__})\n"
# Add block description
desc_lines = block.description.split("\n")
@@ -1246,9 +1226,15 @@ class LoopSequentialPipelineBlocks(ModularPipelineBlocks):
if inp.name not in outputs and inp not in inputs:
inputs.append(inp)
# Add this block's outputs
block_intermediate_outputs = [out.name for out in block.intermediate_outputs]
outputs.update(block_intermediate_outputs)
# Only add outputs if the block cannot be skipped
should_add_outputs = True
if hasattr(block, "block_trigger_inputs") and None not in block.block_trigger_inputs:
should_add_outputs = False
if should_add_outputs:
# Add this block's outputs
block_intermediate_outputs = [out.name for out in block.intermediate_outputs]
outputs.update(block_intermediate_outputs)
for input_param in inputs:
if input_param.name in self.required_inputs:
@@ -1305,14 +1291,6 @@ class LoopSequentialPipelineBlocks(ModularPipelineBlocks):
sub_blocks[block_name] = block
self.sub_blocks = sub_blocks
# Validate that sub_blocks are only leaf blocks
for block_name, block in self.sub_blocks.items():
if block.sub_blocks:
raise ValueError(
f"In {self.__class__.__name__}, sub_blocks must be leaf blocks (no sub_blocks). "
f"Block '{block_name}' ({block.__class__.__name__}) has sub_blocks."
)
@classmethod
def from_blocks_dict(cls, blocks_dict: Dict[str, Any]) -> "LoopSequentialPipelineBlocks":
"""
@@ -1547,8 +1525,10 @@ class ModularPipeline(ConfigMixin, PushToHubMixin):
if blocks is None:
if modular_config_dict is not None:
blocks_class_name = modular_config_dict.get("_blocks_class_name")
else:
elif config_dict is not None:
blocks_class_name = self.get_default_blocks_name(config_dict)
else:
blocks_class_name = None
if blocks_class_name is not None:
diffusers_module = importlib.import_module("diffusers")
blocks_class = getattr(diffusers_module, blocks_class_name)
@@ -1645,10 +1625,7 @@ class ModularPipeline(ConfigMixin, PushToHubMixin):
return None, config_dict
except EnvironmentError as e:
raise EnvironmentError(
f"Failed to load config from '{pretrained_model_name_or_path}'. "
f"Could not find or load 'modular_model_index.json' or 'model_index.json'."
) from e
logger.debug(f" model_index.json not found in the repo: {e}")
return None, None
@@ -2573,11 +2550,7 @@ class ModularPipeline(ConfigMixin, PushToHubMixin):
kwargs_type = expected_input_param.kwargs_type
if name in passed_kwargs:
state.set(name, passed_kwargs.pop(name), kwargs_type)
elif kwargs_type is not None and kwargs_type in passed_kwargs:
kwargs_dict = passed_kwargs.pop(kwargs_type)
for k, v in kwargs_dict.items():
state.set(k, v, kwargs_type)
elif name is not None and name not in state.values:
elif name not in state.values:
state.set(name, default, kwargs_type)
# Warn about unexpected inputs

View File

@@ -21,16 +21,21 @@ except OptionalDependencyNotAvailable:
_dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
else:
_import_structure["modular_blocks_qwenimage"] = [
_import_structure["encoders"] = ["QwenImageTextEncoderStep"]
_import_structure["modular_blocks"] = [
"ALL_BLOCKS",
"AUTO_BLOCKS",
"QwenImageAutoBlocks",
]
_import_structure["modular_blocks_qwenimage_edit"] = [
"CONTROLNET_BLOCKS",
"EDIT_AUTO_BLOCKS",
"QwenImageEditAutoBlocks",
]
_import_structure["modular_blocks_qwenimage_edit_plus"] = [
"EDIT_BLOCKS",
"EDIT_INPAINT_BLOCKS",
"EDIT_PLUS_AUTO_BLOCKS",
"EDIT_PLUS_BLOCKS",
"IMAGE2IMAGE_BLOCKS",
"INPAINT_BLOCKS",
"TEXT2IMAGE_BLOCKS",
"QwenImageAutoBlocks",
"QwenImageEditAutoBlocks",
"QwenImageEditPlusAutoBlocks",
]
_import_structure["modular_pipeline"] = [
@@ -46,16 +51,23 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
except OptionalDependencyNotAvailable:
from ...utils.dummy_torch_and_transformers_objects import * # noqa F403
else:
from .modular_blocks_qwenimage import (
from .encoders import (
QwenImageTextEncoderStep,
)
from .modular_blocks import (
ALL_BLOCKS,
AUTO_BLOCKS,
QwenImageAutoBlocks,
)
from .modular_blocks_qwenimage_edit import (
CONTROLNET_BLOCKS,
EDIT_AUTO_BLOCKS,
QwenImageEditAutoBlocks,
)
from .modular_blocks_qwenimage_edit_plus import (
EDIT_BLOCKS,
EDIT_INPAINT_BLOCKS,
EDIT_PLUS_AUTO_BLOCKS,
EDIT_PLUS_BLOCKS,
IMAGE2IMAGE_BLOCKS,
INPAINT_BLOCKS,
TEXT2IMAGE_BLOCKS,
QwenImageAutoBlocks,
QwenImageEditAutoBlocks,
QwenImageEditPlusAutoBlocks,
)
from .modular_pipeline import (
@@ -74,4 +86,4 @@ else:
)
for name, value in _dummy_objects.items():
setattr(sys.modules[__name__], name, value)

View File

@@ -639,65 +639,19 @@ class QwenImageEditRoPEInputsStep(ModularPipelineBlocks):
return components, state
class QwenImageEditPlusRoPEInputsStep(ModularPipelineBlocks):
"""RoPE inputs step for Edit Plus that handles lists of image heights/widths."""
class QwenImageEditPlusRoPEInputsStep(QwenImageEditRoPEInputsStep):
model_name = "qwenimage-edit-plus"
@property
def description(self) -> str:
return (
"Step that prepares the RoPE inputs for denoising process. This is used in QwenImage Edit Plus.\n"
"Unlike Edit, Edit Plus handles lists of image_height/image_width for multiple reference images.\n"
"Should be placed after prepare_latents step."
)
@property
def inputs(self) -> List[InputParam]:
return [
InputParam(name="batch_size", required=True),
InputParam(name="image_height", required=True, type_hint=List[int]),
InputParam(name="image_width", required=True, type_hint=List[int]),
InputParam(name="height", required=True),
InputParam(name="width", required=True),
InputParam(name="prompt_embeds_mask"),
InputParam(name="negative_prompt_embeds_mask"),
]
@property
def intermediate_outputs(self) -> List[OutputParam]:
return [
OutputParam(
name="img_shapes",
type_hint=List[List[Tuple[int, int, int]]],
description="The shapes of the image latents, used for RoPE calculation",
),
OutputParam(
name="txt_seq_lens",
kwargs_type="denoiser_input_fields",
type_hint=List[int],
description="The sequence lengths of the prompt embeds, used for RoPE calculation",
),
OutputParam(
name="negative_txt_seq_lens",
kwargs_type="denoiser_input_fields",
type_hint=List[int],
description="The sequence lengths of the negative prompt embeds, used for RoPE calculation",
),
]
def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState:
block_state = self.get_block_state(state)
vae_scale_factor = components.vae_scale_factor
# Edit Plus: image_height and image_width are lists
block_state.img_shapes = [
[
(1, block_state.height // vae_scale_factor // 2, block_state.width // vae_scale_factor // 2),
*[
(1, img_height // vae_scale_factor // 2, img_width // vae_scale_factor // 2)
for img_height, img_width in zip(block_state.image_height, block_state.image_width)
(1, vae_height // vae_scale_factor // 2, vae_width // vae_scale_factor // 2)
for vae_height, vae_width in zip(block_state.image_height, block_state.image_width)
],
]
] * block_state.batch_size
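The `// vae_scale_factor // 2` arithmetic converts pixel sizes into the packed-latent grid: one VAE downsample, then the 2x2 patchification. A worked example with illustrative numbers:
```python
# Worked img_shapes arithmetic; vae_scale_factor=8 is illustrative, not read
# from the pipeline.
height, width, vae_scale_factor = 1024, 768, 8
print((1, height // vae_scale_factor // 2, width // vae_scale_factor // 2))
# (1, 64, 48): one frame, a 64x48 grid of 2x2 latent patches
```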

View File

@@ -30,47 +30,6 @@ from .modular_pipeline import QwenImageModularPipeline, QwenImagePachifier
logger = logging.get_logger(__name__)
class QwenImageAfterDenoiseStep(ModularPipelineBlocks):
model_name = "qwenimage"
@property
def description(self) -> str:
return "Step that unpack the latents from 3D tensor (batch_size, sequence_length, channels) into 5D tensor (batch_size, channels, 1, height, width)"
@property
def expected_components(self) -> List[ComponentSpec]:
components = [
ComponentSpec("pachifier", QwenImagePachifier, default_creation_method="from_config"),
]
return components
@property
def inputs(self) -> List[InputParam]:
return [
InputParam(name="height", required=True),
InputParam(name="width", required=True),
InputParam(
name="latents",
required=True,
type_hint=torch.Tensor,
description="The latents to decode, can be generated in the denoise step",
),
]
@torch.no_grad()
def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState:
block_state = self.get_block_state(state)
vae_scale_factor = components.vae_scale_factor
block_state.latents = components.pachifier.unpack_latents(
block_state.latents, block_state.height, block_state.width, vae_scale_factor=vae_scale_factor
)
self.set_block_state(state, block_state)
return components, state
class QwenImageDecoderStep(ModularPipelineBlocks):
model_name = "qwenimage"
@@ -82,6 +41,7 @@ class QwenImageDecoderStep(ModularPipelineBlocks):
def expected_components(self) -> List[ComponentSpec]:
components = [
ComponentSpec("vae", AutoencoderKLQwenImage),
ComponentSpec("pachifier", QwenImagePachifier, default_creation_method="from_config"),
]
return components
@@ -89,6 +49,8 @@ class QwenImageDecoderStep(ModularPipelineBlocks):
@property
def inputs(self) -> List[InputParam]:
return [
InputParam(name="height", required=True),
InputParam(name="width", required=True),
InputParam(
name="latents",
required=True,
@@ -112,12 +74,10 @@ class QwenImageDecoderStep(ModularPipelineBlocks):
block_state = self.get_block_state(state)
# YiYi Notes: remove support for output_type = "latents", we can just skip the decode/encode step in modular
if block_state.latents.ndim == 4:
block_state.latents = block_state.latents.unsqueeze(dim=1)
elif block_state.latents.ndim != 5:
raise ValueError(
f"expect latents to be a 4D or 5D tensor but got: {block_state.latents.shape}. Please make sure the latents are unpacked before decode step."
)
vae_scale_factor = components.vae_scale_factor
block_state.latents = components.pachifier.unpack_latents(
block_state.latents, block_state.height, block_state.width, vae_scale_factor=vae_scale_factor
)
block_state.latents = block_state.latents.to(components.vae.dtype)
latents_mean = (

View File

@@ -244,19 +244,18 @@ def encode_vae_image(
class QwenImageEditResizeDynamicStep(ModularPipelineBlocks):
model_name = "qwenimage"
def __init__(
self,
input_name: str = "image",
output_name: str = "resized_image",
target_area: int = 1024 * 1024,
):
"""Create a configurable step for resizing images to the target area while maintaining the aspect ratio.
def __init__(self, input_name: str = "image", output_name: str = "resized_image"):
"""Create a configurable step for resizing images to the target area (1024 * 1024) while maintaining the aspect ratio.
This block resizes an input image tensor and exposes the resized result under configurable input and output
names. Use this when you need to wire the resize step to different image fields (e.g., "image",
"control_image")
Args:
input_name (str, optional): Name of the image field to read from the
pipeline state. Defaults to "image".
output_name (str, optional): Name of the resized image field to write
back to the pipeline state. Defaults to "resized_image".
target_area (int, optional): Target area in pixels. Defaults to 1024*1024.
"""
if not isinstance(input_name, str) or not isinstance(output_name, str):
raise ValueError(
@@ -264,12 +263,11 @@ class QwenImageEditResizeDynamicStep(ModularPipelineBlocks):
)
self._image_input_name = input_name
self._resized_image_output_name = output_name
self._target_area = target_area
super().__init__()
@property
def description(self) -> str:
return f"Image Resize step that resize the {self._image_input_name} to the target area {self._target_area} while maintaining the aspect ratio."
return f"Image Resize step that resize the {self._image_input_name} to the target area (1024 * 1024) while maintaining the aspect ratio."
@property
def expected_components(self) -> List[ComponentSpec]:
@@ -322,67 +320,48 @@ class QwenImageEditResizeDynamicStep(ModularPipelineBlocks):
self.set_block_state(state, block_state)
return components, state
class QwenImageEditPlusResizeDynamicStep(ModularPipelineBlocks):
"""Resize each image independently based on its own aspect ratio. For QwenImage Edit Plus."""
model_name = "qwenimage-edit-plus"
class QwenImageEditPlusResizeDynamicStep(QwenImageEditResizeDynamicStep):
model_name = "qwenimage"
def __init__(
self,
input_name: str = "image",
output_name: str = "resized_image",
target_area: int = 1024 * 1024,
vae_image_output_name: str = "vae_image",
):
"""Create a step for resizing images to a target area.
"""Create a configurable step for resizing images to the target area (384 * 384) while maintaining the aspect ratio.
Each image is resized independently based on its own aspect ratio.
This is suitable for Edit Plus where multiple reference images can have different dimensions.
This block resizes an input image or a list of input images and exposes the resized result under configurable
input and output names. Use this when you need to wire the resize step to different image fields (e.g.,
"image", "control_image")
Args:
input_name (str, optional): Name of the image field to read. Defaults to "image".
output_name (str, optional): Name of the resized image field to write. Defaults to "resized_image".
target_area (int, optional): Target area in pixels. Defaults to 1024*1024.
input_name (str, optional): Name of the image field to read from the
pipeline state. Defaults to "image".
output_name (str, optional): Name of the resized image field to write
back to the pipeline state. Defaults to "resized_image".
vae_image_output_name (str, optional): Name of the image field
to write back to the pipeline state. This is used by the VAE encoder step later on. QwenImage Edit Plus
processes the input image(s) differently for the VL and the VAE.
"""
if not isinstance(input_name, str) or not isinstance(output_name, str):
raise ValueError(
f"input_name and output_name must be strings but are {type(input_name)} and {type(output_name)}"
)
self.condition_image_size = 384 * 384
self._image_input_name = input_name
self._resized_image_output_name = output_name
self._target_area = target_area
self._vae_image_output_name = vae_image_output_name
super().__init__()
@property
def description(self) -> str:
return (
f"Image Resize step that resizes {self._image_input_name} to target area {self._target_area}.\n"
"Each image is resized independently based on its own aspect ratio."
)
@property
def expected_components(self) -> List[ComponentSpec]:
return [
ComponentSpec(
"image_resize_processor",
VaeImageProcessor,
config=FrozenDict({"vae_scale_factor": 16}),
default_creation_method="from_config",
),
]
@property
def inputs(self) -> List[InputParam]:
return [
InputParam(
name=self._image_input_name, required=True, type_hint=torch.Tensor, description="The image(s) to resize"
),
]
@property
def intermediate_outputs(self) -> List[OutputParam]:
return [
return super().intermediate_outputs + [
OutputParam(
name=self._resized_image_output_name, type_hint=List[PIL.Image.Image], description="The resized images"
name=self._vae_image_output_name,
type_hint=List[PIL.Image.Image],
description="The images to be processed which will be further used by the VAE encoder.",
),
]
@@ -395,21 +374,26 @@ class QwenImageEditPlusResizeDynamicStep(ModularPipelineBlocks):
if not is_valid_image_imagelist(images):
raise ValueError(f"Images must be image or list of images but are {type(images)}")
if is_valid_image(images):
if (
not isinstance(images, torch.Tensor)
and isinstance(images, PIL.Image.Image)
and not isinstance(images, list)
):
images = [images]
# Resize each image independently based on its own aspect ratio
resized_images = []
for image in images:
image_width, image_height = image.size
calculated_width, calculated_height, _ = calculate_dimensions(
self._target_area, image_width / image_height
)
resized_images.append(
components.image_resize_processor.resize(image, height=calculated_height, width=calculated_width)
# TODO (sayakpaul): revisit this when the inputs are `torch.Tensor`s
condition_images = []
vae_images = []
for img in images:
image_width, image_height = img.size
condition_width, condition_height, _ = calculate_dimensions(
self.condition_image_size, image_width / image_height
)
condition_images.append(components.image_resize_processor.resize(img, condition_height, condition_width))
vae_images.append(img)
setattr(block_state, self._resized_image_output_name, resized_images)
setattr(block_state, self._resized_image_output_name, condition_images)
setattr(block_state, self._vae_image_output_name, vae_images)
self.set_block_state(state, block_state)
return components, state
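`calculate_dimensions` picks a width/height pair whose product approximates the target area at the image's own aspect ratio. A minimal sketch of that idea, not the library's exact rounding:
```python
# Minimal area-targeting sketch; the real calculate_dimensions may round
# differently (e.g. snapping to multiples of the VAE stride).
import math

def calc_dims(target_area: int, aspect_ratio: float) -> tuple:
    width = math.sqrt(target_area * aspect_ratio)
    height = width / aspect_ratio
    return round(width), round(height)

print(calc_dims(384 * 384, 16 / 9))  # (512, 288): 512*288 == 384*384
print(calc_dims(1024 * 1024, 1.0))   # (1024, 1024)
```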
@@ -663,30 +647,8 @@ class QwenImageEditTextEncoderStep(ModularPipelineBlocks):
return components, state
class QwenImageEditPlusTextEncoderStep(ModularPipelineBlocks):
"""Text encoder for QwenImage Edit Plus that handles multiple reference images."""
model_name = "qwenimage-edit-plus"
@property
def description(self) -> str:
return (
"Text Encoder step for QwenImage Edit Plus that processes prompt and multiple images together "
"to generate text embeddings for guiding image generation."
)
@property
def expected_components(self) -> List[ComponentSpec]:
return [
ComponentSpec("text_encoder", Qwen2_5_VLForConditionalGeneration),
ComponentSpec("processor", Qwen2VLProcessor),
ComponentSpec(
"guider",
ClassifierFreeGuidance,
config=FrozenDict({"guidance_scale": 4.0}),
default_creation_method="from_config",
),
]
class QwenImageEditPlusTextEncoderStep(QwenImageEditTextEncoderStep):
model_name = "qwenimage"
@property
def expected_configs(self) -> List[ConfigSpec]:
@@ -702,60 +664,6 @@ class QwenImageEditPlusTextEncoderStep(ModularPipelineBlocks):
ConfigSpec(name="prompt_template_encode_start_idx", default=64),
]
@property
def inputs(self) -> List[InputParam]:
return [
InputParam(name="prompt", required=True, type_hint=str, description="The prompt to encode"),
InputParam(name="negative_prompt", type_hint=str, description="The negative prompt to encode"),
InputParam(
name="resized_cond_image",
required=True,
type_hint=torch.Tensor,
description="The image(s) to encode, can be a single image or list of images, should be resized to 384x384 using resize step",
),
]
@property
def intermediate_outputs(self) -> List[OutputParam]:
return [
OutputParam(
name="prompt_embeds",
kwargs_type="denoiser_input_fields",
type_hint=torch.Tensor,
description="The prompt embeddings",
),
OutputParam(
name="prompt_embeds_mask",
kwargs_type="denoiser_input_fields",
type_hint=torch.Tensor,
description="The encoder attention mask",
),
OutputParam(
name="negative_prompt_embeds",
kwargs_type="denoiser_input_fields",
type_hint=torch.Tensor,
description="The negative prompt embeddings",
),
OutputParam(
name="negative_prompt_embeds_mask",
kwargs_type="denoiser_input_fields",
type_hint=torch.Tensor,
description="The negative prompt embeddings mask",
),
]
@staticmethod
def check_inputs(prompt, negative_prompt):
if not isinstance(prompt, str) and not isinstance(prompt, list):
raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
if (
negative_prompt is not None
and not isinstance(negative_prompt, str)
and not isinstance(negative_prompt, list)
):
raise ValueError(f"`negative_prompt` has to be of type `str` or `list` but is {type(negative_prompt)}")
@torch.no_grad()
def __call__(self, components: QwenImageModularPipeline, state: PipelineState):
block_state = self.get_block_state(state)
@@ -768,7 +676,7 @@ class QwenImageEditPlusTextEncoderStep(ModularPipelineBlocks):
components.text_encoder,
components.processor,
prompt=block_state.prompt,
image=block_state.resized_cond_image,
image=block_state.resized_image,
prompt_template_encode=components.config.prompt_template_encode,
img_template_encode=components.config.img_template_encode,
prompt_template_encode_start_idx=components.config.prompt_template_encode_start_idx,
@@ -784,7 +692,7 @@ class QwenImageEditPlusTextEncoderStep(ModularPipelineBlocks):
components.text_encoder,
components.processor,
prompt=negative_prompt,
image=block_state.resized_cond_image,
image=block_state.resized_image,
prompt_template_encode=components.config.prompt_template_encode,
img_template_encode=components.config.img_template_encode,
prompt_template_encode_start_idx=components.config.prompt_template_encode_start_idx,
@@ -938,60 +846,60 @@ class QwenImageProcessImagesInputStep(ModularPipelineBlocks):
self.set_block_state(state, block_state)
return components, state
class QwenImageEditPlusProcessImagesInputStep(ModularPipelineBlocks):
class QwenImageEditPlusProcessImagesInputStep(QwenImageProcessImagesInputStep):
model_name = "qwenimage-edit-plus"
def __init__(self):
self.vae_image_size = 1024 * 1024
super().__init__()
@property
def description(self) -> str:
return "Image Preprocess step. Images can be resized first using QwenImageEditResizeDynamicStep."
@property
def expected_components(self) -> List[ComponentSpec]:
return [
ComponentSpec(
"image_processor",
VaeImageProcessor,
config=FrozenDict({"vae_scale_factor": 16}),
default_creation_method="from_config",
),
]
return "Image Preprocess step for QwenImage Edit Plus. Unlike QwenImage Edit, QwenImage Edit Plus doesn't use the same resized image for further preprocessing."
@property
def inputs(self) -> List[InputParam]:
return [InputParam("resized_image")]
@property
def intermediate_outputs(self) -> List[OutputParam]:
return [OutputParam(name="processed_image")]
return [InputParam("vae_image"), InputParam("image"), InputParam("height"), InputParam("width")]
@torch.no_grad()
def __call__(self, components: QwenImageModularPipeline, state: PipelineState):
block_state = self.get_block_state(state)
if block_state.vae_image is None and block_state.image is None:
raise ValueError("`vae_image` and `image` cannot be None at the same time")
image = block_state.resized_image
is_image_list = isinstance(image, list)
if not is_image_list:
image = [image]
processed_images = []
for img in image:
img_width, img_height = img.size
processed_images.append(components.image_processor.preprocess(image=img, height=img_height, width=img_width))
block_state.processed_image = processed_images
if is_image_list:
block_state.processed_image = processed_images
vae_image_sizes = None
if block_state.vae_image is None:
image = block_state.image
self.check_inputs(
height=block_state.height, width=block_state.width, vae_scale_factor=components.vae_scale_factor
)
height = block_state.height or components.default_height
width = block_state.width or components.default_width
block_state.processed_image = components.image_processor.preprocess(
image=image, height=height, width=width
)
else:
block_state.processed_image = processed_images[0]
# QwenImage Edit Plus can allow multiple input images with varied resolutions
processed_images = []
vae_image_sizes = []
for img in block_state.vae_image:
width, height = img.size
vae_width, vae_height, _ = calculate_dimensions(self.vae_image_size, width / height)
vae_image_sizes.append((vae_width, vae_height))
processed_images.append(
components.image_processor.preprocess(image=img, height=vae_height, width=vae_width)
)
block_state.processed_image = processed_images
block_state.vae_image_sizes = vae_image_sizes
self.set_block_state(state, block_state)
return components, state
class QwenImageVaeEncoderDynamicStep(ModularPipelineBlocks):
"""VAE encoder that handles both single images and lists of images with varied resolutions."""
class QwenImageVaeEncoderDynamicStep(ModularPipelineBlocks):
model_name = "qwenimage"
def __init__(
@@ -1001,12 +909,21 @@ class QwenImageVaeEncoderDynamicStep(ModularPipelineBlocks):
):
"""Initialize a VAE encoder step for converting images to latent representations.
Handles both single images and lists of images. When input is a list, outputs a list of latents.
When input is a single tensor, outputs a single latent tensor.
Both the input and output names are configurable so this block can be configured to process to different image
inputs (e.g., "processed_image" -> "image_latents", "processed_control_image" -> "control_image_latents").
Args:
input_name (str, optional): Name of the input image tensor or list. Defaults to "processed_image".
output_name (str, optional): Name of the output latent tensor or list. Defaults to "image_latents".
input_name (str, optional): Name of the input image tensor. Defaults to "processed_image".
Examples: "processed_image" or "processed_control_image"
output_name (str, optional): Name of the output latent tensor. Defaults to "image_latents".
Examples: "image_latents" or "control_image_latents"
Examples:
    # Basic usage with default settings (includes image processor)
    QwenImageVaeEncoderDynamicStep()

    # Custom input/output names for control image
    QwenImageVaeEncoderDynamicStep(
        input_name="processed_control_image", output_name="control_image_latents"
    )
"""
self._image_input_name = input_name
self._image_latents_output_name = output_name
@@ -1014,18 +931,17 @@ class QwenImageVaeEncoderDynamicStep(ModularPipelineBlocks):
@property
def description(self) -> str:
return (
f"VAE Encoder step that converts {self._image_input_name} into latent representations {self._image_latents_output_name}.\n"
"Handles both single images and lists of images with varied resolutions."
)
return f"Dynamic VAE Encoder step that converts {self._image_input_name} into latent representations {self._image_latents_output_name}.\n"
@property
def expected_components(self) -> List[ComponentSpec]:
return [ComponentSpec("vae", AutoencoderKLQwenImage)]
components = [ComponentSpec("vae", AutoencoderKLQwenImage)]
return components
@property
def inputs(self) -> List[InputParam]:
return [InputParam(self._image_input_name, required=True), InputParam("generator")]
inputs = [InputParam(self._image_input_name, required=True), InputParam("generator")]
return inputs
@property
def intermediate_outputs(self) -> List[OutputParam]:
@@ -1033,7 +949,7 @@ class QwenImageVaeEncoderDynamicStep(ModularPipelineBlocks):
OutputParam(
self._image_latents_output_name,
type_hint=torch.Tensor,
description="The latents representing the reference image(s). Single tensor or list depending on input.",
description="The latents representing the reference image",
)
]
@@ -1045,11 +961,47 @@ class QwenImageVaeEncoderDynamicStep(ModularPipelineBlocks):
dtype = components.vae.dtype
image = getattr(block_state, self._image_input_name)
        # Encode image into latents
image_latents = encode_vae_image(
image=image,
vae=components.vae,
generator=block_state.generator,
device=device,
dtype=dtype,
latent_channels=components.num_channels_latents,
)
setattr(block_state, self._image_latents_output_name, image_latents)
self.set_block_state(state, block_state)
return components, state
class QwenImageEditPlusVaeEncoderDynamicStep(QwenImageVaeEncoderDynamicStep):
model_name = "qwenimage-edit-plus"
@property
def intermediate_outputs(self) -> List[OutputParam]:
        # Each reference image latent can have a different resolution, hence we return a list.
return [
OutputParam(
self._image_latents_output_name,
type_hint=List[torch.Tensor],
description="The latents representing the reference image(s).",
)
]
@torch.no_grad()
def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState:
block_state = self.get_block_state(state)
device = components._execution_device
dtype = components.vae.dtype
image = getattr(block_state, self._image_input_name)
# Encode image into latents
image_latents = []
for img in image:
image_latents.append(
@@ -1062,12 +1014,9 @@ class QwenImageVaeEncoderDynamicStep(ModularPipelineBlocks):
latent_channels=components.num_channels_latents,
)
)
setattr(block_state, self._image_latents_output_name, image_latents)
self.set_block_state(state, block_state)
return components, state
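# Illustrative only: why the Edit Plus encoder keeps latents as a list. Two
# reference images with different resolutions produce latents whose spatial
# shapes differ, so they cannot be stacked into a single batch tensor.
# (Shapes below assume 16 latent channels and a spatial scale factor of 8.)
import torch

_latents_a = torch.randn(1, 16, 1, 64, 64)  # e.g. a 512x512 reference image
_latents_b = torch.randn(1, 16, 1, 48, 80)  # e.g. a 384x640 reference image
# torch.cat([_latents_a, _latents_b], dim=0) would fail: spatial shapes differ,
# hence one list entry per image, each patchified separately downstream.
_image_latents = [_latents_a, _latents_b]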

View File

@@ -222,15 +222,36 @@ class QwenImageTextInputsStep(ModularPipelineBlocks):
class QwenImageInputsDynamicStep(ModularPipelineBlocks):
"""Input step for QwenImage: update height/width, expand batch, patchify."""
model_name = "qwenimage"
    def __init__(self, image_latent_inputs: List[str] = ["image_latents"], additional_batch_inputs: List[str] = []):
"""Initialize a configurable step that standardizes the inputs for the denoising step. It:\n"
This step handles multiple common tasks to prepare inputs for the denoising step:
1. For encoded image latents, use it update height/width if None, patchifies, and expands batch size
2. For additional_batch_inputs: Only expands batch dimensions to match final batch size
This is a dynamic block that allows you to configure which inputs to process.
Args:
image_latent_inputs (List[str], optional): Names of image latent tensors to process.
These will be used to determine height/width, patchified, and batch-expanded. Can be a single string or
list of strings. Defaults to ["image_latents"]. Examples: ["image_latents"], ["control_image_latents"]
additional_batch_inputs (List[str], optional):
Names of additional conditional input tensors to expand batch size. These tensors will only have their
batch dimensions adjusted to match the final batch size. Can be a single string or list of strings.
Defaults to []. Examples: ["processed_mask_image"]
        Examples:
            # Configure to process image_latents (default behavior)
            QwenImageInputsDynamicStep()

            # Configure to process multiple image latent inputs
            QwenImageInputsDynamicStep(image_latent_inputs=["image_latents", "control_image_latents"])

            # Configure to process image latents and additional batch inputs
            QwenImageInputsDynamicStep(
                image_latent_inputs=["image_latents"], additional_batch_inputs=["processed_mask_image"]
            )
"""
if not isinstance(image_latent_inputs, list):
image_latent_inputs = [image_latent_inputs]
if not isinstance(additional_batch_inputs, list):
@@ -242,12 +263,14 @@ class QwenImageInputsDynamicStep(ModularPipelineBlocks):
@property
def description(self) -> str:
# Functionality section
summary_section = (
"Input processing step that:\n"
" 1. For image latent inputs: Updates height/width if None, patchifies, and expands batch size\n"
" 1. For image latent inputs: Updates height/width if None, patchifies latents, and expands batch size\n"
" 2. For additional batch inputs: Expands batch dimensions to match final batch size"
)
# Inputs info
inputs_info = ""
if self._image_latent_inputs or self._additional_batch_inputs:
inputs_info = "\n\nConfigured inputs:"
@@ -256,16 +279,11 @@ class QwenImageInputsDynamicStep(ModularPipelineBlocks):
if self._additional_batch_inputs:
inputs_info += f"\n - Additional batch inputs: {self._additional_batch_inputs}"
# Placement guidance
placement_section = "\n\nThis block should be placed after the encoder steps and the text input step."
return summary_section + inputs_info + placement_section
@property
def inputs(self) -> List[InputParam]:
inputs = [
@@ -275,9 +293,11 @@ class QwenImageInputsDynamicStep(ModularPipelineBlocks):
InputParam(name="width"),
]
# Add image latent inputs
for image_latent_input_name in self._image_latent_inputs:
inputs.append(InputParam(name=image_latent_input_name))
# Add additional batch inputs
for input_name in self._additional_batch_inputs:
inputs.append(InputParam(name=input_name))
@@ -290,16 +310,22 @@ class QwenImageInputsDynamicStep(ModularPipelineBlocks):
OutputParam(name="image_width", type_hint=int, description="The width of the image latents"),
]
@property
def expected_components(self) -> List[ComponentSpec]:
return [
ComponentSpec("pachifier", QwenImagePachifier, default_creation_method="from_config"),
]
def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState:
block_state = self.get_block_state(state)
        # Process image latent inputs (height/width calculation, patchify, and batch expansion)
for image_latent_input_name in self._image_latent_inputs:
image_latent_tensor = getattr(block_state, image_latent_input_name)
if image_latent_tensor is None:
continue
            # 1. Calculate height/width from latents
height, width = calculate_dimension_from_latents(image_latent_tensor, components.vae_scale_factor)
block_state.height = block_state.height or height
block_state.width = block_state.width or width
@@ -309,7 +335,7 @@ class QwenImageInputsDynamicStep(ModularPipelineBlocks):
if not hasattr(block_state, "image_width"):
block_state.image_width = width
            # 2. Patchify the image latent tensor
image_latent_tensor = components.pachifier.pack_latents(image_latent_tensor)
# 3. Expand batch size
@@ -328,6 +354,7 @@ class QwenImageInputsDynamicStep(ModularPipelineBlocks):
if input_tensor is None:
continue
# Only expand batch size
input_tensor = repeat_tensor_to_batch_size(
input_name=input_name,
input_tensor=input_tensor,
@@ -341,130 +368,63 @@ class QwenImageInputsDynamicStep(ModularPipelineBlocks):
return components, state
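# A minimal sketch of what the pachifier's `pack_latents` does, assuming 2x2
# spatial patches: (B, C, 1, H, W) latents become a (B, H/2 * W/2, C*4) token
# sequence. This mirrors the packing used by the QwenImage pipelines; treat it
# as an illustration, not the component's exact implementation.
import torch

def _pack_latents_sketch(latents: torch.Tensor) -> torch.Tensor:
    b, c, _, h, w = latents.shape
    latents = latents.view(b, c, h // 2, 2, w // 2, 2)
    latents = latents.permute(0, 2, 4, 1, 3, 5)  # (B, H/2, W/2, C, 2, 2)
    return latents.reshape(b, (h // 2) * (w // 2), c * 4)

_packed = _pack_latents_sketch(torch.randn(1, 16, 1, 64, 64))  # -> (1, 1024, 64)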
class QwenImageEditPlusInputsDynamicStep(QwenImageInputsDynamicStep):
model_name = "qwenimage-edit-plus"
    def __init__(
        self,
        image_latent_inputs: List[str] = ["image_latents"],
        additional_batch_inputs: List[str] = [],
    ):
        super().__init__(
            image_latent_inputs=image_latent_inputs, additional_batch_inputs=additional_batch_inputs
        )
@property
def description(self) -> str:
summary_section = (
"Input processing step for Edit Plus that:\n"
" 1. For image latent inputs (list): Collects heights/widths, patchifies each, concatenates, expands batch\n"
" 2. For additional batch inputs: Expands batch dimensions to match final batch size\n"
" Height/width defaults to last image in the list."
)
inputs_info = ""
if self._image_latent_inputs or self._additional_batch_inputs:
inputs_info = "\n\nConfigured inputs:"
if self._image_latent_inputs:
inputs_info += f"\n - Image latent inputs: {self._image_latent_inputs}"
if self._additional_batch_inputs:
inputs_info += f"\n - Additional batch inputs: {self._additional_batch_inputs}"
placement_section = "\n\nThis block should be placed after the encoder steps and the text input step."
return summary_section + inputs_info + placement_section
@property
def expected_components(self) -> List[ComponentSpec]:
return [
ComponentSpec("pachifier", QwenImagePachifier, default_creation_method="from_config"),
]
@property
def inputs(self) -> List[InputParam]:
inputs = [
InputParam(name="num_images_per_prompt", default=1),
InputParam(name="batch_size", required=True),
InputParam(name="height"),
InputParam(name="width"),
]
for image_latent_input_name in self._image_latent_inputs:
inputs.append(InputParam(name=image_latent_input_name))
for input_name in self._additional_batch_inputs:
inputs.append(InputParam(name=input_name))
return inputs
@property
def intermediate_outputs(self) -> List[OutputParam]:
return [
OutputParam(name="image_height", type_hint=List[int], description="The heights of the image latents"),
OutputParam(name="image_width", type_hint=List[int], description="The widths of the image latents"),
OutputParam(name="image_height", type_hint=List[int], description="The height of the image latents"),
OutputParam(name="image_width", type_hint=List[int], description="The width of the image latents"),
]
def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState:
block_state = self.get_block_state(state)
        # Process image latent inputs (height/width calculation, patchify, and batch expansion)
for image_latent_input_name in self._image_latent_inputs:
image_latent_tensor = getattr(block_state, image_latent_input_name)
if image_latent_tensor is None:
continue
is_list = isinstance(image_latent_tensor, list)
if not is_list:
image_latent_tensor = [image_latent_tensor]
            # Each image latent can have a different size in QwenImage Edit Plus.
image_heights = []
image_widths = []
packed_image_latent_tensors = []
            for img_latent_tensor in image_latent_tensor:
# 1. Calculate height/width from latents
height, width = calculate_dimension_from_latents(img_latent_tensor, components.vae_scale_factor)
image_heights.append(height)
image_widths.append(width)
                # 2. Patchify the image latent tensor
img_latent_tensor = components.pachifier.pack_latents(img_latent_tensor)
# 3. Expand batch size
img_latent_tensor = repeat_tensor_to_batch_size(
input_name=f"{image_latent_input_name}[{i}]",
input_name=image_latent_input_name,
input_tensor=img_latent_tensor,
num_images_per_prompt=block_state.num_images_per_prompt,
batch_size=block_state.batch_size,
)
packed_image_latent_tensors.append(img_latent_tensor)
# Concatenate all packed latents along dim=1
packed_image_latent_tensors = torch.cat(packed_image_latent_tensors, dim=1)
# Output lists of heights/widths
block_state.image_height = image_heights
block_state.image_width = image_widths
# Default height/width from last image
block_state.height = block_state.height or image_heights[-1]
block_state.width = block_state.width or image_widths[-1]
setattr(block_state, image_latent_input_name, packed_image_latent_tensors)
# Process additional batch inputs (only batch expansion)
for input_name in self._additional_batch_inputs:
input_tensor = getattr(block_state, input_name)
if input_tensor is None:
continue
# Only expand batch size
input_tensor = repeat_tensor_to_batch_size(
input_name=input_name,
input_tensor=input_tensor,
@@ -476,6 +436,8 @@ class QwenImageEditPlusInputsDynamicStep(ModularPipelineBlocks):
self.set_block_state(state, block_state)
return components, state
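# Sketch of the batch-expansion helper used by both input steps (signature
# inferred from the call sites above; the real implementation may differ):
# conditional inputs are repeated so their batch dimension matches
# batch_size * num_images_per_prompt.
import torch

def _repeat_tensor_to_batch_size_sketch(input_name, input_tensor, num_images_per_prompt, batch_size):
    final_batch_size = batch_size * num_images_per_prompt
    if input_tensor.shape[0] == final_batch_size:
        return input_tensor
    if final_batch_size % input_tensor.shape[0] != 0:
        raise ValueError(
            f"`{input_name}` batch size {input_tensor.shape[0]} cannot be expanded to {final_batch_size}."
        )
    return input_tensor.repeat_interleave(final_batch_size // input_tensor.shape[0], dim=0)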
class QwenImageControlNetInputsStep(ModularPipelineBlocks):
model_name = "qwenimage"

File diff suppressed because it is too large

View File

@@ -1,465 +0,0 @@
# Copyright 2025 Qwen-Image Team and The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from ...utils import logging
from ..modular_pipeline import AutoPipelineBlocks, ConditionalPipelineBlocks, SequentialPipelineBlocks
from ..modular_pipeline_utils import InsertableDict
from .before_denoise import (
QwenImageControlNetBeforeDenoiserStep,
QwenImageCreateMaskLatentsStep,
QwenImagePrepareLatentsStep,
QwenImagePrepareLatentsWithStrengthStep,
QwenImageRoPEInputsStep,
QwenImageSetTimestepsStep,
QwenImageSetTimestepsWithStrengthStep,
)
from .decoders import (
QwenImageAfterDenoiseStep,
QwenImageDecoderStep,
QwenImageInpaintProcessImagesOutputStep,
QwenImageProcessImagesOutputStep,
)
from .denoise import (
QwenImageControlNetDenoiseStep,
QwenImageDenoiseStep,
QwenImageInpaintControlNetDenoiseStep,
QwenImageInpaintDenoiseStep,
QwenImageLoopBeforeDenoiserControlNet,
)
from .encoders import (
QwenImageControlNetVaeEncoderStep,
QwenImageInpaintProcessImagesInputStep,
QwenImageProcessImagesInputStep,
QwenImageTextEncoderStep,
QwenImageVaeEncoderDynamicStep,
)
from .inputs import (
QwenImageControlNetInputsStep,
QwenImageInputsDynamicStep,
QwenImageTextInputsStep,
)
logger = logging.get_logger(__name__)
# 1. VAE ENCODER
# inpaint vae encoder
class QwenImageInpaintVaeEncoderStep(SequentialPipelineBlocks):
model_name = "qwenimage"
block_classes = [QwenImageInpaintProcessImagesInputStep(), QwenImageVaeEncoderDynamicStep()]
block_names = ["preprocess", "encode"]
@property
def description(self) -> str:
return (
"This step is used for processing image and mask inputs for inpainting tasks. It:\n"
" - Resizes the image to the target size, based on `height` and `width`.\n"
" - Processes and updates `image` and `mask_image`.\n"
" - Creates `image_latents`."
)
# img2img vae encoder
class QwenImageImg2ImgVaeEncoderStep(SequentialPipelineBlocks):
model_name = "qwenimage"
block_classes = [QwenImageProcessImagesInputStep(), QwenImageVaeEncoderDynamicStep()]
block_names = ["preprocess", "encode"]
@property
def description(self) -> str:
return "Vae encoder step that preprocess andencode the image inputs into their latent representations."
# auto vae encoder
class QwenImageAutoVaeEncoderStep(AutoPipelineBlocks):
block_classes = [QwenImageInpaintVaeEncoderStep, QwenImageImg2ImgVaeEncoderStep]
block_names = ["inpaint", "img2img"]
block_trigger_inputs = ["mask_image", "image"]
@property
def description(self):
return (
"Vae encoder step that encode the image inputs into their latent representations.\n"
+ "This is an auto pipeline block.\n"
+ " - `QwenImageInpaintVaeEncoderStep` (inpaint) is used when `mask_image` is provided.\n"
+ " - `QwenImageImg2ImgVaeEncoderStep` (img2img) is used when `image` is provided.\n"
+ " - if `mask_image` or `image` is not provided, step will be skipped."
)
# optional controlnet vae encoder
class QwenImageOptionalControlNetVaeEncoderStep(AutoPipelineBlocks):
block_classes = [QwenImageControlNetVaeEncoderStep]
block_names = ["controlnet"]
block_trigger_inputs = ["control_image"]
@property
def description(self):
return (
"Vae encoder step that encode the image inputs into their latent representations.\n"
+ "This is an auto pipeline block.\n"
+ " - `QwenImageControlNetVaeEncoderStep` (controlnet) is used when `control_image` is provided.\n"
+ " - if `control_image` is not provided, step will be skipped."
)
# 2. DENOISE
# input -> prepare_latents -> set_timesteps -> prepare_rope_inputs -> denoise -> after_denoise
# img2img input
class QwenImageImg2ImgInputStep(SequentialPipelineBlocks):
model_name = "qwenimage"
block_classes = [QwenImageTextInputsStep(), QwenImageInputsDynamicStep(image_latent_inputs=["image_latents"])]
block_names = ["text_inputs", "additional_inputs"]
@property
def description(self):
return "Input step that prepares the inputs for the img2img denoising step. It:\n"
" - make sure the text embeddings have consistent batch size as well as the additional inputs (`image_latents`).\n"
" - update height/width based `image_latents`, patchify `image_latents`."
# inpaint input
class QwenImageInpaintInputStep(SequentialPipelineBlocks):
model_name = "qwenimage"
block_classes = [QwenImageTextInputsStep(), QwenImageInputsDynamicStep(image_latent_inputs=["image_latents"], additional_batch_inputs=["processed_mask_image"])]
block_names = ["text_inputs", "additional_inputs"]
@property
def description(self):
return "Input step that prepares the inputs for the inpainting denoising step. It:\n"
" - make sure the text embeddings have consistent batch size as well as the additional inputs (`image_latents` and `processed_mask_image`).\n"
" - update height/width based `image_latents`, patchify `image_latents`."
# inpaint prepare latents
class QwenImageInpaintPrepareLatentsStep(SequentialPipelineBlocks):
model_name = "qwenimage"
block_classes = [QwenImagePrepareLatentsWithStrengthStep(), QwenImageCreateMaskLatentsStep()]
block_names = ["add_noise_to_latents", "create_mask_latents"]
@property
def description(self) -> str:
return (
"This step prepares the latents/image_latents and mask inputs for the inpainting denoising step. It:\n"
" - Add noise to the image latents to create the latents input for the denoiser.\n"
" - Create the pachified latents `mask` based on the processedmask image.\n"
)
# CoreDenoiseStep:
# (input + prepare_latents + set_timesteps + prepare_rope_inputs + denoise + after_denoise)
# 1. text2image
class QwenImageCoreDenoiseStep(SequentialPipelineBlocks):
model_name = "qwenimage"
block_classes = [
QwenImageTextInputsStep(),
QwenImagePrepareLatentsStep(),
QwenImageSetTimestepsStep(),
QwenImageRoPEInputsStep(),
QwenImageDenoiseStep(),
QwenImageAfterDenoiseStep(),
]
block_names = [
"input",
"prepare_latents",
"set_timesteps",
"prepare_rope_inputs",
"denoise",
"after_denoise",
]
@property
def description(self):
return "step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the inputs (timesteps, latents, rope inputs etc.)."
# 2.inpaint
class QwenImageInpaintCoreDenoiseStep(SequentialPipelineBlocks):
model_name = "qwenimage"
block_classes = [
QwenImageInpaintInputStep(),
QwenImagePrepareLatentsStep(),
QwenImageSetTimestepsWithStrengthStep(),
QwenImageInpaintPrepareLatentsStep(),
QwenImageRoPEInputsStep(),
QwenImageInpaintDenoiseStep(),
QwenImageAfterDenoiseStep(),
]
block_names = [
"input",
"prepare_latents",
"set_timesteps",
"prepare_inpaint_latents",
"prepare_rope_inputs",
"denoise",
"after_denoise",
]
@property
def description(self):
return "Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for inpaint task."
# 3. img2img
class QwenImageImg2ImgCoreDenoiseStep(SequentialPipelineBlocks):
model_name = "qwenimage"
block_classes = [
QwenImageImg2ImgInputStep(),
QwenImagePrepareLatentsStep(),
QwenImageSetTimestepsWithStrengthStep(),
QwenImagePrepareLatentsWithStrengthStep(),
QwenImageRoPEInputsStep(),
QwenImageDenoiseStep(),
QwenImageAfterDenoiseStep(),
]
block_names = [
"input",
"prepare_latents",
"set_timesteps",
"prepare_img2img_latents",
"prepare_rope_inputs",
"denoise",
"after_denoise",
]
@property
def description(self):
return "Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for img2img task."
# 4. text2image + controlnet
class QwenImageControlNetCoreDenoiseStep(SequentialPipelineBlocks):
model_name = "qwenimage"
block_classes = [
QwenImageTextInputsStep(),
QwenImageControlNetInputsStep(),
QwenImagePrepareLatentsStep(),
QwenImageSetTimestepsStep(),
QwenImageRoPEInputsStep(),
QwenImageControlNetBeforeDenoiserStep(),
QwenImageControlNetDenoiseStep(),
QwenImageAfterDenoiseStep(),
]
block_names = [
"input",
"controlnet_input",
"prepare_latents",
"set_timesteps",
"prepare_rope_inputs",
"controlnet_before_denoise",
"controlnet_denoise",
"after_denoise",
]
@property
def description(self):
return "step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the inputs (timesteps, latents, rope inputs etc.)."
# 5. inpaint + controlnet
class QwenImageControlNetInpaintCoreDenoiseStep(SequentialPipelineBlocks):
model_name = "qwenimage"
block_classes = [
QwenImageInpaintInputStep(),
QwenImageControlNetInputsStep(),
QwenImagePrepareLatentsStep(),
QwenImageSetTimestepsWithStrengthStep(),
QwenImageInpaintPrepareLatentsStep(),
QwenImageRoPEInputsStep(),
QwenImageControlNetBeforeDenoiserStep(),
QwenImageInpaintControlNetDenoiseStep(),
QwenImageAfterDenoiseStep(),
]
block_names = [
"input",
"controlnet_input",
"prepare_latents",
"set_timesteps",
"prepare_inpaint_latents",
"prepare_rope_inputs",
"controlnet_before_denoise",
"controlnet_denoise",
"after_denoise",
]
@property
def description(self):
return "Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for inpaint task."
# 6. img2img + controlnet
class QwenImageControlNetImg2ImgCoreDenoiseStep(SequentialPipelineBlocks):
model_name = "qwenimage"
block_classes = [
QwenImageImg2ImgInputStep(),
QwenImageControlNetInputsStep(),
QwenImagePrepareLatentsStep(),
QwenImageSetTimestepsWithStrengthStep(),
QwenImagePrepareLatentsWithStrengthStep(),
QwenImageRoPEInputsStep(),
QwenImageControlNetBeforeDenoiserStep(),
QwenImageControlNetDenoiseStep(),
QwenImageAfterDenoiseStep(),
]
block_names = [
"input",
"controlnet_input",
"prepare_latents",
"set_timesteps",
"prepare_img2img_latents",
"prepare_rope_inputs",
"controlnet_before_denoise",
"controlnet_denoise",
"after_denoise",
]
@property
def description(self):
return "Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for img2img task."
# auto denoise
# auto denoise step for controlnet tasks: works for all tasks with controlnet
class QwenImageAutoCoreDenoiseStep(ConditionalPipelineBlocks):
block_classes = [
QwenImageCoreDenoiseStep,
QwenImageInpaintCoreDenoiseStep,
QwenImageImg2ImgCoreDenoiseStep,
QwenImageControlNetCoreDenoiseStep,
QwenImageControlNetInpaintCoreDenoiseStep,
QwenImageControlNetImg2ImgCoreDenoiseStep,
]
block_names = [
"text2image",
"inpaint",
"img2img",
"controlnet_text2image",
"controlnet_inpaint",
"controlnet_img2img"]
block_trigger_inputs = ["control_image_latents", "processed_mask_image", "image_latents"]
default_block_name = "text2image"
def select_block(self, control_image_latents=None, processed_mask_image=None, image_latents=None):
if control_image_latents is not None:
if processed_mask_image is not None:
return "controlnet_inpaint"
elif image_latents is not None:
return "controlnet_img2img"
else:
return "controlnet_text2image"
else:
if processed_mask_image is not None:
return "inpaint"
elif image_latents is not None:
return "img2img"
else:
return "text2image"
@property
def description(self):
return (
"Core step that performs the denoising process. \n"
+ " - `QwenImageCoreDenoiseStep` (text2image) for text2image tasks.\n"
+ " - `QwenImageInpaintCoreDenoiseStep` (inpaint) for inpaint tasks.\n"
+ " - `QwenImageImg2ImgCoreDenoiseStep` (img2img) for img2img tasks.\n"
+ " - `QwenImageControlNetCoreDenoiseStep` (controlnet_text2image) for text2image tasks with controlnet.\n"
+ " - `QwenImageControlNetInpaintCoreDenoiseStep` (controlnet_inpaint) for inpaint tasks with controlnet.\n"
+ " - `QwenImageControlNetImg2ImgCoreDenoiseStep` (controlnet_img2img) for img2img tasks with controlnet.\n"
+ "This step support text-to-image, image-to-image, inpainting, and controlnet tasks for QwenImage:\n"
+ " - for image-to-image generation, you need to provide `image_latents`\n"
+ " - for inpainting, you need to provide `processed_mask_image` and `image_latents`\n"
+ " - to run the controlnet workflow, you need to provide `control_image_latents`\n"
+ " - for text-to-image generation, all you need to provide is prompt embeddings"
)
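# Illustration of the routing above: `select_block` keys purely off which
# optional inputs are present in the state (tensors below are placeholders).
import torch

_latents = torch.randn(1, 4096, 64)  # placeholder packed image latents
_mask = torch.randn(1, 4096, 64)     # placeholder packed mask latents
_ctrl = torch.randn(1, 4096, 64)     # placeholder controlnet image latents

_step = QwenImageAutoCoreDenoiseStep()
assert _step.select_block() == "text2image"
assert _step.select_block(image_latents=_latents) == "img2img"
assert _step.select_block(processed_mask_image=_mask, image_latents=_latents) == "inpaint"
assert _step.select_block(control_image_latents=_ctrl, image_latents=_latents) == "controlnet_img2img"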
# 4. DECODE
## 1.1 text2image
#### decode
#### (standard decode step works for most tasks except for inpaint)
class QwenImageDecodeStep(SequentialPipelineBlocks):
model_name = "qwenimage"
block_classes = [QwenImageDecoderStep(), QwenImageProcessImagesOutputStep()]
block_names = ["decode", "postprocess"]
@property
def description(self):
return "Decode step that decodes the latents to images and postprocess the generated image."
#### inpaint decode
class QwenImageInpaintDecodeStep(SequentialPipelineBlocks):
model_name = "qwenimage"
block_classes = [QwenImageDecoderStep(), QwenImageInpaintProcessImagesOutputStep()]
block_names = ["decode", "postprocess"]
@property
def description(self):
return "Decode step that decodes the latents to images and postprocess the generated image, optional apply the mask overally to the original image."
# auto decode step for inpaint and text2image tasks
class QwenImageAutoDecodeStep(AutoPipelineBlocks):
block_classes = [QwenImageInpaintDecodeStep, QwenImageDecodeStep]
block_names = ["inpaint_decode", "decode"]
block_trigger_inputs = ["mask", None]
@property
def description(self):
return (
"Decode step that decode the latents into images. \n"
" This is an auto pipeline block that works for inpaint/text2image/img2img tasks, for both QwenImage and QwenImage-Edit.\n"
+ " - `QwenImageInpaintDecodeStep` (inpaint) is used when `mask` is provided.\n"
+ " - `QwenImageDecodeStep` (text2image/img2img) is used when `mask` is not provided.\n"
)
## 1.10 QwenImage/auto block & presets
AUTO_BLOCKS = InsertableDict(
[
("text_encoder", QwenImageTextEncoderStep()),
("vae_encoder", QwenImageAutoVaeEncoderStep()),
("controlnet_vae_encoder", QwenImageOptionalControlNetVaeEncoderStep()),
("denoise", QwenImageAutoCoreDenoiseStep()),
("decode", QwenImageAutoDecodeStep()),
]
)
class QwenImageAutoBlocks(SequentialPipelineBlocks):
model_name = "qwenimage"
block_classes = AUTO_BLOCKS.values()
block_names = AUTO_BLOCKS.keys()
@property
def description(self):
return (
"Auto Modular pipeline for text-to-image, image-to-image, inpainting, and controlnet tasks using QwenImage.\n"
+ "- for image-to-image generation, you need to provide `image`\n"
+ "- for inpainting, you need to provide `mask_image` and `image`, optionally you can provide `padding_mask_crop` \n"
+ "- to run the controlnet workflow, you need to provide `control_image`\n"
+ "- for text-to-image generation, all you need to provide is `prompt`"
)
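# Usage sketch (the repo id, `output` kwarg, and device handling are
# assumptions; modular blocks assemble into a pipeline via `init_pipeline`):
import torch

blocks = QwenImageAutoBlocks()
pipe = blocks.init_pipeline("Qwen/Qwen-Image")  # hypothetical modular repo id
pipe.load_components(torch_dtype=torch.bfloat16)  # device placement omitted for brevity
# prompt only -> text2image; add `image` for img2img, `mask_image` for inpaint,
# `control_image` for controlnet.
images = pipe(prompt="a cat wearing a tiny hat", output="images")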

View File

@@ -1,329 +0,0 @@
# Copyright 2025 Qwen-Image Team and The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Optional
from ...utils import logging
from ..modular_pipeline import AutoPipelineBlocks, ConditionalPipelineBlocks, SequentialPipelineBlocks
from ..modular_pipeline_utils import InsertableDict
from .before_denoise import (
QwenImageCreateMaskLatentsStep,
QwenImageEditRoPEInputsStep,
QwenImagePrepareLatentsStep,
QwenImagePrepareLatentsWithStrengthStep,
QwenImageSetTimestepsStep,
QwenImageSetTimestepsWithStrengthStep,
)
from .decoders import (
QwenImageAfterDenoiseStep,
QwenImageDecoderStep,
QwenImageInpaintProcessImagesOutputStep,
QwenImageProcessImagesOutputStep,
)
from .denoise import (
QwenImageEditDenoiseStep,
QwenImageEditInpaintDenoiseStep,
)
from .encoders import (
QwenImageEditResizeDynamicStep,
QwenImageEditTextEncoderStep,
QwenImageInpaintProcessImagesInputStep,
QwenImageProcessImagesInputStep,
QwenImageVaeEncoderDynamicStep,
)
from .inputs import (
QwenImageInputsDynamicStep,
QwenImageTextInputsStep,
)
logger = logging.get_logger(__name__)
# ====================
# 1. TEXT ENCODER
# ====================
class QwenImageEditVLEncoderStep(SequentialPipelineBlocks):
"""VL encoder that takes both image and text prompts."""
model_name = "qwenimage-edit"
block_classes = [
QwenImageEditResizeDynamicStep(),
QwenImageEditTextEncoderStep(),
]
block_names = ["resize", "encode"]
@property
def description(self) -> str:
return "QwenImage-Edit VL encoder step that encode the image and text prompts together."
# ====================
# 2. VAE ENCODER
# ====================
# Edit VAE encoder
class QwenImageEditVaeEncoderStep(SequentialPipelineBlocks):
model_name = "qwenimage-edit"
block_classes = [
QwenImageEditResizeDynamicStep(),
QwenImageProcessImagesInputStep(),
QwenImageVaeEncoderDynamicStep(),
]
block_names = ["resize", "preprocess", "encode"]
@property
def description(self) -> str:
return "Vae encoder step that encode the image inputs into their latent representations."
# Edit Inpaint VAE encoder
class QwenImageEditInpaintVaeEncoderStep(SequentialPipelineBlocks):
model_name = "qwenimage-edit"
block_classes = [
QwenImageEditResizeDynamicStep(),
QwenImageInpaintProcessImagesInputStep(),
QwenImageVaeEncoderDynamicStep(input_name="processed_image", output_name="image_latents"),
]
block_names = ["resize", "preprocess", "encode"]
@property
def description(self) -> str:
return (
"This step is used for processing image and mask inputs for QwenImage-Edit inpaint tasks. It:\n"
" - resize the image for target area (1024 * 1024) while maintaining the aspect ratio.\n"
" - process the resized image and mask image.\n"
" - create image latents."
)
# Auto VAE encoder
class QwenImageEditAutoVaeEncoderStep(AutoPipelineBlocks):
block_classes = [QwenImageEditInpaintVaeEncoderStep, QwenImageEditVaeEncoderStep]
block_names = ["edit_inpaint", "edit"]
block_trigger_inputs = ["mask_image", "image"]
@property
def description(self):
return (
"Vae encoder step that encode the image inputs into their latent representations.\n"
"This is an auto pipeline block.\n"
" - `QwenImageEditInpaintVaeEncoderStep` (edit_inpaint) is used when `mask_image` is provided.\n"
" - `QwenImageEditVaeEncoderStep` (edit) is used when `image` is provided.\n"
" - if `mask_image` or `image` is not provided, step will be skipped."
)
# ====================
# 3. DENOISE - input -> prepare_latents -> set_timesteps -> prepare_rope_inputs -> denoise -> after_denoise
# ====================
# Edit input step
class QwenImageEditInputStep(SequentialPipelineBlocks):
model_name = "qwenimage-edit"
block_classes = [
QwenImageTextInputsStep(),
QwenImageInputsDynamicStep(image_latent_inputs=["image_latents"]),
]
block_names = ["text_inputs", "additional_inputs"]
@property
def description(self):
return (
"Input step that prepares the inputs for the edit denoising step. It:\n"
" - make sure the text embeddings have consistent batch size as well as the additional inputs.\n"
" - update height/width based `image_latents`, patchify `image_latents`."
)
# Edit Inpaint input step
class QwenImageEditInpaintInputStep(SequentialPipelineBlocks):
model_name = "qwenimage-edit"
block_classes = [
QwenImageTextInputsStep(),
QwenImageInputsDynamicStep(image_latent_inputs=["image_latents"], additional_batch_inputs=["processed_mask_image"]),
]
block_names = ["text_inputs", "additional_inputs"]
@property
def description(self):
return (
"Input step that prepares the inputs for the edit inpaint denoising step. It:\n"
" - make sure the text embeddings have consistent batch size as well as the additional inputs.\n"
" - update height/width based `image_latents`, patchify `image_latents`."
)
# Edit Inpaint prepare latents step
class QwenImageEditInpaintPrepareLatentsStep(SequentialPipelineBlocks):
model_name = "qwenimage-edit"
block_classes = [QwenImagePrepareLatentsWithStrengthStep(), QwenImageCreateMaskLatentsStep()]
block_names = ["add_noise_to_latents", "create_mask_latents"]
@property
def description(self) -> str:
return (
"This step prepares the latents/image_latents and mask inputs for the edit inpainting denoising step. It:\n"
" - Add noise to the image latents to create the latents input for the denoiser.\n"
" - Create the patchified latents `mask` based on the processed mask image.\n"
)
# 1. Edit (img2img) core denoise
class QwenImageEditCoreDenoiseStep(SequentialPipelineBlocks):
model_name = "qwenimage-edit"
block_classes = [
QwenImageEditInputStep(),
QwenImagePrepareLatentsStep(),
QwenImageSetTimestepsStep(),
QwenImageEditRoPEInputsStep(),
QwenImageEditDenoiseStep(),
QwenImageAfterDenoiseStep(),
]
block_names = [
"input",
"prepare_latents",
"set_timesteps",
"prepare_rope_inputs",
"denoise",
"after_denoise",
]
@property
def description(self):
return "Core denoising workflow for QwenImage-Edit edit (img2img) task."
# 2. Edit Inpaint core denoise
class QwenImageEditInpaintCoreDenoiseStep(SequentialPipelineBlocks):
model_name = "qwenimage-edit"
block_classes = [
QwenImageEditInpaintInputStep(),
QwenImagePrepareLatentsStep(),
QwenImageSetTimestepsWithStrengthStep(),
QwenImageEditInpaintPrepareLatentsStep(),
QwenImageEditRoPEInputsStep(),
QwenImageEditInpaintDenoiseStep(),
QwenImageAfterDenoiseStep(),
]
block_names = [
"input",
"prepare_latents",
"set_timesteps",
"prepare_inpaint_latents",
"prepare_rope_inputs",
"denoise",
"after_denoise",
]
@property
def description(self):
return "Core denoising workflow for QwenImage-Edit edit inpaint task."
# Auto core denoise step
class QwenImageEditAutoCoreDenoiseStep(ConditionalPipelineBlocks):
block_classes = [
QwenImageEditInpaintCoreDenoiseStep,
QwenImageEditCoreDenoiseStep,
]
block_names = ["edit_inpaint", "edit"]
block_trigger_inputs = ["processed_mask_image", "image_latents"]
default_block_name = "edit"
def select_block(self, processed_mask_image=None, image_latents=None) -> Optional[str]:
if processed_mask_image is not None:
return "edit_inpaint"
elif image_latents is not None:
return "edit"
return None
@property
def description(self):
return (
"Auto core denoising step that selects the appropriate workflow based on inputs.\n"
" - `QwenImageEditInpaintCoreDenoiseStep` when `processed_mask_image` is provided\n"
" - `QwenImageEditCoreDenoiseStep` when `image_latents` is provided\n"
"Supports edit (img2img) and edit inpainting tasks for QwenImage-Edit."
)
# ====================
# 4. DECODE
# ====================
# Decode step (standard)
class QwenImageEditDecodeStep(SequentialPipelineBlocks):
model_name = "qwenimage-edit"
block_classes = [QwenImageDecoderStep(), QwenImageProcessImagesOutputStep()]
block_names = ["decode", "postprocess"]
@property
def description(self):
return "Decode step that decodes the latents to images and postprocess the generated image."
# Inpaint decode step
class QwenImageEditInpaintDecodeStep(SequentialPipelineBlocks):
model_name = "qwenimage-edit"
block_classes = [QwenImageDecoderStep(), QwenImageInpaintProcessImagesOutputStep()]
block_names = ["decode", "postprocess"]
@property
def description(self):
return "Decode step that decodes the latents to images and postprocess the generated image, optionally apply the mask overlay to the original image."
# Auto decode step
class QwenImageEditAutoDecodeStep(AutoPipelineBlocks):
block_classes = [QwenImageEditInpaintDecodeStep, QwenImageEditDecodeStep]
block_names = ["inpaint_decode", "decode"]
block_trigger_inputs = ["mask", None]
@property
def description(self):
return (
"Decode step that decode the latents into images.\n"
"This is an auto pipeline block.\n"
" - `QwenImageEditInpaintDecodeStep` (inpaint) is used when `mask` is provided.\n"
" - `QwenImageEditDecodeStep` (edit) is used when `mask` is not provided.\n"
)
# ====================
# 5. AUTO BLOCKS & PRESETS
# ====================
EDIT_AUTO_BLOCKS = InsertableDict(
[
("text_encoder", QwenImageEditVLEncoderStep()),
("vae_encoder", QwenImageEditAutoVaeEncoderStep()),
("denoise", QwenImageEditAutoCoreDenoiseStep()),
("decode", QwenImageEditAutoDecodeStep()),
]
)
class QwenImageEditAutoBlocks(SequentialPipelineBlocks):
model_name = "qwenimage-edit"
block_classes = EDIT_AUTO_BLOCKS.values()
block_names = EDIT_AUTO_BLOCKS.keys()
@property
def description(self):
return (
"Auto Modular pipeline for edit (img2img) and edit inpaint tasks using QwenImage-Edit.\n"
"- for edit (img2img) generation, you need to provide `image`\n"
"- for edit inpainting, you need to provide `mask_image` and `image`, optionally you can provide `padding_mask_crop`\n"
)
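# Usage sketch for the edit preset (repo id, URL, and call signature are
# assumptions): passing `mask_image` routes to the edit-inpaint sub-blocks;
# `image` alone routes to edit (img2img).
import torch
from diffusers.utils import load_image

blocks = QwenImageEditAutoBlocks()
pipe = blocks.init_pipeline("Qwen/Qwen-Image-Edit")  # hypothetical modular repo id
pipe.load_components(torch_dtype=torch.bfloat16)
src = load_image("https://example.com/room.png")  # placeholder URL
images = pipe(image=src, prompt="replace the sofa with a green armchair", output="images")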

View File

@@ -1,175 +0,0 @@
# Copyright 2025 Qwen-Image Team and The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from ...utils import logging
from ..modular_pipeline import AutoPipelineBlocks, SequentialPipelineBlocks
from ..modular_pipeline_utils import InsertableDict
from .before_denoise import (
QwenImageEditPlusRoPEInputsStep,
QwenImagePrepareLatentsStep,
QwenImageSetTimestepsStep,
)
from .decoders import (
QwenImageAfterDenoiseStep,
QwenImageDecoderStep,
QwenImageProcessImagesOutputStep,
)
from .denoise import (
QwenImageEditDenoiseStep,
)
from .encoders import (
QwenImageEditPlusResizeDynamicStep,
QwenImageEditPlusTextEncoderStep,
QwenImageEditPlusProcessImagesInputStep,
QwenImageVaeEncoderDynamicStep,
)
from .inputs import (
QwenImageEditPlusInputsDynamicStep,
QwenImageTextInputsStep,
)
logger = logging.get_logger(__name__)
# ====================
# 1. TEXT ENCODER
# ====================
class QwenImageEditPlusVLEncoderStep(SequentialPipelineBlocks):
"""VL encoder that takes both image and text prompts. Uses 384x384 target area."""
model_name = "qwenimage-edit-plus"
block_classes = [
QwenImageEditPlusResizeDynamicStep(target_area=384 * 384, output_name="resized_cond_image"),
QwenImageEditPlusTextEncoderStep(),
]
block_names = ["resize", "encode"]
@property
def description(self) -> str:
return "QwenImage-Edit Plus VL encoder step that encodes the image and text prompts together."
# ====================
# 2. VAE ENCODER
# ====================
class QwenImageEditPlusVaeEncoderStep(SequentialPipelineBlocks):
"""VAE encoder that handles multiple images with different sizes. Uses 1024x1024 target area."""
model_name = "qwenimage-edit-plus"
block_classes = [
QwenImageEditPlusResizeDynamicStep(target_area=1024 * 1024, output_name="resized_image"),
QwenImageEditPlusProcessImagesInputStep(),
QwenImageVaeEncoderDynamicStep(),
]
block_names = ["resize", "preprocess", "encode"]
@property
def description(self) -> str:
return (
"VAE encoder step that encodes image inputs into latent representations.\n"
"Each image is resized independently based on its own aspect ratio to 1024x1024 target area."
)
# ====================
# 3. DENOISE - input -> prepare_latents -> set_timesteps -> prepare_rope_inputs -> denoise -> after_denoise
# ====================
# Edit Plus input step
class QwenImageEditPlusInputStep(SequentialPipelineBlocks):
model_name = "qwenimage-edit-plus"
block_classes = [
QwenImageTextInputsStep(),
QwenImageEditPlusInputsDynamicStep(image_latent_inputs=["image_latents"]),
]
block_names = ["text_inputs", "additional_inputs"]
@property
def description(self):
return (
"Input step that prepares the inputs for the Edit Plus denoising step. It:\n"
" - Standardizes text embeddings batch size.\n"
" - Processes list of image latents: patchifies, concatenates along dim=1, expands batch.\n"
" - Outputs lists of image_height/image_width for RoPE calculation.\n"
" - Defaults height/width from last image in the list."
)
# Edit Plus core denoise
class QwenImageEditPlusCoreDenoiseStep(SequentialPipelineBlocks):
model_name = "qwenimage-edit-plus"
block_classes = [
QwenImageEditPlusInputStep(),
QwenImagePrepareLatentsStep(),
QwenImageSetTimestepsStep(),
QwenImageEditPlusRoPEInputsStep(),
QwenImageEditDenoiseStep(),
QwenImageAfterDenoiseStep(),
]
block_names = [
"input",
"prepare_latents",
"set_timesteps",
"prepare_rope_inputs",
"denoise",
"after_denoise",
]
@property
def description(self):
return "Core denoising workflow for QwenImage-Edit Plus edit (img2img) task."
# ====================
# 4. DECODE
# ====================
class QwenImageEditPlusDecodeStep(SequentialPipelineBlocks):
model_name = "qwenimage-edit-plus"
block_classes = [QwenImageDecoderStep(), QwenImageProcessImagesOutputStep()]
block_names = ["decode", "postprocess"]
@property
def description(self):
return "Decode step that decodes the latents to images and postprocesses the generated image."
# ====================
# 5. AUTO BLOCKS & PRESETS
# ====================
EDIT_PLUS_AUTO_BLOCKS = InsertableDict(
[
("text_encoder", QwenImageEditPlusVLEncoderStep()),
("vae_encoder", QwenImageEditPlusVaeEncoderStep()),
("denoise", QwenImageEditPlusCoreDenoiseStep()),
("decode", QwenImageEditPlusDecodeStep()),
]
)
class QwenImageEditPlusAutoBlocks(SequentialPipelineBlocks):
model_name = "qwenimage-edit-plus"
block_classes = EDIT_PLUS_AUTO_BLOCKS.values()
block_names = EDIT_PLUS_AUTO_BLOCKS.keys()
@property
def description(self):
return (
"Auto Modular pipeline for edit (img2img) tasks using QwenImage-Edit Plus.\n"
"- `image` is required input (can be single image or list of images).\n"
"- Each image is resized independently based on its own aspect ratio.\n"
"- VL encoder uses 384x384 target area, VAE encoder uses 1024x1024 target area."
)
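# Usage sketch for Edit Plus with multiple reference images of different sizes
# (repo id, URLs, and exact call signature are assumptions):
import torch
from diffusers.utils import load_image

blocks = QwenImageEditPlusAutoBlocks()
pipe = blocks.init_pipeline("Qwen/Qwen-Image-Edit-2509")  # hypothetical modular repo id
pipe.load_components(torch_dtype=torch.bfloat16)
img_a = load_image("https://example.com/cat.png")   # placeholder URLs; any sizes work,
img_b = load_image("https://example.com/sofa.png")  # each image is resized by its own aspect ratio
images = pipe(image=[img_a, img_b], prompt="put the cat on the sofa", output="images")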

View File

@@ -0,0 +1,95 @@
# Copyright 2025 Qwen-Image Team and The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# mellon nodes
QwenImage_NODE_TYPES_PARAMS_MAP = {
"controlnet": {
"inputs": [
"control_image",
"controlnet_conditioning_scale",
"control_guidance_start",
"control_guidance_end",
"height",
"width",
],
"model_inputs": [
"controlnet",
"vae",
],
"outputs": [
"controlnet_out",
],
"block_names": ["controlnet_vae_encoder"],
},
"denoise": {
"inputs": [
"embeddings",
"width",
"height",
"seed",
"num_inference_steps",
"guidance_scale",
"image_latents",
"strength",
"controlnet",
],
"model_inputs": [
"unet",
"guider",
"scheduler",
],
"outputs": [
"latents",
"latents_preview",
],
"block_names": ["denoise"],
},
"vae_encoder": {
"inputs": [
"image",
"width",
"height",
],
"model_inputs": [
"vae",
],
"outputs": [
"image_latents",
],
},
"text_encoder": {
"inputs": [
"prompt",
"negative_prompt",
],
"model_inputs": [
"text_encoders",
],
"outputs": [
"embeddings",
],
},
"decoder": {
"inputs": [
"latents",
],
"model_inputs": [
"vae",
],
"outputs": [
"images",
],
},
}
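# Illustrative only: how a node-graph UI could consume this map to wire sockets
# (the traversal below is an assumption, not Mellon's actual loader).
for _node_type, _params in QwenImage_NODE_TYPES_PARAMS_MAP.items():
    print(_node_type)
    print("  inputs:", _params["inputs"])              # sockets exposed to the graph editor
    print("  model_inputs:", _params["model_inputs"])  # components the node expects to be loaded
    print("  outputs:", _params["outputs"])            # values the node publishes downstream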

View File

@@ -0,0 +1,99 @@
# Copyright 2025 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
SDXL_NODE_TYPES_PARAMS_MAP = {
"controlnet": {
"inputs": [
"control_image",
"controlnet_conditioning_scale",
"control_guidance_start",
"control_guidance_end",
"height",
"width",
],
"model_inputs": [
"controlnet",
],
"outputs": [
"controlnet_out",
],
"block_names": [None],
},
"denoise": {
"inputs": [
"embeddings",
"width",
"height",
"seed",
"num_inference_steps",
"guidance_scale",
"image_latents",
"strength",
# custom adapters coming in as inputs
"controlnet",
# ip_adapter is optional and custom; include if available
"ip_adapter",
],
"model_inputs": [
"unet",
"guider",
"scheduler",
],
"outputs": [
"latents",
"latents_preview",
],
"block_names": ["denoise"],
},
"vae_encoder": {
"inputs": [
"image",
"width",
"height",
],
"model_inputs": [
"vae",
],
"outputs": [
"image_latents",
],
"block_names": ["vae_encoder"],
},
"text_encoder": {
"inputs": [
"prompt",
"negative_prompt",
],
"model_inputs": [
"text_encoders",
],
"outputs": [
"embeddings",
],
"block_names": ["text_encoder"],
},
"decoder": {
"inputs": [
"latents",
],
"model_inputs": [
"vae",
],
"outputs": [
"images",
],
"block_names": ["decode"],
},
}

View File

@@ -129,10 +129,6 @@ class ZImageLoopDenoiser(ModularPipelineBlocks):
type_hint=int,
description="The number of inference steps to use for the denoising process. Can be generated in set_timesteps step.",
),
InputParam(
kwargs_type="denoiser_input_fields",
description="conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.",
),
]
guider_input_names = []
uncond_guider_input_names = []

View File

@@ -119,7 +119,7 @@ class ZImageAutoDenoiseStep(AutoPipelineBlocks):
class ZImageAutoVaeImageEncoderStep(AutoPipelineBlocks):
block_classes = [ZImageVaeImageEncoderStep]
block_names = ["vae_encoder"]
block_names = ["vae_image_encoder"]
block_trigger_inputs = ["image"]
@property
@@ -137,7 +137,7 @@ class ZImageAutoBlocks(SequentialPipelineBlocks):
ZImageAutoDenoiseStep,
ZImageVaeDecoderStep,
]
block_names = ["text_encoder", "vae_encoder", "denoise", "decode"]
block_names = ["text_encoder", "vae_image_encoder", "denoise", "decode"]
@property
def description(self) -> str:
@@ -162,7 +162,7 @@ TEXT2IMAGE_BLOCKS = InsertableDict(
IMAGE2IMAGE_BLOCKS = InsertableDict(
[
("text_encoder", ZImageTextEncoderStep),
("vae_encoder", ZImageVaeImageEncoderStep),
("vae_image_encoder", ZImageVaeImageEncoderStep),
("input", ZImageTextInputStep),
("additional_inputs", ZImageAdditionalInputsStep(image_latent_inputs=["image_latents"])),
("prepare_latents", ZImagePrepareLatentsStep),
@@ -178,7 +178,7 @@ IMAGE2IMAGE_BLOCKS = InsertableDict(
AUTO_BLOCKS = InsertableDict(
[
("text_encoder", ZImageTextEncoderStep),
("vae_encoder", ZImageAutoVaeImageEncoderStep),
("vae_image_encoder", ZImageAutoVaeImageEncoderStep),
("denoise", ZImageAutoDenoiseStep),
("decode", ZImageVaeDecoderStep),
]

View File

@@ -165,7 +165,6 @@ else:
_import_structure["cogview4"] = ["CogView4Pipeline", "CogView4ControlPipeline"]
_import_structure["consisid"] = ["ConsisIDPipeline"]
_import_structure["cosmos"] = [
"Cosmos2_5_PredictBasePipeline",
"Cosmos2TextToImagePipeline",
"CosmosTextToWorldPipeline",
"CosmosVideoToWorldPipeline",
@@ -623,7 +622,6 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
StableDiffusionXLControlNetXSPipeline,
)
from .cosmos import (
Cosmos2_5_PredictBasePipeline,
Cosmos2TextToImagePipeline,
Cosmos2VideoToWorldPipeline,
CosmosTextToWorldPipeline,

View File

@@ -22,9 +22,6 @@ except OptionalDependencyNotAvailable:
_dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
else:
_import_structure["pipeline_cosmos2_5_predict"] = [
"Cosmos2_5_PredictBasePipeline",
]
_import_structure["pipeline_cosmos2_text2image"] = ["Cosmos2TextToImagePipeline"]
_import_structure["pipeline_cosmos2_video2world"] = ["Cosmos2VideoToWorldPipeline"]
_import_structure["pipeline_cosmos_text2world"] = ["CosmosTextToWorldPipeline"]
@@ -38,9 +35,6 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
except OptionalDependencyNotAvailable:
from ...utils.dummy_torch_and_transformers_objects import *
else:
from .pipeline_cosmos2_5_predict import (
Cosmos2_5_PredictBasePipeline,
)
from .pipeline_cosmos2_text2image import Cosmos2TextToImagePipeline
from .pipeline_cosmos2_video2world import Cosmos2VideoToWorldPipeline
from .pipeline_cosmos_text2world import CosmosTextToWorldPipeline

View File

@@ -1,847 +0,0 @@
# Copyright 2025 The NVIDIA Team and The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Callable, Dict, List, Optional, Union
import numpy as np
import torch
import torchvision
import torchvision.transforms
import torchvision.transforms.functional
from transformers import AutoTokenizer, Qwen2_5_VLForConditionalGeneration
from ...callbacks import MultiPipelineCallbacks, PipelineCallback
from ...image_processor import PipelineImageInput
from ...models import AutoencoderKLWan, CosmosTransformer3DModel
from ...schedulers import UniPCMultistepScheduler
from ...utils import is_cosmos_guardrail_available, is_torch_xla_available, logging, replace_example_docstring
from ...utils.torch_utils import randn_tensor
from ...video_processor import VideoProcessor
from ..pipeline_utils import DiffusionPipeline
from .pipeline_output import CosmosPipelineOutput
if is_cosmos_guardrail_available():
from cosmos_guardrail import CosmosSafetyChecker
else:
class CosmosSafetyChecker:
def __init__(self, *args, **kwargs):
raise ImportError(
"`cosmos_guardrail` is not installed. Please install it to use the safety checker for Cosmos: `pip install cosmos_guardrail`."
)
if is_torch_xla_available():
import torch_xla.core.xla_model as xm
XLA_AVAILABLE = True
else:
XLA_AVAILABLE = False
logger = logging.get_logger(__name__) # pylint: disable=invalid-name
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
def retrieve_latents(
encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
):
if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
return encoder_output.latent_dist.sample(generator)
elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax":
return encoder_output.latent_dist.mode()
elif hasattr(encoder_output, "latents"):
return encoder_output.latents
else:
raise AttributeError("Could not access latents of provided encoder_output")
EXAMPLE_DOC_STRING = """
Examples:
```python
>>> import torch
>>> from diffusers import Cosmos2_5_PredictBasePipeline
>>> from diffusers.utils import export_to_video, load_image, load_video
>>> model_id = "nvidia/Cosmos-Predict2.5-2B"
>>> pipe = Cosmos2_5_PredictBasePipeline.from_pretrained(
        ...     model_id, revision="diffusers/base/pre-trained", torch_dtype=torch.bfloat16
... )
>>> pipe = pipe.to("cuda")
>>> # Common negative prompt reused across modes.
>>> negative_prompt = (
... "The video captures a series of frames showing ugly scenes, static with no motion, motion blur, "
... "over-saturation, shaky footage, low resolution, grainy texture, pixelated images, poorly lit areas, "
... "underexposed and overexposed scenes, poor color balance, washed out colors, choppy sequences, jerky "
... "movements, low frame rate, artifacting, color banding, unnatural transitions, outdated special effects, "
... "fake elements, unconvincing visuals, poorly edited content, jump cuts, visual noise, and flickering. "
... "Overall, the video is of poor quality."
... )
>>> # Text2World: generate a 93-frame world video from text only.
>>> prompt = (
... "As the red light shifts to green, the red bus at the intersection begins to move forward, its headlights "
... "cutting through the falling snow. The snowy tire tracks deepen as the vehicle inches ahead, casting fresh "
... "lines onto the slushy road. Around it, streetlights glow warmer, illuminating the drifting flakes and wet "
... "reflections on the asphalt. Other cars behind start to edge forward, their beams joining the scene. "
... "The stillness of the urban street transitions into motion as the quiet snowfall is punctuated by the slow "
... "advance of traffic through the frosty city corridor."
... )
>>> video = pipe(
... image=None,
... video=None,
... prompt=prompt,
... negative_prompt=negative_prompt,
... num_frames=93,
... generator=torch.Generator().manual_seed(1),
... ).frames[0]
>>> export_to_video(video, "text2world.mp4", fps=16)
>>> # Image2World: condition on a single image and generate a 93-frame world video.
>>> prompt = (
... "A high-definition video captures the precision of robotic welding in an industrial setting. "
... "The first frame showcases a robotic arm, equipped with a welding torch, positioned over a large metal structure. "
... "The welding process is in full swing, with bright sparks and intense light illuminating the scene, creating a vivid "
... "display of blue and white hues. A significant amount of smoke billows around the welding area, partially obscuring "
... "the view but emphasizing the heat and activity. The background reveals parts of the workshop environment, including a "
... "ventilation system and various pieces of machinery, indicating a busy and functional industrial workspace. As the video "
... "progresses, the robotic arm maintains its steady position, continuing the welding process and moving to its left. "
... "The welding torch consistently emits sparks and light, and the smoke continues to rise, diffusing slightly as it moves upward. "
... "The metal surface beneath the torch shows ongoing signs of heating and melting. The scene retains its industrial ambiance, with "
... "the welding sparks and smoke dominating the visual field, underscoring the ongoing nature of the welding operation."
... )
>>> image = load_image(
... "https://media.githubusercontent.com/media/nvidia-cosmos/cosmos-predict2.5/refs/heads/main/assets/base/robot_welding.jpg"
... )
>>> video = pipe(
... image=image,
... video=None,
... prompt=prompt,
... negative_prompt=negative_prompt,
... num_frames=93,
... generator=torch.Generator().manual_seed(1),
... ).frames[0]
>>> # export_to_video(video, "image2world.mp4", fps=16)
>>> # Video2World: condition on an input clip and predict a 93-frame world video.
>>> prompt = (
... "The video opens with an aerial view of a large-scale sand mining construction operation, showcasing extensive piles "
... "of brown sand meticulously arranged in parallel rows. A central water channel, fed by a water pipe, flows through the "
... "middle of these sand heaps, creating ripples and movement as it cascades down. The surrounding area features dense green "
... "vegetation on the left, contrasting with the sandy terrain, while a body of water is visible in the background on the right. "
... "As the video progresses, a piece of heavy machinery, likely a bulldozer, enters the frame from the right, moving slowly along "
... "the edge of the sand piles. This machinery's presence indicates ongoing construction work in the operation. The final frame "
... "captures the same scene, with the water continuing its flow and the bulldozer still in motion, maintaining the dynamic yet "
... "steady pace of the construction activity."
... )
>>> input_video = load_video(
... "https://github.com/nvidia-cosmos/cosmos-predict2.5/raw/refs/heads/main/assets/base/sand_mining.mp4"
... )
>>> video = pipe(
... image=None,
... video=input_video,
... prompt=prompt,
... negative_prompt=negative_prompt,
... num_frames=93,
... generator=torch.Generator().manual_seed(1),
... ).frames[0]
>>> export_to_video(video, "video2world.mp4", fps=16)
>>> # To produce an image instead of a world (video) clip, set num_frames=1 and
>>> # save the first frame: pipe(..., num_frames=1).frames[0][0].
```
"""
class Cosmos2_5_PredictBasePipeline(DiffusionPipeline):
r"""
Pipeline for the [Cosmos Predict2.5](https://github.com/nvidia-cosmos/cosmos-predict2.5) base model.
This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
implemented for all pipelines (downloading, saving, running on a particular device, etc.).
Args:
text_encoder ([`Qwen2_5_VLForConditionalGeneration`]):
Frozen text-encoder. Cosmos Predict2.5 uses the [Qwen2.5
VL](https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct) encoder.
tokenizer (`AutoTokenizer`):
Tokenizer associated with the Qwen2.5 VL encoder.
transformer ([`CosmosTransformer3DModel`]):
Conditional Transformer to denoise the encoded video latents.
scheduler ([`UniPCMultistepScheduler`]):
A scheduler to be used in combination with `transformer` to denoise the encoded video latents.
vae ([`AutoencoderKLWan`]):
Variational Auto-Encoder (VAE) Model to encode and decode videos to and from latent representations.
"""
model_cpu_offload_seq = "text_encoder->transformer->vae"
_callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"]
# We mark safety_checker as optional here to get around some test failures, but it is not really optional
_optional_components = ["safety_checker"]
_exclude_from_cpu_offload = ["safety_checker"]
def __init__(
self,
text_encoder: Qwen2_5_VLForConditionalGeneration,
tokenizer: AutoTokenizer,
transformer: CosmosTransformer3DModel,
vae: AutoencoderKLWan,
scheduler: UniPCMultistepScheduler,
safety_checker: CosmosSafetyChecker = None,
):
super().__init__()
if safety_checker is None:
safety_checker = CosmosSafetyChecker()
self.register_modules(
vae=vae,
text_encoder=text_encoder,
tokenizer=tokenizer,
transformer=transformer,
scheduler=scheduler,
safety_checker=safety_checker,
)
self.vae_scale_factor_temporal = 2 ** sum(self.vae.temperal_downsample) if getattr(self, "vae", None) else 4
self.vae_scale_factor_spatial = 2 ** len(self.vae.temperal_downsample) if getattr(self, "vae", None) else 8
self.video_processor = VideoProcessor(vae_scale_factor=self.vae_scale_factor_spatial)
latents_mean = (
torch.tensor(self.vae.config.latents_mean).view(1, self.vae.config.z_dim, 1, 1, 1).float()
if getattr(self.vae.config, "latents_mean", None) is not None
else None
)
latents_std = (
torch.tensor(self.vae.config.latents_std).view(1, self.vae.config.z_dim, 1, 1, 1).float()
if getattr(self.vae.config, "latents_std", None) is not None
else None
)
self.latents_mean = latents_mean
self.latents_std = latents_std
if self.latents_mean is None or self.latents_std is None:
raise ValueError("VAE configuration must define both `latents_mean` and `latents_std`.")
def _get_prompt_embeds(
self,
prompt: Union[str, List[str]] = None,
max_sequence_length: int = 512,
device: Optional[torch.device] = None,
dtype: Optional[torch.dtype] = None,
):
device = device or self._execution_device
dtype = dtype or self.text_encoder.dtype
prompt = [prompt] if isinstance(prompt, str) else prompt
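# Each prompt is wrapped in a system/user chat conversation and tokenized with the Qwen2.5-VL
# chat template, padded or truncated to `max_sequence_length`.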
input_ids_batch = []
for sample_idx in range(len(prompt)):
conversations = [
{
"role": "system",
"content": [
{
"type": "text",
"text": "You are a helpful assistant who will provide prompts to an image generator.",
}
],
},
{
"role": "user",
"content": [
{
"type": "text",
"text": prompt[sample_idx],
}
],
},
]
input_ids = self.tokenizer.apply_chat_template(
conversations,
tokenize=True,
add_generation_prompt=False,
add_vision_id=False,
max_length=max_sequence_length,
truncation=True,
padding="max_length",
)
input_ids = torch.LongTensor(input_ids)
input_ids_batch.append(input_ids)
input_ids_batch = torch.stack(input_ids_batch, dim=0)
outputs = self.text_encoder(
input_ids_batch.to(device),
output_hidden_states=True,
)
hidden_states = outputs.hidden_states
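# Standardize every transformer layer's hidden states (zero mean, unit variance over the feature
# dimension, skipping the embedding layer) and concatenate them along the feature dimension to
# form the prompt embedding.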
normalized_hidden_states = []
for layer_idx in range(1, len(hidden_states)):
normalized_state = (hidden_states[layer_idx] - hidden_states[layer_idx].mean(dim=-1, keepdim=True)) / (
hidden_states[layer_idx].std(dim=-1, keepdim=True) + 1e-8
)
normalized_hidden_states.append(normalized_state)
prompt_embeds = torch.cat(normalized_hidden_states, dim=-1)
prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
return prompt_embeds
# Modified from diffusers.pipelines.cosmos.pipeline_cosmos_text2world.CosmosTextToWorldPipeline.encode_prompt
def encode_prompt(
self,
prompt: Union[str, List[str]],
negative_prompt: Optional[Union[str, List[str]]] = None,
do_classifier_free_guidance: bool = True,
num_videos_per_prompt: int = 1,
prompt_embeds: Optional[torch.Tensor] = None,
negative_prompt_embeds: Optional[torch.Tensor] = None,
max_sequence_length: int = 512,
device: Optional[torch.device] = None,
dtype: Optional[torch.dtype] = None,
):
r"""
Encodes the prompt into text encoder hidden states.
Args:
prompt (`str` or `List[str]`, *optional*):
prompt to be encoded
negative_prompt (`str` or `List[str]`, *optional*):
The prompt or prompts not to guide the image generation. If not defined, one has to pass
`negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
less than `1`).
do_classifier_free_guidance (`bool`, *optional*, defaults to `True`):
Whether to use classifier free guidance or not.
num_videos_per_prompt (`int`, *optional*, defaults to 1):
Number of videos that should be generated per prompt.
prompt_embeds (`torch.Tensor`, *optional*):
Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
provided, text embeddings will be generated from `prompt` input argument.
negative_prompt_embeds (`torch.Tensor`, *optional*):
Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
argument.
device: (`torch.device`, *optional*):
torch device
dtype: (`torch.dtype`, *optional*):
torch dtype
"""
device = device or self._execution_device
prompt = [prompt] if isinstance(prompt, str) else prompt
if prompt is not None:
batch_size = len(prompt)
else:
batch_size = prompt_embeds.shape[0]
if prompt_embeds is None:
prompt_embeds = self._get_prompt_embeds(
prompt=prompt, max_sequence_length=max_sequence_length, device=device, dtype=dtype
)
# duplicate text embeddings for each generation per prompt, using mps friendly method
_, seq_len, _ = prompt_embeds.shape
prompt_embeds = prompt_embeds.repeat(1, num_videos_per_prompt, 1)
prompt_embeds = prompt_embeds.view(batch_size * num_videos_per_prompt, seq_len, -1)
if do_classifier_free_guidance and negative_prompt_embeds is None:
negative_prompt = negative_prompt or ""
negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt
if prompt is not None and type(prompt) is not type(negative_prompt):
raise TypeError(
f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
f" {type(prompt)}."
)
elif batch_size != len(negative_prompt):
raise ValueError(
f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
" the batch size of `prompt`."
)
negative_prompt_embeds = self._get_prompt_embeds(
prompt=negative_prompt, max_sequence_length=max_sequence_length, device=device, dtype=dtype
)
# duplicate text embeddings for each generation per prompt, using mps friendly method
_, seq_len, _ = negative_prompt_embeds.shape
negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_videos_per_prompt, 1)
negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_videos_per_prompt, seq_len, -1)
return prompt_embeds, negative_prompt_embeds
# Modified from diffusers.pipelines.cosmos.pipeline_cosmos2_video2world.Cosmos2VideoToWorldPipeline.prepare_latents and
# diffusers.pipelines.cosmos.pipeline_cosmos2_text2image.Cosmos2TextToImagePipeline.prepare_latents
def prepare_latents(
self,
video: Optional[torch.Tensor],
batch_size: int,
num_channels_latents: int = 16,
height: int = 704,
width: int = 1280,
num_frames_in: int = 93,
num_frames_out: int = 93,
do_classifier_free_guidance: bool = True,
dtype: Optional[torch.dtype] = None,
device: Optional[torch.device] = None,
generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
latents: Optional[torch.Tensor] = None,
) -> torch.Tensor:
if isinstance(generator, list) and len(generator) != batch_size:
raise ValueError(
f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
f" size of {batch_size}. Make sure the batch size matches the length of the generators."
)
B = batch_size
C = num_channels_latents
T = (num_frames_out - 1) // self.vae_scale_factor_temporal + 1
H = height // self.vae_scale_factor_spatial
W = width // self.vae_scale_factor_spatial
shape = (B, C, T, H, W)
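# Text2World: no conditioning frames, so start from pure noise and return zeroed conditioning
# latents, mask, and indicator.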
if num_frames_in == 0:
if latents is None:
latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
cond_mask = torch.zeros((B, 1, T, H, W), dtype=latents.dtype, device=latents.device)
cond_indicator = torch.zeros((B, 1, T, 1, 1), dtype=latents.dtype, device=latents.device)
cond_latents = torch.zeros_like(latents)
return (
latents,
cond_latents,
cond_mask,
cond_indicator,
)
else:
if video is None:
raise ValueError("`video` must be provided when `num_frames_in` is greater than 0.")
needs_preprocessing = not (isinstance(video, torch.Tensor) and video.ndim == 5 and video.shape[1] == 3)
if needs_preprocessing:
video = self.video_processor.preprocess_video(video, height, width)
video = video.to(device=device, dtype=self.vae.dtype)
if isinstance(generator, list):
cond_latents = [
retrieve_latents(self.vae.encode(video[i].unsqueeze(0)), generator=generator[i])
for i in range(batch_size)
]
else:
cond_latents = [retrieve_latents(self.vae.encode(vid.unsqueeze(0)), generator) for vid in video]
cond_latents = torch.cat(cond_latents, dim=0).to(dtype)
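# Normalize the encoded conditioning latents with the VAE's per-channel statistics so they match
# the scale of the sampled noise latents.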
latents_mean = self.latents_mean.to(device=device, dtype=dtype)
latents_std = self.latents_std.to(device=device, dtype=dtype)
cond_latents = (cond_latents - latents_mean) / latents_std
if latents is None:
latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
else:
latents = latents.to(device=device, dtype=dtype)
padding_shape = (B, 1, T, H, W)
ones_padding = latents.new_ones(padding_shape)
zeros_padding = latents.new_zeros(padding_shape)
num_cond_latent_frames = (num_frames_in - 1) // self.vae_scale_factor_temporal + 1
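# Mark the latent frames that correspond to the conditioning input: `cond_indicator` flags them
# per frame, and `cond_mask` carries the same flag at full latent resolution.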
cond_indicator = latents.new_zeros(1, 1, latents.size(2), 1, 1)
cond_indicator[:, :, 0:num_cond_latent_frames] = 1.0
cond_mask = cond_indicator * ones_padding + (1 - cond_indicator) * zeros_padding
return (
latents,
cond_latents,
cond_mask,
cond_indicator,
)
# Copied from diffusers.pipelines.cosmos.pipeline_cosmos_text2world.CosmosTextToWorldPipeline.check_inputs
def check_inputs(
self,
prompt,
height,
width,
prompt_embeds=None,
callback_on_step_end_tensor_inputs=None,
):
if height % 16 != 0 or width % 16 != 0:
raise ValueError(f"`height` and `width` have to be divisible by 16 but are {height} and {width}.")
if callback_on_step_end_tensor_inputs is not None and not all(
k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
):
raise ValueError(
f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
)
if prompt is not None and prompt_embeds is not None:
raise ValueError(
f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
" only forward one of the two."
)
elif prompt is None and prompt_embeds is None:
raise ValueError(
"Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
)
elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
@property
def guidance_scale(self):
return self._guidance_scale
@property
def do_classifier_free_guidance(self):
return self._guidance_scale > 1.0
@property
def num_timesteps(self):
return self._num_timesteps
@property
def current_timestep(self):
return self._current_timestep
@property
def interrupt(self):
return self._interrupt
@torch.no_grad()
@replace_example_docstring(EXAMPLE_DOC_STRING)
def __call__(
self,
image: PipelineImageInput | None = None,
video: List[PipelineImageInput] | None = None,
prompt: Union[str, List[str]] | None = None,
negative_prompt: Optional[Union[str, List[str]]] = None,
height: int = 704,
width: int = 1280,
num_frames: int = 93,
num_inference_steps: int = 36,
guidance_scale: float = 7.0,
num_videos_per_prompt: Optional[int] = 1,
generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
latents: Optional[torch.Tensor] = None,
prompt_embeds: Optional[torch.Tensor] = None,
negative_prompt_embeds: Optional[torch.Tensor] = None,
output_type: Optional[str] = "pil",
return_dict: bool = True,
callback_on_step_end: Optional[
Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
] = None,
callback_on_step_end_tensor_inputs: List[str] = ["latents"],
max_sequence_length: int = 512,
conditional_frame_timestep: float = 0.1,
):
r"""
The call function to the pipeline for generation. Supports three modes:
- **Text2World**: `image=None`, `video=None`, `prompt` provided. Generates a world clip.
- **Image2World**: `image` provided, `video=None`, `prompt` provided. Conditions on a single frame.
- **Video2World**: `video` provided, `image=None`, `prompt` provided. Conditions on an input clip.
Set `num_frames=93` (default) to produce a world video, or `num_frames=1` to produce a single image frame
(turning each of the modes above into its "*2Image" counterpart).
Outputs follow `output_type` (e.g., `"pil"` returns a list of `num_frames` PIL images per prompt).
Args:
image (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, *optional*):
Optional single image for Image2World conditioning. Must be `None` when `video` is provided.
video (`List[PIL.Image.Image]`, `np.ndarray`, `torch.Tensor`, *optional*):
Optional input video for Video2World conditioning. Must be `None` when `image` is provided.
prompt (`str` or `List[str]`, *optional*):
The prompt or prompts to guide generation. Required unless `prompt_embeds` is supplied.
height (`int`, defaults to `704`):
The height in pixels of the generated image.
width (`int`, defaults to `1280`):
The width in pixels of the generated image.
num_frames (`int`, defaults to `93`):
Number of output frames. Use `93` for world (video) generation; set to `1` to return a single frame.
num_inference_steps (`int`, defaults to `36`):
The number of denoising steps. More denoising steps usually lead to a higher quality image at the
expense of slower inference.
guidance_scale (`float`, defaults to `7.0`):
Guidance scale as defined in [Classifier-Free Diffusion
Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
`guidance_scale > 1`.
num_videos_per_prompt (`int`, *optional*, defaults to 1):
The number of videos to generate per prompt.
generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
generation deterministic.
latents (`torch.Tensor`, *optional*):
Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for video
generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
tensor is generated by sampling using the supplied random `generator`.
prompt_embeds (`torch.Tensor`, *optional*):
Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
provided, text embeddings will be generated from `prompt` input argument.
negative_prompt_embeds (`torch.Tensor`, *optional*):
Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
argument.
output_type (`str`, *optional*, defaults to `"pil"`):
The output format of the generated video. Choose between `PIL.Image` or `np.array`.
return_dict (`bool`, *optional*, defaults to `True`):
Whether or not to return a [`CosmosPipelineOutput`] instead of a plain tuple.
callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*):
A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of
each denoising step during inference with the following arguments: `callback_on_step_end(self:
DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a
list of all tensors as specified by `callback_on_step_end_tensor_inputs`.
callback_on_step_end_tensor_inputs (`List`, *optional*):
The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
`._callback_tensor_inputs` attribute of your pipeline class.
max_sequence_length (`int`, defaults to `512`):
The maximum number of tokens in the prompt. If the prompt exceeds this length, it will be truncated. If
the prompt is shorter than this length, it will be padded.
Examples:
Returns:
[`~CosmosPipelineOutput`] or `tuple`:
If `return_dict` is `True`, [`CosmosPipelineOutput`] is returned, otherwise a `tuple` is returned where
the first element is a list with the generated frames.
"""
if self.safety_checker is None:
raise ValueError(
f"You have disabled the safety checker for {self.__class__}. This is in violation of the "
"[NVIDIA Open Model License Agreement](https://www.nvidia.com/en-us/agreements/enterprise-software/nvidia-open-model-license). "
f"Please ensure that you are compliant with the license agreement."
)
if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
# Check inputs. Raise error if not correct
self.check_inputs(prompt, height, width, prompt_embeds, callback_on_step_end_tensor_inputs)
self._guidance_scale = guidance_scale
self._current_timestep = None
self._interrupt = False
device = self._execution_device
if self.safety_checker is not None:
self.safety_checker.to(device)
if prompt is not None:
prompt_list = [prompt] if isinstance(prompt, str) else prompt
for p in prompt_list:
if not self.safety_checker.check_text_safety(p):
raise ValueError(
f"Cosmos Guardrail detected unsafe text in the prompt: {p}. Please ensure that the "
f"prompt abides by the NVIDIA Open Model License Agreement."
)
# Define call parameters
if prompt is not None and isinstance(prompt, str):
batch_size = 1
elif prompt is not None and isinstance(prompt, list):
batch_size = len(prompt)
else:
batch_size = prompt_embeds.shape[0]
# Encode input prompt
(
prompt_embeds,
negative_prompt_embeds,
) = self.encode_prompt(
prompt=prompt,
negative_prompt=negative_prompt,
do_classifier_free_guidance=self.do_classifier_free_guidance,
num_videos_per_prompt=num_videos_per_prompt,
prompt_embeds=prompt_embeds,
negative_prompt_embeds=negative_prompt_embeds,
device=device,
max_sequence_length=max_sequence_length,
)
vae_dtype = self.vae.dtype
transformer_dtype = self.transformer.dtype
num_frames_in = None
if image is not None:
if batch_size != 1:
raise ValueError(f"batch_size must be 1 for image input (given {batch_size})")
image = torchvision.transforms.functional.to_tensor(image).unsqueeze(0)
video = torch.cat([image, torch.zeros_like(image).repeat(num_frames - 1, 1, 1, 1)], dim=0)
video = video.unsqueeze(0)
num_frames_in = 1
elif video is None:
video = torch.zeros(batch_size, num_frames, 3, height, width, dtype=torch.uint8)
num_frames_in = 0
else:
num_frames_in = len(video)
if batch_size != 1:
raise ValueError(f"batch_size must be 1 for video input (given {batch_size})")
assert video is not None
video = self.video_processor.preprocess_video(video, height, width)
# pad with last frame (for video2world)
num_frames_out = num_frames
if video.shape[2] < num_frames_out:
n_pad_frames = num_frames_out - num_frames_in
last_frame = video[0, :, -1:, :, :] # [C, T==1, H, W]
pad_frames = last_frame.repeat(1, 1, n_pad_frames, 1, 1) # [B, C, T, H, W]
video = torch.cat((video, pad_frames), dim=2)
assert num_frames_in <= num_frames_out, f"expected ({num_frames_in=}) <= ({num_frames_out=})"
video = video.to(device=device, dtype=vae_dtype)
num_channels_latents = self.transformer.config.in_channels - 1
latents, cond_latent, cond_mask, cond_indicator = self.prepare_latents(
video=video,
batch_size=batch_size * num_videos_per_prompt,
num_channels_latents=num_channels_latents,
height=height,
width=width,
num_frames_in=num_frames_in,
num_frames_out=num_frames,
do_classifier_free_guidance=self.do_classifier_free_guidance,
dtype=torch.float32,
device=device,
generator=generator,
latents=latents,
)
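# Conditioning latent frames are assigned a fixed, near-zero timestep so the transformer treats
# them as (nearly) clean frames; `padding_mask` marks that no spatial padding is present.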
cond_timestep = torch.ones_like(cond_indicator) * conditional_frame_timestep
cond_mask = cond_mask.to(transformer_dtype)
padding_mask = latents.new_zeros(1, 1, height, width, dtype=transformer_dtype)
# Denoising loop
self.scheduler.set_timesteps(num_inference_steps, device=device)
timesteps = self.scheduler.timesteps
self._num_timesteps = len(timesteps)
num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
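# Ground-truth velocity for the conditioning frames (latents -> cond_latent under flow matching);
# it replaces the model's prediction on those frames inside the loop so they stay pinned to the input.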
gt_velocity = (latents - cond_latent) * cond_mask
with self.progress_bar(total=num_inference_steps) as progress_bar:
for i, t in enumerate(timesteps):
if self.interrupt:
continue
self._current_timestep = t.cpu().item()
# NOTE: assumes sigma(t) \in [0, 1]
sigma_t = (
torch.tensor(self.scheduler.sigmas[i].item())
.unsqueeze(0)
.to(device=device, dtype=transformer_dtype)
)
in_latents = cond_mask * cond_latent + (1 - cond_mask) * latents
in_latents = in_latents.to(transformer_dtype)
in_timestep = cond_indicator * cond_timestep + (1 - cond_indicator) * sigma_t
noise_pred = self.transformer(
hidden_states=in_latents,
condition_mask=cond_mask,
timestep=in_timestep,
encoder_hidden_states=prompt_embeds,
padding_mask=padding_mask,
return_dict=False,
)[0]
# NOTE: replace velocity (noise_pred) with gt_velocity for conditioning inputs only
noise_pred = gt_velocity + noise_pred * (1 - cond_mask)
if self.do_classifier_free_guidance:
noise_pred_neg = self.transformer(
hidden_states=in_latents,
condition_mask=cond_mask,
timestep=in_timestep,
encoder_hidden_states=negative_prompt_embeds,
padding_mask=padding_mask,
return_dict=False,
)[0]
# NOTE: replace velocity (noise_pred_neg) with gt_velocity for conditioning inputs only
noise_pred_neg = gt_velocity + noise_pred_neg * (1 - cond_mask)
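# Guide the prediction away from the negative-prompt prediction (classifier-free guidance).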
noise_pred = noise_pred + self.guidance_scale * (noise_pred - noise_pred_neg)
latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]
if callback_on_step_end is not None:
callback_kwargs = {}
for k in callback_on_step_end_tensor_inputs:
callback_kwargs[k] = locals()[k]
callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
latents = callback_outputs.pop("latents", latents)
prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
# call the callback, if provided
if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
progress_bar.update()
if XLA_AVAILABLE:
xm.mark_step()
self._current_timestep = None
if not output_type == "latent":
latents_mean = self.latents_mean.to(latents.device, latents.dtype)
latents_std = self.latents_std.to(latents.device, latents.dtype)
latents = latents * latents_std + latents_mean
video = self.vae.decode(latents.to(self.vae.dtype), return_dict=False)[0]
video = self._match_num_frames(video, num_frames)
assert self.safety_checker is not None
self.safety_checker.to(device)
video = self.video_processor.postprocess_video(video, output_type="np")
video = (video * 255).astype(np.uint8)
video_batch = []
for vid in video:
vid = self.safety_checker.check_video_safety(vid)
video_batch.append(vid)
video = np.stack(video_batch).astype(np.float32) / 255.0 * 2 - 1
video = torch.from_numpy(video).permute(0, 4, 1, 2, 3)
video = self.video_processor.postprocess_video(video, output_type=output_type)
else:
video = latents
# Offload all models
self.maybe_free_model_hooks()
if not return_dict:
return (video,)
return CosmosPipelineOutput(frames=video)
def _match_num_frames(self, video: torch.Tensor, target_num_frames: int) -> torch.Tensor:
if target_num_frames <= 0 or video.shape[2] == target_num_frames:
return video
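# The decoded frame count may not match the requested `num_frames`: expand each decoded frame by
# the temporal compression factor, then pad with the last frame or trim to the exact target length.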
frames_per_latent = max(self.vae_scale_factor_temporal, 1)
video = torch.repeat_interleave(video, repeats=frames_per_latent, dim=2)
current_frames = video.shape[2]
if current_frames < target_num_frames:
pad = video[:, :, -1:, :, :].repeat(1, 1, target_num_frames - current_frames, 1, 1)
video = torch.cat([video, pad], dim=2)
elif current_frames > target_num_frames:
video = video[:, :, :target_num_frames]
return video

View File

@@ -882,24 +882,21 @@ the image\n<|vision_start|><|image_pad|><|vision_end|><|im_end|>\n<|im_start|>as
latents = latents / latents_std + latents_mean
b, c, f, h, w = latents.shape
latents = latents[:, :, 1:] # remove the first frame as it is the original input
latents = latents.permute(0, 2, 1, 3, 4).view(-1, c, 1, h, w)
image = self.vae.decode(latents, return_dict=False)[0] # (b f) c 1 h w
img = self.vae.decode(latents, return_dict=False)[0] # (b f) c 1 h w
img = img.squeeze(2)
image = image.squeeze(2)
image = self.image_processor.postprocess(image, output_type=output_type)
images = []
img = self.image_processor.postprocess(img, output_type=output_type)
image = []
for bidx in range(b):
images.append(image[bidx * f : (bidx + 1) * f])
image.append(img[bidx * f : (bidx + 1) * f])
# Offload all models
self.maybe_free_model_hooks()
if not return_dict:
return (images,)
return (image,)
return QwenImagePipelineOutput(images=images)
return QwenImagePipelineOutput(images=image)

View File

@@ -217,8 +217,6 @@ class UniPCMultistepScheduler(SchedulerMixin, ConfigMixin):
rescale_betas_zero_snr: bool = False,
use_dynamic_shifting: bool = False,
time_shift_type: Literal["exponential"] = "exponential",
sigma_min: Optional[float] = None,
sigma_max: Optional[float] = None,
) -> None:
if self.config.use_beta_sigmas and not is_scipy_available():
raise ImportError("Make sure to install scipy if you want to use beta sigmas.")
@@ -352,12 +350,7 @@ class UniPCMultistepScheduler(SchedulerMixin, ConfigMixin):
log_sigmas = np.log(sigmas)
sigmas = np.flip(sigmas).copy()
sigmas = self._convert_to_karras(in_sigmas=sigmas, num_inference_steps=num_inference_steps)
if self.config.use_flow_sigmas:
sigmas = sigmas / (sigmas + 1)
timesteps = (sigmas * self.config.num_train_timesteps).copy()
else:
timesteps = np.array([self._sigma_to_t(sigma, log_sigmas) for sigma in sigmas]).round()
timesteps = np.array([self._sigma_to_t(sigma, log_sigmas) for sigma in sigmas]).round()
if self.config.final_sigmas_type == "sigma_min":
sigma_last = sigmas[-1]
elif self.config.final_sigmas_type == "zero":

View File

@@ -767,21 +767,6 @@ class ConsisIDPipeline(metaclass=DummyObject):
requires_backends(cls, ["torch", "transformers"])
class Cosmos2_5_PredictBasePipeline(metaclass=DummyObject):
_backends = ["torch", "transformers"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch", "transformers"])
@classmethod
def from_config(cls, *args, **kwargs):
requires_backends(cls, ["torch", "transformers"])
@classmethod
def from_pretrained(cls, *args, **kwargs):
requires_backends(cls, ["torch", "transformers"])
class Cosmos2TextToImagePipeline(metaclass=DummyObject):
_backends = ["torch", "transformers"]

View File

@@ -27,7 +27,7 @@ class DummyCosmosSafetyChecker(ModelMixin, ConfigMixin):
def __init__(self) -> None:
super().__init__()
self.register_buffer("_device_tracker", torch.zeros(1, dtype=torch.float32), persistent=False)
self._dtype = torch.float32
def check_text_safety(self, prompt: str) -> bool:
return True
@@ -35,14 +35,13 @@ class DummyCosmosSafetyChecker(ModelMixin, ConfigMixin):
def check_video_safety(self, frames: np.ndarray) -> np.ndarray:
return frames
def to(self, device: Union[str, torch.device] = None, dtype: torch.dtype = None):
module = super().to(device=device, dtype=dtype)
return module
def to(self, device: Union[str, torch.device] = None, dtype: torch.dtype = None) -> None:
self._dtype = dtype
@property
def device(self) -> torch.device:
return self._device_tracker.device
return None
@property
def dtype(self) -> torch.dtype:
return self._device_tracker.dtype
return self._dtype

View File

@@ -1,337 +0,0 @@
# Copyright 2025 The HuggingFace Team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import inspect
import json
import os
import tempfile
import unittest
import numpy as np
import torch
from transformers import Qwen2_5_VLConfig, Qwen2_5_VLForConditionalGeneration, Qwen2Tokenizer
from diffusers import (
AutoencoderKLWan,
Cosmos2_5_PredictBasePipeline,
CosmosTransformer3DModel,
UniPCMultistepScheduler,
)
from ...testing_utils import enable_full_determinism, torch_device
from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS
from ..test_pipelines_common import PipelineTesterMixin, to_np
from .cosmos_guardrail import DummyCosmosSafetyChecker
enable_full_determinism()
class Cosmos2_5_PredictBaseWrapper(Cosmos2_5_PredictBasePipeline):
@staticmethod
def from_pretrained(*args, **kwargs):
if "safety_checker" not in kwargs or kwargs["safety_checker"] is None:
safety_checker = DummyCosmosSafetyChecker()
device_map = kwargs.get("device_map", "cpu")
torch_dtype = kwargs.get("torch_dtype")
if device_map is not None or torch_dtype is not None:
safety_checker = safety_checker.to(device_map, dtype=torch_dtype)
kwargs["safety_checker"] = safety_checker
return Cosmos2_5_PredictBasePipeline.from_pretrained(*args, **kwargs)
class Cosmos2_5_PredictPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
pipeline_class = Cosmos2_5_PredictBaseWrapper
params = TEXT_TO_IMAGE_PARAMS - {"cross_attention_kwargs"}
batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
image_params = TEXT_TO_IMAGE_IMAGE_PARAMS
image_latents_params = TEXT_TO_IMAGE_IMAGE_PARAMS
required_optional_params = frozenset(
[
"num_inference_steps",
"generator",
"latents",
"return_dict",
"callback_on_step_end",
"callback_on_step_end_tensor_inputs",
]
)
supports_dduf = False
test_xformers_attention = False
test_layerwise_casting = True
test_group_offloading = True
def get_dummy_components(self):
torch.manual_seed(0)
transformer = CosmosTransformer3DModel(
in_channels=16 + 1,
out_channels=16,
num_attention_heads=2,
attention_head_dim=16,
num_layers=2,
mlp_ratio=2,
text_embed_dim=32,
adaln_lora_dim=4,
max_size=(4, 32, 32),
patch_size=(1, 2, 2),
rope_scale=(2.0, 1.0, 1.0),
concat_padding_mask=True,
extra_pos_embed_type="learnable",
)
torch.manual_seed(0)
vae = AutoencoderKLWan(
base_dim=3,
z_dim=16,
dim_mult=[1, 1, 1, 1],
num_res_blocks=1,
temperal_downsample=[False, True, True],
)
torch.manual_seed(0)
scheduler = UniPCMultistepScheduler()
torch.manual_seed(0)
config = Qwen2_5_VLConfig(
text_config={
"hidden_size": 16,
"intermediate_size": 16,
"num_hidden_layers": 2,
"num_attention_heads": 2,
"num_key_value_heads": 2,
"rope_scaling": {
"mrope_section": [1, 1, 2],
"rope_type": "default",
"type": "default",
},
"rope_theta": 1000000.0,
},
vision_config={
"depth": 2,
"hidden_size": 16,
"intermediate_size": 16,
"num_heads": 2,
"out_hidden_size": 16,
},
hidden_size=16,
vocab_size=152064,
vision_end_token_id=151653,
vision_start_token_id=151652,
vision_token_id=151654,
)
text_encoder = Qwen2_5_VLForConditionalGeneration(config)
tokenizer = Qwen2Tokenizer.from_pretrained("hf-internal-testing/tiny-random-Qwen2VLForConditionalGeneration")
components = {
"transformer": transformer,
"vae": vae,
"scheduler": scheduler,
"text_encoder": text_encoder,
"tokenizer": tokenizer,
"safety_checker": DummyCosmosSafetyChecker(),
}
return components
def get_dummy_inputs(self, device, seed=0):
if str(device).startswith("mps"):
generator = torch.manual_seed(seed)
else:
generator = torch.Generator(device=device).manual_seed(seed)
inputs = {
"prompt": "dance monkey",
"negative_prompt": "bad quality",
"generator": generator,
"num_inference_steps": 2,
"guidance_scale": 3.0,
"height": 32,
"width": 32,
"num_frames": 3,
"max_sequence_length": 16,
"output_type": "pt",
}
return inputs
def test_components_function(self):
init_components = self.get_dummy_components()
init_components = {k: v for k, v in init_components.items() if not isinstance(v, (str, int, float))}
pipe = self.pipeline_class(**init_components)
self.assertTrue(hasattr(pipe, "components"))
self.assertTrue(set(pipe.components.keys()) == set(init_components.keys()))
def test_inference(self):
device = "cpu"
components = self.get_dummy_components()
pipe = self.pipeline_class(**components)
pipe.to(device)
pipe.set_progress_bar_config(disable=None)
inputs = self.get_dummy_inputs(device)
video = pipe(**inputs).frames
generated_video = video[0]
self.assertEqual(generated_video.shape, (3, 3, 32, 32))
self.assertTrue(torch.isfinite(generated_video).all())
def test_callback_inputs(self):
sig = inspect.signature(self.pipeline_class.__call__)
has_callback_tensor_inputs = "callback_on_step_end_tensor_inputs" in sig.parameters
has_callback_step_end = "callback_on_step_end" in sig.parameters
if not (has_callback_tensor_inputs and has_callback_step_end):
return
components = self.get_dummy_components()
pipe = self.pipeline_class(**components)
pipe = pipe.to(torch_device)
pipe.set_progress_bar_config(disable=None)
self.assertTrue(
hasattr(pipe, "_callback_tensor_inputs"),
f" {self.pipeline_class} should have `_callback_tensor_inputs` that defines a list of tensor variables its callback function can use as inputs",
)
def callback_inputs_subset(pipe, i, t, callback_kwargs):
for tensor_name in callback_kwargs.keys():
assert tensor_name in pipe._callback_tensor_inputs
return callback_kwargs
def callback_inputs_all(pipe, i, t, callback_kwargs):
for tensor_name in pipe._callback_tensor_inputs:
assert tensor_name in callback_kwargs
for tensor_name in callback_kwargs.keys():
assert tensor_name in pipe._callback_tensor_inputs
return callback_kwargs
inputs = self.get_dummy_inputs(torch_device)
inputs["callback_on_step_end"] = callback_inputs_subset
inputs["callback_on_step_end_tensor_inputs"] = ["latents"]
_ = pipe(**inputs)[0]
inputs["callback_on_step_end"] = callback_inputs_all
inputs["callback_on_step_end_tensor_inputs"] = pipe._callback_tensor_inputs
_ = pipe(**inputs)[0]
def callback_inputs_change_tensor(pipe, i, t, callback_kwargs):
is_last = i == (pipe.num_timesteps - 1)
if is_last:
callback_kwargs["latents"] = torch.zeros_like(callback_kwargs["latents"])
return callback_kwargs
inputs["callback_on_step_end"] = callback_inputs_change_tensor
inputs["callback_on_step_end_tensor_inputs"] = pipe._callback_tensor_inputs
output = pipe(**inputs)[0]
assert output.abs().sum() < 1e10
def test_inference_batch_single_identical(self):
self._test_inference_batch_single_identical(batch_size=2, expected_max_diff=1e-2)
def test_attention_slicing_forward_pass(
self, test_max_difference=True, test_mean_pixel_difference=True, expected_max_diff=1e-3
):
if not getattr(self, "test_attention_slicing", True):
return
components = self.get_dummy_components()
pipe = self.pipeline_class(**components)
for component in pipe.components.values():
if hasattr(component, "set_default_attn_processor"):
component.set_default_attn_processor()
pipe.to(torch_device)
pipe.set_progress_bar_config(disable=None)
generator_device = "cpu"
inputs = self.get_dummy_inputs(generator_device)
output_without_slicing = pipe(**inputs)[0]
pipe.enable_attention_slicing(slice_size=1)
inputs = self.get_dummy_inputs(generator_device)
output_with_slicing1 = pipe(**inputs)[0]
pipe.enable_attention_slicing(slice_size=2)
inputs = self.get_dummy_inputs(generator_device)
output_with_slicing2 = pipe(**inputs)[0]
if test_max_difference:
max_diff1 = np.abs(to_np(output_with_slicing1) - to_np(output_without_slicing)).max()
max_diff2 = np.abs(to_np(output_with_slicing2) - to_np(output_without_slicing)).max()
self.assertLess(
max(max_diff1, max_diff2),
expected_max_diff,
"Attention slicing should not affect the inference results",
)
def test_save_load_optional_components(self, expected_max_difference=1e-4):
self.pipeline_class._optional_components.remove("safety_checker")
super().test_save_load_optional_components(expected_max_difference=expected_max_difference)
self.pipeline_class._optional_components.append("safety_checker")
def test_serialization_with_variants(self):
components = self.get_dummy_components()
pipe = self.pipeline_class(**components)
model_components = [
component_name
for component_name, component in pipe.components.items()
if isinstance(component, torch.nn.Module)
]
model_components.remove("safety_checker")
variant = "fp16"
with tempfile.TemporaryDirectory() as tmpdir:
pipe.save_pretrained(tmpdir, variant=variant, safe_serialization=False)
with open(f"{tmpdir}/model_index.json", "r") as f:
config = json.load(f)
for subfolder in os.listdir(tmpdir):
if not os.path.isfile(subfolder) and subfolder in model_components:
folder_path = os.path.join(tmpdir, subfolder)
is_folder = os.path.isdir(folder_path) and subfolder in config
assert is_folder and any(p.split(".")[1].startswith(variant) for p in os.listdir(folder_path))
def test_torch_dtype_dict(self):
components = self.get_dummy_components()
if not components:
self.skipTest("No dummy components defined.")
pipe = self.pipeline_class(**components)
specified_key = next(iter(components.keys()))
with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as tmpdirname:
pipe.save_pretrained(tmpdirname, safe_serialization=False)
torch_dtype_dict = {specified_key: torch.bfloat16, "default": torch.float16}
loaded_pipe = self.pipeline_class.from_pretrained(
tmpdirname, safety_checker=DummyCosmosSafetyChecker(), torch_dtype=torch_dtype_dict
)
for name, component in loaded_pipe.components.items():
if name == "safety_checker":
continue
if isinstance(component, torch.nn.Module) and hasattr(component, "dtype"):
expected_dtype = torch_dtype_dict.get(name, torch_dtype_dict.get("default", torch.float32))
self.assertEqual(
component.dtype,
expected_dtype,
f"Component '{name}' has dtype {component.dtype} but expected {expected_dtype}",
)
@unittest.skip(
"The pipeline should not be runnable without a safety checker. The test creates a pipeline without passing in "
"a safety checker, which makes the pipeline default to the actual Cosmos Guardrail. The Cosmos Guardrail is "
"too large and slow to run on CI."
)
def test_encode_prompt_works_in_isolation(self):
pass

View File

@@ -0,0 +1,223 @@
# Copyright 2025 The HuggingFace Team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import diffusers
import numpy as np
import torch
from PIL import Image
from transformers import Qwen2_5_VLConfig, Qwen2_5_VLForConditionalGeneration, Qwen2Tokenizer, Qwen2VLProcessor
from diffusers import (
AutoencoderKLQwenImage,
FlowMatchEulerDiscreteScheduler,
QwenImageLayeredPipeline,
QwenImageTransformer2DModel,
)
from ...testing_utils import enable_full_determinism, torch_device
from ..pipeline_params import TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS
from ..test_pipelines_common import PipelineTesterMixin, to_np
enable_full_determinism()
class QwenImageLayeredPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
pipeline_class = QwenImageLayeredPipeline
params = TEXT_TO_IMAGE_PARAMS - {"height", "width", "cross_attention_kwargs"}
batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS
image_params = frozenset(["image"])
image_latents_params = frozenset(["latents"])
required_optional_params = frozenset(
[
"num_inference_steps",
"generator",
"latents",
"return_dict",
"callback_on_step_end",
"callback_on_step_end_tensor_inputs",
]
)
supports_dduf = False
test_xformers_attention = False
test_layerwise_casting = True
test_group_offloading = True
def get_dummy_components(self):
tiny_ckpt_id = "hf-internal-testing/tiny-random-Qwen2VLForConditionalGeneration"
torch.manual_seed(0)
transformer = QwenImageTransformer2DModel(
patch_size=2,
in_channels=16,
out_channels=4,
num_layers=2,
attention_head_dim=16,
num_attention_heads=3,
joint_attention_dim=16,
guidance_embeds=False,
axes_dims_rope=(8, 4, 4),
)
torch.manual_seed(0)
z_dim = 4
vae = AutoencoderKLQwenImage(
base_dim=z_dim * 6,
z_dim=z_dim,
dim_mult=[1, 2, 4],
num_res_blocks=1,
temperal_downsample=[False, True],
latents_mean=[0.0] * z_dim,
latents_std=[1.0] * z_dim,
)
torch.manual_seed(0)
scheduler = FlowMatchEulerDiscreteScheduler()
torch.manual_seed(0)
config = Qwen2_5_VLConfig(
text_config={
"hidden_size": 16,
"intermediate_size": 16,
"num_hidden_layers": 2,
"num_attention_heads": 2,
"num_key_value_heads": 2,
"rope_scaling": {
"mrope_section": [1, 1, 2],
"rope_type": "default",
"type": "default",
},
"rope_theta": 1000000.0,
},
vision_config={
"depth": 2,
"hidden_size": 16,
"intermediate_size": 16,
"num_heads": 2,
"out_hidden_size": 16,
},
hidden_size=16,
vocab_size=152064,
vision_end_token_id=151653,
vision_start_token_id=151652,
vision_token_id=151654,
)
text_encoder = Qwen2_5_VLForConditionalGeneration(config)
tokenizer = Qwen2Tokenizer.from_pretrained(tiny_ckpt_id)
processor = Qwen2VLProcessor.from_pretrained(tiny_ckpt_id)
components = {
"transformer": transformer,
"vae": vae,
"scheduler": scheduler,
"text_encoder": text_encoder,
"tokenizer": tokenizer,
"processor": processor,
}
return components
def get_dummy_inputs(self, device, seed=0):
if str(device).startswith("mps"):
generator = torch.manual_seed(seed)
else:
generator = torch.Generator(device=device).manual_seed(seed)
inputs = {
"prompt": "dance monkey",
"image": Image.new("RGB", (32, 32)),
"negative_prompt": "bad quality",
"generator": generator,
"true_cfg_scale": 1.0,
"layers": 2,
"num_inference_steps": 2,
"max_sequence_length": 16,
"resolution": 640,
"output_type": "pt",
}
return inputs
def test_inference(self):
device = "cpu"
components = self.get_dummy_components()
pipe = self.pipeline_class(**components)
pipe.to(device)
pipe.set_progress_bar_config(disable=None)
inputs = self.get_dummy_inputs(device)
images = pipe(**inputs).images
self.assertEqual(len(images), 1)
generated_layers = images[0]
self.assertEqual(generated_layers.shape, (inputs["layers"], 3, 640, 640))
# fmt: off
expected_slice_layer_0 = torch.tensor([0.5752, 0.6324, 0.4913, 0.4421, 0.4917, 0.4923, 0.4790, 0.4299, 0.4029, 0.3506, 0.3302, 0.3352, 0.3579, 0.4422, 0.5086, 0.5961])
expected_slice_layer_1 = torch.tensor([0.5103, 0.6606, 0.5652, 0.6512, 0.5900, 0.5814, 0.5873, 0.5083, 0.5058, 0.4131, 0.4321, 0.5300, 0.3507, 0.4826, 0.4745, 0.5426])
# fmt: on
layer_0_slice = torch.cat([generated_layers[0].flatten()[:8], generated_layers[0].flatten()[-8:]])
layer_1_slice = torch.cat([generated_layers[1].flatten()[:8], generated_layers[1].flatten()[-8:]])
self.assertTrue(torch.allclose(layer_0_slice, expected_slice_layer_0, atol=1e-3))
self.assertTrue(torch.allclose(layer_1_slice, expected_slice_layer_1, atol=1e-3))
def test_inference_batch_single_identical(self, batch_size=3, expected_max_diff=1e-1):
components = self.get_dummy_components()
pipe = self.pipeline_class(**components)
for component in pipe.components.values():
if hasattr(component, "set_default_attn_processor"):
component.set_default_attn_processor()
pipe.to(torch_device)
pipe.set_progress_bar_config(disable=None)
inputs = self.get_dummy_inputs(torch_device)
inputs["generator"] = self.get_generator(0)
logger = diffusers.logging.get_logger(pipe.__module__)
logger.setLevel(level=diffusers.logging.FATAL)
batched_inputs = {}
batched_inputs.update(inputs)
for name in self.batch_params:
if name not in inputs:
continue
value = inputs[name]
if name == "prompt":
len_prompt = len(value)
batched_inputs[name] = [value[: len_prompt // i] for i in range(1, batch_size + 1)]
batched_inputs[name][-1] = 100 * "very long"
else:
batched_inputs[name] = batch_size * [value]
if "generator" in inputs:
batched_inputs["generator"] = [self.get_generator(i) for i in range(batch_size)]
if "batch_size" in inputs:
batched_inputs["batch_size"] = batch_size
batched_inputs["num_inference_steps"] = inputs["num_inference_steps"]
output = pipe(**inputs).images
output_batch = pipe(**batched_inputs).images
self.assertEqual(len(output_batch), batch_size)
max_diff = np.abs(to_np(output_batch[0][0]) - to_np(output[0][0])).max()
self.assertLess(max_diff, expected_max_diff)

View File

@@ -399,32 +399,3 @@ class UniPCMultistepScheduler1DTest(UniPCMultistepSchedulerTest):
def test_exponential_sigmas(self):
self.check_over_configs(use_exponential_sigmas=True)
def test_flow_and_karras_sigmas(self):
self.check_over_configs(use_flow_sigmas=True, use_karras_sigmas=True)
def test_flow_and_karras_sigmas_values(self):
num_train_timesteps = 1000
num_inference_steps = 5
scheduler = UniPCMultistepScheduler(
sigma_min=0.01,
sigma_max=200.0,
use_flow_sigmas=True,
use_karras_sigmas=True,
num_train_timesteps=num_train_timesteps,
)
scheduler.set_timesteps(num_inference_steps=num_inference_steps)
expected_sigmas = [
0.9950248599052429,
0.9787454605102539,
0.8774884343147278,
0.3604971766471863,
0.009900986216962337,
0.0, # 0 appended as default
]
expected_sigmas = torch.tensor(expected_sigmas)
expected_timesteps = (expected_sigmas * num_train_timesteps).to(torch.int64)
expected_timesteps = expected_timesteps[0:-1]
self.assertTrue(torch.allclose(scheduler.sigmas, expected_sigmas))
self.assertTrue(torch.all(expected_timesteps == scheduler.timesteps))