Mirror of https://github.com/huggingface/diffusers.git (synced 2025-12-23 21:04:56 +08:00)

Compare commits: fix-torcha...modular-qw (3 commits: 19e2ce1b2d, a1af845169, 3a1ba1a0e2)
@@ -231,7 +231,7 @@ class BlockState:

 class ModularPipelineBlocks(ConfigMixin, PushToHubMixin):
     """
-    Base class for all Pipeline Blocks: PipelineBlock, AutoPipelineBlocks, SequentialPipelineBlocks,
+    Base class for all Pipeline Blocks: ConditionalPipelineBlocks, AutoPipelineBlocks, SequentialPipelineBlocks,
     LoopSequentialPipelineBlocks

     [`ModularPipelineBlocks`] provides method to load and save the definition of pipeline blocks.
@@ -527,9 +527,10 @@ class ModularPipelineBlocks(ConfigMixin, PushToHubMixin):
         )


-class AutoPipelineBlocks(ModularPipelineBlocks):
+class ConditionalPipelineBlocks(ModularPipelineBlocks):
     """
-    A Pipeline Blocks that automatically selects a block to run based on the inputs.
+    A Pipeline Blocks that conditionally selects a block to run based on the inputs.
+    Subclasses must implement the `select_block` method to define the logic for selecting the block.

     This class inherits from [`ModularPipelineBlocks`]. Check the superclass documentation for the generic methods the
     library implements for all the pipeline blocks (such as loading or saving etc.)
@@ -539,12 +540,13 @@ class AutoPipelineBlocks(ModularPipelineBlocks):
     Attributes:
         block_classes: List of block classes to be used
         block_names: List of prefixes for each block
-        block_trigger_inputs: List of input names that trigger specific blocks, with None for default
+        block_trigger_inputs: List of input names that select_block() uses to determine which block to run
     """

     block_classes = []
     block_names = []
     block_trigger_inputs = []
+    default_block_name = None  # name of the block to run when select_block returns None; if None itself, the whole block is skipped in that case

     def __init__(self):
         sub_blocks = InsertableDict()
@@ -554,26 +556,15 @@ class AutoPipelineBlocks(ModularPipelineBlocks):
             else:
                 sub_blocks[block_name] = block
         self.sub_blocks = sub_blocks
-        if not (len(self.block_classes) == len(self.block_names) == len(self.block_trigger_inputs)):
+        if not (len(self.block_classes) == len(self.block_names)):
             raise ValueError(
-                f"In {self.__class__.__name__}, the number of block_classes, block_names, and block_trigger_inputs must be the same."
+                f"In {self.__class__.__name__}, the number of block_classes and block_names must be the same."
             )
-        default_blocks = [t for t in self.block_trigger_inputs if t is None]
-        # can only have 1 or 0 default block, and it has to be put last
-        # the order of blocks matters here because the first block with a matching trigger will be dispatched
-        # e.g. blocks = [inpaint, img2img] and block_trigger_inputs = ["mask", "image"]
-        # as long as mask is provided, it is inpaint; if only image is provided, it is img2img
-        if len(default_blocks) > 1 or (len(default_blocks) == 1 and self.block_trigger_inputs[-1] is not None):
+        if self.default_block_name is not None and self.default_block_name not in self.block_names:
             raise ValueError(
-                f"In {self.__class__.__name__}, exactly one None must be specified as the last element "
-                "in block_trigger_inputs."
+                f"In {self.__class__.__name__}, default_block_name '{self.default_block_name}' must be one of block_names: {self.block_names}"
             )

-        # Map trigger inputs to block objects
-        self.trigger_to_block_map = dict(zip(self.block_trigger_inputs, self.sub_blocks.values()))
-        self.trigger_to_block_name_map = dict(zip(self.block_trigger_inputs, self.sub_blocks.keys()))
-        self.block_to_trigger_map = dict(zip(self.sub_blocks.keys(), self.block_trigger_inputs))

     @property
     def model_name(self):
         return next(iter(self.sub_blocks.values())).model_name
@@ -602,8 +593,11 @@ class AutoPipelineBlocks(ModularPipelineBlocks):

     @property
     def required_inputs(self) -> List[str]:
-        if None not in self.block_trigger_inputs:
+        # no default block means this conditional block can be skipped entirely
+        if self.default_block_name is None:
             return []

         first_block = next(iter(self.sub_blocks.values()))
         required_by_all = set(getattr(first_block, "required_inputs", set()))

@@ -614,7 +608,7 @@ class AutoPipelineBlocks(ModularPipelineBlocks):

         return list(required_by_all)

-    # YiYi TODO: add test for this
+
     @property
     def inputs(self) -> List[Tuple[str, Any]]:
         named_inputs = [(name, block.inputs) for name, block in self.sub_blocks.items()]
@@ -639,36 +633,9 @@ class AutoPipelineBlocks(ModularPipelineBlocks):
         combined_outputs = self.combine_outputs(*named_outputs)
         return combined_outputs

-    @torch.no_grad()
-    def __call__(self, pipeline, state: PipelineState) -> PipelineState:
-        # Find default block first (if any)
-        block = self.trigger_to_block_map.get(None)
-        for input_name in self.block_trigger_inputs:
-            if input_name is not None and state.get(input_name) is not None:
-                block = self.trigger_to_block_map[input_name]
-                break
-
-        if block is None:
-            logger.info(f"skipping auto block: {self.__class__.__name__}")
-            return pipeline, state
-
-        try:
-            logger.info(f"Running block: {block.__class__.__name__}, trigger: {input_name}")
-            return block(pipeline, state)
-        except Exception as e:
-            error_msg = (
-                f"\nError in block: {block.__class__.__name__}\n"
-                f"Error details: {str(e)}\n"
-                f"Traceback:\n{traceback.format_exc()}"
-            )
-            logger.error(error_msg)
-            raise
-
-    def _get_trigger_inputs(self):
+    def _get_trigger_inputs(self) -> set:
         """
-        Returns a set of all unique trigger input values found in the blocks. Returns: Set[str] containing all unique
-        block_trigger_inputs values
+        Returns a set of all unique trigger input values found in this block and nested blocks.
         """

         def fn_recursive_get_trigger(blocks):
@@ -676,9 +643,8 @@ class AutoPipelineBlocks(ModularPipelineBlocks):

             if blocks is not None:
                 for name, block in blocks.items():
-                    # Check if current block has trigger inputs (i.e. auto block)
+                    # Check if current block has block_trigger_inputs
                     if hasattr(block, "block_trigger_inputs") and block.block_trigger_inputs is not None:
-                        # Add all non-None values from the trigger inputs list
                         trigger_values.update(t for t in block.block_trigger_inputs if t is not None)

                     # If block has sub_blocks, recursively check them
@@ -688,15 +654,58 @@ class AutoPipelineBlocks(ModularPipelineBlocks):

             return trigger_values

-        trigger_inputs = set(self.block_trigger_inputs)
-        trigger_inputs.update(fn_recursive_get_trigger(self.sub_blocks))
+        # Start with this block's block_trigger_inputs
+        all_triggers = set(t for t in self.block_trigger_inputs if t is not None)
+        # Add nested triggers
+        all_triggers.update(fn_recursive_get_trigger(self.sub_blocks))

-        return trigger_inputs
+        return all_triggers

     @property
     def trigger_inputs(self):
+        """All trigger inputs including from nested blocks."""
         return self._get_trigger_inputs()

+    def select_block(self, **kwargs) -> Optional[str]:
+        """
+        Select the block to run based on the trigger inputs.
+        Subclasses must implement this method to define the logic for selecting the block.
+
+        Args:
+            **kwargs: Trigger input names and their values from the state.
+
+        Returns:
+            Optional[str]: The name of the block to run, or None to use default/skip.
+        """
+        raise NotImplementedError(f"Subclass {self.__class__.__name__} must implement the `select_block` method.")
+
+    @torch.no_grad()
+    def __call__(self, pipeline, state: PipelineState) -> PipelineState:
+        trigger_kwargs = {name: state.get(name) for name in self.block_trigger_inputs if name is not None}
+        block_name = self.select_block(**trigger_kwargs)
+
+        if block_name is None:
+            block_name = self.default_block_name
+
+        if block_name is None:
+            logger.info(f"skipping conditional block: {self.__class__.__name__}")
+            return pipeline, state
+
+        block = self.sub_blocks[block_name]
+
+        try:
+            logger.info(f"Running block: {block.__class__.__name__}")
+            return block(pipeline, state)
+        except Exception as e:
+            error_msg = (
+                f"\nError in block: {block.__class__.__name__}\n"
+                f"Error details: {str(e)}\n"
+                f"Traceback:\n{traceback.format_exc()}"
+            )
+            logger.error(error_msg)
+            raise
+
     def __repr__(self):
         class_name = self.__class__.__name__
         base_class = self.__class__.__bases__[0].__name__
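Note: a minimal sketch (not part of this diff) of how a subclass might implement the new `select_block` hook; the two block classes are hypothetical placeholders:

    class MyMaskOrImageBlocks(ConditionalPipelineBlocks):
        block_classes = [MyInpaintBlock, MyImg2ImgBlock]
        block_names = ["inpaint", "img2img"]
        block_trigger_inputs = ["mask", "image"]
        default_block_name = None  # skip this block entirely when nothing matches

        def select_block(self, mask=None, image=None) -> Optional[str]:
            # custom dispatch logic: inpaint wins when both trigger inputs are present
            if mask is not None:
                return "inpaint"
            if image is not None:
                return "img2img"
            return None  # falls back to default_block_name (here: skip)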
@@ -708,7 +717,7 @@ class AutoPipelineBlocks(ModularPipelineBlocks):
         header += "\n"
         header += " " + "=" * 100 + "\n"
         header += " This pipeline contains blocks that are selected at runtime based on inputs.\n"
-        header += f" Trigger Inputs: {[inp for inp in self.trigger_inputs if inp is not None]}\n"
+        header += f" Trigger Inputs: {sorted(self.trigger_inputs)}\n"
         header += " " + "=" * 100 + "\n\n"

         # Format description with proper indentation
@@ -729,31 +738,20 @@ class AutoPipelineBlocks(ModularPipelineBlocks):
         expected_configs = getattr(self, "expected_configs", [])
         configs_str = format_configs(expected_configs, indent_level=2, add_empty_lines=False)

-        # Blocks section - moved to the end with simplified format
+        # Blocks section
         blocks_str = " Sub-Blocks:\n"
         for i, (name, block) in enumerate(self.sub_blocks.items()):
-            # Get trigger input for this block
-            trigger = None
-            if hasattr(self, "block_to_trigger_map"):
-                trigger = self.block_to_trigger_map.get(name)
-                # Format the trigger info
-                if trigger is None:
-                    trigger_str = "[default]"
-                elif isinstance(trigger, (list, tuple)):
-                    trigger_str = f"[trigger: {', '.join(str(t) for t in trigger)}]"
-                else:
-                    trigger_str = f"[trigger: {trigger}]"
-                # For AutoPipelineBlocks, add bullet points
-                blocks_str += f" • {name} {trigger_str} ({block.__class__.__name__})\n"
+            if name == self.default_block_name:
+                additional_str = " [default]"
             else:
-                # For SequentialPipelineBlocks, show execution order
-                blocks_str += f" [{i}] {name} ({block.__class__.__name__})\n"
+                additional_str = ""
+            blocks_str += f" • {name}{additional_str} ({block.__class__.__name__})\n"

             # Add block description
-            desc_lines = block.description.split("\n")
-            indented_desc = desc_lines[0]
-            if len(desc_lines) > 1:
-                indented_desc += "\n" + "\n".join(" " + line for line in desc_lines[1:])
+            block_desc_lines = block.description.split("\n")
+            indented_desc = block_desc_lines[0]
+            if len(block_desc_lines) > 1:
+                indented_desc += "\n" + "\n".join(" " + line for line in block_desc_lines[1:])
             blocks_str += f" Description: {indented_desc}\n\n"

         # Build the representation with conditional sections
@@ -784,6 +782,35 @@ class AutoPipelineBlocks(ModularPipelineBlocks):
         )


+class AutoPipelineBlocks(ConditionalPipelineBlocks):
+    """
+    A Pipeline Blocks that automatically selects a block to run based on the presence of trigger inputs.
+    """
+
+    def __init__(self):
+        super().__init__()
+        if not (len(self.block_classes) == len(self.block_names) == len(self.block_trigger_inputs)):
+            raise ValueError(
+                f"In {self.__class__.__name__}, the number of block_classes, block_names, and block_trigger_inputs must be the same."
+            )
+
+    @property
+    def default_block_name(self) -> Optional[str]:
+        """Derive default_block_name from block_trigger_inputs (None entry)."""
+        if None in self.block_trigger_inputs:
+            idx = self.block_trigger_inputs.index(None)
+            return self.block_names[idx]
+        return None
+
+    def select_block(self, **kwargs) -> Optional[str]:
+        """Select block based on which trigger input is present (not None)."""
+        for trigger_input, block_name in zip(self.block_trigger_inputs, self.block_names):
+            if trigger_input is not None and kwargs.get(trigger_input) is not None:
+                return block_name
+        return None
+
+
 class SequentialPipelineBlocks(ModularPipelineBlocks):
     """
     A Pipeline Blocks that combines multiple pipeline block classes into one. When called, it will call each block in
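Note: a sketch (hypothetical block classes and values) of how the derived behavior plays out for an AutoPipelineBlocks subclass under the new base class:

    class MyAutoBlocks(AutoPipelineBlocks):
        block_classes = [MyInpaintBlock, MyImg2ImgBlock, MyText2ImgBlock]
        block_names = ["inpaint", "img2img", "text2img"]
        block_trigger_inputs = ["mask", "image", None]

    blocks = MyAutoBlocks()
    blocks.default_block_name             # "text2img" (the block paired with the None trigger)
    blocks.select_block(mask=m, image=i)  # "inpaint": first matching trigger wins
    blocks.select_block(image=i)          # "img2img"
    blocks.select_block()                 # None -> __call__ falls back to default_block_name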
@@ -885,7 +912,8 @@ class SequentialPipelineBlocks(ModularPipelineBlocks):

             # Only add outputs if the block cannot be skipped
             should_add_outputs = True
-            if hasattr(block, "block_trigger_inputs") and None not in block.block_trigger_inputs:
+            if isinstance(block, ConditionalPipelineBlocks) and block.default_block_name is None:
+                # ConditionalPipelineBlocks without default can be skipped
                 should_add_outputs = False

             if should_add_outputs:
@@ -948,8 +976,7 @@ class SequentialPipelineBlocks(ModularPipelineBlocks):

     def _get_trigger_inputs(self):
         """
-        Returns a set of all unique trigger input values found in the blocks. Returns: Set[str] containing all unique
-        block_trigger_inputs values
+        Returns a set of all unique trigger input values found in the blocks.
         """

         def fn_recursive_get_trigger(blocks):
@@ -957,9 +984,8 @@ class SequentialPipelineBlocks(ModularPipelineBlocks):

             if blocks is not None:
                 for name, block in blocks.items():
-                    # Check if current block has trigger inputs (i.e. auto block)
+                    # Check if current block has block_trigger_inputs (ConditionalPipelineBlocks)
                     if hasattr(block, "block_trigger_inputs") and block.block_trigger_inputs is not None:
-                        # Add all non-None values from the trigger inputs list
                         trigger_values.update(t for t in block.block_trigger_inputs if t is not None)

                     # If block has sub_blocks, recursively check them
@@ -975,82 +1001,85 @@ class SequentialPipelineBlocks(ModularPipelineBlocks):
     def trigger_inputs(self):
         return self._get_trigger_inputs()

-    def _traverse_trigger_blocks(self, trigger_inputs):
-        # Convert trigger_inputs to a set for easier manipulation
-        active_triggers = set(trigger_inputs)
+    def _traverse_trigger_blocks(self, active_inputs):
+        """
+        Traverse blocks and select which ones would run given the active inputs.
+
+        Args:
+            active_inputs: Dict of input names to values that are "present"
+
+        Returns:
+            OrderedDict of block_name -> block that would execute
+        """

-        def fn_recursive_traverse(block, block_name, active_triggers):
+        def fn_recursive_traverse(block, block_name, active_inputs):
             result_blocks = OrderedDict()

-            # sequential (includes loopsequential) or PipelineBlock
-            if not hasattr(block, "block_trigger_inputs"):
-                if block.sub_blocks:
-                    # sequential or LoopSequentialPipelineBlocks (keep traversing)
-                    for sub_block_name, sub_block in block.sub_blocks.items():
-                        blocks_to_update = fn_recursive_traverse(sub_block, sub_block_name, active_triggers)
-                        blocks_to_update = {f"{block_name}.{k}": v for k, v in blocks_to_update.items()}
-                        result_blocks.update(blocks_to_update)
+            # ConditionalPipelineBlocks (includes AutoPipelineBlocks)
+            if isinstance(block, ConditionalPipelineBlocks):
+                trigger_kwargs = {name: active_inputs.get(name) for name in block.block_trigger_inputs}
+                selected_block_name = block.select_block(**trigger_kwargs)
+
+                if selected_block_name is None:
+                    selected_block_name = block.default_block_name
+
+                if selected_block_name is None:
+                    return result_blocks
+
+                selected_block = block.sub_blocks[selected_block_name]
+
+                if selected_block.sub_blocks:
+                    result_blocks.update(fn_recursive_traverse(selected_block, block_name, active_inputs))
                 else:
                     # PipelineBlock
-                    result_blocks[block_name] = block
-                    # Add this block's output names to active triggers if defined
-                    if hasattr(block, "outputs"):
-                        active_triggers.update(out.name for out in block.outputs)
+                    result_blocks[block_name] = selected_block
+                    if hasattr(selected_block, "outputs"):
+                        for out in selected_block.outputs:
+                            active_inputs[out.name] = True

                 return result_blocks

-            # auto
+            # SequentialPipelineBlocks or LoopSequentialPipelineBlocks
             if block.sub_blocks:
                 for sub_block_name, sub_block in block.sub_blocks.items():
-                    blocks_to_update = fn_recursive_traverse(sub_block, sub_block_name, active_triggers)
+                    blocks_to_update = fn_recursive_traverse(sub_block, sub_block_name, active_inputs)
                     blocks_to_update = {f"{block_name}.{k}": v for k, v in blocks_to_update.items()}
                     result_blocks.update(blocks_to_update)
             else:
-                # Find first block_trigger_input that matches any value in our active_triggers
-                this_block = None
-                for trigger_input in block.block_trigger_inputs:
-                    if trigger_input is not None and trigger_input in active_triggers:
-                        this_block = block.trigger_to_block_map[trigger_input]
-                        break
-
-                # If no matches found, try to get the default (None) block
-                if this_block is None and None in block.block_trigger_inputs:
-                    this_block = block.trigger_to_block_map[None]
-
-                if this_block is not None:
-                    # sequential/auto (keep traversing)
-                    if this_block.sub_blocks:
-                        result_blocks.update(fn_recursive_traverse(this_block, block_name, active_triggers))
-                    else:
-                        # PipelineBlock
-                        result_blocks[block_name] = this_block
-                        # Add this block's output names to active triggers if defined
-                        # YiYi TODO: do we need outputs here? can it just be intermediate_outputs? can we get rid of outputs attribute?
-                        if hasattr(this_block, "outputs"):
-                            active_triggers.update(out.name for out in this_block.outputs)
+                # PipelineBlock
+                result_blocks[block_name] = block
+                if hasattr(block, "outputs"):
+                    for out in block.outputs:
+                        active_inputs[out.name] = True

             return result_blocks

         all_blocks = OrderedDict()
         for block_name, block in self.sub_blocks.items():
-            blocks_to_update = fn_recursive_traverse(block, block_name, active_triggers)
+            blocks_to_update = fn_recursive_traverse(block, block_name, active_inputs)
             all_blocks.update(blocks_to_update)
         return all_blocks

-    def get_execution_blocks(self, *trigger_inputs):
-        trigger_inputs_all = self.trigger_inputs
-
-        if trigger_inputs is not None:
-            if not isinstance(trigger_inputs, (list, tuple, set)):
-                trigger_inputs = [trigger_inputs]
-            invalid_inputs = [x for x in trigger_inputs if x not in trigger_inputs_all]
-            if invalid_inputs:
-                logger.warning(
-                    f"The following trigger inputs will be ignored as they are not supported: {invalid_inputs}"
-                )
-                trigger_inputs = [x for x in trigger_inputs if x in trigger_inputs_all]
-
-        if trigger_inputs is None:
-            if None in trigger_inputs_all:
-                trigger_inputs = [None]
-            else:
-                trigger_inputs = [trigger_inputs_all[0]]
-        blocks_triggered = self._traverse_trigger_blocks(trigger_inputs)
+    def get_execution_blocks(self, **kwargs):
+        """
+        Get the blocks that would execute given the specified inputs.
+
+        Args:
+            **kwargs: Input names and values. Only trigger inputs affect block selection.
+                Pass any inputs that would be non-None at runtime.
+
+        Returns:
+            SequentialPipelineBlocks containing only the blocks that would execute
+
+        Example:
+            # Get blocks for inpainting workflow
+            blocks = pipeline.get_execution_blocks(prompt="a cat", mask=mask, image=image)
+
+            # Get blocks for text2image workflow
+            blocks = pipeline.get_execution_blocks(prompt="a cat")
+        """
+        # Filter out None values
+        active_inputs = {k: v for k, v in kwargs.items() if v is not None}
+
+        blocks_triggered = self._traverse_trigger_blocks(active_inputs)
         return SequentialPipelineBlocks.from_blocks_dict(blocks_triggered)

     def __repr__(self):
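Note: the calling convention changes from trigger-input *names* to input name/value pairs; a sketch of the migration, with hypothetical pipeline and values:

    # before: positional trigger input names
    blocks = auto_blocks.get_execution_blocks("mask")

    # after: keyword inputs, as they would be passed at runtime; None values are ignored
    blocks = auto_blocks.get_execution_blocks(prompt="a cat", mask=mask_image)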
@@ -1067,7 +1096,7 @@ class SequentialPipelineBlocks(ModularPipelineBlocks):
         header += f" Trigger Inputs: {[inp for inp in self.trigger_inputs if inp is not None]}\n"
         # Get first trigger input as example
         example_input = next(t for t in self.trigger_inputs if t is not None)
-        header += f" Use `get_execution_blocks()` with input names to see selected blocks (e.g. `get_execution_blocks('{example_input}')`).\n"
+        header += f" Use `get_execution_blocks()` to see selected blocks (e.g. `get_execution_blocks({example_input}=...)`).\n"
         header += " " + "=" * 100 + "\n\n"

         # Format description with proper indentation
@@ -1091,22 +1120,9 @@ class SequentialPipelineBlocks(ModularPipelineBlocks):
         # Blocks section - moved to the end with simplified format
         blocks_str = " Sub-Blocks:\n"
         for i, (name, block) in enumerate(self.sub_blocks.items()):
-            # Get trigger input for this block
-            trigger = None
-            if hasattr(self, "block_to_trigger_map"):
-                trigger = self.block_to_trigger_map.get(name)
-                # Format the trigger info
-                if trigger is None:
-                    trigger_str = "[default]"
-                elif isinstance(trigger, (list, tuple)):
-                    trigger_str = f"[trigger: {', '.join(str(t) for t in trigger)}]"
-                else:
-                    trigger_str = f"[trigger: {trigger}]"
-                # For AutoPipelineBlocks, add bullet points
-                blocks_str += f" • {name} {trigger_str} ({block.__class__.__name__})\n"
-            else:
-                # For SequentialPipelineBlocks, show execution order
-                blocks_str += f" [{i}] {name} ({block.__class__.__name__})\n"
+            # show execution order
+            blocks_str += f" [{i}] {name} ({block.__class__.__name__})\n"

             # Add block description
             desc_lines = block.description.split("\n")
@@ -1230,15 +1246,9 @@ class LoopSequentialPipelineBlocks(ModularPipelineBlocks):
             if inp.name not in outputs and inp not in inputs:
                 inputs.append(inp)

-            # Only add outputs if the block cannot be skipped
-            should_add_outputs = True
-            if hasattr(block, "block_trigger_inputs") and None not in block.block_trigger_inputs:
-                should_add_outputs = False
-
-            if should_add_outputs:
-                # Add this block's outputs
-                block_intermediate_outputs = [out.name for out in block.intermediate_outputs]
-                outputs.update(block_intermediate_outputs)
+            # Add this block's outputs
+            block_intermediate_outputs = [out.name for out in block.intermediate_outputs]
+            outputs.update(block_intermediate_outputs)

         for input_param in inputs:
             if input_param.name in self.required_inputs:
@@ -1295,6 +1305,14 @@ class LoopSequentialPipelineBlocks(ModularPipelineBlocks):
             sub_blocks[block_name] = block
         self.sub_blocks = sub_blocks

+        # Validate that sub_blocks are only leaf blocks
+        for block_name, block in self.sub_blocks.items():
+            if block.sub_blocks:
+                raise ValueError(
+                    f"In {self.__class__.__name__}, sub_blocks must be leaf blocks (no sub_blocks). "
+                    f"Block '{block_name}' ({block.__class__.__name__}) has sub_blocks."
+                )
+
     @classmethod
     def from_blocks_dict(cls, blocks_dict: Dict[str, Any]) -> "LoopSequentialPipelineBlocks":
         """
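Note: a sketch of what the new validation rejects, with hypothetical classes; nesting a composite block inside a loop now fails fast at construction time instead of at runtime:

    class MyLoop(LoopSequentialPipelineBlocks):
        block_classes = [MySequentialBlocks]  # a composite block that has sub_blocks
        block_names = ["inner"]

    MyLoop()  # raises ValueError: sub_blocks must be leaf blocks (no sub_blocks)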
@@ -21,21 +21,16 @@ except OptionalDependencyNotAvailable:

     _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
 else:
     _import_structure["encoders"] = ["QwenImageTextEncoderStep"]
-    _import_structure["modular_blocks"] = [
-        "ALL_BLOCKS",
+    _import_structure["modular_blocks_qwenimage"] = [
         "AUTO_BLOCKS",
         "CONTROLNET_BLOCKS",
-        "EDIT_AUTO_BLOCKS",
-        "EDIT_BLOCKS",
-        "EDIT_INPAINT_BLOCKS",
-        "EDIT_PLUS_AUTO_BLOCKS",
-        "EDIT_PLUS_BLOCKS",
         "IMAGE2IMAGE_BLOCKS",
         "INPAINT_BLOCKS",
         "TEXT2IMAGE_BLOCKS",
         "QwenImageAutoBlocks",
     ]
+    _import_structure["modular_blocks_qwenimage_edit"] = [
+        "EDIT_AUTO_BLOCKS",
+        "QwenImageEditAutoBlocks",
+    ]
+    _import_structure["modular_blocks_qwenimage_edit_plus"] = [
+        "EDIT_PLUS_AUTO_BLOCKS",
+        "QwenImageEditPlusAutoBlocks",
+    ]
     _import_structure["modular_pipeline"] = [
@@ -51,23 +46,16 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
     except OptionalDependencyNotAvailable:
         from ...utils.dummy_torch_and_transformers_objects import *  # noqa F403
     else:
         from .encoders import (
             QwenImageTextEncoderStep,
         )
-        from .modular_blocks import (
-            ALL_BLOCKS,
+        from .modular_blocks_qwenimage import (
             AUTO_BLOCKS,
             CONTROLNET_BLOCKS,
-            EDIT_AUTO_BLOCKS,
-            EDIT_BLOCKS,
-            EDIT_INPAINT_BLOCKS,
-            EDIT_PLUS_AUTO_BLOCKS,
-            EDIT_PLUS_BLOCKS,
             IMAGE2IMAGE_BLOCKS,
             INPAINT_BLOCKS,
             TEXT2IMAGE_BLOCKS,
             QwenImageAutoBlocks,
         )
+        from .modular_blocks_qwenimage_edit import (
+            EDIT_AUTO_BLOCKS,
+            QwenImageEditAutoBlocks,
+        )
+        from .modular_blocks_qwenimage_edit_plus import (
+            EDIT_PLUS_AUTO_BLOCKS,
+            QwenImageEditPlusAutoBlocks,
+        )
         from .modular_pipeline import (
@@ -86,4 +74,4 @@ else:
     )

 for name, value in _dummy_objects.items():
-    setattr(sys.modules[__name__], name, value)
+    setattr(sys.modules[__name__], name, value)
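Note: consumers importing through this package `__init__` should be unaffected by the file split; a sketch, assuming this is the `qwenimage` modular pipelines package in diffusers (the import path is an assumption, not stated in the diff):

    from diffusers.modular_pipelines.qwenimage import (
        QwenImageAutoBlocks,          # still re-exported from modular_blocks_qwenimage
        QwenImageEditAutoBlocks,      # now from modular_blocks_qwenimage_edit
        QwenImageEditPlusAutoBlocks,  # now from modular_blocks_qwenimage_edit_plus
    )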
@@ -639,19 +639,65 @@ class QwenImageEditRoPEInputsStep(ModularPipelineBlocks):
         return components, state


-class QwenImageEditPlusRoPEInputsStep(QwenImageEditRoPEInputsStep):
+class QwenImageEditPlusRoPEInputsStep(ModularPipelineBlocks):
+    """RoPE inputs step for Edit Plus that handles lists of image heights/widths."""
+
     model_name = "qwenimage-edit-plus"

+    @property
+    def description(self) -> str:
+        return (
+            "Step that prepares the RoPE inputs for denoising process. This is used in QwenImage Edit Plus.\n"
+            "Unlike Edit, Edit Plus handles lists of image_height/image_width for multiple reference images.\n"
+            "Should be placed after prepare_latents step."
+        )
+
+    @property
+    def inputs(self) -> List[InputParam]:
+        return [
+            InputParam(name="batch_size", required=True),
+            InputParam(name="image_height", required=True, type_hint=List[int]),
+            InputParam(name="image_width", required=True, type_hint=List[int]),
+            InputParam(name="height", required=True),
+            InputParam(name="width", required=True),
+            InputParam(name="prompt_embeds_mask"),
+            InputParam(name="negative_prompt_embeds_mask"),
+        ]
+
+    @property
+    def intermediate_outputs(self) -> List[OutputParam]:
+        return [
+            OutputParam(
+                name="img_shapes",
+                type_hint=List[List[Tuple[int, int, int]]],
+                description="The shapes of the image latents, used for RoPE calculation",
+            ),
+            OutputParam(
+                name="txt_seq_lens",
+                kwargs_type="denoiser_input_fields",
+                type_hint=List[int],
+                description="The sequence lengths of the prompt embeds, used for RoPE calculation",
+            ),
+            OutputParam(
+                name="negative_txt_seq_lens",
+                kwargs_type="denoiser_input_fields",
+                type_hint=List[int],
+                description="The sequence lengths of the negative prompt embeds, used for RoPE calculation",
+            ),
+        ]
+
+    def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState:
+        block_state = self.get_block_state(state)
+
+        vae_scale_factor = components.vae_scale_factor
+
+        # Edit Plus: image_height and image_width are lists
         block_state.img_shapes = [
             [
                 (1, block_state.height // vae_scale_factor // 2, block_state.width // vae_scale_factor // 2),
                 *[
-                    (1, vae_height // vae_scale_factor // 2, vae_width // vae_scale_factor // 2)
-                    for vae_height, vae_width in zip(block_state.image_height, block_state.image_width)
+                    (1, img_height // vae_scale_factor // 2, img_width // vae_scale_factor // 2)
+                    for img_height, img_width in zip(block_state.image_height, block_state.image_width)
                 ],
             ]
         ] * block_state.batch_size
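Note: a worked example of the img_shapes computation above, assuming vae_scale_factor=8 (an assumption for illustration), a 1024x1024 target, one 512x768 reference image, and batch_size=2:

    # target grid:    1024 // 8 // 2 = 64                      -> (1, 64, 64)
    # reference grid: 512 // 8 // 2 = 32, 768 // 8 // 2 = 48   -> (1, 32, 48)
    img_shapes = [
        [(1, 64, 64), (1, 32, 48)],
        [(1, 64, 64), (1, 32, 48)],
    ]  # one inner list per batch element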
@@ -244,18 +244,19 @@ def encode_vae_image(
 class QwenImageEditResizeDynamicStep(ModularPipelineBlocks):
     model_name = "qwenimage"

-    def __init__(self, input_name: str = "image", output_name: str = "resized_image"):
-        """Create a configurable step for resizing images to the target area (1024 * 1024) while maintaining the aspect ratio.
-
-        This block resizes an input image tensor and exposes the resized result under configurable input and output
-        names. Use this when you need to wire the resize step to different image fields (e.g., "image",
-        "control_image")
-
+    def __init__(
+        self,
+        input_name: str = "image",
+        output_name: str = "resized_image",
+        target_area: int = 1024 * 1024,
+    ):
+        """Create a configurable step for resizing images to the target area while maintaining the aspect ratio.
+
         Args:
             input_name (str, optional): Name of the image field to read from the
                 pipeline state. Defaults to "image".
             output_name (str, optional): Name of the resized image field to write
                 back to the pipeline state. Defaults to "resized_image".
+            target_area (int, optional): Target area in pixels. Defaults to 1024*1024.
         """
         if not isinstance(input_name, str) or not isinstance(output_name, str):
             raise ValueError(
@@ -263,11 +264,12 @@ class QwenImageEditResizeDynamicStep(ModularPipelineBlocks):
             )
         self._image_input_name = input_name
         self._resized_image_output_name = output_name
+        self._target_area = target_area
         super().__init__()

     @property
     def description(self) -> str:
-        return f"Image Resize step that resize the {self._image_input_name} to the target area (1024 * 1024) while maintaining the aspect ratio."
+        return f"Image Resize step that resize the {self._image_input_name} to the target area {self._target_area} while maintaining the aspect ratio."

     @property
     def expected_components(self) -> List[ComponentSpec]:
@@ -320,48 +322,67 @@ class QwenImageEditResizeDynamicStep(ModularPipelineBlocks):
         self.set_block_state(state, block_state)
         return components, state


-class QwenImageEditPlusResizeDynamicStep(QwenImageEditResizeDynamicStep):
-    model_name = "qwenimage"
+class QwenImageEditPlusResizeDynamicStep(ModularPipelineBlocks):
+    """Resize each image independently based on its own aspect ratio. For QwenImage Edit Plus."""
+
+    model_name = "qwenimage-edit-plus"

     def __init__(
         self,
         input_name: str = "image",
         output_name: str = "resized_image",
-        vae_image_output_name: str = "vae_image",
+        target_area: int = 1024 * 1024,
     ):
-        """Create a configurable step for resizing images to the target area (384 * 384) while maintaining the aspect ratio.
-
-        This block resizes an input image or a list of input images and exposes the resized result under configurable
-        input and output names. Use this when you need to wire the resize step to different image fields (e.g.,
-        "image", "control_image")
+        """Create a step for resizing images to a target area.
+
+        Each image is resized independently based on its own aspect ratio.
+        This is suitable for Edit Plus where multiple reference images can have different dimensions.

         Args:
-            input_name (str, optional): Name of the image field to read from the
-                pipeline state. Defaults to "image".
-            output_name (str, optional): Name of the resized image field to write
-                back to the pipeline state. Defaults to "resized_image".
-            vae_image_output_name (str, optional): Name of the image field
-                to write back to the pipeline state. This is used by the VAE encoder step later on. QwenImage Edit Plus
-                processes the input image(s) differently for the VL and the VAE.
+            input_name (str, optional): Name of the image field to read. Defaults to "image".
+            output_name (str, optional): Name of the resized image field to write. Defaults to "resized_image".
+            target_area (int, optional): Target area in pixels. Defaults to 1024*1024.
         """
         if not isinstance(input_name, str) or not isinstance(output_name, str):
             raise ValueError(
                 f"input_name and output_name must be strings but are {type(input_name)} and {type(output_name)}"
             )
-        self.condition_image_size = 384 * 384
         self._image_input_name = input_name
         self._resized_image_output_name = output_name
-        self._vae_image_output_name = vae_image_output_name
+        self._target_area = target_area
         super().__init__()

+    @property
+    def description(self) -> str:
+        return (
+            f"Image Resize step that resizes {self._image_input_name} to target area {self._target_area}.\n"
+            "Each image is resized independently based on its own aspect ratio."
+        )
+
+    @property
+    def expected_components(self) -> List[ComponentSpec]:
+        return [
+            ComponentSpec(
+                "image_resize_processor",
+                VaeImageProcessor,
+                config=FrozenDict({"vae_scale_factor": 16}),
+                default_creation_method="from_config",
+            ),
+        ]
+
+    @property
+    def inputs(self) -> List[InputParam]:
+        return [
+            InputParam(
+                name=self._image_input_name, required=True, type_hint=torch.Tensor, description="The image(s) to resize"
+            ),
+        ]
+
     @property
     def intermediate_outputs(self) -> List[OutputParam]:
-        return super().intermediate_outputs + [
+        return [
             OutputParam(
-                name=self._vae_image_output_name,
-                type_hint=List[PIL.Image.Image],
-                description="The images to be processed which will be further used by the VAE encoder.",
+                name=self._resized_image_output_name, type_hint=List[PIL.Image.Image], description="The resized images"
             ),
         ]
@@ -374,26 +395,21 @@ class QwenImageEditPlusResizeDynamicStep(QwenImageEditResizeDynamicStep):
         if not is_valid_image_imagelist(images):
             raise ValueError(f"Images must be image or list of images but are {type(images)}")

-        if (
-            not isinstance(images, torch.Tensor)
-            and isinstance(images, PIL.Image.Image)
-            and not isinstance(images, list)
-        ):
+        if is_valid_image(images):
             images = [images]

-        # TODO (sayakpaul): revisit this when the inputs are `torch.Tensor`s
-        condition_images = []
-        vae_images = []
-        for img in images:
-            image_width, image_height = img.size
-            condition_width, condition_height, _ = calculate_dimensions(
-                self.condition_image_size, image_width / image_height
+        # Resize each image independently based on its own aspect ratio
+        resized_images = []
+        for image in images:
+            image_width, image_height = image.size
+            calculated_width, calculated_height, _ = calculate_dimensions(
+                self._target_area, image_width / image_height
             )
-            condition_images.append(components.image_resize_processor.resize(img, condition_height, condition_width))
-            vae_images.append(img)
+            resized_images.append(
+                components.image_resize_processor.resize(image, height=calculated_height, width=calculated_width)
+            )

-        setattr(block_state, self._resized_image_output_name, condition_images)
-        setattr(block_state, self._vae_image_output_name, vae_images)
+        setattr(block_state, self._resized_image_output_name, resized_images)
         self.set_block_state(state, block_state)
         return components, state
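Note: a quick sketch of what the independent-resize behavior means in practice, assuming calculate_dimensions(target_area, aspect_ratio) returns (width, height, ratio):

    step = QwenImageEditPlusResizeDynamicStep(target_area=1024 * 1024)
    # a 2000x1000 input keeps its 2:1 ratio and lands near 1448x724 (modulo rounding
    # inside calculate_dimensions); a square 500x500 input lands near 1024x1024,
    # so a mixed-size list of reference images stays mixed-size after resizing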
@@ -647,8 +663,30 @@ class QwenImageEditTextEncoderStep(ModularPipelineBlocks):
         return components, state


-class QwenImageEditPlusTextEncoderStep(QwenImageEditTextEncoderStep):
-    model_name = "qwenimage"
+class QwenImageEditPlusTextEncoderStep(ModularPipelineBlocks):
+    """Text encoder for QwenImage Edit Plus that handles multiple reference images."""
+
+    model_name = "qwenimage-edit-plus"
+
+    @property
+    def description(self) -> str:
+        return (
+            "Text Encoder step for QwenImage Edit Plus that processes prompt and multiple images together "
+            "to generate text embeddings for guiding image generation."
+        )
+
+    @property
+    def expected_components(self) -> List[ComponentSpec]:
+        return [
+            ComponentSpec("text_encoder", Qwen2_5_VLForConditionalGeneration),
+            ComponentSpec("processor", Qwen2VLProcessor),
+            ComponentSpec(
+                "guider",
+                ClassifierFreeGuidance,
+                config=FrozenDict({"guidance_scale": 4.0}),
+                default_creation_method="from_config",
+            ),
+        ]

     @property
     def expected_configs(self) -> List[ConfigSpec]:
@@ -664,6 +702,60 @@ class QwenImageEditPlusTextEncoderStep(QwenImageEditTextEncoderStep):
             ConfigSpec(name="prompt_template_encode_start_idx", default=64),
         ]

+    @property
+    def inputs(self) -> List[InputParam]:
+        return [
+            InputParam(name="prompt", required=True, type_hint=str, description="The prompt to encode"),
+            InputParam(name="negative_prompt", type_hint=str, description="The negative prompt to encode"),
+            InputParam(
+                name="resized_cond_image",
+                required=True,
+                type_hint=torch.Tensor,
+                description="The image(s) to encode, can be a single image or list of images, should be resized to 384x384 using resize step",
+            ),
+        ]
+
+    @property
+    def intermediate_outputs(self) -> List[OutputParam]:
+        return [
+            OutputParam(
+                name="prompt_embeds",
+                kwargs_type="denoiser_input_fields",
+                type_hint=torch.Tensor,
+                description="The prompt embeddings",
+            ),
+            OutputParam(
+                name="prompt_embeds_mask",
+                kwargs_type="denoiser_input_fields",
+                type_hint=torch.Tensor,
+                description="The encoder attention mask",
+            ),
+            OutputParam(
+                name="negative_prompt_embeds",
+                kwargs_type="denoiser_input_fields",
+                type_hint=torch.Tensor,
+                description="The negative prompt embeddings",
+            ),
+            OutputParam(
+                name="negative_prompt_embeds_mask",
+                kwargs_type="denoiser_input_fields",
+                type_hint=torch.Tensor,
+                description="The negative prompt embeddings mask",
+            ),
+        ]
+
+    @staticmethod
+    def check_inputs(prompt, negative_prompt):
+        if not isinstance(prompt, str) and not isinstance(prompt, list):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if (
+            negative_prompt is not None
+            and not isinstance(negative_prompt, str)
+            and not isinstance(negative_prompt, list)
+        ):
+            raise ValueError(f"`negative_prompt` has to be of type `str` or `list` but is {type(negative_prompt)}")
+
     @torch.no_grad()
     def __call__(self, components: QwenImageModularPipeline, state: PipelineState):
         block_state = self.get_block_state(state)
@@ -676,7 +768,7 @@ class QwenImageEditPlusTextEncoderStep(QwenImageEditTextEncoderStep):
             components.text_encoder,
             components.processor,
             prompt=block_state.prompt,
-            image=block_state.resized_image,
+            image=block_state.resized_cond_image,
             prompt_template_encode=components.config.prompt_template_encode,
             img_template_encode=components.config.img_template_encode,
             prompt_template_encode_start_idx=components.config.prompt_template_encode_start_idx,
@@ -692,7 +784,7 @@ class QwenImageEditPlusTextEncoderStep(QwenImageEditTextEncoderStep):
             components.text_encoder,
             components.processor,
             prompt=negative_prompt,
-            image=block_state.resized_image,
+            image=block_state.resized_cond_image,
             prompt_template_encode=components.config.prompt_template_encode,
             img_template_encode=components.config.img_template_encode,
             prompt_template_encode_start_idx=components.config.prompt_template_encode_start_idx,
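Note: per the new `resized_cond_image` input, Edit Plus now separates the conditioning images for the VL text encoder from the images destined for the VAE; a sketch of the assumed wiring, using the resize step's configurable names:

    # small conditioning images for the text encoder (the 384x384 area mentioned above)
    cond_resize = QwenImageEditPlusResizeDynamicStep(
        output_name="resized_cond_image", target_area=384 * 384
    )
    # full-size images for the VAE encoder path
    vae_resize = QwenImageEditPlusResizeDynamicStep(
        output_name="resized_image", target_area=1024 * 1024
    )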
@@ -846,60 +938,60 @@ class QwenImageProcessImagesInputStep(ModularPipelineBlocks):
         self.set_block_state(state, block_state)
         return components, state


-class QwenImageEditPlusProcessImagesInputStep(QwenImageProcessImagesInputStep):
+class QwenImageEditPlusProcessImagesInputStep(ModularPipelineBlocks):
     model_name = "qwenimage-edit-plus"

-    def __init__(self):
-        self.vae_image_size = 1024 * 1024
-        super().__init__()
-
     @property
     def description(self) -> str:
-        return "Image Preprocess step for QwenImage Edit Plus. Unlike QwenImage Edit, QwenImage Edit Plus doesn't use the same resized image for further preprocessing."
+        return "Image Preprocess step. Images can be resized first using QwenImageEditResizeDynamicStep."

+    @property
+    def expected_components(self) -> List[ComponentSpec]:
+        return [
+            ComponentSpec(
+                "image_processor",
+                VaeImageProcessor,
+                config=FrozenDict({"vae_scale_factor": 16}),
+                default_creation_method="from_config",
+            ),
+        ]
+
     @property
     def inputs(self) -> List[InputParam]:
-        return [InputParam("vae_image"), InputParam("image"), InputParam("height"), InputParam("width")]
+        return [InputParam("resized_image")]

     @property
     def intermediate_outputs(self) -> List[OutputParam]:
         return [OutputParam(name="processed_image")]

     @torch.no_grad()
     def __call__(self, components: QwenImageModularPipeline, state: PipelineState):
         block_state = self.get_block_state(state)

-        if block_state.vae_image is None and block_state.image is None:
-            raise ValueError("`vae_image` and `image` cannot be None at the same time")
-
-        vae_image_sizes = None
-        if block_state.vae_image is None:
-            image = block_state.image
-            self.check_inputs(
-                height=block_state.height, width=block_state.width, vae_scale_factor=components.vae_scale_factor
-            )
-            height = block_state.height or components.default_height
-            width = block_state.width or components.default_width
-            block_state.processed_image = components.image_processor.preprocess(
-                image=image, height=height, width=width
-            )
-        else:
-            # QwenImage Edit Plus can allow multiple input images with varied resolutions
-            processed_images = []
-            vae_image_sizes = []
-            for img in block_state.vae_image:
-                width, height = img.size
-                vae_width, vae_height, _ = calculate_dimensions(self.vae_image_size, width / height)
-                vae_image_sizes.append((vae_width, vae_height))
-                processed_images.append(
-                    components.image_processor.preprocess(image=img, height=vae_height, width=vae_width)
-                )
+        image = block_state.resized_image
+
+        is_image_list = isinstance(image, list)
+        if not is_image_list:
+            image = [image]

-            block_state.processed_image = processed_images
+        processed_images = []
+        for img in image:
+            img_width, img_height = img.size
+            processed_images.append(components.image_processor.preprocess(image=img, height=img_height, width=img_width))

-            block_state.vae_image_sizes = vae_image_sizes
+        if is_image_list:
+            block_state.processed_image = processed_images
+        else:
+            block_state.processed_image = processed_images[0]

         self.set_block_state(state, block_state)
         return components, state
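Note: the step now mirrors its input shape; a sketch of the assumed behavior on the two input forms it accepts (img_a/img_b are hypothetical PIL images):

    # list in -> list out: each reference image is preprocessed at its own size
    block_state.resized_image = [img_a, img_b]   # processed_image -> [tensor_a, tensor_b]

    # single image in -> single tensor out
    block_state.resized_image = img_a            # processed_image -> tensor_a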

 class QwenImageVaeEncoderDynamicStep(ModularPipelineBlocks):
+    """VAE encoder that handles both single images and lists of images with varied resolutions."""
+
     model_name = "qwenimage"

     def __init__(
@@ -909,21 +1001,12 @@ class QwenImageVaeEncoderDynamicStep(ModularPipelineBlocks):
     ):
         """Initialize a VAE encoder step for converting images to latent representations.

-        Both the input and output names are configurable so this block can be configured to process different image
-        inputs (e.g., "processed_image" -> "image_latents", "processed_control_image" -> "control_image_latents").
+        Handles both single images and lists of images. When input is a list, outputs a list of latents.
+        When input is a single tensor, outputs a single latent tensor.

         Args:
-            input_name (str, optional): Name of the input image tensor. Defaults to "processed_image".
-                Examples: "processed_image" or "processed_control_image"
-            output_name (str, optional): Name of the output latent tensor. Defaults to "image_latents".
-                Examples: "image_latents" or "control_image_latents"
-
-        Examples:
-            # Basic usage with default settings (includes image processor)
-            QwenImageVaeEncoderDynamicStep()
-
-            # Custom input/output names for control image
-            QwenImageVaeEncoderDynamicStep(
-                input_name="processed_control_image", output_name="control_image_latents"
-            )
+            input_name (str, optional): Name of the input image tensor or list. Defaults to "processed_image".
+            output_name (str, optional): Name of the output latent tensor or list. Defaults to "image_latents".
         """
         self._image_input_name = input_name
         self._image_latents_output_name = output_name
@@ -931,17 +1014,18 @@ class QwenImageVaeEncoderDynamicStep(ModularPipelineBlocks):

     @property
     def description(self) -> str:
-        return f"Dynamic VAE Encoder step that converts {self._image_input_name} into latent representations {self._image_latents_output_name}.\n"
+        return (
+            f"VAE Encoder step that converts {self._image_input_name} into latent representations {self._image_latents_output_name}.\n"
+            "Handles both single images and lists of images with varied resolutions."
+        )

     @property
     def expected_components(self) -> List[ComponentSpec]:
-        components = [ComponentSpec("vae", AutoencoderKLQwenImage)]
-        return components
+        return [ComponentSpec("vae", AutoencoderKLQwenImage)]

     @property
     def inputs(self) -> List[InputParam]:
-        inputs = [InputParam(self._image_input_name, required=True), InputParam("generator")]
-        return inputs
+        return [InputParam(self._image_input_name, required=True), InputParam("generator")]

     @property
     def intermediate_outputs(self) -> List[OutputParam]:
@@ -949,7 +1033,7 @@ class QwenImageVaeEncoderDynamicStep(ModularPipelineBlocks):
             OutputParam(
                 self._image_latents_output_name,
                 type_hint=torch.Tensor,
-                description="The latents representing the reference image",
+                description="The latents representing the reference image(s). Single tensor or list depending on input.",
             )
         ]
@@ -961,47 +1045,11 @@ class QwenImageVaeEncoderDynamicStep(ModularPipelineBlocks):
         dtype = components.vae.dtype

         image = getattr(block_state, self._image_input_name)
+        is_image_list = isinstance(image, list)
+        if not is_image_list:
+            image = [image]

-        # Encode image into latents
-        image_latents = encode_vae_image(
-            image=image,
-            vae=components.vae,
-            generator=block_state.generator,
-            device=device,
-            dtype=dtype,
-            latent_channels=components.num_channels_latents,
-        )
-        setattr(block_state, self._image_latents_output_name, image_latents)
-
-        self.set_block_state(state, block_state)
-
-        return components, state
-
-
-class QwenImageEditPlusVaeEncoderDynamicStep(QwenImageVaeEncoderDynamicStep):
-    model_name = "qwenimage-edit-plus"
-
-    @property
-    def intermediate_outputs(self) -> List[OutputParam]:
-        # Each reference image latent can have varied resolutions hence we return this as a list.
-        return [
-            OutputParam(
-                self._image_latents_output_name,
-                type_hint=List[torch.Tensor],
-                description="The latents representing the reference image(s).",
-            )
-        ]
-
-    @torch.no_grad()
-    def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState:
-        block_state = self.get_block_state(state)
-
-        device = components._execution_device
-        dtype = components.vae.dtype
-
-        image = getattr(block_state, self._image_input_name)
-
-        # Encode image into latents
+        # Encode image into latents (handle both single image and list of images)
         image_latents = []
         for img in image:
             image_latents.append(
@@ -1014,9 +1062,12 @@ class QwenImageEditPlusVaeEncoderDynamicStep(QwenImageVaeEncoderDynamicStep):
                     latent_channels=components.num_channels_latents,
                 )
             )
+        if not is_image_list:
+            image_latents = image_latents[0]

         setattr(block_state, self._image_latents_output_name, image_latents)

         self.set_block_state(state, block_state)

         return components, state
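Note: a sketch of the unified behavior after folding the Edit Plus subclass into the base step:

    step = QwenImageVaeEncoderDynamicStep()
    # single processed image tensor in -> single latent tensor out
    # list of processed images in -> list of latents out, one per reference image,
    # so Edit Plus reference images with varied resolutions no longer need a subclass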
@@ -222,36 +222,15 @@ class QwenImageTextInputsStep(ModularPipelineBlocks):


 class QwenImageInputsDynamicStep(ModularPipelineBlocks):
+    """Input step for QwenImage: update height/width, expand batch, patchify."""
+
     model_name = "qwenimage"

-    def __init__(self, image_latent_inputs: List[str] = ["image_latents"], additional_batch_inputs: List[str] = []):
-        """Initialize a configurable step that standardizes the inputs for the denoising step.
-
-        This step handles multiple common tasks to prepare inputs for the denoising step:
-        1. For encoded image latents: updates height/width if None, patchifies, and expands batch size
-        2. For additional_batch_inputs: only expands batch dimensions to match final batch size
-
-        This is a dynamic block that allows you to configure which inputs to process.
-
-        Args:
-            image_latent_inputs (List[str], optional): Names of image latent tensors to process.
-                These will be used to determine height/width, patchified, and batch-expanded. Can be a single string or
-                list of strings. Defaults to ["image_latents"]. Examples: ["image_latents"], ["control_image_latents"]
-            additional_batch_inputs (List[str], optional):
-                Names of additional conditional input tensors to expand batch size. These tensors will only have their
-                batch dimensions adjusted to match the final batch size. Can be a single string or list of strings.
-                Defaults to []. Examples: ["processed_mask_image"]
-
-        Examples:
-            # Configure to process image_latents (default behavior)
-            QwenImageInputsDynamicStep()
-
-            # Configure to process multiple image latent inputs
-            QwenImageInputsDynamicStep(image_latent_inputs=["image_latents", "control_image_latents"])
-
-            # Configure to process image latents and additional batch inputs
-            QwenImageInputsDynamicStep(
-                image_latent_inputs=["image_latents"], additional_batch_inputs=["processed_mask_image"]
-            )
-        """
+    def __init__(
+        self,
+        image_latent_inputs: List[str] = ["image_latents"],
+        additional_batch_inputs: List[str] = [],
+    ):
         if not isinstance(image_latent_inputs, list):
             image_latent_inputs = [image_latent_inputs]
         if not isinstance(additional_batch_inputs, list):
@@ -263,14 +242,12 @@ class QwenImageInputsDynamicStep(ModularPipelineBlocks):

     @property
     def description(self) -> str:
-        # Functionality section
         summary_section = (
             "Input processing step that:\n"
-            " 1. For image latent inputs: Updates height/width if None, patchifies latents, and expands batch size\n"
+            " 1. For image latent inputs: Updates height/width if None, patchifies, and expands batch size\n"
             " 2. For additional batch inputs: Expands batch dimensions to match final batch size"
         )

-        # Inputs info
         inputs_info = ""
         if self._image_latent_inputs or self._additional_batch_inputs:
             inputs_info = "\n\nConfigured inputs:"
@@ -279,11 +256,16 @@ class QwenImageInputsDynamicStep(ModularPipelineBlocks):
         if self._additional_batch_inputs:
             inputs_info += f"\n - Additional batch inputs: {self._additional_batch_inputs}"

-        # Placement guidance
         placement_section = "\n\nThis block should be placed after the encoder steps and the text input step."

         return summary_section + inputs_info + placement_section

+    @property
+    def expected_components(self) -> List[ComponentSpec]:
+        return [
+            ComponentSpec("pachifier", QwenImagePachifier, default_creation_method="from_config"),
+        ]
+
     @property
     def inputs(self) -> List[InputParam]:
         inputs = [
@@ -293,11 +275,9 @@ class QwenImageInputsDynamicStep(ModularPipelineBlocks):
             InputParam(name="width"),
         ]

-        # Add image latent inputs
         for image_latent_input_name in self._image_latent_inputs:
             inputs.append(InputParam(name=image_latent_input_name))

-        # Add additional batch inputs
         for input_name in self._additional_batch_inputs:
             inputs.append(InputParam(name=input_name))
@@ -310,22 +290,16 @@ class QwenImageInputsDynamicStep(ModularPipelineBlocks):
             OutputParam(name="image_width", type_hint=int, description="The width of the image latents"),
         ]

-    @property
-    def expected_components(self) -> List[ComponentSpec]:
-        return [
-            ComponentSpec("pachifier", QwenImagePachifier, default_creation_method="from_config"),
-        ]
-
     def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState:
         block_state = self.get_block_state(state)

-        # Process image latent inputs (height/width calculation, patchify, and batch expansion)
+        # Process image latent inputs
         for image_latent_input_name in self._image_latent_inputs:
             image_latent_tensor = getattr(block_state, image_latent_input_name)
             if image_latent_tensor is None:
                 continue

-            # 1. Calculate height/width from latents
+            # 1. Calculate height/width from latents and update if not provided
             height, width = calculate_dimension_from_latents(image_latent_tensor, components.vae_scale_factor)
             block_state.height = block_state.height or height
             block_state.width = block_state.width or width
@@ -335,7 +309,7 @@ class QwenImageInputsDynamicStep(ModularPipelineBlocks):
             if not hasattr(block_state, "image_width"):
                 block_state.image_width = width

-            # 2. Patchify the image latent tensor
+            # 2. Patchify
             image_latent_tensor = components.pachifier.pack_latents(image_latent_tensor)

             # 3. Expand batch size
@@ -354,7 +328,6 @@ class QwenImageInputsDynamicStep(ModularPipelineBlocks):
            if input_tensor is None:
                continue

            # Only expand batch size
            input_tensor = repeat_tensor_to_batch_size(
                input_name=input_name,
                input_tensor=input_tensor,

@@ -368,63 +341,130 @@ class QwenImageInputsDynamicStep(ModularPipelineBlocks):
        return components, state
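
For orientation, a hedged configuration sketch of this dynamic step (the input names are illustrative; the step operates on whichever state entries it is configured with, exactly as the img2img/inpaint input steps later in this compare use it):

# Hedged sketch: configure the dynamic step the way the inpaint input step does.
step = QwenImageInputsDynamicStep(
    image_latent_inputs=["image_latents"],             # height/width update + patchify + batch expansion
    additional_batch_inputs=["processed_mask_image"],  # batch expansion only
)
print([inp.name for inp in step.inputs])  # base inputs plus the two configured names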


-class QwenImageEditPlusInputsDynamicStep(QwenImageInputsDynamicStep):
+class QwenImageEditPlusInputsDynamicStep(ModularPipelineBlocks):
    """Input step for QwenImage Edit Plus: handles a list of latents with different sizes."""

    model_name = "qwenimage-edit-plus"

    def __init__(
        self,
        image_latent_inputs: List[str] = ["image_latents"],
        additional_batch_inputs: List[str] = [],
    ):
        if not isinstance(image_latent_inputs, list):
            image_latent_inputs = [image_latent_inputs]
        if not isinstance(additional_batch_inputs, list):
            additional_batch_inputs = [additional_batch_inputs]

        self._image_latent_inputs = image_latent_inputs
        self._additional_batch_inputs = additional_batch_inputs
        super().__init__()

    @property
    def description(self) -> str:
        summary_section = (
            "Input processing step for Edit Plus that:\n"
            " 1. For image latent inputs (list): Collects heights/widths, patchifies each, concatenates, expands batch\n"
            " 2. For additional batch inputs: Expands batch dimensions to match final batch size\n"
            " Height/width defaults to the last image in the list."
        )

        inputs_info = ""
        if self._image_latent_inputs or self._additional_batch_inputs:
            inputs_info = "\n\nConfigured inputs:"
            if self._image_latent_inputs:
                inputs_info += f"\n - Image latent inputs: {self._image_latent_inputs}"
            if self._additional_batch_inputs:
                inputs_info += f"\n - Additional batch inputs: {self._additional_batch_inputs}"

        placement_section = "\n\nThis block should be placed after the encoder steps and the text input step."

        return summary_section + inputs_info + placement_section

    @property
    def expected_components(self) -> List[ComponentSpec]:
        return [
            ComponentSpec("pachifier", QwenImagePachifier, default_creation_method="from_config"),
        ]

    @property
    def inputs(self) -> List[InputParam]:
        inputs = [
            InputParam(name="num_images_per_prompt", default=1),
            InputParam(name="batch_size", required=True),
            InputParam(name="height"),
            InputParam(name="width"),
        ]

        for image_latent_input_name in self._image_latent_inputs:
            inputs.append(InputParam(name=image_latent_input_name))

        for input_name in self._additional_batch_inputs:
            inputs.append(InputParam(name=input_name))

        return inputs

    @property
    def intermediate_outputs(self) -> List[OutputParam]:
        return [
-            OutputParam(name="image_height", type_hint=List[int], description="The height of the image latents"),
-            OutputParam(name="image_width", type_hint=List[int], description="The width of the image latents"),
+            OutputParam(name="image_height", type_hint=List[int], description="The heights of the image latents"),
+            OutputParam(name="image_width", type_hint=List[int], description="The widths of the image latents"),
        ]

    def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState:
        block_state = self.get_block_state(state)

-        # Process image latent inputs (height/width calculation, patchify, and batch expansion)
+        # Process image latent inputs
        for image_latent_input_name in self._image_latent_inputs:
            image_latent_tensor = getattr(block_state, image_latent_input_name)
            if image_latent_tensor is None:
                continue

            # Each image latent can have a different size in QwenImage Edit Plus.
            is_list = isinstance(image_latent_tensor, list)
            if not is_list:
                image_latent_tensor = [image_latent_tensor]

            image_heights = []
            image_widths = []
            packed_image_latent_tensors = []

-            for img_latent_tensor in image_latent_tensor:
+            for i, img_latent_tensor in enumerate(image_latent_tensor):
                # 1. Calculate height/width from latents
                height, width = calculate_dimension_from_latents(img_latent_tensor, components.vae_scale_factor)
                image_heights.append(height)
                image_widths.append(width)

-                # 2. Patchify the image latent tensor
+                # 2. Patchify
                img_latent_tensor = components.pachifier.pack_latents(img_latent_tensor)

                # 3. Expand batch size
                img_latent_tensor = repeat_tensor_to_batch_size(
-                    input_name=image_latent_input_name,
+                    input_name=f"{image_latent_input_name}[{i}]",
                    input_tensor=img_latent_tensor,
                    num_images_per_prompt=block_state.num_images_per_prompt,
                    batch_size=block_state.batch_size,
                )
                packed_image_latent_tensors.append(img_latent_tensor)

            # Concatenate all packed latents along dim=1
            packed_image_latent_tensors = torch.cat(packed_image_latent_tensors, dim=1)

            # Output lists of heights/widths
            block_state.image_height = image_heights
            block_state.image_width = image_widths
-            setattr(block_state, image_latent_input_name, packed_image_latent_tensors)

            # Default height/width from last image
            block_state.height = block_state.height or image_heights[-1]
            block_state.width = block_state.width or image_widths[-1]

+            setattr(block_state, image_latent_input_name, packed_image_latent_tensors)

        # Process additional batch inputs (only batch expansion)
        for input_name in self._additional_batch_inputs:
            input_tensor = getattr(block_state, input_name)
            if input_tensor is None:
                continue

            # Only expand batch size
            input_tensor = repeat_tensor_to_batch_size(
                input_name=input_name,
                input_tensor=input_tensor,

@@ -436,8 +476,6 @@ class QwenImageEditPlusInputsDynamicStep(QwenImageInputsDynamicStep):

        self.set_block_state(state, block_state)
        return components, state
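
The pack/concat arithmetic above can be checked in isolation. A standalone sketch, assuming the usual 2x2 patchify layout for QwenImage latents (illustrative, not the exact `QwenImagePachifier` implementation):

import torch

def pack_latents(x):
    # (B, C, H, W) -> (B, H//2 * W//2, C * 4): fold each 2x2 spatial patch into channels
    b, c, h, w = x.shape
    x = x.view(b, c, h // 2, 2, w // 2, 2)
    return x.permute(0, 2, 4, 1, 3, 5).reshape(b, (h // 2) * (w // 2), c * 4)

lat_a = torch.randn(1, 16, 64, 64)  # e.g. a 512x512 image at vae_scale_factor=8
lat_b = torch.randn(1, 16, 48, 80)  # a differently sized image
packed = torch.cat([pack_latents(lat_a), pack_latents(lat_b)], dim=1)
print(packed.shape)  # torch.Size([1, 1984, 64]) -> 1024 + 960 tokens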


class QwenImageControlNetInputsStep(ModularPipelineBlocks):
    model_name = "qwenimage"


File diff suppressed because it is too large
@@ -0,0 +1,465 @@

# Copyright 2025 Qwen-Image Team and The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from ...utils import logging
from ..modular_pipeline import AutoPipelineBlocks, ConditionalPipelineBlocks, SequentialPipelineBlocks
from ..modular_pipeline_utils import InsertableDict
from .before_denoise import (
    QwenImageControlNetBeforeDenoiserStep,
    QwenImageCreateMaskLatentsStep,
    QwenImagePrepareLatentsStep,
    QwenImagePrepareLatentsWithStrengthStep,
    QwenImageRoPEInputsStep,
    QwenImageSetTimestepsStep,
    QwenImageSetTimestepsWithStrengthStep,
)
from .decoders import (
    QwenImageAfterDenoiseStep,
    QwenImageDecoderStep,
    QwenImageInpaintProcessImagesOutputStep,
    QwenImageProcessImagesOutputStep,
)
from .denoise import (
    QwenImageControlNetDenoiseStep,
    QwenImageDenoiseStep,
    QwenImageInpaintControlNetDenoiseStep,
    QwenImageInpaintDenoiseStep,
    QwenImageLoopBeforeDenoiserControlNet,
)
from .encoders import (
    QwenImageControlNetVaeEncoderStep,
    QwenImageInpaintProcessImagesInputStep,
    QwenImageProcessImagesInputStep,
    QwenImageTextEncoderStep,
    QwenImageVaeEncoderDynamicStep,
)
from .inputs import (
    QwenImageControlNetInputsStep,
    QwenImageInputsDynamicStep,
    QwenImageTextInputsStep,
)


logger = logging.get_logger(__name__)


# 1. VAE ENCODER

# inpaint vae encoder
class QwenImageInpaintVaeEncoderStep(SequentialPipelineBlocks):
    model_name = "qwenimage"
    block_classes = [QwenImageInpaintProcessImagesInputStep(), QwenImageVaeEncoderDynamicStep()]
    block_names = ["preprocess", "encode"]

    @property
    def description(self) -> str:
        return (
            "This step is used for processing image and mask inputs for inpainting tasks. It:\n"
            " - Resizes the image to the target size, based on `height` and `width`.\n"
            " - Processes and updates `image` and `mask_image`.\n"
            " - Creates `image_latents`."
        )


# img2img vae encoder
class QwenImageImg2ImgVaeEncoderStep(SequentialPipelineBlocks):
    model_name = "qwenimage"

    block_classes = [QwenImageProcessImagesInputStep(), QwenImageVaeEncoderDynamicStep()]
    block_names = ["preprocess", "encode"]

    @property
    def description(self) -> str:
        return "Vae encoder step that preprocesses and encodes the image inputs into their latent representations."


# auto vae encoder
class QwenImageAutoVaeEncoderStep(AutoPipelineBlocks):
    block_classes = [QwenImageInpaintVaeEncoderStep, QwenImageImg2ImgVaeEncoderStep]
    block_names = ["inpaint", "img2img"]
    block_trigger_inputs = ["mask_image", "image"]

    @property
    def description(self):
        return (
            "Vae encoder step that encodes the image inputs into their latent representations.\n"
            + "This is an auto pipeline block.\n"
            + " - `QwenImageInpaintVaeEncoderStep` (inpaint) is used when `mask_image` is provided.\n"
            + " - `QwenImageImg2ImgVaeEncoderStep` (img2img) is used when `image` is provided.\n"
            + " - if neither `mask_image` nor `image` is provided, the step is skipped."
        )
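
The trigger semantics these auto blocks rely on (the first block whose trigger input is present is dispatched; a None trigger marks the default; no match means the step is skipped) can be sketched standalone:

# Minimal sketch of the assumed AutoPipelineBlocks dispatch rule.
def dispatch(block_names, block_trigger_inputs, provided):
    for name, trigger in zip(block_names, block_trigger_inputs):
        if trigger is None or provided.get(trigger) is not None:
            return name
    return None  # no trigger matched -> the step is skipped

assert dispatch(["inpaint", "img2img"], ["mask_image", "image"], {"mask_image": "m", "image": "i"}) == "inpaint"
assert dispatch(["inpaint", "img2img"], ["mask_image", "image"], {"image": "i"}) == "img2img"
assert dispatch(["inpaint", "img2img"], ["mask_image", "image"], {}) is None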


# optional controlnet vae encoder
class QwenImageOptionalControlNetVaeEncoderStep(AutoPipelineBlocks):
    block_classes = [QwenImageControlNetVaeEncoderStep]
    block_names = ["controlnet"]
    block_trigger_inputs = ["control_image"]

    @property
    def description(self):
        return (
            "Vae encoder step that encodes the control image inputs into their latent representations.\n"
            + "This is an auto pipeline block.\n"
            + " - `QwenImageControlNetVaeEncoderStep` (controlnet) is used when `control_image` is provided.\n"
            + " - if `control_image` is not provided, the step is skipped."
        )


# 2. DENOISE
# input -> prepare_latents -> set_timesteps -> prepare_rope_inputs -> denoise -> after_denoise

# img2img input
class QwenImageImg2ImgInputStep(SequentialPipelineBlocks):
    model_name = "qwenimage"
    block_classes = [QwenImageTextInputsStep(), QwenImageInputsDynamicStep(image_latent_inputs=["image_latents"])]
    block_names = ["text_inputs", "additional_inputs"]

    @property
    def description(self):
        return (
            "Input step that prepares the inputs for the img2img denoising step. It:\n"
            " - makes sure the text embeddings and the additional inputs (`image_latents`) have a consistent batch size.\n"
            " - updates height/width based on `image_latents`, patchifies `image_latents`."
        )


# inpaint input
class QwenImageInpaintInputStep(SequentialPipelineBlocks):
    model_name = "qwenimage"
    block_classes = [QwenImageTextInputsStep(), QwenImageInputsDynamicStep(image_latent_inputs=["image_latents"], additional_batch_inputs=["processed_mask_image"])]
    block_names = ["text_inputs", "additional_inputs"]

    @property
    def description(self):
        return (
            "Input step that prepares the inputs for the inpainting denoising step. It:\n"
            " - makes sure the text embeddings and the additional inputs (`image_latents` and `processed_mask_image`) have a consistent batch size.\n"
            " - updates height/width based on `image_latents`, patchifies `image_latents`."
        )


# inpaint prepare latents
class QwenImageInpaintPrepareLatentsStep(SequentialPipelineBlocks):
    model_name = "qwenimage"
    block_classes = [QwenImagePrepareLatentsWithStrengthStep(), QwenImageCreateMaskLatentsStep()]
    block_names = ["add_noise_to_latents", "create_mask_latents"]

    @property
    def description(self) -> str:
        return (
            "This step prepares the latents/image_latents and mask inputs for the inpainting denoising step. It:\n"
            " - Adds noise to the image latents to create the latents input for the denoiser.\n"
            " - Creates the patchified latents `mask` based on the processed mask image.\n"
        )


# CoreDenoiseStep:
# (input + prepare_latents + set_timesteps + prepare_rope_inputs + denoise + after_denoise)

# 1. text2image
class QwenImageCoreDenoiseStep(SequentialPipelineBlocks):
    model_name = "qwenimage"
    block_classes = [
        QwenImageTextInputsStep(),
        QwenImagePrepareLatentsStep(),
        QwenImageSetTimestepsStep(),
        QwenImageRoPEInputsStep(),
        QwenImageDenoiseStep(),
        QwenImageAfterDenoiseStep(),
    ]
    block_names = [
        "input",
        "prepare_latents",
        "set_timesteps",
        "prepare_rope_inputs",
        "denoise",
        "after_denoise",
    ]

    @property
    def description(self):
        return "Step that denoises the latents for the text2image task. It includes the denoise loop as well as the preparation of its inputs (timesteps, latents, RoPE inputs, etc.)."


# 2. inpaint
class QwenImageInpaintCoreDenoiseStep(SequentialPipelineBlocks):
    model_name = "qwenimage"
    block_classes = [
        QwenImageInpaintInputStep(),
        QwenImagePrepareLatentsStep(),
        QwenImageSetTimestepsWithStrengthStep(),
        QwenImageInpaintPrepareLatentsStep(),
        QwenImageRoPEInputsStep(),
        QwenImageInpaintDenoiseStep(),
        QwenImageAfterDenoiseStep(),
    ]
    block_names = [
        "input",
        "prepare_latents",
        "set_timesteps",
        "prepare_inpaint_latents",
        "prepare_rope_inputs",
        "denoise",
        "after_denoise",
    ]

    @property
    def description(self):
        return "Step that denoises the latents for the inpaint task. It includes the denoise loop as well as the preparation of its inputs (timesteps, latents, RoPE inputs, etc.)."


# 3. img2img
class QwenImageImg2ImgCoreDenoiseStep(SequentialPipelineBlocks):
    model_name = "qwenimage"
    block_classes = [
        QwenImageImg2ImgInputStep(),
        QwenImagePrepareLatentsStep(),
        QwenImageSetTimestepsWithStrengthStep(),
        QwenImagePrepareLatentsWithStrengthStep(),
        QwenImageRoPEInputsStep(),
        QwenImageDenoiseStep(),
        QwenImageAfterDenoiseStep(),
    ]
    block_names = [
        "input",
        "prepare_latents",
        "set_timesteps",
        "prepare_img2img_latents",
        "prepare_rope_inputs",
        "denoise",
        "after_denoise",
    ]

    @property
    def description(self):
        return "Step that denoises the latents for the img2img task. It includes the denoise loop as well as the preparation of its inputs (timesteps, latents, RoPE inputs, etc.)."



# 4. text2image + controlnet
class QwenImageControlNetCoreDenoiseStep(SequentialPipelineBlocks):
    model_name = "qwenimage"
    block_classes = [
        QwenImageTextInputsStep(),
        QwenImageControlNetInputsStep(),
        QwenImagePrepareLatentsStep(),
        QwenImageSetTimestepsStep(),
        QwenImageRoPEInputsStep(),
        QwenImageControlNetBeforeDenoiserStep(),
        QwenImageControlNetDenoiseStep(),
        QwenImageAfterDenoiseStep(),
    ]
    block_names = [
        "input",
        "controlnet_input",
        "prepare_latents",
        "set_timesteps",
        "prepare_rope_inputs",
        "controlnet_before_denoise",
        "controlnet_denoise",
        "after_denoise",
    ]

    @property
    def description(self):
        return "Step that denoises the latents for the text2image task with controlnet. It includes the denoise loop as well as the preparation of its inputs (timesteps, latents, RoPE inputs, etc.)."


# 5. inpaint + controlnet
class QwenImageControlNetInpaintCoreDenoiseStep(SequentialPipelineBlocks):
    model_name = "qwenimage"
    block_classes = [
        QwenImageInpaintInputStep(),
        QwenImageControlNetInputsStep(),
        QwenImagePrepareLatentsStep(),
        QwenImageSetTimestepsWithStrengthStep(),
        QwenImageInpaintPrepareLatentsStep(),
        QwenImageRoPEInputsStep(),
        QwenImageControlNetBeforeDenoiserStep(),
        QwenImageInpaintControlNetDenoiseStep(),
        QwenImageAfterDenoiseStep(),
    ]
    block_names = [
        "input",
        "controlnet_input",
        "prepare_latents",
        "set_timesteps",
        "prepare_inpaint_latents",
        "prepare_rope_inputs",
        "controlnet_before_denoise",
        "controlnet_denoise",
        "after_denoise",
    ]

    @property
    def description(self):
        return "Step that denoises the latents for the inpaint task with controlnet. It includes the denoise loop as well as the preparation of its inputs (timesteps, latents, RoPE inputs, etc.)."


# 6. img2img + controlnet
class QwenImageControlNetImg2ImgCoreDenoiseStep(SequentialPipelineBlocks):
    model_name = "qwenimage"
    block_classes = [
        QwenImageImg2ImgInputStep(),
        QwenImageControlNetInputsStep(),
        QwenImagePrepareLatentsStep(),
        QwenImageSetTimestepsWithStrengthStep(),
        QwenImagePrepareLatentsWithStrengthStep(),
        QwenImageRoPEInputsStep(),
        QwenImageControlNetBeforeDenoiserStep(),
        QwenImageControlNetDenoiseStep(),
        QwenImageAfterDenoiseStep(),
    ]
    block_names = [
        "input",
        "controlnet_input",
        "prepare_latents",
        "set_timesteps",
        "prepare_img2img_latents",
        "prepare_rope_inputs",
        "controlnet_before_denoise",
        "controlnet_denoise",
        "after_denoise",
    ]

    @property
    def description(self):
        return "Step that denoises the latents for the img2img task with controlnet. It includes the denoise loop as well as the preparation of its inputs (timesteps, latents, RoPE inputs, etc.)."


# auto denoise
# auto denoise step: selects the core denoise workflow for all tasks, with or without controlnet
class QwenImageAutoCoreDenoiseStep(ConditionalPipelineBlocks):
    block_classes = [
        QwenImageCoreDenoiseStep,
        QwenImageInpaintCoreDenoiseStep,
        QwenImageImg2ImgCoreDenoiseStep,
        QwenImageControlNetCoreDenoiseStep,
        QwenImageControlNetInpaintCoreDenoiseStep,
        QwenImageControlNetImg2ImgCoreDenoiseStep,
    ]
    block_names = [
        "text2image",
        "inpaint",
        "img2img",
        "controlnet_text2image",
        "controlnet_inpaint",
        "controlnet_img2img",
    ]
    block_trigger_inputs = ["control_image_latents", "processed_mask_image", "image_latents"]
    default_block_name = "text2image"

    def select_block(self, control_image_latents=None, processed_mask_image=None, image_latents=None):
        if control_image_latents is not None:
            if processed_mask_image is not None:
                return "controlnet_inpaint"
            elif image_latents is not None:
                return "controlnet_img2img"
            else:
                return "controlnet_text2image"
        else:
            if processed_mask_image is not None:
                return "inpaint"
            elif image_latents is not None:
                return "img2img"
            else:
                return "text2image"

    @property
    def description(self):
        return (
            "Core step that performs the denoising process. \n"
            + " - `QwenImageCoreDenoiseStep` (text2image) for text2image tasks.\n"
            + " - `QwenImageInpaintCoreDenoiseStep` (inpaint) for inpaint tasks.\n"
            + " - `QwenImageImg2ImgCoreDenoiseStep` (img2img) for img2img tasks.\n"
            + " - `QwenImageControlNetCoreDenoiseStep` (controlnet_text2image) for text2image tasks with controlnet.\n"
            + " - `QwenImageControlNetInpaintCoreDenoiseStep` (controlnet_inpaint) for inpaint tasks with controlnet.\n"
            + " - `QwenImageControlNetImg2ImgCoreDenoiseStep` (controlnet_img2img) for img2img tasks with controlnet.\n"
            + "This step supports text-to-image, image-to-image, inpainting, and controlnet tasks for QwenImage:\n"
            + " - for image-to-image generation, you need to provide `image_latents`\n"
            + " - for inpainting, you need to provide `processed_mask_image` and `image_latents`\n"
            + " - to run the controlnet workflow, you need to provide `control_image_latents`\n"
            + " - for text-to-image generation, all you need to provide is prompt embeddings"
        )
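
The routing is easy to sanity-check; a small sketch with truthy placeholders standing in for the actual latent tensors (note that instantiating the step does build all of its sub-blocks):

step = QwenImageAutoCoreDenoiseStep()
assert step.select_block(control_image_latents=1, processed_mask_image=1) == "controlnet_inpaint"
assert step.select_block(control_image_latents=1, image_latents=1) == "controlnet_img2img"
assert step.select_block(control_image_latents=1) == "controlnet_text2image"
assert step.select_block(processed_mask_image=1) == "inpaint"
assert step.select_block(image_latents=1) == "img2img"
assert step.select_block() == "text2image"  # default_block_name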


# 4. DECODE

## 1.1 text2image

#### decode
#### (standard decode step works for most tasks except for inpaint)

class QwenImageDecodeStep(SequentialPipelineBlocks):
    model_name = "qwenimage"
    block_classes = [QwenImageDecoderStep(), QwenImageProcessImagesOutputStep()]
    block_names = ["decode", "postprocess"]

    @property
    def description(self):
        return "Decode step that decodes the latents to images and postprocesses the generated image."



#### inpaint decode

class QwenImageInpaintDecodeStep(SequentialPipelineBlocks):
    model_name = "qwenimage"
    block_classes = [QwenImageDecoderStep(), QwenImageInpaintProcessImagesOutputStep()]
    block_names = ["decode", "postprocess"]

    @property
    def description(self):
        return "Decode step that decodes the latents to images and postprocesses the generated image, optionally applying the mask overlay to the original image."


# auto decode step for inpaint and text2image tasks
class QwenImageAutoDecodeStep(AutoPipelineBlocks):
    block_classes = [QwenImageInpaintDecodeStep, QwenImageDecodeStep]
    block_names = ["inpaint_decode", "decode"]
    block_trigger_inputs = ["mask", None]

    @property
    def description(self):
        return (
            "Decode step that decodes the latents into images.\n"
            "This is an auto pipeline block that works for inpaint/text2image/img2img tasks, for both QwenImage and QwenImage-Edit.\n"
            + " - `QwenImageInpaintDecodeStep` (inpaint) is used when `mask` is provided.\n"
            + " - `QwenImageDecodeStep` (text2image/img2img) is used when `mask` is not provided.\n"
        )



## 1.10 QwenImage/auto block & presets
AUTO_BLOCKS = InsertableDict(
    [
        ("text_encoder", QwenImageTextEncoderStep()),
        ("vae_encoder", QwenImageAutoVaeEncoderStep()),
        ("controlnet_vae_encoder", QwenImageOptionalControlNetVaeEncoderStep()),
        ("denoise", QwenImageAutoCoreDenoiseStep()),
        ("decode", QwenImageAutoDecodeStep()),
    ]
)
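
Presets are ordered mappings, so custom workflows can be assembled from subsets. A hedged sketch, assuming `InsertableDict` keeps standard dict semantics and that the modular `SequentialPipelineBlocks.from_blocks_dict` helper is available:

# Hypothetical: build a text2image-only workflow without the controlnet encoder.
blocks_dict = AUTO_BLOCKS.copy()
blocks_dict.pop("controlnet_vae_encoder")
t2i_blocks = SequentialPipelineBlocks.from_blocks_dict(blocks_dict)
print(t2i_blocks.sub_blocks.keys())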


class QwenImageAutoBlocks(SequentialPipelineBlocks):
    model_name = "qwenimage"

    block_classes = AUTO_BLOCKS.values()
    block_names = AUTO_BLOCKS.keys()

    @property
    def description(self):
        return (
            "Auto Modular pipeline for text-to-image, image-to-image, inpainting, and controlnet tasks using QwenImage.\n"
            + "- for image-to-image generation, you need to provide `image`\n"
            + "- for inpainting, you need to provide `mask_image` and `image`, optionally you can provide `padding_mask_crop`\n"
            + "- to run the controlnet workflow, you need to provide `control_image`\n"
            + "- for text-to-image generation, all you need to provide is `prompt`"
        )
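
A minimal end-to-end sketch; the repo id, loading calls (`init_pipeline`, `load_components`), and the `output="images"` convention follow the general modular-pipeline API and are assumptions here, not guarantees for this checkpoint:

import torch

blocks = QwenImageAutoBlocks()
pipe = blocks.init_pipeline("Qwen/Qwen-Image")  # assumed modular repo id
pipe.load_components(torch_dtype=torch.bfloat16)
pipe.to("cuda")

# text-to-image: only `prompt` is required
images = pipe(prompt="a cat wearing a tiny hat", num_inference_steps=30, output="images")
images[0].save("cat.png")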

@@ -0,0 +1,329 @@

# Copyright 2025 Qwen-Image Team and The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Optional

from ...utils import logging
from ..modular_pipeline import AutoPipelineBlocks, ConditionalPipelineBlocks, SequentialPipelineBlocks
from ..modular_pipeline_utils import InsertableDict
from .before_denoise import (
    QwenImageCreateMaskLatentsStep,
    QwenImageEditRoPEInputsStep,
    QwenImagePrepareLatentsStep,
    QwenImagePrepareLatentsWithStrengthStep,
    QwenImageSetTimestepsStep,
    QwenImageSetTimestepsWithStrengthStep,
)
from .decoders import (
    QwenImageAfterDenoiseStep,
    QwenImageDecoderStep,
    QwenImageInpaintProcessImagesOutputStep,
    QwenImageProcessImagesOutputStep,
)
from .denoise import (
    QwenImageEditDenoiseStep,
    QwenImageEditInpaintDenoiseStep,
)
from .encoders import (
    QwenImageEditResizeDynamicStep,
    QwenImageEditTextEncoderStep,
    QwenImageInpaintProcessImagesInputStep,
    QwenImageProcessImagesInputStep,
    QwenImageVaeEncoderDynamicStep,
)
from .inputs import (
    QwenImageInputsDynamicStep,
    QwenImageTextInputsStep,
)


logger = logging.get_logger(__name__)


# ====================
# 1. TEXT ENCODER
# ====================

class QwenImageEditVLEncoderStep(SequentialPipelineBlocks):
    """VL encoder that takes both image and text prompts."""
    model_name = "qwenimage-edit"
    block_classes = [
        QwenImageEditResizeDynamicStep(),
        QwenImageEditTextEncoderStep(),
    ]
    block_names = ["resize", "encode"]

    @property
    def description(self) -> str:
        return "QwenImage-Edit VL encoder step that encodes the image and text prompts together."


# ====================
# 2. VAE ENCODER
# ====================

# Edit VAE encoder
class QwenImageEditVaeEncoderStep(SequentialPipelineBlocks):
    model_name = "qwenimage-edit"
    block_classes = [
        QwenImageEditResizeDynamicStep(),
        QwenImageProcessImagesInputStep(),
        QwenImageVaeEncoderDynamicStep(),
    ]
    block_names = ["resize", "preprocess", "encode"]

    @property
    def description(self) -> str:
        return "Vae encoder step that encodes the image inputs into their latent representations."


# Edit Inpaint VAE encoder
class QwenImageEditInpaintVaeEncoderStep(SequentialPipelineBlocks):
    model_name = "qwenimage-edit"
    block_classes = [
        QwenImageEditResizeDynamicStep(),
        QwenImageInpaintProcessImagesInputStep(),
        QwenImageVaeEncoderDynamicStep(input_name="processed_image", output_name="image_latents"),
    ]
    block_names = ["resize", "preprocess", "encode"]

    @property
    def description(self) -> str:
        return (
            "This step is used for processing image and mask inputs for QwenImage-Edit inpaint tasks. It:\n"
            " - resizes the image to the target area (1024 * 1024) while maintaining the aspect ratio.\n"
            " - processes the resized image and mask image.\n"
            " - creates image latents."
        )


# Auto VAE encoder
class QwenImageEditAutoVaeEncoderStep(AutoPipelineBlocks):
    block_classes = [QwenImageEditInpaintVaeEncoderStep, QwenImageEditVaeEncoderStep]
    block_names = ["edit_inpaint", "edit"]
    block_trigger_inputs = ["mask_image", "image"]

    @property
    def description(self):
        return (
            "Vae encoder step that encodes the image inputs into their latent representations.\n"
            "This is an auto pipeline block.\n"
            " - `QwenImageEditInpaintVaeEncoderStep` (edit_inpaint) is used when `mask_image` is provided.\n"
            " - `QwenImageEditVaeEncoderStep` (edit) is used when `image` is provided.\n"
            " - if neither `mask_image` nor `image` is provided, the step is skipped."
        )


# ====================
# 3. DENOISE - input -> prepare_latents -> set_timesteps -> prepare_rope_inputs -> denoise -> after_denoise
# ====================

# Edit input step
class QwenImageEditInputStep(SequentialPipelineBlocks):
    model_name = "qwenimage-edit"
    block_classes = [
        QwenImageTextInputsStep(),
        QwenImageInputsDynamicStep(image_latent_inputs=["image_latents"]),
    ]
    block_names = ["text_inputs", "additional_inputs"]

    @property
    def description(self):
        return (
            "Input step that prepares the inputs for the edit denoising step. It:\n"
            " - makes sure the text embeddings and the additional inputs have a consistent batch size.\n"
            " - updates height/width based on `image_latents`, patchifies `image_latents`."
        )


# Edit Inpaint input step
class QwenImageEditInpaintInputStep(SequentialPipelineBlocks):
    model_name = "qwenimage-edit"
    block_classes = [
        QwenImageTextInputsStep(),
        QwenImageInputsDynamicStep(image_latent_inputs=["image_latents"], additional_batch_inputs=["processed_mask_image"]),
    ]
    block_names = ["text_inputs", "additional_inputs"]

    @property
    def description(self):
        return (
            "Input step that prepares the inputs for the edit inpaint denoising step. It:\n"
            " - makes sure the text embeddings and the additional inputs have a consistent batch size.\n"
            " - updates height/width based on `image_latents`, patchifies `image_latents`."
        )


# Edit Inpaint prepare latents step
class QwenImageEditInpaintPrepareLatentsStep(SequentialPipelineBlocks):
    model_name = "qwenimage-edit"
    block_classes = [QwenImagePrepareLatentsWithStrengthStep(), QwenImageCreateMaskLatentsStep()]
    block_names = ["add_noise_to_latents", "create_mask_latents"]

    @property
    def description(self) -> str:
        return (
            "This step prepares the latents/image_latents and mask inputs for the edit inpainting denoising step. It:\n"
            " - Adds noise to the image latents to create the latents input for the denoiser.\n"
            " - Creates the patchified latents `mask` based on the processed mask image.\n"
        )


# 1. Edit (img2img) core denoise
class QwenImageEditCoreDenoiseStep(SequentialPipelineBlocks):
    model_name = "qwenimage-edit"
    block_classes = [
        QwenImageEditInputStep(),
        QwenImagePrepareLatentsStep(),
        QwenImageSetTimestepsStep(),
        QwenImageEditRoPEInputsStep(),
        QwenImageEditDenoiseStep(),
        QwenImageAfterDenoiseStep(),
    ]
    block_names = [
        "input",
        "prepare_latents",
        "set_timesteps",
        "prepare_rope_inputs",
        "denoise",
        "after_denoise",
    ]

    @property
    def description(self):
        return "Core denoising workflow for the QwenImage-Edit edit (img2img) task."


# 2. Edit Inpaint core denoise
class QwenImageEditInpaintCoreDenoiseStep(SequentialPipelineBlocks):
    model_name = "qwenimage-edit"
    block_classes = [
        QwenImageEditInpaintInputStep(),
        QwenImagePrepareLatentsStep(),
        QwenImageSetTimestepsWithStrengthStep(),
        QwenImageEditInpaintPrepareLatentsStep(),
        QwenImageEditRoPEInputsStep(),
        QwenImageEditInpaintDenoiseStep(),
        QwenImageAfterDenoiseStep(),
    ]
    block_names = [
        "input",
        "prepare_latents",
        "set_timesteps",
        "prepare_inpaint_latents",
        "prepare_rope_inputs",
        "denoise",
        "after_denoise",
    ]

    @property
    def description(self):
        return "Core denoising workflow for the QwenImage-Edit edit inpaint task."


# Auto core denoise step
class QwenImageEditAutoCoreDenoiseStep(ConditionalPipelineBlocks):
    block_classes = [
        QwenImageEditInpaintCoreDenoiseStep,
        QwenImageEditCoreDenoiseStep,
    ]
    block_names = ["edit_inpaint", "edit"]
    block_trigger_inputs = ["processed_mask_image", "image_latents"]
    default_block_name = "edit"

    def select_block(self, processed_mask_image=None, image_latents=None) -> Optional[str]:
        if processed_mask_image is not None:
            return "edit_inpaint"
        elif image_latents is not None:
            return "edit"
        return None

    @property
    def description(self):
        return (
            "Auto core denoising step that selects the appropriate workflow based on inputs.\n"
            " - `QwenImageEditInpaintCoreDenoiseStep` when `processed_mask_image` is provided\n"
            " - `QwenImageEditCoreDenoiseStep` when `image_latents` is provided\n"
            "Supports edit (img2img) and edit inpainting tasks for QwenImage-Edit."
        )
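
Resolution sketch (truthy placeholders stand in for real tensors; a None return falls back to the default/skip handling of `ConditionalPipelineBlocks`):

step = QwenImageEditAutoCoreDenoiseStep()
assert step.select_block(processed_mask_image=1, image_latents=1) == "edit_inpaint"
assert step.select_block(image_latents=1) == "edit"
assert step.select_block() is None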


# ====================
# 4. DECODE
# ====================

# Decode step (standard)
class QwenImageEditDecodeStep(SequentialPipelineBlocks):
    model_name = "qwenimage-edit"
    block_classes = [QwenImageDecoderStep(), QwenImageProcessImagesOutputStep()]
    block_names = ["decode", "postprocess"]

    @property
    def description(self):
        return "Decode step that decodes the latents to images and postprocesses the generated image."


# Inpaint decode step
class QwenImageEditInpaintDecodeStep(SequentialPipelineBlocks):
    model_name = "qwenimage-edit"
    block_classes = [QwenImageDecoderStep(), QwenImageInpaintProcessImagesOutputStep()]
    block_names = ["decode", "postprocess"]

    @property
    def description(self):
        return "Decode step that decodes the latents to images and postprocesses the generated image, optionally applying the mask overlay to the original image."


# Auto decode step
class QwenImageEditAutoDecodeStep(AutoPipelineBlocks):
    block_classes = [QwenImageEditInpaintDecodeStep, QwenImageEditDecodeStep]
    block_names = ["inpaint_decode", "decode"]
    block_trigger_inputs = ["mask", None]

    @property
    def description(self):
        return (
            "Decode step that decodes the latents into images.\n"
            "This is an auto pipeline block.\n"
            " - `QwenImageEditInpaintDecodeStep` (inpaint) is used when `mask` is provided.\n"
            " - `QwenImageEditDecodeStep` (edit) is used when `mask` is not provided.\n"
        )


# ====================
# 5. AUTO BLOCKS & PRESETS
# ====================

EDIT_AUTO_BLOCKS = InsertableDict(
    [
        ("text_encoder", QwenImageEditVLEncoderStep()),
        ("vae_encoder", QwenImageEditAutoVaeEncoderStep()),
        ("denoise", QwenImageEditAutoCoreDenoiseStep()),
        ("decode", QwenImageEditAutoDecodeStep()),
    ]
)


class QwenImageEditAutoBlocks(SequentialPipelineBlocks):
    model_name = "qwenimage-edit"
    block_classes = EDIT_AUTO_BLOCKS.values()
    block_names = EDIT_AUTO_BLOCKS.keys()

    @property
    def description(self):
        return (
            "Auto Modular pipeline for edit (img2img) and edit inpaint tasks using QwenImage-Edit.\n"
            "- for edit (img2img) generation, you need to provide `image`\n"
            "- for edit inpainting, you need to provide `mask_image` and `image`, optionally you can provide `padding_mask_crop`\n"
        )
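
Usage mirrors `QwenImageAutoBlocks`; a hedged sketch with the same assumed loading API (the repo id and input URL are placeholders):

import torch
from diffusers.utils import load_image

blocks = QwenImageEditAutoBlocks()
pipe = blocks.init_pipeline("Qwen/Qwen-Image-Edit")  # assumed modular repo id
pipe.load_components(torch_dtype=torch.bfloat16)
pipe.to("cuda")

image = load_image("https://example.com/input.png")  # placeholder input
out = pipe(image=image, prompt="turn the sky into a sunset", output="images")
out[0].save("edited.png")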

@@ -0,0 +1,175 @@

# Copyright 2025 Qwen-Image Team and The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from ...utils import logging
from ..modular_pipeline import AutoPipelineBlocks, SequentialPipelineBlocks
from ..modular_pipeline_utils import InsertableDict
from .before_denoise import (
    QwenImageEditPlusRoPEInputsStep,
    QwenImagePrepareLatentsStep,
    QwenImageSetTimestepsStep,
)
from .decoders import (
    QwenImageAfterDenoiseStep,
    QwenImageDecoderStep,
    QwenImageProcessImagesOutputStep,
)
from .denoise import (
    QwenImageEditDenoiseStep,
)
from .encoders import (
    QwenImageEditPlusProcessImagesInputStep,
    QwenImageEditPlusResizeDynamicStep,
    QwenImageEditPlusTextEncoderStep,
    QwenImageVaeEncoderDynamicStep,
)
from .inputs import (
    QwenImageEditPlusInputsDynamicStep,
    QwenImageTextInputsStep,
)


logger = logging.get_logger(__name__)


# ====================
# 1. TEXT ENCODER
# ====================

class QwenImageEditPlusVLEncoderStep(SequentialPipelineBlocks):
    """VL encoder that takes both image and text prompts. Uses a 384x384 target area."""
    model_name = "qwenimage-edit-plus"
    block_classes = [
        QwenImageEditPlusResizeDynamicStep(target_area=384 * 384, output_name="resized_cond_image"),
        QwenImageEditPlusTextEncoderStep(),
    ]
    block_names = ["resize", "encode"]

    @property
    def description(self) -> str:
        return "QwenImage-Edit Plus VL encoder step that encodes the image and text prompts together."


# ====================
# 2. VAE ENCODER
# ====================

class QwenImageEditPlusVaeEncoderStep(SequentialPipelineBlocks):
    """VAE encoder that handles multiple images with different sizes. Uses a 1024x1024 target area."""
    model_name = "qwenimage-edit-plus"
    block_classes = [
        QwenImageEditPlusResizeDynamicStep(target_area=1024 * 1024, output_name="resized_image"),
        QwenImageEditPlusProcessImagesInputStep(),
        QwenImageVaeEncoderDynamicStep(),
    ]
    block_names = ["resize", "preprocess", "encode"]

    @property
    def description(self) -> str:
        return (
            "VAE encoder step that encodes image inputs into latent representations.\n"
            "Each image is resized independently based on its own aspect ratio to a 1024x1024 target area."
        )


# ====================
# 3. DENOISE - input -> prepare_latents -> set_timesteps -> prepare_rope_inputs -> denoise -> after_denoise
# ====================

# Edit Plus input step
class QwenImageEditPlusInputStep(SequentialPipelineBlocks):
    model_name = "qwenimage-edit-plus"
    block_classes = [
        QwenImageTextInputsStep(),
        QwenImageEditPlusInputsDynamicStep(image_latent_inputs=["image_latents"]),
    ]
    block_names = ["text_inputs", "additional_inputs"]

    @property
    def description(self):
        return (
            "Input step that prepares the inputs for the Edit Plus denoising step. It:\n"
            " - Standardizes text embeddings batch size.\n"
            " - Processes the list of image latents: patchifies, concatenates along dim=1, expands batch.\n"
            " - Outputs lists of image_height/image_width for RoPE calculation.\n"
            " - Defaults height/width from the last image in the list."
        )


# Edit Plus core denoise
class QwenImageEditPlusCoreDenoiseStep(SequentialPipelineBlocks):
    model_name = "qwenimage-edit-plus"
    block_classes = [
        QwenImageEditPlusInputStep(),
        QwenImagePrepareLatentsStep(),
        QwenImageSetTimestepsStep(),
        QwenImageEditPlusRoPEInputsStep(),
        QwenImageEditDenoiseStep(),
        QwenImageAfterDenoiseStep(),
    ]
    block_names = [
        "input",
        "prepare_latents",
        "set_timesteps",
        "prepare_rope_inputs",
        "denoise",
        "after_denoise",
    ]

    @property
    def description(self):
        return "Core denoising workflow for the QwenImage-Edit Plus edit (img2img) task."


# ====================
# 4. DECODE
# ====================

class QwenImageEditPlusDecodeStep(SequentialPipelineBlocks):
    model_name = "qwenimage-edit-plus"
    block_classes = [QwenImageDecoderStep(), QwenImageProcessImagesOutputStep()]
    block_names = ["decode", "postprocess"]

    @property
    def description(self):
        return "Decode step that decodes the latents to images and postprocesses the generated image."


# ====================
# 5. AUTO BLOCKS & PRESETS
# ====================

EDIT_PLUS_AUTO_BLOCKS = InsertableDict(
    [
        ("text_encoder", QwenImageEditPlusVLEncoderStep()),
        ("vae_encoder", QwenImageEditPlusVaeEncoderStep()),
        ("denoise", QwenImageEditPlusCoreDenoiseStep()),
        ("decode", QwenImageEditPlusDecodeStep()),
    ]
)


class QwenImageEditPlusAutoBlocks(SequentialPipelineBlocks):
    model_name = "qwenimage-edit-plus"
    block_classes = EDIT_PLUS_AUTO_BLOCKS.values()
    block_names = EDIT_PLUS_AUTO_BLOCKS.keys()

    @property
    def description(self):
        return (
            "Auto Modular pipeline for edit (img2img) tasks using QwenImage-Edit Plus.\n"
            "- `image` is a required input (can be a single image or a list of images).\n"
            "- Each image is resized independently based on its own aspect ratio.\n"
            "- VL encoder uses a 384x384 target area, VAE encoder uses a 1024x1024 target area."
        )
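
A hedged multi-image sketch (same assumed loading API as above; the repo id and file names are placeholders):

import torch
from diffusers.utils import load_image

blocks = QwenImageEditPlusAutoBlocks()
pipe = blocks.init_pipeline("Qwen/Qwen-Image-Edit-2509")  # assumed repo id
pipe.load_components(torch_dtype=torch.bfloat16)
pipe.to("cuda")

imgs = [load_image("person.png"), load_image("product.png")]  # each resized independently
out = pipe(image=imgs, prompt="the person holds the product", output="images")
out[0].save("composed.png")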