Mirror of https://github.com/huggingface/diffusers.git (synced 2026-02-01 00:15:00 +08:00)

Compare commits: main...modular-wo (29 commits)
Commits in this comparison (SHA1 only; the author and date columns were empty in the mirror):
20c35da75c, 6a549f5f55, 412e51c856, 23d06423ab, aba551c868, 1f9576a2ca, d75fbc43c7, b7127ce7a7,
7e9d2b954e, 94525200fd, f056af1fbb, 8d45ff5bf6, fb15752d55, 1f2dbc9dd2, 002c3e8239, de03d7f100,
25c968a38f, aea0d046f6, 1c90ce33f2, 507953f415, f0555af1c6, 2a81f2ec54, d20f413f78, ff09bf1a63,
34a743e2dc, 43ab14845d, fbfe5c8d6b, b29873dee7, 7b499de6d0
.github/workflows/pr_modular_tests.yml (vendored, 20 changes)

@@ -75,9 +75,27 @@ jobs:
        if: ${{ failure() }}
        run: |
          echo "Repo consistency check failed. Please ensure the right dependency versions are installed with 'pip install -e .[quality]' and run 'make fix-copies'" >> $GITHUB_STEP_SUMMARY
+  check_auto_docs:
+    runs-on: ubuntu-22.04
+    steps:
+      - uses: actions/checkout@v6
+      - name: Set up Python
+        uses: actions/setup-python@v6
+        with:
+          python-version: "3.10"
+      - name: Install dependencies
+        run: |
+          pip install --upgrade pip
+          pip install .[quality]
+      - name: Check auto docs
+        run: make modular-autodoctrings
+      - name: Check if failure
+        if: ${{ failure() }}
+        run: |
+          echo "Auto docstring checks failed. Please run `python utils/modular_auto_docstring.py --fix_and_overwrite`." >> $GITHUB_STEP_SUMMARY
 
   run_fast_tests:
-    needs: [check_code_quality, check_repository_consistency]
+    needs: [check_code_quality, check_repository_consistency, check_auto_docs]
     name: Fast PyTorch Modular Pipeline CPU tests
 
     runs-on:
Makefile (4 changes)

@@ -70,6 +70,10 @@ fix-copies:
 	python utils/check_copies.py --fix_and_overwrite
 	python utils/check_dummies.py --fix_and_overwrite
+
+# Auto docstrings in modular blocks
+modular-autodoctrings:
+	python utils/modular_auto_docstring.py
 
 # Run tests for the library
 
 test:
src/diffusers/modular_pipelines/modular_pipeline.py

@@ -39,8 +39,11 @@ from .modular_pipeline_utils import (
     InputParam,
     InsertableDict,
     OutputParam,
+    combine_inputs,
+    combine_outputs,
     format_components,
     format_configs,
+    format_workflow,
     make_doc_string,
 )
 
@@ -242,6 +245,7 @@ class ModularPipelineBlocks(ConfigMixin, PushToHubMixin):
 
     config_name = "modular_config.json"
     model_name = None
+    _workflow_map = None
 
     @classmethod
     def _get_signature_keys(cls, obj):
@@ -297,6 +301,35 @@ class ModularPipelineBlocks(ConfigMixin, PushToHubMixin):
     def outputs(self) -> List[OutputParam]:
         return self._get_outputs()
 
+    # currently only ConditionalPipelineBlocks and SequentialPipelineBlocks support `get_execution_blocks`
+    def get_execution_blocks(self, **kwargs):
+        """
+        Get the block(s) that would execute given the inputs. Must be implemented by subclasses that support
+        conditional block selection.
+
+        Args:
+            **kwargs: Input names and values. Only trigger inputs affect block selection.
+        """
+        raise NotImplementedError(f"`get_execution_blocks` is not implemented for {self.__class__.__name__}")
+
+    # currently only SequentialPipelineBlocks supports workflows
+    @property
+    def workflow_names(self):
+        """
+        Returns a list of available workflow names. Must be implemented by subclasses that define `_workflow_map`.
+        """
+        raise NotImplementedError(f"`workflow_names` is not implemented for {self.__class__.__name__}")
+
+    def get_workflow(self, workflow_name: str):
+        """
+        Get the execution blocks for a specific workflow. Must be implemented by subclasses that define
+        `_workflow_map`.
+
+        Args:
+            workflow_name: Name of the workflow to retrieve.
+        """
+        raise NotImplementedError(f"`get_workflow` is not implemented for {self.__class__.__name__}")
+
     @classmethod
     def from_pretrained(
         cls,
@@ -434,72 +467,6 @@ class ModularPipelineBlocks(ConfigMixin, PushToHubMixin):
             if current_value is not param:  # Using identity comparison to check if object was modified
                 state.set(param_name, param, input_param.kwargs_type)
 
-    @staticmethod
-    def combine_inputs(*named_input_lists: List[Tuple[str, List[InputParam]]]) -> List[InputParam]:
-        """
-        Combines multiple lists of InputParam objects from different blocks. For duplicate inputs, updates only if
-        current default value is None and new default value is not None. Warns if multiple non-None default values
-        exist for the same input.
-
-        Args:
-            named_input_lists: List of tuples containing (block_name, input_param_list) pairs
-
-        Returns:
-            List[InputParam]: Combined list of unique InputParam objects
-        """
-        combined_dict = {}  # name -> InputParam
-        value_sources = {}  # name -> block_name
-
-        for block_name, inputs in named_input_lists:
-            for input_param in inputs:
-                if input_param.name is None and input_param.kwargs_type is not None:
-                    input_name = "*_" + input_param.kwargs_type
-                else:
-                    input_name = input_param.name
-                if input_name in combined_dict:
-                    current_param = combined_dict[input_name]
-                    if (
-                        current_param.default is not None
-                        and input_param.default is not None
-                        and current_param.default != input_param.default
-                    ):
-                        warnings.warn(
-                            f"Multiple different default values found for input '{input_name}': "
-                            f"{current_param.default} (from block '{value_sources[input_name]}') and "
-                            f"{input_param.default} (from block '{block_name}'). Using {current_param.default}."
-                        )
-                    if current_param.default is None and input_param.default is not None:
-                        combined_dict[input_name] = input_param
-                        value_sources[input_name] = block_name
-                else:
-                    combined_dict[input_name] = input_param
-                    value_sources[input_name] = block_name
-
-        return list(combined_dict.values())
-
-    @staticmethod
-    def combine_outputs(*named_output_lists: List[Tuple[str, List[OutputParam]]]) -> List[OutputParam]:
-        """
-        Combines multiple lists of OutputParam objects from different blocks. For duplicate outputs, keeps the first
-        occurrence of each output name.
-
-        Args:
-            named_output_lists: List of tuples containing (block_name, output_param_list) pairs
-
-        Returns:
-            List[OutputParam]: Combined list of unique OutputParam objects
-        """
-        combined_dict = {}  # name -> OutputParam
-
-        for block_name, outputs in named_output_lists:
-            for output_param in outputs:
-                if (output_param.name not in combined_dict) or (
-                    combined_dict[output_param.name].kwargs_type is None and output_param.kwargs_type is not None
-                ):
-                    combined_dict[output_param.name] = output_param
-
-        return list(combined_dict.values())
-
     @property
     def input_names(self) -> List[str]:
         return [input_param.name for input_param in self.inputs if input_param.name is not None]
@@ -531,7 +498,8 @@ class ModularPipelineBlocks(ConfigMixin, PushToHubMixin):
 class ConditionalPipelineBlocks(ModularPipelineBlocks):
     """
     A Pipeline Blocks that conditionally selects a block to run based on the inputs. Subclasses must implement the
-    `select_block` method to define the logic for selecting the block.
+    `select_block` method to define the logic for selecting the block. Currently, we only support selection logic based
+    on the presence or absence of inputs (i.e., whether they are `None` or not).
 
     This class inherits from [`ModularPipelineBlocks`]. Check the superclass documentation for the generic methods the
     library implements for all the pipeline blocks (such as loading or saving etc.)
@@ -539,15 +507,20 @@ class ConditionalPipelineBlocks(ModularPipelineBlocks):
     > [!WARNING] > This is an experimental feature and is likely to change in the future.
 
     Attributes:
-        block_classes: List of block classes to be used
-        block_names: List of prefixes for each block
-        block_trigger_inputs: List of input names that select_block() uses to determine which block to run
+        block_classes: List of block classes to be used. Must have the same length as `block_names`.
+        block_names: List of names for each block. Must have the same length as `block_classes`.
+        block_trigger_inputs: List of input names that `select_block()` uses to determine which block to run.
+            For `ConditionalPipelineBlocks`, this does not need to correspond to `block_names` and `block_classes`. For
+            `AutoPipelineBlocks`, this must have the same length as `block_names` and `block_classes`, where each
+            element specifies the trigger input for the corresponding block.
+        default_block_name: Name of the default block to run when no trigger inputs match.
+            If None, this block can be skipped entirely when no trigger inputs are provided.
     """
 
     block_classes = []
     block_names = []
     block_trigger_inputs = []
-    default_block_name = None  # name of the default block if no trigger inputs are provided, if None, this block can be skipped if no trigger inputs are provided
+    default_block_name = None
 
     def __init__(self):
         sub_blocks = InsertableDict()
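To make the contract above concrete, here is a minimal sketch of a custom `ConditionalPipelineBlocks` subclass. The block classes and input names are hypothetical stand-ins, not part of this diff:

```python
# Sketch only: EditBlock, RefineBlock, and the input names are assumptions.
class MyConditionalBlocks(ConditionalPipelineBlocks):
    block_classes = [EditBlock, RefineBlock]
    block_names = ["edit", "refine"]
    # select_block() may consult any of these; no 1:1 mapping is required here
    block_trigger_inputs = ["edit_image", "refine_strength"]
    default_block_name = "edit"  # used when select_block() returns None

    def select_block(self, edit_image=None, refine_strength=None):
        # Per the docstring, selection may only depend on the presence/absence
        # (None-ness) of inputs, not on their values.
        if refine_strength is not None:
            return "refine"
        return None  # falls back to default_block_name ("edit")
```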
@@ -611,7 +584,7 @@ class ConditionalPipelineBlocks(ModularPipelineBlocks):
     @property
     def inputs(self) -> List[Tuple[str, Any]]:
         named_inputs = [(name, block.inputs) for name, block in self.sub_blocks.items()]
-        combined_inputs = self.combine_inputs(*named_inputs)
+        combined_inputs = combine_inputs(*named_inputs)
         # mark Required inputs only if that input is required by all the blocks
         for input_param in combined_inputs:
             if input_param.name in self.required_inputs:
@@ -623,15 +596,16 @@ class ConditionalPipelineBlocks(ModularPipelineBlocks):
     @property
     def intermediate_outputs(self) -> List[str]:
         named_outputs = [(name, block.intermediate_outputs) for name, block in self.sub_blocks.items()]
-        combined_outputs = self.combine_outputs(*named_outputs)
+        combined_outputs = combine_outputs(*named_outputs)
         return combined_outputs
 
     @property
     def outputs(self) -> List[str]:
         named_outputs = [(name, block.outputs) for name, block in self.sub_blocks.items()]
-        combined_outputs = self.combine_outputs(*named_outputs)
+        combined_outputs = combine_outputs(*named_outputs)
         return combined_outputs
 
+    # used for `__repr__`
     def _get_trigger_inputs(self) -> set:
         """
         Returns a set of all unique trigger input values found in this block and nested blocks.
@@ -660,11 +634,6 @@ class ConditionalPipelineBlocks(ModularPipelineBlocks):
 
         return all_triggers
 
-    @property
-    def trigger_inputs(self):
-        """All trigger inputs including from nested blocks."""
-        return self._get_trigger_inputs()
-
     def select_block(self, **kwargs) -> Optional[str]:
         """
         Select the block to run based on the trigger inputs. Subclasses must implement this method to define the logic
@@ -704,6 +673,39 @@ class ConditionalPipelineBlocks(ModularPipelineBlocks):
             logger.error(error_msg)
             raise
 
+    def get_execution_blocks(self, **kwargs) -> Optional["ModularPipelineBlocks"]:
+        """
+        Get the block(s) that would execute given the inputs.
+
+        Recursively resolves nested ConditionalPipelineBlocks until reaching either:
+        - A leaf block (no sub_blocks) → returns single `ModularPipelineBlocks`
+        - A `SequentialPipelineBlocks` → delegates to its `get_execution_blocks()` which returns
+          a `SequentialPipelineBlocks` containing the resolved execution blocks
+
+        Args:
+            **kwargs: Input names and values. Only trigger inputs affect block selection.
+
+        Returns:
+            - `ModularPipelineBlocks`: A leaf block or resolved `SequentialPipelineBlocks`
+            - `None`: If this block would be skipped (no trigger matched and no default)
+        """
+        trigger_kwargs = {name: kwargs.get(name) for name in self.block_trigger_inputs if name is not None}
+        block_name = self.select_block(**trigger_kwargs)
+
+        if block_name is None:
+            block_name = self.default_block_name
+
+        if block_name is None:
+            return None
+
+        block = self.sub_blocks[block_name]
+
+        # Recursively resolve until we hit a leaf block or a SequentialPipelineBlocks
+        if block.sub_blocks:
+            return block.get_execution_blocks(**kwargs)
+
+        return block
+
     def __repr__(self):
         class_name = self.__class__.__name__
         base_class = self.__class__.__bases__[0].__name__
@@ -711,11 +713,11 @@ class ConditionalPipelineBlocks(ModularPipelineBlocks):
             f"{class_name}(\n Class: {base_class}\n" if base_class and base_class != "object" else f"{class_name}(\n"
         )
 
-        if self.trigger_inputs:
+        if self._get_trigger_inputs():
             header += "\n"
             header += " " + "=" * 100 + "\n"
             header += " This pipeline contains blocks that are selected at runtime based on inputs.\n"
-            header += f" Trigger Inputs: {sorted(self.trigger_inputs)}\n"
+            header += f" Trigger Inputs: {sorted(self._get_trigger_inputs())}\n"
             header += " " + "=" * 100 + "\n\n"
 
         # Format description with proper indentation
@@ -782,24 +784,56 @@ class ConditionalPipelineBlocks(ModularPipelineBlocks):
 
 class AutoPipelineBlocks(ConditionalPipelineBlocks):
     """
     A Pipeline Blocks that automatically selects a block to run based on the presence of trigger inputs.
 
+    This is a specialized version of `ConditionalPipelineBlocks` where:
+    - Each block has one corresponding trigger input (1:1 mapping)
+    - Block selection is automatic: the first block whose trigger input is present gets selected
+    - `block_trigger_inputs` must have the same length as `block_names` and `block_classes`
+    - Use `None` in `block_trigger_inputs` to specify the default block, i.e., the block that will run if no trigger
+      inputs are present
+
+    Attributes:
+        block_classes:
+            List of block classes to be used. Must have the same length as `block_names` and
+            `block_trigger_inputs`.
+        block_names:
+            List of names for each block. Must have the same length as `block_classes` and `block_trigger_inputs`.
+        block_trigger_inputs:
+            List of input names where each element specifies the trigger input for the corresponding block. Use
+            `None` to mark the default block.
+
+    Example:
+        ```python
+        class MyAutoBlock(AutoPipelineBlocks):
+            block_classes = [InpaintEncoderBlock, ImageEncoderBlock, TextEncoderBlock]
+            block_names = ["inpaint", "img2img", "text2img"]
+            block_trigger_inputs = ["mask_image", "image", None]  # text2img is the default
+        ```
+
+        With this definition:
+        - As long as `mask_image` is provided, the "inpaint" block runs (regardless of whether `image` is provided)
+        - If `mask_image` is not provided but `image` is, the "img2img" block runs
+        - Otherwise, the "text2img" block runs (default, trigger is `None`)
     """
 
     def __init__(self):
         super().__init__()
 
+        if self.default_block_name is not None:
+            raise ValueError(
+                f"In {self.__class__.__name__}, do not set `default_block_name` for AutoPipelineBlocks. "
+                f"Use `None` in `block_trigger_inputs` to specify the default block."
+            )
+
         if not (len(self.block_classes) == len(self.block_names) == len(self.block_trigger_inputs)):
             raise ValueError(
                 f"In {self.__class__.__name__}, the number of block_classes, block_names, and block_trigger_inputs must be the same."
             )
 
-    @property
-    def default_block_name(self) -> Optional[str]:
-        """Derive default_block_name from block_trigger_inputs (None entry)."""
         if None in self.block_trigger_inputs:
             idx = self.block_trigger_inputs.index(None)
-            return self.block_names[idx]
-        return None
+            self.default_block_name = self.block_names[idx]
 
     def select_block(self, **kwargs) -> Optional[str]:
         """Select block based on which trigger input is present (not None)."""
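Given the `MyAutoBlock` definition from the docstring example above, resolution through the new `get_execution_blocks` would behave roughly like this (a sketch; `mask` and `image` are placeholder runtime values):

```python
auto_block = MyAutoBlock()

# "inpaint" wins as soon as mask_image is present, even if image is also passed
auto_block.get_execution_blocks(mask_image=mask, image=image)  # -> "inpaint" leaf block

# no mask_image, but image is present -> "img2img"
auto_block.get_execution_blocks(image=image)                   # -> "img2img" leaf block

# no trigger input present -> default block (the one whose trigger is None)
auto_block.get_execution_blocks(prompt="a cat")                # -> "text2img" leaf block
```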
@@ -853,6 +887,29 @@ class SequentialPipelineBlocks(ModularPipelineBlocks):
             expected_configs.append(config)
         return expected_configs
 
+    @property
+    def workflow_names(self):
+        if self._workflow_map is None:
+            raise NotImplementedError(
+                f"workflows is not supported because _workflow_map is not set for {self.__class__.__name__}"
+            )
+
+        return list(self._workflow_map.keys())
+
+    def get_workflow(self, workflow_name: str):
+        if self._workflow_map is None:
+            raise NotImplementedError(
+                f"workflows is not supported because _workflow_map is not set for {self.__class__.__name__}"
+            )
+
+        if workflow_name not in self._workflow_map:
+            raise ValueError(f"Workflow {workflow_name} not found in {self.__class__.__name__}")
+
+        trigger_inputs = self._workflow_map[workflow_name]
+        workflow_blocks = self.get_execution_blocks(**trigger_inputs)
+
+        return workflow_blocks
+
     @classmethod
     def from_blocks_dict(
         cls, blocks_dict: Dict[str, Any], description: Optional[str] = None
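A sketch of how the new `_workflow_map` hooks might be used on a `SequentialPipelineBlocks` subclass; the step classes and workflow names here are illustrative assumptions, not defined anywhere in this PR:

```python
# Sketch only: the step classes are hypothetical placeholders.
class MyPipelineBlocks(SequentialPipelineBlocks):
    block_classes = [TextEncoderStep, VaeImageEncoderStep, DenoiseStep, DecodeStep]
    block_names = ["text_encoder", "image_encoder", "denoise", "decode"]
    # workflow name -> trigger inputs that get_workflow() forwards to get_execution_blocks()
    _workflow_map = {
        "text2image": {},
        "img2img": {"image": True},
    }

blocks = MyPipelineBlocks()
blocks.workflow_names                     # ["text2image", "img2img"]
img2img = blocks.get_workflow("img2img")  # SequentialPipelineBlocks for the img2img path
blocks.get_workflow("inpaint")            # raises ValueError: workflow not found
```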
@@ -948,7 +1005,7 @@ class SequentialPipelineBlocks(ModularPipelineBlocks):
             # filter out them here so they do not end up as intermediate_outputs
             if name not in inp_names:
                 named_outputs.append((name, block.intermediate_outputs))
-        combined_outputs = self.combine_outputs(*named_outputs)
+        combined_outputs = combine_outputs(*named_outputs)
         return combined_outputs
 
     # YiYi TODO: I think we can remove the outputs property
@@ -972,6 +1029,7 @@ class SequentialPipelineBlocks(ModularPipelineBlocks):
             raise
         return pipeline, state
 
+    # used for `__repr__`
     def _get_trigger_inputs(self):
         """
         Returns a set of all unique trigger input values found in the blocks.
@@ -995,89 +1053,50 @@ class SequentialPipelineBlocks(ModularPipelineBlocks):
 
         return fn_recursive_get_trigger(self.sub_blocks)
 
-    @property
-    def trigger_inputs(self):
-        return self._get_trigger_inputs()
-
-    def _traverse_trigger_blocks(self, active_inputs):
+    def get_execution_blocks(self, **kwargs) -> "SequentialPipelineBlocks":
         """
-        Traverse blocks and select which ones would run given the active inputs.
+        Get the blocks that would execute given the specified inputs.
 
         Args:
-            active_inputs: Dict of input names to values that are "present"
+            **kwargs: Input names and values. Only trigger inputs affect block selection.
 
         Returns:
-            OrderedDict of block_name -> block that would execute
+            SequentialPipelineBlocks containing only the blocks that would execute
         """
+        # Copy kwargs so we can add outputs as we traverse
+        active_inputs = dict(kwargs)
 
         def fn_recursive_traverse(block, block_name, active_inputs):
             result_blocks = OrderedDict()
 
             # ConditionalPipelineBlocks (includes AutoPipelineBlocks)
             if isinstance(block, ConditionalPipelineBlocks):
-                trigger_kwargs = {name: active_inputs.get(name) for name in block.block_trigger_inputs}
-                selected_block_name = block.select_block(**trigger_kwargs)
-
-                if selected_block_name is None:
-                    selected_block_name = block.default_block_name
-
-                if selected_block_name is None:
+                block = block.get_execution_blocks(**active_inputs)
+                if block is None:
                     return result_blocks
 
-                selected_block = block.sub_blocks[selected_block_name]
-
-                if selected_block.sub_blocks:
-                    result_blocks.update(fn_recursive_traverse(selected_block, block_name, active_inputs))
-                else:
-                    result_blocks[block_name] = selected_block
-                    if hasattr(selected_block, "outputs"):
-                        for out in selected_block.outputs:
-                            active_inputs[out.name] = True
-
-                return result_blocks
-
-            # SequentialPipelineBlocks or LoopSequentialPipelineBlocks
-            if block.sub_blocks:
+            # Has sub_blocks (SequentialPipelineBlocks/ConditionalPipelineBlocks)
+            if block.sub_blocks and not isinstance(block, LoopSequentialPipelineBlocks):
                 for sub_block_name, sub_block in block.sub_blocks.items():
-                    blocks_to_update = fn_recursive_traverse(sub_block, sub_block_name, active_inputs)
-                    blocks_to_update = {f"{block_name}.{k}": v for k, v in blocks_to_update.items()}
-                    result_blocks.update(blocks_to_update)
+                    nested_blocks = fn_recursive_traverse(sub_block, sub_block_name, active_inputs)
+                    nested_blocks = {f"{block_name}.{k}": v for k, v in nested_blocks.items()}
+                    result_blocks.update(nested_blocks)
             else:
+                # Leaf block: single ModularPipelineBlocks or LoopSequentialPipelineBlocks
                 result_blocks[block_name] = block
-                if hasattr(block, "outputs"):
-                    for out in block.outputs:
+                # Add outputs to active_inputs so subsequent blocks can use them as triggers
+                if hasattr(block, "intermediate_outputs"):
+                    for out in block.intermediate_outputs:
                         active_inputs[out.name] = True
 
             return result_blocks
 
         all_blocks = OrderedDict()
         for block_name, block in self.sub_blocks.items():
-            blocks_to_update = fn_recursive_traverse(block, block_name, active_inputs)
-            all_blocks.update(blocks_to_update)
-        return all_blocks
-
-    def get_execution_blocks(self, **kwargs):
-        """
-        Get the blocks that would execute given the specified inputs.
-
-        Args:
-            **kwargs: Input names and values. Only trigger inputs affect block selection.
-                Pass any inputs that would be non-None at runtime.
-
-        Returns:
-            SequentialPipelineBlocks containing only the blocks that would execute
-
-        Example:
-            # Get blocks for inpainting workflow
-            blocks = pipeline.get_execution_blocks(prompt="a cat", mask=mask, image=image)
-
-            # Get blocks for text2image workflow
-            blocks = pipeline.get_execution_blocks(prompt="a cat")
-        """
-        # Filter out None values
-        active_inputs = {k: v for k, v in kwargs.items() if v is not None}
-
-        blocks_triggered = self._traverse_trigger_blocks(active_inputs)
-        return SequentialPipelineBlocks.from_blocks_dict(blocks_triggered)
+            nested_blocks = fn_recursive_traverse(block, block_name, active_inputs)
+            all_blocks.update(nested_blocks)
+
+        return SequentialPipelineBlocks.from_blocks_dict(all_blocks)
 
     def __repr__(self):
         class_name = self.__class__.__name__
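With the rewritten traversal, callers resolve conditional branches in a single call; a sketch using the hypothetical `MyPipelineBlocks` from the earlier note (`init_image` is a placeholder value):

```python
t2i_blocks = MyPipelineBlocks()

# Only trigger inputs affect selection; pass whatever would be non-None at runtime
resolved = t2i_blocks.get_execution_blocks(prompt="a cat", image=init_image)
print(resolved)  # SequentialPipelineBlocks containing only the img2img path

# Note the behavioral change in this hunk: leaf outputs are now propagated via
# `intermediate_outputs` (not `outputs`), so a block's products can act as
# triggers for blocks later in the sequence during traversal.
```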
@@ -1086,18 +1105,23 @@ class SequentialPipelineBlocks(ModularPipelineBlocks):
             f"{class_name}(\n Class: {base_class}\n" if base_class and base_class != "object" else f"{class_name}(\n"
         )
 
-        if self.trigger_inputs:
+        if self._workflow_map is None and self._get_trigger_inputs():
             header += "\n"
             header += " " + "=" * 100 + "\n"
             header += " This pipeline contains blocks that are selected at runtime based on inputs.\n"
-            header += f" Trigger Inputs: {[inp for inp in self.trigger_inputs if inp is not None]}\n"
+            header += f" Trigger Inputs: {[inp for inp in self._get_trigger_inputs() if inp is not None]}\n"
             # Get first trigger input as example
-            example_input = next(t for t in self.trigger_inputs if t is not None)
+            example_input = next(t for t in self._get_trigger_inputs() if t is not None)
             header += f" Use `get_execution_blocks()` to see selected blocks (e.g. `get_execution_blocks({example_input}=...)`).\n"
             header += " " + "=" * 100 + "\n\n"
 
+        description = self.description
+        if self._workflow_map is not None:
+            workflow_str = format_workflow(self._workflow_map)
+            description = f"{self.description}\n\n{workflow_str}"
+
         # Format description with proper indentation
-        desc_lines = self.description.split("\n")
+        desc_lines = description.split("\n")
         desc = []
         # First line with "Description:" label
         desc.append(f" Description: {desc_lines[0]}")
@@ -1145,10 +1169,15 @@ class SequentialPipelineBlocks(ModularPipelineBlocks):
 
     @property
     def doc(self):
+        description = self.description
+        if self._workflow_map is not None:
+            workflow_str = format_workflow(self._workflow_map)
+            description = f"{self.description}\n\n{workflow_str}"
+
         return make_doc_string(
             self.inputs,
             self.outputs,
-            self.description,
+            description=description,
             class_name=self.__class__.__name__,
             expected_components=self.expected_components,
             expected_configs=self.expected_configs,
@@ -1281,7 +1310,7 @@ class LoopSequentialPipelineBlocks(ModularPipelineBlocks):
     @property
     def intermediate_outputs(self) -> List[str]:
         named_outputs = [(name, block.intermediate_outputs) for name, block in self.sub_blocks.items()]
-        combined_outputs = self.combine_outputs(*named_outputs)
+        combined_outputs = combine_outputs(*named_outputs)
         for output in self.loop_intermediate_outputs:
             if output.name not in {output.name for output in combined_outputs}:
                 combined_outputs.append(output)
src/diffusers/modular_pipelines/modular_pipeline_utils.py

@@ -14,10 +14,12 @@
 
 import inspect
 import re
+import warnings
 from collections import OrderedDict
 from dataclasses import dataclass, field, fields
-from typing import Any, Dict, List, Literal, Optional, Type, Union
+from typing import Any, Dict, List, Literal, Optional, Tuple, Type, Union
 
+import PIL.Image
 import torch
 
 from ..configuration_utils import ConfigMixin, FrozenDict
@@ -323,11 +325,192 @@ class ConfigSpec:
     description: Optional[str] = None
 
 
-# YiYi Notes: both inputs and intermediate_inputs are InputParam objects
-# however some fields are not relevant for intermediate_inputs
-# e.g. unlike inputs, required only used in docstring for intermediate_inputs, we do not check if a required intermediate inputs is passed
-# default is not used for intermediate_inputs, we only use default from inputs, so it is ignored if it is set for intermediate_inputs
-# -> should we use different class for inputs and intermediate_inputs?
+# ======================================================
+# InputParam and OutputParam templates
+# ======================================================
+
+INPUT_PARAM_TEMPLATES = {
+    "prompt": {
+        "type_hint": str,
+        "required": True,
+        "description": "The prompt or prompts to guide image generation.",
+    },
+    "negative_prompt": {
+        "type_hint": str,
+        "description": "The prompt or prompts not to guide the image generation.",
+    },
+    "max_sequence_length": {
+        "type_hint": int,
+        "default": 512,
+        "description": "Maximum sequence length for prompt encoding.",
+    },
+    "height": {
+        "type_hint": int,
+        "description": "The height in pixels of the generated image.",
+    },
+    "width": {
+        "type_hint": int,
+        "description": "The width in pixels of the generated image.",
+    },
+    "num_inference_steps": {
+        "type_hint": int,
+        "default": 50,
+        "description": "The number of denoising steps.",
+    },
+    "num_images_per_prompt": {
+        "type_hint": int,
+        "default": 1,
+        "description": "The number of images to generate per prompt.",
+    },
+    "generator": {
+        "type_hint": torch.Generator,
+        "description": "Torch generator for deterministic generation.",
+    },
+    "sigmas": {
+        "type_hint": List[float],
+        "description": "Custom sigmas for the denoising process.",
+    },
+    "strength": {
+        "type_hint": float,
+        "default": 0.9,
+        "description": "Strength for img2img/inpainting.",
+    },
+    "image": {
+        "type_hint": Union[PIL.Image.Image, List[PIL.Image.Image]],
+        "required": True,
+        "description": "Reference image(s) for denoising. Can be a single image or list of images.",
+    },
+    "latents": {
+        "type_hint": torch.Tensor,
+        "description": "Pre-generated noisy latents for image generation.",
+    },
+    "timesteps": {
+        "type_hint": torch.Tensor,
+        "description": "Timesteps for the denoising process.",
+    },
+    "output_type": {
+        "type_hint": str,
+        "default": "pil",
+        "description": "Output format: 'pil', 'np', 'pt'.",
+    },
+    "attention_kwargs": {
+        "type_hint": Dict[str, Any],
+        "description": "Additional kwargs for attention processors.",
+    },
+    "denoiser_input_fields": {
+        "name": None,
+        "kwargs_type": "denoiser_input_fields",
+        "description": "conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.",
+    },
+    # inpainting
+    "mask_image": {
+        "type_hint": PIL.Image.Image,
+        "required": True,
+        "description": "Mask image for inpainting.",
+    },
+    "padding_mask_crop": {
+        "type_hint": int,
+        "description": "Padding for mask cropping in inpainting.",
+    },
+    # controlnet
+    "control_image": {
+        "type_hint": PIL.Image.Image,
+        "required": True,
+        "description": "Control image for ControlNet conditioning.",
+    },
+    "control_guidance_start": {
+        "type_hint": float,
+        "default": 0.0,
+        "description": "When to start applying ControlNet.",
+    },
+    "control_guidance_end": {
+        "type_hint": float,
+        "default": 1.0,
+        "description": "When to stop applying ControlNet.",
+    },
+    "controlnet_conditioning_scale": {
+        "type_hint": float,
+        "default": 1.0,
+        "description": "Scale for ControlNet conditioning.",
+    },
+    "layers": {
+        "type_hint": int,
+        "default": 4,
+        "description": "Number of layers to extract from the image",
+    },
+    # common intermediate inputs
+    "prompt_embeds": {
+        "type_hint": torch.Tensor,
+        "required": True,
+        "description": "text embeddings used to guide the image generation. Can be generated from text_encoder step.",
+    },
+    "prompt_embeds_mask": {
+        "type_hint": torch.Tensor,
+        "required": True,
+        "description": "mask for the text embeddings. Can be generated from text_encoder step.",
+    },
+    "negative_prompt_embeds": {
+        "type_hint": torch.Tensor,
+        "description": "negative text embeddings used to guide the image generation. Can be generated from text_encoder step.",
+    },
+    "negative_prompt_embeds_mask": {
+        "type_hint": torch.Tensor,
+        "description": "mask for the negative text embeddings. Can be generated from text_encoder step.",
+    },
+    "image_latents": {
+        "type_hint": torch.Tensor,
+        "required": True,
+        "description": "image latents used to guide the image generation. Can be generated from vae_encoder step.",
+    },
+    "batch_size": {
+        "type_hint": int,
+        "default": 1,
+        "description": "Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can be generated in input step.",
+    },
+    "dtype": {
+        "type_hint": torch.dtype,
+        "default": torch.float32,
+        "description": "The dtype of the model inputs, can be generated in input step.",
+    },
+}
+
+OUTPUT_PARAM_TEMPLATES = {
+    "images": {
+        "type_hint": List[PIL.Image.Image],
+        "description": "Generated images.",
+    },
+    "latents": {
+        "type_hint": torch.Tensor,
+        "description": "Denoised latents.",
+    },
+    # intermediate outputs
+    "prompt_embeds": {
+        "type_hint": torch.Tensor,
+        "kwargs_type": "denoiser_input_fields",
+        "description": "The prompt embeddings.",
+    },
+    "prompt_embeds_mask": {
+        "type_hint": torch.Tensor,
+        "kwargs_type": "denoiser_input_fields",
+        "description": "The encoder attention mask.",
+    },
+    "negative_prompt_embeds": {
+        "type_hint": torch.Tensor,
+        "kwargs_type": "denoiser_input_fields",
+        "description": "The negative prompt embeddings.",
+    },
+    "negative_prompt_embeds_mask": {
+        "type_hint": torch.Tensor,
+        "kwargs_type": "denoiser_input_fields",
+        "description": "The negative prompt embeddings mask.",
+    },
+    "image_latents": {
+        "type_hint": torch.Tensor,
+        "description": "The latent representation of the input image.",
+    },
+}
+
+
 @dataclass
 class InputParam:
     """Specification for an input parameter."""
@@ -337,11 +520,31 @@ class InputParam:
     default: Any = None
     required: bool = False
     description: str = ""
-    kwargs_type: str = None  # YiYi Notes: remove this feature (maybe)
+    kwargs_type: str = None
 
     def __repr__(self):
         return f"<{self.name}: {'required' if self.required else 'optional'}, default={self.default}>"
 
+    @classmethod
+    def template(cls, template_name: str, note: str = None, **overrides) -> "InputParam":
+        """Get template for name if exists, otherwise raise ValueError."""
+        if template_name not in INPUT_PARAM_TEMPLATES:
+            raise ValueError(f"InputParam template for {template_name} not found")
+
+        template_kwargs = INPUT_PARAM_TEMPLATES[template_name].copy()
+
+        # Determine the actual param name:
+        # 1. From overrides if provided
+        # 2. From template if present
+        # 3. Fall back to template_name
+        name = overrides.pop("name", template_kwargs.pop("name", template_name))
+
+        if note and "description" in template_kwargs:
+            template_kwargs["description"] = f"{template_kwargs['description']} ({note})"
+
+        template_kwargs.update(overrides)
+        return cls(name=name, **template_kwargs)
+
 
 @dataclass
 class OutputParam:
@@ -350,13 +553,33 @@ class OutputParam:
     name: str
     type_hint: Any = None
     description: str = ""
-    kwargs_type: str = None  # YiYi notes: remove this feature (maybe)
+    kwargs_type: str = None
 
     def __repr__(self):
         return (
             f"<{self.name}: {self.type_hint.__name__ if hasattr(self.type_hint, '__name__') else str(self.type_hint)}>"
         )
 
+    @classmethod
+    def template(cls, template_name: str, note: str = None, **overrides) -> "OutputParam":
+        """Get template for name if exists, otherwise raise ValueError."""
+        if template_name not in OUTPUT_PARAM_TEMPLATES:
+            raise ValueError(f"OutputParam template for {template_name} not found")
+
+        template_kwargs = OUTPUT_PARAM_TEMPLATES[template_name].copy()
+
+        # Determine the actual param name:
+        # 1. From overrides if provided
+        # 2. From template if present
+        # 3. Fall back to template_name
+        name = overrides.pop("name", template_kwargs.pop("name", template_name))
+
+        if note and "description" in template_kwargs:
+            template_kwargs["description"] = f"{template_kwargs['description']} ({note})"
+
+        template_kwargs.update(overrides)
+        return cls(name=name, **template_kwargs)
+
 
 def format_inputs_short(inputs):
     """
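For illustration, here is how the new `template` classmethod resolves names, notes, and overrides; the import path is an assumption based on the file shown above:

```python
from diffusers.modular_pipelines.modular_pipeline_utils import InputParam

# Plain template lookup: name, type_hint, and description come from the shared table
height = InputParam.template("height")

# `note` is appended to the templated description; other kwargs override fields
steps = InputParam.template("num_inference_steps", note="model default differs", default=30)
print(steps)  # <num_inference_steps: optional, default=30>

# Unknown template names fail loudly
InputParam.template("not_a_template")  # raises ValueError
```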
@@ -509,10 +732,12 @@ def format_params(params, header="Args", indent_level=4, max_line_length=115):
             desc = re.sub(r"\[(.*?)\]\((https?://[^\s\)]+)\)", r"[\1](\2)", param.description)
             wrapped_desc = wrap_text(desc, desc_indent, max_line_length)
             param_str += f"\n{desc_indent}{wrapped_desc}"
+        else:
+            param_str += f"\n{desc_indent}TODO: Add description."
 
         formatted_params.append(param_str)
 
-    return "\n\n".join(formatted_params)
+    return "\n".join(formatted_params)
 
 
 def format_input_params(input_params, indent_level=4, max_line_length=115):
@@ -582,7 +807,7 @@ def format_components(components, indent_level=4, max_line_length=115, add_empty_lines=True):
         loading_field_values = []
         for field_name in component.loading_fields():
             field_value = getattr(component, field_name)
-            if field_value is not None:
+            if field_value:
                 loading_field_values.append(f"{field_name}={field_value}")
 
         # Add loading field information if available
@@ -636,6 +861,30 @@ def format_configs(configs, indent_level=4, max_line_length=115, add_empty_lines=True):
     return "\n".join(formatted_configs)
 
 
+def format_workflow(workflow_map):
+    """Format a workflow map into a readable string representation.
+
+    Args:
+        workflow_map: Dictionary mapping workflow names to trigger inputs
+
+    Returns:
+        A formatted string representing all workflows
+    """
+    if workflow_map is None:
+        return ""
+
+    lines = ["Supported workflows:"]
+    for workflow_name, trigger_inputs in workflow_map.items():
+        required_inputs = [k for k, v in trigger_inputs.items() if v]
+        if required_inputs:
+            inputs_str = ", ".join(f"`{t}`" for t in required_inputs)
+            lines.append(f"  - `{workflow_name}`: requires {inputs_str}")
+        else:
+            lines.append(f"  - `{workflow_name}`: default (no additional inputs required)")
+
+    return "\n".join(lines)
+
+
 def make_doc_string(
     inputs,
     outputs,
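For a quick sense of the output shape, `format_workflow` applied to a small, hypothetical map:

```python
workflow_map = {
    "text2image": {},
    "img2img": {"image": True},
    "inpainting": {"image": True, "mask_image": True},
}
print(format_workflow(workflow_map))
# Supported workflows:
#   - `text2image`: default (no additional inputs required)
#   - `img2img`: requires `image`
#   - `inpainting`: requires `image`, `mask_image`
```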
@@ -669,17 +918,17 @@ def make_doc_string(
     # Add description
     if description:
         desc_lines = description.strip().split("\n")
-        aligned_desc = "\n".join("  " + line for line in desc_lines)
+        aligned_desc = "\n".join("  " + line.rstrip() for line in desc_lines)
         output += aligned_desc + "\n\n"
 
     # Add components section if provided
     if expected_components and len(expected_components) > 0:
-        components_str = format_components(expected_components, indent_level=2)
+        components_str = format_components(expected_components, indent_level=2, add_empty_lines=False)
         output += components_str + "\n\n"
 
     # Add configs section if provided
     if expected_configs and len(expected_configs) > 0:
-        configs_str = format_configs(expected_configs, indent_level=2)
+        configs_str = format_configs(expected_configs, indent_level=2, add_empty_lines=False)
         output += configs_str + "\n\n"
 
     # Add inputs section
@@ -690,3 +939,69 @@ def make_doc_string(
     output += format_output_params(outputs, indent_level=2)
 
     return output
+
+
+def combine_inputs(*named_input_lists: List[Tuple[str, List[InputParam]]]) -> List[InputParam]:
+    """
+    Combines multiple lists of InputParam objects from different blocks. For duplicate inputs, updates only if current
+    default value is None and new default value is not None. Warns if multiple non-None default values exist for the
+    same input.
+
+    Args:
+        named_input_lists: List of tuples containing (block_name, input_param_list) pairs
+
+    Returns:
+        List[InputParam]: Combined list of unique InputParam objects
+    """
+    combined_dict = {}  # name -> InputParam
+    value_sources = {}  # name -> block_name
+
+    for block_name, inputs in named_input_lists:
+        for input_param in inputs:
+            if input_param.name is None and input_param.kwargs_type is not None:
+                input_name = "*_" + input_param.kwargs_type
+            else:
+                input_name = input_param.name
+            if input_name in combined_dict:
+                current_param = combined_dict[input_name]
+                if (
+                    current_param.default is not None
+                    and input_param.default is not None
+                    and current_param.default != input_param.default
+                ):
+                    warnings.warn(
+                        f"Multiple different default values found for input '{input_name}': "
+                        f"{current_param.default} (from block '{value_sources[input_name]}') and "
+                        f"{input_param.default} (from block '{block_name}'). Using {current_param.default}."
+                    )
+                if current_param.default is None and input_param.default is not None:
+                    combined_dict[input_name] = input_param
+                    value_sources[input_name] = block_name
+            else:
+                combined_dict[input_name] = input_param
+                value_sources[input_name] = block_name
+
+    return list(combined_dict.values())
+
+
+def combine_outputs(*named_output_lists: List[Tuple[str, List[OutputParam]]]) -> List[OutputParam]:
+    """
+    Combines multiple lists of OutputParam objects from different blocks. For duplicate outputs, keeps the first
+    occurrence of each output name.
+
+    Args:
+        named_output_lists: List of tuples containing (block_name, output_param_list) pairs
+
+    Returns:
+        List[OutputParam]: Combined list of unique OutputParam objects
+    """
+    combined_dict = {}  # name -> OutputParam
+
+    for block_name, outputs in named_output_lists:
+        for output_param in outputs:
+            if (output_param.name not in combined_dict) or (
+                combined_dict[output_param.name].kwargs_type is None and output_param.kwargs_type is not None
+            ):
+                combined_dict[output_param.name] = output_param
+
+    return list(combined_dict.values())
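A small worked example of the duplicate-resolution rules in `combine_inputs` (the block names and defaults are illustrative):

```python
block_a = [InputParam("height"), InputParam("strength", default=0.6)]
block_b = [InputParam("height", default=1024), InputParam("strength", default=0.9)]

merged = combine_inputs(("block_a", block_a), ("block_b", block_b))
# height:   block_a had default=None, so block_b's 1024 wins
# strength: two different non-None defaults -> a warning is emitted and the
#           first-seen default (0.6 from block_a) is kept
```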
src/diffusers/modular_pipelines/qwenimage/…

@@ -118,7 +118,40 @@ def get_timesteps(scheduler, num_inference_steps, strength):
 # ====================
 
 
+# auto_docstring
 class QwenImagePrepareLatentsStep(ModularPipelineBlocks):
+    """
+    Prepare initial random noise for the generation process
+
+    Components:
+        pachifier (`QwenImagePachifier`)
+
+    Inputs:
+        latents (`Tensor`, *optional*):
+            Pre-generated noisy latents for image generation.
+        height (`int`, *optional*):
+            The height in pixels of the generated image.
+        width (`int`, *optional*):
+            The width in pixels of the generated image.
+        num_images_per_prompt (`int`, *optional*, defaults to 1):
+            The number of images to generate per prompt.
+        generator (`Generator`, *optional*):
+            Torch generator for deterministic generation.
+        batch_size (`int`, *optional*, defaults to 1):
+            Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can
+            be generated in input step.
+        dtype (`dtype`, *optional*, defaults to torch.float32):
+            The dtype of the model inputs, can be generated in input step.
+
+    Outputs:
+        height (`int`):
+            if not set, updated to default value
+        width (`int`):
+            if not set, updated to default value
+        latents (`Tensor`):
+            The initial latents to use for the denoising process
+    """
+
     model_name = "qwenimage"
 
     @property
@@ -134,28 +167,20 @@ class QwenImagePrepareLatentsStep(ModularPipelineBlocks):
     @property
     def inputs(self) -> List[InputParam]:
         return [
-            InputParam("latents"),
-            InputParam(name="height"),
-            InputParam(name="width"),
-            InputParam(name="num_images_per_prompt", default=1),
-            InputParam(name="generator"),
-            InputParam(
-                name="batch_size",
-                required=True,
-                type_hint=int,
-                description="Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can be generated in input step.",
-            ),
-            InputParam(
-                name="dtype",
-                required=True,
-                type_hint=torch.dtype,
-                description="The dtype of the model inputs, can be generated in input step.",
-            ),
+            InputParam.template("latents"),
+            InputParam.template("height"),
+            InputParam.template("width"),
+            InputParam.template("num_images_per_prompt"),
+            InputParam.template("generator"),
+            InputParam.template("batch_size"),
+            InputParam.template("dtype"),
         ]
 
     @property
     def intermediate_outputs(self) -> List[OutputParam]:
         return [
+            OutputParam(name="height", type_hint=int, description="if not set, updated to default value"),
+            OutputParam(name="width", type_hint=int, description="if not set, updated to default value"),
             OutputParam(
                 name="latents",
                 type_hint=torch.Tensor,
@@ -209,7 +234,42 @@ class QwenImagePrepareLatentsStep(ModularPipelineBlocks):
         return components, state


+# auto_docstring
 class QwenImageLayeredPrepareLatentsStep(ModularPipelineBlocks):
+    """
+    Prepare initial random noise (B, layers+1, C, H, W) for the generation process
+
+    Components:
+        pachifier (`QwenImageLayeredPachifier`)
+
+    Inputs:
+        latents (`Tensor`, *optional*):
+            Pre-generated noisy latents for image generation.
+        height (`int`, *optional*):
+            The height in pixels of the generated image.
+        width (`int`, *optional*):
+            The width in pixels of the generated image.
+        layers (`int`, *optional*, defaults to 4):
+            Number of layers to extract from the image
+        num_images_per_prompt (`int`, *optional*, defaults to 1):
+            The number of images to generate per prompt.
+        generator (`Generator`, *optional*):
+            Torch generator for deterministic generation.
+        batch_size (`int`, *optional*, defaults to 1):
+            Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt.
+            Can be generated in input step.
+        dtype (`dtype`, *optional*, defaults to torch.float32):
+            The dtype of the model inputs, can be generated in input step.
+
+    Outputs:
+        height (`int`):
+            if not set, updated to default value
+        width (`int`):
+            if not set, updated to default value
+        latents (`Tensor`):
+            The initial latents to use for the denoising process
+    """

     model_name = "qwenimage-layered"

     @property
@@ -225,29 +285,21 @@ class QwenImageLayeredPrepareLatentsStep(ModularPipelineBlocks):
     @property
     def inputs(self) -> List[InputParam]:
         return [
-            InputParam("latents"),
-            InputParam(name="height"),
-            InputParam(name="width"),
-            InputParam(name="layers", default=4),
-            InputParam(name="num_images_per_prompt", default=1),
-            InputParam(name="generator"),
-            InputParam(
-                name="batch_size",
-                required=True,
-                type_hint=int,
-                description="Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can be generated in input step.",
-            ),
-            InputParam(
-                name="dtype",
-                required=True,
-                type_hint=torch.dtype,
-                description="The dtype of the model inputs, can be generated in input step.",
-            ),
+            InputParam.template("latents"),
+            InputParam.template("height"),
+            InputParam.template("width"),
+            InputParam.template("layers"),
+            InputParam.template("num_images_per_prompt"),
+            InputParam.template("generator"),
+            InputParam.template("batch_size"),
+            InputParam.template("dtype"),
         ]

     @property
     def intermediate_outputs(self) -> List[OutputParam]:
         return [
+            OutputParam(name="height", type_hint=int, description="if not set, updated to default value"),
+            OutputParam(name="width", type_hint=int, description="if not set, updated to default value"),
             OutputParam(
                 name="latents",
                 type_hint=torch.Tensor,
@@ -301,7 +353,31 @@ class QwenImageLayeredPrepareLatentsStep(ModularPipelineBlocks):
         return components, state


+# auto_docstring
 class QwenImagePrepareLatentsWithStrengthStep(ModularPipelineBlocks):
+    """
+    Step that adds noise to image latents for image-to-image/inpainting. Should be run after set_timesteps and
+    prepare_latents. Both noise and image latents should already be patchified.
+
+    Components:
+        scheduler (`FlowMatchEulerDiscreteScheduler`)
+
+    Inputs:
+        latents (`Tensor`):
+            The initial random noise, can be generated in prepare latents step.
+        image_latents (`Tensor`):
+            image latents used to guide the image generation. Can be generated from vae_encoder step. (Can be
+            generated from vae encoder and updated in input step.)
+        timesteps (`Tensor`):
+            The timesteps to use for the denoising process. Can be generated in set_timesteps step.
+
+    Outputs:
+        initial_noise (`Tensor`):
+            The initial random noise used for inpainting denoising.
+        latents (`Tensor`):
+            The scaled noisy latents to use for inpainting/image-to-image denoising.
+    """

     model_name = "qwenimage"

     @property
@@ -323,12 +399,7 @@ class QwenImagePrepareLatentsWithStrengthStep(ModularPipelineBlocks):
                 type_hint=torch.Tensor,
                 description="The initial random noised, can be generated in prepare latent step.",
             ),
-            InputParam(
-                name="image_latents",
-                required=True,
-                type_hint=torch.Tensor,
-                description="The image latents to use for the denoising process. Can be generated in vae encoder and packed in input step.",
-            ),
+            InputParam.template("image_latents", note="Can be generated from vae encoder and updated in input step."),
             InputParam(
                 name="timesteps",
                 required=True,
@@ -345,6 +416,11 @@ class QwenImagePrepareLatentsWithStrengthStep(ModularPipelineBlocks):
                 type_hint=torch.Tensor,
                 description="The initial random noised used for inpainting denoising.",
             ),
+            OutputParam(
+                name="latents",
+                type_hint=torch.Tensor,
+                description="The scaled noisy latents to use for inpainting/image-to-image denoising.",
+            ),
         ]

     @staticmethod
@@ -382,7 +458,29 @@ class QwenImagePrepareLatentsWithStrengthStep(ModularPipelineBlocks):
         return components, state
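For orientation, "adds noise to image latents" in the flow-match setting is a sigma-weighted blend at the first kept timestep: sigma near 1 keeps mostly noise (high strength), sigma near 0 keeps mostly the image. A runnable sketch under that assumption, not the scheduler's exact code:

import torch

def add_noise_flow_match(image_latents: torch.Tensor, noise: torch.Tensor, sigma: float) -> torch.Tensor:
    # assumes FlowMatchEulerDiscreteScheduler-style interpolation between image and noise
    return sigma * noise + (1.0 - sigma) * image_latents

image_latents = torch.randn(1, 1024, 64)  # already patchified: (batch, sequence, channels)
noise = torch.randn_like(image_latents)
latents = add_noise_flow_match(image_latents, noise, sigma=0.9)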


+# auto_docstring
 class QwenImageCreateMaskLatentsStep(ModularPipelineBlocks):
+    """
+    Step that creates mask latents from preprocessed mask_image by interpolating to latent space.
+
+    Components:
+        pachifier (`QwenImagePachifier`)
+
+    Inputs:
+        processed_mask_image (`Tensor`):
+            The processed mask to use for the inpainting process.
+        height (`int`):
+            The height in pixels of the generated image.
+        width (`int`):
+            The width in pixels of the generated image.
+        dtype (`dtype`, *optional*, defaults to torch.float32):
+            The dtype of the model inputs, can be generated in input step.
+
+    Outputs:
+        mask (`Tensor`):
+            The mask to use for the inpainting process.
+    """

     model_name = "qwenimage"

     @property
@@ -404,9 +502,9 @@ class QwenImageCreateMaskLatentsStep(ModularPipelineBlocks):
                 type_hint=torch.Tensor,
                 description="The processed mask to use for the inpainting process.",
             ),
-            InputParam(name="height", required=True),
-            InputParam(name="width", required=True),
-            InputParam(name="dtype", required=True),
+            InputParam.template("height", required=True),
+            InputParam.template("width", required=True),
+            InputParam.template("dtype"),
         ]
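"Interpolating to latent space" here means the preprocessed pixel-space mask is resized to the latent grid before being packed next to the latents. A sketch with an assumed vae_scale_factor of 8:

import torch
import torch.nn.functional as F

def mask_to_latent_grid(processed_mask_image: torch.Tensor, height: int, width: int, vae_scale_factor: int = 8):
    # nearest-neighbor keeps the mask binary after resizing
    latent_h, latent_w = height // vae_scale_factor, width // vae_scale_factor
    return F.interpolate(processed_mask_image, size=(latent_h, latent_w), mode="nearest")

mask = mask_to_latent_grid(torch.ones(1, 1, 1024, 1024), height=1024, width=1024)  # -> (1, 1, 128, 128)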

     @property
@@ -450,7 +548,27 @@ class QwenImageCreateMaskLatentsStep(ModularPipelineBlocks):
 # ====================


+# auto_docstring
 class QwenImageSetTimestepsStep(ModularPipelineBlocks):
+    """
+    Step that sets the scheduler's timesteps for text-to-image generation. Should be run after prepare latents step.
+
+    Components:
+        scheduler (`FlowMatchEulerDiscreteScheduler`)
+
+    Inputs:
+        num_inference_steps (`int`, *optional*, defaults to 50):
+            The number of denoising steps.
+        sigmas (`List`, *optional*):
+            Custom sigmas for the denoising process.
+        latents (`Tensor`):
+            The initial random noised latents for the denoising process. Can be generated in prepare latents step.
+
+    Outputs:
+        timesteps (`Tensor`):
+            The timesteps to use for the denoising process
+    """

     model_name = "qwenimage"

     @property
@@ -466,13 +584,13 @@ class QwenImageSetTimestepsStep(ModularPipelineBlocks):
     @property
     def inputs(self) -> List[InputParam]:
         return [
-            InputParam(name="num_inference_steps", default=50),
-            InputParam(name="sigmas"),
+            InputParam.template("num_inference_steps"),
+            InputParam.template("sigmas"),
             InputParam(
                 name="latents",
                 required=True,
                 type_hint=torch.Tensor,
-                description="The latents to use for the denoising process, used to calculate the image sequence length.",
+                description="The initial random noised latents for the denoising process. Can be generated in prepare latents step.",
             ),
         ]
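The reworded `latents` description keeps the original rationale: the latents' sequence length feeds the dynamic-shift parameter (often called `mu`) handed to the scheduler. A sketch of that dependency, with illustrative interpolation constants:

import torch

def calculate_shift_sketch(image_seq_len, base_seq_len=256, max_seq_len=4096, base_shift=0.5, max_shift=1.15):
    # linear interpolation of the shift as a function of sequence length (constants are assumptions)
    m = (max_shift - base_shift) / (max_seq_len - base_seq_len)
    return image_seq_len * m + (base_shift - base_seq_len * m)

latents = torch.randn(1, 1024, 64)             # (batch, sequence, channels), patchified
mu = calculate_shift_sketch(latents.shape[1])  # then roughly: scheduler.set_timesteps(..., mu=mu)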

@@ -516,7 +634,27 @@ class QwenImageSetTimestepsStep(ModularPipelineBlocks):
         return components, state


+# auto_docstring
 class QwenImageLayeredSetTimestepsStep(ModularPipelineBlocks):
+    """
+    Set timesteps step for QwenImage Layered with custom mu calculation based on image_latents.
+
+    Components:
+        scheduler (`FlowMatchEulerDiscreteScheduler`)
+
+    Inputs:
+        num_inference_steps (`int`, *optional*, defaults to 50):
+            The number of denoising steps.
+        sigmas (`List`, *optional*):
+            Custom sigmas for the denoising process.
+        image_latents (`Tensor`):
+            image latents used to guide the image generation. Can be generated from vae_encoder step.
+
+    Outputs:
+        timesteps (`Tensor`):
+            The timesteps to use for the denoising process.
+    """

     model_name = "qwenimage-layered"

     @property
@@ -532,15 +670,17 @@ class QwenImageLayeredSetTimestepsStep(ModularPipelineBlocks):
     @property
     def inputs(self) -> List[InputParam]:
         return [
-            InputParam("num_inference_steps", default=50, type_hint=int),
-            InputParam("sigmas", type_hint=List[float]),
-            InputParam("image_latents", required=True, type_hint=torch.Tensor),
+            InputParam.template("num_inference_steps"),
+            InputParam.template("sigmas"),
+            InputParam.template("image_latents"),
         ]

     @property
     def intermediate_outputs(self) -> List[OutputParam]:
         return [
-            OutputParam(name="timesteps", type_hint=torch.Tensor),
+            OutputParam(
+                name="timesteps", type_hint=torch.Tensor, description="The timesteps to use for the denoising process."
+            ),
         ]

     @torch.no_grad()
@@ -574,7 +714,32 @@ class QwenImageLayeredSetTimestepsStep(ModularPipelineBlocks):
         return components, state


+# auto_docstring
 class QwenImageSetTimestepsWithStrengthStep(ModularPipelineBlocks):
+    """
+    Step that sets the scheduler's timesteps for image-to-image generation and inpainting. Should be run after the
+    prepare latents step.
+
+    Components:
+        scheduler (`FlowMatchEulerDiscreteScheduler`)
+
+    Inputs:
+        num_inference_steps (`int`, *optional*, defaults to 50):
+            The number of denoising steps.
+        sigmas (`List`, *optional*):
+            Custom sigmas for the denoising process.
+        latents (`Tensor`):
+            The latents to use for the denoising process. Can be generated in prepare latents step.
+        strength (`float`, *optional*, defaults to 0.9):
+            Strength for img2img/inpainting.
+
+    Outputs:
+        timesteps (`Tensor`):
+            The timesteps to use for the denoising process.
+        num_inference_steps (`int`):
+            The number of denoising steps to perform at inference time. Updated based on strength.
+    """

     model_name = "qwenimage"

     @property
@@ -590,15 +755,15 @@ class QwenImageSetTimestepsWithStrengthStep(ModularPipelineBlocks):
     @property
     def inputs(self) -> List[InputParam]:
         return [
-            InputParam(name="num_inference_steps", default=50),
-            InputParam(name="sigmas"),
+            InputParam.template("num_inference_steps"),
+            InputParam.template("sigmas"),
             InputParam(
-                name="latents",
+                "latents",
                 required=True,
                 type_hint=torch.Tensor,
-                description="The latents to use for the denoising process, used to calculate the image sequence length.",
+                description="The latents to use for the denoising process. Can be generated in prepare latents step.",
             ),
-            InputParam(name="strength", default=0.9),
+            InputParam.template("strength", default=0.9),
         ]
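The `num_inference_steps` output is "updated based on strength" by dropping the early part of the schedule, following the classic get_timesteps pattern named in the hunk header near the top of this file (illustrative standalone version):

def truncated_schedule(timesteps, num_inference_steps, strength):
    init_timestep = min(num_inference_steps * strength, num_inference_steps)
    t_start = int(max(num_inference_steps - init_timestep, 0))
    kept = timesteps[t_start:]
    return kept, len(kept)

# strength=0.9 over 50 steps skips the 5 noisiest steps and denoises for 45
timesteps, num_inference_steps = truncated_schedule(list(range(50)), 50, 0.9)
assert num_inference_steps == 45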

     @property
@@ -607,7 +772,12 @@ class QwenImageSetTimestepsWithStrengthStep(ModularPipelineBlocks):
             OutputParam(
                 name="timesteps",
                 type_hint=torch.Tensor,
-                description="The timesteps to use for the denoising process. Can be generated in set_timesteps step.",
+                description="The timesteps to use for the denoising process.",
+            ),
+            OutputParam(
+                name="num_inference_steps",
+                type_hint=int,
+                description="The number of denoising steps to perform at inference time. Updated based on strength.",
             ),
         ]

@@ -654,7 +824,29 @@ class QwenImageSetTimestepsWithStrengthStep(ModularPipelineBlocks):
 ## RoPE inputs for denoiser


+# auto_docstring
 class QwenImageRoPEInputsStep(ModularPipelineBlocks):
+    """
+    Step that prepares the RoPE inputs for the denoising process. Should be placed after the prepare_latents step.
+
+    Inputs:
+        batch_size (`int`, *optional*, defaults to 1):
+            Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt.
+            Can be generated in input step.
+        height (`int`):
+            The height in pixels of the generated image.
+        width (`int`):
+            The width in pixels of the generated image.
+        prompt_embeds_mask (`Tensor`):
+            mask for the text embeddings. Can be generated from text_encoder step.
+        negative_prompt_embeds_mask (`Tensor`, *optional*):
+            mask for the negative text embeddings. Can be generated from text_encoder step.
+
+    Outputs:
+        img_shapes (`List`):
+            The shapes of the image latents, used for RoPE calculation
+    """

     model_name = "qwenimage"

     @property
@@ -666,11 +858,11 @@ class QwenImageRoPEInputsStep(ModularPipelineBlocks):
     @property
     def inputs(self) -> List[InputParam]:
         return [
-            InputParam(name="batch_size", required=True),
-            InputParam(name="height", required=True),
-            InputParam(name="width", required=True),
-            InputParam(name="prompt_embeds_mask"),
-            InputParam(name="negative_prompt_embeds_mask"),
+            InputParam.template("batch_size"),
+            InputParam.template("height", required=True),
+            InputParam.template("width", required=True),
+            InputParam.template("prompt_embeds_mask"),
+            InputParam.template("negative_prompt_embeds_mask"),
         ]
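`img_shapes` is one (frames, latent_height, latent_width) triple per prompt; the latent grid divides the pixel size by the VAE scale factor and the 2x2 patch size. The combined factor of 16 below is an assumption for illustration:

def rope_img_shapes(batch_size: int, height: int, width: int, scale: int = 16):
    # scale = vae_scale_factor (assumed 8) * patch size (assumed 2)
    return [(1, height // scale, width // scale)] * batch_size

img_shapes = rope_img_shapes(batch_size=2, height=1024, width=1024)  # [(1, 64, 64), (1, 64, 64)]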

     @property
@@ -702,7 +894,34 @@ class QwenImageRoPEInputsStep(ModularPipelineBlocks):
         return components, state


+# auto_docstring
 class QwenImageEditRoPEInputsStep(ModularPipelineBlocks):
+    """
+    Step that prepares the RoPE inputs for the denoising process. This is used in QwenImage Edit. Should be placed
+    after the prepare_latents step.
+
+    Inputs:
+        batch_size (`int`, *optional*, defaults to 1):
+            Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt.
+            Can be generated in input step.
+        image_height (`int`):
+            The height of the reference image. Can be generated in input step.
+        image_width (`int`):
+            The width of the reference image. Can be generated in input step.
+        height (`int`):
+            The height in pixels of the generated image.
+        width (`int`):
+            The width in pixels of the generated image.
+        prompt_embeds_mask (`Tensor`):
+            mask for the text embeddings. Can be generated from text_encoder step.
+        negative_prompt_embeds_mask (`Tensor`, *optional*):
+            mask for the negative text embeddings. Can be generated from text_encoder step.
+
+    Outputs:
+        img_shapes (`List`):
+            The shapes of the image latents, used for RoPE calculation
+    """

     model_name = "qwenimage"

     @property
@@ -712,13 +931,23 @@ class QwenImageEditRoPEInputsStep(ModularPipelineBlocks):
     @property
     def inputs(self) -> List[InputParam]:
         return [
-            InputParam(name="batch_size", required=True),
-            InputParam(name="image_height", required=True),
-            InputParam(name="image_width", required=True),
-            InputParam(name="height", required=True),
-            InputParam(name="width", required=True),
-            InputParam(name="prompt_embeds_mask"),
-            InputParam(name="negative_prompt_embeds_mask"),
+            InputParam.template("batch_size"),
+            InputParam(
+                name="image_height",
+                required=True,
+                type_hint=int,
+                description="The height of the reference image. Can be generated in input step.",
+            ),
+            InputParam(
+                name="image_width",
+                required=True,
+                type_hint=int,
+                description="The width of the reference image. Can be generated in input step.",
+            ),
+            InputParam.template("height", required=True),
+            InputParam.template("width", required=True),
+            InputParam.template("prompt_embeds_mask"),
+            InputParam.template("negative_prompt_embeds_mask"),
         ]

     @property
@@ -756,7 +985,39 @@ class QwenImageEditRoPEInputsStep(ModularPipelineBlocks):
         return components, state


+# auto_docstring
 class QwenImageEditPlusRoPEInputsStep(ModularPipelineBlocks):
+    """
+    Step that prepares the RoPE inputs for the denoising process. This is used in QwenImage Edit Plus. Unlike Edit,
+    Edit Plus handles lists of image_height/image_width for multiple reference images. Should be placed after the
+    prepare_latents step.
+
+    Inputs:
+        batch_size (`int`, *optional*, defaults to 1):
+            Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt.
+            Can be generated in input step.
+        image_height (`List`):
+            The heights of the reference images. Can be generated in input step.
+        image_width (`List`):
+            The widths of the reference images. Can be generated in input step.
+        height (`int`):
+            The height in pixels of the generated image.
+        width (`int`):
+            The width in pixels of the generated image.
+        prompt_embeds_mask (`Tensor`):
+            mask for the text embeddings. Can be generated from text_encoder step.
+        negative_prompt_embeds_mask (`Tensor`, *optional*):
+            mask for the negative text embeddings. Can be generated from text_encoder step.
+
+    Outputs:
+        img_shapes (`List`):
+            The shapes of the image latents, used for RoPE calculation
+        txt_seq_lens (`List`):
+            The sequence lengths of the prompt embeds, used for RoPE calculation
+        negative_txt_seq_lens (`List`):
+            The sequence lengths of the negative prompt embeds, used for RoPE calculation
+    """

     model_name = "qwenimage-edit-plus"

     @property
@@ -770,13 +1031,23 @@ class QwenImageEditPlusRoPEInputsStep(ModularPipelineBlocks):
     @property
     def inputs(self) -> List[InputParam]:
         return [
-            InputParam(name="batch_size", required=True),
-            InputParam(name="image_height", required=True, type_hint=List[int]),
-            InputParam(name="image_width", required=True, type_hint=List[int]),
-            InputParam(name="height", required=True),
-            InputParam(name="width", required=True),
-            InputParam(name="prompt_embeds_mask"),
-            InputParam(name="negative_prompt_embeds_mask"),
+            InputParam.template("batch_size"),
+            InputParam(
+                name="image_height",
+                required=True,
+                type_hint=List[int],
+                description="The heights of the reference images. Can be generated in input step.",
+            ),
+            InputParam(
+                name="image_width",
+                required=True,
+                type_hint=List[int],
+                description="The widths of the reference images. Can be generated in input step.",
+            ),
+            InputParam.template("height", required=True),
+            InputParam.template("width", required=True),
+            InputParam.template("prompt_embeds_mask"),
+            InputParam.template("negative_prompt_embeds_mask"),
         ]

     @property
@@ -832,7 +1103,37 @@ class QwenImageEditPlusRoPEInputsStep(ModularPipelineBlocks):
         return components, state


+# auto_docstring
 class QwenImageLayeredRoPEInputsStep(ModularPipelineBlocks):
+    """
+    Step that prepares the RoPE inputs for the denoising process. Should be placed after the prepare_latents step.
+
+    Inputs:
+        batch_size (`int`, *optional*, defaults to 1):
+            Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt.
+            Can be generated in input step.
+        layers (`int`, *optional*, defaults to 4):
+            Number of layers to extract from the image
+        height (`int`):
+            The height in pixels of the generated image.
+        width (`int`):
+            The width in pixels of the generated image.
+        prompt_embeds_mask (`Tensor`):
+            mask for the text embeddings. Can be generated from text_encoder step.
+        negative_prompt_embeds_mask (`Tensor`, *optional*):
+            mask for the negative text embeddings. Can be generated from text_encoder step.
+
+    Outputs:
+        img_shapes (`List`):
+            The shapes of the image latents, used for RoPE calculation
+        txt_seq_lens (`List`):
+            The sequence lengths of the prompt embeds, used for RoPE calculation
+        negative_txt_seq_lens (`List`):
+            The sequence lengths of the negative prompt embeds, used for RoPE calculation
+        additional_t_cond (`Tensor`):
+            The additional t cond, used for RoPE calculation
+    """

     model_name = "qwenimage-layered"

     @property
@@ -844,12 +1145,12 @@ class QwenImageLayeredRoPEInputsStep(ModularPipelineBlocks):
     @property
     def inputs(self) -> List[InputParam]:
         return [
-            InputParam(name="batch_size", required=True),
-            InputParam(name="layers", required=True),
-            InputParam(name="height", required=True),
-            InputParam(name="width", required=True),
-            InputParam(name="prompt_embeds_mask"),
-            InputParam(name="negative_prompt_embeds_mask"),
+            InputParam.template("batch_size"),
+            InputParam.template("layers"),
+            InputParam.template("height", required=True),
+            InputParam.template("width", required=True),
+            InputParam.template("prompt_embeds_mask"),
+            InputParam.template("negative_prompt_embeds_mask"),
         ]

     @property
@@ -914,7 +1215,34 @@ class QwenImageLayeredRoPEInputsStep(ModularPipelineBlocks):


 ## ControlNet inputs for denoiser


+# auto_docstring
 class QwenImageControlNetBeforeDenoiserStep(ModularPipelineBlocks):
+    """
+    Step that prepares inputs for controlnet. Insert before the Denoise Step, after the set_timesteps step.
+
+    Components:
+        controlnet (`QwenImageControlNetModel`)
+
+    Inputs:
+        control_guidance_start (`float`, *optional*, defaults to 0.0):
+            When to start applying ControlNet.
+        control_guidance_end (`float`, *optional*, defaults to 1.0):
+            When to stop applying ControlNet.
+        controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0):
+            Scale for ControlNet conditioning.
+        control_image_latents (`Tensor`):
+            The control image latents to use for the denoising process. Can be generated in controlnet vae encoder
+            step.
+        timesteps (`Tensor`):
+            The timesteps to use for the denoising process. Can be generated in set_timesteps step.
+
+    Outputs:
+        controlnet_keep (`List`):
+            The controlnet keep values
+    """

     model_name = "qwenimage"

     @property
@@ -930,12 +1258,17 @@ class QwenImageControlNetBeforeDenoiserStep(ModularPipelineBlocks):
     @property
     def inputs(self) -> List[InputParam]:
         return [
-            InputParam("control_guidance_start", default=0.0),
-            InputParam("control_guidance_end", default=1.0),
-            InputParam("controlnet_conditioning_scale", default=1.0),
-            InputParam("control_image_latents", required=True),
+            InputParam.template("control_guidance_start"),
+            InputParam.template("control_guidance_end"),
+            InputParam.template("controlnet_conditioning_scale"),
             InputParam(
-                "timesteps",
+                name="control_image_latents",
+                required=True,
+                type_hint=torch.Tensor,
+                description="The control image latents to use for the denoising process. Can be generated in controlnet vae encoder step.",
+            ),
+            InputParam(
+                name="timesteps",
                 required=True,
                 type_hint=torch.Tensor,
                 description="The timesteps to use for the denoising process. Can be generated in set_timesteps step.",
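The `controlnet_keep` output gates ControlNet per step from the start/end window, in the style of the conventional diffusers rule (shown here as an illustrative standalone function, not this block's exact code):

def compute_controlnet_keep(num_timesteps: int, start: float, end: float):
    keeps = []
    for i in range(num_timesteps):
        # 0.0 outside the [start, end] fraction of the schedule, 1.0 inside
        outside = (i / num_timesteps) < start or ((i + 1) / num_timesteps) > end
        keeps.append(0.0 if outside else 1.0)
    return keeps

controlnet_keep = compute_controlnet_keep(10, start=0.0, end=0.5)  # 1.0 for the first half only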

@@ -12,10 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-from typing import List, Union
+from typing import Any, Dict, List

-import numpy as np
-import PIL
 import torch

 from ...configuration_utils import FrozenDict
@@ -31,7 +29,30 @@ logger = logging.get_logger(__name__)


 # after denoising loop (unpack latents)


+# auto_docstring
 class QwenImageAfterDenoiseStep(ModularPipelineBlocks):
+    """
+    Step that unpacks the latents from a 3D tensor (batch_size, sequence_length, channels) into a 5D tensor
+    (batch_size, channels, 1, height, width)
+
+    Components:
+        pachifier (`QwenImagePachifier`)
+
+    Inputs:
+        height (`int`):
+            The height in pixels of the generated image.
+        width (`int`):
+            The width in pixels of the generated image.
+        latents (`Tensor`):
+            The latents to decode, can be generated in the denoise step.
+
+    Outputs:
+        latents (`Tensor`):
+            The denoised latents unpacked to B, C, 1, H, W
+    """

     model_name = "qwenimage"

     @property
@@ -49,13 +70,21 @@ class QwenImageAfterDenoiseStep(ModularPipelineBlocks):
     @property
     def inputs(self) -> List[InputParam]:
         return [
-            InputParam(name="height", required=True),
-            InputParam(name="width", required=True),
+            InputParam.template("height", required=True),
+            InputParam.template("width", required=True),
             InputParam(
                 name="latents",
                 required=True,
                 type_hint=torch.Tensor,
-                description="The latents to decode, can be generated in the denoise step",
+                description="The latents to decode, can be generated in the denoise step.",
+            ),
+        ]
+
+    @property
+    def intermediate_outputs(self) -> List[OutputParam]:
+        return [
+            OutputParam(
+                name="latents", type_hint=torch.Tensor, description="The denoised latents unpacked to B, C, 1, H, W"
             ),
         ]
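The 3D-to-5D unpack advertised in the docstring reverses 2x2 latent patchification. A runnable sketch assuming 16 latent channels and a VAE scale factor of 8, so each sequence token covers a 2x2 latent patch (dimension choices are illustrative, not the pachifier's exact code):

import torch

def unpack_latents(latents: torch.Tensor, height: int, width: int, vae_scale_factor: int = 8, channels: int = 16):
    b = latents.shape[0]
    h, w = height // vae_scale_factor // 2, width // vae_scale_factor // 2
    latents = latents.view(b, h, w, channels, 2, 2)        # (B, seq, C*4) -> patch grid
    latents = latents.permute(0, 3, 1, 4, 2, 5)            # channels first, interleave the 2x2 patches
    return latents.reshape(b, channels, 1, h * 2, w * 2)   # (B, C, 1, H/8, W/8)

out = unpack_latents(torch.randn(1, 64 * 64, 64), height=1024, width=1024)  # -> (1, 16, 1, 128, 128)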

@@ -72,7 +101,29 @@ class QwenImageAfterDenoiseStep(ModularPipelineBlocks):
         return components, state


+# auto_docstring
 class QwenImageLayeredAfterDenoiseStep(ModularPipelineBlocks):
+    """
+    Unpack latents from (B, seq, C*4) to (B, C, layers+1, H, W) after denoising.
+
+    Components:
+        pachifier (`QwenImageLayeredPachifier`)
+
+    Inputs:
+        latents (`Tensor`):
+            The denoised latents to decode, can be generated in the denoise step.
+        height (`int`):
+            The height in pixels of the generated image.
+        width (`int`):
+            The width in pixels of the generated image.
+        layers (`int`, *optional*, defaults to 4):
+            Number of layers to extract from the image
+
+    Outputs:
+        latents (`Tensor`):
+            Denoised latents. (unpacked to B, C, layers+1, H, W)
+    """

     model_name = "qwenimage-layered"

     @property
@@ -88,10 +139,21 @@ class QwenImageLayeredAfterDenoiseStep(ModularPipelineBlocks):
     @property
     def inputs(self) -> List[InputParam]:
         return [
-            InputParam("latents", required=True, type_hint=torch.Tensor),
-            InputParam("height", required=True, type_hint=int),
-            InputParam("width", required=True, type_hint=int),
-            InputParam("layers", required=True, type_hint=int),
+            InputParam(
+                name="latents",
+                required=True,
+                type_hint=torch.Tensor,
+                description="The denoised latents to decode, can be generated in the denoise step.",
+            ),
+            InputParam.template("height", required=True),
+            InputParam.template("width", required=True),
+            InputParam.template("layers"),
+        ]
+
+    @property
+    def intermediate_outputs(self) -> List[OutputParam]:
+        return [
+            OutputParam.template("latents", note="unpacked to B, C, layers+1, H, W"),
         ]

     @torch.no_grad()
@@ -112,7 +174,26 @@ class QwenImageLayeredAfterDenoiseStep(ModularPipelineBlocks):


 # decode step


+# auto_docstring
 class QwenImageDecoderStep(ModularPipelineBlocks):
+    """
+    Step that decodes the latents to images
+
+    Components:
+        vae (`AutoencoderKLQwenImage`)
+
+    Inputs:
+        latents (`Tensor`):
+            The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise
+            step.
+
+    Outputs:
+        images (`List`):
+            Generated images. (tensor output of the vae decoder.)
+    """

     model_name = "qwenimage"

     @property
@@ -134,19 +215,13 @@ class QwenImageDecoderStep(ModularPipelineBlocks):
name="latents",
|
name="latents",
|
||||||
required=True,
|
required=True,
|
||||||
type_hint=torch.Tensor,
|
type_hint=torch.Tensor,
|
||||||
description="The latents to decode, can be generated in the denoise step",
|
description="The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise step.",
|
||||||
),
|
),
|
||||||
]
|
]
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def intermediate_outputs(self) -> List[str]:
|
def intermediate_outputs(self) -> List[OutputParam]:
|
||||||
return [
|
return [OutputParam.template("images", note="tensor output of the vae decoder.")]
|
||||||
OutputParam(
|
|
||||||
"images",
|
|
||||||
type_hint=Union[List[PIL.Image.Image], List[torch.Tensor], List[np.array]],
|
|
||||||
description="The generated images, can be a PIL.Image.Image, torch.Tensor or a numpy array",
|
|
||||||
)
|
|
||||||
]
|
|
||||||

     @torch.no_grad()
     def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState:
@@ -176,7 +251,26 @@ class QwenImageDecoderStep(ModularPipelineBlocks):
         return components, state


+# auto_docstring
 class QwenImageLayeredDecoderStep(ModularPipelineBlocks):
+    """
+    Decode unpacked latents (B, C, layers+1, H, W) into layer images.
+
+    Components:
+        vae (`AutoencoderKLQwenImage`)
+        image_processor (`VaeImageProcessor`)
+
+    Inputs:
+        latents (`Tensor`):
+            The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise
+            step.
+        output_type (`str`, *optional*, defaults to pil):
+            Output format: 'pil', 'np', 'pt'.
+
+    Outputs:
+        images (`List`):
+            Generated images.
+    """

     model_name = "qwenimage-layered"

     @property
@@ -198,14 +292,19 @@ class QwenImageLayeredDecoderStep(ModularPipelineBlocks):
     @property
     def inputs(self) -> List[InputParam]:
         return [
-            InputParam("latents", required=True, type_hint=torch.Tensor),
-            InputParam("output_type", default="pil", type_hint=str),
+            InputParam(
+                name="latents",
+                required=True,
+                type_hint=torch.Tensor,
+                description="The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise step.",
+            ),
+            InputParam.template("output_type"),
         ]

     @property
     def intermediate_outputs(self) -> List[OutputParam]:
         return [
-            OutputParam(name="images", type_hint=List[List[PIL.Image.Image]]),
+            OutputParam.template("images"),
         ]

     @torch.no_grad()
@@ -251,7 +350,27 @@ class QwenImageLayeredDecoderStep(ModularPipelineBlocks):


 # postprocess the decoded images


+# auto_docstring
 class QwenImageProcessImagesOutputStep(ModularPipelineBlocks):
+    """
+    Postprocess the generated images.
+
+    Components:
+        image_processor (`VaeImageProcessor`)
+
+    Inputs:
+        images (`Tensor`):
+            the generated image tensor from decoders step
+        output_type (`str`, *optional*, defaults to pil):
+            Output format: 'pil', 'np', 'pt'.
+
+    Outputs:
+        images (`List`):
+            Generated images.
+    """

     model_name = "qwenimage"

     @property
@@ -272,15 +391,19 @@ class QwenImageProcessImagesOutputStep(ModularPipelineBlocks):
     @property
     def inputs(self) -> List[InputParam]:
         return [
-            InputParam("images", required=True, description="the generated image from decoders step"),
             InputParam(
-                name="output_type",
-                default="pil",
-                type_hint=str,
-                description="The type of the output images, can be 'pil', 'np', 'pt'",
+                name="images",
+                required=True,
+                type_hint=torch.Tensor,
+                description="the generated image tensor from decoders step",
             ),
+            InputParam.template("output_type"),
         ]
+
+    @property
+    def intermediate_outputs(self) -> List[OutputParam]:
+        return [OutputParam.template("images")]

     @staticmethod
     def check_inputs(output_type):
         if output_type not in ["pil", "np", "pt"]:
@@ -301,7 +424,28 @@ class QwenImageProcessImagesOutputStep(ModularPipelineBlocks):
         return components, state


+# auto_docstring
 class QwenImageInpaintProcessImagesOutputStep(ModularPipelineBlocks):
+    """
+    Postprocess the generated images and optionally apply the mask overlay to the original image.
+
+    Components:
+        image_mask_processor (`InpaintProcessor`)
+
+    Inputs:
+        images (`Tensor`):
+            the generated image tensor from decoders step
+        output_type (`str`, *optional*, defaults to pil):
+            Output format: 'pil', 'np', 'pt'.
+        mask_overlay_kwargs (`Dict`, *optional*):
+            The kwargs for the postprocess step to apply the mask overlay. Generated in
+            InpaintProcessImagesInputStep.
+
+    Outputs:
+        images (`List`):
+            Generated images.
+    """

     model_name = "qwenimage"

     @property
@@ -322,16 +466,24 @@ class QwenImageInpaintProcessImagesOutputStep(ModularPipelineBlocks):
     @property
     def inputs(self) -> List[InputParam]:
         return [
-            InputParam("images", required=True, description="the generated image from decoders step"),
             InputParam(
-                name="output_type",
-                default="pil",
-                type_hint=str,
-                description="The type of the output images, can be 'pil', 'np', 'pt'",
+                name="images",
+                required=True,
+                type_hint=torch.Tensor,
+                description="the generated image tensor from decoders step",
+            ),
+            InputParam.template("output_type"),
+            InputParam(
+                name="mask_overlay_kwargs",
+                type_hint=Dict[str, Any],
+                description="The kwargs for the postprocess step to apply the mask overlay. Generated in InpaintProcessImagesInputStep.",
             ),
-            InputParam("mask_overlay_kwargs"),
         ]
+
+    @property
+    def intermediate_outputs(self) -> List[OutputParam]:
+        return [OutputParam.template("images")]

     @staticmethod
     def check_inputs(output_type, mask_overlay_kwargs):
         if output_type not in ["pil", "np", "pt"]:
@@ -50,7 +50,7 @@ class QwenImageLoopBeforeDenoiser(ModularPipelineBlocks):
     def inputs(self) -> List[InputParam]:
         return [
             InputParam(
-                "latents",
+                name="latents",
                 required=True,
                 type_hint=torch.Tensor,
                 description="The initial latents to use for the denoising process. Can be generated in prepare_latent step.",
@@ -80,17 +80,12 @@ class QwenImageEditLoopBeforeDenoiser(ModularPipelineBlocks):
     def inputs(self) -> List[InputParam]:
         return [
             InputParam(
-                "latents",
+                name="latents",
                 required=True,
                 type_hint=torch.Tensor,
                 description="The initial latents to use for the denoising process. Can be generated in prepare_latent step.",
             ),
-            InputParam(
-                "image_latents",
-                required=True,
-                type_hint=torch.Tensor,
-                description="The initial image latents to use for the denoising process. Can be encoded in vae_encoder step and packed in prepare_image_latents step.",
-            ),
+            InputParam.template("image_latents"),
         ]

     @torch.no_grad()
@@ -134,29 +129,12 @@ class QwenImageLoopBeforeDenoiserControlNet(ModularPipelineBlocks):
                 type_hint=torch.Tensor,
                 description="The control image to use for the denoising process. Can be generated in prepare_controlnet_inputs step.",
             ),
+            InputParam.template("controlnet_conditioning_scale", note="updated in prepare_controlnet_inputs step."),
             InputParam(
-                "controlnet_conditioning_scale",
-                type_hint=float,
-                description="The controlnet conditioning scale value to use for the denoising process. Can be generated in prepare_controlnet_inputs step.",
-            ),
-            InputParam(
-                "controlnet_keep",
+                name="controlnet_keep",
                 required=True,
                 type_hint=List[float],
-                description="The controlnet keep values to use for the denoising process. Can be generated in prepare_controlnet_inputs step.",
-            ),
-            InputParam(
-                "num_inference_steps",
-                required=True,
-                type_hint=int,
-                description="The number of inference steps to use for the denoising process. Can be generated in set_timesteps step.",
-            ),
-            InputParam(
-                kwargs_type="denoiser_input_fields",
-                description=(
-                    "All conditional model inputs for the denoiser. "
-                    "It should contain prompt_embeds/negative_prompt_embeds."
-                ),
+                description="The controlnet keep values. Can be generated in prepare_controlnet_inputs step.",
             ),
         ]

@@ -217,28 +195,13 @@ class QwenImageLoopDenoiser(ModularPipelineBlocks):
     @property
     def inputs(self) -> List[InputParam]:
         return [
-            InputParam("attention_kwargs"),
-            InputParam(
-                "latents",
-                required=True,
-                type_hint=torch.Tensor,
-                description="The latents to use for the denoising process. Can be generated in prepare_latents step.",
-            ),
-            InputParam(
-                "num_inference_steps",
-                required=True,
-                type_hint=int,
-                description="The number of inference steps to use for the denoising process. Can be generated in set_timesteps step.",
-            ),
-            InputParam(
-                kwargs_type="denoiser_input_fields",
-                description="conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.",
-            ),
+            InputParam.template("attention_kwargs"),
+            InputParam.template("denoiser_input_fields"),
             InputParam(
                 "img_shapes",
                 required=True,
                 type_hint=List[Tuple[int, int]],
-                description="The shape of the image latents for RoPE calculation. Can be generated in prepare_additional_inputs step.",
+                description="The shape of the image latents for RoPE calculation. can be generated in prepare_additional_inputs step.",
             ),
         ]

@@ -317,23 +280,8 @@ class QwenImageEditLoopDenoiser(ModularPipelineBlocks):
     @property
     def inputs(self) -> List[InputParam]:
         return [
-            InputParam("attention_kwargs"),
-            InputParam(
-                "latents",
-                required=True,
-                type_hint=torch.Tensor,
-                description="The latents to use for the denoising process. Can be generated in prepare_latents step.",
-            ),
-            InputParam(
-                "num_inference_steps",
-                required=True,
-                type_hint=int,
-                description="The number of inference steps to use for the denoising process. Can be generated in set_timesteps step.",
-            ),
-            InputParam(
-                kwargs_type="denoiser_input_fields",
-                description="conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.",
-            ),
+            InputParam.template("attention_kwargs"),
+            InputParam.template("denoiser_input_fields"),
             InputParam(
                 "img_shapes",
                 required=True,
@@ -415,7 +363,7 @@ class QwenImageLoopAfterDenoiser(ModularPipelineBlocks):
     @property
     def intermediate_outputs(self) -> List[OutputParam]:
         return [
-            OutputParam("latents", type_hint=torch.Tensor, description="The denoised latents."),
+            OutputParam.template("latents"),
         ]

     @torch.no_grad()
@@ -456,24 +404,19 @@ class QwenImageLoopAfterDenoiserInpaint(ModularPipelineBlocks):
                 type_hint=torch.Tensor,
                 description="The mask to use for the inpainting process. Can be generated in inpaint prepare latents step.",
             ),
-            InputParam(
-                "image_latents",
-                required=True,
-                type_hint=torch.Tensor,
-                description="The image latents to use for the inpainting process. Can be generated in inpaint prepare latents step.",
-            ),
+            InputParam.template("image_latents"),
             InputParam(
                 "initial_noise",
                 required=True,
                 type_hint=torch.Tensor,
                 description="The initial noise to use for the inpainting process. Can be generated in inpaint prepare latents step.",
             ),
-            InputParam(
-                "timesteps",
-                required=True,
-                type_hint=torch.Tensor,
-                description="The timesteps to use for the denoising process. Can be generated in set_timesteps step.",
-            ),
+        ]
+
+    @property
+    def intermediate_outputs(self) -> List[OutputParam]:
+        return [
+            OutputParam.template("latents"),
         ]

     @torch.no_grad()
@@ -515,17 +458,12 @@ class QwenImageDenoiseLoopWrapper(LoopSequentialPipelineBlocks):
     def loop_inputs(self) -> List[InputParam]:
         return [
             InputParam(
-                "timesteps",
+                name="timesteps",
                 required=True,
                 type_hint=torch.Tensor,
                 description="The timesteps to use for the denoising process. Can be generated in set_timesteps step.",
             ),
-            InputParam(
-                "num_inference_steps",
-                required=True,
-                type_hint=int,
-                description="The number of inference steps to use for the denoising process. Can be generated in set_timesteps step.",
-            ),
+            InputParam.template("num_inference_steps", required=True),
         ]

     @torch.no_grad()
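`loop_inputs` describes the contract of the shared loop: the wrapper iterates the timesteps and calls each sub-block in order at every step. Illustrative structure only, not the exact LoopSequentialPipelineBlocks code:

def run_denoise_loop(timesteps, num_inference_steps, sub_blocks, components, state):
    for i, t in enumerate(timesteps[:num_inference_steps]):
        for block in sub_blocks:  # before-denoiser -> denoiser -> after-denoiser, in declared order
            components, state = block(components, state, i=i, t=t)
    return components, state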
@@ -557,7 +495,42 @@ class QwenImageDenoiseLoopWrapper(LoopSequentialPipelineBlocks):


 # Qwen Image (text2image, image2image)


+# auto_docstring
 class QwenImageDenoiseStep(QwenImageDenoiseLoopWrapper):
+    """
+    Denoise step that iteratively denoises the latents.
+    Its loop logic is defined in the `QwenImageDenoiseLoopWrapper.__call__` method. At each iteration, it runs the
+    blocks defined in `sub_blocks` sequentially:
+        - `QwenImageLoopBeforeDenoiser`
+        - `QwenImageLoopDenoiser`
+        - `QwenImageLoopAfterDenoiser`
+    This block supports text2image and image2image tasks for QwenImage.
+
+    Components:
+        guider (`ClassifierFreeGuidance`)
+        transformer (`QwenImageTransformer2DModel`)
+        scheduler (`FlowMatchEulerDiscreteScheduler`)
+
+    Inputs:
+        timesteps (`Tensor`):
+            The timesteps to use for the denoising process. Can be generated in set_timesteps step.
+        num_inference_steps (`int`):
+            The number of denoising steps.
+        latents (`Tensor`):
+            The initial latents to use for the denoising process. Can be generated in prepare_latent step.
+        attention_kwargs (`Dict`, *optional*):
+            Additional kwargs for attention processors.
+        **denoiser_input_fields (`None`, *optional*):
+            conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
+        img_shapes (`List`):
+            The shape of the image latents for RoPE calculation. Can be generated in prepare_additional_inputs step.
+
+    Outputs:
+        latents (`Tensor`):
+            Denoised latents.
+    """

     model_name = "qwenimage"

     block_classes = [
@@ -570,8 +543,8 @@ class QwenImageDenoiseStep(QwenImageDenoiseLoopWrapper):
     @property
     def description(self) -> str:
         return (
-            "Denoise step that iteratively denoise the latents. \n"
-            "Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method \n"
+            "Denoise step that iteratively denoise the latents.\n"
+            "Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method\n"
             "At each iteration, it runs blocks defined in `sub_blocks` sequencially:\n"
             " - `QwenImageLoopBeforeDenoiser`\n"
             " - `QwenImageLoopDenoiser`\n"
@@ -581,7 +554,47 @@ class QwenImageDenoiseStep(QwenImageDenoiseLoopWrapper):


 # Qwen Image (inpainting)
+# auto_docstring
 class QwenImageInpaintDenoiseStep(QwenImageDenoiseLoopWrapper):
|
"""
|
||||||
|
Denoise step that iteratively denoise the latents.
|
||||||
|
Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method At each iteration, it runs blocks
|
||||||
|
defined in `sub_blocks` sequencially:
|
||||||
|
- `QwenImageLoopBeforeDenoiser`
|
||||||
|
- `QwenImageLoopDenoiser`
|
||||||
|
- `QwenImageLoopAfterDenoiser`
|
||||||
|
- `QwenImageLoopAfterDenoiserInpaint`
|
||||||
|
This block supports inpainting tasks for QwenImage.
|
||||||
|
|
||||||
|
Components:
|
||||||
|
guider (`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`) scheduler
|
||||||
|
(`FlowMatchEulerDiscreteScheduler`)
|
||||||
|
|
||||||
|
Inputs:
|
||||||
|
timesteps (`Tensor`):
|
||||||
|
The timesteps to use for the denoising process. Can be generated in set_timesteps step.
|
||||||
|
num_inference_steps (`int`):
|
||||||
|
The number of denoising steps.
|
||||||
|
latents (`Tensor`):
|
||||||
|
The initial latents to use for the denoising process. Can be generated in prepare_latent step.
|
||||||
|
attention_kwargs (`Dict`, *optional*):
|
||||||
|
Additional kwargs for attention processors.
|
||||||
|
**denoiser_input_fields (`None`, *optional*):
|
||||||
|
conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
|
||||||
|
img_shapes (`List`):
|
||||||
|
The shape of the image latents for RoPE calculation. can be generated in prepare_additional_inputs step.
|
||||||
|
mask (`Tensor`):
|
||||||
|
The mask to use for the inpainting process. Can be generated in inpaint prepare latents step.
|
||||||
|
image_latents (`Tensor`):
|
||||||
|
image latents used to guide the image generation. Can be generated from vae_encoder step.
|
||||||
|
initial_noise (`Tensor`):
|
||||||
|
The initial noise to use for the inpainting process. Can be generated in inpaint prepare latents step.
|
||||||
|
|
||||||
|
Outputs:
|
||||||
|
latents (`Tensor`):
|
||||||
|
Denoised latents.
|
||||||
|
"""
|
||||||
|
|
||||||
model_name = "qwenimage"
|
model_name = "qwenimage"
|
||||||
block_classes = [
|
block_classes = [
|
||||||
QwenImageLoopBeforeDenoiser,
|
QwenImageLoopBeforeDenoiser,
|
||||||
@@ -606,7 +619,47 @@ class QwenImageInpaintDenoiseStep(QwenImageDenoiseLoopWrapper):


# Qwen Image (text2image, image2image) with controlnet
+ # auto_docstring
class QwenImageControlNetDenoiseStep(QwenImageDenoiseLoopWrapper):
+ """
+ Denoise step that iteratively denoise the latents.
+ Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method At each iteration, it runs blocks
+ defined in `sub_blocks` sequencially:
+ - `QwenImageLoopBeforeDenoiser`
+ - `QwenImageLoopBeforeDenoiserControlNet`
+ - `QwenImageLoopDenoiser`
+ - `QwenImageLoopAfterDenoiser`
+ This block supports text2img/img2img tasks with controlnet for QwenImage.
+
+ Components:
+ guider (`ClassifierFreeGuidance`) controlnet (`QwenImageControlNetModel`) transformer
+ (`QwenImageTransformer2DModel`) scheduler (`FlowMatchEulerDiscreteScheduler`)
+
+ Inputs:
+ timesteps (`Tensor`):
+ The timesteps to use for the denoising process. Can be generated in set_timesteps step.
+ num_inference_steps (`int`):
+ The number of denoising steps.
+ latents (`Tensor`):
+ The initial latents to use for the denoising process. Can be generated in prepare_latent step.
+ control_image_latents (`Tensor`):
+ The control image to use for the denoising process. Can be generated in prepare_controlnet_inputs step.
+ controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0):
+ Scale for ControlNet conditioning. (updated in prepare_controlnet_inputs step.)
+ controlnet_keep (`List`):
+ The controlnet keep values. Can be generated in prepare_controlnet_inputs step.
+ attention_kwargs (`Dict`, *optional*):
+ Additional kwargs for attention processors.
+ **denoiser_input_fields (`None`, *optional*):
+ conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
+ img_shapes (`List`):
+ The shape of the image latents for RoPE calculation. can be generated in prepare_additional_inputs step.
+
+ Outputs:
+ latents (`Tensor`):
+ Denoised latents.
+ """

model_name = "qwenimage"
block_classes = [
QwenImageLoopBeforeDenoiser,
@@ -631,7 +684,54 @@ class QwenImageControlNetDenoiseStep(QwenImageDenoiseLoopWrapper):


# Qwen Image (inpainting) with controlnet
+ # auto_docstring
class QwenImageInpaintControlNetDenoiseStep(QwenImageDenoiseLoopWrapper):
+ """
+ Denoise step that iteratively denoise the latents.
+ Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method At each iteration, it runs blocks
+ defined in `sub_blocks` sequencially:
+ - `QwenImageLoopBeforeDenoiser`
+ - `QwenImageLoopBeforeDenoiserControlNet`
+ - `QwenImageLoopDenoiser`
+ - `QwenImageLoopAfterDenoiser`
+ - `QwenImageLoopAfterDenoiserInpaint`
+ This block supports inpainting tasks with controlnet for QwenImage.
+
+ Components:
+ guider (`ClassifierFreeGuidance`) controlnet (`QwenImageControlNetModel`) transformer
+ (`QwenImageTransformer2DModel`) scheduler (`FlowMatchEulerDiscreteScheduler`)
+
+ Inputs:
+ timesteps (`Tensor`):
+ The timesteps to use for the denoising process. Can be generated in set_timesteps step.
+ num_inference_steps (`int`):
+ The number of denoising steps.
+ latents (`Tensor`):
+ The initial latents to use for the denoising process. Can be generated in prepare_latent step.
+ control_image_latents (`Tensor`):
+ The control image to use for the denoising process. Can be generated in prepare_controlnet_inputs step.
+ controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0):
+ Scale for ControlNet conditioning. (updated in prepare_controlnet_inputs step.)
+ controlnet_keep (`List`):
+ The controlnet keep values. Can be generated in prepare_controlnet_inputs step.
+ attention_kwargs (`Dict`, *optional*):
+ Additional kwargs for attention processors.
+ **denoiser_input_fields (`None`, *optional*):
+ conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
+ img_shapes (`List`):
+ The shape of the image latents for RoPE calculation. can be generated in prepare_additional_inputs step.
+ mask (`Tensor`):
+ The mask to use for the inpainting process. Can be generated in inpaint prepare latents step.
+ image_latents (`Tensor`):
+ image latents used to guide the image generation. Can be generated from vae_encoder step.
+ initial_noise (`Tensor`):
+ The initial noise to use for the inpainting process. Can be generated in inpaint prepare latents step.
+
+ Outputs:
+ latents (`Tensor`):
+ Denoised latents.
+ """

model_name = "qwenimage"
block_classes = [
QwenImageLoopBeforeDenoiser,
@@ -664,7 +764,42 @@ class QwenImageInpaintControlNetDenoiseStep(QwenImageDenoiseLoopWrapper):


# Qwen Image Edit (image2image)
+ # auto_docstring
class QwenImageEditDenoiseStep(QwenImageDenoiseLoopWrapper):
+ """
+ Denoise step that iteratively denoise the latents.
+ Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method At each iteration, it runs blocks
+ defined in `sub_blocks` sequencially:
+ - `QwenImageEditLoopBeforeDenoiser`
+ - `QwenImageEditLoopDenoiser`
+ - `QwenImageLoopAfterDenoiser`
+ This block supports QwenImage Edit.
+
+ Components:
+ guider (`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`) scheduler
+ (`FlowMatchEulerDiscreteScheduler`)
+
+ Inputs:
+ timesteps (`Tensor`):
+ The timesteps to use for the denoising process. Can be generated in set_timesteps step.
+ num_inference_steps (`int`):
+ The number of denoising steps.
+ latents (`Tensor`):
+ The initial latents to use for the denoising process. Can be generated in prepare_latent step.
+ image_latents (`Tensor`):
+ image latents used to guide the image generation. Can be generated from vae_encoder step.
+ attention_kwargs (`Dict`, *optional*):
+ Additional kwargs for attention processors.
+ **denoiser_input_fields (`None`, *optional*):
+ conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
+ img_shapes (`List`):
+ The shape of the image latents for RoPE calculation. Can be generated in prepare_additional_inputs step.
+
+ Outputs:
+ latents (`Tensor`):
+ Denoised latents.
+ """

model_name = "qwenimage-edit"
block_classes = [
QwenImageEditLoopBeforeDenoiser,
@@ -687,7 +822,47 @@ class QwenImageEditDenoiseStep(QwenImageDenoiseLoopWrapper):


# Qwen Image Edit (inpainting)
+ # auto_docstring
class QwenImageEditInpaintDenoiseStep(QwenImageDenoiseLoopWrapper):
+ """
+ Denoise step that iteratively denoise the latents.
+ Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method At each iteration, it runs blocks
+ defined in `sub_blocks` sequencially:
+ - `QwenImageEditLoopBeforeDenoiser`
+ - `QwenImageEditLoopDenoiser`
+ - `QwenImageLoopAfterDenoiser`
+ - `QwenImageLoopAfterDenoiserInpaint`
+ This block supports inpainting tasks for QwenImage Edit.
+
+ Components:
+ guider (`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`) scheduler
+ (`FlowMatchEulerDiscreteScheduler`)
+
+ Inputs:
+ timesteps (`Tensor`):
+ The timesteps to use for the denoising process. Can be generated in set_timesteps step.
+ num_inference_steps (`int`):
+ The number of denoising steps.
+ latents (`Tensor`):
+ The initial latents to use for the denoising process. Can be generated in prepare_latent step.
+ image_latents (`Tensor`):
+ image latents used to guide the image generation. Can be generated from vae_encoder step.
+ attention_kwargs (`Dict`, *optional*):
+ Additional kwargs for attention processors.
+ **denoiser_input_fields (`None`, *optional*):
+ conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
+ img_shapes (`List`):
+ The shape of the image latents for RoPE calculation. Can be generated in prepare_additional_inputs step.
+ mask (`Tensor`):
+ The mask to use for the inpainting process. Can be generated in inpaint prepare latents step.
+ initial_noise (`Tensor`):
+ The initial noise to use for the inpainting process. Can be generated in inpaint prepare latents step.
+
+ Outputs:
+ latents (`Tensor`):
+ Denoised latents.
+ """

model_name = "qwenimage-edit"
block_classes = [
QwenImageEditLoopBeforeDenoiser,
@@ -712,7 +887,42 @@ class QwenImageEditInpaintDenoiseStep(QwenImageDenoiseLoopWrapper):


# Qwen Image Layered (image2image)
+ # auto_docstring
class QwenImageLayeredDenoiseStep(QwenImageDenoiseLoopWrapper):
+ """
+ Denoise step that iteratively denoise the latents.
+ Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method At each iteration, it runs blocks
+ defined in `sub_blocks` sequencially:
+ - `QwenImageEditLoopBeforeDenoiser`
+ - `QwenImageEditLoopDenoiser`
+ - `QwenImageLoopAfterDenoiser`
+ This block supports QwenImage Layered.
+
+ Components:
+ guider (`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`) scheduler
+ (`FlowMatchEulerDiscreteScheduler`)
+
+ Inputs:
+ timesteps (`Tensor`):
+ The timesteps to use for the denoising process. Can be generated in set_timesteps step.
+ num_inference_steps (`int`):
+ The number of denoising steps.
+ latents (`Tensor`):
+ The initial latents to use for the denoising process. Can be generated in prepare_latent step.
+ image_latents (`Tensor`):
+ image latents used to guide the image generation. Can be generated from vae_encoder step.
+ attention_kwargs (`Dict`, *optional*):
+ Additional kwargs for attention processors.
+ **denoiser_input_fields (`None`, *optional*):
+ conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
+ img_shapes (`List`):
+ The shape of the image latents for RoPE calculation. Can be generated in prepare_additional_inputs step.
+
+ Outputs:
+ latents (`Tensor`):
+ Denoised latents.
+ """

model_name = "qwenimage-layered"
block_classes = [
QwenImageEditLoopBeforeDenoiser,
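Every block in this file now carries a `# auto_docstring` marker, and its class docstring reprints the block's declared components, inputs, and outputs in a fixed layout. A minimal sketch of how a generator could assemble the `Inputs:` section from declared params, reusing the illustrative `Param` dataclass from the sketch above (the repository's actual generator may format differently):

from typing import List

def format_inputs_section(input_params: List[Param]) -> str:
    # Render each declared param the way the generated docstrings above read:
    # name, backticked type, optionality/default qualifiers, then description.
    lines = ["Inputs:"]
    for p in input_params:
        type_name = getattr(p.type_hint, "__name__", str(p.type_hint))
        qualifier = "" if p.required else ", *optional*"
        if not p.required and p.default is not None:
            qualifier += f", defaults to {p.default}"
        lines.append(f"    {p.name} (`{type_name}`{qualifier}):")
        lines.append(f"        {p.description}")
    return "\n".join(lines)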
File diff suppressed because it is too large
@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

- from typing import List, Tuple
+ from typing import List, Optional, Tuple

import torch

@@ -109,7 +109,44 @@ def calculate_dimension_from_latents(latents: torch.Tensor, vae_scale_factor: in
return height, width


+ # auto_docstring
class QwenImageTextInputsStep(ModularPipelineBlocks):
+ """
+ Text input processing step that standardizes text embeddings for the pipeline.
+ This step:
+ 1. Determines `batch_size` and `dtype` based on `prompt_embeds`
+ 2. Ensures all text embeddings have consistent batch sizes (batch_size * num_images_per_prompt)
+
+ This block should be placed after all encoder steps to process the text embeddings before they are used in
+ subsequent pipeline steps.
+
+ Inputs:
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
+ The number of images to generate per prompt.
+ prompt_embeds (`Tensor`):
+ text embeddings used to guide the image generation. Can be generated from text_encoder step.
+ prompt_embeds_mask (`Tensor`):
+ mask for the text embeddings. Can be generated from text_encoder step.
+ negative_prompt_embeds (`Tensor`, *optional*):
+ negative text embeddings used to guide the image generation. Can be generated from text_encoder step.
+ negative_prompt_embeds_mask (`Tensor`, *optional*):
+ mask for the negative text embeddings. Can be generated from text_encoder step.
+
+ Outputs:
+ batch_size (`int`):
+ The batch size of the prompt embeddings
+ dtype (`dtype`):
+ The data type of the prompt embeddings
+ prompt_embeds (`Tensor`):
+ The prompt embeddings. (batch-expanded)
+ prompt_embeds_mask (`Tensor`):
+ The encoder attention mask. (batch-expanded)
+ negative_prompt_embeds (`Tensor`):
+ The negative prompt embeddings. (batch-expanded)
+ negative_prompt_embeds_mask (`Tensor`):
+ The negative prompt embeddings mask. (batch-expanded)
+ """

model_name = "qwenimage"

@property
@@ -129,26 +166,22 @@ class QwenImageTextInputsStep(ModularPipelineBlocks):
@property
def inputs(self) -> List[InputParam]:
return [
- InputParam(name="num_images_per_prompt", default=1),
- InputParam(name="prompt_embeds", required=True, kwargs_type="denoiser_input_fields"),
- InputParam(name="prompt_embeds_mask", required=True, kwargs_type="denoiser_input_fields"),
- InputParam(name="negative_prompt_embeds", kwargs_type="denoiser_input_fields"),
- InputParam(name="negative_prompt_embeds_mask", kwargs_type="denoiser_input_fields"),
+ InputParam.template("num_images_per_prompt"),
+ InputParam.template("prompt_embeds"),
+ InputParam.template("prompt_embeds_mask"),
+ InputParam.template("negative_prompt_embeds"),
+ InputParam.template("negative_prompt_embeds_mask"),
]

@property
- def intermediate_outputs(self) -> List[str]:
+ def intermediate_outputs(self) -> List[OutputParam]:
return [
- OutputParam(
- "batch_size",
- type_hint=int,
- description="Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt",
- ),
- OutputParam(
- "dtype",
- type_hint=torch.dtype,
- description="Data type of model tensor inputs (determined by `prompt_embeds`)",
- ),
+ OutputParam(name="batch_size", type_hint=int, description="The batch size of the prompt embeddings"),
+ OutputParam(name="dtype", type_hint=torch.dtype, description="The data type of the prompt embeddings"),
+ OutputParam.template("prompt_embeds", note="batch-expanded"),
+ OutputParam.template("prompt_embeds_mask", note="batch-expanded"),
+ OutputParam.template("negative_prompt_embeds", note="batch-expanded"),
+ OutputParam.template("negative_prompt_embeds_mask", note="batch-expanded"),
]

@staticmethod
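The `note="batch-expanded"` arguments above line up with the "(batch-expanded)" suffixes in the generated docstring, so the kwarg plausibly appends a qualifier to the canonical description. A hedged sketch of that behaviour (the `_OUTPUT_TEMPLATES` registry and `OutParam` type are assumptions, not the real API):

import copy
from dataclasses import dataclass

@dataclass
class OutParam:
    name: str
    description: str = ""

# Assumed registry of canonical output definitions.
_OUTPUT_TEMPLATES = {
    "prompt_embeds": OutParam("prompt_embeds", "The prompt embeddings."),
}

def output_template(name: str, note: str = "") -> OutParam:
    # Copy the canonical definition, then tack the note onto the description,
    # so a generated doc reads "The prompt embeddings. (batch-expanded)".
    param = copy.deepcopy(_OUTPUT_TEMPLATES[name])
    if note:
        param.description = f"{param.description} ({note})"
    return param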
@@ -221,20 +254,76 @@ class QwenImageTextInputsStep(ModularPipelineBlocks):
return components, state


+ # auto_docstring
class QwenImageAdditionalInputsStep(ModularPipelineBlocks):
- """Input step for QwenImage: update height/width, expand batch, patchify."""
+ """
+ Input processing step that:
+ 1. For image latent inputs: Updates height/width if None, patchifies, and expands batch size
+ 2. For additional batch inputs: Expands batch dimensions to match final batch size
+
+ Configured inputs:
+ - Image latent inputs: ['image_latents']
+
+ This block should be placed after the encoder steps and the text input step.
+
+ Components:
+ pachifier (`QwenImagePachifier`)
+
+ Inputs:
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
+ The number of images to generate per prompt.
+ batch_size (`int`, *optional*, defaults to 1):
+ Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can
+ be generated in input step.
+ height (`int`, *optional*):
+ The height in pixels of the generated image.
+ width (`int`, *optional*):
+ The width in pixels of the generated image.
+ image_latents (`Tensor`):
+ image latents used to guide the image generation. Can be generated from vae_encoder step.
+
+ Outputs:
+ image_height (`int`):
+ The image height calculated from the image latents dimension
+ image_width (`int`):
+ The image width calculated from the image latents dimension
+ height (`int`):
+ if not provided, updated to image height
+ width (`int`):
+ if not provided, updated to image width
+ image_latents (`Tensor`):
+ image latents used to guide the image generation. Can be generated from vae_encoder step. (patchified and
+ batch-expanded)
+ """

model_name = "qwenimage"

def __init__(
self,
- image_latent_inputs: List[str] = ["image_latents"],
- additional_batch_inputs: List[str] = [],
+ image_latent_inputs: Optional[List[InputParam]] = None,
+ additional_batch_inputs: Optional[List[InputParam]] = None,
):
+ # by default, process `image_latents`
+ if image_latent_inputs is None:
+ image_latent_inputs = [InputParam.template("image_latents")]
+ if additional_batch_inputs is None:
+ additional_batch_inputs = []
+
if not isinstance(image_latent_inputs, list):
- image_latent_inputs = [image_latent_inputs]
+ raise ValueError(f"image_latent_inputs must be a list, but got {type(image_latent_inputs)}")
+ else:
+ for input_param in image_latent_inputs:
+ if not isinstance(input_param, InputParam):
+ raise ValueError(f"image_latent_inputs must be a list of InputParam, but got {type(input_param)}")

if not isinstance(additional_batch_inputs, list):
- additional_batch_inputs = [additional_batch_inputs]
+ raise ValueError(f"additional_batch_inputs must be a list, but got {type(additional_batch_inputs)}")
+ else:
+ for input_param in additional_batch_inputs:
+ if not isinstance(input_param, InputParam):
+ raise ValueError(
+ f"additional_batch_inputs must be a list of InputParam, but got {type(input_param)}"
+ )

self._image_latent_inputs = image_latent_inputs
self._additional_batch_inputs = additional_batch_inputs
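Under the new `__init__` signature, callers configure the block with `InputParam` objects instead of bare strings, and anything else is rejected rather than silently wrapped. Roughly (a usage sketch; the import paths are indicative guesses based on the modules this diff touches):

from diffusers.modular_pipelines.modular_pipeline_utils import InputParam
from diffusers.modular_pipelines.qwenimage.inputs import QwenImageAdditionalInputsStep

# Default behaviour: process `image_latents`.
step = QwenImageAdditionalInputsStep()

# Explicit configuration now takes InputParam objects:
step = QwenImageAdditionalInputsStep(
    image_latent_inputs=[InputParam.template("image_latents")],
)

# The old string form is rejected by the element check:
# QwenImageAdditionalInputsStep(image_latent_inputs=["image_latents"])
#   -> ValueError: image_latent_inputs must be a list of InputParam, ...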
@@ -252,9 +341,9 @@ class QwenImageAdditionalInputsStep(ModularPipelineBlocks):
if self._image_latent_inputs or self._additional_batch_inputs:
inputs_info = "\n\nConfigured inputs:"
if self._image_latent_inputs:
- inputs_info += f"\n - Image latent inputs: {self._image_latent_inputs}"
+ inputs_info += f"\n - Image latent inputs: {[p.name for p in self._image_latent_inputs]}"
if self._additional_batch_inputs:
- inputs_info += f"\n - Additional batch inputs: {self._additional_batch_inputs}"
+ inputs_info += f"\n - Additional batch inputs: {[p.name for p in self._additional_batch_inputs]}"

placement_section = "\n\nThis block should be placed after the encoder steps and the text input step."

@@ -269,23 +358,19 @@ class QwenImageAdditionalInputsStep(ModularPipelineBlocks):
@property
def inputs(self) -> List[InputParam]:
inputs = [
- InputParam(name="num_images_per_prompt", default=1),
- InputParam(name="batch_size", required=True),
- InputParam(name="height"),
- InputParam(name="width"),
+ InputParam.template("num_images_per_prompt"),
+ InputParam.template("batch_size"),
+ InputParam.template("height"),
+ InputParam.template("width"),
]
- for image_latent_input_name in self._image_latent_inputs:
- inputs.append(InputParam(name=image_latent_input_name))
-
- for input_name in self._additional_batch_inputs:
- inputs.append(InputParam(name=input_name))
+ # default is `image_latents`
+ inputs += self._image_latent_inputs + self._additional_batch_inputs

return inputs

@property
def intermediate_outputs(self) -> List[OutputParam]:
- return [
+ outputs = [
OutputParam(
name="image_height",
type_hint=int,
@@ -298,11 +383,43 @@ class QwenImageAdditionalInputsStep(ModularPipelineBlocks):
),
]

+ # `height`/`width` are not new outputs, but they will be updated if any image latent inputs are provided
+ if len(self._image_latent_inputs) > 0:
+ outputs.append(
+ OutputParam(name="height", type_hint=int, description="if not provided, updated to image height")
+ )
+ outputs.append(
+ OutputParam(name="width", type_hint=int, description="if not provided, updated to image width")
+ )
+
+ # image latent inputs are modified in place (patchified and batch-expanded)
+ for input_param in self._image_latent_inputs:
+ outputs.append(
+ OutputParam(
+ name=input_param.name,
+ type_hint=input_param.type_hint,
+ description=input_param.description + " (patchified and batch-expanded)",
+ )
+ )
+
+ # additional batch inputs (batch-expanded only)
+ for input_param in self._additional_batch_inputs:
+ outputs.append(
+ OutputParam(
+ name=input_param.name,
+ type_hint=input_param.type_hint,
+ description=input_param.description + " (batch-expanded)",
+ )
+ )
+
+ return outputs

def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState:
block_state = self.get_block_state(state)

# Process image latent inputs
- for image_latent_input_name in self._image_latent_inputs:
+ for input_param in self._image_latent_inputs:
+ image_latent_input_name = input_param.name
image_latent_tensor = getattr(block_state, image_latent_input_name)
if image_latent_tensor is None:
continue
@@ -331,7 +448,8 @@ class QwenImageAdditionalInputsStep(ModularPipelineBlocks):
setattr(block_state, image_latent_input_name, image_latent_tensor)

# Process additional batch inputs (only batch expansion)
- for input_name in self._additional_batch_inputs:
+ for input_param in self._additional_batch_inputs:
+ input_name = input_param.name
input_tensor = getattr(block_state, input_name)
if input_tensor is None:
continue
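Throughout these blocks, "batch-expanded" means a tensor's leading dimension is grown to the final model batch, batch_size * num_images_per_prompt. The arithmetic in isolation (whether the real code uses `repeat_interleave` or `repeat` is an implementation detail):

import torch

batch_size, num_images_per_prompt = 2, 3
image_latents = torch.randn(batch_size, 16, 64, 64)

# Each of the 2 prompts is repeated 3 times -> final batch of 6.
expanded = image_latents.repeat_interleave(num_images_per_prompt, dim=0)
assert expanded.shape[0] == batch_size * num_images_per_prompt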
@@ -349,20 +467,76 @@ class QwenImageAdditionalInputsStep(ModularPipelineBlocks):
return components, state


+ # auto_docstring
class QwenImageEditPlusAdditionalInputsStep(ModularPipelineBlocks):
- """Input step for QwenImage Edit Plus: handles list of latents with different sizes."""
+ """
+ Input processing step for Edit Plus that:
+ 1. For image latent inputs (list): Collects heights/widths, patchifies each, concatenates, expands batch
+ 2. For additional batch inputs: Expands batch dimensions to match final batch size
+ Height/width defaults to last image in the list.
+
+ Configured inputs:
+ - Image latent inputs: ['image_latents']
+
+ This block should be placed after the encoder steps and the text input step.
+
+ Components:
+ pachifier (`QwenImagePachifier`)
+
+ Inputs:
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
+ The number of images to generate per prompt.
+ batch_size (`int`, *optional*, defaults to 1):
+ Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can
+ be generated in input step.
+ height (`int`, *optional*):
+ The height in pixels of the generated image.
+ width (`int`, *optional*):
+ The width in pixels of the generated image.
+ image_latents (`Tensor`):
+ image latents used to guide the image generation. Can be generated from vae_encoder step.
+
+ Outputs:
+ image_height (`List`):
+ The image heights calculated from the image latents dimension
+ image_width (`List`):
+ The image widths calculated from the image latents dimension
+ height (`int`):
+ if not provided, updated to image height
+ width (`int`):
+ if not provided, updated to image width
+ image_latents (`Tensor`):
+ image latents used to guide the image generation. Can be generated from vae_encoder step. (patchified,
+ concatenated, and batch-expanded)
+ """

model_name = "qwenimage-edit-plus"

def __init__(
self,
- image_latent_inputs: List[str] = ["image_latents"],
- additional_batch_inputs: List[str] = [],
+ image_latent_inputs: Optional[List[InputParam]] = None,
+ additional_batch_inputs: Optional[List[InputParam]] = None,
):
+ if image_latent_inputs is None:
+ image_latent_inputs = [InputParam.template("image_latents")]
+ if additional_batch_inputs is None:
+ additional_batch_inputs = []
+
if not isinstance(image_latent_inputs, list):
- image_latent_inputs = [image_latent_inputs]
+ raise ValueError(f"image_latent_inputs must be a list, but got {type(image_latent_inputs)}")
+ else:
+ for input_param in image_latent_inputs:
+ if not isinstance(input_param, InputParam):
+ raise ValueError(f"image_latent_inputs must be a list of InputParam, but got {type(input_param)}")

if not isinstance(additional_batch_inputs, list):
- additional_batch_inputs = [additional_batch_inputs]
+ raise ValueError(f"additional_batch_inputs must be a list, but got {type(additional_batch_inputs)}")
+ else:
+ for input_param in additional_batch_inputs:
+ if not isinstance(input_param, InputParam):
+ raise ValueError(
+ f"additional_batch_inputs must be a list of InputParam, but got {type(input_param)}"
+ )

self._image_latent_inputs = image_latent_inputs
self._additional_batch_inputs = additional_batch_inputs
@@ -381,9 +555,9 @@ class QwenImageEditPlusAdditionalInputsStep(ModularPipelineBlocks):
if self._image_latent_inputs or self._additional_batch_inputs:
inputs_info = "\n\nConfigured inputs:"
if self._image_latent_inputs:
- inputs_info += f"\n - Image latent inputs: {self._image_latent_inputs}"
+ inputs_info += f"\n - Image latent inputs: {[p.name for p in self._image_latent_inputs]}"
if self._additional_batch_inputs:
- inputs_info += f"\n - Additional batch inputs: {self._additional_batch_inputs}"
+ inputs_info += f"\n - Additional batch inputs: {[p.name for p in self._additional_batch_inputs]}"

placement_section = "\n\nThis block should be placed after the encoder steps and the text input step."

@@ -398,23 +572,20 @@ class QwenImageEditPlusAdditionalInputsStep(ModularPipelineBlocks):
@property
def inputs(self) -> List[InputParam]:
inputs = [
- InputParam(name="num_images_per_prompt", default=1),
- InputParam(name="batch_size", required=True),
- InputParam(name="height"),
- InputParam(name="width"),
+ InputParam.template("num_images_per_prompt"),
+ InputParam.template("batch_size"),
+ InputParam.template("height"),
+ InputParam.template("width"),
]

- for image_latent_input_name in self._image_latent_inputs:
- inputs.append(InputParam(name=image_latent_input_name))
-
- for input_name in self._additional_batch_inputs:
- inputs.append(InputParam(name=input_name))
+ # default is `image_latents`
+ inputs += self._image_latent_inputs + self._additional_batch_inputs

return inputs

@property
def intermediate_outputs(self) -> List[OutputParam]:
- return [
+ outputs = [
OutputParam(
name="image_height",
type_hint=List[int],
@@ -427,11 +598,43 @@ class QwenImageEditPlusAdditionalInputsStep(ModularPipelineBlocks):
),
]

+ # `height`/`width` are updated if any image latent inputs are provided
+ if len(self._image_latent_inputs) > 0:
+ outputs.append(
+ OutputParam(name="height", type_hint=int, description="if not provided, updated to image height")
+ )
+ outputs.append(
+ OutputParam(name="width", type_hint=int, description="if not provided, updated to image width")
+ )
+
+ # image latent inputs are modified in place (patchified, concatenated, and batch-expanded)
+ for input_param in self._image_latent_inputs:
+ outputs.append(
+ OutputParam(
+ name=input_param.name,
+ type_hint=input_param.type_hint,
+ description=input_param.description + " (patchified, concatenated, and batch-expanded)",
+ )
+ )
+
+ # additional batch inputs (batch-expanded only)
+ for input_param in self._additional_batch_inputs:
+ outputs.append(
+ OutputParam(
+ name=input_param.name,
+ type_hint=input_param.type_hint,
+ description=input_param.description + " (batch-expanded)",
+ )
+ )
+
+ return outputs

def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState:
block_state = self.get_block_state(state)

# Process image latent inputs
- for image_latent_input_name in self._image_latent_inputs:
+ for input_param in self._image_latent_inputs:
+ image_latent_input_name = input_param.name
image_latent_tensor = getattr(block_state, image_latent_input_name)
if image_latent_tensor is None:
continue
@@ -476,7 +679,8 @@ class QwenImageEditPlusAdditionalInputsStep(ModularPipelineBlocks):
setattr(block_state, image_latent_input_name, packed_image_latent_tensors)

# Process additional batch inputs (only batch expansion)
- for input_name in self._additional_batch_inputs:
+ for input_param in self._additional_batch_inputs:
+ input_name = input_param.name
input_tensor = getattr(block_state, input_name)
if input_tensor is None:
continue
@@ -494,22 +698,75 @@ class QwenImageEditPlusAdditionalInputsStep(ModularPipelineBlocks):
return components, state


- # YiYi TODO: support define config default component from the ModularPipeline level.
- # it is same as QwenImageAdditionalInputsStep, but with layered pachifier.
+ # same as QwenImageAdditionalInputsStep, but with layered pachifier.


+ # auto_docstring
class QwenImageLayeredAdditionalInputsStep(ModularPipelineBlocks):
- """Input step for QwenImage Layered: update height/width, expand batch, patchify with layered pachifier."""
+ """
+ Input processing step for Layered that:
+ 1. For image latent inputs: Updates height/width if None, patchifies with layered pachifier, and expands batch
+ size
+ 2. For additional batch inputs: Expands batch dimensions to match final batch size
+
+ Configured inputs:
+ - Image latent inputs: ['image_latents']
+
+ This block should be placed after the encoder steps and the text input step.
+
+ Components:
+ pachifier (`QwenImageLayeredPachifier`)
+
+ Inputs:
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
+ The number of images to generate per prompt.
+ batch_size (`int`, *optional*, defaults to 1):
+ Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can
+ be generated in input step.
+ image_latents (`Tensor`):
+ image latents used to guide the image generation. Can be generated from vae_encoder step.
+
+ Outputs:
+ image_height (`int`):
+ The image height calculated from the image latents dimension
+ image_width (`int`):
+ The image width calculated from the image latents dimension
+ height (`int`):
+ if not provided, updated to image height
+ width (`int`):
+ if not provided, updated to image width
+ image_latents (`Tensor`):
+ image latents used to guide the image generation. Can be generated from vae_encoder step. (patchified
+ with layered pachifier and batch-expanded)
+ """

model_name = "qwenimage-layered"

def __init__(
self,
- image_latent_inputs: List[str] = ["image_latents"],
- additional_batch_inputs: List[str] = [],
+ image_latent_inputs: Optional[List[InputParam]] = None,
+ additional_batch_inputs: Optional[List[InputParam]] = None,
):
+ if image_latent_inputs is None:
+ image_latent_inputs = [InputParam.template("image_latents")]
+ if additional_batch_inputs is None:
+ additional_batch_inputs = []
+
if not isinstance(image_latent_inputs, list):
- image_latent_inputs = [image_latent_inputs]
+ raise ValueError(f"image_latent_inputs must be a list, but got {type(image_latent_inputs)}")
+ else:
+ for input_param in image_latent_inputs:
+ if not isinstance(input_param, InputParam):
+ raise ValueError(f"image_latent_inputs must be a list of InputParam, but got {type(input_param)}")

if not isinstance(additional_batch_inputs, list):
- additional_batch_inputs = [additional_batch_inputs]
+ raise ValueError(f"additional_batch_inputs must be a list, but got {type(additional_batch_inputs)}")
+ else:
+ for input_param in additional_batch_inputs:
+ if not isinstance(input_param, InputParam):
+ raise ValueError(
+ f"additional_batch_inputs must be a list of InputParam, but got {type(input_param)}"
+ )

self._image_latent_inputs = image_latent_inputs
self._additional_batch_inputs = additional_batch_inputs
@@ -527,9 +784,9 @@ class QwenImageLayeredAdditionalInputsStep(ModularPipelineBlocks):
if self._image_latent_inputs or self._additional_batch_inputs:
inputs_info = "\n\nConfigured inputs:"
if self._image_latent_inputs:
- inputs_info += f"\n - Image latent inputs: {self._image_latent_inputs}"
+ inputs_info += f"\n - Image latent inputs: {[p.name for p in self._image_latent_inputs]}"
if self._additional_batch_inputs:
- inputs_info += f"\n - Additional batch inputs: {self._additional_batch_inputs}"
+ inputs_info += f"\n - Additional batch inputs: {[p.name for p in self._additional_batch_inputs]}"

placement_section = "\n\nThis block should be placed after the encoder steps and the text input step."

@@ -544,21 +801,18 @@ class QwenImageLayeredAdditionalInputsStep(ModularPipelineBlocks):
@property
def inputs(self) -> List[InputParam]:
inputs = [
- InputParam(name="num_images_per_prompt", default=1),
- InputParam(name="batch_size", required=True),
+ InputParam.template("num_images_per_prompt"),
+ InputParam.template("batch_size"),
]
+ # default is `image_latents`

- for image_latent_input_name in self._image_latent_inputs:
- inputs.append(InputParam(name=image_latent_input_name))
-
- for input_name in self._additional_batch_inputs:
- inputs.append(InputParam(name=input_name))
+ inputs += self._image_latent_inputs + self._additional_batch_inputs

return inputs

@property
def intermediate_outputs(self) -> List[OutputParam]:
- return [
+ outputs = [
OutputParam(
name="image_height",
type_hint=int,
@@ -569,15 +823,44 @@ class QwenImageLayeredAdditionalInputsStep(ModularPipelineBlocks):
type_hint=int,
description="The image width calculated from the image latents dimension",
),
- OutputParam(name="height", type_hint=int, description="The height of the image output"),
- OutputParam(name="width", type_hint=int, description="The width of the image output"),
]

+ if len(self._image_latent_inputs) > 0:
+ outputs.append(
+ OutputParam(name="height", type_hint=int, description="if not provided, updated to image height")
+ )
+ outputs.append(
+ OutputParam(name="width", type_hint=int, description="if not provided, updated to image width")
+ )
+
+ # Add outputs for image latent inputs (patchified with layered pachifier and batch-expanded)
+ for input_param in self._image_latent_inputs:
+ outputs.append(
+ OutputParam(
+ name=input_param.name,
+ type_hint=input_param.type_hint,
+ description=input_param.description + " (patchified with layered pachifier and batch-expanded)",
+ )
+ )
+
+ # Add outputs for additional batch inputs (batch-expanded only)
+ for input_param in self._additional_batch_inputs:
+ outputs.append(
+ OutputParam(
+ name=input_param.name,
+ type_hint=input_param.type_hint,
+ description=input_param.description + " (batch-expanded)",
+ )
+ )
+
+ return outputs

def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState:
block_state = self.get_block_state(state)

# Process image latent inputs
- for image_latent_input_name in self._image_latent_inputs:
+ for input_param in self._image_latent_inputs:
+ image_latent_input_name = input_param.name
image_latent_tensor = getattr(block_state, image_latent_input_name)
if image_latent_tensor is None:
continue
@@ -608,7 +891,8 @@ class QwenImageLayeredAdditionalInputsStep(ModularPipelineBlocks):
setattr(block_state, image_latent_input_name, image_latent_tensor)

# Process additional batch inputs (only batch expansion)
- for input_name in self._additional_batch_inputs:
+ for input_param in self._additional_batch_inputs:
+ input_name = input_param.name
input_tensor = getattr(block_state, input_name)
if input_tensor is None:
continue
@@ -626,7 +910,34 @@ class QwenImageLayeredAdditionalInputsStep(ModularPipelineBlocks):
return components, state


+ # auto_docstring
class QwenImageControlNetInputsStep(ModularPipelineBlocks):
+ """
+ prepare the `control_image_latents` for controlnet. Insert after all the other inputs steps.
+
+ Inputs:
+ control_image_latents (`Tensor`):
+ The control image latents to use for the denoising process. Can be generated in controlnet vae encoder
+ step.
+ batch_size (`int`, *optional*, defaults to 1):
+ Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can
+ be generated in input step.
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
+ The number of images to generate per prompt.
+ height (`int`, *optional*):
+ The height in pixels of the generated image.
+ width (`int`, *optional*):
+ The width in pixels of the generated image.
+
+ Outputs:
+ control_image_latents (`Tensor`):
+ The control image latents (patchified and batch-expanded).
+ height (`int`):
+ if not provided, updated to control image height
+ width (`int`):
+ if not provided, updated to control image width
+ """

model_name = "qwenimage"

@property
@@ -636,11 +947,28 @@ class QwenImageControlNetInputsStep(ModularPipelineBlocks):
@property
def inputs(self) -> List[InputParam]:
return [
- InputParam(name="control_image_latents", required=True),
- InputParam(name="batch_size", required=True),
- InputParam(name="num_images_per_prompt", default=1),
- InputParam(name="height"),
- InputParam(name="width"),
+ InputParam(
+ name="control_image_latents",
+ required=True,
+ type_hint=torch.Tensor,
+ description="The control image latents to use for the denoising process. Can be generated in controlnet vae encoder step.",
+ ),
+ InputParam.template("batch_size"),
+ InputParam.template("num_images_per_prompt"),
+ InputParam.template("height"),
+ InputParam.template("width"),
+ ]
+
+ @property
+ def intermediate_outputs(self) -> List[OutputParam]:
+ return [
+ OutputParam(
+ name="control_image_latents",
+ type_hint=torch.Tensor,
+ description="The control image latents (patchified and batch-expanded).",
+ ),
+ OutputParam(name="height", type_hint=int, description="if not provided, updated to control image height"),
+ OutputParam(name="width", type_hint=int, description="if not provided, updated to control image width"),
]

@torch.no_grad()
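The height/width outputs above are recovered from the control image latents' spatial dimensions; an earlier hunk header in this file names the helper, `calculate_dimension_from_latents`. A sketch of the likely arithmetic, assuming the helper scales the latent grid back up by the VAE scale factor (the real body may differ):

import torch

def calculate_dimension_from_latents(latents: torch.Tensor, vae_scale_factor: int):
    # Spatial latent dims sit in the last two axes; scale them back to pixels.
    height = latents.shape[-2] * vae_scale_factor
    width = latents.shape[-1] * vae_scale_factor
    return height, width

h, w = calculate_dimension_from_latents(torch.zeros(1, 16, 64, 64), 8)
assert (h, w) == (512, 512)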
@@ -12,14 +12,11 @@
# See the License for the specific language governing permissions and
# limitations under the License.

- from typing import List
-
- import PIL.Image
import torch

from ...utils import logging
from ..modular_pipeline import AutoPipelineBlocks, ConditionalPipelineBlocks, SequentialPipelineBlocks
- from ..modular_pipeline_utils import InsertableDict, OutputParam
+ from ..modular_pipeline_utils import InputParam, InsertableDict, OutputParam
from .before_denoise import (
QwenImageControlNetBeforeDenoiserStep,
QwenImageCreateMaskLatentsStep,
@@ -59,11 +56,91 @@ logger = logging.get_logger(__name__)


# ====================
- # 1. VAE ENCODER
+ # 1. TEXT ENCODER
# ====================


+ # auto_docstring
+ class QwenImageAutoTextEncoderStep(AutoPipelineBlocks):
+ """
+ Text encoder step that encodes the text prompt into a text embedding. This is an auto pipeline block.
+
+ Components:
+ text_encoder (`Qwen2_5_VLForConditionalGeneration`): The text encoder to use tokenizer (`Qwen2Tokenizer`):
+ The tokenizer to use guider (`ClassifierFreeGuidance`)
+
+ Inputs:
+ prompt (`str`, *optional*):
+ The prompt or prompts to guide image generation.
+ negative_prompt (`str`, *optional*):
+ The prompt or prompts not to guide the image generation.
+ max_sequence_length (`int`, *optional*, defaults to 1024):
+ Maximum sequence length for prompt encoding.
+
+ Outputs:
+ prompt_embeds (`Tensor`):
+ The prompt embeddings.
+ prompt_embeds_mask (`Tensor`):
+ The encoder attention mask.
+ negative_prompt_embeds (`Tensor`):
+ The negative prompt embeddings.
+ negative_prompt_embeds_mask (`Tensor`):
+ The negative prompt embeddings mask.
+ """
+
+ model_name = "qwenimage"
+ block_classes = [QwenImageTextEncoderStep()]
+ block_names = ["text_encoder"]
+ block_trigger_inputs = ["prompt"]
+
+ @property
+ def description(self) -> str:
+ return "Text encoder step that encodes the text prompt into a text embedding. This is an auto pipeline block."
+ " - `QwenImageTextEncoderStep` (text_encoder) is used when `prompt` is provided."
+ " - if `prompt` is not provided, step will be skipped."


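The new `QwenImageAutoTextEncoderStep` relies on the `AutoPipelineBlocks` trigger mechanism: sub-blocks are paired position-wise with `block_trigger_inputs`, and a sub-block only runs when its trigger input is present; with no trigger, the whole step is skipped, as its description strings spell out. A stripped-down sketch of that dispatch (not the actual `AutoPipelineBlocks` source):

from typing import Any, Dict, List, Optional

class AutoBlocksSketch:
    # Parallel lists, as in the class above: sub-block i fires when
    # trigger input i is provided by the caller.
    block_classes: List[type] = []
    block_names: List[str] = []
    block_trigger_inputs: List[str] = []

    def select_block(self, provided: Dict[str, Any]) -> Optional[type]:
        for block_cls, trigger in zip(self.block_classes, self.block_trigger_inputs):
            if provided.get(trigger) is not None:
                return block_cls
        return None  # no trigger present -> the step is skipped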
+ # ====================
+ # 2. VAE ENCODER
+ # ====================


+ # auto_docstring
class QwenImageInpaintVaeEncoderStep(SequentialPipelineBlocks):
+ """
+ This step is used for processing image and mask inputs for inpainting tasks. It:
+ - Resizes the image to the target size, based on `height` and `width`.
+ - Processes and updates `image` and `mask_image`.
+ - Creates `image_latents`.
+
+ Components:
+ image_mask_processor (`InpaintProcessor`) vae (`AutoencoderKLQwenImage`)
+
+ Inputs:
+ mask_image (`Image`):
+ Mask image for inpainting.
+ image (`Union[Image, List]`):
+ Reference image(s) for denoising. Can be a single image or list of images.
+ height (`int`, *optional*):
+ The height in pixels of the generated image.
+ width (`int`, *optional*):
+ The width in pixels of the generated image.
+ padding_mask_crop (`int`, *optional*):
+ Padding for mask cropping in inpainting.
+ generator (`Generator`, *optional*):
+ Torch generator for deterministic generation.
+
+ Outputs:
+ processed_image (`Tensor`):
+ The processed image
+ processed_mask_image (`Tensor`):
+ The processed mask image
+ mask_overlay_kwargs (`Dict`):
+ The kwargs for the postprocess step to apply the mask overlay
+ image_latents (`Tensor`):
+ The latent representation of the input image.
+ """

model_name = "qwenimage"
block_classes = [QwenImageInpaintProcessImagesInputStep(), QwenImageVaeEncoderStep()]
block_names = ["preprocess", "encode"]
@@ -78,7 +155,31 @@ class QwenImageInpaintVaeEncoderStep(SequentialPipelineBlocks):
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||


# auto_docstring
class QwenImageImg2ImgVaeEncoderStep(SequentialPipelineBlocks):
    """
    Vae encoder step that preprocesses and encodes the image inputs into their latent representations.

    Components:
        image_processor (`VaeImageProcessor`)
        vae (`AutoencoderKLQwenImage`)

    Inputs:
        image (`Union[Image, List]`):
            Reference image(s) for denoising. Can be a single image or list of images.
        height (`int`, *optional*):
            The height in pixels of the generated image.
        width (`int`, *optional*):
            The width in pixels of the generated image.
        generator (`Generator`, *optional*):
            Torch generator for deterministic generation.

    Outputs:
        processed_image (`Tensor`):
            The processed image.
        image_latents (`Tensor`):
            The latent representation of the input image.
    """

    model_name = "qwenimage"

    block_classes = [QwenImageProcessImagesInputStep(), QwenImageVaeEncoderStep()]

@@ -89,7 +190,6 @@ class QwenImageImg2ImgVaeEncoderStep(SequentialPipelineBlocks):
        return "Vae encoder step that preprocesses and encodes the image inputs into their latent representations."
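

# A schematic sketch (my illustration, not the library implementation) of how
# a SequentialPipelineBlocks-style step threads state through its sub-blocks:
# "preprocess" writes `processed_image` into the shared state, and "encode"
# reads it back to produce `image_latents`.
def _run_sequential_sketch(sub_blocks, state):
    for block in sub_blocks:
        state.update(block(state))  # each sub-block returns a dict of new outputs
    return state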


class QwenImageAutoVaeEncoderStep(AutoPipelineBlocks):
    block_classes = [QwenImageInpaintVaeEncoderStep, QwenImageImg2ImgVaeEncoderStep]
    block_names = ["inpaint", "img2img"]

@@ -107,7 +207,33 @@ class QwenImageAutoVaeEncoderStep(AutoPipelineBlocks):


# optional controlnet vae encoder
# auto_docstring
class QwenImageOptionalControlNetVaeEncoderStep(AutoPipelineBlocks):
    """
    Vae encoder step that encodes the image inputs into their latent representations.
    This is an auto pipeline block.
    - `QwenImageControlNetVaeEncoderStep` (controlnet) is used when `control_image` is provided.
    - if `control_image` is not provided, step will be skipped.

    Components:
        vae (`AutoencoderKLQwenImage`)
        controlnet (`QwenImageControlNetModel`)
        control_image_processor (`VaeImageProcessor`)

    Inputs:
        control_image (`Image`, *optional*):
            Control image for ControlNet conditioning.
        height (`int`, *optional*):
            The height in pixels of the generated image.
        width (`int`, *optional*):
            The width in pixels of the generated image.
        generator (`Generator`, *optional*):
            Torch generator for deterministic generation.

    Outputs:
        control_image_latents (`Tensor`):
            The latents representing the control image.
    """

    block_classes = [QwenImageControlNetVaeEncoderStep]
    block_names = ["controlnet"]
    block_trigger_inputs = ["control_image"]

@@ -123,14 +249,65 @@ class QwenImageOptionalControlNetVaeEncoderStep(AutoPipelineBlocks):


# ====================
# 3. DENOISE (input -> prepare_latents -> set_timesteps -> prepare_rope_inputs -> denoise -> after_denoise)
# ====================


# assemble input steps
# auto_docstring
class QwenImageImg2ImgInputStep(SequentialPipelineBlocks):
    """
    Input step that prepares the inputs for the img2img denoising step. It:
    - make sure the text embeddings have consistent batch size as well as the additional inputs.
    - update height/width based on `image_latents`, patchify `image_latents`.

    Components:
        pachifier (`QwenImagePachifier`)

    Inputs:
        num_images_per_prompt (`int`, *optional*, defaults to 1):
            The number of images to generate per prompt.
        prompt_embeds (`Tensor`):
            text embeddings used to guide the image generation. Can be generated from text_encoder step.
        prompt_embeds_mask (`Tensor`):
            mask for the text embeddings. Can be generated from text_encoder step.
        negative_prompt_embeds (`Tensor`, *optional*):
            negative text embeddings used to guide the image generation. Can be generated from text_encoder step.
        negative_prompt_embeds_mask (`Tensor`, *optional*):
            mask for the negative text embeddings. Can be generated from text_encoder step.
        height (`int`, *optional*):
            The height in pixels of the generated image.
        width (`int`, *optional*):
            The width in pixels of the generated image.
        image_latents (`Tensor`):
            image latents used to guide the image generation. Can be generated from vae_encoder step.

    Outputs:
        batch_size (`int`):
            The batch size of the prompt embeddings.
        dtype (`dtype`):
            The data type of the prompt embeddings.
        prompt_embeds (`Tensor`):
            The prompt embeddings. (batch-expanded)
        prompt_embeds_mask (`Tensor`):
            The encoder attention mask. (batch-expanded)
        negative_prompt_embeds (`Tensor`):
            The negative prompt embeddings. (batch-expanded)
        negative_prompt_embeds_mask (`Tensor`):
            The negative prompt embeddings mask. (batch-expanded)
        image_height (`int`):
            The image height calculated from the image latents dimension.
        image_width (`int`):
            The image width calculated from the image latents dimension.
        height (`int`):
            if not provided, updated to image height.
        width (`int`):
            if not provided, updated to image width.
        image_latents (`Tensor`):
            image latents used to guide the image generation. Can be generated from vae_encoder step. (patchified
            and batch-expanded)
    """

    model_name = "qwenimage"
    block_classes = [QwenImageTextInputsStep(), QwenImageAdditionalInputsStep()]
    block_names = ["text_inputs", "additional_inputs"]

    @property
@@ -140,12 +317,69 @@ class QwenImageImg2ImgInputStep(SequentialPipelineBlocks):
            " - update height/width based on `image_latents`, patchify `image_latents`."


# auto_docstring
class QwenImageInpaintInputStep(SequentialPipelineBlocks):
    """
    Input step that prepares the inputs for the inpainting denoising step. It:
    - make sure the text embeddings have consistent batch size as well as the additional inputs.
    - update height/width based on `image_latents`, patchify `image_latents`.

    Components:
        pachifier (`QwenImagePachifier`)

    Inputs:
        num_images_per_prompt (`int`, *optional*, defaults to 1):
            The number of images to generate per prompt.
        prompt_embeds (`Tensor`):
            text embeddings used to guide the image generation. Can be generated from text_encoder step.
        prompt_embeds_mask (`Tensor`):
            mask for the text embeddings. Can be generated from text_encoder step.
        negative_prompt_embeds (`Tensor`, *optional*):
            negative text embeddings used to guide the image generation. Can be generated from text_encoder step.
        negative_prompt_embeds_mask (`Tensor`, *optional*):
            mask for the negative text embeddings. Can be generated from text_encoder step.
        height (`int`, *optional*):
            The height in pixels of the generated image.
        width (`int`, *optional*):
            The width in pixels of the generated image.
        image_latents (`Tensor`, *optional*):
            image latents used to guide the image generation. Can be generated from vae_encoder step.
        processed_mask_image (`Tensor`, *optional*):
            The processed mask image.

    Outputs:
        batch_size (`int`):
            The batch size of the prompt embeddings.
        dtype (`dtype`):
            The data type of the prompt embeddings.
        prompt_embeds (`Tensor`):
            The prompt embeddings. (batch-expanded)
        prompt_embeds_mask (`Tensor`):
            The encoder attention mask. (batch-expanded)
        negative_prompt_embeds (`Tensor`):
            The negative prompt embeddings. (batch-expanded)
        negative_prompt_embeds_mask (`Tensor`):
            The negative prompt embeddings mask. (batch-expanded)
        image_height (`int`):
            The image height calculated from the image latents dimension.
        image_width (`int`):
            The image width calculated from the image latents dimension.
        height (`int`):
            if not provided, updated to image height.
        width (`int`):
            if not provided, updated to image width.
        image_latents (`Tensor`):
            image latents used to guide the image generation. Can be generated from vae_encoder step. (patchified
            and batch-expanded)
        processed_mask_image (`Tensor`):
            The processed mask image (batch-expanded)
    """

    model_name = "qwenimage"
    block_classes = [
        QwenImageTextInputsStep(),
        QwenImageAdditionalInputsStep(
            additional_batch_inputs=[
                InputParam(name="processed_mask_image", type_hint=torch.Tensor, description="The processed mask image")
            ]
        ),
    ]
    block_names = ["text_inputs", "additional_inputs"]

@@ -158,7 +392,42 @@ class QwenImageInpaintInputStep(SequentialPipelineBlocks):
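

# The `additional_batch_inputs` hook above is how the shared input block is
# specialized per task: the inpaint variant registers one extra tensor to be
# batch-expanded. The constructor call below is taken verbatim from this file;
# only the standalone usage framing is mine:
_extra_inputs = [
    InputParam(name="processed_mask_image", type_hint=torch.Tensor, description="The processed mask image")
]
_inpaint_additional_inputs = QwenImageAdditionalInputsStep(additional_batch_inputs=_extra_inputs)
# `_inpaint_additional_inputs` now batch-expands `processed_mask_image` alongside the default inputs.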


# assemble prepare latents steps
# auto_docstring
class QwenImageInpaintPrepareLatentsStep(SequentialPipelineBlocks):
    """
    This step prepares the latents/image_latents and mask inputs for the inpainting denoising step. It:
    - Adds noise to the image latents to create the latents input for the denoiser.
    - Creates the patchified latents `mask` based on the processed mask image.

    Components:
        scheduler (`FlowMatchEulerDiscreteScheduler`)
        pachifier (`QwenImagePachifier`)

    Inputs:
        latents (`Tensor`):
            The initial random noise, can be generated in prepare latent step.
        image_latents (`Tensor`):
            image latents used to guide the image generation. Can be generated from vae_encoder step. (Can be
            generated from vae encoder and updated in input step.)
        timesteps (`Tensor`):
            The timesteps to use for the denoising process. Can be generated in set_timesteps step.
        processed_mask_image (`Tensor`):
            The processed mask to use for the inpainting process.
        height (`int`):
            The height in pixels of the generated image.
        width (`int`):
            The width in pixels of the generated image.
        dtype (`dtype`, *optional*, defaults to torch.float32):
            The dtype of the model inputs, can be generated in input step.

    Outputs:
        initial_noise (`Tensor`):
            The initial random noise used for inpainting denoising.
        latents (`Tensor`):
            The scaled noisy latents to use for inpainting/image-to-image denoising.
        mask (`Tensor`):
            The mask to use for the inpainting process.
    """

    model_name = "qwenimage"
    block_classes = [QwenImagePrepareLatentsWithStrengthStep(), QwenImageCreateMaskLatentsStep()]
    block_names = ["add_noise_to_latents", "create_mask_latents"]

@@ -176,7 +445,49 @@ class QwenImageInpaintPrepareLatentsStep(SequentialPipelineBlocks):
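

# "Adds noise to the image latents" for a flow-matching scheduler is a linear
# interpolation toward noise at the first kept sigma. A hedged sketch of the
# idea (my illustration of the `scale_noise`-style scaling, not the
# scheduler's API):
def _add_noise_sketch(image_latents, noise, sigma):
    # sigma in [0, 1]; strength=1.0 gives sigma=1.0, which ignores the image entirely.
    return (1.0 - sigma) * image_latents + sigma * noise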


# Qwen Image (text2image)
# auto_docstring
class QwenImageCoreDenoiseStep(SequentialPipelineBlocks):
    """
    Step that denoises noise into an image for the text2image task. It includes the denoise loop, as well as
    preparing the inputs (timesteps, latents, rope inputs etc.).

    Components:
        pachifier (`QwenImagePachifier`)
        scheduler (`FlowMatchEulerDiscreteScheduler`)
        guider (`ClassifierFreeGuidance`)
        transformer (`QwenImageTransformer2DModel`)

    Inputs:
        num_images_per_prompt (`int`, *optional*, defaults to 1):
            The number of images to generate per prompt.
        prompt_embeds (`Tensor`):
            text embeddings used to guide the image generation. Can be generated from text_encoder step.
        prompt_embeds_mask (`Tensor`):
            mask for the text embeddings. Can be generated from text_encoder step.
        negative_prompt_embeds (`Tensor`, *optional*):
            negative text embeddings used to guide the image generation. Can be generated from text_encoder step.
        negative_prompt_embeds_mask (`Tensor`, *optional*):
            mask for the negative text embeddings. Can be generated from text_encoder step.
        latents (`Tensor`, *optional*):
            Pre-generated noisy latents for image generation.
        height (`int`, *optional*):
            The height in pixels of the generated image.
        width (`int`, *optional*):
            The width in pixels of the generated image.
        generator (`Generator`, *optional*):
            Torch generator for deterministic generation.
        num_inference_steps (`int`, *optional*, defaults to 50):
            The number of denoising steps.
        sigmas (`List`, *optional*):
            Custom sigmas for the denoising process.
        attention_kwargs (`Dict`, *optional*):
            Additional kwargs for attention processors.
        **denoiser_input_fields (`None`, *optional*):
            conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.

    Outputs:
        latents (`Tensor`):
            Denoised latents.
    """

    model_name = "qwenimage"
    block_classes = [
        QwenImageTextInputsStep(),
@@ -199,9 +510,63 @@ class QwenImageCoreDenoiseStep(SequentialPipelineBlocks):
    def description(self):
        return "Step that denoises noise into an image for the text2image task. It includes the denoise loop, as well as preparing the inputs (timesteps, latents, rope inputs etc.)."

    @property
    def outputs(self):
        return [
            OutputParam.template("latents"),
        ]
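

# The denoise loop these core steps assemble is, at heart, a flow-matching
# Euler integration. A hedged, schematic sketch (the real loop also handles
# guidance, rope inputs, and optional controlnet residuals):
def _denoise_loop_sketch(latents, sigmas, velocity_model):
    # `velocity_model(x, sigma)` stands in for the guided transformer call.
    for sigma, sigma_next in zip(sigmas[:-1], sigmas[1:]):
        velocity = velocity_model(latents, sigma)
        latents = latents + (sigma_next - sigma) * velocity  # Euler step toward sigma=0
    return latents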


# Qwen Image (inpainting)
# auto_docstring
class QwenImageInpaintCoreDenoiseStep(SequentialPipelineBlocks):
    """
    Before denoise step that prepares the inputs (timesteps, latents, rope inputs etc.) for the denoise step for the
    inpaint task.

    Components:
        pachifier (`QwenImagePachifier`)
        scheduler (`FlowMatchEulerDiscreteScheduler`)
        guider (`ClassifierFreeGuidance`)
        transformer (`QwenImageTransformer2DModel`)

    Inputs:
        num_images_per_prompt (`int`, *optional*, defaults to 1):
            The number of images to generate per prompt.
        prompt_embeds (`Tensor`):
            text embeddings used to guide the image generation. Can be generated from text_encoder step.
        prompt_embeds_mask (`Tensor`):
            mask for the text embeddings. Can be generated from text_encoder step.
        negative_prompt_embeds (`Tensor`, *optional*):
            negative text embeddings used to guide the image generation. Can be generated from text_encoder step.
        negative_prompt_embeds_mask (`Tensor`, *optional*):
            mask for the negative text embeddings. Can be generated from text_encoder step.
        height (`int`, *optional*):
            The height in pixels of the generated image.
        width (`int`, *optional*):
            The width in pixels of the generated image.
        image_latents (`Tensor`, *optional*):
            image latents used to guide the image generation. Can be generated from vae_encoder step.
        processed_mask_image (`Tensor`, *optional*):
            The processed mask image.
        latents (`Tensor`, *optional*):
            Pre-generated noisy latents for image generation.
        generator (`Generator`, *optional*):
            Torch generator for deterministic generation.
        num_inference_steps (`int`, *optional*, defaults to 50):
            The number of denoising steps.
        sigmas (`List`, *optional*):
            Custom sigmas for the denoising process.
        strength (`float`, *optional*, defaults to 0.9):
            Strength for img2img/inpainting.
        attention_kwargs (`Dict`, *optional*):
            Additional kwargs for attention processors.
        **denoiser_input_fields (`None`, *optional*):
            conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.

    Outputs:
        latents (`Tensor`):
            Denoised latents.
    """

    model_name = "qwenimage"
    block_classes = [
        QwenImageInpaintInputStep(),
@@ -226,9 +591,61 @@ class QwenImageInpaintCoreDenoiseStep(SequentialPipelineBlocks):
    def description(self):
        return "Before denoise step that prepares the inputs (timesteps, latents, rope inputs etc.) for the denoise step for the inpaint task."

    @property
    def outputs(self):
        return [
            OutputParam.template("latents"),
        ]


# Qwen Image (image2image)
# auto_docstring
class QwenImageImg2ImgCoreDenoiseStep(SequentialPipelineBlocks):
    """
    Before denoise step that prepares the inputs (timesteps, latents, rope inputs etc.) for the denoise step for the
    img2img task.

    Components:
        pachifier (`QwenImagePachifier`)
        scheduler (`FlowMatchEulerDiscreteScheduler`)
        guider (`ClassifierFreeGuidance`)
        transformer (`QwenImageTransformer2DModel`)

    Inputs:
        num_images_per_prompt (`int`, *optional*, defaults to 1):
            The number of images to generate per prompt.
        prompt_embeds (`Tensor`):
            text embeddings used to guide the image generation. Can be generated from text_encoder step.
        prompt_embeds_mask (`Tensor`):
            mask for the text embeddings. Can be generated from text_encoder step.
        negative_prompt_embeds (`Tensor`, *optional*):
            negative text embeddings used to guide the image generation. Can be generated from text_encoder step.
        negative_prompt_embeds_mask (`Tensor`, *optional*):
            mask for the negative text embeddings. Can be generated from text_encoder step.
        height (`int`, *optional*):
            The height in pixels of the generated image.
        width (`int`, *optional*):
            The width in pixels of the generated image.
        image_latents (`Tensor`):
            image latents used to guide the image generation. Can be generated from vae_encoder step.
        latents (`Tensor`, *optional*):
            Pre-generated noisy latents for image generation.
        generator (`Generator`, *optional*):
            Torch generator for deterministic generation.
        num_inference_steps (`int`, *optional*, defaults to 50):
            The number of denoising steps.
        sigmas (`List`, *optional*):
            Custom sigmas for the denoising process.
        strength (`float`, *optional*, defaults to 0.9):
            Strength for img2img/inpainting.
        attention_kwargs (`Dict`, *optional*):
            Additional kwargs for attention processors.
        **denoiser_input_fields (`None`, *optional*):
            conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.

    Outputs:
        latents (`Tensor`):
            Denoised latents.
    """

    model_name = "qwenimage"
    block_classes = [
        QwenImageImg2ImgInputStep(),
@@ -253,9 +670,66 @@ class QwenImageImg2ImgCoreDenoiseStep(SequentialPipelineBlocks):
    def description(self):
        return "Before denoise step that prepares the inputs (timesteps, latents, rope inputs etc.) for the denoise step for the img2img task."

    @property
    def outputs(self):
        return [
            OutputParam.template("latents"),
        ]


# Qwen Image (text2image) with controlnet
# auto_docstring
class QwenImageControlNetCoreDenoiseStep(SequentialPipelineBlocks):
    """
    Step that denoises noise into an image for the text2image task. It includes the denoise loop, as well as
    preparing the inputs (timesteps, latents, rope inputs etc.).

    Components:
        pachifier (`QwenImagePachifier`)
        scheduler (`FlowMatchEulerDiscreteScheduler`)
        controlnet (`QwenImageControlNetModel`)
        guider (`ClassifierFreeGuidance`)
        transformer (`QwenImageTransformer2DModel`)

    Inputs:
        num_images_per_prompt (`int`, *optional*, defaults to 1):
            The number of images to generate per prompt.
        prompt_embeds (`Tensor`):
            text embeddings used to guide the image generation. Can be generated from text_encoder step.
        prompt_embeds_mask (`Tensor`):
            mask for the text embeddings. Can be generated from text_encoder step.
        negative_prompt_embeds (`Tensor`, *optional*):
            negative text embeddings used to guide the image generation. Can be generated from text_encoder step.
        negative_prompt_embeds_mask (`Tensor`, *optional*):
            mask for the negative text embeddings. Can be generated from text_encoder step.
        control_image_latents (`Tensor`):
            The control image latents to use for the denoising process. Can be generated in controlnet vae encoder
            step.
        height (`int`, *optional*):
            The height in pixels of the generated image.
        width (`int`, *optional*):
            The width in pixels of the generated image.
        latents (`Tensor`, *optional*):
            Pre-generated noisy latents for image generation.
        generator (`Generator`, *optional*):
            Torch generator for deterministic generation.
        num_inference_steps (`int`, *optional*, defaults to 50):
            The number of denoising steps.
        sigmas (`List`, *optional*):
            Custom sigmas for the denoising process.
        control_guidance_start (`float`, *optional*, defaults to 0.0):
            When to start applying ControlNet.
        control_guidance_end (`float`, *optional*, defaults to 1.0):
            When to stop applying ControlNet.
        controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0):
            Scale for ControlNet conditioning.
        attention_kwargs (`Dict`, *optional*):
            Additional kwargs for attention processors.
        **denoiser_input_fields (`None`, *optional*):
            conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.

    Outputs:
        latents (`Tensor`):
            Denoised latents.
    """

    model_name = "qwenimage"
    block_classes = [
        QwenImageTextInputsStep(),
@@ -282,9 +756,72 @@ class QwenImageControlNetCoreDenoiseStep(SequentialPipelineBlocks):
    def description(self):
        return "Step that denoises noise into an image for the text2image task. It includes the denoise loop, as well as preparing the inputs (timesteps, latents, rope inputs etc.)."

    @property
    def outputs(self):
        return [
            OutputParam.template("latents"),
        ]
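

# ControlNet conditioning in these steps follows the usual recipe: the
# controlnet consumes `control_image_latents` and emits residuals added to the
# transformer's hidden states, scaled by `controlnet_conditioning_scale` and
# gated by the `control_guidance_start`/`control_guidance_end` window. A
# hedged sketch of the gating only (my illustration):
def _controlnet_scale_sketch(step_index, num_steps, start=0.0, end=1.0, scale=1.0):
    # Effective conditioning scale for one denoising step.
    progress = step_index / max(num_steps - 1, 1)
    return scale if start <= progress <= end else 0.0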


# Qwen Image (inpainting) with controlnet
# auto_docstring
class QwenImageControlNetInpaintCoreDenoiseStep(SequentialPipelineBlocks):
    """
    Before denoise step that prepares the inputs (timesteps, latents, rope inputs etc.) for the denoise step for the
    inpaint task.

    Components:
        pachifier (`QwenImagePachifier`)
        scheduler (`FlowMatchEulerDiscreteScheduler`)
        controlnet (`QwenImageControlNetModel`)
        guider (`ClassifierFreeGuidance`)
        transformer (`QwenImageTransformer2DModel`)

    Inputs:
        num_images_per_prompt (`int`, *optional*, defaults to 1):
            The number of images to generate per prompt.
        prompt_embeds (`Tensor`):
            text embeddings used to guide the image generation. Can be generated from text_encoder step.
        prompt_embeds_mask (`Tensor`):
            mask for the text embeddings. Can be generated from text_encoder step.
        negative_prompt_embeds (`Tensor`, *optional*):
            negative text embeddings used to guide the image generation. Can be generated from text_encoder step.
        negative_prompt_embeds_mask (`Tensor`, *optional*):
            mask for the negative text embeddings. Can be generated from text_encoder step.
        height (`int`, *optional*):
            The height in pixels of the generated image.
        width (`int`, *optional*):
            The width in pixels of the generated image.
        image_latents (`Tensor`, *optional*):
            image latents used to guide the image generation. Can be generated from vae_encoder step.
        processed_mask_image (`Tensor`, *optional*):
            The processed mask image.
        control_image_latents (`Tensor`):
            The control image latents to use for the denoising process. Can be generated in controlnet vae encoder
            step.
        latents (`Tensor`, *optional*):
            Pre-generated noisy latents for image generation.
        generator (`Generator`, *optional*):
            Torch generator for deterministic generation.
        num_inference_steps (`int`, *optional*, defaults to 50):
            The number of denoising steps.
        sigmas (`List`, *optional*):
            Custom sigmas for the denoising process.
        strength (`float`, *optional*, defaults to 0.9):
            Strength for img2img/inpainting.
        control_guidance_start (`float`, *optional*, defaults to 0.0):
            When to start applying ControlNet.
        control_guidance_end (`float`, *optional*, defaults to 1.0):
            When to stop applying ControlNet.
        controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0):
            Scale for ControlNet conditioning.
        attention_kwargs (`Dict`, *optional*):
            Additional kwargs for attention processors.
        **denoiser_input_fields (`None`, *optional*):
            conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.

    Outputs:
        latents (`Tensor`):
            Denoised latents.
    """

    model_name = "qwenimage"
    block_classes = [
        QwenImageInpaintInputStep(),
@@ -313,9 +850,70 @@ class QwenImageControlNetInpaintCoreDenoiseStep(SequentialPipelineBlocks):
    def description(self):
        return "Before denoise step that prepares the inputs (timesteps, latents, rope inputs etc.) for the denoise step for the inpaint task."

    @property
    def outputs(self):
        return [
            OutputParam.template("latents"),
        ]


# Qwen Image (image2image) with controlnet
# auto_docstring
class QwenImageControlNetImg2ImgCoreDenoiseStep(SequentialPipelineBlocks):
    """
    Before denoise step that prepares the inputs (timesteps, latents, rope inputs etc.) for the denoise step for the
    img2img task.

    Components:
        pachifier (`QwenImagePachifier`)
        scheduler (`FlowMatchEulerDiscreteScheduler`)
        controlnet (`QwenImageControlNetModel`)
        guider (`ClassifierFreeGuidance`)
        transformer (`QwenImageTransformer2DModel`)

    Inputs:
        num_images_per_prompt (`int`, *optional*, defaults to 1):
            The number of images to generate per prompt.
        prompt_embeds (`Tensor`):
            text embeddings used to guide the image generation. Can be generated from text_encoder step.
        prompt_embeds_mask (`Tensor`):
            mask for the text embeddings. Can be generated from text_encoder step.
        negative_prompt_embeds (`Tensor`, *optional*):
            negative text embeddings used to guide the image generation. Can be generated from text_encoder step.
        negative_prompt_embeds_mask (`Tensor`, *optional*):
            mask for the negative text embeddings. Can be generated from text_encoder step.
        height (`int`, *optional*):
            The height in pixels of the generated image.
        width (`int`, *optional*):
            The width in pixels of the generated image.
        image_latents (`Tensor`):
            image latents used to guide the image generation. Can be generated from vae_encoder step.
        control_image_latents (`Tensor`):
            The control image latents to use for the denoising process. Can be generated in controlnet vae encoder
            step.
        latents (`Tensor`, *optional*):
            Pre-generated noisy latents for image generation.
        generator (`Generator`, *optional*):
            Torch generator for deterministic generation.
        num_inference_steps (`int`, *optional*, defaults to 50):
            The number of denoising steps.
        sigmas (`List`, *optional*):
            Custom sigmas for the denoising process.
        strength (`float`, *optional*, defaults to 0.9):
            Strength for img2img/inpainting.
        control_guidance_start (`float`, *optional*, defaults to 0.0):
            When to start applying ControlNet.
        control_guidance_end (`float`, *optional*, defaults to 1.0):
            When to stop applying ControlNet.
        controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0):
            Scale for ControlNet conditioning.
        attention_kwargs (`Dict`, *optional*):
            Additional kwargs for attention processors.
        **denoiser_input_fields (`None`, *optional*):
            conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.

    Outputs:
        latents (`Tensor`):
            Denoised latents.
    """

    model_name = "qwenimage"
    block_classes = [
        QwenImageImg2ImgInputStep(),
@@ -344,6 +942,12 @@ class QwenImageControlNetImg2ImgCoreDenoiseStep(SequentialPipelineBlocks):
    def description(self):
        return "Before denoise step that prepares the inputs (timesteps, latents, rope inputs etc.) for the denoise step for the img2img task."

    @property
    def outputs(self):
        return [
            OutputParam.template("latents"),
        ]


# Auto denoise step for QwenImage
class QwenImageAutoCoreDenoiseStep(ConditionalPipelineBlocks):
@@ -402,19 +1006,36 @@ class QwenImageAutoCoreDenoiseStep(ConditionalPipelineBlocks):
    @property
    def outputs(self):
        return [
            OutputParam.template("latents"),
        ]


# ====================
# 4. DECODE
# ====================


# standard decode step works for most tasks except for inpaint
# auto_docstring
class QwenImageDecodeStep(SequentialPipelineBlocks):
    """
    Decode step that decodes the latents to images and postprocesses the generated image.

    Components:
        vae (`AutoencoderKLQwenImage`)
        image_processor (`VaeImageProcessor`)

    Inputs:
        latents (`Tensor`):
            The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise
            step.
        output_type (`str`, *optional*, defaults to pil):
            Output format: 'pil', 'np', 'pt'.

    Outputs:
        images (`List`):
            Generated images. (tensor output of the vae decoder.)
    """

    model_name = "qwenimage"
    block_classes = [QwenImageDecoderStep(), QwenImageProcessImagesOutputStep()]
    block_names = ["decode", "postprocess"]

@@ -425,7 +1046,30 @@ class QwenImageDecodeStep(SequentialPipelineBlocks):


# Inpaint decode step
# auto_docstring
class QwenImageInpaintDecodeStep(SequentialPipelineBlocks):
    """
    Decode step that decodes the latents to images and postprocesses the generated image, optionally applying the
    mask overlay to the original image.

    Components:
        vae (`AutoencoderKLQwenImage`)
        image_mask_processor (`InpaintProcessor`)

    Inputs:
        latents (`Tensor`):
            The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise
            step.
        output_type (`str`, *optional*, defaults to pil):
            Output format: 'pil', 'np', 'pt'.
        mask_overlay_kwargs (`Dict`, *optional*):
            The kwargs for the postprocess step to apply the mask overlay. Generated in
            InpaintProcessImagesInputStep.

    Outputs:
        images (`List`):
            Generated images. (tensor output of the vae decoder.)
    """

    model_name = "qwenimage"
    block_classes = [QwenImageDecoderStep(), QwenImageInpaintProcessImagesOutputStep()]
    block_names = ["decode", "postprocess"]

@@ -452,11 +1096,11 @@ class QwenImageAutoDecodeStep(AutoPipelineBlocks):


# ====================
# 5. AUTO BLOCKS & PRESETS
# ====================
AUTO_BLOCKS = InsertableDict(
    [
        ("text_encoder", QwenImageAutoTextEncoderStep()),
        ("vae_encoder", QwenImageAutoVaeEncoderStep()),
        ("controlnet_vae_encoder", QwenImageOptionalControlNetVaeEncoderStep()),
        ("denoise", QwenImageAutoCoreDenoiseStep()),
@@ -465,24 +1109,119 @@ AUTO_BLOCKS = InsertableDict(
)
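

# `InsertableDict` is an ordered mapping with positional insertion, so presets
# like AUTO_BLOCKS can be tweaked without rebuilding the whole dict. A hedged
# sketch; the `insert(key, value, index)` signature and `MyCustomStep` are
# assumptions, not verified against this branch:
#
#     CUSTOM_BLOCKS = InsertableDict(list(AUTO_BLOCKS.items()))
#     CUSTOM_BLOCKS.pop("controlnet_vae_encoder")          # drop an unused stage
#     CUSTOM_BLOCKS.insert("my_step", MyCustomStep(), 1)   # hypothetical positional insert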


# auto_docstring
class QwenImageAutoBlocks(SequentialPipelineBlocks):
    """
    Auto Modular pipeline for text-to-image, image-to-image, inpainting, and controlnet tasks using QwenImage.

    Supported workflows:
    - `text2image`: requires `prompt`
    - `image2image`: requires `prompt`, `image`
    - `inpainting`: requires `prompt`, `mask_image`, `image`
    - `controlnet_text2image`: requires `prompt`, `control_image`
    - `controlnet_image2image`: requires `prompt`, `image`, `control_image`
    - `controlnet_inpainting`: requires `prompt`, `mask_image`, `image`, `control_image`

    Components:
        text_encoder (`Qwen2_5_VLForConditionalGeneration`): The text encoder to use
        tokenizer (`Qwen2Tokenizer`): The tokenizer to use
        guider (`ClassifierFreeGuidance`)
        image_mask_processor (`InpaintProcessor`)
        vae (`AutoencoderKLQwenImage`)
        image_processor (`VaeImageProcessor`)
        controlnet (`QwenImageControlNetModel`)
        control_image_processor (`VaeImageProcessor`)
        pachifier (`QwenImagePachifier`)
        scheduler (`FlowMatchEulerDiscreteScheduler`)
        transformer (`QwenImageTransformer2DModel`)

    Inputs:
        prompt (`str`, *optional*):
            The prompt or prompts to guide image generation.
        negative_prompt (`str`, *optional*):
            The prompt or prompts not to guide the image generation.
        max_sequence_length (`int`, *optional*, defaults to 1024):
            Maximum sequence length for prompt encoding.
        mask_image (`Image`, *optional*):
            Mask image for inpainting.
        image (`Union[Image, List]`, *optional*):
            Reference image(s) for denoising. Can be a single image or list of images.
        height (`int`, *optional*):
            The height in pixels of the generated image.
        width (`int`, *optional*):
            The width in pixels of the generated image.
        padding_mask_crop (`int`, *optional*):
            Padding for mask cropping in inpainting.
        generator (`Generator`, *optional*):
            Torch generator for deterministic generation.
        control_image (`Image`, *optional*):
            Control image for ControlNet conditioning.
        num_images_per_prompt (`int`, *optional*, defaults to 1):
            The number of images to generate per prompt.
        prompt_embeds (`Tensor`):
            text embeddings used to guide the image generation. Can be generated from text_encoder step.
        prompt_embeds_mask (`Tensor`):
            mask for the text embeddings. Can be generated from text_encoder step.
        negative_prompt_embeds (`Tensor`, *optional*):
            negative text embeddings used to guide the image generation. Can be generated from text_encoder step.
        negative_prompt_embeds_mask (`Tensor`, *optional*):
            mask for the negative text embeddings. Can be generated from text_encoder step.
        latents (`Tensor`):
            Pre-generated noisy latents for image generation.
        num_inference_steps (`int`):
            The number of denoising steps.
        sigmas (`List`, *optional*):
            Custom sigmas for the denoising process.
        attention_kwargs (`Dict`, *optional*):
            Additional kwargs for attention processors.
        **denoiser_input_fields (`None`, *optional*):
            conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
        image_latents (`Tensor`, *optional*):
            image latents used to guide the image generation. Can be generated from vae_encoder step.
        processed_mask_image (`Tensor`, *optional*):
            The processed mask image.
        strength (`float`, *optional*, defaults to 0.9):
            Strength for img2img/inpainting.
        control_image_latents (`Tensor`, *optional*):
            The control image latents to use for the denoising process. Can be generated in controlnet vae encoder
            step.
        control_guidance_start (`float`, *optional*, defaults to 0.0):
            When to start applying ControlNet.
        control_guidance_end (`float`, *optional*, defaults to 1.0):
            When to stop applying ControlNet.
        controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0):
            Scale for ControlNet conditioning.
        output_type (`str`, *optional*, defaults to pil):
            Output format: 'pil', 'np', 'pt'.
        mask_overlay_kwargs (`Dict`, *optional*):
            The kwargs for the postprocess step to apply the mask overlay. Generated in
            InpaintProcessImagesInputStep.

    Outputs:
        images (`List`):
            Generated images.
    """

    model_name = "qwenimage"

    block_classes = AUTO_BLOCKS.values()
    block_names = AUTO_BLOCKS.keys()

    # Workflow map defines the trigger conditions for each workflow.
    # How to define:
    # - Only include required inputs and trigger inputs (inputs that determine which blocks run)
    # - `True` means the workflow triggers when the input is not None (most common case)
    # - Use specific values (e.g., `{"strength": 0.5}`) if your `select_block` logic depends on the value
    _workflow_map = {
        "text2image": {"prompt": True},
        "image2image": {"prompt": True, "image": True},
        "inpainting": {"prompt": True, "mask_image": True, "image": True},
        "controlnet_text2image": {"prompt": True, "control_image": True},
        "controlnet_image2image": {"prompt": True, "image": True, "control_image": True},
        "controlnet_inpainting": {"prompt": True, "mask_image": True, "image": True, "control_image": True},
    }
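
    # A hedged sketch (my illustration, not the library code) of how a workflow
    # map like the one above can be resolved against the call's keyword
    # arguments: every trigger in a workflow must be satisfied, and the most
    # specific match (most triggers) wins.
    @staticmethod
    def _resolve_workflow_sketch(workflow_map, call_kwargs):
        matches = [
            name
            for name, triggers in workflow_map.items()
            if all(
                (call_kwargs.get(k) is not None) if v is True else call_kwargs.get(k) == v
                for k, v in triggers.items()
            )
        ]
        return max(matches, key=lambda name: len(workflow_map[name]), default=None)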

    @property
    def description(self):
        return "Auto Modular pipeline for text-to-image, image-to-image, inpainting, and controlnet tasks using QwenImage."

    @property
    def outputs(self):
        return [
            OutputParam.template("images"),
        ]
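

# A hedged end-to-end usage sketch for `QwenImageAutoBlocks`. The
# `init_pipeline`/`load_components` helpers, the `output="images"` call
# convention, and the repo id are assumed from the modular-pipelines API and
# are illustrative, not verified against this branch:
#
#     import torch
#
#     blocks = QwenImageAutoBlocks()
#     pipe = blocks.init_pipeline("Qwen/Qwen-Image")
#     pipe.load_components(torch_dtype=torch.bfloat16)
#     images = pipe(prompt="a photo of a cat", output="images")
#     # Passing `image` (plus `mask_image` and/or `control_image`) instead
#     # selects the img2img / inpainting / controlnet workflows via `_workflow_map`.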


@@ -12,14 +12,13 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Optional

import torch

from ...utils import logging
from ..modular_pipeline import AutoPipelineBlocks, ConditionalPipelineBlocks, SequentialPipelineBlocks
from ..modular_pipeline_utils import InputParam, InsertableDict, OutputParam
from .before_denoise import (
    QwenImageCreateMaskLatentsStep,
    QwenImageEditRoPEInputsStep,
@@ -59,8 +58,35 @@ logger = logging.get_logger(__name__)
# ====================


# auto_docstring
class QwenImageEditVLEncoderStep(SequentialPipelineBlocks):
    """
    QwenImage-Edit VL encoder step that encodes the image and text prompts together.

    Components:
        image_resize_processor (`VaeImageProcessor`)
        text_encoder (`Qwen2_5_VLForConditionalGeneration`)
        processor (`Qwen2VLProcessor`)
        guider (`ClassifierFreeGuidance`)

    Inputs:
        image (`Union[Image, List]`):
            Reference image(s) for denoising. Can be a single image or list of images.
        prompt (`str`):
            The prompt or prompts to guide image generation.
        negative_prompt (`str`, *optional*):
            The prompt or prompts not to guide the image generation.

    Outputs:
        resized_image (`List`):
            The resized images.
        prompt_embeds (`Tensor`):
            The prompt embeddings.
        prompt_embeds_mask (`Tensor`):
            The encoder attention mask.
        negative_prompt_embeds (`Tensor`):
            The negative prompt embeddings.
        negative_prompt_embeds_mask (`Tensor`):
            The negative prompt embeddings mask.
    """

    model_name = "qwenimage-edit"
    block_classes = [
@@ -80,7 +106,30 @@ class QwenImageEditVLEncoderStep(SequentialPipelineBlocks):


# Edit VAE encoder
# auto_docstring
class QwenImageEditVaeEncoderStep(SequentialPipelineBlocks):
    """
    Vae encoder step that encodes the image inputs into their latent representations.

    Components:
        image_resize_processor (`VaeImageProcessor`)
        image_processor (`VaeImageProcessor`)
        vae (`AutoencoderKLQwenImage`)

    Inputs:
        image (`Union[Image, List]`):
            Reference image(s) for denoising. Can be a single image or list of images.
        generator (`Generator`, *optional*):
            Torch generator for deterministic generation.

    Outputs:
        resized_image (`List`):
            The resized images.
        processed_image (`Tensor`):
            The processed image.
        image_latents (`Tensor`):
            The latent representation of the input image.
    """

    model_name = "qwenimage-edit"
    block_classes = [
        QwenImageEditResizeStep(),
@@ -95,12 +144,46 @@ class QwenImageEditVaeEncoderStep(SequentialPipelineBlocks):


# Edit Inpaint VAE encoder
# auto_docstring
class QwenImageEditInpaintVaeEncoderStep(SequentialPipelineBlocks):
    """
    This step is used for processing image and mask inputs for QwenImage-Edit inpaint tasks. It:
    - resize the image to a target area (1024 * 1024) while maintaining the aspect ratio.
    - process the resized image and mask image.
    - create image latents.

    Components:
        image_resize_processor (`VaeImageProcessor`)
        image_mask_processor (`InpaintProcessor`)
        vae (`AutoencoderKLQwenImage`)

    Inputs:
        image (`Union[Image, List]`):
            Reference image(s) for denoising. Can be a single image or list of images.
        mask_image (`Image`):
            Mask image for inpainting.
        padding_mask_crop (`int`, *optional*):
            Padding for mask cropping in inpainting.
        generator (`Generator`, *optional*):
            Torch generator for deterministic generation.

    Outputs:
        resized_image (`List`):
            The resized images.
        processed_image (`Tensor`):
            The processed image.
        processed_mask_image (`Tensor`):
            The processed mask image.
        mask_overlay_kwargs (`Dict`):
            The kwargs for the postprocess step to apply the mask overlay.
        image_latents (`Tensor`):
            The latent representation of the input image.
    """

    model_name = "qwenimage-edit"
    block_classes = [
        QwenImageEditResizeStep(),
        QwenImageEditInpaintProcessImagesInputStep(),
        QwenImageVaeEncoderStep(),
    ]
    block_names = ["resize", "preprocess", "encode"]

@@ -137,11 +220,64 @@ class QwenImageEditAutoVaeEncoderStep(AutoPipelineBlocks):
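

# "Resize to a target area while maintaining the aspect ratio" reduces to
# scaling both sides by sqrt(target_area / current_area). A hedged sketch of
# the computation (the real resize step may also round to model-friendly
# multiples):
import math


def _target_size_sketch(width, height, target_area=1024 * 1024):
    scale = math.sqrt(target_area / (width * height))
    return round(width * scale), round(height * scale)


# _target_size_sketch(2000, 500) -> (2048, 512), whose area is ~1024 * 1024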
|
|
||||||
|
|
||||||
# assemble input steps
|
# assemble input steps
|
||||||
|
# auto_docstring
|
||||||
class QwenImageEditInputStep(SequentialPipelineBlocks):
|
class QwenImageEditInputStep(SequentialPipelineBlocks):
|
||||||
|
"""
|
||||||
|
Input step that prepares the inputs for the edit denoising step. It:
|
||||||
|
- make sure the text embeddings have consistent batch size as well as the additional inputs.
|
||||||
|
- update height/width based `image_latents`, patchify `image_latents`.
|
||||||
|
|
||||||
|
Components:
|
||||||
|
pachifier (`QwenImagePachifier`)
|
||||||
|
|
||||||
|
Inputs:
|
||||||
|
num_images_per_prompt (`int`, *optional*, defaults to 1):
|
||||||
|
The number of images to generate per prompt.
|
||||||
|
prompt_embeds (`Tensor`):
|
||||||
|
text embeddings used to guide the image generation. Can be generated from text_encoder step.
|
||||||
|
prompt_embeds_mask (`Tensor`):
|
||||||
|
mask for the text embeddings. Can be generated from text_encoder step.
|
||||||
|
negative_prompt_embeds (`Tensor`, *optional*):
|
||||||
|
negative text embeddings used to guide the image generation. Can be generated from text_encoder step.
|
||||||
|
negative_prompt_embeds_mask (`Tensor`, *optional*):
|
||||||
|
mask for the negative text embeddings. Can be generated from text_encoder step.
|
||||||
|
height (`int`, *optional*):
|
||||||
|
The height in pixels of the generated image.
|
||||||
|
width (`int`, *optional*):
|
||||||
|
The width in pixels of the generated image.
|
||||||
|
image_latents (`Tensor`):
|
||||||
|
image latents used to guide the image generation. Can be generated from vae_encoder step.
|
||||||
|
|
||||||
|
Outputs:
|
||||||
|
batch_size (`int`):
|
||||||
|
The batch size of the prompt embeddings
|
||||||
|
dtype (`dtype`):
|
||||||
|
The data type of the prompt embeddings
|
||||||
|
prompt_embeds (`Tensor`):
|
||||||
|
The prompt embeddings. (batch-expanded)
|
||||||
|
prompt_embeds_mask (`Tensor`):
|
||||||
|
The encoder attention mask. (batch-expanded)
|
||||||
|
negative_prompt_embeds (`Tensor`):
|
||||||
|
The negative prompt embeddings. (batch-expanded)
|
||||||
|
negative_prompt_embeds_mask (`Tensor`):
|
||||||
|
The negative prompt embeddings mask. (batch-expanded)
|
||||||
|
image_height (`int`):
|
||||||
|
The image height calculated from the image latents dimension
|
||||||
|
image_width (`int`):
|
||||||
|
The image width calculated from the image latents dimension
|
||||||
|
height (`int`):
|
||||||
|
if not provided, updated to image height
|
||||||
|
width (`int`):
|
||||||
|
if not provided, updated to image width
|
||||||
|
image_latents (`Tensor`):
|
||||||
|
image latents used to guide the image generation. Can be generated from vae_encoder step. (patchified and
|
||||||
|
batch-expanded)
|
||||||
|
"""
|
||||||
|
|
||||||
model_name = "qwenimage-edit"
|
model_name = "qwenimage-edit"
|
||||||
block_classes = [
|
block_classes = [
|
||||||
QwenImageTextInputsStep(),
|
QwenImageTextInputsStep(),
|
||||||
QwenImageAdditionalInputsStep(image_latent_inputs=["image_latents"]),
|
QwenImageAdditionalInputsStep(),
|
||||||
]
|
]
|
||||||
block_names = ["text_inputs", "additional_inputs"]
|
block_names = ["text_inputs", "additional_inputs"]
|
||||||
|
|
||||||
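The "(batch-expanded)" qualifier used throughout these generated docstrings refers to repeating tensors along the batch dimension so every model input agrees on batch size. A minimal sketch of that convention (illustrative only; the block's actual implementation may differ):

    import torch

    def expand_for_batch(prompt_embeds: torch.Tensor, num_images_per_prompt: int) -> torch.Tensor:
        # Repeat each prompt's embedding num_images_per_prompt times along dim 0,
        # so downstream blocks see batch_size * num_images_per_prompt rows.
        return prompt_embeds.repeat_interleave(num_images_per_prompt, dim=0)
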
@@ -154,12 +290,71 @@ class QwenImageEditInputStep(SequentialPipelineBlocks):
     )


+# auto_docstring
 class QwenImageEditInpaintInputStep(SequentialPipelineBlocks):
+    """
+    Input step that prepares the inputs for the edit inpaint denoising step. It:
+        - make sure the text embeddings have consistent batch size as well as the additional inputs.
+        - update height/width based `image_latents`, patchify `image_latents`.
+
+    Components:
+        pachifier (`QwenImagePachifier`)
+
+    Inputs:
+        num_images_per_prompt (`int`, *optional*, defaults to 1):
+            The number of images to generate per prompt.
+        prompt_embeds (`Tensor`):
+            text embeddings used to guide the image generation. Can be generated from text_encoder step.
+        prompt_embeds_mask (`Tensor`):
+            mask for the text embeddings. Can be generated from text_encoder step.
+        negative_prompt_embeds (`Tensor`, *optional*):
+            negative text embeddings used to guide the image generation. Can be generated from text_encoder step.
+        negative_prompt_embeds_mask (`Tensor`, *optional*):
+            mask for the negative text embeddings. Can be generated from text_encoder step.
+        height (`int`, *optional*):
+            The height in pixels of the generated image.
+        width (`int`, *optional*):
+            The width in pixels of the generated image.
+        image_latents (`Tensor`):
+            image latents used to guide the image generation. Can be generated from vae_encoder step.
+        processed_mask_image (`Tensor`, *optional*):
+            The processed mask image
+
+    Outputs:
+        batch_size (`int`):
+            The batch size of the prompt embeddings
+        dtype (`dtype`):
+            The data type of the prompt embeddings
+        prompt_embeds (`Tensor`):
+            The prompt embeddings. (batch-expanded)
+        prompt_embeds_mask (`Tensor`):
+            The encoder attention mask. (batch-expanded)
+        negative_prompt_embeds (`Tensor`):
+            The negative prompt embeddings. (batch-expanded)
+        negative_prompt_embeds_mask (`Tensor`):
+            The negative prompt embeddings mask. (batch-expanded)
+        image_height (`int`):
+            The image height calculated from the image latents dimension
+        image_width (`int`):
+            The image width calculated from the image latents dimension
+        height (`int`):
+            if not provided, updated to image height
+        width (`int`):
+            if not provided, updated to image width
+        image_latents (`Tensor`):
+            image latents used to guide the image generation. Can be generated from vae_encoder step. (patchified and
+            batch-expanded)
+        processed_mask_image (`Tensor`):
+            The processed mask image (batch-expanded)
+    """
+
     model_name = "qwenimage-edit"
     block_classes = [
         QwenImageTextInputsStep(),
         QwenImageAdditionalInputsStep(
-            image_latent_inputs=["image_latents"], additional_batch_inputs=["processed_mask_image"]
+            additional_batch_inputs=[
+                InputParam(name="processed_mask_image", type_hint=torch.Tensor, description="The processed mask image")
+            ]
         ),
     ]
     block_names = ["text_inputs", "additional_inputs"]
@@ -174,7 +369,42 @@ class QwenImageEditInpaintInputStep(SequentialPipelineBlocks):


 # assemble prepare latents steps
+# auto_docstring
 class QwenImageEditInpaintPrepareLatentsStep(SequentialPipelineBlocks):
+    """
+    This step prepares the latents/image_latents and mask inputs for the edit inpainting denoising step. It:
+        - Add noise to the image latents to create the latents input for the denoiser.
+        - Create the patchified latents `mask` based on the processed mask image.
+
+    Components:
+        scheduler (`FlowMatchEulerDiscreteScheduler`) pachifier (`QwenImagePachifier`)
+
+    Inputs:
+        latents (`Tensor`):
+            The initial random noised, can be generated in prepare latent step.
+        image_latents (`Tensor`):
+            image latents used to guide the image generation. Can be generated from vae_encoder step. (Can be
+            generated from vae encoder and updated in input step.)
+        timesteps (`Tensor`):
+            The timesteps to use for the denoising process. Can be generated in set_timesteps step.
+        processed_mask_image (`Tensor`):
+            The processed mask to use for the inpainting process.
+        height (`int`):
+            The height in pixels of the generated image.
+        width (`int`):
+            The width in pixels of the generated image.
+        dtype (`dtype`, *optional*, defaults to torch.float32):
+            The dtype of the model inputs, can be generated in input step.
+
+    Outputs:
+        initial_noise (`Tensor`):
+            The initial random noised used for inpainting denoising.
+        latents (`Tensor`):
+            The scaled noisy latents to use for inpainting/image-to-image denoising.
+        mask (`Tensor`):
+            The mask to use for the inpainting process.
+    """
+
     model_name = "qwenimage-edit"
     block_classes = [QwenImagePrepareLatentsWithStrengthStep(), QwenImageCreateMaskLatentsStep()]
     block_names = ["add_noise_to_latents", "create_mask_latents"]
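The "add noise" half of this step corresponds to the usual flow-matching image-to-image start. A rough sketch using the scheduler named in the Components list (the block's exact call pattern is not shown in this diff, so treat this as an approximation):

    import torch
    from diffusers import FlowMatchEulerDiscreteScheduler

    scheduler = FlowMatchEulerDiscreteScheduler()
    scheduler.set_timesteps(50)

    image_latents = torch.randn(1, 16, 64, 64)   # stand-in for VAE-encoded image latents
    noise = torch.randn_like(image_latents)      # corresponds to the `initial_noise` output
    # Blend clean image latents toward noise at the starting timestep; with
    # strength < 1 the denoise loop then starts partway along the trajectory.
    latents = scheduler.scale_noise(image_latents, scheduler.timesteps[:1], noise)
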
@@ -189,7 +419,50 @@ class QwenImageEditInpaintPrepareLatentsStep(SequentialPipelineBlocks):


 # Qwen Image Edit (image2image) core denoise step
+# auto_docstring
 class QwenImageEditCoreDenoiseStep(SequentialPipelineBlocks):
+    """
+    Core denoising workflow for QwenImage-Edit edit (img2img) task.
+
+    Components:
+        pachifier (`QwenImagePachifier`) scheduler (`FlowMatchEulerDiscreteScheduler`) guider
+        (`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`)
+
+    Inputs:
+        num_images_per_prompt (`int`, *optional*, defaults to 1):
+            The number of images to generate per prompt.
+        prompt_embeds (`Tensor`):
+            text embeddings used to guide the image generation. Can be generated from text_encoder step.
+        prompt_embeds_mask (`Tensor`):
+            mask for the text embeddings. Can be generated from text_encoder step.
+        negative_prompt_embeds (`Tensor`, *optional*):
+            negative text embeddings used to guide the image generation. Can be generated from text_encoder step.
+        negative_prompt_embeds_mask (`Tensor`, *optional*):
+            mask for the negative text embeddings. Can be generated from text_encoder step.
+        height (`int`, *optional*):
+            The height in pixels of the generated image.
+        width (`int`, *optional*):
+            The width in pixels of the generated image.
+        image_latents (`Tensor`):
+            image latents used to guide the image generation. Can be generated from vae_encoder step.
+        latents (`Tensor`, *optional*):
+            Pre-generated noisy latents for image generation.
+        generator (`Generator`, *optional*):
+            Torch generator for deterministic generation.
+        num_inference_steps (`int`, *optional*, defaults to 50):
+            The number of denoising steps.
+        sigmas (`List`, *optional*):
+            Custom sigmas for the denoising process.
+        attention_kwargs (`Dict`, *optional*):
+            Additional kwargs for attention processors.
+        **denoiser_input_fields (`None`, *optional*):
+            conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
+
+    Outputs:
+        latents (`Tensor`):
+            Denoised latents.
+    """
+
     model_name = "qwenimage-edit"
     block_classes = [
         QwenImageEditInputStep(),
@@ -212,9 +485,62 @@ class QwenImageEditCoreDenoiseStep(SequentialPipelineBlocks):
     def description(self):
         return "Core denoising workflow for QwenImage-Edit edit (img2img) task."

+    @property
+    def outputs(self):
+        return [
+            OutputParam.template("latents"),
+        ]
+

 # Qwen Image Edit (inpainting) core denoise step
+# auto_docstring
 class QwenImageEditInpaintCoreDenoiseStep(SequentialPipelineBlocks):
+    """
+    Core denoising workflow for QwenImage-Edit edit inpaint task.
+
+    Components:
+        pachifier (`QwenImagePachifier`) scheduler (`FlowMatchEulerDiscreteScheduler`) guider
+        (`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`)
+
+    Inputs:
+        num_images_per_prompt (`int`, *optional*, defaults to 1):
+            The number of images to generate per prompt.
+        prompt_embeds (`Tensor`):
+            text embeddings used to guide the image generation. Can be generated from text_encoder step.
+        prompt_embeds_mask (`Tensor`):
+            mask for the text embeddings. Can be generated from text_encoder step.
+        negative_prompt_embeds (`Tensor`, *optional*):
+            negative text embeddings used to guide the image generation. Can be generated from text_encoder step.
+        negative_prompt_embeds_mask (`Tensor`, *optional*):
+            mask for the negative text embeddings. Can be generated from text_encoder step.
+        height (`int`, *optional*):
+            The height in pixels of the generated image.
+        width (`int`, *optional*):
+            The width in pixels of the generated image.
+        image_latents (`Tensor`):
+            image latents used to guide the image generation. Can be generated from vae_encoder step.
+        processed_mask_image (`Tensor`, *optional*):
+            The processed mask image
+        latents (`Tensor`, *optional*):
+            Pre-generated noisy latents for image generation.
+        generator (`Generator`, *optional*):
+            Torch generator for deterministic generation.
+        num_inference_steps (`int`, *optional*, defaults to 50):
+            The number of denoising steps.
+        sigmas (`List`, *optional*):
+            Custom sigmas for the denoising process.
+        strength (`float`, *optional*, defaults to 0.9):
+            Strength for img2img/inpainting.
+        attention_kwargs (`Dict`, *optional*):
+            Additional kwargs for attention processors.
+        **denoiser_input_fields (`None`, *optional*):
+            conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
+
+    Outputs:
+        latents (`Tensor`):
+            Denoised latents.
+    """
+
     model_name = "qwenimage-edit"
     block_classes = [
         QwenImageEditInpaintInputStep(),
@@ -239,6 +565,12 @@ class QwenImageEditInpaintCoreDenoiseStep(SequentialPipelineBlocks):
     def description(self):
         return "Core denoising workflow for QwenImage-Edit edit inpaint task."

+    @property
+    def outputs(self):
+        return [
+            OutputParam.template("latents"),
+        ]
+

 # Auto core denoise step for QwenImage Edit
 class QwenImageEditAutoCoreDenoiseStep(ConditionalPipelineBlocks):
@@ -267,6 +599,12 @@ class QwenImageEditAutoCoreDenoiseStep(ConditionalPipelineBlocks):
             "Supports edit (img2img) and edit inpainting tasks for QwenImage-Edit."
         )

+    @property
+    def outputs(self):
+        return [
+            OutputParam.template("latents"),
+        ]
+

 # ====================
 # 4. DECODE
@@ -274,7 +612,26 @@ class QwenImageEditAutoCoreDenoiseStep(ConditionalPipelineBlocks):


 # Decode step (standard)
+# auto_docstring
 class QwenImageEditDecodeStep(SequentialPipelineBlocks):
+    """
+    Decode step that decodes the latents to images and postprocess the generated image.
+
+    Components:
+        vae (`AutoencoderKLQwenImage`) image_processor (`VaeImageProcessor`)
+
+    Inputs:
+        latents (`Tensor`):
+            The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise
+            step.
+        output_type (`str`, *optional*, defaults to pil):
+            Output format: 'pil', 'np', 'pt'.
+
+    Outputs:
+        images (`List`):
+            Generated images. (tensor output of the vae decoder.)
+    """
+
     model_name = "qwenimage-edit"
     block_classes = [QwenImageDecoderStep(), QwenImageProcessImagesOutputStep()]
     block_names = ["decode", "postprocess"]
@@ -285,7 +642,30 @@ class QwenImageEditDecodeStep(SequentialPipelineBlocks):


 # Inpaint decode step
+# auto_docstring
 class QwenImageEditInpaintDecodeStep(SequentialPipelineBlocks):
+    """
+    Decode step that decodes the latents to images and postprocess the generated image, optionally apply the mask
+    overlay to the original image.
+
+    Components:
+        vae (`AutoencoderKLQwenImage`) image_mask_processor (`InpaintProcessor`)
+
+    Inputs:
+        latents (`Tensor`):
+            The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise
+            step.
+        output_type (`str`, *optional*, defaults to pil):
+            Output format: 'pil', 'np', 'pt'.
+        mask_overlay_kwargs (`Dict`, *optional*):
+            The kwargs for the postprocess step to apply the mask overlay. generated in
+            InpaintProcessImagesInputStep.
+
+    Outputs:
+        images (`List`):
+            Generated images. (tensor output of the vae decoder.)
+    """
+
     model_name = "qwenimage-edit"
     block_classes = [QwenImageDecoderStep(), QwenImageInpaintProcessImagesOutputStep()]
     block_names = ["decode", "postprocess"]
@@ -313,9 +693,7 @@ class QwenImageEditAutoDecodeStep(AutoPipelineBlocks):
     @property
     def outputs(self):
         return [
-            OutputParam(
-                name="latents", type_hint=torch.Tensor, description="The latents generated by the denoising step"
-            ),
+            OutputParam.template("latents"),
         ]

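Judging from the removed lines here and in the other outputs properties in this diff, `OutputParam.template(...)` is shorthand for the verbose declaration it replaces, looking up a canonical definition by name:

    # verbose form (removed)
    OutputParam(name="latents", type_hint=torch.Tensor, description="The latents generated by the denoising step")
    # template form (added)
    OutputParam.template("latents")
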
@@ -333,7 +711,66 @@ EDIT_AUTO_BLOCKS = InsertableDict(
 )


+# auto_docstring
 class QwenImageEditAutoBlocks(SequentialPipelineBlocks):
+    """
+    Auto Modular pipeline for edit (img2img) and edit inpaint tasks using QwenImage-Edit.
+        - for edit (img2img) generation, you need to provide `image`
+        - for edit inpainting, you need to provide `mask_image` and `image`, optionally you can provide
+          `padding_mask_crop`
+
+    Components:
+        image_resize_processor (`VaeImageProcessor`) text_encoder (`Qwen2_5_VLForConditionalGeneration`) processor
+        (`Qwen2VLProcessor`) guider (`ClassifierFreeGuidance`) image_mask_processor (`InpaintProcessor`) vae
+        (`AutoencoderKLQwenImage`) image_processor (`VaeImageProcessor`) pachifier (`QwenImagePachifier`) scheduler
+        (`FlowMatchEulerDiscreteScheduler`) transformer (`QwenImageTransformer2DModel`)
+
+    Inputs:
+        image (`Union[Image, List]`):
+            Reference image(s) for denoising. Can be a single image or list of images.
+        prompt (`str`):
+            The prompt or prompts to guide image generation.
+        negative_prompt (`str`, *optional*):
+            The prompt or prompts not to guide the image generation.
+        mask_image (`Image`, *optional*):
+            Mask image for inpainting.
+        padding_mask_crop (`int`, *optional*):
+            Padding for mask cropping in inpainting.
+        generator (`Generator`, *optional*):
+            Torch generator for deterministic generation.
+        num_images_per_prompt (`int`, *optional*, defaults to 1):
+            The number of images to generate per prompt.
+        height (`int`):
+            The height in pixels of the generated image.
+        width (`int`):
+            The width in pixels of the generated image.
+        image_latents (`Tensor`):
+            image latents used to guide the image generation. Can be generated from vae_encoder step.
+        processed_mask_image (`Tensor`, *optional*):
+            The processed mask image
+        latents (`Tensor`):
+            Pre-generated noisy latents for image generation.
+        num_inference_steps (`int`):
+            The number of denoising steps.
+        sigmas (`List`, *optional*):
+            Custom sigmas for the denoising process.
+        strength (`float`, *optional*, defaults to 0.9):
+            Strength for img2img/inpainting.
+        attention_kwargs (`Dict`, *optional*):
+            Additional kwargs for attention processors.
+        **denoiser_input_fields (`None`, *optional*):
+            conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
+        output_type (`str`, *optional*, defaults to pil):
+            Output format: 'pil', 'np', 'pt'.
+        mask_overlay_kwargs (`Dict`, *optional*):
+            The kwargs for the postprocess step to apply the mask overlay. generated in
+            InpaintProcessImagesInputStep.
+
+    Outputs:
+        images (`List`):
+            Generated images.
+    """
+
     model_name = "qwenimage-edit"
     block_classes = EDIT_AUTO_BLOCKS.values()
     block_names = EDIT_AUTO_BLOCKS.keys()
@@ -349,5 +786,5 @@ class QwenImageEditAutoBlocks(SequentialPipelineBlocks):
     @property
     def outputs(self):
         return [
-            OutputParam(name="images", type_hint=List[List[PIL.Image.Image]], description="The generated images"),
+            OutputParam.template("images"),
         ]

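For orientation, a hypothetical end-to-end sketch of driving these auto blocks; the repo id, loading calls, and URL below are assumptions for illustration, not part of this diff:

    import torch
    from diffusers.modular_pipelines import QwenImageEditAutoBlocks  # import path assumed
    from diffusers.utils import load_image

    blocks = QwenImageEditAutoBlocks()
    pipe = blocks.init_pipeline("Qwen/Qwen-Image-Edit")  # hypothetical repo id
    pipe.load_default_components(torch_dtype=torch.bfloat16)
    pipe.to("cuda")

    image = load_image("https://example.com/input.png")  # placeholder URL
    # edit (img2img): image + prompt; also pass mask_image for edit inpainting
    images = pipe(image=image, prompt="turn the cat into a tiger", output="images")
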
@@ -12,11 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-from typing import List
-
-import PIL.Image
-import torch
-
 from ...utils import logging
 from ..modular_pipeline import SequentialPipelineBlocks
 from ..modular_pipeline_utils import InsertableDict, OutputParam
@@ -53,12 +48,41 @@ logger = logging.get_logger(__name__)
 # ====================


+# auto_docstring
 class QwenImageEditPlusVLEncoderStep(SequentialPipelineBlocks):
-    """VL encoder that takes both image and text prompts. Uses 384x384 target area."""
+    """
+    QwenImage-Edit Plus VL encoder step that encodes the image and text prompts together.
+
+    Components:
+        image_resize_processor (`VaeImageProcessor`) text_encoder (`Qwen2_5_VLForConditionalGeneration`) processor
+        (`Qwen2VLProcessor`) guider (`ClassifierFreeGuidance`)
+
+    Inputs:
+        image (`Union[Image, List]`):
+            Reference image(s) for denoising. Can be a single image or list of images.
+        prompt (`str`):
+            The prompt or prompts to guide image generation.
+        negative_prompt (`str`, *optional*):
+            The prompt or prompts not to guide the image generation.
+
+    Outputs:
+        resized_image (`List`):
+            Images resized to 1024x1024 target area for VAE encoding
+        resized_cond_image (`List`):
+            Images resized to 384x384 target area for VL text encoding
+        prompt_embeds (`Tensor`):
+            The prompt embeddings.
+        prompt_embeds_mask (`Tensor`):
+            The encoder attention mask.
+        negative_prompt_embeds (`Tensor`):
+            The negative prompt embeddings.
+        negative_prompt_embeds_mask (`Tensor`):
+            The negative prompt embeddings mask.
+    """
+
     model_name = "qwenimage-edit-plus"
     block_classes = [
-        QwenImageEditPlusResizeStep(target_area=384 * 384, output_name="resized_cond_image"),
+        QwenImageEditPlusResizeStep(),
         QwenImageEditPlusTextEncoderStep(),
     ]
     block_names = ["resize", "encode"]
@@ -73,12 +97,36 @@ class QwenImageEditPlusVLEncoderStep(SequentialPipelineBlocks):
 # ====================


+# auto_docstring
 class QwenImageEditPlusVaeEncoderStep(SequentialPipelineBlocks):
-    """VAE encoder that handles multiple images with different sizes. Uses 1024x1024 target area."""
+    """
+    VAE encoder step that encodes image inputs into latent representations.
+    Each image is resized independently based on its own aspect ratio to 1024x1024 target area.
+
+    Components:
+        image_resize_processor (`VaeImageProcessor`) image_processor (`VaeImageProcessor`) vae
+        (`AutoencoderKLQwenImage`)
+
+    Inputs:
+        image (`Union[Image, List]`):
+            Reference image(s) for denoising. Can be a single image or list of images.
+        generator (`Generator`, *optional*):
+            Torch generator for deterministic generation.
+
+    Outputs:
+        resized_image (`List`):
+            Images resized to 1024x1024 target area for VAE encoding
+        resized_cond_image (`List`):
+            Images resized to 384x384 target area for VL text encoding
+        processed_image (`Tensor`):
+            The processed image
+        image_latents (`Tensor`):
+            The latent representation of the input image.
+    """
+
     model_name = "qwenimage-edit-plus"
     block_classes = [
-        QwenImageEditPlusResizeStep(target_area=1024 * 1024, output_name="resized_image"),
+        QwenImageEditPlusResizeStep(),
         QwenImageEditPlusProcessImagesInputStep(),
         QwenImageVaeEncoderStep(),
     ]
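The two target areas above (384x384 for the VL branch, 1024x1024 for the VAE branch) are areas, not fixed sizes: each image keeps its own aspect ratio. A sketch of that kind of resize rule (the step's exact rounding and size multiple are assumptions here):

    import math

    def resize_to_target_area(width: int, height: int, target_area: int = 1024 * 1024, multiple: int = 32):
        # Scale so that width * height is approximately target_area while preserving
        # aspect ratio, then snap both sides to a multiple for latent/patch alignment.
        scale = math.sqrt(target_area / (width * height))
        new_w = max(multiple, round(width * scale / multiple) * multiple)
        new_h = max(multiple, round(height * scale / multiple) * multiple)
        return new_w, new_h
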
@@ -98,11 +146,66 @@ class QwenImageEditPlusVaeEncoderStep(SequentialPipelineBlocks):


 # assemble input steps
+# auto_docstring
 class QwenImageEditPlusInputStep(SequentialPipelineBlocks):
+    """
+    Input step that prepares the inputs for the Edit Plus denoising step. It:
+        - Standardizes text embeddings batch size.
+        - Processes list of image latents: patchifies, concatenates along dim=1, expands batch.
+        - Outputs lists of image_height/image_width for RoPE calculation.
+        - Defaults height/width from last image in the list.
+
+    Components:
+        pachifier (`QwenImagePachifier`)
+
+    Inputs:
+        num_images_per_prompt (`int`, *optional*, defaults to 1):
+            The number of images to generate per prompt.
+        prompt_embeds (`Tensor`):
+            text embeddings used to guide the image generation. Can be generated from text_encoder step.
+        prompt_embeds_mask (`Tensor`):
+            mask for the text embeddings. Can be generated from text_encoder step.
+        negative_prompt_embeds (`Tensor`, *optional*):
+            negative text embeddings used to guide the image generation. Can be generated from text_encoder step.
+        negative_prompt_embeds_mask (`Tensor`, *optional*):
+            mask for the negative text embeddings. Can be generated from text_encoder step.
+        height (`int`, *optional*):
+            The height in pixels of the generated image.
+        width (`int`, *optional*):
+            The width in pixels of the generated image.
+        image_latents (`Tensor`):
+            image latents used to guide the image generation. Can be generated from vae_encoder step.
+
+    Outputs:
+        batch_size (`int`):
+            The batch size of the prompt embeddings
+        dtype (`dtype`):
+            The data type of the prompt embeddings
+        prompt_embeds (`Tensor`):
+            The prompt embeddings. (batch-expanded)
+        prompt_embeds_mask (`Tensor`):
+            The encoder attention mask. (batch-expanded)
+        negative_prompt_embeds (`Tensor`):
+            The negative prompt embeddings. (batch-expanded)
+        negative_prompt_embeds_mask (`Tensor`):
+            The negative prompt embeddings mask. (batch-expanded)
+        image_height (`List`):
+            The image heights calculated from the image latents dimension
+        image_width (`List`):
+            The image widths calculated from the image latents dimension
+        height (`int`):
+            if not provided, updated to image height
+        width (`int`):
+            if not provided, updated to image width
+        image_latents (`Tensor`):
+            image latents used to guide the image generation. Can be generated from vae_encoder step. (patchified,
+            concatenated, and batch-expanded)
+    """
+
     model_name = "qwenimage-edit-plus"
     block_classes = [
         QwenImageTextInputsStep(),
-        QwenImageEditPlusAdditionalInputsStep(image_latent_inputs=["image_latents"]),
+        QwenImageEditPlusAdditionalInputsStep(),
     ]
     block_names = ["text_inputs", "additional_inputs"]

@@ -118,7 +221,50 @@ class QwenImageEditPlusInputStep(SequentialPipelineBlocks):


 # Qwen Image Edit Plus (image2image) core denoise step
+# auto_docstring
 class QwenImageEditPlusCoreDenoiseStep(SequentialPipelineBlocks):
+    """
+    Core denoising workflow for QwenImage-Edit Plus edit (img2img) task.
+
+    Components:
+        pachifier (`QwenImagePachifier`) scheduler (`FlowMatchEulerDiscreteScheduler`) guider
+        (`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`)
+
+    Inputs:
+        num_images_per_prompt (`int`, *optional*, defaults to 1):
+            The number of images to generate per prompt.
+        prompt_embeds (`Tensor`):
+            text embeddings used to guide the image generation. Can be generated from text_encoder step.
+        prompt_embeds_mask (`Tensor`):
+            mask for the text embeddings. Can be generated from text_encoder step.
+        negative_prompt_embeds (`Tensor`, *optional*):
+            negative text embeddings used to guide the image generation. Can be generated from text_encoder step.
+        negative_prompt_embeds_mask (`Tensor`, *optional*):
+            mask for the negative text embeddings. Can be generated from text_encoder step.
+        height (`int`, *optional*):
+            The height in pixels of the generated image.
+        width (`int`, *optional*):
+            The width in pixels of the generated image.
+        image_latents (`Tensor`):
+            image latents used to guide the image generation. Can be generated from vae_encoder step.
+        latents (`Tensor`, *optional*):
+            Pre-generated noisy latents for image generation.
+        generator (`Generator`, *optional*):
+            Torch generator for deterministic generation.
+        num_inference_steps (`int`, *optional*, defaults to 50):
+            The number of denoising steps.
+        sigmas (`List`, *optional*):
+            Custom sigmas for the denoising process.
+        attention_kwargs (`Dict`, *optional*):
+            Additional kwargs for attention processors.
+        **denoiser_input_fields (`None`, *optional*):
+            conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
+
+    Outputs:
+        latents (`Tensor`):
+            Denoised latents.
+    """
+
     model_name = "qwenimage-edit-plus"
     block_classes = [
         QwenImageEditPlusInputStep(),
@@ -144,9 +290,7 @@ class QwenImageEditPlusCoreDenoiseStep(SequentialPipelineBlocks):
     @property
     def outputs(self):
         return [
-            OutputParam(
-                name="latents", type_hint=torch.Tensor, description="The latents generated by the denoising step"
-            ),
+            OutputParam.template("latents"),
         ]

@@ -155,7 +299,26 @@ class QwenImageEditPlusCoreDenoiseStep(SequentialPipelineBlocks):
 # ====================


+# auto_docstring
 class QwenImageEditPlusDecodeStep(SequentialPipelineBlocks):
+    """
+    Decode step that decodes the latents to images and postprocesses the generated image.
+
+    Components:
+        vae (`AutoencoderKLQwenImage`) image_processor (`VaeImageProcessor`)
+
+    Inputs:
+        latents (`Tensor`):
+            The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise
+            step.
+        output_type (`str`, *optional*, defaults to pil):
+            Output format: 'pil', 'np', 'pt'.
+
+    Outputs:
+        images (`List`):
+            Generated images. (tensor output of the vae decoder.)
+    """
+
     model_name = "qwenimage-edit-plus"
     block_classes = [QwenImageDecoderStep(), QwenImageProcessImagesOutputStep()]
     block_names = ["decode", "postprocess"]
@@ -179,7 +342,53 @@ EDIT_PLUS_AUTO_BLOCKS = InsertableDict(
 )


+# auto_docstring
 class QwenImageEditPlusAutoBlocks(SequentialPipelineBlocks):
+    """
+    Auto Modular pipeline for edit (img2img) tasks using QwenImage-Edit Plus.
+        - `image` is required input (can be single image or list of images).
+        - Each image is resized independently based on its own aspect ratio.
+        - VL encoder uses 384x384 target area, VAE encoder uses 1024x1024 target area.
+
+    Components:
+        image_resize_processor (`VaeImageProcessor`) text_encoder (`Qwen2_5_VLForConditionalGeneration`) processor
+        (`Qwen2VLProcessor`) guider (`ClassifierFreeGuidance`) image_processor (`VaeImageProcessor`) vae
+        (`AutoencoderKLQwenImage`) pachifier (`QwenImagePachifier`) scheduler (`FlowMatchEulerDiscreteScheduler`)
+        transformer (`QwenImageTransformer2DModel`)
+
+    Inputs:
+        image (`Union[Image, List]`):
+            Reference image(s) for denoising. Can be a single image or list of images.
+        prompt (`str`):
+            The prompt or prompts to guide image generation.
+        negative_prompt (`str`, *optional*):
+            The prompt or prompts not to guide the image generation.
+        generator (`Generator`, *optional*):
+            Torch generator for deterministic generation.
+        num_images_per_prompt (`int`, *optional*, defaults to 1):
+            The number of images to generate per prompt.
+        height (`int`, *optional*):
+            The height in pixels of the generated image.
+        width (`int`, *optional*):
+            The width in pixels of the generated image.
+        latents (`Tensor`, *optional*):
+            Pre-generated noisy latents for image generation.
+        num_inference_steps (`int`, *optional*, defaults to 50):
+            The number of denoising steps.
+        sigmas (`List`, *optional*):
+            Custom sigmas for the denoising process.
+        attention_kwargs (`Dict`, *optional*):
+            Additional kwargs for attention processors.
+        **denoiser_input_fields (`None`, *optional*):
+            conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
+        output_type (`str`, *optional*, defaults to pil):
+            Output format: 'pil', 'np', 'pt'.
+
+    Outputs:
+        images (`List`):
+            Generated images.
+    """
+
     model_name = "qwenimage-edit-plus"
     block_classes = EDIT_PLUS_AUTO_BLOCKS.values()
     block_names = EDIT_PLUS_AUTO_BLOCKS.keys()
@@ -196,5 +405,5 @@ class QwenImageEditPlusAutoBlocks(SequentialPipelineBlocks):
     @property
     def outputs(self):
         return [
-            OutputParam(name="images", type_hint=List[List[PIL.Image.Image]], description="The generated images"),
+            OutputParam.template("images"),
         ]

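Since Edit Plus accepts a list of reference images (each resized independently, with latents concatenated along dim=1 per the input-step docstring), a call might look like this sketch, assuming a pipeline built from QwenImageEditPlusAutoBlocks the same way as the earlier edit sketch:

    images = [load_image("left.png"), load_image("right.png")]  # placeholder paths
    out = pipe(image=images, prompt="blend the two scenes", output="images")
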
@@ -12,12 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-from typing import List
-
-import PIL.Image
-import torch
-
 from ...utils import logging
 from ..modular_pipeline import SequentialPipelineBlocks
 from ..modular_pipeline_utils import InsertableDict, OutputParam
@@ -55,8 +49,44 @@ logger = logging.get_logger(__name__)
 # ====================


+# auto_docstring
 class QwenImageLayeredTextEncoderStep(SequentialPipelineBlocks):
-    """Text encoder that takes text prompt, will generate a prompt based on image if not provided."""
+    """
+    QwenImage-Layered Text encoder step that encode the text prompt, will generate a prompt based on image if not
+    provided.
+
+    Components:
+        image_resize_processor (`VaeImageProcessor`) text_encoder (`Qwen2_5_VLForConditionalGeneration`) processor
+        (`Qwen2VLProcessor`) tokenizer (`Qwen2Tokenizer`): The tokenizer to use guider (`ClassifierFreeGuidance`)
+
+    Inputs:
+        image (`Union[Image, List]`):
+            Reference image(s) for denoising. Can be a single image or list of images.
+        resolution (`int`, *optional*, defaults to 640):
+            The target area to resize the image to, can be 1024 or 640
+        prompt (`str`, *optional*):
+            The prompt or prompts to guide image generation.
+        use_en_prompt (`bool`, *optional*, defaults to False):
+            Whether to use English prompt template
+        negative_prompt (`str`, *optional*):
+            The prompt or prompts not to guide the image generation.
+        max_sequence_length (`int`, *optional*, defaults to 1024):
+            Maximum sequence length for prompt encoding.
+
+    Outputs:
+        resized_image (`List`):
+            The resized images
+        prompt (`str`):
+            The prompt or prompts to guide image generation. If not provided, updated using image caption
+        prompt_embeds (`Tensor`):
+            The prompt embeddings.
+        prompt_embeds_mask (`Tensor`):
+            The encoder attention mask.
+        negative_prompt_embeds (`Tensor`):
+            The negative prompt embeddings.
+        negative_prompt_embeds_mask (`Tensor`):
+            The negative prompt embeddings mask.
+    """
+
     model_name = "qwenimage-layered"
     block_classes = [
@@ -77,7 +107,32 @@ class QwenImageLayeredTextEncoderStep(SequentialPipelineBlocks):


 # Edit VAE encoder
+# auto_docstring
 class QwenImageLayeredVaeEncoderStep(SequentialPipelineBlocks):
+    """
+    Vae encoder step that encode the image inputs into their latent representations.
+
+    Components:
+        image_resize_processor (`VaeImageProcessor`) image_processor (`VaeImageProcessor`) vae
+        (`AutoencoderKLQwenImage`)
+
+    Inputs:
+        image (`Union[Image, List]`):
+            Reference image(s) for denoising. Can be a single image or list of images.
+        resolution (`int`, *optional*, defaults to 640):
+            The target area to resize the image to, can be 1024 or 640
+        generator (`Generator`, *optional*):
+            Torch generator for deterministic generation.
+
+    Outputs:
+        resized_image (`List`):
+            The resized images
+        processed_image (`Tensor`):
+            The processed image
+        image_latents (`Tensor`):
+            The latent representation of the input image.
+    """
+
     model_name = "qwenimage-layered"
     block_classes = [
         QwenImageLayeredResizeStep(),
@@ -98,11 +153,60 @@ class QwenImageLayeredVaeEncoderStep(SequentialPipelineBlocks):


 # assemble input steps
+# auto_docstring
 class QwenImageLayeredInputStep(SequentialPipelineBlocks):
+    """
+    Input step that prepares the inputs for the layered denoising step. It:
+        - make sure the text embeddings have consistent batch size as well as the additional inputs.
+        - update height/width based `image_latents`, patchify `image_latents`.
+
+    Components:
+        pachifier (`QwenImageLayeredPachifier`)
+
+    Inputs:
+        num_images_per_prompt (`int`, *optional*, defaults to 1):
+            The number of images to generate per prompt.
+        prompt_embeds (`Tensor`):
+            text embeddings used to guide the image generation. Can be generated from text_encoder step.
+        prompt_embeds_mask (`Tensor`):
+            mask for the text embeddings. Can be generated from text_encoder step.
+        negative_prompt_embeds (`Tensor`, *optional*):
+            negative text embeddings used to guide the image generation. Can be generated from text_encoder step.
+        negative_prompt_embeds_mask (`Tensor`, *optional*):
+            mask for the negative text embeddings. Can be generated from text_encoder step.
+        image_latents (`Tensor`):
+            image latents used to guide the image generation. Can be generated from vae_encoder step.
+
+    Outputs:
+        batch_size (`int`):
+            The batch size of the prompt embeddings
+        dtype (`dtype`):
+            The data type of the prompt embeddings
+        prompt_embeds (`Tensor`):
+            The prompt embeddings. (batch-expanded)
+        prompt_embeds_mask (`Tensor`):
+            The encoder attention mask. (batch-expanded)
+        negative_prompt_embeds (`Tensor`):
+            The negative prompt embeddings. (batch-expanded)
+        negative_prompt_embeds_mask (`Tensor`):
+            The negative prompt embeddings mask. (batch-expanded)
+        image_height (`int`):
+            The image height calculated from the image latents dimension
+        image_width (`int`):
+            The image width calculated from the image latents dimension
+        height (`int`):
+            if not provided, updated to image height
+        width (`int`):
+            if not provided, updated to image width
+        image_latents (`Tensor`):
+            image latents used to guide the image generation. Can be generated from vae_encoder step. (patchified
+            with layered pachifier and batch-expanded)
+    """
+
     model_name = "qwenimage-layered"
     block_classes = [
         QwenImageTextInputsStep(),
-        QwenImageLayeredAdditionalInputsStep(image_latent_inputs=["image_latents"]),
+        QwenImageLayeredAdditionalInputsStep(),
     ]
     block_names = ["text_inputs", "additional_inputs"]

@@ -116,7 +220,48 @@ class QwenImageLayeredInputStep(SequentialPipelineBlocks):


 # Qwen Image Layered (image2image) core denoise step
+# auto_docstring
 class QwenImageLayeredCoreDenoiseStep(SequentialPipelineBlocks):
+    """
+    Core denoising workflow for QwenImage-Layered img2img task.
+
+    Components:
+        pachifier (`QwenImageLayeredPachifier`) scheduler (`FlowMatchEulerDiscreteScheduler`) guider
+        (`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`)
+
+    Inputs:
+        num_images_per_prompt (`int`, *optional*, defaults to 1):
+            The number of images to generate per prompt.
+        prompt_embeds (`Tensor`):
+            text embeddings used to guide the image generation. Can be generated from text_encoder step.
+        prompt_embeds_mask (`Tensor`):
+            mask for the text embeddings. Can be generated from text_encoder step.
+        negative_prompt_embeds (`Tensor`, *optional*):
+            negative text embeddings used to guide the image generation. Can be generated from text_encoder step.
+        negative_prompt_embeds_mask (`Tensor`, *optional*):
+            mask for the negative text embeddings. Can be generated from text_encoder step.
+        image_latents (`Tensor`):
+            image latents used to guide the image generation. Can be generated from vae_encoder step.
+        latents (`Tensor`, *optional*):
+            Pre-generated noisy latents for image generation.
+        layers (`int`, *optional*, defaults to 4):
+            Number of layers to extract from the image
+        generator (`Generator`, *optional*):
+            Torch generator for deterministic generation.
+        num_inference_steps (`int`, *optional*, defaults to 50):
+            The number of denoising steps.
+        sigmas (`List`, *optional*):
+            Custom sigmas for the denoising process.
+        attention_kwargs (`Dict`, *optional*):
+            Additional kwargs for attention processors.
+        **denoiser_input_fields (`None`, *optional*):
+            conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
+
+    Outputs:
+        latents (`Tensor`):
+            Denoised latents.
+    """
+
     model_name = "qwenimage-layered"
     block_classes = [
         QwenImageLayeredInputStep(),
@@ -142,9 +287,7 @@ class QwenImageLayeredCoreDenoiseStep(SequentialPipelineBlocks):
     @property
     def outputs(self):
         return [
-            OutputParam(
-                name="latents", type_hint=torch.Tensor, description="The latents generated by the denoising step"
-            ),
+            OutputParam.template("latents"),
         ]

@@ -162,7 +305,54 @@ LAYERED_AUTO_BLOCKS = InsertableDict(
 )


+# auto_docstring
 class QwenImageLayeredAutoBlocks(SequentialPipelineBlocks):
+    """
+    Auto Modular pipeline for layered denoising tasks using QwenImage-Layered.
+
+    Components:
+        image_resize_processor (`VaeImageProcessor`) text_encoder (`Qwen2_5_VLForConditionalGeneration`) processor
+        (`Qwen2VLProcessor`) tokenizer (`Qwen2Tokenizer`): The tokenizer to use guider (`ClassifierFreeGuidance`)
+        image_processor (`VaeImageProcessor`) vae (`AutoencoderKLQwenImage`) pachifier (`QwenImageLayeredPachifier`)
+        scheduler (`FlowMatchEulerDiscreteScheduler`) transformer (`QwenImageTransformer2DModel`)
+
+    Inputs:
+        image (`Union[Image, List]`):
+            Reference image(s) for denoising. Can be a single image or list of images.
+        resolution (`int`, *optional*, defaults to 640):
+            The target area to resize the image to, can be 1024 or 640
+        prompt (`str`, *optional*):
+            The prompt or prompts to guide image generation.
+        use_en_prompt (`bool`, *optional*, defaults to False):
+            Whether to use English prompt template
+        negative_prompt (`str`, *optional*):
+            The prompt or prompts not to guide the image generation.
+        max_sequence_length (`int`, *optional*, defaults to 1024):
+            Maximum sequence length for prompt encoding.
+        generator (`Generator`, *optional*):
+            Torch generator for deterministic generation.
+        num_images_per_prompt (`int`, *optional*, defaults to 1):
+            The number of images to generate per prompt.
+        latents (`Tensor`, *optional*):
+            Pre-generated noisy latents for image generation.
+        layers (`int`, *optional*, defaults to 4):
+            Number of layers to extract from the image
+        num_inference_steps (`int`, *optional*, defaults to 50):
+            The number of denoising steps.
+        sigmas (`List`, *optional*):
+            Custom sigmas for the denoising process.
+        attention_kwargs (`Dict`, *optional*):
+            Additional kwargs for attention processors.
+        **denoiser_input_fields (`None`, *optional*):
+            conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
+        output_type (`str`, *optional*, defaults to pil):
+            Output format: 'pil', 'np', 'pt'.
+
+    Outputs:
+        images (`List`):
+            Generated images.
+    """
+
     model_name = "qwenimage-layered"
     block_classes = LAYERED_AUTO_BLOCKS.values()
     block_names = LAYERED_AUTO_BLOCKS.keys()
@@ -174,5 +364,5 @@ class QwenImageLayeredAutoBlocks(SequentialPipelineBlocks):
     @property
     def outputs(self):
         return [
-            OutputParam(name="images", type_hint=List[List[PIL.Image.Image]], description="The generated images"),
+            OutputParam.template("images"),
         ]

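Relative to the edit pipelines, the layered auto blocks add a `layers` input (default 4) and can caption the image themselves when `prompt` is omitted; with a pipeline built and loaded as in the earlier sketches, a call might look like:

    out = pipe(image=image, resolution=640, layers=4, output="images")  # prompt optional
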
@@ -131,7 +131,7 @@ class ZImageLoopDenoiser(ModularPipelineBlocks):
         ),
         InputParam(
             kwargs_type="denoiser_input_fields",
-            description="conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.",
+            description="The conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.",
         ),
     ]
     guider_input_names = []
@@ -84,7 +84,6 @@ EXAMPLE_DOC_STRING = """
         >>> from diffusers import ControlNetModel, StableDiffusionXLControlNetImg2ImgPipeline, AutoencoderKL
         >>> from diffusers.utils import load_image

-
         >>> depth_estimator = DPTForDepthEstimation.from_pretrained("Intel/dpt-hybrid-midas").to("cuda")
         >>> feature_extractor = DPTImageProcessor.from_pretrained("Intel/dpt-hybrid-midas")
         >>> controlnet = ControlNetModel.from_pretrained(
@@ -53,7 +53,6 @@ EXAMPLE_DOC_STRING = """
         >>> from transformers import AutoTokenizer, LlamaForCausalLM
         >>> from diffusers import HiDreamImagePipeline

-
         >>> tokenizer_4 = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3.1-8B-Instruct")
         >>> text_encoder_4 = LlamaForCausalLM.from_pretrained(
         ...     "meta-llama/Meta-Llama-3.1-8B-Instruct",
@@ -85,7 +85,6 @@ EXAMPLE_DOC_STRING = """
         >>> from diffusers import ControlNetModel, StableDiffusionXLControlNetPAGImg2ImgPipeline, AutoencoderKL
         >>> from diffusers.utils import load_image

-
         >>> depth_estimator = DPTForDepthEstimation.from_pretrained("Intel/dpt-hybrid-midas").to("cuda")
         >>> feature_extractor = DPTFeatureExtractor.from_pretrained("Intel/dpt-hybrid-midas")
         >>> controlnet = ControlNetModel.from_pretrained(
@@ -459,7 +459,6 @@ class StableDiffusionLatentUpscalePipeline(DiffusionPipeline, StableDiffusionMix
         >>> from diffusers import StableDiffusionLatentUpscalePipeline, StableDiffusionPipeline
         >>> import torch

-
         >>> pipeline = StableDiffusionPipeline.from_pretrained(
         ...     "CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16
         ... )
300
utils/modular_auto_docstring.py
Normal file
300
utils/modular_auto_docstring.py
Normal file
@@ -0,0 +1,300 @@
|
|||||||
# coding=utf-8
# Copyright 2025 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Auto Docstring Generator for Modular Pipeline Blocks

This script scans Python files for classes that have a `# auto_docstring` comment above them
and inserts/updates the docstring from the class's `doc` property.

Run from the root of the repo:
    python utils/modular_auto_docstring.py [path] [--fix_and_overwrite]

Examples:
    # Check for auto_docstring markers (will error if found without proper docstring)
    python utils/modular_auto_docstring.py

    # Check specific directory
    python utils/modular_auto_docstring.py src/diffusers/modular_pipelines/

    # Fix and overwrite the docstrings
    python utils/modular_auto_docstring.py --fix_and_overwrite

Usage in code:
    # auto_docstring
    class QwenImageAutoVaeEncoderStep(AutoPipelineBlocks):
        # docstring will be automatically inserted here

        @property
        def doc(self):
            return "Your docstring content..."
"""
import argparse
import ast
import glob
import importlib
import os
import re
import sys


# All paths are set assuming you run this script from the root of the repo
DIFFUSERS_PATH = "src/diffusers"
REPO_PATH = "."

# Pattern to match the auto_docstring comment
AUTO_DOCSTRING_PATTERN = re.compile(r"^\s*#\s*auto_docstring\s*$")


def setup_diffusers_import():
    """Set up the import path to use the local diffusers module."""
    src_path = os.path.join(REPO_PATH, "src")
    if src_path not in sys.path:
        sys.path.insert(0, src_path)


def get_module_from_filepath(filepath: str) -> str:
    """Convert a filepath to a module name."""
    filepath = os.path.normpath(filepath)

    if filepath.startswith("src" + os.sep):
        filepath = filepath[4:]

    if filepath.endswith(".py"):
        filepath = filepath[:-3]

    module_name = filepath.replace(os.sep, ".")
    return module_name
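
# For example (path shown purely for illustration):
#   get_module_from_filepath("src/diffusers/modular_pipelines/modular_pipeline.py")
#   -> "diffusers.modular_pipelines.modular_pipeline"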


def load_module(filepath: str):
    """Load a module from filepath."""
    setup_diffusers_import()
    module_name = get_module_from_filepath(filepath)

    try:
        module = importlib.import_module(module_name)
        return module
    except Exception as e:
        print(f"Warning: Could not import module {module_name}: {e}")
        return None


def get_doc_from_class(module, class_name: str) -> str:
    """Get the doc property from an instantiated class."""
    if module is None:
        return None

    cls = getattr(module, class_name, None)
    if cls is None:
        return None

    try:
        instance = cls()
        if hasattr(instance, "doc"):
            return instance.doc
    except Exception as e:
        print(f"Warning: Could not instantiate {class_name}: {e}")

    return None


def find_auto_docstring_classes(filepath: str) -> list:
    """
    Find all classes in a file that have a # auto_docstring comment above them.

    Returns a list of (class_name, class_line_number, has_existing_docstring, docstring_end_line) tuples.
    """
    with open(filepath, "r", encoding="utf-8", newline="\n") as f:
        lines = f.readlines()

    # Parse AST to find class locations and their docstrings
    content = "".join(lines)
    try:
        tree = ast.parse(content)
    except SyntaxError as e:
        print(f"Syntax error in {filepath}: {e}")
        return []

    # Build a map of class_name -> (class_line, has_docstring, docstring_end_line)
    class_info = {}
    for node in ast.walk(tree):
        if isinstance(node, ast.ClassDef):
            has_docstring = False
            docstring_end_line = node.lineno  # default to class line

            if node.body and isinstance(node.body[0], ast.Expr):
                first_stmt = node.body[0]
                if isinstance(first_stmt.value, ast.Constant) and isinstance(first_stmt.value.value, str):
                    has_docstring = True
                    docstring_end_line = first_stmt.end_lineno or first_stmt.lineno

            class_info[node.name] = (node.lineno, has_docstring, docstring_end_line)

    # Now scan for # auto_docstring comments
    classes_to_update = []

    for i, line in enumerate(lines):
        if AUTO_DOCSTRING_PATTERN.match(line):
            # Found the marker, look for the class definition on the next non-empty, non-comment line
            j = i + 1
            while j < len(lines):
                next_line = lines[j].strip()
                if next_line and not next_line.startswith("#"):
                    break
                j += 1

            if j < len(lines) and lines[j].strip().startswith("class "):
                # Extract the class name
                match = re.match(r"class\s+(\w+)", lines[j].strip())
                if match:
                    class_name = match.group(1)
                    if class_name in class_info:
                        class_line, has_docstring, docstring_end_line = class_info[class_name]
                        classes_to_update.append((class_name, class_line, has_docstring, docstring_end_line))

    return classes_to_update


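# For reference, each tuple returned by find_auto_docstring_classes above has the form
# (class_name, class_line, has_existing_docstring, docstring_end_line), e.g. with
# illustrative values only:
#   ("QwenImageAutoVaeEncoderStep", 120, True, 135)

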
def strip_class_name_line(doc: str, class_name: str) -> str:
    """Remove the 'class ClassName' line from the doc if present."""
    lines = doc.strip().split("\n")
    if lines and lines[0].strip() == f"class {class_name}":
        # Remove the class line and any blank line following it
        lines = lines[1:]
        while lines and not lines[0].strip():
            lines = lines[1:]
    return "\n".join(lines)


def format_docstring(doc: str, indent: str = "    ") -> str:
    """Format a doc string as a properly indented docstring."""
    lines = doc.strip().split("\n")

    if len(lines) == 1:
        return f'{indent}"""{lines[0]}"""\n'
    else:
        result = [f'{indent}"""\n']
        for line in lines:
            if line.strip():
                result.append(f"{indent}{line}\n")
            else:
                result.append("\n")
        result.append(f'{indent}"""\n')
        return "".join(result)


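# For illustration (the doc text is made up):
#   format_docstring("Encode the prompt.\n\nExpects: prompt (str)")
# returns a 4-space-indented docstring:
#     """
#     Encode the prompt.
#
#     Expects: prompt (str)
#     """
# (blank input lines are kept as empty lines, without indentation).

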
def process_file(filepath: str, overwrite: bool = False) -> list:
    """
    Process a file and find/insert docstrings for # auto_docstring marked classes.

    Returns list of classes that need updating.
    """
    classes_to_update = find_auto_docstring_classes(filepath)

    if not classes_to_update:
        return []

    if not overwrite:
        # Just return the list of classes that need updating
        return [(filepath, cls_name, line) for cls_name, line, _, _ in classes_to_update]

    # Load the module to get doc properties
    module = load_module(filepath)

    with open(filepath, "r", encoding="utf-8", newline="\n") as f:
        lines = f.readlines()

    # Process in reverse order to maintain line numbers
    updated = False
    for class_name, class_line, has_docstring, docstring_end_line in reversed(classes_to_update):
        doc = get_doc_from_class(module, class_name)

        if doc is None:
            print(f"Warning: Could not get doc for {class_name} in {filepath}")
            continue

        # Remove the "class ClassName" line since it's redundant in a docstring
        doc = strip_class_name_line(doc, class_name)

        # Format the new docstring with 4-space indent
        new_docstring = format_docstring(doc, "    ")

        if has_docstring:
            # Replace existing docstring (line after class definition to docstring_end_line)
            # class_line is 1-indexed, we want to replace from class_line+1 to docstring_end_line
            lines = lines[:class_line] + [new_docstring] + lines[docstring_end_line:]
        else:
            # Insert new docstring right after class definition line
            # class_line is 1-indexed, so lines[class_line-1] is the class line
            # Insert at position class_line (which is right after the class line)
            lines = lines[:class_line] + [new_docstring] + lines[class_line:]

        updated = True
        print(f"Updated docstring for {class_name} in {filepath}")

    if updated:
        with open(filepath, "w", encoding="utf-8", newline="\n") as f:
            f.writelines(lines)

    return [(filepath, cls_name, line) for cls_name, line, _, _ in classes_to_update]


def check_auto_docstrings(path: str = None, overwrite: bool = False):
    """
    Check all files for # auto_docstring markers and optionally fix them.
    """
    if path is None:
        path = DIFFUSERS_PATH

    if os.path.isfile(path):
        all_files = [path]
    else:
        all_files = glob.glob(os.path.join(path, "**/*.py"), recursive=True)

    all_markers = []

    for filepath in all_files:
        markers = process_file(filepath, overwrite)
        all_markers.extend(markers)

    if not overwrite and len(all_markers) > 0:
        message = "\n".join([f"- {f}: {cls} at line {line}" for f, cls, line in all_markers])
        raise ValueError(
            f"Found the following # auto_docstring markers that need docstrings:\n{message}\n\n"
            f"Run `python utils/modular_auto_docstring.py --fix_and_overwrite` to fix them."
        )

    if overwrite and len(all_markers) > 0:
        print(f"\nUpdated {len(all_markers)} docstring(s).")
    elif len(all_markers) == 0:
        print("No # auto_docstring markers found.")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Check and fix # auto_docstring markers in modular pipeline blocks",
    )
    parser.add_argument("path", nargs="?", default=None, help="File or directory to process (default: src/diffusers)")
    parser.add_argument(
        "--fix_and_overwrite",
        action="store_true",
        help="Whether to fix the docstrings by inserting them from the doc property.",
    )

    args = parser.parse_args()

    check_auto_docstrings(args.path, args.fix_and_overwrite)
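
For illustration, a minimal sketch of the script's effect when run with --fix_and_overwrite (the block class, its doc text, and the file name below are made-up examples, not taken from this diff):

# before, in some_block.py: a marked class with no docstring
# auto_docstring
class MyExampleBlock(ModularPipelineBlocks):
    @property
    def doc(self):
        return "Run the example step.\n\nInputs: image (PIL.Image)"

# after `python utils/modular_auto_docstring.py some_block.py --fix_and_overwrite`,
# the formatted docstring is inserted directly under the class line:
# auto_docstring
class MyExampleBlock(ModularPipelineBlocks):
    """
    Run the example step.

    Inputs: image (PIL.Image)
    """
    @property
    def doc(self):
        return "Run the example step.\n\nInputs: image (PIL.Image)"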