Compare commits

..

15 Commits

Author SHA1 Message Date
yiyixuxu
1c90ce33f2 up 2026-01-10 12:21:26 +01:00
yiyixuxu
507953f415 more more 2026-01-10 12:19:14 +01:00
yiyixuxu
f0555af1c6 up up up 2026-01-10 12:15:53 +01:00
yiyixuxu
2a81f2ec54 style 2026-01-10 12:15:36 +01:00
yiyixuxu
d20f413f78 more auto docstring 2026-01-10 12:11:28 +01:00
yiyixuxu
ff09bf1a63 add modular_auto_docstring! 2026-01-10 11:55:03 +01:00
yiyixuxu
34a743e2dc style 2026-01-10 10:57:27 +01:00
yiyixuxu
43ab14845d update outputs 2026-01-10 10:56:54 +01:00
YiYi Xu
fbfe5c8d6b Merge branch 'main' into modular-doc-improv 2026-01-09 23:54:23 -10:00
YiYi Xu
418313bbf6 [Modular] better docstring (#12932)
add output to auto blocks + core denoising block for better doc string
2026-01-09 23:53:56 -10:00
yiyixuxu
b29873dee7 up up 2026-01-10 10:52:53 +01:00
Rafael Tvelov
2120c3096f Fix: typo in autoencoder_dc.py (#12687)
Fix typo in autoencoder_dc.py

Fixing typo in `get_block` function's parameter name:
"qkv_mutliscales" -> "qkv_multiscales"

Co-authored-by: YiYi Xu <yixu310@gmail.com>
2026-01-09 22:01:54 -10:00
Sayak Paul
ed6e5ecf67 [LoRA] add LoRA support to LTX-2 (#12933)
* up

* fixes

* tests

* docs.

* fix

* change loading info.

* up

* up
2026-01-10 11:27:22 +05:30
Sayak Paul
d44b5f86e6 fix how is_fsdp is determined (#12960)
up
2026-01-10 10:34:25 +05:30
yiyixuxu
7b499de6d0 up 2026-01-10 03:35:15 +01:00
27 changed files with 3469 additions and 154 deletions

View File

@@ -29,7 +29,7 @@ Cache methods speedup diffusion transformers by storing and reusing intermediate
[[autodoc]] apply_faster_cache
## FirstBlockCacheConfig
### FirstBlockCacheConfig
[[autodoc]] FirstBlockCacheConfig

View File

@@ -33,6 +33,7 @@ LoRA is a fast and lightweight training method that inserts and trains a signifi
- [`QwenImageLoraLoaderMixin`] provides similar functions for [Qwen Image](https://huggingface.co/docs/diffusers/main/en/api/pipelines/qwen).
- [`ZImageLoraLoaderMixin`] provides similar functions for [Z-Image](https://huggingface.co/docs/diffusers/main/en/api/pipelines/zimage).
- [`Flux2LoraLoaderMixin`] provides similar functions for [Flux2](https://huggingface.co/docs/diffusers/main/en/api/pipelines/flux2).
- [`LTX2LoraLoaderMixin`] provides similar functions for [LTX-2](https://huggingface.co/docs/diffusers/main/en/api/pipelines/ltx2).
- [`LoraBaseMixin`] provides a base class with several utility methods to fuse, unfuse, and unload LoRAs, and more.
> [!TIP]
@@ -62,6 +63,10 @@ LoRA is a fast and lightweight training method that inserts and trains a signifi
[[autodoc]] loaders.lora_pipeline.Flux2LoraLoaderMixin
## LTX2LoraLoaderMixin
[[autodoc]] loaders.lora_pipeline.LTX2LoraLoaderMixin
## CogVideoXLoraLoaderMixin
[[autodoc]] loaders.lora_pipeline.CogVideoXLoraLoaderMixin
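
A rough usage sketch for the newly listed `LTX2LoraLoaderMixin`; the repository IDs below are placeholders, not real checkpoints, and `load_lora_weights`/`fuse_lora` are the mixin methods added in this compare:

```py
# Sketch only: the repo IDs are placeholders, not real checkpoints.
import torch
from diffusers import LTX2Pipeline

pipe = LTX2Pipeline.from_pretrained("<ltx2-checkpoint>", torch_dtype=torch.bfloat16)
pipe.load_lora_weights("<ltx2-lora-repo-or-path>", adapter_name="my_lora")
pipe.fuse_lora(components=["transformer"], lora_scale=1.0)  # optional: merge LoRA into the base weights
```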

View File

@@ -14,6 +14,10 @@
# LTX-2
<div class="flex flex-wrap space-x-1">
<img alt="LoRA" src="https://img.shields.io/badge/LoRA-d8b4fe?style=flat"/>
</div>
LTX-2 is a DiT-based audio-video foundation model designed to generate synchronized video and audio within a single model. It brings together the core building blocks of modern video generation, with open weights and a focus on practical, local execution.
You can find all the original LTX-Video checkpoints under the [Lightricks](https://huggingface.co/Lightricks) organization.

View File

@@ -68,20 +68,6 @@ config = FasterCacheConfig(
pipeline.transformer.enable_cache(config)
```
## FirstBlockCache
[FirstBlock Cache](https://huggingface.co/docs/diffusers/main/en/api/cache#diffusers.FirstBlockCacheConfig) checks how much the early layers of the denoiser change from one timestep to the next. If the change is small, the model skips the expensive later layers and reuses the previous output.
```py
import torch
from diffusers import DiffusionPipeline
from diffusers.hooks import apply_first_block_cache, FirstBlockCacheConfig
pipeline = DiffusionPipeline.from_pretrained(
"Qwen/Qwen-Image", torch_dtype=torch.bfloat16
)
apply_first_block_cache(pipeline.transformer, FirstBlockCacheConfig(threshold=0.2))
```
## TaylorSeer Cache
[TaylorSeer Cache](https://huggingface.co/papers/2403.06923) accelerates diffusion inference by using Taylor series expansions to approximate and cache intermediate activations across denoising steps. The method predicts future outputs based on past computations, reusing them at specified intervals to reduce redundant calculations.
@@ -101,7 +87,8 @@ from diffusers import FluxPipeline, TaylorSeerCacheConfig
pipe = FluxPipeline.from_pretrained(
"black-forest-labs/FLUX.1-dev",
torch_dtype=torch.bfloat16,
).to("cuda")
)
pipe.to("cuda")
config = TaylorSeerCacheConfig(
cache_interval=5,
@@ -110,4 +97,4 @@ config = TaylorSeerCacheConfig(
taylor_factors_dtype=torch.bfloat16,
)
pipe.transformer.enable_cache(config)
```
```

View File

@@ -1228,7 +1228,7 @@ def main(args):
else {"device": accelerator.device, "dtype": weight_dtype}
)
is_fsdp = accelerator.state.fsdp_plugin is not None
is_fsdp = getattr(accelerator.state, "fsdp_plugin", None) is not None
if not is_fsdp:
transformer.to(**transformer_to_kwargs)

View File

@@ -1178,7 +1178,7 @@ def main(args):
else {"device": accelerator.device, "dtype": weight_dtype}
)
is_fsdp = accelerator.state.fsdp_plugin is not None
is_fsdp = getattr(accelerator.state, "fsdp_plugin", None) is not None
if not is_fsdp:
transformer.to(**transformer_to_kwargs)
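
The `getattr` guard above (applied in both training scripts) matters because some `accelerator.state` objects never define `fsdp_plugin`, so direct attribute access raises instead of returning `None`. A standalone sketch of the pattern, with a plain object standing in for the accelerate state:

```py
class _StateWithoutFSDP:
    pass  # stands in for an accelerator.state that never set fsdp_plugin

state = _StateWithoutFSDP()

# state.fsdp_plugin is not None                             -> would raise AttributeError
is_fsdp = getattr(state, "fsdp_plugin", None) is not None   # -> False, no exception
print(is_fsdp)
```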

View File

@@ -67,6 +67,7 @@ if is_torch_available():
"SD3LoraLoaderMixin",
"AuraFlowLoraLoaderMixin",
"StableDiffusionXLLoraLoaderMixin",
"LTX2LoraLoaderMixin",
"LTXVideoLoraLoaderMixin",
"LoraLoaderMixin",
"FluxLoraLoaderMixin",
@@ -121,6 +122,7 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
HunyuanVideoLoraLoaderMixin,
KandinskyLoraLoaderMixin,
LoraLoaderMixin,
LTX2LoraLoaderMixin,
LTXVideoLoraLoaderMixin,
Lumina2LoraLoaderMixin,
Mochi1LoraLoaderMixin,

View File

@@ -2140,6 +2140,54 @@ def _convert_non_diffusers_ltxv_lora_to_diffusers(state_dict, non_diffusers_pref
return converted_state_dict
def _convert_non_diffusers_ltx2_lora_to_diffusers(state_dict, non_diffusers_prefix="diffusion_model"):
# Remove the prefix
state_dict = {k: v for k, v in state_dict.items() if k.startswith(f"{non_diffusers_prefix}.")}
converted_state_dict = {k.removeprefix(f"{non_diffusers_prefix}."): v for k, v in state_dict.items()}
if non_diffusers_prefix == "diffusion_model":
rename_dict = {
"patchify_proj": "proj_in",
"audio_patchify_proj": "audio_proj_in",
"av_ca_video_scale_shift_adaln_single": "av_cross_attn_video_scale_shift",
"av_ca_a2v_gate_adaln_single": "av_cross_attn_video_a2v_gate",
"av_ca_audio_scale_shift_adaln_single": "av_cross_attn_audio_scale_shift",
"av_ca_v2a_gate_adaln_single": "av_cross_attn_audio_v2a_gate",
"scale_shift_table_a2v_ca_video": "video_a2v_cross_attn_scale_shift_table",
"scale_shift_table_a2v_ca_audio": "audio_a2v_cross_attn_scale_shift_table",
"q_norm": "norm_q",
"k_norm": "norm_k",
}
else:
rename_dict = {"aggregate_embed": "text_proj_in"}
# Apply renaming
renamed_state_dict = {}
for key, value in converted_state_dict.items():
new_key = key[:]
for old_pattern, new_pattern in rename_dict.items():
new_key = new_key.replace(old_pattern, new_pattern)
renamed_state_dict[new_key] = value
# Handle adaln_single -> time_embed and audio_adaln_single -> audio_time_embed
final_state_dict = {}
for key, value in renamed_state_dict.items():
if key.startswith("adaln_single."):
new_key = key.replace("adaln_single.", "time_embed.")
final_state_dict[new_key] = value
elif key.startswith("audio_adaln_single."):
new_key = key.replace("audio_adaln_single.", "audio_time_embed.")
final_state_dict[new_key] = value
else:
final_state_dict[key] = value
# Add transformer prefix
prefix = "transformer" if non_diffusers_prefix == "diffusion_model" else "connectors"
final_state_dict = {f"{prefix}.{k}": v for k, v in final_state_dict.items()}
return final_state_dict
def _convert_non_diffusers_qwen_lora_to_diffusers(state_dict):
has_diffusion_model = any(k.startswith("diffusion_model.") for k in state_dict)
if has_diffusion_model:

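A toy illustration of the key mapping performed by `_convert_non_diffusers_ltx2_lora_to_diffusers` above (it is an internal helper, so this only shows the renaming behavior, with dummy tensors as values):

```py
import torch
from diffusers.loaders.lora_conversion_utils import _convert_non_diffusers_ltx2_lora_to_diffusers

toy_state_dict = {
    "diffusion_model.patchify_proj.lora_A.weight": torch.zeros(4, 4),
    "diffusion_model.adaln_single.linear.lora_B.weight": torch.zeros(4, 4),
}
converted = _convert_non_diffusers_ltx2_lora_to_diffusers(toy_state_dict)
print(sorted(converted))
# ['transformer.proj_in.lora_A.weight', 'transformer.time_embed.linear.lora_B.weight']
```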
View File

@@ -48,6 +48,7 @@ from .lora_conversion_utils import (
_convert_non_diffusers_flux2_lora_to_diffusers,
_convert_non_diffusers_hidream_lora_to_diffusers,
_convert_non_diffusers_lora_to_diffusers,
_convert_non_diffusers_ltx2_lora_to_diffusers,
_convert_non_diffusers_ltxv_lora_to_diffusers,
_convert_non_diffusers_lumina2_lora_to_diffusers,
_convert_non_diffusers_qwen_lora_to_diffusers,
@@ -74,6 +75,7 @@ logger = logging.get_logger(__name__)
TEXT_ENCODER_NAME = "text_encoder"
UNET_NAME = "unet"
TRANSFORMER_NAME = "transformer"
LTX2_CONNECTOR_NAME = "connectors"
_MODULE_NAME_TO_ATTRIBUTE_MAP_FLUX = {"x_embedder": "in_channels"}
@@ -3011,6 +3013,233 @@ class LTXVideoLoraLoaderMixin(LoraBaseMixin):
super().unfuse_lora(components=components, **kwargs)
class LTX2LoraLoaderMixin(LoraBaseMixin):
r"""
Load LoRA layers into [`LTX2VideoTransformer3DModel`]. Specific to [`LTX2Pipeline`].
"""
_lora_loadable_modules = ["transformer", "connectors"]
transformer_name = TRANSFORMER_NAME
connectors_name = LTX2_CONNECTOR_NAME
@classmethod
@validate_hf_hub_args
def lora_state_dict(
cls,
pretrained_model_name_or_path_or_dict: Union[str, Dict[str, torch.Tensor]],
**kwargs,
):
r"""
See [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`] for more details.
"""
# Load the main state dict first which has the LoRA layers for either of
# transformer and text encoder or both.
cache_dir = kwargs.pop("cache_dir", None)
force_download = kwargs.pop("force_download", False)
proxies = kwargs.pop("proxies", None)
local_files_only = kwargs.pop("local_files_only", None)
token = kwargs.pop("token", None)
revision = kwargs.pop("revision", None)
subfolder = kwargs.pop("subfolder", None)
weight_name = kwargs.pop("weight_name", None)
use_safetensors = kwargs.pop("use_safetensors", None)
return_lora_metadata = kwargs.pop("return_lora_metadata", False)
allow_pickle = False
if use_safetensors is None:
use_safetensors = True
allow_pickle = True
user_agent = {"file_type": "attn_procs_weights", "framework": "pytorch"}
state_dict, metadata = _fetch_state_dict(
pretrained_model_name_or_path_or_dict=pretrained_model_name_or_path_or_dict,
weight_name=weight_name,
use_safetensors=use_safetensors,
local_files_only=local_files_only,
cache_dir=cache_dir,
force_download=force_download,
proxies=proxies,
token=token,
revision=revision,
subfolder=subfolder,
user_agent=user_agent,
allow_pickle=allow_pickle,
)
is_dora_scale_present = any("dora_scale" in k for k in state_dict)
if is_dora_scale_present:
warn_msg = "It seems like you are using a DoRA checkpoint that is not compatible in Diffusers at the moment. So, we are going to filter out the keys associated to 'dora_scale` from the state dict. If you think this is a mistake please open an issue https://github.com/huggingface/diffusers/issues/new."
logger.warning(warn_msg)
state_dict = {k: v for k, v in state_dict.items() if "dora_scale" not in k}
final_state_dict = state_dict
is_non_diffusers_format = any(k.startswith("diffusion_model.") for k in state_dict)
has_connector = any(k.startswith("text_embedding_projection.") for k in state_dict)
if is_non_diffusers_format:
final_state_dict = _convert_non_diffusers_ltx2_lora_to_diffusers(state_dict)
if has_connector:
connectors_state_dict = _convert_non_diffusers_ltx2_lora_to_diffusers(
state_dict, "text_embedding_projection"
)
final_state_dict.update(connectors_state_dict)
out = (final_state_dict, metadata) if return_lora_metadata else final_state_dict
return out
def load_lora_weights(
self,
pretrained_model_name_or_path_or_dict: Union[str, Dict[str, torch.Tensor]],
adapter_name: Optional[str] = None,
hotswap: bool = False,
**kwargs,
):
"""
See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for more details.
"""
if not USE_PEFT_BACKEND:
raise ValueError("PEFT backend is required for this method.")
low_cpu_mem_usage = kwargs.pop("low_cpu_mem_usage", _LOW_CPU_MEM_USAGE_DEFAULT_LORA)
if low_cpu_mem_usage and is_peft_version("<", "0.13.0"):
raise ValueError(
"`low_cpu_mem_usage=True` is not compatible with this `peft` version. Please update it with `pip install -U peft`."
)
# if a dict is passed, copy it instead of modifying it inplace
if isinstance(pretrained_model_name_or_path_or_dict, dict):
pretrained_model_name_or_path_or_dict = pretrained_model_name_or_path_or_dict.copy()
# First, ensure that the checkpoint is a compatible one and can be successfully loaded.
kwargs["return_lora_metadata"] = True
state_dict, metadata = self.lora_state_dict(pretrained_model_name_or_path_or_dict, **kwargs)
is_correct_format = all("lora" in key for key in state_dict.keys())
if not is_correct_format:
raise ValueError("Invalid LoRA checkpoint.")
transformer_peft_state_dict = {
k: v for k, v in state_dict.items() if k.startswith(f"{self.transformer_name}.")
}
connectors_peft_state_dict = {k: v for k, v in state_dict.items() if k.startswith(f"{self.connectors_name}.")}
self.load_lora_into_transformer(
transformer_peft_state_dict,
transformer=getattr(self, self.transformer_name) if not hasattr(self, "transformer") else self.transformer,
adapter_name=adapter_name,
metadata=metadata,
_pipeline=self,
low_cpu_mem_usage=low_cpu_mem_usage,
hotswap=hotswap,
)
if connectors_peft_state_dict:
self.load_lora_into_transformer(
connectors_peft_state_dict,
transformer=getattr(self, self.connectors_name)
if not hasattr(self, "connectors")
else self.connectors,
adapter_name=adapter_name,
metadata=metadata,
_pipeline=self,
low_cpu_mem_usage=low_cpu_mem_usage,
hotswap=hotswap,
prefix=self.connectors_name,
)
@classmethod
def load_lora_into_transformer(
cls,
state_dict,
transformer,
adapter_name=None,
_pipeline=None,
low_cpu_mem_usage=False,
hotswap: bool = False,
metadata=None,
prefix: str = "transformer",
):
"""
See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_into_unet`] for more details.
"""
if low_cpu_mem_usage and is_peft_version("<", "0.13.0"):
raise ValueError(
"`low_cpu_mem_usage=True` is not compatible with this `peft` version. Please update it with `pip install -U peft`."
)
# Load the layers corresponding to transformer.
logger.info(f"Loading {prefix}.")
transformer.load_lora_adapter(
state_dict,
network_alphas=None,
adapter_name=adapter_name,
metadata=metadata,
_pipeline=_pipeline,
low_cpu_mem_usage=low_cpu_mem_usage,
hotswap=hotswap,
prefix=prefix,
)
@classmethod
# Copied from diffusers.loaders.lora_pipeline.CogVideoXLoraLoaderMixin.save_lora_weights
def save_lora_weights(
cls,
save_directory: Union[str, os.PathLike],
transformer_lora_layers: Dict[str, Union[torch.nn.Module, torch.Tensor]] = None,
is_main_process: bool = True,
weight_name: str = None,
save_function: Callable = None,
safe_serialization: bool = True,
transformer_lora_adapter_metadata: Optional[dict] = None,
):
r"""
See [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for more information.
"""
lora_layers = {}
lora_metadata = {}
if transformer_lora_layers:
lora_layers[cls.transformer_name] = transformer_lora_layers
lora_metadata[cls.transformer_name] = transformer_lora_adapter_metadata
if not lora_layers:
raise ValueError("You must pass at least one of `transformer_lora_layers` or `text_encoder_lora_layers`.")
cls._save_lora_weights(
save_directory=save_directory,
lora_layers=lora_layers,
lora_metadata=lora_metadata,
is_main_process=is_main_process,
weight_name=weight_name,
save_function=save_function,
safe_serialization=safe_serialization,
)
# Copied from diffusers.loaders.lora_pipeline.CogVideoXLoraLoaderMixin.fuse_lora
def fuse_lora(
self,
components: List[str] = ["transformer"],
lora_scale: float = 1.0,
safe_fusing: bool = False,
adapter_names: Optional[List[str]] = None,
**kwargs,
):
r"""
See [`~loaders.StableDiffusionLoraLoaderMixin.fuse_lora`] for more details.
"""
super().fuse_lora(
components=components,
lora_scale=lora_scale,
safe_fusing=safe_fusing,
adapter_names=adapter_names,
**kwargs,
)
# Copied from diffusers.loaders.lora_pipeline.CogVideoXLoraLoaderMixin.unfuse_lora
def unfuse_lora(self, components: List[str] = ["transformer"], **kwargs):
r"""
See [`~loaders.StableDiffusionLoraLoaderMixin.unfuse_lora`] for more details.
"""
super().unfuse_lora(components=components, **kwargs)
class SanaLoraLoaderMixin(LoraBaseMixin):
r"""
Load LoRA layers into [`SanaTransformer2DModel`]. Specific to [`SanaPipeline`].

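The new mixin also handles the LTX-2 text connectors: `lora_state_dict` converts original-format checkpoints and prefixes keys with `transformer.` or `connectors.`, and `load_lora_weights` routes each group to the matching module. A quick inspection sketch (the path below is a placeholder):

```py
from diffusers import LTX2Pipeline

# Placeholder path/repo for an LTX-2 LoRA in the original (non-diffusers) format.
state_dict = LTX2Pipeline.lora_state_dict("<path-or-repo-with-ltx2-lora>")
transformer_keys = [k for k in state_dict if k.startswith("transformer.")]
connector_keys = [k for k in state_dict if k.startswith("connectors.")]
print(len(transformer_keys), len(connector_keys))
```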
View File

@@ -67,6 +67,8 @@ _SET_ADAPTER_SCALE_FN_MAPPING = {
"QwenImageTransformer2DModel": lambda model_cls, weights: weights,
"Flux2Transformer2DModel": lambda model_cls, weights: weights,
"ZImageTransformer2DModel": lambda model_cls, weights: weights,
"LTX2VideoTransformer3DModel": lambda model_cls, weights: weights,
"LTX2TextConnectors": lambda model_cls, weights: weights,
}
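
With the identity scale functions registered above for `LTX2VideoTransformer3DModel` and `LTX2TextConnectors`, per-adapter weights passed to `set_adapters` are forwarded unchanged. Sketch with placeholder IDs:

```py
import torch
from diffusers import LTX2Pipeline

pipe = LTX2Pipeline.from_pretrained("<ltx2-checkpoint>", torch_dtype=torch.bfloat16)
pipe.load_lora_weights("<lora-a>", adapter_name="a")
pipe.load_lora_weights("<lora-b>", adapter_name="b")
pipe.set_adapters(["a", "b"], adapter_weights=[1.0, 0.6])  # weights pass through as-is
```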

View File

@@ -102,14 +102,14 @@ def get_block(
attention_head_dim: int,
norm_type: str,
act_fn: str,
qkv_mutliscales: Tuple[int, ...] = (),
qkv_multiscales: Tuple[int, ...] = (),
):
if block_type == "ResBlock":
block = ResBlock(in_channels, out_channels, norm_type, act_fn)
elif block_type == "EfficientViTBlock":
block = EfficientViTBlock(
in_channels, attention_head_dim=attention_head_dim, norm_type=norm_type, qkv_multiscales=qkv_mutliscales
in_channels, attention_head_dim=attention_head_dim, norm_type=norm_type, qkv_multiscales=qkv_multiscales
)
else:
@@ -247,7 +247,7 @@ class Encoder(nn.Module):
attention_head_dim=attention_head_dim,
norm_type="rms_norm",
act_fn="silu",
qkv_mutliscales=qkv_multiscales[i],
qkv_multiscales=qkv_multiscales[i],
)
down_block_list.append(block)
@@ -339,7 +339,7 @@ class Decoder(nn.Module):
attention_head_dim=attention_head_dim,
norm_type=norm_type[i],
act_fn=act_fn[i],
qkv_mutliscales=qkv_multiscales[i],
qkv_multiscales=qkv_multiscales[i],
)
up_block_list.append(block)

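The rename is not cosmetic: keyword arguments must match the parameter name exactly, so a caller spelling the corrected `qkv_multiscales` against the old signature would fail. A toy illustration (not the diffusers code itself):

```py
def get_block_fixed(qkv_multiscales=()):  # corrected spelling, as in this diff
    return qkv_multiscales

print(get_block_fixed(qkv_multiscales=(5,)))   # works: (5,)
# get_block_fixed(qkv_mutliscales=(5,))        # TypeError: unexpected keyword argument
```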
View File

@@ -41,11 +41,9 @@ class CacheMixin:
Enable caching techniques on the model.
Args:
config (`Union[PyramidAttentionBroadcastConfig, FasterCacheConfig, FirstBlockCacheConfig]`):
config (`Union[PyramidAttentionBroadcastConfig]`):
The configuration for applying the caching technique. Currently supported caching techniques are:
- [`~hooks.PyramidAttentionBroadcastConfig`]
- [`~hooks.FasterCacheConfig`]
- [`~hooks.FirstBlockCacheConfig`]
Example:

View File

@@ -18,6 +18,7 @@ from collections import OrderedDict
from dataclasses import dataclass, field, fields
from typing import Any, Dict, List, Literal, Optional, Type, Union
import PIL.Image
import torch
from ..configuration_utils import ConfigMixin, FrozenDict
@@ -342,6 +343,185 @@ class InputParam:
def __repr__(self):
return f"<{self.name}: {'required' if self.required else 'optional'}, default={self.default}>"
@classmethod
def template(cls, name: str) -> Optional["InputParam"]:
"""Get template for name if exists, otherwise None."""
if hasattr(cls, name) and callable(getattr(cls, name)):
return getattr(cls, name)()
return None
# ======================================================
# InputParam templates
# ======================================================
@classmethod
def prompt(cls) -> "InputParam":
return cls(
name="prompt", type_hint=str, required=True, description="The prompt or prompts to guide image generation."
)
@classmethod
def negative_prompt(cls) -> "InputParam":
return cls(
name="negative_prompt",
type_hint=str,
default=None,
description="The prompt or prompts not to guide the image generation.",
)
@classmethod
def max_sequence_length(cls, default: int = 512) -> "InputParam":
return cls(
name="max_sequence_length",
type_hint=int,
default=default,
description="Maximum sequence length for prompt encoding.",
)
@classmethod
def height(cls, default: Optional[int] = None) -> "InputParam":
return cls(
name="height", type_hint=int, default=default, description="The height in pixels of the generated image."
)
@classmethod
def width(cls, default: Optional[int] = None) -> "InputParam":
return cls(
name="width", type_hint=int, default=default, description="The width in pixels of the generated image."
)
@classmethod
def num_inference_steps(cls, default: int = 50) -> "InputParam":
return cls(
name="num_inference_steps", type_hint=int, default=default, description="The number of denoising steps."
)
@classmethod
def num_images_per_prompt(cls, default: int = 1) -> "InputParam":
return cls(
name="num_images_per_prompt",
type_hint=int,
default=default,
description="The number of images to generate per prompt.",
)
@classmethod
def generator(cls) -> "InputParam":
return cls(
name="generator",
type_hint=torch.Generator,
default=None,
description="Torch generator for deterministic generation.",
)
@classmethod
def sigmas(cls) -> "InputParam":
return cls(
name="sigmas", type_hint=List[float], default=None, description="Custom sigmas for the denoising process."
)
@classmethod
def strength(cls, default: float = 0.9) -> "InputParam":
return cls(name="strength", type_hint=float, default=default, description="Strength for img2img/inpainting.")
# images
@classmethod
def image(cls) -> "InputParam":
return cls(
name="image",
type_hint=PIL.Image.Image,
required=True,
description="Input image for img2img, editing, or conditioning.",
)
@classmethod
def mask_image(cls) -> "InputParam":
return cls(
name="mask_image", type_hint=PIL.Image.Image, required=True, description="Mask image for inpainting."
)
@classmethod
def control_image(cls) -> "InputParam":
return cls(
name="control_image",
type_hint=PIL.Image.Image,
required=True,
description="Control image for ControlNet conditioning.",
)
@classmethod
def padding_mask_crop(cls) -> "InputParam":
return cls(
name="padding_mask_crop",
type_hint=int,
default=None,
description="Padding for mask cropping in inpainting.",
)
@classmethod
def latents(cls) -> "InputParam":
return cls(
name="latents",
type_hint=torch.Tensor,
default=None,
description="Pre-generated noisy latents for image generation.",
)
@classmethod
def timesteps(cls) -> "InputParam":
return cls(
name="timesteps", type_hint=torch.Tensor, default=None, description="Timesteps for the denoising process."
)
@classmethod
def output_type(cls) -> "InputParam":
return cls(name="output_type", type_hint=str, default="pil", description="Output format: 'pil', 'np', 'pt''.")
@classmethod
def attention_kwargs(cls) -> "InputParam":
return cls(
name="attention_kwargs",
type_hint=Dict[str, Any],
default=None,
description="Additional kwargs for attention processors.",
)
@classmethod
def denoiser_input_fields(cls) -> "InputParam":
return cls(
kwargs_type="denoiser_input_fields",
type_hint=torch.Tensor,
description="conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.",
)
# ControlNet
@classmethod
def control_guidance_start(cls, default: float = 0.0) -> "InputParam":
return cls(
name="control_guidance_start",
type_hint=float,
default=default,
description="When to start applying ControlNet.",
)
@classmethod
def control_guidance_end(cls, default: float = 1.0) -> "InputParam":
return cls(
name="control_guidance_end",
type_hint=float,
default=default,
description="When to stop applying ControlNet.",
)
@classmethod
def controlnet_conditioning_scale(cls, default: float = 1.0) -> "InputParam":
return cls(
name="controlnet_conditioning_scale",
type_hint=float,
default=default,
description="Scale for ControlNet conditioning.",
)
@dataclass
class OutputParam:
@@ -357,6 +537,25 @@ class OutputParam:
f"<{self.name}: {self.type_hint.__name__ if hasattr(self.type_hint, '__name__') else str(self.type_hint)}>"
)
@classmethod
def template(cls, name: str) -> Optional["OutputParam"]:
"""Get template for name if exists, otherwise None."""
if hasattr(cls, name) and callable(getattr(cls, name)):
return getattr(cls, name)()
return None
# ======================================================
# OutputParam templates
# ======================================================
@classmethod
def images(cls) -> "OutputParam":
return cls(name="images", type_hint=List[PIL.Image.Image], description="Generated images.")
@classmethod
def latents(cls) -> "OutputParam":
return cls(name="latents", type_hint=torch.Tensor, description="Denoised latents.")
def format_inputs_short(inputs):
"""

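A short sketch of the template API added above; the import path is an assumption based on the module layout in this diff:

```py
from diffusers.modular_pipelines.modular_pipeline_utils import InputParam, OutputParam

print(InputParam.num_inference_steps())       # <num_inference_steps: optional, default=50>
print(InputParam.max_sequence_length(1024))   # templates can override their defaults
print(InputParam.template("generator"))       # resolves to InputParam.generator()
print(InputParam.template("not_a_template"))  # None -> callers fall back to a plain InputParam(...)

images_out = OutputParam.images()             # pre-filled "Generated images." output param
```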
View File

@@ -134,11 +134,11 @@ class QwenImagePrepareLatentsStep(ModularPipelineBlocks):
@property
def inputs(self) -> List[InputParam]:
return [
InputParam("latents"),
InputParam(name="height"),
InputParam(name="width"),
InputParam(name="num_images_per_prompt", default=1),
InputParam(name="generator"),
InputParam.latents(),
InputParam.height(),
InputParam.width(),
InputParam.num_images_per_prompt(),
InputParam.generator(),
InputParam(
name="batch_size",
required=True,
@@ -225,12 +225,14 @@ class QwenImageLayeredPrepareLatentsStep(ModularPipelineBlocks):
@property
def inputs(self) -> List[InputParam]:
return [
InputParam("latents"),
InputParam(name="height"),
InputParam(name="width"),
InputParam(name="layers", default=4),
InputParam(name="num_images_per_prompt", default=1),
InputParam(name="generator"),
InputParam.latents(),
InputParam.height(),
InputParam.width(),
InputParam(
name="layers", type_hint=int, default=4, description="Number of layers to extract from the image"
),
InputParam.num_images_per_prompt(),
InputParam.generator(),
InputParam(
name="batch_size",
required=True,
@@ -466,8 +468,8 @@ class QwenImageSetTimestepsStep(ModularPipelineBlocks):
@property
def inputs(self) -> List[InputParam]:
return [
InputParam(name="num_inference_steps", default=50),
InputParam(name="sigmas"),
InputParam.num_inference_steps(),
InputParam.sigmas(),
InputParam(
name="latents",
required=True,
@@ -532,8 +534,8 @@ class QwenImageLayeredSetTimestepsStep(ModularPipelineBlocks):
@property
def inputs(self) -> List[InputParam]:
return [
InputParam("num_inference_steps", default=50, type_hint=int),
InputParam("sigmas", type_hint=List[float]),
InputParam.num_inference_steps(),
InputParam.sigmas(),
InputParam("image_latents", required=True, type_hint=torch.Tensor),
]
@@ -590,15 +592,15 @@ class QwenImageSetTimestepsWithStrengthStep(ModularPipelineBlocks):
@property
def inputs(self) -> List[InputParam]:
return [
InputParam(name="num_inference_steps", default=50),
InputParam(name="sigmas"),
InputParam.num_inference_steps(),
InputParam.sigmas(),
InputParam(
name="latents",
required=True,
type_hint=torch.Tensor,
description="The latents to use for the denoising process, used to calculate the image sequence length.",
),
InputParam(name="strength", default=0.9),
InputParam.strength(0.9),
]
@property
@@ -886,7 +888,7 @@ class QwenImageLayeredRoPEInputsStep(ModularPipelineBlocks):
def inputs(self) -> List[InputParam]:
return [
InputParam(name="batch_size", required=True),
InputParam(name="layers", required=True),
InputParam(name="layers", default=4, description="Number of layers to extract from the image"),
InputParam(name="height", required=True),
InputParam(name="width", required=True),
InputParam(name="prompt_embeds_mask"),
@@ -971,9 +973,9 @@ class QwenImageControlNetBeforeDenoiserStep(ModularPipelineBlocks):
@property
def inputs(self) -> List[InputParam]:
return [
InputParam("control_guidance_start", default=0.0),
InputParam("control_guidance_end", default=1.0),
InputParam("controlnet_conditioning_scale", default=1.0),
InputParam.control_guidance_start(),
InputParam.control_guidance_end(),
InputParam.controlnet_conditioning_scale(),
InputParam("control_image_latents", required=True),
InputParam(
"timesteps",

View File

@@ -12,10 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List, Union
from typing import List
import numpy as np
import PIL
import torch
from ...configuration_utils import FrozenDict
@@ -91,7 +89,7 @@ class QwenImageLayeredAfterDenoiseStep(ModularPipelineBlocks):
InputParam("latents", required=True, type_hint=torch.Tensor),
InputParam("height", required=True, type_hint=int),
InputParam("width", required=True, type_hint=int),
InputParam("layers", required=True, type_hint=int),
InputParam("layers", default=4, description="Number of layers to extract from the image"),
]
@torch.no_grad()
@@ -140,13 +138,7 @@ class QwenImageDecoderStep(ModularPipelineBlocks):
@property
def intermediate_outputs(self) -> List[str]:
return [
OutputParam(
"images",
type_hint=Union[List[PIL.Image.Image], List[torch.Tensor], List[np.array]],
description="The generated images, can be a PIL.Image.Image, torch.Tensor or a numpy array",
)
]
return [OutputParam.images()]
@torch.no_grad()
def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState:
@@ -198,14 +190,19 @@ class QwenImageLayeredDecoderStep(ModularPipelineBlocks):
@property
def inputs(self) -> List[InputParam]:
return [
InputParam("latents", required=True, type_hint=torch.Tensor),
InputParam("output_type", default="pil", type_hint=str),
InputParam(
"latents",
required=True,
type_hint=torch.Tensor,
description="The latents to decode, can be generated in the denoise step",
),
InputParam.output_type(),
]
@property
def intermediate_outputs(self) -> List[OutputParam]:
return [
OutputParam(name="images", type_hint=List[List[PIL.Image.Image]]),
OutputParam.images(),
]
@torch.no_grad()
@@ -273,12 +270,7 @@ class QwenImageProcessImagesOutputStep(ModularPipelineBlocks):
def inputs(self) -> List[InputParam]:
return [
InputParam("images", required=True, description="the generated image from decoders step"),
InputParam(
name="output_type",
default="pil",
type_hint=str,
description="The type of the output images, can be 'pil', 'np', 'pt'",
),
InputParam.output_type(),
]
@staticmethod
@@ -323,12 +315,7 @@ class QwenImageInpaintProcessImagesOutputStep(ModularPipelineBlocks):
def inputs(self) -> List[InputParam]:
return [
InputParam("images", required=True, description="the generated image from decoders step"),
InputParam(
name="output_type",
default="pil",
type_hint=str,
description="The type of the output images, can be 'pil', 'np', 'pt'",
),
InputParam.output_type(),
InputParam("mask_overlay_kwargs"),
]

View File

@@ -218,7 +218,7 @@ class QwenImageLoopDenoiser(ModularPipelineBlocks):
@property
def inputs(self) -> List[InputParam]:
return [
InputParam("attention_kwargs"),
InputParam.attention_kwargs(),
InputParam(
"latents",
required=True,
@@ -231,10 +231,7 @@ class QwenImageLoopDenoiser(ModularPipelineBlocks):
type_hint=int,
description="The number of inference steps to use for the denoising process. Can be generated in set_timesteps step.",
),
InputParam(
kwargs_type="denoiser_input_fields",
description="conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.",
),
InputParam.denoiser_input_fields(),
InputParam(
"img_shapes",
required=True,
@@ -322,7 +319,7 @@ class QwenImageEditLoopDenoiser(ModularPipelineBlocks):
@property
def inputs(self) -> List[InputParam]:
return [
InputParam("attention_kwargs"),
InputParam.attention_kwargs(),
InputParam(
"latents",
required=True,
@@ -335,10 +332,7 @@ class QwenImageEditLoopDenoiser(ModularPipelineBlocks):
type_hint=int,
description="The number of inference steps to use for the denoising process. Can be generated in set_timesteps step.",
),
InputParam(
kwargs_type="denoiser_input_fields",
description="conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.",
),
InputParam.denoiser_input_fields(),
InputParam(
"img_shapes",
required=True,
@@ -424,7 +418,7 @@ class QwenImageLoopAfterDenoiser(ModularPipelineBlocks):
@property
def intermediate_outputs(self) -> List[OutputParam]:
return [
OutputParam("latents", type_hint=torch.Tensor, description="The denoised latents."),
OutputParam.latents(),
]
@torch.no_grad()

View File

@@ -301,8 +301,12 @@ class QwenImageEditResizeStep(ModularPipelineBlocks):
@property
def inputs(self) -> List[InputParam]:
return [
InputParam(
name=self._image_input_name, required=True, type_hint=torch.Tensor, description="The image to resize"
InputParam.template(self._image_input_name)
or InputParam(
name=self._image_input_name,
required=True,
type_hint=torch.Tensor,
description="Input image for conditioning",
),
]
@@ -381,7 +385,8 @@ class QwenImageLayeredResizeStep(ModularPipelineBlocks):
@property
def inputs(self) -> List[InputParam]:
return [
InputParam(
InputParam.template(self._image_input_name)
or InputParam(
name=self._image_input_name, required=True, type_hint=torch.Tensor, description="The image to resize"
),
InputParam(
@@ -484,7 +489,8 @@ class QwenImageEditPlusResizeStep(ModularPipelineBlocks):
@property
def inputs(self) -> List[InputParam]:
return [
InputParam(
InputParam.template(self._image_input_name)
or InputParam(
name=self._image_input_name,
required=True,
type_hint=torch.Tensor,
@@ -564,7 +570,9 @@ class QwenImageLayeredGetImagePromptStep(ModularPipelineBlocks):
@property
def inputs(self) -> List[InputParam]:
return [
InputParam(name="prompt", type_hint=str, description="The prompt to encode"),
InputParam(
name="prompt", type_hint=str, description="The prompt to encode"
), # it is not required for qwenimage-layered, unlike other pipelines
InputParam(
name="resized_image",
required=True,
@@ -647,11 +655,9 @@ class QwenImageTextEncoderStep(ModularPipelineBlocks):
@property
def inputs(self) -> List[InputParam]:
return [
InputParam(name="prompt", required=True, type_hint=str, description="The prompt to encode"),
InputParam(name="negative_prompt", type_hint=str, description="The negative prompt to encode"),
InputParam(
name="max_sequence_length", type_hint=int, description="The max sequence length to use", default=1024
),
InputParam.prompt(),
InputParam.negative_prompt(),
InputParam.max_sequence_length(1024),
]
@property
@@ -772,8 +778,8 @@ class QwenImageEditTextEncoderStep(ModularPipelineBlocks):
@property
def inputs(self) -> List[InputParam]:
return [
InputParam(name="prompt", required=True, type_hint=str, description="The prompt to encode"),
InputParam(name="negative_prompt", type_hint=str, description="The negative prompt to encode"),
InputParam.prompt(),
InputParam.negative_prompt(),
InputParam(
name="resized_image",
required=True,
@@ -895,8 +901,8 @@ class QwenImageEditPlusTextEncoderStep(ModularPipelineBlocks):
@property
def inputs(self) -> List[InputParam]:
return [
InputParam(name="prompt", required=True, type_hint=str, description="The prompt to encode"),
InputParam(name="negative_prompt", type_hint=str, description="The negative prompt to encode"),
InputParam.prompt(),
InputParam.negative_prompt(),
InputParam(
name="resized_cond_image",
required=True,
@@ -1010,11 +1016,11 @@ class QwenImageInpaintProcessImagesInputStep(ModularPipelineBlocks):
@property
def inputs(self) -> List[InputParam]:
return [
InputParam("mask_image", required=True),
InputParam("image", required=True),
InputParam("height"),
InputParam("width"),
InputParam("padding_mask_crop"),
InputParam.mask_image(),
InputParam.image(),
InputParam.height(),
InputParam.width(),
InputParam.padding_mask_crop(),
]
@property
@@ -1082,9 +1088,14 @@ class QwenImageEditInpaintProcessImagesInputStep(ModularPipelineBlocks):
@property
def inputs(self) -> List[InputParam]:
return [
InputParam("mask_image", required=True),
InputParam("resized_image", required=True),
InputParam("padding_mask_crop"),
InputParam.mask_image(),
InputParam(
"resized_image",
required=True,
type_hint=PIL.Image.Image,
description="The resized image. should be generated using a resize step",
),
InputParam.padding_mask_crop(),
]
@property
@@ -1140,9 +1151,9 @@ class QwenImageProcessImagesInputStep(ModularPipelineBlocks):
@property
def inputs(self) -> List[InputParam]:
return [
InputParam("image", required=True),
InputParam("height"),
InputParam("width"),
InputParam.image(),
InputParam.height(),
InputParam.width(),
]
@property
@@ -1312,7 +1323,10 @@ class QwenImageVaeEncoderStep(ModularPipelineBlocks):
@property
def inputs(self) -> List[InputParam]:
return [InputParam(self._image_input_name, required=True), InputParam("generator")]
return [
InputParam.template(self._image_input_name) or InputParam(name=self._image_input_name, required=True),
InputParam.generator(),
]
@property
def intermediate_outputs(self) -> List[OutputParam]:
@@ -1383,10 +1397,10 @@ class QwenImageControlNetVaeEncoderStep(ModularPipelineBlocks):
@property
def inputs(self) -> List[InputParam]:
inputs = [
InputParam("control_image", required=True),
InputParam("height"),
InputParam("width"),
InputParam("generator"),
InputParam.control_image(),
InputParam.height(),
InputParam.width(),
InputParam.generator(),
]
return inputs
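
The `InputParam.template(name) or InputParam(...)` idiom used in these blocks relies on `template` returning `None` for unknown names, so the block-local definition only applies when no shared template exists. A minimal illustration (same import-path assumption as above):

```py
from diffusers.modular_pipelines.modular_pipeline_utils import InputParam

def image_input(name: str) -> InputParam:
    # Shared template when one exists, otherwise a block-local fallback definition.
    return InputParam.template(name) or InputParam(name=name, required=True, description="Input image")

print(image_input("image"))          # backed by the InputParam.image() template
print(image_input("resized_image"))  # no template -> block-local fallback
```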

View File

@@ -129,7 +129,7 @@ class QwenImageTextInputsStep(ModularPipelineBlocks):
@property
def inputs(self) -> List[InputParam]:
return [
InputParam(name="num_images_per_prompt", default=1),
InputParam.num_images_per_prompt(),
InputParam(name="prompt_embeds", required=True, kwargs_type="denoiser_input_fields"),
InputParam(name="prompt_embeds_mask", required=True, kwargs_type="denoiser_input_fields"),
InputParam(name="negative_prompt_embeds", kwargs_type="denoiser_input_fields"),
@@ -269,17 +269,17 @@ class QwenImageAdditionalInputsStep(ModularPipelineBlocks):
@property
def inputs(self) -> List[InputParam]:
inputs = [
InputParam(name="num_images_per_prompt", default=1),
InputParam.num_images_per_prompt(),
InputParam(name="batch_size", required=True),
InputParam(name="height"),
InputParam(name="width"),
InputParam.height(),
InputParam.width(),
]
for image_latent_input_name in self._image_latent_inputs:
inputs.append(InputParam(name=image_latent_input_name))
inputs.append(InputParam.template(image_latent_input_name) or InputParam(name=image_latent_input_name))
for input_name in self._additional_batch_inputs:
inputs.append(InputParam(name=input_name))
inputs.append(InputParam.template(input_name) or InputParam(name=input_name))
return inputs
@@ -398,17 +398,17 @@ class QwenImageEditPlusAdditionalInputsStep(ModularPipelineBlocks):
@property
def inputs(self) -> List[InputParam]:
inputs = [
InputParam(name="num_images_per_prompt", default=1),
InputParam.num_images_per_prompt(),
InputParam(name="batch_size", required=True),
InputParam(name="height"),
InputParam(name="width"),
InputParam.height(),
InputParam.width(),
]
for image_latent_input_name in self._image_latent_inputs:
inputs.append(InputParam(name=image_latent_input_name))
inputs.append(InputParam.template(image_latent_input_name) or InputParam(name=image_latent_input_name))
for input_name in self._additional_batch_inputs:
inputs.append(InputParam(name=input_name))
inputs.append(InputParam.template(input_name) or InputParam(name=input_name))
return inputs
@@ -544,15 +544,15 @@ class QwenImageLayeredAdditionalInputsStep(ModularPipelineBlocks):
@property
def inputs(self) -> List[InputParam]:
inputs = [
InputParam(name="num_images_per_prompt", default=1),
InputParam.num_images_per_prompt(),
InputParam(name="batch_size", required=True),
]
for image_latent_input_name in self._image_latent_inputs:
inputs.append(InputParam(name=image_latent_input_name))
inputs.append(InputParam.template(image_latent_input_name) or InputParam(name=image_latent_input_name))
for input_name in self._additional_batch_inputs:
inputs.append(InputParam(name=input_name))
inputs.append(InputParam.template(input_name) or InputParam(name=input_name))
return inputs
@@ -638,9 +638,9 @@ class QwenImageControlNetInputsStep(ModularPipelineBlocks):
return [
InputParam(name="control_image_latents", required=True),
InputParam(name="batch_size", required=True),
InputParam(name="num_images_per_prompt", default=1),
InputParam(name="height"),
InputParam(name="width"),
InputParam.num_images_per_prompt(),
InputParam.height(),
InputParam.width(),
]
@torch.no_grad()

View File

@@ -16,7 +16,7 @@ from typing import Optional
from ...utils import logging
from ..modular_pipeline import AutoPipelineBlocks, ConditionalPipelineBlocks, SequentialPipelineBlocks
from ..modular_pipeline_utils import InsertableDict
from ..modular_pipeline_utils import InsertableDict, OutputParam
from .before_denoise import (
QwenImageCreateMaskLatentsStep,
QwenImageEditRoPEInputsStep,
@@ -56,8 +56,61 @@ logger = logging.get_logger(__name__)
# ====================
# auto_docstring
class QwenImageEditVLEncoderStep(SequentialPipelineBlocks):
"""VL encoder that takes both image and text prompts."""
"""
class QwenImageEditVLEncoderStep
QwenImage-Edit VL encoder step that encodes the image and text prompts together.
Components:
image_resize_processor (`VaeImageProcessor`) [subfolder=]
text_encoder (`Qwen2_5_VLForConditionalGeneration`) [subfolder=]
processor (`Qwen2VLProcessor`) [subfolder=]
guider (`ClassifierFreeGuidance`) [subfolder=]
Configs:
prompt_template_encode (default: <|im_start|>system
Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how
the user's text instruction should alter or modify the image. Generate a new image that meets the user's
requirements while maintaining consistency with the original input where appropriate.<|im_end|> <|im_start|>user
<|vision_start|><|image_pad|><|vision_end|>{}<|im_end|> <|im_start|>assistant )
prompt_template_encode_start_idx (default: 64)
Inputs:
image (`Image`):
Input image for img2img, editing, or conditioning.
prompt (`str`):
The prompt or prompts to guide image generation.
negative_prompt (`str`, *optional*):
The prompt or prompts not to guide the image generation.
Outputs:
resized_image (`List`):
The resized images
prompt_embeds (`Tensor`):
The prompt embeddings
prompt_embeds_mask (`Tensor`):
The encoder attention mask
negative_prompt_embeds (`Tensor`):
The negative prompt embeddings
negative_prompt_embeds_mask (`Tensor`):
The negative prompt embeddings mask
"""
model_name = "qwenimage-edit"
block_classes = [
@@ -77,7 +130,40 @@ class QwenImageEditVLEncoderStep(SequentialPipelineBlocks):
# Edit VAE encoder
# auto_docstring
class QwenImageEditVaeEncoderStep(SequentialPipelineBlocks):
"""
class QwenImageEditVaeEncoderStep
VAE encoder step that encodes the image inputs into their latent representations.
Components:
image_resize_processor (`VaeImageProcessor`) [subfolder=]
image_processor (`VaeImageProcessor`) [subfolder=]
vae (`AutoencoderKLQwenImage`) [subfolder=]
Inputs:
image (`Image`):
Input image for img2img, editing, or conditioning.
generator (`Generator`, *optional*):
Torch generator for deterministic generation.
Outputs:
resized_image (`List`):
The resized images
processed_image (`None`):
image_latents (`Tensor`):
The latents representing the reference image(s). Single tensor or list depending on input.
"""
model_name = "qwenimage-edit"
block_classes = [
QwenImageEditResizeStep(),
@@ -92,7 +178,54 @@ class QwenImageEditVaeEncoderStep(SequentialPipelineBlocks):
# Edit Inpaint VAE encoder
# auto_docstring
class QwenImageEditInpaintVaeEncoderStep(SequentialPipelineBlocks):
"""
class QwenImageEditInpaintVaeEncoderStep
This step is used for processing image and mask inputs for QwenImage-Edit inpaint tasks. It:
- resize the image for target area (1024 * 1024) while maintaining the aspect ratio.
- process the resized image and mask image.
- create image latents.
Components:
image_resize_processor (`VaeImageProcessor`) [subfolder=]
image_mask_processor (`InpaintProcessor`) [subfolder=]
vae (`AutoencoderKLQwenImage`) [subfolder=]
Inputs:
image (`Image`):
Input image for img2img, editing, or conditioning.
mask_image (`Image`):
Mask image for inpainting.
padding_mask_crop (`int`, *optional*):
Padding for mask cropping in inpainting.
generator (`Generator`, *optional*):
Torch generator for deterministic generation.
Outputs:
resized_image (`List`):
The resized images
processed_image (`None`):
processed_mask_image (`None`):
mask_overlay_kwargs (`Dict`):
The kwargs for the postprocess step to apply the mask overlay
image_latents (`Tensor`):
The latents representing the reference image(s). Single tensor or list depending on input.
"""
model_name = "qwenimage-edit"
block_classes = [
QwenImageEditResizeStep(),
@@ -134,7 +267,55 @@ class QwenImageEditAutoVaeEncoderStep(AutoPipelineBlocks):
# assemble input steps
# auto_docstring
class QwenImageEditInputStep(SequentialPipelineBlocks):
"""
class QwenImageEditInputStep
Input step that prepares the inputs for the edit denoising step. It:
- make sure the text embeddings have consistent batch size as well as the additional inputs.
- update height/width based on `image_latents`, patchify `image_latents`.
Components:
pachifier (`QwenImagePachifier`) [subfolder=]
Inputs:
num_images_per_prompt (`int`, *optional*, defaults to 1):
The number of images to generate per prompt.
prompt_embeds (`None`):
prompt_embeds_mask (`None`):
negative_prompt_embeds (`None`, *optional*):
negative_prompt_embeds_mask (`None`, *optional*):
height (`int`, *optional*):
The height in pixels of the generated image.
width (`int`, *optional*):
The width in pixels of the generated image.
image_latents (`None`, *optional*):
Outputs:
batch_size (`int`):
Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt
dtype (`dtype`):
Data type of model tensor inputs (determined by `prompt_embeds`)
image_height (`int`):
The image height calculated from the image latents dimension
image_width (`int`):
The image width calculated from the image latents dimension
"""
model_name = "qwenimage-edit"
block_classes = [
QwenImageTextInputsStep(),
@@ -151,7 +332,57 @@ class QwenImageEditInputStep(SequentialPipelineBlocks):
)
# auto_docstring
class QwenImageEditInpaintInputStep(SequentialPipelineBlocks):
"""
class QwenImageEditInpaintInputStep
Input step that prepares the inputs for the edit inpaint denoising step. It:
- make sure the text embeddings have consistent batch size as well as the additional inputs.
- update height/width based on `image_latents`, patchify `image_latents`.
Components:
pachifier (`QwenImagePachifier`) [subfolder=]
Inputs:
num_images_per_prompt (`int`, *optional*, defaults to 1):
The number of images to generate per prompt.
prompt_embeds (`None`):
prompt_embeds_mask (`None`):
negative_prompt_embeds (`None`, *optional*):
negative_prompt_embeds_mask (`None`, *optional*):
height (`int`, *optional*):
The height in pixels of the generated image.
width (`int`, *optional*):
The width in pixels of the generated image.
image_latents (`None`, *optional*):
processed_mask_image (`None`, *optional*):
Outputs:
batch_size (`int`):
Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt
dtype (`dtype`):
Data type of model tensor inputs (determined by `prompt_embeds`)
image_height (`int`):
The image height calculated from the image latents dimension
image_width (`int`):
The image width calculated from the image latents dimension
"""
model_name = "qwenimage-edit"
block_classes = [
QwenImageTextInputsStep(),
@@ -171,7 +402,51 @@ class QwenImageEditInpaintInputStep(SequentialPipelineBlocks):
# assemble prepare latents steps
# auto_docstring
class QwenImageEditInpaintPrepareLatentsStep(SequentialPipelineBlocks):
"""
class QwenImageEditInpaintPrepareLatentsStep
This step prepares the latents/image_latents and mask inputs for the edit inpainting denoising step. It:
- Add noise to the image latents to create the latents input for the denoiser.
- Create the patchified latents `mask` based on the processed mask image.
Components:
scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=]
pachifier (`QwenImagePachifier`) [subfolder=]
Inputs:
latents (`Tensor`):
The initial random noise, can be generated in the prepare latents step.
image_latents (`Tensor`):
The image latents to use for the denoising process. Can be generated in vae encoder and packed in input
step.
timesteps (`Tensor`):
The timesteps to use for the denoising process. Can be generated in set_timesteps step.
processed_mask_image (`Tensor`):
The processed mask to use for the inpainting process.
height (`None`):
width (`None`):
dtype (`None`):
Outputs:
initial_noise (`Tensor`):
The initial random noise used for inpainting denoising.
mask (`Tensor`):
The mask to use for the inpainting process.
"""
model_name = "qwenimage-edit"
block_classes = [QwenImagePrepareLatentsWithStrengthStep(), QwenImageCreateMaskLatentsStep()]
block_names = ["add_noise_to_latents", "create_mask_latents"]
@@ -186,7 +461,68 @@ class QwenImageEditInpaintPrepareLatentsStep(SequentialPipelineBlocks):
# Qwen Image Edit (image2image) core denoise step
# auto_docstring
class QwenImageEditCoreDenoiseStep(SequentialPipelineBlocks):
"""
class QwenImageEditCoreDenoiseStep
Core denoising workflow for QwenImage-Edit edit (img2img) task.
Components:
pachifier (`QwenImagePachifier`) [subfolder=]
scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=]
guider (`ClassifierFreeGuidance`) [subfolder=]
transformer (`QwenImageTransformer2DModel`) [subfolder=]
Inputs:
num_images_per_prompt (`int`, *optional*, defaults to 1):
The number of images to generate per prompt.
prompt_embeds (`None`):
prompt_embeds_mask (`None`):
negative_prompt_embeds (`None`, *optional*):
negative_prompt_embeds_mask (`None`, *optional*):
height (`int`, *optional*):
The height in pixels of the generated image.
width (`int`, *optional*):
The width in pixels of the generated image.
image_latents (`None`, *optional*):
latents (`Tensor`, *optional*):
Pre-generated noisy latents for image generation.
generator (`Generator`, *optional*):
Torch generator for deterministic generation.
num_inference_steps (`int`, *optional*, defaults to 50):
The number of denoising steps.
sigmas (`List`, *optional*):
Custom sigmas for the denoising process.
attention_kwargs (`Dict`, *optional*):
Additional kwargs for attention processors.
**denoiser_input_fields (`Tensor`, *optional*):
conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
Outputs:
latents (`Tensor`):
Denoised latents.
"""
model_name = "qwenimage-edit"
block_classes = [
QwenImageEditInputStep(),
@@ -209,9 +545,81 @@ class QwenImageEditCoreDenoiseStep(SequentialPipelineBlocks):
def description(self):
return "Core denoising workflow for QwenImage-Edit edit (img2img) task."
@property
def outputs(self):
return [
OutputParam.latents(),
]
# Qwen Image Edit (inpainting) core denoise step
# auto_docstring
class QwenImageEditInpaintCoreDenoiseStep(SequentialPipelineBlocks):
"""
class QwenImageEditInpaintCoreDenoiseStep
Core denoising workflow for QwenImage-Edit edit inpaint task.
Components:
pachifier (`QwenImagePachifier`) [subfolder=]
scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=]
guider (`ClassifierFreeGuidance`) [subfolder=]
transformer (`QwenImageTransformer2DModel`) [subfolder=]
Inputs:
num_images_per_prompt (`int`, *optional*, defaults to 1):
The number of images to generate per prompt.
prompt_embeds (`None`):
prompt_embeds_mask (`None`):
negative_prompt_embeds (`None`, *optional*):
negative_prompt_embeds_mask (`None`, *optional*):
height (`int`, *optional*):
The height in pixels of the generated image.
width (`int`, *optional*):
The width in pixels of the generated image.
image_latents (`None`, *optional*):
processed_mask_image (`None`, *optional*):
latents (`Tensor`, *optional*):
Pre-generated noisy latents for image generation.
generator (`Generator`, *optional*):
Torch generator for deterministic generation.
num_inference_steps (`int`, *optional*, defaults to 50):
The number of denoising steps.
sigmas (`List`, *optional*):
Custom sigmas for the denoising process.
strength (`float`, *optional*, defaults to 0.9):
Strength for img2img/inpainting.
attention_kwargs (`Dict`, *optional*):
Additional kwargs for attention processors.
**denoiser_input_fields (`Tensor`, *optional*):
conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
Outputs:
latents (`Tensor`):
Denoised latents.
"""
model_name = "qwenimage-edit"
block_classes = [
QwenImageEditInpaintInputStep(),
@@ -236,6 +644,12 @@ class QwenImageEditInpaintCoreDenoiseStep(SequentialPipelineBlocks):
def description(self):
return "Core denoising workflow for QwenImage-Edit edit inpaint task."
@property
def outputs(self):
return [
OutputParam.latents(),
]
# Auto core denoise step for QwenImage Edit
class QwenImageEditAutoCoreDenoiseStep(ConditionalPipelineBlocks):
@@ -264,6 +678,12 @@ class QwenImageEditAutoCoreDenoiseStep(ConditionalPipelineBlocks):
"Supports edit (img2img) and edit inpainting tasks for QwenImage-Edit."
)
@property
def outputs(self):
return [
OutputParam.latents(),
]
# ====================
# 4. DECODE
@@ -271,7 +691,33 @@ class QwenImageEditAutoCoreDenoiseStep(ConditionalPipelineBlocks):
# Decode step (standard)
# auto_docstring
class QwenImageEditDecodeStep(SequentialPipelineBlocks):
"""
class QwenImageEditDecodeStep
Decode step that decodes the latents to images and postprocesses the generated image.
Components:
vae (`AutoencoderKLQwenImage`) [subfolder=]
image_processor (`VaeImageProcessor`) [subfolder=]
Inputs:
latents (`Tensor`):
The latents to decode, can be generated in the denoise step
output_type (`str`, *optional*, defaults to pil):
Output format: 'pil', 'np', or 'pt'.
Outputs:
images (`List`):
Generated images.
"""
model_name = "qwenimage-edit"
block_classes = [QwenImageDecoderStep(), QwenImageProcessImagesOutputStep()]
block_names = ["decode", "postprocess"]
@@ -282,7 +728,36 @@ class QwenImageEditDecodeStep(SequentialPipelineBlocks):
# Inpaint decode step
# auto_docstring
class QwenImageEditInpaintDecodeStep(SequentialPipelineBlocks):
"""
class QwenImageEditInpaintDecodeStep
Decode step that decodes the latents to images and postprocesses the generated image, optionally applying the mask
overlay to the original image.
Components:
vae (`AutoencoderKLQwenImage`) [subfolder=]
image_mask_processor (`InpaintProcessor`) [subfolder=]
Inputs:
latents (`Tensor`):
The latents to decode, can be generated in the denoise step
output_type (`str`, *optional*, defaults to pil):
Output format: 'pil', 'np', or 'pt'.
mask_overlay_kwargs (`None`, *optional*):
Outputs:
images (`List`):
Generated images.
"""
model_name = "qwenimage-edit"
block_classes = [QwenImageDecoderStep(), QwenImageInpaintProcessImagesOutputStep()]
block_names = ["decode", "postprocess"]
@@ -307,6 +782,12 @@ class QwenImageEditAutoDecodeStep(AutoPipelineBlocks):
" - `QwenImageEditDecodeStep` (edit) is used when `mask` is not provided.\n"
)
@property
def outputs(self):
return [
OutputParam.latents(),
]
# ====================
# 5. AUTO BLOCKS & PRESETS
@@ -322,7 +803,110 @@ EDIT_AUTO_BLOCKS = InsertableDict(
)
# auto_docstring
class QwenImageEditAutoBlocks(SequentialPipelineBlocks):
"""
class QwenImageEditAutoBlocks
Auto Modular pipeline for edit (img2img) and edit inpaint tasks using QwenImage-Edit.
- for edit (img2img) generation, you need to provide `image`
- for edit inpainting, you need to provide `mask_image` and `image`, optionally you can provide
`padding_mask_crop`
Components:
image_resize_processor (`VaeImageProcessor`) [subfolder=]
text_encoder (`Qwen2_5_VLForConditionalGeneration`) [subfolder=]
processor (`Qwen2VLProcessor`) [subfolder=]
guider (`ClassifierFreeGuidance`) [subfolder=]
image_mask_processor (`InpaintProcessor`) [subfolder=]
vae (`AutoencoderKLQwenImage`) [subfolder=]
image_processor (`VaeImageProcessor`) [subfolder=]
pachifier (`QwenImagePachifier`) [subfolder=]
scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=]
transformer (`QwenImageTransformer2DModel`) [subfolder=]
Configs:
prompt_template_encode (default: <|im_start|>system
Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how
the user's text instruction should alter or modify the image. Generate a new image that meets the user's
requirements while maintaining consistency with the original input where appropriate.<|im_end|> <|im_start|>user
<|vision_start|><|image_pad|><|vision_end|>{}<|im_end|> <|im_start|>assistant )
prompt_template_encode_start_idx (default: 64)
Inputs:
image (`Image`):
Input image for img2img, editing, or conditioning.
prompt (`str`):
The prompt or prompts to guide image generation.
negative_prompt (`str`, *optional*):
The prompt or prompts not to guide the image generation.
mask_image (`Image`, *optional*):
Mask image for inpainting.
padding_mask_crop (`int`, *optional*):
Padding for mask cropping in inpainting.
generator (`Generator`, *optional*):
Torch generator for deterministic generation.
num_images_per_prompt (`int`, *optional*, defaults to 1):
The number of images to generate per prompt.
height (`int`):
The height in pixels of the generated image.
width (`int`):
The width in pixels of the generated image.
image_latents (`None`):
processed_mask_image (`None`, *optional*):
latents (`Tensor`):
Pre-generated noisy latents for image generation.
num_inference_steps (`int`):
The number of denoising steps.
sigmas (`List`, *optional*):
Custom sigmas for the denoising process.
strength (`float`, *optional*, defaults to 0.9):
Strength for img2img/inpainting.
attention_kwargs (`Dict`, *optional*):
Additional kwargs for attention processors.
**denoiser_input_fields (`Tensor`, *optional*):
conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
output_type (`str`, *optional*, defaults to pil):
Output format: 'pil', 'np', or 'pt'.
mask_overlay_kwargs (`None`, *optional*):
Outputs:
images (`List`):
Generated images.
"""
model_name = "qwenimage-edit"
block_classes = EDIT_AUTO_BLOCKS.values()
block_names = EDIT_AUTO_BLOCKS.keys()
@@ -334,3 +918,9 @@ class QwenImageEditAutoBlocks(SequentialPipelineBlocks):
"- for edit (img2img) generation, you need to provide `image`\n"
"- for edit inpainting, you need to provide `mask_image` and `image`, optionally you can provide `padding_mask_crop`\n"
)
@property
def outputs(self):
return [
OutputParam.images(),
]
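
A short usage sketch may help map the generated docstring above onto actual calls. This is a minimal sketch, assuming `QwenImageEditAutoBlocks` is exported from `diffusers.modular_pipelines` and that a QwenImage-Edit modular repo is available; the repo id, image URLs, and dtype are placeholders, not verified artifacts.

```py
import torch
from diffusers.modular_pipelines import QwenImageEditAutoBlocks
from diffusers.utils import load_image

blocks = QwenImageEditAutoBlocks()
print(blocks.doc)  # prints the auto-generated docstring shown in this diff

pipe = blocks.init_pipeline("Qwen/Qwen-Image-Edit")  # placeholder repo id
pipe.load_components(torch_dtype=torch.bfloat16)
pipe.to("cuda")

image = load_image("https://example.com/input.png")  # placeholder URL

# edit (img2img): `image` and `prompt` are enough
edited = pipe(image=image, prompt="turn the cat into a tiger", output="images")[0]

# edit inpainting: additionally pass `mask_image` (and optionally `padding_mask_crop`)
mask = load_image("https://example.com/mask.png")  # placeholder URL
inpainted = pipe(image=image, mask_image=mask, prompt="a red scarf", output="images")[0]
```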

View File

@@ -12,9 +12,10 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from ...utils import logging
from ..modular_pipeline import SequentialPipelineBlocks
from ..modular_pipeline_utils import InsertableDict
from ..modular_pipeline_utils import InsertableDict, OutputParam
from .before_denoise import (
QwenImageEditPlusRoPEInputsStep,
QwenImagePrepareLatentsStep,
@@ -48,8 +49,63 @@ logger = logging.get_logger(__name__)
# ====================
# auto_docstring
class QwenImageEditPlusVLEncoderStep(SequentialPipelineBlocks):
"""VL encoder that takes both image and text prompts. Uses 384x384 target area."""
"""
class QwenImageEditPlusVLEncoderStep
QwenImage-Edit Plus VL encoder step that encodes the image and text prompts together.
Components:
image_resize_processor (`VaeImageProcessor`) [subfolder=]
text_encoder (`Qwen2_5_VLForConditionalGeneration`) [subfolder=]
processor (`Qwen2VLProcessor`) [subfolder=]
guider (`ClassifierFreeGuidance`) [subfolder=]
Configs:
prompt_template_encode (default: <|im_start|>system
Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how
the user's text instruction should alter or modify the image. Generate a new image that meets the user's
requirements while maintaining consistency with the original input where appropriate.<|im_end|> <|im_start|>user
{}<|im_end|> <|im_start|>assistant )
img_template_encode (default: Picture {}: <|vision_start|><|image_pad|><|vision_end|>)
prompt_template_encode_start_idx (default: 64)
Inputs:
image (`Image`):
Input image for img2img, editing, or conditioning.
prompt (`str`):
The prompt or prompts to guide image generation.
negative_prompt (`str`, *optional*):
The prompt or prompts not to guide the image generation.
Outputs:
resized_cond_image (`List`):
The resized images
prompt_embeds (`Tensor`):
The prompt embeddings
prompt_embeds_mask (`Tensor`):
The encoder attention mask
negative_prompt_embeds (`Tensor`):
The negative prompt embeddings
negative_prompt_embeds_mask (`Tensor`):
The negative prompt embeddings mask
"""
model_name = "qwenimage-edit-plus"
block_classes = [
@@ -68,8 +124,40 @@ class QwenImageEditPlusVLEncoderStep(SequentialPipelineBlocks):
# ====================
# auto_docstring
class QwenImageEditPlusVaeEncoderStep(SequentialPipelineBlocks):
"""VAE encoder that handles multiple images with different sizes. Uses 1024x1024 target area."""
"""
class QwenImageEditPlusVaeEncoderStep
VAE encoder step that encodes image inputs into latent representations. Each image is resized independently based
on its own aspect ratio to 1024x1024 target area.
Components:
image_resize_processor (`VaeImageProcessor`) [subfolder=]
image_processor (`VaeImageProcessor`) [subfolder=]
vae (`AutoencoderKLQwenImage`) [subfolder=]
Inputs:
image (`Image`):
Input image for img2img, editing, or conditioning.
generator (`Generator`, *optional*):
Torch generator for deterministic generation.
Outputs:
resized_image (`List`):
The resized images
processed_image (`None`):
image_latents (`Tensor`):
The latents representing the reference image(s). Single tensor or list depending on input.
"""
model_name = "qwenimage-edit-plus"
block_classes = [
@@ -93,7 +181,57 @@ class QwenImageEditPlusVaeEncoderStep(SequentialPipelineBlocks):
# assemble input steps
# auto_docstring
class QwenImageEditPlusInputStep(SequentialPipelineBlocks):
"""
class QwenImageEditPlusInputStep
Input step that prepares the inputs for the Edit Plus denoising step. It:
- Standardizes text embeddings batch size.
- Processes list of image latents: patchifies, concatenates along dim=1, expands batch.
- Outputs lists of image_height/image_width for RoPE calculation.
- Defaults height/width from last image in the list.
Components:
pachifier (`QwenImagePachifier`) [subfolder=]
Inputs:
num_images_per_prompt (`int`, *optional*, defaults to 1):
The number of images to generate per prompt.
prompt_embeds (`None`):
prompt_embeds_mask (`None`):
negative_prompt_embeds (`None`, *optional*):
negative_prompt_embeds_mask (`None`, *optional*):
height (`int`, *optional*):
The height in pixels of the generated image.
width (`int`, *optional*):
The width in pixels of the generated image.
image_latents (`None`, *optional*):
Outputs:
batch_size (`int`):
Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt
dtype (`dtype`):
Data type of model tensor inputs (determined by `prompt_embeds`)
image_height (`List`):
The image heights calculated from the image latents dimension
image_width (`List`):
The image widths calculated from the image latents dimension
"""
model_name = "qwenimage-edit-plus"
block_classes = [
QwenImageTextInputsStep(),
@@ -113,7 +251,68 @@ class QwenImageEditPlusInputStep(SequentialPipelineBlocks):
# Qwen Image Edit Plus (image2image) core denoise step
# auto_docstring
class QwenImageEditPlusCoreDenoiseStep(SequentialPipelineBlocks):
"""
class QwenImageEditPlusCoreDenoiseStep
Core denoising workflow for QwenImage-Edit Plus edit (img2img) task.
Components:
pachifier (`QwenImagePachifier`) [subfolder=]
scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=]
guider (`ClassifierFreeGuidance`) [subfolder=]
transformer (`QwenImageTransformer2DModel`) [subfolder=]
Inputs:
num_images_per_prompt (`int`, *optional*, defaults to 1):
The number of images to generate per prompt.
prompt_embeds (`None`):
prompt_embeds_mask (`None`):
negative_prompt_embeds (`None`, *optional*):
negative_prompt_embeds_mask (`None`, *optional*):
height (`int`, *optional*):
The height in pixels of the generated image.
width (`int`, *optional*):
The width in pixels of the generated image.
image_latents (`None`, *optional*):
latents (`Tensor`, *optional*):
Pre-generated noisy latents for image generation.
generator (`Generator`, *optional*):
Torch generator for deterministic generation.
num_inference_steps (`int`, *optional*, defaults to 50):
The number of denoising steps.
sigmas (`List`, *optional*):
Custom sigmas for the denoising process.
attention_kwargs (`Dict`, *optional*):
Additional kwargs for attention processors.
**denoiser_input_fields (`Tensor`, *optional*):
conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
Outputs:
latents (`Tensor`):
Denoised latents.
"""
model_name = "qwenimage-edit-plus"
block_classes = [
QwenImageEditPlusInputStep(),
@@ -136,13 +335,45 @@ class QwenImageEditPlusCoreDenoiseStep(SequentialPipelineBlocks):
def description(self):
return "Core denoising workflow for QwenImage-Edit Plus edit (img2img) task."
@property
def outputs(self):
return [
OutputParam.latents(),
]
# ====================
# 4. DECODE
# ====================
# auto_docstring
class QwenImageEditPlusDecodeStep(SequentialPipelineBlocks):
"""
class QwenImageEditPlusDecodeStep
Decode step that decodes the latents to images and postprocesses the generated image.
Components:
vae (`AutoencoderKLQwenImage`) [subfolder=]
image_processor (`VaeImageProcessor`) [subfolder=]
Inputs:
latents (`Tensor`):
The latents to decode, can be generated in the denoise step
output_type (`str`, *optional*, defaults to pil):
Output format: 'pil', 'np', or 'pt'.
Outputs:
images (`List`):
Generated images.
"""
model_name = "qwenimage-edit-plus"
block_classes = [QwenImageDecoderStep(), QwenImageProcessImagesOutputStep()]
block_names = ["decode", "postprocess"]
@@ -166,7 +397,95 @@ EDIT_PLUS_AUTO_BLOCKS = InsertableDict(
)
# auto_docstring
class QwenImageEditPlusAutoBlocks(SequentialPipelineBlocks):
"""
class QwenImageEditPlusAutoBlocks
Auto Modular pipeline for edit (img2img) tasks using QwenImage-Edit Plus.
- `image` is required input (can be single image or list of images).
- Each image is resized independently based on its own aspect ratio.
- VL encoder uses 384x384 target area, VAE encoder uses 1024x1024 target area.
Components:
image_resize_processor (`VaeImageProcessor`) [subfolder=]
text_encoder (`Qwen2_5_VLForConditionalGeneration`) [subfolder=]
processor (`Qwen2VLProcessor`) [subfolder=]
guider (`ClassifierFreeGuidance`) [subfolder=]
image_processor (`VaeImageProcessor`) [subfolder=]
vae (`AutoencoderKLQwenImage`) [subfolder=]
pachifier (`QwenImagePachifier`) [subfolder=]
scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=]
transformer (`QwenImageTransformer2DModel`) [subfolder=]
Configs:
prompt_template_encode (default: <|im_start|>system
Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how
the user's text instruction should alter or modify the image. Generate a new image that meets the user's
requirements while maintaining consistency with the original input where appropriate.<|im_end|> <|im_start|>user
{}<|im_end|> <|im_start|>assistant )
img_template_encode (default: Picture {}: <|vision_start|><|image_pad|><|vision_end|>)
prompt_template_encode_start_idx (default: 64)
Inputs:
image (`Image`):
Input image for img2img, editing, or conditioning.
prompt (`str`):
The prompt or prompts to guide image generation.
negative_prompt (`str`, *optional*):
The prompt or prompts not to guide the image generation.
generator (`Generator`, *optional*):
Torch generator for deterministic generation.
num_images_per_prompt (`int`, *optional*, defaults to 1):
The number of images to generate per prompt.
height (`int`, *optional*):
The height in pixels of the generated image.
width (`int`, *optional*):
The width in pixels of the generated image.
latents (`Tensor`, *optional*):
Pre-generated noisy latents for image generation.
num_inference_steps (`int`, *optional*, defaults to 50):
The number of denoising steps.
sigmas (`List`, *optional*):
Custom sigmas for the denoising process.
attention_kwargs (`Dict`, *optional*):
Additional kwargs for attention processors.
**denoiser_input_fields (`Tensor`, *optional*):
conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
output_type (`str`, *optional*, defaults to pil):
Output format: 'pil', 'np', or 'pt'.
Outputs:
images (`List`):
Generated images.
"""
model_name = "qwenimage-edit-plus"
block_classes = EDIT_PLUS_AUTO_BLOCKS.values()
block_names = EDIT_PLUS_AUTO_BLOCKS.keys()
@@ -179,3 +498,9 @@ class QwenImageEditPlusAutoBlocks(SequentialPipelineBlocks):
"- Each image is resized independently based on its own aspect ratio.\n"
"- VL encoder uses 384x384 target area, VAE encoder uses 1024x1024 target area."
)
@property
def outputs(self):
return [
OutputParam.images(),
]
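
The Edit Plus auto blocks follow the same calling pattern as the Edit blocks; the documented difference is that `image` may be a list, with each image resized on its own aspect ratio (384x384 target area for the VL encoder, 1024x1024 for the VAE encoder). A brief sketch under the same assumptions as above, with the repo id again a placeholder:

```py
from diffusers.modular_pipelines import QwenImageEditPlusAutoBlocks

blocks = QwenImageEditPlusAutoBlocks()
pipe = blocks.init_pipeline("Qwen/Qwen-Image-Edit-Plus")  # placeholder repo id
pipe.load_components(torch_dtype=torch.bfloat16)

# `image` can be a single image or a list; each entry is resized independently.
out = pipe(
    image=[image_a, image_b],  # PIL images loaded elsewhere
    prompt="place the mug from picture 1 on the desk from picture 2",
    output="images",
)[0]
```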

View File

@@ -15,7 +15,7 @@
from ...utils import logging
from ..modular_pipeline import SequentialPipelineBlocks
from ..modular_pipeline_utils import InsertableDict
from ..modular_pipeline_utils import InsertableDict, OutputParam
from .before_denoise import (
QwenImageLayeredPrepareLatentsStep,
QwenImageLayeredRoPEInputsStep,
@@ -50,8 +50,102 @@ logger = logging.get_logger(__name__)
# ====================
# auto_docstring
class QwenImageLayeredTextEncoderStep(SequentialPipelineBlocks):
"""Text encoder that takes text prompt, will generate a prompt based on image if not provided."""
"""
class QwenImageLayeredTextEncoderStep
QwenImage-Layered Text encoder step that encodes the text prompt and will generate a prompt based on the image if
not provided.
Components:
image_resize_processor (`VaeImageProcessor`) [subfolder=]
text_encoder (`Qwen2_5_VLForConditionalGeneration`) [subfolder=]
processor (`Qwen2VLProcessor`) [subfolder=]
tokenizer (`Qwen2Tokenizer`): The tokenizer to use [subfolder=]
guider (`ClassifierFreeGuidance`) [subfolder=]
Configs:
image_caption_prompt_en (default: <|im_start|>system
You are a helpful assistant.<|im_end|> <|im_start|>user # Image Annotator You are a professional image annotator.
Please write an image caption based on the input image:
1. Write the caption using natural, descriptive language without structured formats or rich text.
2. Enrich caption details by including:
- Object attributes, such as quantity, color, shape, size, material, state, position, actions, and so on
- Vision Relations between objects, such as spatial relations, functional relations, possessive relations,
attachment relations, action relations, comparative relations, causal relations, and so on
- Environmental details, such as weather, lighting, colors, textures, atmosphere, and so on
- Identify the text clearly visible in the image, without translation or explanation, and highlight it in the
caption with quotation marks
3. Maintain authenticity and accuracy:
- Avoid generalizations
- Describe all visible information in the image, while do not add information not explicitly shown in the image
<|vision_start|><|image_pad|><|vision_end|><|im_end|> <|im_start|>assistant )
image_caption_prompt_cn (default: <|im_start|>system
You are a helpful assistant.<|im_end|> <|im_start|>user # 图像标注器 你是一个专业的图像标注器。请基于输入图像,撰写图注:
1. 使用自然、描述性的语言撰写图注,不要使用结构化形式或富文本形式。
2. 通过加入以下内容,丰富图注细节:
- 对象的属性:如数量、颜色、形状、大小、位置、材质、状态、动作等
- 对象间的视觉关系:如空间关系、功能关系、动作关系、从属关系、比较关系、因果关系等
- 环境细节:例如天气、光照、颜色、纹理、气氛等
- 文字内容:识别图像中清晰可见的文字,不做翻译和解释,用引号在图注中强调
3. 保持真实性与准确性:
- 不要使用笼统的描述
- 描述图像中所有可见的信息,但不要加入没有在图像中出现的内容
<|vision_start|><|image_pad|><|vision_end|><|im_end|> <|im_start|>assistant )
prompt_template_encode (default: <|im_start|>system
Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the
objects and background:<|im_end|> <|im_start|>user {}<|im_end|> <|im_start|>assistant )
prompt_template_encode_start_idx (default: 34)
tokenizer_max_length (default: 1024)
Inputs:
image (`Image`):
Input image for img2img, editing, or conditioning.
resolution (`int`, *optional*, defaults to 640):
The target area to resize the image to, can be 1024 or 640
prompt (`str`, *optional*):
The prompt to encode
use_en_prompt (`bool`, *optional*, defaults to False):
Whether to use English prompt template
negative_prompt (`str`, *optional*):
The prompt or prompts not to guide the image generation.
max_sequence_length (`int`, *optional*, defaults to 1024):
Maximum sequence length for prompt encoding.
Outputs:
resized_image (`List`):
The resized images
prompt_embeds (`Tensor`):
The prompt embeddings
prompt_embeds_mask (`Tensor`):
The encoder attention mask
negative_prompt_embeds (`Tensor`):
The negative prompt embeddings
negative_prompt_embeds_mask (`Tensor`):
The negative prompt embeddings mask
"""
model_name = "qwenimage-layered"
block_classes = [
@@ -72,7 +166,43 @@ class QwenImageLayeredTextEncoderStep(SequentialPipelineBlocks):
# Edit VAE encoder
# auto_docstring
class QwenImageLayeredVaeEncoderStep(SequentialPipelineBlocks):
"""
class QwenImageLayeredVaeEncoderStep
VAE encoder step that encodes the image inputs into their latent representations.
Components:
image_resize_processor (`VaeImageProcessor`) [subfolder=]
image_processor (`VaeImageProcessor`) [subfolder=]
vae (`AutoencoderKLQwenImage`) [subfolder=]
Inputs:
image (`Image`):
Input image for img2img, editing, or conditioning.
resolution (`int`, *optional*, defaults to 640):
The target area to resize the image to, can be 1024 or 640
generator (`Generator`, *optional*):
Torch generator for deterministic generation.
Outputs:
resized_image (`List`):
The resized images
processed_image (`None`):
image_latents (`Tensor`):
The latents representing the reference image(s). Single tensor or list depending on input.
"""
model_name = "qwenimage-layered"
block_classes = [
QwenImageLayeredResizeStep(),
@@ -93,7 +223,55 @@ class QwenImageLayeredVaeEncoderStep(SequentialPipelineBlocks):
# assemble input steps
# auto_docstring
class QwenImageLayeredInputStep(SequentialPipelineBlocks):
"""
class QwenImageLayeredInputStep
Input step that prepares the inputs for the layered denoising step. It:
- makes sure the text embeddings, as well as the additional inputs, have a consistent batch size.
- updates height/width based on `image_latents` and patchifies `image_latents`.
Components:
pachifier (`QwenImageLayeredPachifier`) [subfolder=]
Inputs:
num_images_per_prompt (`int`, *optional*, defaults to 1):
The number of images to generate per prompt.
prompt_embeds (`None`):
prompt_embeds_mask (`None`):
negative_prompt_embeds (`None`, *optional*):
negative_prompt_embeds_mask (`None`, *optional*):
image_latents (`None`, *optional*):
Outputs:
batch_size (`int`):
Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt
dtype (`dtype`):
Data type of model tensor inputs (determined by `prompt_embeds`)
image_height (`int`):
The image height calculated from the image latents dimension
image_width (`int`):
The image width calculated from the image latents dimension
height (`int`):
The height of the image output
width (`int`):
The width of the image output
"""
model_name = "qwenimage-layered"
block_classes = [
QwenImageTextInputsStep(),
@@ -111,7 +289,65 @@ class QwenImageLayeredInputStep(SequentialPipelineBlocks):
# Qwen Image Layered (image2image) core denoise step
# auto_docstring
class QwenImageLayeredCoreDenoiseStep(SequentialPipelineBlocks):
"""
class QwenImageLayeredCoreDenoiseStep
Core denoising workflow for QwenImage-Layered img2img task.
Components:
pachifier (`QwenImageLayeredPachifier`) [subfolder=]
scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=]
guider (`ClassifierFreeGuidance`) [subfolder=]
transformer (`QwenImageTransformer2DModel`) [subfolder=]
Inputs:
num_images_per_prompt (`int`, *optional*, defaults to 1):
The number of images to generate per prompt.
prompt_embeds (`None`):
prompt_embeds_mask (`None`):
negative_prompt_embeds (`None`, *optional*):
negative_prompt_embeds_mask (`None`, *optional*):
image_latents (`None`, *optional*):
latents (`Tensor`, *optional*):
Pre-generated noisy latents for image generation.
layers (`int`, *optional*, defaults to 4):
Number of layers to extract from the image
generator (`Generator`, *optional*):
Torch generator for deterministic generation.
num_inference_steps (`int`, *optional*, defaults to 50):
The number of denoising steps.
sigmas (`List`, *optional*):
Custom sigmas for the denoising process.
attention_kwargs (`Dict`, *optional*):
Additional kwargs for attention processors.
**denoiser_input_fields (`Tensor`, *optional*):
conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
Outputs:
latents (`Tensor`):
Denoised latents.
"""
model_name = "qwenimage-layered"
block_classes = [
QwenImageLayeredInputStep(),
@@ -134,6 +370,12 @@ class QwenImageLayeredCoreDenoiseStep(SequentialPipelineBlocks):
def description(self):
return "Core denoising workflow for QwenImage-Layered img2img task."
@property
def outputs(self):
return [
OutputParam.latents(),
]
# ====================
# 4. AUTO BLOCKS & PRESETS
@@ -149,7 +391,127 @@ LAYERED_AUTO_BLOCKS = InsertableDict(
)
# auto_docstring
class QwenImageLayeredAutoBlocks(SequentialPipelineBlocks):
"""
class QwenImageLayeredAutoBlocks
Auto Modular pipeline for layered denoising tasks using QwenImage-Layered.
Components:
image_resize_processor (`VaeImageProcessor`) [subfolder=]
text_encoder (`Qwen2_5_VLForConditionalGeneration`) [subfolder=]
processor (`Qwen2VLProcessor`) [subfolder=]
tokenizer (`Qwen2Tokenizer`): The tokenizer to use [subfolder=]
guider (`ClassifierFreeGuidance`) [subfolder=]
image_processor (`VaeImageProcessor`) [subfolder=]
vae (`AutoencoderKLQwenImage`) [subfolder=]
pachifier (`QwenImageLayeredPachifier`) [subfolder=]
scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=]
transformer (`QwenImageTransformer2DModel`) [subfolder=]
Configs:
image_caption_prompt_en (default: <|im_start|>system
You are a helpful assistant.<|im_end|> <|im_start|>user # Image Annotator You are a professional image annotator.
Please write an image caption based on the input image:
1. Write the caption using natural, descriptive language without structured formats or rich text.
2. Enrich caption details by including:
- Object attributes, such as quantity, color, shape, size, material, state, position, actions, and so on
- Vision Relations between objects, such as spatial relations, functional relations, possessive relations,
attachment relations, action relations, comparative relations, causal relations, and so on
- Environmental details, such as weather, lighting, colors, textures, atmosphere, and so on
- Identify the text clearly visible in the image, without translation or explanation, and highlight it in the
caption with quotation marks
3. Maintain authenticity and accuracy:
- Avoid generalizations
- Describe all visible information in the image, while do not add information not explicitly shown in the image
<|vision_start|><|image_pad|><|vision_end|><|im_end|> <|im_start|>assistant )
image_caption_prompt_cn (default: <|im_start|>system
You are a helpful assistant.<|im_end|> <|im_start|>user # 图像标注器 你是一个专业的图像标注器。请基于输入图像,撰写图注:
1. 使用自然、描述性的语言撰写图注,不要使用结构化形式或富文本形式。
2. 通过加入以下内容,丰富图注细节:
- 对象的属性:如数量、颜色、形状、大小、位置、材质、状态、动作等
- 对象间的视觉关系:如空间关系、功能关系、动作关系、从属关系、比较关系、因果关系等
- 环境细节:例如天气、光照、颜色、纹理、气氛等
- 文字内容:识别图像中清晰可见的文字,不做翻译和解释,用引号在图注中强调
3. 保持真实性与准确性:
- 不要使用笼统的描述
- 描述图像中所有可见的信息,但不要加入没有在图像中出现的内容
<|vision_start|><|image_pad|><|vision_end|><|im_end|> <|im_start|>assistant )
prompt_template_encode (default: <|im_start|>system
Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the
objects and background:<|im_end|> <|im_start|>user {}<|im_end|> <|im_start|>assistant )
prompt_template_encode_start_idx (default: 34)
tokenizer_max_length (default: 1024)
Inputs:
image (`Image`):
Input image for img2img, editing, or conditioning.
resolution (`int`, *optional*, defaults to 640):
The target area to resize the image to, can be 1024 or 640
prompt (`str`, *optional*):
The prompt to encode
use_en_prompt (`bool`, *optional*, defaults to False):
Whether to use English prompt template
negative_prompt (`str`, *optional*):
The prompt or prompts not to guide the image generation.
max_sequence_length (`int`, *optional*, defaults to 1024):
Maximum sequence length for prompt encoding.
generator (`Generator`, *optional*):
Torch generator for deterministic generation.
num_images_per_prompt (`int`, *optional*, defaults to 1):
The number of images to generate per prompt.
latents (`Tensor`, *optional*):
Pre-generated noisy latents for image generation.
layers (`int`, *optional*, defaults to 4):
Number of layers to extract from the image
num_inference_steps (`int`, *optional*, defaults to 50):
The number of denoising steps.
sigmas (`List`, *optional*):
Custom sigmas for the denoising process.
attention_kwargs (`Dict`, *optional*):
Additional kwargs for attention processors.
**denoiser_input_fields (`Tensor`, *optional*):
conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
output_type (`str`, *optional*, defaults to pil):
Output format: 'pil', 'np', or 'pt'.
Outputs:
images (`List`):
Generated images.
"""
model_name = "qwenimage-layered"
block_classes = LAYERED_AUTO_BLOCKS.values()
block_names = LAYERED_AUTO_BLOCKS.keys()
@@ -157,3 +519,9 @@ class QwenImageLayeredAutoBlocks(SequentialPipelineBlocks):
@property
def description(self):
return "Auto Modular pipeline for layered denoising tasks using QwenImage-Layered."
@property
def outputs(self):
return [
OutputParam.images(),
]
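
The layered auto blocks add two workflow-specific inputs, `resolution` (640 or 1024) and `layers` (number of layers to extract), and `prompt` is optional because a caption can be generated from the image via the templates listed under Configs. A minimal sketch, with the repo id as a placeholder:

```py
from diffusers.modular_pipelines import QwenImageLayeredAutoBlocks

blocks = QwenImageLayeredAutoBlocks()
pipe = blocks.init_pipeline("Qwen/Qwen-Image-Layered")  # placeholder repo id
pipe.load_components(torch_dtype=torch.bfloat16)

# With no `prompt`, a caption is generated from `image` first (see the image_caption_prompt_* configs).
layered_images = pipe(image=image, resolution=640, layers=4, output="images")
```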

View File

@@ -129,10 +129,7 @@ class ZImageLoopDenoiser(ModularPipelineBlocks):
type_hint=int,
description="The number of inference steps to use for the denoising process. Can be generated in set_timesteps step.",
),
InputParam(
kwargs_type="denoiser_input_fields",
description="conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.",
),
InputParam.denoiser_input_fields(),
]
guider_input_names = []
uncond_guider_input_names = []
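
This hunk swaps the verbose `InputParam(kwargs_type="denoiser_input_fields", ...)` literal for the new `InputParam.denoiser_input_fields()` factory. The real helper lives in `modular_pipeline_utils` and is not shown in this diff; the snippet below is only a plausible reconstruction of what such a factory could look like, using a simplified stand-in class.

```py
from dataclasses import dataclass
from typing import Optional


@dataclass
class InputParam:  # simplified stand-in for diffusers' InputParam
    name: Optional[str] = None
    kwargs_type: Optional[str] = None
    description: Optional[str] = None

    @classmethod
    def denoiser_input_fields(cls) -> "InputParam":
        # Mirrors the literal removed from ZImageLoopDenoiser above.
        return cls(
            kwargs_type="denoiser_input_fields",
            description=(
                "conditional model inputs for the denoiser: e.g. "
                "prompt_embeds, negative_prompt_embeds, etc."
            ),
        )
```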

View File

@@ -5,6 +5,7 @@ import torch.nn as nn
import torch.nn.functional as F
from ...configuration_utils import ConfigMixin, register_to_config
from ...loaders import PeftAdapterMixin
from ...models.attention import FeedForward
from ...models.modeling_utils import ModelMixin
from ...models.transformers.transformer_ltx2 import LTX2Attention, LTX2AudioVideoAttnProcessor
@@ -252,7 +253,7 @@ class LTX2ConnectorTransformer1d(nn.Module):
return hidden_states, attention_mask
class LTX2TextConnectors(ModelMixin, ConfigMixin):
class LTX2TextConnectors(ModelMixin, PeftAdapterMixin, ConfigMixin):
"""
Text connector stack used by LTX 2.0 to process the packed text encoder hidden states for both the video and audio
streams.

View File

@@ -21,7 +21,7 @@ import torch
from transformers import Gemma3ForConditionalGeneration, GemmaTokenizer, GemmaTokenizerFast
from ...callbacks import MultiPipelineCallbacks, PipelineCallback
from ...loaders import FromSingleFileMixin, LTXVideoLoraLoaderMixin
from ...loaders import FromSingleFileMixin, LTX2LoraLoaderMixin
from ...models.autoencoders import AutoencoderKLLTX2Audio, AutoencoderKLLTX2Video
from ...models.transformers import LTX2VideoTransformer3DModel
from ...schedulers import FlowMatchEulerDiscreteScheduler
@@ -184,7 +184,7 @@ def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
return noise_cfg
class LTX2Pipeline(DiffusionPipeline, FromSingleFileMixin, LTXVideoLoraLoaderMixin):
class LTX2Pipeline(DiffusionPipeline, FromSingleFileMixin, LTX2LoraLoaderMixin):
r"""
Pipeline for text-to-video generation.
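
With `LTX2LoraLoaderMixin` in the base classes (and `PeftAdapterMixin` added to `LTX2TextConnectors` in the earlier hunk), the LTX-2 pipeline inherits the standard LoRA entry points. A minimal sketch; the checkpoint and LoRA repo ids are placeholders rather than verified artifacts, and the call arguments mirror the dummy inputs used in the new LoRA tests below.

```py
import torch
from diffusers import LTX2Pipeline

pipe = LTX2Pipeline.from_pretrained("Lightricks/LTX-2", torch_dtype=torch.bfloat16)  # placeholder id
pipe.to("cuda")

# Inherited from LTX2LoraLoaderMixin / LoraBaseMixin.
pipe.load_lora_weights("your-org/ltx2-style-lora", adapter_name="style")  # placeholder id
pipe.set_adapters(["style"], adapter_weights=[0.8])

output = pipe(
    prompt="a robot dancing in the rain",
    num_frames=49,
    frame_rate=25.0,
    num_inference_steps=30,
)
```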

View File

@@ -0,0 +1,293 @@
# Copyright 2025 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import unittest
import torch
from transformers import AutoTokenizer, Gemma3ForConditionalGeneration
from diffusers import (
AutoencoderKLLTX2Audio,
AutoencoderKLLTX2Video,
FlowMatchEulerDiscreteScheduler,
LTX2Pipeline,
LTX2VideoTransformer3DModel,
)
from diffusers.pipelines.ltx2 import LTX2TextConnectors
from diffusers.pipelines.ltx2.vocoder import LTX2Vocoder
from diffusers.utils.import_utils import is_peft_available
from ..testing_utils import floats_tensor, require_peft_backend
if is_peft_available():
from peft import LoraConfig
sys.path.append(".")
from .utils import PeftLoraLoaderMixinTests # noqa: E402
@require_peft_backend
class LTX2LoRATests(unittest.TestCase, PeftLoraLoaderMixinTests):
pipeline_class = LTX2Pipeline
scheduler_cls = FlowMatchEulerDiscreteScheduler
scheduler_kwargs = {}
transformer_kwargs = {
"in_channels": 4,
"out_channels": 4,
"patch_size": 1,
"patch_size_t": 1,
"num_attention_heads": 2,
"attention_head_dim": 8,
"cross_attention_dim": 16,
"audio_in_channels": 4,
"audio_out_channels": 4,
"audio_num_attention_heads": 2,
"audio_attention_head_dim": 4,
"audio_cross_attention_dim": 8,
"num_layers": 1,
"qk_norm": "rms_norm_across_heads",
"caption_channels": 32,
"rope_double_precision": False,
"rope_type": "split",
}
transformer_cls = LTX2VideoTransformer3DModel
vae_kwargs = {
"in_channels": 3,
"out_channels": 3,
"latent_channels": 4,
"block_out_channels": (8,),
"decoder_block_out_channels": (8,),
"layers_per_block": (1,),
"decoder_layers_per_block": (1, 1),
"spatio_temporal_scaling": (True,),
"decoder_spatio_temporal_scaling": (True,),
"decoder_inject_noise": (False, False),
"downsample_type": ("spatial",),
"upsample_residual": (False,),
"upsample_factor": (1,),
"timestep_conditioning": False,
"patch_size": 1,
"patch_size_t": 1,
"encoder_causal": True,
"decoder_causal": False,
}
vae_cls = AutoencoderKLLTX2Video
audio_vae_kwargs = {
"base_channels": 4,
"output_channels": 2,
"ch_mult": (1,),
"num_res_blocks": 1,
"attn_resolutions": None,
"in_channels": 2,
"resolution": 32,
"latent_channels": 2,
"norm_type": "pixel",
"causality_axis": "height",
"dropout": 0.0,
"mid_block_add_attention": False,
"sample_rate": 16000,
"mel_hop_length": 160,
"is_causal": True,
"mel_bins": 8,
}
audio_vae_cls = AutoencoderKLLTX2Audio
vocoder_kwargs = {
"in_channels": 16, # output_channels * mel_bins = 2 * 8
"hidden_channels": 32,
"out_channels": 2,
"upsample_kernel_sizes": [4, 4],
"upsample_factors": [2, 2],
"resnet_kernel_sizes": [3],
"resnet_dilations": [[1, 3, 5]],
"leaky_relu_negative_slope": 0.1,
"output_sampling_rate": 16000,
}
vocoder_cls = LTX2Vocoder
connectors_kwargs = {
"caption_channels": 32, # Will be set dynamically from text_encoder
"text_proj_in_factor": 2, # Will be set dynamically from text_encoder
"video_connector_num_attention_heads": 4,
"video_connector_attention_head_dim": 8,
"video_connector_num_layers": 1,
"video_connector_num_learnable_registers": None,
"audio_connector_num_attention_heads": 4,
"audio_connector_attention_head_dim": 8,
"audio_connector_num_layers": 1,
"audio_connector_num_learnable_registers": None,
"connector_rope_base_seq_len": 32,
"rope_theta": 10000.0,
"rope_double_precision": False,
"causal_temporal_positioning": False,
"rope_type": "split",
}
connectors_cls = LTX2TextConnectors
tokenizer_cls, tokenizer_id = AutoTokenizer, "hf-internal-testing/tiny-gemma3"
text_encoder_cls, text_encoder_id = (
Gemma3ForConditionalGeneration,
"hf-internal-testing/tiny-gemma3",
)
denoiser_target_modules = ["to_q", "to_k", "to_out.0"]
@property
def output_shape(self):
return (1, 5, 32, 32, 3)
def get_dummy_inputs(self, with_generator=True):
batch_size = 1
sequence_length = 16
num_channels = 4
num_frames = 5
num_latent_frames = 2
latent_height = 8
latent_width = 8
generator = torch.manual_seed(0)
noise = floats_tensor((batch_size, num_latent_frames, num_channels, latent_height, latent_width))
input_ids = torch.randint(1, sequence_length, size=(batch_size, sequence_length), generator=generator)
pipeline_inputs = {
"prompt": "a robot dancing",
"num_frames": num_frames,
"num_inference_steps": 2,
"guidance_scale": 1.0,
"height": 32,
"width": 32,
"frame_rate": 25.0,
"max_sequence_length": sequence_length,
"output_type": "np",
}
if with_generator:
pipeline_inputs.update({"generator": generator})
return noise, input_ids, pipeline_inputs
def get_dummy_components(self, scheduler_cls=None, use_dora=False, lora_alpha=None):
# Override to instantiate LTX2-specific components (connectors, audio_vae, vocoder)
torch.manual_seed(0)
text_encoder = self.text_encoder_cls.from_pretrained(self.text_encoder_id)
tokenizer = self.tokenizer_cls.from_pretrained(self.tokenizer_id)
# Update caption_channels and text_proj_in_factor based on text_encoder config
transformer_kwargs = self.transformer_kwargs.copy()
transformer_kwargs["caption_channels"] = text_encoder.config.text_config.hidden_size
connectors_kwargs = self.connectors_kwargs.copy()
connectors_kwargs["caption_channels"] = text_encoder.config.text_config.hidden_size
connectors_kwargs["text_proj_in_factor"] = text_encoder.config.text_config.num_hidden_layers + 1
torch.manual_seed(0)
transformer = self.transformer_cls(**transformer_kwargs)
torch.manual_seed(0)
vae = self.vae_cls(**self.vae_kwargs)
vae.use_framewise_encoding = False
vae.use_framewise_decoding = False
torch.manual_seed(0)
audio_vae = self.audio_vae_cls(**self.audio_vae_kwargs)
torch.manual_seed(0)
vocoder = self.vocoder_cls(**self.vocoder_kwargs)
torch.manual_seed(0)
connectors = self.connectors_cls(**connectors_kwargs)
if scheduler_cls is None:
scheduler_cls = self.scheduler_cls
scheduler = scheduler_cls(**self.scheduler_kwargs)
rank = 4
lora_alpha = rank if lora_alpha is None else lora_alpha
text_lora_config = LoraConfig(
r=rank,
lora_alpha=lora_alpha,
target_modules=self.text_encoder_target_modules,
init_lora_weights=False,
use_dora=use_dora,
)
denoiser_lora_config = LoraConfig(
r=rank,
lora_alpha=lora_alpha,
target_modules=["to_q", "to_k", "to_v", "to_out.0"],
init_lora_weights=False,
use_dora=use_dora,
)
pipeline_components = {
"transformer": transformer,
"vae": vae,
"audio_vae": audio_vae,
"scheduler": scheduler,
"text_encoder": text_encoder,
"tokenizer": tokenizer,
"connectors": connectors,
"vocoder": vocoder,
}
return pipeline_components, text_lora_config, denoiser_lora_config
def test_simple_inference_with_text_lora_denoiser_fused_multi(self):
super().test_simple_inference_with_text_lora_denoiser_fused_multi(expected_atol=9e-3)
def test_simple_inference_with_text_denoiser_lora_unfused(self):
super().test_simple_inference_with_text_denoiser_lora_unfused(expected_atol=9e-3)
@unittest.skip("Not supported in LTX2.")
def test_simple_inference_with_text_denoiser_block_scale(self):
pass
@unittest.skip("Not supported in LTX2.")
def test_simple_inference_with_text_denoiser_block_scale_for_all_dict_options(self):
pass
@unittest.skip("Not supported in LTX2.")
def test_modify_padding_mode(self):
pass
@unittest.skip("Text encoder LoRA is not supported in LTX2.")
def test_simple_inference_with_partial_text_lora(self):
pass
@unittest.skip("Text encoder LoRA is not supported in LTX2.")
def test_simple_inference_with_text_lora(self):
pass
@unittest.skip("Text encoder LoRA is not supported in LTX2.")
def test_simple_inference_with_text_lora_and_scale(self):
pass
@unittest.skip("Text encoder LoRA is not supported in LTX2.")
def test_simple_inference_with_text_lora_fused(self):
pass
@unittest.skip("Text encoder LoRA is not supported in LTX2.")
def test_simple_inference_with_text_lora_save_load(self):
pass
@unittest.skip("Text encoder LoRA is not supported in LTX2.")
def test_simple_inference_save_pretrained_with_text_lora(self):
pass

View File

@@ -0,0 +1,286 @@
# coding=utf-8
# Copyright 2025 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Auto Docstring Generator for Modular Pipeline Blocks
This script scans Python files for classes that have an `# auto_docstring` comment above them
and inserts/updates the docstring from the class's `doc` property.
Run from the root of the repo:
python utils/modular_auto_docstring.py [path] [--fix_and_overwrite]
Examples:
# Check for auto_docstring markers (will error if found without proper docstring)
python utils/modular_auto_docstring.py
# Check specific directory
python utils/modular_auto_docstring.py src/diffusers/modular_pipelines/
# Fix and overwrite the docstrings
python utils/modular_auto_docstring.py --fix_and_overwrite
Usage in code:
# auto_docstring
class QwenImageAutoVaeEncoderStep(AutoPipelineBlocks):
# docstring will be automatically inserted here
@property
def doc(self):
return "Your docstring content..."
"""
import argparse
import ast
import glob
import importlib
import os
import re
import sys
# All paths are set assuming you run this script from the root of the repo
DIFFUSERS_PATH = "src/diffusers"
REPO_PATH = "."
# Pattern to match the auto_docstring comment
AUTO_DOCSTRING_PATTERN = re.compile(r"^\s*#\s*auto_docstring\s*$")
def setup_diffusers_import():
"""Setup import path to use the local diffusers module."""
src_path = os.path.join(REPO_PATH, "src")
if src_path not in sys.path:
sys.path.insert(0, src_path)
def get_module_from_filepath(filepath: str) -> str:
"""Convert a filepath to a module name."""
filepath = os.path.normpath(filepath)
if filepath.startswith("src" + os.sep):
filepath = filepath[4:]
if filepath.endswith(".py"):
filepath = filepath[:-3]
module_name = filepath.replace(os.sep, ".")
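# e.g. "src/diffusers/utils/constants.py" -> "diffusers.utils.constants"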
return module_name
def load_module(filepath: str):
"""Load a module from filepath."""
setup_diffusers_import()
module_name = get_module_from_filepath(filepath)
try:
module = importlib.import_module(module_name)
return module
except Exception as e:
print(f"Warning: Could not import module {module_name}: {e}")
return None
def get_doc_from_class(module, class_name: str) -> str:
"""Get the doc property from an instantiated class."""
if module is None:
return None
cls = getattr(module, class_name, None)
if cls is None:
return None
try:
instance = cls()
if hasattr(instance, "doc"):
return instance.doc
except Exception as e:
print(f"Warning: Could not instantiate {class_name}: {e}")
return None
def find_auto_docstring_classes(filepath: str) -> list:
"""
Find all classes in a file that have an # auto_docstring comment above them.
Returns list of (class_name, class_line_number, has_existing_docstring, docstring_end_line)
"""
with open(filepath, "r", encoding="utf-8", newline="\n") as f:
lines = f.readlines()
# Parse AST to find class locations and their docstrings
content = "".join(lines)
try:
tree = ast.parse(content)
except SyntaxError as e:
print(f"Syntax error in {filepath}: {e}")
return []
# Build a map of class_name -> (class_line, has_docstring, docstring_end_line)
class_info = {}
for node in ast.walk(tree):
if isinstance(node, ast.ClassDef):
has_docstring = False
docstring_end_line = node.lineno # default to class line
if node.body and isinstance(node.body[0], ast.Expr):
first_stmt = node.body[0]
if isinstance(first_stmt.value, ast.Constant) and isinstance(first_stmt.value.value, str):
has_docstring = True
docstring_end_line = first_stmt.end_lineno or first_stmt.lineno
class_info[node.name] = (node.lineno, has_docstring, docstring_end_line)
# Now scan for # auto_docstring comments
classes_to_update = []
for i, line in enumerate(lines):
if AUTO_DOCSTRING_PATTERN.match(line):
# Found the marker, look for class definition on next non-empty, non-comment line
j = i + 1
while j < len(lines):
next_line = lines[j].strip()
if next_line and not next_line.startswith("#"):
break
j += 1
if j < len(lines) and lines[j].strip().startswith("class "):
# Extract class name
match = re.match(r"class\s+(\w+)", lines[j].strip())
if match:
class_name = match.group(1)
if class_name in class_info:
class_line, has_docstring, docstring_end_line = class_info[class_name]
classes_to_update.append((class_name, class_line, has_docstring, docstring_end_line))
return classes_to_update
def format_docstring(doc: str, indent: str = " ") -> str:
"""Format a doc string as a properly indented docstring."""
lines = doc.strip().split("\n")
if len(lines) == 1:
return f'{indent}"""{lines[0]}"""\n'
else:
result = [f'{indent}"""\n']
for line in lines:
if line.strip():
result.append(f"{indent}{line}\n")
else:
result.append("\n")
result.append(f'{indent}"""\n')
return "".join(result)
def process_file(filepath: str, overwrite: bool = False) -> list:
"""
Process a file and find/insert docstrings for # auto_docstring marked classes.
Returns list of classes that need updating.
"""
classes_to_update = find_auto_docstring_classes(filepath)
if not classes_to_update:
return []
if not overwrite:
# Just return the list of classes that need updating
return [(filepath, cls_name, line) for cls_name, line, _, _ in classes_to_update]
# Load the module to get doc properties
module = load_module(filepath)
with open(filepath, "r", encoding="utf-8", newline="\n") as f:
lines = f.readlines()
# Process in reverse order to maintain line numbers
updated = False
for class_name, class_line, has_docstring, docstring_end_line in reversed(classes_to_update):
doc = get_doc_from_class(module, class_name)
if doc is None:
print(f"Warning: Could not get doc for {class_name} in {filepath}")
continue
# Format the new docstring with 4-space indent
new_docstring = format_docstring(doc, " ")
if has_docstring:
# Replace existing docstring (line after class definition to docstring_end_line)
# class_line is 1-indexed, we want to replace from class_line+1 to docstring_end_line
lines = lines[:class_line] + [new_docstring] + lines[docstring_end_line:]
else:
# Insert new docstring right after class definition line
# class_line is 1-indexed, so lines[class_line-1] is the class line
# Insert at position class_line (which is right after the class line)
lines = lines[:class_line] + [new_docstring] + lines[class_line:]
updated = True
print(f"Updated docstring for {class_name} in {filepath}")
if updated:
with open(filepath, "w", encoding="utf-8", newline="\n") as f:
f.writelines(lines)
return [(filepath, cls_name, line) for cls_name, line, _, _ in classes_to_update]
def check_auto_docstrings(path: str = None, overwrite: bool = False):
"""
Check all files for # auto_docstring markers and optionally fix them.
"""
if path is None:
path = DIFFUSERS_PATH
if os.path.isfile(path):
all_files = [path]
else:
all_files = glob.glob(os.path.join(path, "**/*.py"), recursive=True)
all_markers = []
for filepath in all_files:
markers = process_file(filepath, overwrite)
all_markers.extend(markers)
if not overwrite and len(all_markers) > 0:
message = "\n".join([f"- {f}: {cls} at line {line}" for f, cls, line in all_markers])
raise ValueError(
f"Found the following # auto_docstring markers that need docstrings:\n{message}\n\n"
f"Run `python utils/modular_auto_docstring.py --fix_and_overwrite` to fix them."
)
if overwrite and len(all_markers) > 0:
print(f"\nUpdated {len(all_markers)} docstring(s).")
elif len(all_markers) == 0:
print("No # auto_docstring markers found.")
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Check and fix # auto_docstring markers in modular pipeline blocks",
)
parser.add_argument("path", nargs="?", default=None, help="File or directory to process (default: src/diffusers)")
parser.add_argument(
"--fix_and_overwrite",
action="store_true",
help="Whether to fix the docstrings by inserting them from doc property.",
)
args = parser.parse_args()
check_auto_docstrings(args.path, args.fix_and_overwrite)